diff --git a/client/alloc_endpoint.go b/client/alloc_endpoint.go index 6b1ff7604200..a0bfc3920e91 100644 --- a/client/alloc_endpoint.go +++ b/client/alloc_endpoint.go @@ -8,9 +8,8 @@ import ( "io" "time" - metrics "github.com/armon/go-metrics" + "github.com/armon/go-metrics" "github.com/hashicorp/go-msgpack/codec" - "github.com/hashicorp/nomad/acl" cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/helper" @@ -137,6 +136,29 @@ func (a *Allocations) Stats(args *cstructs.AllocStatsRequest, reply *cstructs.Al return nil } +// Checks is used to retrieve nomad service discovery check status information. +func (a *Allocations) Checks(args *cstructs.AllocChecksRequest, reply *cstructs.AllocChecksResponse) error { + defer metrics.MeasureSince([]string{"client", "allocations", "checks"}, time.Now()) + + // Get the allocation + alloc, err := a.c.GetAlloc(args.AllocID) + if err != nil { + return err + } + + // Check read-job permission + if aclObj, aclErr := a.c.ResolveToken(args.AuthToken); aclErr != nil { + return aclErr + } else if aclObj != nil && !aclObj.AllowNsOp(alloc.Namespace, acl.NamespaceCapabilityReadJob) { + return nstructs.ErrPermissionDenied + } + + // Get the status information for the allocation + reply.Results = a.c.checkStore.List(alloc.ID) + + return nil +} + // exec is used to execute command in a running task func (a *Allocations) exec(conn io.ReadWriteCloser) { defer metrics.MeasureSince([]string{"client", "allocations", "exec"}, time.Now()) diff --git a/client/alloc_endpoint_test.go b/client/alloc_endpoint_test.go index 272819ecc35c..c8f2560d02be 100644 --- a/client/alloc_endpoint_test.go +++ b/client/alloc_endpoint_test.go @@ -23,6 +23,7 @@ import ( nconfig "github.com/hashicorp/nomad/nomad/structs/config" "github.com/hashicorp/nomad/plugins/drivers" "github.com/hashicorp/nomad/testutil" + "github.com/shoenig/test/must" "github.com/stretchr/testify/require" "golang.org/x/sys/unix" ) @@ -529,6 +530,92 @@ func TestAllocations_Stats_ACL(t *testing.T) { } } +func TestAlloc_Checks(t *testing.T) { + ci.Parallel(t) + + client, cleanup := TestClient(t, nil) + t.Cleanup(func() { + must.NoError(t, cleanup()) + }) + + now := time.Date(2022, 3, 4, 5, 6, 7, 8, time.UTC).Unix() + + qr1 := &nstructs.CheckQueryResult{ + ID: "abc123", + Mode: "healthiness", + Status: "passing", + Output: "nomad: http ok", + Timestamp: now, + Group: "group", + Task: "task", + Service: "service", + Check: "check", + } + + qr2 := &nstructs.CheckQueryResult{ + ID: "def456", + Mode: "readiness", + Status: "passing", + Output: "nomad: http ok", + Timestamp: now, + Group: "group", + Service: "service2", + Check: "check", + } + + t.Run("alloc does not exist", func(t *testing.T) { + request := cstructs.AllocChecksRequest{AllocID: "d3e34248-4843-be75-d4fd-4899975cfb38"} + var response cstructs.AllocChecksResponse + err := client.ClientRPC("Allocations.Checks", &request, &response) + must.EqError(t, err, `Unknown allocation "d3e34248-4843-be75-d4fd-4899975cfb38"`) + }) + + t.Run("no checks for alloc", func(t *testing.T) { + alloc := mock.Alloc() + must.NoError(t, client.addAlloc(alloc, "")) + + request := cstructs.AllocChecksRequest{AllocID: alloc.ID} + var response cstructs.AllocChecksResponse + err := client.ClientRPC("Allocations.Checks", &request, &response) + must.NoError(t, err) + must.MapEmpty(t, response.Results) + }) + + t.Run("two in one alloc", func(t *testing.T) { + alloc := mock.Alloc() + must.NoError(t, client.addAlloc(alloc, "")) + must.NoError(t, 
client.checkStore.Set(alloc.ID, qr1)) + must.NoError(t, client.checkStore.Set(alloc.ID, qr2)) + + request := cstructs.AllocChecksRequest{AllocID: alloc.ID} + var response cstructs.AllocChecksResponse + err := client.ClientRPC("Allocations.Checks", &request, &response) + must.NoError(t, err) + must.MapEq(t, map[nstructs.CheckID]*nstructs.CheckQueryResult{ + "abc123": qr1, + "def456": qr2, + }, response.Results) + }) + + t.Run("ignore unrelated alloc", func(t *testing.T) { + alloc1 := mock.Alloc() + must.NoError(t, client.addAlloc(alloc1, "")) + + alloc2 := mock.Alloc() + must.NoError(t, client.addAlloc(alloc2, "")) + must.NoError(t, client.checkStore.Set(alloc1.ID, qr1)) + must.NoError(t, client.checkStore.Set(alloc2.ID, qr2)) + + request := cstructs.AllocChecksRequest{AllocID: alloc1.ID} + var response cstructs.AllocChecksResponse + err := client.ClientRPC("Allocations.Checks", &request, &response) + must.NoError(t, err) + must.MapEq(t, map[nstructs.CheckID]*nstructs.CheckQueryResult{ + "abc123": qr1, + }, response.Results) + }) +} + func TestAlloc_ExecStreaming(t *testing.T) { ci.Parallel(t) require := require.New(t) diff --git a/client/allochealth/tracker.go b/client/allochealth/tracker.go index 34db96f13fc6..b2add7d75d03 100644 --- a/client/allochealth/tracker.go +++ b/client/allochealth/tracker.go @@ -8,9 +8,11 @@ import ( "time" "github.com/hashicorp/consul/api" - hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/serviceregistration" + "github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore" cstructs "github.com/hashicorp/nomad/client/structs" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" ) @@ -18,9 +20,9 @@ const ( // AllocHealthEventSource is the source used for emitting task events AllocHealthEventSource = "Alloc Unhealthy" - // consulCheckLookupInterval is the interval at which we check if the - // Consul checks are healthy or unhealthy. - consulCheckLookupInterval = 500 * time.Millisecond + // checkLookupInterval is the pace at which we check if the Consul or Nomad + // checks for an allocation are healthy or unhealthy. + checkLookupInterval = 500 * time.Millisecond ) // Tracker tracks the health of an allocation and makes health events watchable @@ -40,23 +42,30 @@ type Tracker struct { // considered healthy minHealthyTime time.Duration - // checkLookupInterval is the interval at which we check if the - // Consul checks are healthy or unhealthy. + // checkLookupInterval is the repeated interval after which which we check + // if the Consul checks are healthy or unhealthy. checkLookupInterval time.Duration - // useChecks specifies whether to use Consul healh checks or not + // useChecks specifies whether to consider Consul and Nomad service checks. useChecks bool - // consulCheckCount is the number of checks the task group will attempt to - // register + // consulCheckCount is the total number of Consul service checks in the task + // group including task level checks. consulCheckCount int + // nomadCheckCount is the total the number of Nomad service checks in the task + // group including task level checks. 
+ nomadCheckCount int + // allocUpdates is a listener for retrieving new alloc updates allocUpdates *cstructs.AllocListener - // consulClient is used to look up the state of the task's checks + // consulClient is used to look up the status of Consul service checks consulClient serviceregistration.Handler + // checkStore is used to lookup the status of Nomad service checks + checkStore checkstore.Shim + // healthy is used to signal whether we have determined the allocation to be // healthy or unhealthy healthy chan bool @@ -69,11 +78,11 @@ type Tracker struct { // These tasks may terminate without affecting alloc health lifecycleTasks map[string]string - // l is used to lock shared fields listed below - l sync.Mutex + // lock is used to lock shared fields listed below + lock sync.Mutex // tasksHealthy marks whether all the tasks have met their health check - // (disregards Consul) + // (disregards Consul and Nomad checks) tasksHealthy bool // allocFailed marks whether the allocation failed @@ -82,22 +91,31 @@ type Tracker struct { // checksHealthy marks whether all the task's Consul checks are healthy checksHealthy bool - // taskHealth contains the health state for each task + // taskHealth contains the health state for each task in the allocation + // name -> state taskHealth map[string]*taskHealthState + // logger is for logging things logger hclog.Logger } -// NewTracker returns a health tracker for the given allocation. An alloc -// listener and consul API object are given so that the watcher can detect -// health changes. -func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.Allocation, - allocUpdates *cstructs.AllocListener, consulClient serviceregistration.Handler, - minHealthyTime time.Duration, useChecks bool) *Tracker { +// NewTracker returns a health tracker for the given allocation. +// +// Depending on job configuration, an allocation's health takes into consideration +// - An alloc listener +// - Consul checks (via consul API) +// - Nomad checks (via client state) +func NewTracker( + parentCtx context.Context, + logger hclog.Logger, + alloc *structs.Allocation, + allocUpdates *cstructs.AllocListener, + consulClient serviceregistration.Handler, + checkStore checkstore.Shim, + minHealthyTime time.Duration, + useChecks bool, +) *Tracker { - // Do not create a named sub-logger as the hook controlling - // this struct should pass in an appropriately named - // sub-logger. 
t := &Tracker{ healthy: make(chan bool, 1), allocStopped: make(chan struct{}), @@ -107,7 +125,8 @@ func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.A useChecks: useChecks, allocUpdates: allocUpdates, consulClient: consulClient, - checkLookupInterval: consulCheckLookupInterval, + checkStore: checkStore, + checkLookupInterval: checkLookupInterval, logger: logger, lifecycleTasks: map[string]string{}, } @@ -120,24 +139,37 @@ func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.A t.lifecycleTasks[task.Name] = task.Lifecycle.Hook } - for _, s := range task.Services { - t.consulCheckCount += len(s.Checks) - } + c, n := countChecks(task.Services) + t.consulCheckCount += c + t.nomadCheckCount += n } - for _, s := range t.tg.Services { - t.consulCheckCount += len(s.Checks) - } + c, n := countChecks(t.tg.Services) + t.consulCheckCount += c + t.nomadCheckCount += n t.ctx, t.cancelFn = context.WithCancel(parentCtx) return t } +func countChecks(services []*structs.Service) (consul, nomad int) { + for _, service := range services { + switch service.Provider { + case structs.ServiceProviderConsul: + consul += len(service.Checks) + case structs.ServiceProviderNomad: + nomad += len(service.Checks) + } + } + return +} + // Start starts the watcher. func (t *Tracker) Start() { go t.watchTaskEvents() if t.useChecks { go t.watchConsulEvents() + go t.watchNomadEvents() } } @@ -157,8 +189,8 @@ func (t *Tracker) AllocStoppedCh() <-chan struct{} { // health has been determined. Only tasks that have contributed to the // allocation being unhealthy will have an event. func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent { - t.l.Lock() - defer t.l.Unlock() + t.lock.Lock() + defer t.lock.Unlock() // Nothing to do since the failure wasn't task related if t.allocFailed { @@ -180,10 +212,11 @@ func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent { } // setTaskHealth is used to set the tasks health as healthy or unhealthy. If the -// allocation is terminal, health is immediately broadcasted. +// allocation is terminal, health is immediately broadcast. func (t *Tracker) setTaskHealth(healthy, terminal bool) { - t.l.Lock() - defer t.l.Unlock() + t.lock.Lock() + defer t.lock.Unlock() + t.tasksHealthy = healthy // if unhealthy, force waiting for new checks health status @@ -192,15 +225,23 @@ func (t *Tracker) setTaskHealth(healthy, terminal bool) { return } - // If we are marked healthy but we also require Consul to be healthy and it - // isn't yet, return, unless the task is terminal - requireConsul := t.useChecks && t.consulCheckCount > 0 - if !terminal && healthy && requireConsul && !t.checksHealthy { + // If we are marked healthy but we also require Consul checks to be healthy + // and they are not yet, return, unless the task is terminal. + usesConsulChecks := t.useChecks && t.consulCheckCount > 0 + if !terminal && healthy && usesConsulChecks && !t.checksHealthy { + return + } + + // If we are marked healthy but also require Nomad checks to be healthy and + // they are not yet, return, unless the task is terminal. + usesNomadChecks := t.useChecks && t.nomadCheckCount > 0 + if !terminal && healthy && usesNomadChecks && !t.checksHealthy { return } select { case t.healthy <- healthy: + // nothing default: } @@ -210,9 +251,13 @@ func (t *Tracker) setTaskHealth(healthy, terminal bool) { // setCheckHealth is used to mark the checks as either healthy or unhealthy. 
// returns true if health is propagated and no more health monitoring is needed +// +// todo: this is currently being shared by watchConsulEvents and watchNomadEvents, +// and must be split up if/when we support registering services (and thus checks) +// of different providers. func (t *Tracker) setCheckHealth(healthy bool) bool { - t.l.Lock() - defer t.l.Unlock() + t.lock.Lock() + defer t.lock.Unlock() // check health should always be false if tasks are unhealthy // as checks might be missing from unhealthy tasks @@ -225,10 +270,11 @@ func (t *Tracker) setCheckHealth(healthy bool) bool { select { case t.healthy <- healthy: + // nothing default: } - // Shutdown the tracker + // Shutdown the tracker, things are healthy so nothing to do t.cancelFn() return true } @@ -244,6 +290,8 @@ func (t *Tracker) markAllocStopped() { func (t *Tracker) watchTaskEvents() { alloc := t.alloc allStartedTime := time.Time{} + + // todo(shoenig): refactor with healthyFuture healthyTimer := time.NewTimer(0) if !healthyTimer.Stop() { select { @@ -262,16 +310,17 @@ func (t *Tracker) watchTaskEvents() { } // Store the task states - t.l.Lock() + t.lock.Lock() for task, state := range alloc.TaskStates { //TODO(schmichael) for now skip unknown tasks as //they're task group services which don't currently //support checks anyway + //TODO(shoenig) is ^ still true? if v, ok := t.taskHealth[task]; ok { v.state = state } } - t.l.Unlock() + t.lock.Unlock() // Detect if the alloc is unhealthy or if all tasks have started yet latestStartTime := time.Time{} @@ -308,9 +357,9 @@ func (t *Tracker) watchTaskEvents() { // individual tasks failed, that means something failed at the alloc // level. if alloc.ClientStatus == structs.AllocClientStatusFailed { - t.l.Lock() + t.lock.Lock() t.allocFailed = true - t.l.Unlock() + t.lock.Unlock() t.setTaskHealth(false, true) return @@ -349,27 +398,60 @@ func (t *Tracker) watchTaskEvents() { } } -// watchConsulEvents is a watcher for the health of the allocation's Consul -// checks. If all checks report healthy the watcher will exit after the -// MinHealthyTime has been reached, Otherwise the watcher will continue to -// check unhealthy checks until the ctx is cancelled -func (t *Tracker) watchConsulEvents() { - // checkTicker is the ticker that triggers us to look at the checks in - // Consul - checkTicker := time.NewTicker(t.checkLookupInterval) - defer checkTicker.Stop() +// healthyFuture is used to fire after checks have been healthy for MinHealthyTime +type healthyFuture struct { + timer *time.Timer +} - // healthyTimer fires when the checks have been healthy for the - // MinHealthyTime - healthyTimer := time.NewTimer(0) - if !healthyTimer.Stop() { +// newHealthyFuture will create a healthyFuture in a disabled state, and +// will do nothing until a call to wait takes place +func newHealthyFuture() *healthyFuture { + timer := time.NewTimer(0) + ht := &healthyFuture{timer: timer} + ht.disable() + return ht +} + +// disable the healthyFuture from triggering +func (h *healthyFuture) disable() { + if !h.timer.Stop() { + // must ensure channel is clear + // https://pkg.go.dev/time#Timer.Stop select { - case <-healthyTimer.C: + case <-h.timer.C: default: } } +} + +// wait will reset the healthyFuture to trigger after dur passes. +func (h *healthyFuture) wait(dur time.Duration) { + // must ensure timer is stopped + // https://pkg.go.dev/time#Timer.Reset + h.disable() + h.timer.Reset(dur) +} + +// C returns a channel on which the future will send when ready. 
+func (h *healthyFuture) C() <-chan time.Time { + return h.timer.C +} - // primed marks whether the healthy timer has been set +// watchConsulEvents is a watcher for the health of the allocation's Consul +// checks. If all checks report healthy the watcher will exit after the +// MinHealthyTime has been reached, otherwise the watcher will continue to +// check unhealthy checks until the ctx is cancelled. +// +// Does not watch Nomad service checks; see watchNomadEvents for those. +func (t *Tracker) watchConsulEvents() { + // checkTicker is the ticker that triggers us to look at the checks in Consul + checkTicker := time.NewTicker(t.checkLookupInterval) + defer checkTicker.Stop() + + // waiter is used to fire when the checks have been healthy for the MinHealthyTime + waiter := newHealthyFuture() + + // primed marks whether the healthy waiter has been set primed := false // Store whether the last Consul checks call was successful or not @@ -381,8 +463,12 @@ func (t *Tracker) watchConsulEvents() { OUTER: for { select { + + // we are shutting down case <-t.ctx.Done(): return + + // it is time to check the checks case <-checkTicker.C: newAllocReg, err := t.consulClient.AllocRegistrations(t.alloc.ID) if err != nil { @@ -395,12 +481,15 @@ OUTER: consulChecksErr = false allocReg = newAllocReg } - case <-healthyTimer.C: + + // enough time has passed with healthy checks + case <-waiter.C(): if t.setCheckHealth(true) { // final health set and propagated return } - // tasks are unhealthy, reset and wait until all is healthy + // checks are healthy but tasks are unhealthy, + // reset and wait until all is healthy primed = false } @@ -409,7 +498,7 @@ OUTER: } // Store the task registrations - t.l.Lock() + t.lock.Lock() for task, reg := range allocReg.Tasks { //TODO(schmichael) for now skip unknown tasks as //they're task group services which don't currently @@ -418,7 +507,7 @@ OUTER: v.taskRegistrations = reg } } - t.l.Unlock() + t.lock.Unlock() // Detect if all the checks are passing passed := true @@ -427,16 +516,16 @@ OUTER: for _, treg := range allocReg.Tasks { for _, sreg := range treg.Services { for _, check := range sreg.Checks { - onupdate := sreg.CheckOnUpdate[check.CheckID] + onUpdate := sreg.CheckOnUpdate[check.CheckID] switch check.Status { case api.HealthPassing: continue case api.HealthWarning: - if onupdate == structs.OnUpdateIgnoreWarn || onupdate == structs.OnUpdateIgnore { + if onUpdate == structs.OnUpdateIgnoreWarn || onUpdate == structs.OnUpdateIgnore { continue } case api.HealthCritical: - if onupdate == structs.OnUpdateIgnore { + if onUpdate == structs.OnUpdateIgnore { continue } default: @@ -452,25 +541,94 @@ OUTER: if !passed { // Reset the timer since we have transitioned back to unhealthy if primed { - if !healthyTimer.Stop() { - select { - case <-healthyTimer.C: - default: - } - } primed = false + waiter.disable() } } else if !primed { // Reset the timer to fire after MinHealthyTime - if !healthyTimer.Stop() { - select { - case <-healthyTimer.C: - default: + primed = true + waiter.disable() + waiter.wait(t.minHealthyTime) + } + } +} + +// watchNomadEvents is a watcher for the health of the allocation's Nomad checks. +// If all checks report healthy the watcher will exit after the MinHealthyTime has +// been reached, otherwise the watcher will continue to check unhealthy checks until +// the ctx is cancelled. +// +// Does not watch Consul service checks; see watchConsulEvents for those. 
+func (t *Tracker) watchNomadEvents() { + // checkTicker is the ticker that triggers us to look at the checks in Nomad + checkTicker, cancel := helper.NewSafeTimer(t.checkLookupInterval) + defer cancel() + + // waiter is used to fire when the checks have been healthy for the MinHealthyTime + waiter := newHealthyFuture() + + // allocID of the allocation we are watching checks for + allocID := t.alloc.ID + + // primed marks whether the healthy waiter has been set + primed := false + + // latest set of nomad check results + var results map[structs.CheckID]*structs.CheckQueryResult + + for { + select { + + // we are shutting down + case <-t.ctx.Done(): + return + + // it is time to check the checks + case <-checkTicker.C: + results = t.checkStore.List(allocID) + checkTicker.Reset(t.checkLookupInterval) + + // enough time has passed with healthy checks + case <-waiter.C(): + if t.setCheckHealth(true) { // todo(shoenig) this needs to be split between Consul and Nomad + return // final health set and propagated + } + // checks are healthy but tasks are unhealthy, reset and wait + primed = false + } + + // scan to see if any checks are failing + passing := true + for _, result := range results { + switch result.Status { + case structs.CheckSuccess: + continue + case structs.CheckFailure: + if result.Mode == structs.Readiness { + continue } + passing = false + default: + // i.e. pending check; do not consider healthy or ready + passing = false + } + + if !passing { + break // 1+ check is failing; no need to continue } + } + + if !passing { + // at least one check is failing, transition to unhealthy + t.setCheckHealth(false) + primed = false + waiter.disable() + } + if passing && !primed { + // healthy but not yet primed, set timer to wait primed = true - healthyTimer.Reset(t.minHealthyTime) + waiter.wait(t.minHealthyTime) } } } @@ -488,15 +646,13 @@ type taskHealthState struct { // strategy of the group. It returns true if the task has contributed to the // allocation being unhealthy and if so, an event description of why. 
func (t *taskHealthState) event(deadline time.Time, healthyDeadline, minHealthyTime time.Duration, useChecks bool) (string, bool) { - requireChecks := false desiredChecks := 0 for _, s := range t.task.Services { if nc := len(s.Checks); nc > 0 { - requireChecks = true desiredChecks += nc } } - requireChecks = requireChecks && useChecks + requireChecks := (desiredChecks > 0) && useChecks if t.state != nil { if t.state.Failed { @@ -507,7 +663,7 @@ func (t *taskHealthState) event(deadline time.Time, healthyDeadline, minHealthyT case structs.TaskStatePending: return fmt.Sprintf("Task not running by healthy_deadline of %v", healthyDeadline), true case structs.TaskStateDead: - // hook tasks are healthy when dead successfully + // non-sidecar hook lifecycle tasks are healthy if they exit with success if t.task.Lifecycle == nil || t.task.Lifecycle.Sidecar { return "Unhealthy because of dead task", true } diff --git a/client/allochealth/tracker_test.go b/client/allochealth/tracker_test.go index 94584e28fa24..4403073c154a 100644 --- a/client/allochealth/tracker_test.go +++ b/client/allochealth/tracker_test.go @@ -10,16 +10,19 @@ import ( consulapi "github.com/hashicorp/consul/api" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/serviceregistration" - regMock "github.com/hashicorp/nomad/client/serviceregistration/mock" + "github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore" + regmock "github.com/hashicorp/nomad/client/serviceregistration/mock" + "github.com/hashicorp/nomad/client/state" cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" + "github.com/shoenig/test/must" "github.com/stretchr/testify/require" ) -func TestTracker_Checks_Healthy(t *testing.T) { +func TestTracker_ConsulChecks_Healthy(t *testing.T) { ci.Parallel(t) alloc := mock.Alloc() @@ -60,7 +63,7 @@ func TestTracker_Checks_Healthy(t *testing.T) { // Don't reply on the first call var called uint64 - consul := regMock.NewServiceRegistrationHandler(logger) + consul := regmock.NewServiceRegistrationHandler(logger) consul.AllocRegistrationsFn = func(string) (*serviceregistration.AllocRegistration, error) { if atomic.AddUint64(&called, 1) == 1 { return nil, nil @@ -76,9 +79,9 @@ func TestTracker_Checks_Healthy(t *testing.T) { ctx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() + checks := checkstore.NewStore(logger, state.NewMemDB(logger)) checkInterval := 10 * time.Millisecond - tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, - time.Millisecond, true) + tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true) tracker.checkLookupInterval = checkInterval tracker.Start() @@ -90,6 +93,143 @@ func TestTracker_Checks_Healthy(t *testing.T) { } } +func TestTracker_NomadChecks_Healthy(t *testing.T) { + ci.Parallel(t) + + alloc := mock.Alloc() + alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up + alloc.Job.TaskGroups[0].Tasks[0].Services[0].Provider = "nomad" + + logger := testlog.HCLogger(t) + b := cstructs.NewAllocBroadcaster(logger) + defer b.Close() + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Synthesize running alloc and tasks + alloc.ClientStatus = structs.AllocClientStatusRunning + alloc.TaskStates = map[string]*structs.TaskState{ + alloc.Job.TaskGroups[0].Tasks[0].Name: { + State: 
structs.TaskStateRunning, + StartedAt: time.Now(), + }, + } + + checks := checkstore.NewStore(logger, state.NewMemDB(logger)) + err := checks.Set(alloc.ID, &structs.CheckQueryResult{ + ID: "abc123", + Mode: "healthiness", + Status: "pending", + Output: "nomad: waiting to run", + Timestamp: time.Now().Unix(), + Group: alloc.TaskGroup, + Task: alloc.Job.TaskGroups[0].Tasks[0].Name, + Service: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name, + Check: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name, + }) + must.NoError(t, err) + + consul := regmock.NewServiceRegistrationHandler(logger) + checkInterval := 10 * time.Millisecond + tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true) + tracker.checkLookupInterval = checkInterval + tracker.Start() + + go func() { + // wait a bit then update the check to passing + time.Sleep(15 * time.Millisecond) + must.NoError(t, checks.Set(alloc.ID, &structs.CheckQueryResult{ + ID: "abc123", + Mode: "healthiness", + Status: "success", + Output: "nomad: http ok", + Timestamp: time.Now().Unix(), + Group: alloc.TaskGroup, + Task: alloc.Job.TaskGroups[0].Tasks[0].Name, + Service: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name, + Check: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name, + })) + }() + + select { + case <-time.After(4 * checkInterval): + t.Fatalf("timed out while waiting for success") + case healthy := <-tracker.HealthyCh(): + must.True(t, healthy) + } +} + +func TestTracker_NomadChecks_Unhealthy(t *testing.T) { + ci.Parallel(t) + + alloc := mock.Alloc() + alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up + alloc.Job.TaskGroups[0].Tasks[0].Services[0].Provider = "nomad" + + logger := testlog.HCLogger(t) + b := cstructs.NewAllocBroadcaster(logger) + defer b.Close() + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Synthesize running alloc and tasks + alloc.ClientStatus = structs.AllocClientStatusRunning + alloc.TaskStates = map[string]*structs.TaskState{ + alloc.Job.TaskGroups[0].Tasks[0].Name: { + State: structs.TaskStateRunning, + StartedAt: time.Now(), + }, + } + + checks := checkstore.NewStore(logger, state.NewMemDB(logger)) + err := checks.Set(alloc.ID, &structs.CheckQueryResult{ + ID: "abc123", + Mode: "healthiness", + Status: "pending", // start out pending + Output: "nomad: waiting to run", + Timestamp: time.Now().Unix(), + Group: alloc.TaskGroup, + Task: alloc.Job.TaskGroups[0].Tasks[0].Name, + Service: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name, + Check: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name, + }) + must.NoError(t, err) + + consul := regmock.NewServiceRegistrationHandler(logger) + checkInterval := 10 * time.Millisecond + tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true) + tracker.checkLookupInterval = checkInterval + tracker.Start() + + go func() { + // wait a bit then update the check to failing + time.Sleep(15 * time.Millisecond) + must.NoError(t, checks.Set(alloc.ID, &structs.CheckQueryResult{ + ID: "abc123", + Mode: "healthiness", + Status: "failing", + Output: "connection refused", + Timestamp: time.Now().Unix(), + Group: alloc.TaskGroup, + Task: alloc.Job.TaskGroups[0].Tasks[0].Name, + Service: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name, + Check: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name, + })) + }() + + // make sure we are always unhealthy across 4 check intervals + for i := 0; i < 4; i++ { + 
<-time.After(checkInterval) + select { + case <-tracker.HealthyCh(): + t.Fatalf("should not receive on healthy chan with failing check") + default: + } + } +} + func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) { ci.Parallel(t) @@ -112,13 +252,13 @@ func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) { b := cstructs.NewAllocBroadcaster(logger) defer b.Close() - consul := regMock.NewServiceRegistrationHandler(logger) + consul := regmock.NewServiceRegistrationHandler(logger) ctx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() + checks := checkstore.NewStore(logger, state.NewMemDB(logger)) checkInterval := 10 * time.Millisecond - tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, - time.Millisecond, true) + tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true) tracker.checkLookupInterval = checkInterval tracker.Start() @@ -153,13 +293,13 @@ func TestTracker_Succeeded_PostStart_Healthy(t *testing.T) { b := cstructs.NewAllocBroadcaster(logger) defer b.Close() - consul := regMock.NewServiceRegistrationHandler(logger) + consul := regmock.NewServiceRegistrationHandler(logger) ctx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() + checks := checkstore.NewStore(logger, state.NewMemDB(logger)) checkInterval := 10 * time.Millisecond - tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, - alloc.Job.TaskGroups[0].Migrate.MinHealthyTime, true) + tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, alloc.Job.TaskGroups[0].Migrate.MinHealthyTime, true) tracker.checkLookupInterval = checkInterval tracker.Start() @@ -171,7 +311,7 @@ func TestTracker_Succeeded_PostStart_Healthy(t *testing.T) { } } -func TestTracker_Checks_Unhealthy(t *testing.T) { +func TestTracker_ConsulChecks_Unhealthy(t *testing.T) { ci.Parallel(t) alloc := mock.Alloc() @@ -220,7 +360,7 @@ func TestTracker_Checks_Unhealthy(t *testing.T) { // Don't reply on the first call var called uint64 - consul := regMock.NewServiceRegistrationHandler(logger) + consul := regmock.NewServiceRegistrationHandler(logger) consul.AllocRegistrationsFn = func(string) (*serviceregistration.AllocRegistration, error) { if atomic.AddUint64(&called, 1) == 1 { return nil, nil @@ -236,9 +376,9 @@ func TestTracker_Checks_Unhealthy(t *testing.T) { ctx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() + checks := checkstore.NewStore(logger, state.NewMemDB(logger)) checkInterval := 10 * time.Millisecond - tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, - time.Millisecond, true) + tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true) tracker.checkLookupInterval = checkInterval tracker.Start() @@ -249,9 +389,9 @@ func TestTracker_Checks_Unhealthy(t *testing.T) { require.NoError(t, err) }) - tracker.l.Lock() + tracker.lock.Lock() require.False(t, tracker.checksHealthy) - tracker.l.Unlock() + tracker.lock.Unlock() select { case v := <-tracker.HealthyCh(): @@ -270,8 +410,7 @@ func TestTracker_Healthy_IfBothTasksAndConsulChecksAreHealthy(t *testing.T) { ctx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() - tracker := NewTracker(ctx, logger, alloc, nil, nil, - time.Millisecond, true) + tracker := NewTracker(ctx, logger, alloc, nil, nil, nil, time.Millisecond, true) assertNoHealth := func() { require.NoError(t, tracker.ctx.Err()) @@ -362,7 +501,7 @@ func TestTracker_Checks_Healthy_Before_TaskHealth(t *testing.T) { // Don't reply on the first 
call var called uint64 - consul := regMock.NewServiceRegistrationHandler(logger) + consul := regmock.NewServiceRegistrationHandler(logger) consul.AllocRegistrationsFn = func(string) (*serviceregistration.AllocRegistration, error) { if atomic.AddUint64(&called, 1) == 1 { return nil, nil @@ -378,9 +517,9 @@ func TestTracker_Checks_Healthy_Before_TaskHealth(t *testing.T) { ctx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() + checks := checkstore.NewStore(logger, state.NewMemDB(logger)) checkInterval := 10 * time.Millisecond - tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, - time.Millisecond, true) + tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true) tracker.checkLookupInterval = checkInterval tracker.Start() @@ -419,7 +558,7 @@ func TestTracker_Checks_Healthy_Before_TaskHealth(t *testing.T) { } -func TestTracker_Checks_OnUpdate(t *testing.T) { +func TestTracker_ConsulChecks_OnUpdate(t *testing.T) { ci.Parallel(t) cases := []struct { @@ -504,7 +643,7 @@ func TestTracker_Checks_OnUpdate(t *testing.T) { // Don't reply on the first call var called uint64 - consul := regMock.NewServiceRegistrationHandler(logger) + consul := regmock.NewServiceRegistrationHandler(logger) consul.AllocRegistrationsFn = func(string) (*serviceregistration.AllocRegistration, error) { if atomic.AddUint64(&called, 1) == 1 { return nil, nil @@ -520,9 +659,9 @@ func TestTracker_Checks_OnUpdate(t *testing.T) { ctx, cancelFn := context.WithCancel(context.Background()) defer cancelFn() + checks := checkstore.NewStore(logger, state.NewMemDB(logger)) checkInterval := 10 * time.Millisecond - tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, - time.Millisecond, true) + tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true) tracker.checkLookupInterval = checkInterval tracker.Start() @@ -548,3 +687,120 @@ func TestTracker_Checks_OnUpdate(t *testing.T) { }) } } + +func TestTracker_NomadChecks_OnUpdate(t *testing.T) { + ci.Parallel(t) + + cases := []struct { + name string + checkMode structs.CheckMode + checkResult structs.CheckStatus + expectedPass bool + }{ + { + name: "mode is healthiness and check is healthy", + checkMode: structs.Healthiness, + checkResult: structs.CheckSuccess, + expectedPass: true, + }, + { + name: "mode is healthiness and check is unhealthy", + checkMode: structs.Healthiness, + checkResult: structs.CheckFailure, + expectedPass: false, + }, + { + name: "mode is readiness and check is healthy", + checkMode: structs.Readiness, + checkResult: structs.CheckSuccess, + expectedPass: true, + }, + { + name: "mode is readiness and check is healthy", + checkMode: structs.Readiness, + checkResult: structs.CheckFailure, + expectedPass: true, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + alloc := mock.Alloc() + alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up + alloc.Job.TaskGroups[0].Tasks[0].Services[0].Provider = "nomad" + + logger := testlog.HCLogger(t) + b := cstructs.NewAllocBroadcaster(logger) + defer b.Close() + + // Synthesize running alloc and tasks + alloc.ClientStatus = structs.AllocClientStatusRunning + alloc.TaskStates = map[string]*structs.TaskState{ + alloc.Job.TaskGroups[0].Tasks[0].Name: { + State: structs.TaskStateRunning, + StartedAt: time.Now(), + }, + } + + // Set a check that is pending + checks := checkstore.NewStore(logger, state.NewMemDB(logger)) + err := checks.Set(alloc.ID, &structs.CheckQueryResult{ + ID: 
"abc123", + Mode: tc.checkMode, + Status: structs.CheckPending, + Output: "nomad: waiting to run", + Timestamp: time.Now().Unix(), + Group: alloc.TaskGroup, + Task: alloc.Job.TaskGroups[0].Tasks[0].Name, + Service: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name, + Check: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name, + }) + must.NoError(t, err) + + go func() { + // wait a bit then update the check to passing + time.Sleep(15 * time.Millisecond) + must.NoError(t, checks.Set(alloc.ID, &structs.CheckQueryResult{ + ID: "abc123", + Mode: tc.checkMode, + Status: tc.checkResult, + Output: "some output", + Timestamp: time.Now().Unix(), + Group: alloc.TaskGroup, + Task: alloc.Job.TaskGroups[0].Tasks[0].Name, + Service: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name, + Check: alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name, + })) + }() + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + consul := regmock.NewServiceRegistrationHandler(logger) + minHealthyTime := 1 * time.Millisecond + tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, minHealthyTime, true) + tracker.checkLookupInterval = 10 * time.Millisecond + tracker.Start() + + select { + case <-time.After(8 * tracker.checkLookupInterval): + if !tc.expectedPass { + // tracker should still be running + must.NoError(t, tracker.ctx.Err()) + return + } + t.Fatal("timed out while waiting for health") + case h := <-tracker.HealthyCh(): + require.True(t, h) + } + + // For healthy checks, the tracker should stop watching + select { + case <-tracker.ctx.Done(): + // Ok, tracker should exit after reporting healthy + default: + t.Fatal("expected tracker to exit after reporting healthy") + } + }) + } +} diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index f8eaf8aa86aa..624c3145e9b3 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -22,6 +22,7 @@ import ( "github.com/hashicorp/nomad/client/pluginmanager/csimanager" "github.com/hashicorp/nomad/client/pluginmanager/drivermanager" "github.com/hashicorp/nomad/client/serviceregistration" + "github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore" "github.com/hashicorp/nomad/client/serviceregistration/wrapper" cstate "github.com/hashicorp/nomad/client/state" cstructs "github.com/hashicorp/nomad/client/structs" @@ -41,6 +42,7 @@ type allocRunner struct { // Logger is the logger for the alloc runner. logger log.Logger + // clientConfig is the client configuration block. clientConfig *config.Config // stateUpdater is used to emit updated alloc state @@ -183,6 +185,9 @@ type allocRunner struct { // to perform service and check registration and deregistration. serviceRegWrapper *wrapper.HandlerWrapper + // checkStore contains check status information + checkStore checkstore.Shim + // getter is an interface for retrieving artifacts. 
getter cinterfaces.ArtifactGetter } @@ -229,6 +234,7 @@ func NewAllocRunner(config *Config) (*allocRunner, error) { serversContactedCh: config.ServersContactedCh, rpcClient: config.RPCClient, serviceRegWrapper: config.ServiceRegWrapper, + checkStore: config.CheckStore, getter: config.Getter, } diff --git a/client/allocrunner/alloc_runner_hooks.go b/client/allocrunner/alloc_runner_hooks.go index 30611b394aae..0069bb559002 100644 --- a/client/allocrunner/alloc_runner_hooks.go +++ b/client/allocrunner/alloc_runner_hooks.go @@ -149,21 +149,22 @@ func (ar *allocRunner) initRunnerHooks(config *clientconfig.Config) error { newCgroupHook(ar.Alloc(), ar.cpusetManager), newUpstreamAllocsHook(hookLogger, ar.prevAllocWatcher), newDiskMigrationHook(hookLogger, ar.prevAllocMigrator, ar.allocDir), - newAllocHealthWatcherHook(hookLogger, alloc, hs, ar.Listener(), ar.consulClient), + newAllocHealthWatcherHook(hookLogger, alloc, hs, ar.Listener(), ar.consulClient, ar.checkStore), newNetworkHook(hookLogger, ns, alloc, nm, nc, ar, builtTaskEnv), newGroupServiceHook(groupServiceHookConfig{ - alloc: alloc, - namespace: alloc.ServiceProviderNamespace(), - serviceRegWrapper: ar.serviceRegWrapper, - restarter: ar, - taskEnvBuilder: envBuilder, - networkStatusGetter: ar, - logger: hookLogger, - shutdownDelayCtx: ar.shutdownDelayCtx, + alloc: alloc, + namespace: alloc.ServiceProviderNamespace(), + serviceRegWrapper: ar.serviceRegWrapper, + restarter: ar, + taskEnvBuilder: envBuilder, + networkStatus: ar, + logger: hookLogger, + shutdownDelayCtx: ar.shutdownDelayCtx, }), newConsulGRPCSocketHook(hookLogger, alloc, ar.allocDir, config.ConsulConfig), newConsulHTTPSocketHook(hookLogger, alloc, ar.allocDir, config.ConsulConfig), newCSIHook(alloc, hookLogger, ar.csiManager, ar.rpcClient, ar, hrs, ar.clientConfig.Node.SecretID), + newChecksHook(hookLogger, alloc, ar.checkStore, ar), } return nil diff --git a/client/allocrunner/checks_hook.go b/client/allocrunner/checks_hook.go new file mode 100644 index 000000000000..19ea439e7a34 --- /dev/null +++ b/client/allocrunner/checks_hook.go @@ -0,0 +1,270 @@ +package allocrunner + +import ( + "context" + "sync" + "time" + + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/client/allocrunner/interfaces" + "github.com/hashicorp/nomad/client/serviceregistration/checks" + "github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/nomad/structs" +) + +const ( + // checksHookName is the name of this hook as appears in logs + checksHookName = "checks_hook" +) + +// observers maintains a map from check_id -> observer for a particular check. Each +// observer in the map must share the same context. +type observers map[structs.CheckID]*observer + +// An observer is used to execute a particular check on its interval and update the +// check store with those results. 
+type observer struct { + ctx context.Context + cancel context.CancelFunc + checker checks.Checker + checkStore checkstore.Shim + + qc *checks.QueryContext + check *structs.ServiceCheck + allocID string +} + +// start checking our check on its interval +func (o *observer) start() { + // compromise between immediate (too early) and waiting full interval (slow) + firstWait := o.check.Interval / 2 + + timer, cancel := helper.NewSafeTimer(firstWait) + defer cancel() + + for { + select { + + // exit the observer + case <-o.ctx.Done(): + return + + // time to execute the check + case <-timer.C: + query := checks.GetCheckQuery(o.check) + result := o.checker.Do(o.ctx, o.qc, query) + + // and put the results into the store + _ = o.checkStore.Set(o.allocID, result) + + // setup timer for next interval + timer.Reset(o.check.Interval) + } + } +} + +// stop checking our check - this will also interrupt an in-progress execution +func (o *observer) stop() { + o.cancel() +} + +// checksHook manages checks of Nomad service registrations, at both the group and +// task level, by storing / removing them from the Client state store. +// +// Does not manage Consul service checks; see groupServiceHook instead. +type checksHook struct { + logger hclog.Logger + network structs.NetworkStatus + shim checkstore.Shim + checker checks.Checker + allocID string + + // fields that get re-initialized on allocation update + lock sync.RWMutex + ctx context.Context + stop func() + observers observers + alloc *structs.Allocation +} + +func newChecksHook( + logger hclog.Logger, + alloc *structs.Allocation, + shim checkstore.Shim, + network structs.NetworkStatus, +) *checksHook { + h := &checksHook{ + logger: logger.Named(checksHookName), + allocID: alloc.ID, + alloc: alloc, + shim: shim, + network: network, + checker: checks.New(logger), + } + h.initialize(alloc) + return h +} + +// initialize the dynamic fields of checksHook, which is to say setup all the +// observers and query context things associated with the alloc. +// +// Should be called during initial setup only. +func (h *checksHook) initialize(alloc *structs.Allocation) { + h.lock.Lock() + defer h.lock.Unlock() + + tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) + if tg == nil { + return + } + + // fresh context and stop function for this allocation + h.ctx, h.stop = context.WithCancel(context.Background()) + + // fresh set of observers + h.observers = make(observers) + + // set the initial alloc + h.alloc = alloc +} + +// observe will create the observer for each service in services. +// services must use only nomad service provider. +// +// Caller must hold h.lock. 
+func (h *checksHook) observe(alloc *structs.Allocation, services []*structs.Service) { + var ports structs.AllocatedPorts + var networks structs.Networks + if alloc.AllocatedResources != nil { + ports = alloc.AllocatedResources.Shared.Ports + networks = alloc.AllocatedResources.Shared.Networks + } + + for _, service := range services { + for _, check := range service.Checks { + + // remember the initialization time + now := time.Now().UTC().Unix() + + // create the deterministic check id for this check + id := structs.NomadCheckID(alloc.ID, alloc.TaskGroup, check) + + // an observer for this check already exists + if _, exists := h.observers[id]; exists { + continue + } + + ctx, cancel := context.WithCancel(h.ctx) + + // create the observer for this check + h.observers[id] = &observer{ + ctx: ctx, + cancel: cancel, + check: check.Copy(), + checkStore: h.shim, + checker: h.checker, + allocID: h.allocID, + qc: &checks.QueryContext{ + ID: id, + CustomAddress: service.Address, + ServicePortLabel: service.PortLabel, + Ports: ports, + Networks: networks, + NetworkStatus: h.network, + Group: alloc.Name, + Task: service.TaskName, + Service: service.Name, + Check: check.Name, + }, + } + + // insert a pending result into state store for each check + result := checks.Stub(id, structs.GetCheckMode(check), now, alloc.Name, service.TaskName, service.Name, check.Name) + if err := h.shim.Set(h.allocID, result); err != nil { + h.logger.Error("failed to set initial check status", "id", h.allocID, "error", err) + continue + } + + // start the observer + go h.observers[id].start() + } + } +} + +func (h *checksHook) Name() string { + return checksHookName +} + +func (h *checksHook) Prerun() error { + h.lock.Lock() + defer h.lock.Unlock() + + group := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup) + if group == nil { + return nil + } + + // create and start observers of nomad service checks in alloc + h.observe(h.alloc, group.NomadServices()) + + return nil +} + +func (h *checksHook) Update(request *interfaces.RunnerUpdateRequest) error { + h.lock.Lock() + defer h.lock.Unlock() + + group := request.Alloc.Job.LookupTaskGroup(request.Alloc.TaskGroup) + if group == nil { + return nil + } + + // get all group and task level services using nomad provider + services := group.NomadServices() + + // create a set of the updated set of checks + next := make([]structs.CheckID, 0, len(h.observers)) + for _, service := range services { + for _, check := range service.Checks { + next = append(next, structs.NomadCheckID( + request.Alloc.ID, + request.Alloc.TaskGroup, + check, + )) + } + } + + // stop the observers of the checks we are removing + remove := h.shim.Difference(request.Alloc.ID, next) + for _, id := range remove { + h.observers[id].stop() + delete(h.observers, id) + } + + // remove checks that are no longer part of the allocation + if err := h.shim.Remove(request.Alloc.ID, remove); err != nil { + return err + } + + // remember this new alloc + h.alloc = request.Alloc + + // ensure we are observing new checks (idempotent) + h.observe(request.Alloc, services) + + return nil +} + +func (h *checksHook) PreKill() { + h.lock.Lock() + defer h.lock.Unlock() + + // terminate our hook context, which threads down to all observers + h.stop() + + // purge all checks for this allocation from the client state store + if err := h.shim.Purge(h.allocID); err != nil { + h.logger.Error("failed to purge check results", "alloc_id", h.allocID, "error", err) + } +} diff --git a/client/allocrunner/checks_hook_test.go 
b/client/allocrunner/checks_hook_test.go new file mode 100644 index 000000000000..b204c2e8059b --- /dev/null +++ b/client/allocrunner/checks_hook_test.go @@ -0,0 +1,312 @@ +package allocrunner + +import ( + "fmt" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/allocrunner/interfaces" + "github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore" + "github.com/hashicorp/nomad/client/state" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/testutil" + "github.com/shoenig/test/must" +) + +var ( + _ interfaces.RunnerPrerunHook = (*checksHook)(nil) + _ interfaces.RunnerUpdateHook = (*checksHook)(nil) + _ interfaces.RunnerPreKillHook = (*checksHook)(nil) +) + +func makeCheckStore(logger hclog.Logger) checkstore.Shim { + db := state.NewMemDB(logger) + checkStore := checkstore.NewStore(logger, db) + return checkStore +} + +func allocWithNomadChecks(addr, port string, onGroup bool) *structs.Allocation { + alloc := mock.Alloc() + group := alloc.Job.LookupTaskGroup(alloc.TaskGroup) + + task := "task-one" + if onGroup { + task = "" + } + + services := []*structs.Service{ + { + Name: "service-one", + TaskName: "web", + PortLabel: port, + AddressMode: "auto", + Address: addr, + Provider: "nomad", + Checks: []*structs.ServiceCheck{ + { + Name: "check-ok", + Type: "http", + Path: "/", + Protocol: "http", + PortLabel: port, + AddressMode: "auto", + Interval: 250 * time.Millisecond, + Timeout: 1 * time.Second, + Method: "GET", + TaskName: task, + }, + { + Name: "check-error", + Type: "http", + Path: "/fail", + Protocol: "http", + PortLabel: port, + AddressMode: "auto", + Interval: 250 * time.Millisecond, + Timeout: 1 * time.Second, + Method: "GET", + TaskName: task, + }, + { + Name: "check-hang", + Type: "http", + Path: "/hang", + Protocol: "http", + PortLabel: port, + AddressMode: "auto", + Interval: 250 * time.Millisecond, + Timeout: 500 * time.Millisecond, + Method: "GET", + TaskName: task, + }, + }, + }, + } + + switch onGroup { + case true: + group.Tasks[0].Services = nil + group.Services = services + case false: + group.Services = nil + group.Tasks[0].Services = services + } + return alloc +} + +func allocWithDifferentNomadChecks(id, addr, port string) *structs.Allocation { + alloc := allocWithNomadChecks(addr, port, true) + alloc.ID = id + group := alloc.Job.LookupTaskGroup(alloc.TaskGroup) + + group.Services[0].Checks[2].Path = "/" // the hanging check is now ok + + // append 4th check, this one is failing + group.Services[0].Checks = append(group.Services[0].Checks, &structs.ServiceCheck{ + Name: "check-error-2", + Type: "http", + Path: "/fail", + Protocol: "http", + PortLabel: port, + AddressMode: "auto", + Interval: 250 * time.Millisecond, + Timeout: 1 * time.Second, + Method: "GET", + }) + return alloc +} + +var checkHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/fail": + w.WriteHeader(500) + _, _ = io.WriteString(w, "500 problem") + case "/hang": + time.Sleep(2 * time.Second) + _, _ = io.WriteString(w, "too slow") + default: + w.WriteHeader(200) + _, _ = io.WriteString(w, "200 ok") + } +}) + +func TestCheckHook_Checks_ResultsSet(t *testing.T) { + ci.Parallel(t) + + logger := testlog.HCLogger(t) + + // create an http server with various responses + ts := 
httptest.NewServer(checkHandler) + defer ts.Close() + + cases := []struct { + name string + onGroup bool + }{ + {name: "group-level", onGroup: true}, + {name: "task-level", onGroup: false}, + } + + for _, tc := range cases { + checkStore := makeCheckStore(logger) + + // get the address and port for http server + tokens := strings.Split(ts.URL, ":") + addr, port := strings.TrimPrefix(tokens[1], "//"), tokens[2] + + network := mock.NewNetworkStatus(addr) + + alloc := allocWithNomadChecks(addr, port, tc.onGroup) + + h := newChecksHook(logger, alloc, checkStore, network) + + // initialize is called; observers are created but not started yet + must.MapEmpty(t, h.observers) + + // calling pre-run starts the observers + err := h.Prerun() + must.NoError(t, err) + + testutil.WaitForResultUntil( + 2*time.Second, + func() (bool, error) { + results := checkStore.List(alloc.ID) + passing, failing, pending := 0, 0, 0 + for _, result := range results { + switch result.Status { + case structs.CheckSuccess: + passing++ + case structs.CheckFailure: + failing++ + case structs.CheckPending: + pending++ + } + } + if passing != 1 || failing != 2 || pending != 0 { + fmt.Printf("results %v\n", results) + return false, fmt.Errorf( + "expected 1 passing, 2 failing, 0 pending, got %d passing, %d failing, %d pending", + passing, failing, pending, + ) + } + return true, nil + }, + func(err error) { + t.Fatalf(err.Error()) + }, + ) + + h.PreKill() // stop observers, cleanup + + // assert shim no longer contains results for the alloc + results := checkStore.List(alloc.ID) + must.MapEmpty(t, results) + } +} + +func TestCheckHook_Checks_UpdateSet(t *testing.T) { + ci.Parallel(t) + + logger := testlog.HCLogger(t) + + // create an http server with various responses + ts := httptest.NewServer(checkHandler) + defer ts.Close() + + // get the address and port for http server + tokens := strings.Split(ts.URL, ":") + addr, port := strings.TrimPrefix(tokens[1], "//"), tokens[2] + + shim := makeCheckStore(logger) + + network := mock.NewNetworkStatus(addr) + + alloc := allocWithNomadChecks(addr, port, true) + + h := newChecksHook(logger, alloc, shim, network) + + // calling pre-run starts the observers + err := h.Prerun() + must.NoError(t, err) + + // initial set of checks + testutil.WaitForResultUntil( + 2*time.Second, + func() (bool, error) { + results := shim.List(alloc.ID) + passing, failing, pending := 0, 0, 0 + for _, result := range results { + switch result.Status { + case structs.CheckSuccess: + passing++ + case structs.CheckFailure: + failing++ + case structs.CheckPending: + pending++ + } + } + if passing != 1 || failing != 2 || pending != 0 { + fmt.Printf("results %v\n", results) + return false, fmt.Errorf( + "(initial set) expected 1 passing, 2 failing, 0 pending, got %d passing, %d failing, %d pending", + passing, failing, pending, + ) + } + return true, nil + }, + func(err error) { + t.Fatalf(err.Error()) + }, + ) + + request := &interfaces.RunnerUpdateRequest{ + Alloc: allocWithDifferentNomadChecks(alloc.ID, addr, port), + } + + err = h.Update(request) + must.NoError(t, err) + + // updated set of checks + testutil.WaitForResultUntil( + 2*time.Second, + func() (bool, error) { + results := shim.List(alloc.ID) + passing, failing, pending := 0, 0, 0 + for _, result := range results { + switch result.Status { + case structs.CheckSuccess: + passing++ + case structs.CheckFailure: + failing++ + case structs.CheckPending: + pending++ + } + } + if passing != 2 || failing != 2 || pending != 0 { + fmt.Printf("results %v\n", 
results) + return false, fmt.Errorf( + "(updated set) expected 2 passing, 2 failing, 0 pending, got %d passing, %d failing, %d pending", + passing, failing, pending, + ) + } + return true, nil + }, + func(err error) { + t.Fatalf(err.Error()) + }, + ) + + h.PreKill() // stop observers, cleanup + + // assert shim no longer contains results for the alloc + results := shim.List(alloc.ID) + must.MapEmpty(t, results) +} diff --git a/client/allocrunner/config.go b/client/allocrunner/config.go index 99d0490ef134..4b9cbc2e23a0 100644 --- a/client/allocrunner/config.go +++ b/client/allocrunner/config.go @@ -12,6 +12,7 @@ import ( "github.com/hashicorp/nomad/client/pluginmanager/csimanager" "github.com/hashicorp/nomad/client/pluginmanager/drivermanager" "github.com/hashicorp/nomad/client/serviceregistration" + "github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore" "github.com/hashicorp/nomad/client/serviceregistration/wrapper" cstate "github.com/hashicorp/nomad/client/state" "github.com/hashicorp/nomad/client/vaultclient" @@ -87,6 +88,9 @@ type Config struct { // to perform service and check registration and deregistration. ServiceRegWrapper *wrapper.HandlerWrapper + // CheckStore contains check result information. + CheckStore checkstore.Shim + // Getter is an interface for retrieving artifacts. Getter interfaces.ArtifactGetter } diff --git a/client/allocrunner/groupservice_hook.go b/client/allocrunner/group_service_hook.go similarity index 82% rename from client/allocrunner/groupservice_hook.go rename to client/allocrunner/group_service_hook.go index 3e567537bb6a..866303ea0c98 100644 --- a/client/allocrunner/groupservice_hook.go +++ b/client/allocrunner/group_service_hook.go @@ -11,6 +11,7 @@ import ( "github.com/hashicorp/nomad/client/serviceregistration/wrapper" "github.com/hashicorp/nomad/client/taskenv" agentconsul "github.com/hashicorp/nomad/command/agent/consul" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" ) @@ -18,21 +19,17 @@ const ( groupServiceHookName = "group_services" ) -type networkStatusGetter interface { - NetworkStatus() *structs.AllocNetworkStatus -} - // groupServiceHook manages task group Consul service registration and // deregistration. type groupServiceHook struct { - allocID string - jobID string - group string - restarter agentconsul.WorkloadRestarter - prerun bool - deregistered bool - networkStatusGetter networkStatusGetter - shutdownDelayCtx context.Context + allocID string + jobID string + group string + restarter agentconsul.WorkloadRestarter + prerun bool + deregistered bool + networkStatus structs.NetworkStatus + shutdownDelayCtx context.Context // namespace is the Nomad or Consul namespace in which service // registrations will be made. This field may be updated. @@ -58,12 +55,12 @@ type groupServiceHook struct { } type groupServiceHookConfig struct { - alloc *structs.Allocation - restarter agentconsul.WorkloadRestarter - taskEnvBuilder *taskenv.Builder - networkStatusGetter networkStatusGetter - shutdownDelayCtx context.Context - logger log.Logger + alloc *structs.Allocation + restarter agentconsul.WorkloadRestarter + taskEnvBuilder *taskenv.Builder + networkStatus structs.NetworkStatus + shutdownDelayCtx context.Context + logger log.Logger // namespace is the Nomad or Consul namespace in which service // registrations will be made. 
@@ -83,18 +80,18 @@ func newGroupServiceHook(cfg groupServiceHookConfig) *groupServiceHook { } h := &groupServiceHook{ - allocID: cfg.alloc.ID, - jobID: cfg.alloc.JobID, - group: cfg.alloc.TaskGroup, - restarter: cfg.restarter, - namespace: cfg.namespace, - taskEnvBuilder: cfg.taskEnvBuilder, - delay: shutdownDelay, - networkStatusGetter: cfg.networkStatusGetter, - logger: cfg.logger.Named(groupServiceHookName), - serviceRegWrapper: cfg.serviceRegWrapper, - services: tg.Services, - shutdownDelayCtx: cfg.shutdownDelayCtx, + allocID: cfg.alloc.ID, + jobID: cfg.alloc.JobID, + group: cfg.alloc.TaskGroup, + restarter: cfg.restarter, + namespace: cfg.namespace, + taskEnvBuilder: cfg.taskEnvBuilder, + delay: shutdownDelay, + networkStatus: cfg.networkStatus, + logger: cfg.logger.Named(groupServiceHookName), + serviceRegWrapper: cfg.serviceRegWrapper, + services: tg.Services, + shutdownDelayCtx: cfg.shutdownDelayCtx, } if cfg.alloc.AllocatedResources != nil { @@ -210,10 +207,13 @@ func (h *groupServiceHook) preKillLocked() { h.logger.Debug("delay before killing tasks", "group", h.group, "shutdown_delay", h.delay) + timer, cancel := helper.NewSafeTimer(h.delay) + defer cancel() + select { // Wait for specified shutdown_delay unless ignored // This will block an agent from shutting down. - case <-time.After(h.delay): + case <-timer.C: case <-h.shutdownDelayCtx.Done(): } } @@ -241,8 +241,8 @@ func (h *groupServiceHook) getWorkloadServices() *serviceregistration.WorkloadSe interpolatedServices := taskenv.InterpolateServices(h.taskEnvBuilder.Build(), h.services) var netStatus *structs.AllocNetworkStatus - if h.networkStatusGetter != nil { - netStatus = h.networkStatusGetter.NetworkStatus() + if h.networkStatus != nil { + netStatus = h.networkStatus.NetworkStatus() } // Create task services struct with request's driver metadata diff --git a/client/allocrunner/groupservice_hook_test.go b/client/allocrunner/group_service_hook_test.go similarity index 100% rename from client/allocrunner/groupservice_hook_test.go rename to client/allocrunner/group_service_hook_test.go diff --git a/client/allocrunner/health_hook.go b/client/allocrunner/health_hook.go index 4c2bef43ab90..bf799a97bf20 100644 --- a/client/allocrunner/health_hook.go +++ b/client/allocrunner/health_hook.go @@ -6,35 +6,39 @@ import ( "sync" "time" - log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/allochealth" "github.com/hashicorp/nomad/client/allocrunner/interfaces" "github.com/hashicorp/nomad/client/serviceregistration" + "github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore" cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/nomad/structs" ) -// healthMutator is able to set/clear alloc health. +// healthSetter is able to set/clear alloc health. type healthSetter interface { // HasHealth returns true if health is already set. HasHealth() bool - // Set health via the mutator + // SetHealth via the mutator. SetHealth(healthy, isDeploy bool, taskEvents map[string]*structs.TaskEvent) - // Clear health when the deployment ID changes + // ClearHealth for when the deployment ID changes. ClearHealth() } // allocHealthWatcherHook is responsible for watching an allocation's task // status and (optionally) Consul health check status to determine if the -// allocation is health or unhealthy. Used by deployments and migrations. +// allocation is healthy or unhealthy. Used by deployments and migrations. 
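Aside on the preKillLocked change above: swapping time.After for helper.NewSafeTimer means the shutdown_delay timer is released when the wait is interrupted instead of lingering until it fires. A minimal sketch of the pattern, assuming (as the hunk above shows) NewSafeTimer returns a timer with a C channel plus a cancel func:

// waitOrCancel waits the full delay unless done fires first; sketch only.
func waitOrCancel(delay time.Duration, done <-chan struct{}) {
	timer, cancel := helper.NewSafeTimer(delay)
	defer cancel() // release the timer even if done fires first

	select {
	case <-timer.C: // waited the full shutdown_delay
	case <-done: // interrupted, e.g. shutdown context cancelled
	}
}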
type allocHealthWatcherHook struct { healthSetter healthSetter - // consul client used to monitor health checks + // consul client used to monitor Consul service health checks consul serviceregistration.Handler + // checkStore is used to monitor Nomad service health checks + checkStore checkstore.Shim + // listener is given to trackers to listen for alloc updates and closed // when the alloc is destroyed. listener *cstructs.AllocListener @@ -64,11 +68,11 @@ type allocHealthWatcherHook struct { // hold hookLock to access. isDeploy bool - logger log.Logger + logger hclog.Logger } -func newAllocHealthWatcherHook(logger log.Logger, alloc *structs.Allocation, hs healthSetter, - listener *cstructs.AllocListener, consul serviceregistration.Handler) interfaces.RunnerHook { +func newAllocHealthWatcherHook(logger hclog.Logger, alloc *structs.Allocation, hs healthSetter, + listener *cstructs.AllocListener, consul serviceregistration.Handler, checkStore checkstore.Shim) interfaces.RunnerHook { // Neither deployments nor migrations care about the health of // non-service jobs so never watch their health @@ -85,6 +89,7 @@ func newAllocHealthWatcherHook(logger log.Logger, alloc *structs.Allocation, hs cancelFn: func() {}, // initialize to prevent nil func panics watchDone: closedDone, consul: consul, + checkStore: checkStore, healthSetter: hs, listener: listener, } @@ -132,8 +137,9 @@ func (h *allocHealthWatcherHook) init() error { h.logger.Trace("watching", "deadline", deadline, "checks", useChecks, "min_healthy_time", minHealthyTime) // Create a new tracker, start it, and watch for health results. - tracker := allochealth.NewTracker(ctx, h.logger, h.alloc, - h.listener, h.consul, minHealthyTime, useChecks) + tracker := allochealth.NewTracker( + ctx, h.logger, h.alloc, h.listener, h.consul, h.checkStore, minHealthyTime, useChecks, + ) tracker.Start() // Create a new done chan and start watching for health updates @@ -194,7 +200,7 @@ func (h *allocHealthWatcherHook) Postrun() error { func (h *allocHealthWatcherHook) Shutdown() { // Same as Postrun - h.Postrun() + _ = h.Postrun() } // watchHealth watches alloc health until it is set, the alloc is stopped, the diff --git a/client/allocrunner/health_hook_test.go b/client/allocrunner/health_hook_test.go index a14e55ab68c6..3d885dffd06a 100644 --- a/client/allocrunner/health_hook_test.go +++ b/client/allocrunner/health_hook_test.go @@ -96,7 +96,8 @@ func TestHealthHook_PrerunPostrun(t *testing.T) { consul := regMock.NewServiceRegistrationHandler(logger) hs := &mockHealthSetter{} - h := newAllocHealthWatcherHook(logger, mock.Alloc(), hs, b.Listen(), consul) + checks := new(mock.CheckShim) + h := newAllocHealthWatcherHook(logger, mock.Alloc(), hs, b.Listen(), consul, checks) // Assert we implemented the right interfaces prerunh, ok := h.(interfaces.RunnerPrerunHook) @@ -134,7 +135,8 @@ func TestHealthHook_PrerunUpdatePostrun(t *testing.T) { consul := regMock.NewServiceRegistrationHandler(logger) hs := &mockHealthSetter{} - h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul).(*allocHealthWatcherHook) + checks := new(mock.CheckShim) + h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul, checks).(*allocHealthWatcherHook) // Prerun require.NoError(h.Prerun()) @@ -173,7 +175,8 @@ func TestHealthHook_UpdatePrerunPostrun(t *testing.T) { consul := regMock.NewServiceRegistrationHandler(logger) hs := &mockHealthSetter{} - h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul).(*allocHealthWatcherHook) 
+ checks := new(mock.CheckShim) + h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul, checks).(*allocHealthWatcherHook) // Set a DeploymentID to cause ClearHealth to be called alloc.DeploymentID = uuid.Generate() @@ -214,7 +217,8 @@ func TestHealthHook_Postrun(t *testing.T) { consul := regMock.NewServiceRegistrationHandler(logger) hs := &mockHealthSetter{} - h := newAllocHealthWatcherHook(logger, mock.Alloc(), hs, b.Listen(), consul).(*allocHealthWatcherHook) + checks := new(mock.CheckShim) + h := newAllocHealthWatcherHook(logger, mock.Alloc(), hs, b.Listen(), consul, checks).(*allocHealthWatcherHook) // Postrun require.NoError(h.Postrun()) @@ -280,7 +284,8 @@ func TestHealthHook_SetHealth_healthy(t *testing.T) { hs := newMockHealthSetter() - h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul).(*allocHealthWatcherHook) + checks := new(mock.CheckShim) + h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul, checks).(*allocHealthWatcherHook) // Prerun require.NoError(h.Prerun()) @@ -368,7 +373,8 @@ func TestHealthHook_SetHealth_unhealthy(t *testing.T) { hs := newMockHealthSetter() - h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul).(*allocHealthWatcherHook) + checks := new(mock.CheckShim) + h := newAllocHealthWatcherHook(logger, alloc.Copy(), hs, b.Listen(), consul, checks).(*allocHealthWatcherHook) // Prerun require.NoError(h.Prerun()) @@ -389,7 +395,7 @@ func TestHealthHook_SetHealth_unhealthy(t *testing.T) { func TestHealthHook_SystemNoop(t *testing.T) { ci.Parallel(t) - h := newAllocHealthWatcherHook(testlog.HCLogger(t), mock.SystemAlloc(), nil, nil, nil) + h := newAllocHealthWatcherHook(testlog.HCLogger(t), mock.SystemAlloc(), nil, nil, nil, nil) // Assert that it's the noop impl _, ok := h.(noopAllocHealthWatcherHook) @@ -410,7 +416,7 @@ func TestHealthHook_SystemNoop(t *testing.T) { func TestHealthHook_BatchNoop(t *testing.T) { ci.Parallel(t) - h := newAllocHealthWatcherHook(testlog.HCLogger(t), mock.BatchAlloc(), nil, nil, nil) + h := newAllocHealthWatcherHook(testlog.HCLogger(t), mock.BatchAlloc(), nil, nil, nil, nil) // Assert that it's the noop impl _, ok := h.(noopAllocHealthWatcherHook) diff --git a/client/allocrunner/interfaces/runner_lifecycle.go b/client/allocrunner/interfaces/runner_lifecycle.go index 7855deaa3f4d..392aae7c82a8 100644 --- a/client/allocrunner/interfaces/runner_lifecycle.go +++ b/client/allocrunner/interfaces/runner_lifecycle.go @@ -4,19 +4,19 @@ import ( "github.com/hashicorp/nomad/nomad/structs" ) -// RunnnerHook is a lifecycle hook into the life cycle of an allocation runner. +// RunnerHook is a lifecycle hook into the life cycle of an allocation runner. type RunnerHook interface { Name() string } -// RunnerPrerunHooks are executed before calling TaskRunner.Run for +// A RunnerPrerunHook is executed before calling TaskRunner.Run for // non-terminal allocations. Terminal allocations do *not* call prerun. type RunnerPrerunHook interface { RunnerHook Prerun() error } -// RunnerPreKillHooks are executed inside of KillTasks before +// A RunnerPreKillHook is executed inside of KillTasks before // iterating and killing each task. It will run before the Leader // task is killed. type RunnerPreKillHook interface { @@ -25,7 +25,7 @@ type RunnerPreKillHook interface { PreKill() } -// RunnerPostrunHooks are executed after calling TaskRunner.Run, even for +// A RunnerPostrunHook is executed after calling TaskRunner.Run, even for // terminal allocations. 
Therefore Postrun hooks must be safe to call without // first calling Prerun hooks. type RunnerPostrunHook interface { @@ -33,7 +33,7 @@ type RunnerPostrunHook interface { Postrun() error } -// RunnerDestroyHooks are executed after AllocRunner.Run has exited and must +// A RunnerDestroyHook is executed after AllocRunner.Run has exited and must // make a best effort cleanup allocation resources. Destroy hooks must be safe // to call without first calling Prerun. type RunnerDestroyHook interface { @@ -41,10 +41,10 @@ type RunnerDestroyHook interface { Destroy() error } -// RunnerUpdateHooks are executed when an allocation update is received from +// A RunnerUpdateHook is executed when an allocation update is received from // the server. Update is called concurrently with AllocRunner execution and // therefore must be safe for concurrent access with other hook methods. Calls -// to Update are serialized so allocaiton updates will always be processed in +// to Update are serialized so allocation updates will always be processed in // order. type RunnerUpdateHook interface { RunnerHook @@ -55,7 +55,7 @@ type RunnerUpdateRequest struct { Alloc *structs.Allocation } -// RunnerTaskRestartHooks are executed just before the allocation runner is +// A RunnerTaskRestartHook is executed just before the allocation runner is // going to restart all tasks. type RunnerTaskRestartHook interface { RunnerHook diff --git a/client/allocrunner/testing.go b/client/allocrunner/testing.go index b45ef60468aa..6f2fd7b03dfb 100644 --- a/client/allocrunner/testing.go +++ b/client/allocrunner/testing.go @@ -14,6 +14,7 @@ import ( "github.com/hashicorp/nomad/client/devicemanager" "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/pluginmanager/drivermanager" + "github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore" "github.com/hashicorp/nomad/client/serviceregistration/mock" "github.com/hashicorp/nomad/client/serviceregistration/wrapper" "github.com/hashicorp/nomad/client/state" @@ -67,12 +68,14 @@ func testAllocRunnerConfig(t *testing.T, alloc *structs.Allocation) (*Config, fu consulRegMock := mock.NewServiceRegistrationHandler(clientConf.Logger) nomadRegMock := mock.NewServiceRegistrationHandler(clientConf.Logger) + stateDB := new(state.NoopDB) + conf := &Config{ // Copy the alloc in case the caller edits and reuses it Alloc: alloc.Copy(), Logger: clientConf.Logger, ClientConfig: clientConf, - StateDB: state.NoopDB{}, + StateDB: stateDB, Consul: consulRegMock, ConsulSI: consul.NewMockServiceIdentitiesClient(), Vault: vaultclient.NewMockVaultClient(), @@ -84,6 +87,7 @@ func testAllocRunnerConfig(t *testing.T, alloc *structs.Allocation) (*Config, fu CpusetManager: new(cgutil.NoopCpusetManager), ServersContactedCh: make(chan struct{}), ServiceRegWrapper: wrapper.NewHandlerWrapper(clientConf.Logger, consulRegMock, nomadRegMock), + CheckStore: checkstore.NewStore(clientConf.Logger, stateDB), Getter: getter.TestDefaultGetter(t), } diff --git a/client/allocrunner/util.go b/client/allocrunner/util.go deleted file mode 100644 index cc0b24938e4e..000000000000 --- a/client/allocrunner/util.go +++ /dev/null @@ -1 +0,0 @@ -package allocrunner diff --git a/client/client.go b/client/client.go index 4b822863d434..5d2194d3a8e4 100644 --- a/client/client.go +++ b/client/client.go @@ -37,6 +37,7 @@ import ( "github.com/hashicorp/nomad/client/pluginmanager/drivermanager" "github.com/hashicorp/nomad/client/servers" "github.com/hashicorp/nomad/client/serviceregistration" + 
"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore" "github.com/hashicorp/nomad/client/serviceregistration/nsd" "github.com/hashicorp/nomad/client/serviceregistration/wrapper" "github.com/hashicorp/nomad/client/state" @@ -166,7 +167,7 @@ type AllocRunner interface { } // Client is used to implement the client interaction with Nomad. Clients -// are expected to register as a schedulable node to the servers, and to +// are expected to register as a schedule-able node to the servers, and to // run allocations as determined by the servers. type Client struct { config *config.Config @@ -237,6 +238,10 @@ type Client struct { // registrations. nomadService serviceregistration.Handler + // checkStore is used to store group and task checks and their current pass/fail + // status. + checkStore checkstore.Shim + // serviceRegWrapper wraps the consulService and nomadService // implementations so that the alloc and task runner service hooks can call // this without needing to identify which backend provider should be used. @@ -693,6 +698,10 @@ func (c *Client) init() error { c.cpusetManager = new(cgutil.NoopCpusetManager) } } + + // setup the check store + c.checkStore = checkstore.NewStore(c.logger, c.stateDB) + return nil } @@ -1175,6 +1184,7 @@ func (c *Client) restoreState() error { DriverManager: c.drivermanager, ServersContactedCh: c.serversContactedCh, ServiceRegWrapper: c.serviceRegWrapper, + CheckStore: c.checkStore, RPCClient: c, Getter: c.getter, } @@ -2513,6 +2523,7 @@ func (c *Client) addAlloc(alloc *structs.Allocation, migrateToken string) error DeviceManager: c.devicemanager, DriverManager: c.drivermanager, ServiceRegWrapper: c.serviceRegWrapper, + CheckStore: c.checkStore, RPCClient: c, Getter: c.getter, } diff --git a/client/serviceregistration/checks/checkstore/shim.go b/client/serviceregistration/checks/checkstore/shim.go new file mode 100644 index 000000000000..f83ff512ec69 --- /dev/null +++ b/client/serviceregistration/checks/checkstore/shim.go @@ -0,0 +1,147 @@ +package checkstore + +import ( + "sync" + + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/client/serviceregistration/checks" + "github.com/hashicorp/nomad/client/state" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/nomad/structs" + "golang.org/x/exp/slices" +) + +// A Shim is used to track the latest check status information, one layer above +// the client persistent store so we can do efficient indexing, etc. +type Shim interface { + // Set the latest result for a specific check. + Set(allocID string, result *structs.CheckQueryResult) error + + // List the latest results for a specific allocation. + List(allocID string) map[structs.CheckID]*structs.CheckQueryResult + + // Difference returns the set of IDs being stored that are not in ids. + Difference(allocID string, ids []structs.CheckID) []structs.CheckID + + // Remove will remove ids from the cache and persistent store. + Remove(allocID string, ids []structs.CheckID) error + + // Purge results for a specific allocation. + Purge(allocID string) error +} + +type shim struct { + log hclog.Logger + + db state.StateDB + + lock sync.RWMutex + current checks.ClientResults +} + +// NewStore creates a new store. 
+func NewStore(log hclog.Logger, db state.StateDB) Shim { + s := &shim{ + log: log.Named("check_store"), + db: db, + current: make(checks.ClientResults), + } + s.restore() + return s +} + +func (s *shim) restore() { + s.lock.Lock() + defer s.lock.Unlock() + + results, err := s.db.GetCheckResults() + if err != nil { + s.log.Error("failed to restore health check results", "error", err) + return + } + + for id, m := range results { + s.current[id] = helper.CopyMap(m) + } +} + +func (s *shim) Set(allocID string, qr *structs.CheckQueryResult) error { + s.lock.Lock() + defer s.lock.Unlock() + + if _, exists := s.current[allocID]; !exists { + s.current[allocID] = make(map[structs.CheckID]*structs.CheckQueryResult) + } + + // lookup existing result + previous, exists := s.current[allocID][qr.ID] + + // only insert a stub if no result exists yet + if qr.Status == structs.CheckPending && exists { + return nil + } + + s.log.Trace("setting check status", "alloc_id", allocID, "check_id", qr.ID, "status", qr.Status) + + // always keep in-memory shim up to date with latest result + s.current[allocID][qr.ID] = qr + + // only update persistent store if status changes (optimization) + // on Client restart restored check results may be outdated but the status + // is the same as the most recent result + if !exists || previous.Status != qr.Status { + return s.db.PutCheckResult(allocID, qr) + } + + return nil +} + +func (s *shim) List(allocID string) map[structs.CheckID]*structs.CheckQueryResult { + s.lock.RLock() + defer s.lock.RUnlock() + + m, exists := s.current[allocID] + if !exists { + return nil + } + + return helper.CopyMap(m) +} + +func (s *shim) Purge(allocID string) error { + s.lock.Lock() + defer s.lock.Unlock() + + // remove from our map + delete(s.current, allocID) + + // remove from persistent store + return s.db.PurgeCheckResults(allocID) +} + +func (s *shim) Remove(allocID string, ids []structs.CheckID) error { + s.lock.Lock() + defer s.lock.Unlock() + + // remove from cache + for _, id := range ids { + delete(s.current[allocID], id) + } + + // remove from persistent store + return s.db.DeleteCheckResults(allocID, ids) +} + +func (s *shim) Difference(allocID string, ids []structs.CheckID) []structs.CheckID { + s.lock.Lock() + defer s.lock.Unlock() + + var remove []structs.CheckID + for id := range s.current[allocID] { + if !slices.Contains(ids, id) { + remove = append(remove, id) + } + } + + return remove +} diff --git a/client/serviceregistration/checks/checkstore/shim_test.go b/client/serviceregistration/checks/checkstore/shim_test.go new file mode 100644 index 000000000000..d79520756ebe --- /dev/null +++ b/client/serviceregistration/checks/checkstore/shim_test.go @@ -0,0 +1,379 @@ +package checkstore + +import ( + "testing" + + "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/serviceregistration/checks" + "github.com/hashicorp/nomad/client/state" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/shoenig/test/must" + "golang.org/x/exp/slices" +) + +var ( + success = structs.CheckSuccess + failure = structs.CheckFailure + pending = structs.CheckPending +) + +func newQR(id string, status structs.CheckStatus) *structs.CheckQueryResult { + return &structs.CheckQueryResult{ + ID: structs.CheckID(id), + Status: status, + } +} + +// alias for brevity +type qrMap = map[structs.CheckID]*structs.CheckQueryResult + +func TestShim_New(t *testing.T) { + ci.Parallel(t) + logger := testlog.HCLogger(t) + + t.Run("restore empty", 
func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + m := s.List("none") + must.MapEmpty(t, m) + }) + + t.Run("restore checks", func(t *testing.T) { + db := state.NewMemDB(logger) + must.NoError(t, db.PutCheckResult("alloc1", newQR("id1", success))) + must.NoError(t, db.PutCheckResult("alloc1", newQR("id2", failure))) + must.NoError(t, db.PutCheckResult("alloc2", newQR("id3", pending))) + s := NewStore(logger, db) + m1 := s.List("alloc1") + must.MapEq(t, qrMap{ + "id1": newQR("id1", success), + "id2": newQR("id2", failure), + }, m1) + m2 := s.List("alloc2") + must.MapEq(t, qrMap{"id3": newQR("id3", pending)}, m2) + }) +} + +func TestShim_Set(t *testing.T) { + ci.Parallel(t) + logger := testlog.HCLogger(t) + + t.Run("insert pending", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + // insert initial pending check into empty database + qr1 := newQR("id1", pending) + qr1.Timestamp = 1 + must.NoError(t, s.Set("alloc1", qr1)) + + // ensure underlying db has check + internal, err := db.GetCheckResults() + must.NoError(t, err) + must.Eq(t, checks.ClientResults{"alloc1": {"id1": qr1}}, internal) + }) + + t.Run("ignore followup pending", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + // insert a check + qr1 := newQR("id1", success) + qr1.Timestamp = 1 + must.NoError(t, s.Set("alloc1", qr1)) + + // insert a followup pending check (e.g. client restart) + qr2 := newQR("id1", pending) + qr2.Timestamp = 2 + t.Run("into existing", func(t *testing.T) { + must.NoError(t, s.Set("alloc1", qr2)) + }) + + // ensure shim maintains success result + list := s.List("alloc1") + must.Eq(t, qrMap{"id1": qr1}, list) + + // ensure underlying db maintains success result + internal, err := db.GetCheckResults() + must.NoError(t, err) + must.Eq(t, checks.ClientResults{"alloc1": {"id1": qr1}}, internal) + }) + + t.Run("insert status change", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + // insert initial check, success + qr1 := newQR("id1", success) + must.NoError(t, s.Set("alloc1", qr1)) + + // insert followup check, failure + qr2 := newQR("id1", failure) + must.NoError(t, s.Set("alloc1", qr2)) + + // ensure shim sees newest status result + list := s.List("alloc1") + must.Eq(t, qrMap{"id1": qr2}, list) + + // ensure underlying db sees newest status result + internal, err := db.GetCheckResults() + must.NoError(t, err) + must.Eq(t, checks.ClientResults{"alloc1": {"id1": qr2}}, internal) + }) + + t.Run("insert status same", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + // insert initial check, success + qr1 := newQR("id1", success) + qr1.Timestamp = 1 + must.NoError(t, s.Set("alloc1", qr1)) + + // insert followup check, also success + qr2 := newQR("id1", success) + qr2.Timestamp = 2 + must.NoError(t, s.Set("alloc1", qr2)) + + // ensure shim sees newest status result + list := s.List("alloc1") + must.Eq(t, qrMap{"id1": qr2}, list) + + // ensure underlying db sees stale result (optimization) + internal, err := db.GetCheckResults() + must.NoError(t, err) + must.Eq(t, checks.ClientResults{"alloc1": {"id1": qr1}}, internal) + }) +} + +func TestShim_List(t *testing.T) { + ci.Parallel(t) + logger := testlog.HCLogger(t) + + t.Run("list empty", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + list := s.List("alloc1") + must.MapEmpty(t, list) + }) + + t.Run("list mix", func(t *testing.T) { + db := state.NewMemDB(logger) + s := 
NewStore(logger, db) + + // insert some checks + must.NoError(t, s.Set("alloc1", newQR("id1", success))) + must.NoError(t, s.Set("alloc1", newQR("id2", failure))) + must.NoError(t, s.Set("alloc2", newQR("id1", pending))) + + list1 := s.List("alloc1") + must.MapEq(t, qrMap{ + "id1": newQR("id1", success), + "id2": newQR("id2", failure), + }, list1) + + list2 := s.List("alloc2") + must.MapEq(t, qrMap{ + "id1": newQR("id1", pending), + }, list2) + + internal, err := db.GetCheckResults() + must.NoError(t, err) + must.MapEq(t, checks.ClientResults{ + "alloc1": { + "id1": newQR("id1", success), + "id2": newQR("id2", failure), + }, + "alloc2": { + "id1": newQR("id1", pending), + }, + }, internal) + }) +} + +func TestShim_Difference(t *testing.T) { + ci.Parallel(t) + logger := testlog.HCLogger(t) + + t.Run("empty store", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + ids := []structs.CheckID{"id1", "id2", "id3"} + unwanted := s.Difference("alloc1", ids) + must.Empty(t, unwanted) + }) + + t.Run("empty unwanted", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + // insert some checks + must.NoError(t, s.Set("alloc1", newQR("id1", success))) + must.NoError(t, s.Set("alloc1", newQR("id2", failure))) + must.NoError(t, s.Set("alloc2", newQR("id1", pending))) + + var ids []structs.CheckID + var exp = []structs.CheckID{"id1", "id2"} + unwanted := s.Difference("alloc1", ids) + slices.Sort(unwanted) + must.Eq(t, exp, unwanted) + }) + + t.Run("subset unwanted", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + // insert some checks + must.NoError(t, s.Set("alloc1", newQR("id1", success))) + must.NoError(t, s.Set("alloc1", newQR("id2", failure))) + must.NoError(t, s.Set("alloc1", newQR("id3", success))) + must.NoError(t, s.Set("alloc1", newQR("id4", success))) + must.NoError(t, s.Set("alloc1", newQR("id5", pending))) + + ids := []structs.CheckID{"id1", "id3", "id4"} + exp := []structs.CheckID{"id2", "id5"} + unwanted := s.Difference("alloc1", ids) + slices.Sort(unwanted) + must.Eq(t, exp, unwanted) + }) + + t.Run("unexpected unwanted", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + // insert some checks + must.NoError(t, s.Set("alloc1", newQR("id1", success))) + must.NoError(t, s.Set("alloc1", newQR("id2", failure))) + must.NoError(t, s.Set("alloc1", newQR("id3", success))) + + ids := []structs.CheckID{"id1", "id4"} + exp := []structs.CheckID{"id2", "id3"} + unwanted := s.Difference("alloc1", ids) + slices.Sort(unwanted) + must.Eq(t, exp, unwanted) + }) +} + +func TestShim_Remove(t *testing.T) { + ci.Parallel(t) + logger := testlog.HCLogger(t) + + t.Run("remove from empty store", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + ids := []structs.CheckID{"id1", "id2"} + err := s.Remove("alloc1", ids) + must.NoError(t, err) + }) + + t.Run("remove empty set from store", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + // insert some checks + must.NoError(t, s.Set("alloc1", newQR("id1", success))) + must.NoError(t, s.Set("alloc1", newQR("id2", failure))) + must.NoError(t, s.Set("alloc2", newQR("id1", pending))) + + var ids []structs.CheckID + err := s.Remove("alloc1", ids) + must.NoError(t, err) + + // ensure shim still contains checks + list := s.List("alloc1") + must.Eq(t, qrMap{"id1": newQR("id1", success), "id2": newQR("id2", failure)}, list) + + // ensure underlying db still contains all checks + 
internal, err := db.GetCheckResults() + must.NoError(t, err) + must.Eq(t, checks.ClientResults{ + "alloc1": { + "id1": newQR("id1", success), + "id2": newQR("id2", failure), + }, + "alloc2": { + "id1": newQR("id1", pending), + }, + }, internal) + }) + + t.Run("remove subset from store", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + // insert some checks + must.NoError(t, s.Set("alloc1", newQR("id1", success))) + must.NoError(t, s.Set("alloc1", newQR("id2", failure))) + must.NoError(t, s.Set("alloc1", newQR("id3", success))) + must.NoError(t, s.Set("alloc1", newQR("id4", pending))) + must.NoError(t, s.Set("alloc2", newQR("id1", pending))) + must.NoError(t, s.Set("alloc2", newQR("id2", success))) + + ids := []structs.CheckID{"id1", "id4"} + err := s.Remove("alloc1", ids) + must.NoError(t, err) + + // ensure shim still contains remaining checks + list := s.List("alloc1") + must.Eq(t, qrMap{"id2": newQR("id2", failure), "id3": newQR("id3", success)}, list) + + // ensure underlying db still contains remaining checks + internal, err := db.GetCheckResults() + must.NoError(t, err) + must.MapEq(t, checks.ClientResults{ + "alloc1": { + "id2": newQR("id2", failure), + "id3": newQR("id3", success), + }, + "alloc2": { + "id1": newQR("id1", pending), + "id2": newQR("id2", success), + }, + }, internal) + }) +} + +func TestShim_Purge(t *testing.T) { + ci.Parallel(t) + logger := testlog.HCLogger(t) + + t.Run("purge from empty", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + err := s.Purge("alloc1") + must.NoError(t, err) + }) + + t.Run("purge one alloc", func(t *testing.T) { + db := state.NewMemDB(logger) + s := NewStore(logger, db) + + // insert some checks + must.NoError(t, s.Set("alloc1", newQR("id1", success))) + must.NoError(t, s.Set("alloc1", newQR("id2", failure))) + must.NoError(t, s.Set("alloc2", newQR("id1", pending))) + + err := s.Purge("alloc1") + must.NoError(t, err) + + // ensure alloc1 is gone from shim + list1 := s.List("alloc1") + must.MapEmpty(t, list1) + + // ensure alloc2 remains in shim + list2 := s.List("alloc2") + must.MapEq(t, qrMap{"id1": newQR("id1", pending)}, list2) + + // ensure alloc is gone from underlying db + internal, err := db.GetCheckResults() + must.NoError(t, err) + must.MapEq(t, checks.ClientResults{ + "alloc2": {"id1": newQR("id1", pending)}, + }, internal) + }) +} diff --git a/client/serviceregistration/checks/client.go b/client/serviceregistration/checks/client.go new file mode 100644 index 000000000000..c26431436f47 --- /dev/null +++ b/client/serviceregistration/checks/client.go @@ -0,0 +1,209 @@ +package checks + +import ( + "bytes" + "context" + "fmt" + "io" + "net" + "net/http" + "net/url" + "strconv" + "time" + + "github.com/hashicorp/go-cleanhttp" + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/client/serviceregistration" + "github.com/hashicorp/nomad/nomad/structs" + "oss.indeed.com/go/libtime" +) + +const ( + // maxTimeoutHTTP is a fail-safe value for the HTTP client, ensuring a Nomad + // Client does not leak goroutines hanging on to unresponsive endpoints. + maxTimeoutHTTP = 10 * time.Minute +) + +// Checker executes a check given an allocation-specific context, and produces +// a resulting structs.CheckQueryResult +type Checker interface { + Do(context.Context, *QueryContext, *Query) *structs.CheckQueryResult +} + +// New creates a new Checker capable of executing HTTP and TCP checks. 
+func New(log hclog.Logger) Checker { + httpClient := cleanhttp.DefaultPooledClient() + httpClient.Timeout = maxTimeoutHTTP + return &checker{ + log: log.Named("checks"), + httpClient: httpClient, + clock: libtime.SystemClock(), + } +} + +type checker struct { + log hclog.Logger + clock libtime.Clock + httpClient *http.Client +} + +func (c *checker) now() int64 { + return c.clock.Now().UTC().Unix() +} + +// Do will execute the Query given the QueryContext and produce a structs.CheckQueryResult +func (c *checker) Do(ctx context.Context, qc *QueryContext, q *Query) *structs.CheckQueryResult { + var qr *structs.CheckQueryResult + + timeout, cancel := context.WithTimeout(ctx, q.Timeout) + defer cancel() + + switch q.Type { + case "http": + qr = c.checkHTTP(timeout, qc, q) + default: + qr = c.checkTCP(timeout, qc, q) + } + + qr.ID = qc.ID + qr.Group = qc.Group + qr.Task = qc.Task + qr.Service = qc.Service + qr.Check = qc.Check + return qr +} + +// resolve the address to use when executing Query given a QueryContext +func address(qc *QueryContext, q *Query) (string, error) { + mode := q.AddressMode + if mode == "" { // determine resolution for check address + if qc.CustomAddress != "" { + // if the service is using a custom address, enable the check to + // inherit that custom address + mode = structs.AddressModeAuto + } else { + // otherwise a check defaults to the host address + mode = structs.AddressModeHost + } + } + + label := q.PortLabel + if label == "" { + label = qc.ServicePortLabel + } + + status := qc.NetworkStatus.NetworkStatus() + addr, port, err := serviceregistration.GetAddress( + qc.CustomAddress, // custom address + mode, // check address mode + label, // port label + qc.Networks, // allocation networks + nil, // driver network (not supported) + qc.Ports, // ports + status, // allocation network status + ) + if err != nil { + return "", err + } + if port > 0 { + addr = net.JoinHostPort(addr, strconv.Itoa(port)) + } + return addr, nil +} + +func (c *checker) checkTCP(ctx context.Context, qc *QueryContext, q *Query) *structs.CheckQueryResult { + qr := &structs.CheckQueryResult{ + Mode: q.Mode, + Timestamp: c.now(), + Status: structs.CheckPending, + } + + addr, err := address(qc, q) + if err != nil { + qr.Output = err.Error() + qr.Status = structs.CheckFailure + return qr + } + + if _, err = new(net.Dialer).DialContext(ctx, "tcp", addr); err != nil { + qr.Output = err.Error() + qr.Status = structs.CheckFailure + return qr + } + + qr.Output = "nomad: tcp ok" + qr.Status = structs.CheckSuccess + return qr +} + +func (c *checker) checkHTTP(ctx context.Context, qc *QueryContext, q *Query) *structs.CheckQueryResult { + qr := &structs.CheckQueryResult{ + Mode: q.Mode, + Timestamp: c.now(), + Status: structs.CheckPending, + } + + addr, err := address(qc, q) + if err != nil { + qr.Output = err.Error() + qr.Status = structs.CheckFailure + return qr + } + + u := (&url.URL{ + Scheme: q.Protocol, + Host: addr, + Path: q.Path, + }).String() + + request, err := http.NewRequest(q.Method, u, nil) + if err != nil { + qr.Output = fmt.Sprintf("nomad: %s", err.Error()) + qr.Status = structs.CheckFailure + return qr + } + request = request.WithContext(ctx) + + result, err := c.httpClient.Do(request) + if err != nil { + qr.Output = fmt.Sprintf("nomad: %s", err.Error()) + qr.Status = structs.CheckFailure + return qr + } + + // match the result status code to the http status code + qr.StatusCode = result.StatusCode + + switch { + case result.StatusCode == 200: + qr.Status = structs.CheckSuccess + 
qr.Output = "nomad: http ok" + return qr + case result.StatusCode < 400: + qr.Status = structs.CheckSuccess + default: + qr.Status = structs.CheckFailure + } + + // status code was not 200; read the response body and set that as the + // check result output content + qr.Output = limitRead(result.Body) + + return qr +} + +const ( + // outputSizeLimit is the maximum number of bytes to read and store of an http + // check output. Set to 3kb which fits in 1 page with room for other fields. + outputSizeLimit = 3 * 1024 +) + +func limitRead(r io.Reader) string { + b := make([]byte, 0, outputSizeLimit) + output := bytes.NewBuffer(b) + limited := io.LimitReader(r, outputSizeLimit) + if _, err := io.Copy(output, limited); err != nil { + return fmt.Sprintf("nomad: %s", err.Error()) + } + return output.String() +} diff --git a/client/serviceregistration/checks/client_test.go b/client/serviceregistration/checks/client_test.go new file mode 100644 index 000000000000..2f6cb7fb47f5 --- /dev/null +++ b/client/serviceregistration/checks/client_test.go @@ -0,0 +1,342 @@ +package checks + +import ( + "context" + "fmt" + "io" + "net" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/helper/freeport" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/shoenig/test/must" + "oss.indeed.com/go/libtime/libtimetest" +) + +func TestChecker_Do_HTTP(t *testing.T) { + ci.Parallel(t) + + // an example response that will be truncated + tooLong, truncate := bigResponse() + + // create an http server with various responses + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/fail": + w.WriteHeader(500) + _, _ = io.WriteString(w, "500 problem") + case "/hang": + time.Sleep(1 * time.Second) + _, _ = io.WriteString(w, "too slow") + case "/long-fail": + w.WriteHeader(500) + _, _ = io.WriteString(w, tooLong) + case "/long-not-fail": + w.WriteHeader(201) + _, _ = io.WriteString(w, tooLong) + default: + w.WriteHeader(200) + _, _ = io.WriteString(w, "200 ok") + } + })) + defer ts.Close() + + // get the address and port for http server + tokens := strings.Split(ts.URL, ":") + addr, port := strings.TrimPrefix(tokens[1], "//"), tokens[2] + + // create a mock clock so we can assert time is set + now := time.Date(2022, 1, 2, 3, 4, 5, 6, time.UTC) + clock := libtimetest.NewClockMock(t).NowMock.Return(now) + + makeQueryContext := func() *QueryContext { + return &QueryContext{ + ID: "abc123", + CustomAddress: addr, + ServicePortLabel: port, + Networks: nil, + NetworkStatus: mock.NewNetworkStatus(addr), + Ports: nil, + Group: "group", + Task: "task", + Service: "service", + Check: "check", + } + } + + makeQuery := func( + kind structs.CheckMode, + path string, + ) *Query { + return &Query{ + Mode: kind, + Type: "http", + Timeout: 100 * time.Millisecond, + AddressMode: "auto", + PortLabel: port, + Protocol: "http", + Path: path, + Method: "GET", + } + } + + makeExpResult := func( + kind structs.CheckMode, + status structs.CheckStatus, + code int, + output string, + ) *structs.CheckQueryResult { + return &structs.CheckQueryResult{ + ID: "abc123", + Mode: kind, + Status: status, + StatusCode: code, + Output: output, + Timestamp: now.Unix(), + Group: "group", + Task: "task", + Service: "service", + Check: "check", + } + } + + cases := []struct { + name string + qc *QueryContext + q *Query + expResult 
*structs.CheckQueryResult + }{{ + name: "200 healthiness", + qc: makeQueryContext(), + q: makeQuery(structs.Healthiness, "/"), + expResult: makeExpResult( + structs.Healthiness, + structs.CheckSuccess, + http.StatusOK, + "nomad: http ok", + ), + }, { + name: "200 readiness", + qc: makeQueryContext(), + q: makeQuery(structs.Readiness, "/"), + expResult: makeExpResult( + structs.Readiness, + structs.CheckSuccess, + http.StatusOK, + "nomad: http ok", + ), + }, { + name: "500 healthiness", + qc: makeQueryContext(), + q: makeQuery(structs.Healthiness, "fail"), + expResult: makeExpResult( + structs.Healthiness, + structs.CheckFailure, + http.StatusInternalServerError, + "500 problem", + ), + }, { + name: "hang", + qc: makeQueryContext(), + q: makeQuery(structs.Healthiness, "hang"), + expResult: makeExpResult( + structs.Healthiness, + structs.CheckFailure, + 0, + fmt.Sprintf(`nomad: Get "%s/hang": context deadline exceeded`, ts.URL), + ), + }, { + name: "500 truncate", + qc: makeQueryContext(), + q: makeQuery(structs.Healthiness, "long-fail"), + expResult: makeExpResult( + structs.Healthiness, + structs.CheckFailure, + http.StatusInternalServerError, + truncate, + ), + }, { + name: "201 truncate", + qc: makeQueryContext(), + q: makeQuery(structs.Healthiness, "long-not-fail"), + expResult: makeExpResult( + structs.Healthiness, + structs.CheckSuccess, + http.StatusCreated, + truncate, + ), + }} + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + logger := testlog.HCLogger(t) + + c := New(logger) + c.(*checker).clock = clock + + ctx := context.Background() + result := c.Do(ctx, tc.qc, tc.q) + must.Eq(t, tc.expResult, result) + }) + } +} + +// bigResponse creates a response payload larger than the maximum outputSizeLimit +// as well as the same response but truncated to length of outputSizeLimit +func bigResponse() (string, string) { + size := outputSizeLimit + 5 + b := make([]byte, size, size) + for i := 0; i < size; i++ { + b[i] = 'a' + } + s := string(b) + return s, s[:outputSizeLimit] +} + +func TestChecker_Do_TCP(t *testing.T) { + ci.Parallel(t) + + // create a mock clock so we can assert time is set + now := time.Date(2022, 1, 2, 3, 4, 5, 6, time.UTC) + clock := libtimetest.NewClockMock(t).NowMock.Return(now) + + makeQueryContext := func(address string, port int) *QueryContext { + return &QueryContext{ + ID: "abc123", + CustomAddress: address, + ServicePortLabel: fmt.Sprintf("%d", port), + Networks: nil, + NetworkStatus: mock.NewNetworkStatus(address), + Ports: nil, + Group: "group", + Task: "task", + Service: "service", + Check: "check", + } + } + + makeQuery := func( + kind structs.CheckMode, + port int, + ) *Query { + return &Query{ + Mode: kind, + Type: "tcp", + Timeout: 100 * time.Millisecond, + AddressMode: "auto", + PortLabel: fmt.Sprintf("%d", port), + } + } + + makeExpResult := func( + kind structs.CheckMode, + status structs.CheckStatus, + output string, + ) *structs.CheckQueryResult { + return &structs.CheckQueryResult{ + ID: "abc123", + Mode: kind, + Status: status, + Output: output, + Timestamp: now.Unix(), + Group: "group", + Task: "task", + Service: "service", + Check: "check", + } + } + + ports := freeport.MustTake(3) + defer freeport.Return(ports) + + cases := []struct { + name string + qc *QueryContext + q *Query + tcpMode string // "ok", "off", "hang" + tcpPort int + expResult *structs.CheckQueryResult + }{{ + name: "tcp ok", + qc: makeQueryContext("localhost", ports[0]), + q: makeQuery(structs.Healthiness, ports[0]), + tcpMode: "ok", + tcpPort: ports[0], + 
expResult: makeExpResult( + structs.Healthiness, + structs.CheckSuccess, + "nomad: tcp ok", + ), + }, { + name: "tcp not listening", + qc: makeQueryContext("127.0.0.1", ports[1]), + q: makeQuery(structs.Healthiness, ports[1]), + tcpMode: "off", + tcpPort: ports[1], + expResult: makeExpResult( + structs.Healthiness, + structs.CheckFailure, + fmt.Sprintf("dial tcp 127.0.0.1:%d: connect: connection refused", ports[1]), + ), + }, { + name: "tcp slow accept", + qc: makeQueryContext("localhost", ports[2]), + q: makeQuery(structs.Healthiness, ports[2]), + tcpMode: "hang", + tcpPort: ports[2], + expResult: makeExpResult( + structs.Healthiness, + structs.CheckFailure, + "dial tcp: lookup localhost: i/o timeout", + ), + }} + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + logger := testlog.HCLogger(t) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + c := New(logger) + c.(*checker).clock = clock + + switch tc.tcpMode { + case "ok": + // simulate tcp server by listening + go tcpServer(ctx, tc.tcpPort) + case "hang": + // simulate tcp hang by setting an already expired context + timeout, stop := context.WithDeadline(ctx, now.Add(-1*time.Second)) + defer stop() + ctx = timeout + case "off": + // simulate tcp dead connection by not listening + } + + result := c.Do(ctx, tc.qc, tc.q) + must.Eq(t, tc.expResult, result) + }) + } +} + +func tcpServer(ctx context.Context, port int) { + var lc net.ListenConfig + l, _ := lc.Listen(ctx, "tcp", net.JoinHostPort( + "localhost", fmt.Sprintf("%d", port), + )) + defer func() { + _ = l.Close() + }() + con, _ := l.Accept() + defer func() { + _ = con.Close() + }() +} diff --git a/client/serviceregistration/checks/result.go b/client/serviceregistration/checks/result.go new file mode 100644 index 000000000000..8ef5859ce70e --- /dev/null +++ b/client/serviceregistration/checks/result.go @@ -0,0 +1,102 @@ +package checks + +import ( + "time" + + "github.com/hashicorp/nomad/nomad/structs" +) + +// GetCheckQuery extracts the needed info from c to actually execute the check. +func GetCheckQuery(c *structs.ServiceCheck) *Query { + var protocol = c.Protocol // ensure appropriate default + if c.Type == "http" && protocol == "" { + protocol = "http" + } + return &Query{ + Mode: structs.GetCheckMode(c), + Type: c.Type, + Timeout: c.Timeout, + AddressMode: c.AddressMode, + PortLabel: c.PortLabel, + Path: c.Path, + Method: c.Method, + Protocol: protocol, + } +} + +// A Query is derived from a ServiceCheck and contains the minimal +// amount of information needed to actually execute that check. +type Query struct { + Mode structs.CheckMode // readiness or healthiness + Type string // tcp or http + + Timeout time.Duration // connection / request timeout + + AddressMode string // host, driver, or alloc + PortLabel string // label or value + + Protocol string // http checks only (http or https) + Path string // http checks only + Method string // http checks only +} + +// A QueryContext contains allocation and service parameters necessary for +// address resolution. +type QueryContext struct { + ID structs.CheckID + CustomAddress string + ServicePortLabel string + Networks structs.Networks + NetworkStatus structs.NetworkStatus + Ports structs.AllocatedPorts + + Group string + Task string + Service string + Check string +} + +// Stub creates a temporary QueryResult for the check of ID in the Pending state +// so we can represent the status of not being checked yet. 
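Putting the pieces of this package together: a caller derives a Query from the jobspec's ServiceCheck via GetCheckQuery, builds a QueryContext from the allocation's addressing details, and hands both to Checker.Do. A hedged sketch from inside the checks package; the helper name is illustrative and error handling is elided.

// runOnce executes a single check and returns its result; sketch only.
func runOnce(ctx context.Context, log hclog.Logger, sc *structs.ServiceCheck, qc *QueryContext) *structs.CheckQueryResult {
	q := GetCheckQuery(sc) // mode, type, timeout, address resolution inputs
	c := New(log)          // HTTP/TCP capable checker
	return c.Do(ctx, qc, q)
}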
+func Stub( + id structs.CheckID, kind structs.CheckMode, now int64, + group, task, service, check string, +) *structs.CheckQueryResult { + return &structs.CheckQueryResult{ + ID: id, + Mode: kind, + Status: structs.CheckPending, + Output: "nomad: waiting to run", + Timestamp: now, + Group: group, + Task: task, + Service: service, + Check: check, + } +} + +// AllocationResults is a view of the check_id -> latest result for group and task +// checks in an allocation. +type AllocationResults map[structs.CheckID]*structs.CheckQueryResult + +// diff returns the set of IDs in ids that are not in m. +func (m AllocationResults) diff(ids []structs.CheckID) []structs.CheckID { + var missing []structs.CheckID + for _, id := range ids { + if _, exists := m[id]; !exists { + missing = append(missing, id) + } + } + return missing +} + +// ClientResults is a holistic view of alloc_id -> check_id -> latest result +// group and task checks across all allocations on a client. +type ClientResults map[string]AllocationResults + +func (cr ClientResults) Insert(allocID string, result *structs.CheckQueryResult) { + if _, exists := cr[allocID]; !exists { + cr[allocID] = make(AllocationResults) + } + cr[allocID][result.ID] = result +} diff --git a/client/serviceregistration/checks/result_test.go b/client/serviceregistration/checks/result_test.go new file mode 100644 index 000000000000..dee7082dcc37 --- /dev/null +++ b/client/serviceregistration/checks/result_test.go @@ -0,0 +1,171 @@ +package checks + +import ( + "testing" + "time" + + "github.com/hashicorp/nomad/nomad/structs" + "github.com/shoenig/test/must" +) + +func TestChecks_GetCheckQuery(t *testing.T) { + cases := []struct { + name string + cType string + protocol string + onUpdate string + expMode structs.CheckMode + expProtocol string + }{ + { + name: "http check and http set", + cType: "http", + protocol: "http", + onUpdate: "checks", + expMode: structs.Healthiness, + expProtocol: "http", + }, + { + name: "http check and https set", + cType: "http", + protocol: "https", + onUpdate: "checks", + expMode: structs.Healthiness, + expProtocol: "https", + }, + { + name: "http check and protocol unset", + cType: "http", + protocol: "", + onUpdate: "checks", + expMode: structs.Healthiness, + expProtocol: "http", // inherit default + }, + { + name: "tcp check and protocol unset", + cType: "tcp", + protocol: "", + onUpdate: "checks", + expMode: structs.Healthiness, + expProtocol: "", + }, + { + name: "http check and http set", + cType: "http", + protocol: "http", + onUpdate: "checks", + expMode: structs.Healthiness, + expProtocol: "http", + }, + { + name: "on-update ignore", + cType: "http", + protocol: "http", + onUpdate: "ignore", + expMode: structs.Readiness, + expProtocol: "http", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + serviceCheck := &structs.ServiceCheck{ + Type: tc.cType, + Path: "/", + Protocol: tc.protocol, + PortLabel: "web", + AddressMode: "host", + Interval: 10 * time.Second, + Timeout: 2 * time.Second, + Method: "GET", + OnUpdate: tc.onUpdate, + } + query := GetCheckQuery(serviceCheck) + must.Eq(t, tc.expMode, query.Mode) + must.Eq(t, tc.expProtocol, query.Protocol) + }) + } +} + +func TestChecks_Stub(t *testing.T) { + now := time.Date(2020, 1, 2, 3, 4, 5, 6, time.UTC).Unix() + result := Stub( + "abc123", // check id + structs.Healthiness, // kind + now, // timestamp + "group", "task", "service", "check", + ) + must.Eq(t, &structs.CheckQueryResult{ + ID: "abc123", + Mode: structs.Healthiness, + Status: 
structs.CheckPending, + Output: "nomad: waiting to run", + Timestamp: now, + Group: "group", + Task: "task", + Service: "service", + Check: "check", + }, result) +} + +func TestChecks_AllocationResults_diff(t *testing.T) { + cases := []struct { + name string + results []structs.CheckID + ids []structs.CheckID + exp []structs.CheckID + }{{ + name: "empty results and empty ids", + results: nil, + ids: nil, + exp: nil, + }, { + name: "empty results and nonempty ids", + results: nil, + ids: []structs.CheckID{"aaa", "bbb"}, + exp: []structs.CheckID{"aaa", "bbb"}, + }, { + name: "nonempty results and empty ids", + results: []structs.CheckID{"aaa", "bbb"}, + ids: nil, + exp: nil, + }, { + name: "mix", + results: []structs.CheckID{"aaa", "ccc", "ddd", "fff"}, + ids: []structs.CheckID{"bbb", "ccc", "fff", "eee"}, + exp: []structs.CheckID{"bbb", "eee"}, + }} + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + ar := make(AllocationResults) + for _, id := range tc.results { + ar[id] = nil + } + result := ar.diff(tc.ids) + must.Eq(t, tc.exp, result) + }) + } +} + +func TestChecks_ClientResults_Insert(t *testing.T) { + cr := make(ClientResults) + cr.Insert("alloc1", &structs.CheckQueryResult{ID: "qr1", Check: "c1"}) + cr.Insert("alloc2", &structs.CheckQueryResult{ID: "qr2", Check: "c2"}) + cr.Insert("alloc3", &structs.CheckQueryResult{ID: "qr3", Check: "c3"}) + cr.Insert("alloc2", &structs.CheckQueryResult{ID: "qr2", Check: "c4"}) // overwrite + cr.Insert("alloc3", &structs.CheckQueryResult{ID: "qr4", Check: "c5"}) + exp := ClientResults{ + "alloc1": { + "qr1": &structs.CheckQueryResult{ID: "qr1", Check: "c1"}, + }, + "alloc2": { + "qr2": &structs.CheckQueryResult{ID: "qr2", Check: "c4"}, + }, + "alloc3": { + "qr3": &structs.CheckQueryResult{ID: "qr3", Check: "c3"}, + "qr4": &structs.CheckQueryResult{ID: "qr4", Check: "c5"}, + }, + } + must.Eq(t, exp, cr) +} diff --git a/client/serviceregistration/service_registration.go b/client/serviceregistration/service_registration.go index 4981ff1cfebb..2a76399b783e 100644 --- a/client/serviceregistration/service_registration.go +++ b/client/serviceregistration/service_registration.go @@ -108,6 +108,7 @@ func (a *AllocRegistration) NumChecks() int { // ServiceRegistrations holds the status of services registered for a // particular task or task group. 
type ServiceRegistrations struct { + // Services maps service_id -> service registration Services map[string]*ServiceRegistration } diff --git a/client/state/state_database.go b/client/state/db_bolt.go similarity index 91% rename from client/state/state_database.go rename to client/state/db_bolt.go index be81fd53ecb2..1be688a1a613 100644 --- a/client/state/state_database.go +++ b/client/state/db_bolt.go @@ -1,6 +1,7 @@ package state import ( + "bytes" "fmt" "os" "path/filepath" @@ -11,6 +12,7 @@ import ( dmstate "github.com/hashicorp/nomad/client/devicemanager/state" "github.com/hashicorp/nomad/client/dynamicplugins" driverstate "github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state" + "github.com/hashicorp/nomad/client/serviceregistration/checks" "github.com/hashicorp/nomad/helper/boltdd" "github.com/hashicorp/nomad/nomad/structs" "go.etcd.io/bbolt" @@ -29,7 +31,9 @@ allocations/ |--> network_status -> networkStatusEntry{*structs.AllocNetworkStatus} |--> task-/ |--> local_state -> *trstate.LocalState # Local-only state - |--> task_state -> *structs.TaskState # Sync'd to servers + |--> task_state -> *structs.TaskState # Syncs to servers + |--> checks/ + |--> check- -> *structs.CheckState # Syncs to servers devicemanager/ |--> plugin_state -> *dmstate.PluginState @@ -73,6 +77,9 @@ var ( // stored under allocNetworkStatusKey = []byte("network_status") + // checkResultsBucket is the bucket name in which check query results are stored + checkResultsBucket = []byte("check_results") + // allocations -> $allocid -> task-$taskname -> the keys below taskLocalStateKey = []byte("local_state") taskStateKey = []byte("task_state") @@ -176,6 +183,7 @@ func (s *BoltStateDB) Name() string { func (s *BoltStateDB) GetAllAllocations() ([]*structs.Allocation, map[string]error, error) { var allocs []*structs.Allocation var errs map[string]error + err := s.db.View(func(tx *boltdd.Tx) error { allocs, errs = s.getAllAllocations(tx) return nil @@ -195,7 +203,7 @@ type allocEntry struct { } func (s *BoltStateDB) getAllAllocations(tx *boltdd.Tx) ([]*structs.Allocation, map[string]error) { - allocs := []*structs.Allocation{} + allocs := make([]*structs.Allocation, 0, 10) errs := map[string]error{} allocationsBkt := tx.Bucket(allocationsBucketName) @@ -717,6 +725,71 @@ func (s *BoltStateDB) GetDynamicPluginRegistryState() (*dynamicplugins.RegistryS return ps, nil } +func keyForCheck(allocID string, checkID structs.CheckID) []byte { + return []byte(fmt.Sprintf("%s_%s", allocID, checkID)) +} + +// PutCheckResult puts qr into the state store. +func (s *BoltStateDB) PutCheckResult(allocID string, qr *structs.CheckQueryResult) error { + return s.db.Update(func(tx *boltdd.Tx) error { + bkt, err := tx.CreateBucketIfNotExists(checkResultsBucket) + if err != nil { + return err + } + key := keyForCheck(allocID, qr.ID) + return bkt.Put(key, qr) + }) +} + +// GetCheckResults gets the check results associated with allocID from the state store. 
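The composite key built by keyForCheck above ("allocID_checkID") is what lets the per-allocation purge below reduce to a prefix delete. A quick round-trip sketch against the in-memory StateDB added in this change (MemDB); values are illustrative and imports (state, structs, hclog) are assumed.

// roundTrip stores, reads back, and purges one check result; sketch only.
func roundTrip(logger hclog.Logger) error {
	db := state.NewMemDB(logger)
	qr := &structs.CheckQueryResult{ID: "c1", Status: structs.CheckSuccess}
	if err := db.PutCheckResult("alloc-1", qr); err != nil { // bolt key would be "alloc-1_c1"
		return err
	}
	if _, err := db.GetCheckResults(); err != nil { // alloc_id -> check_id -> result
		return err
	}
	return db.PurgeCheckResults("alloc-1") // bolt: prefix delete on "alloc-1_"
}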
+func (s *BoltStateDB) GetCheckResults() (checks.ClientResults, error) { + m := make(checks.ClientResults) + + err := s.db.View(func(tx *boltdd.Tx) error { + bkt := tx.Bucket(checkResultsBucket) + if bkt == nil { + return nil // nothing set yet + } + + if err := boltdd.Iterate(bkt, nil, func(key []byte, qr structs.CheckQueryResult) { + parts := bytes.SplitN(key, []byte("_"), 2) + allocID, _ := parts[0], parts[1] + m.Insert(string(allocID), &qr) + }); err != nil { + return err + } + return nil + }) + return m, err +} + +func (s *BoltStateDB) DeleteCheckResults(allocID string, checkIDs []structs.CheckID) error { + return s.db.Update(func(tx *boltdd.Tx) error { + bkt := tx.Bucket(checkResultsBucket) + if bkt == nil { + return nil // nothing set yet + } + + for _, id := range checkIDs { + key := keyForCheck(allocID, id) + if err := bkt.Delete(key); err != nil { + return err + } + } + return nil + }) +} + +func (s *BoltStateDB) PurgeCheckResults(allocID string) error { + return s.db.Update(func(tx *boltdd.Tx) error { + bkt := tx.Bucket(checkResultsBucket) + if bkt == nil { + return nil // nothing set yet + } + return bkt.DeletePrefix([]byte(allocID + "_")) + }) +} + // init initializes metadata entries in a newly created state database. func (s *BoltStateDB) init() error { return s.db.Update(func(tx *boltdd.Tx) error { diff --git a/client/state/errdb.go b/client/state/db_error.go similarity index 85% rename from client/state/errdb.go rename to client/state/db_error.go index b6e75e6e265a..77e51e631245 100644 --- a/client/state/errdb.go +++ b/client/state/db_error.go @@ -7,6 +7,7 @@ import ( dmstate "github.com/hashicorp/nomad/client/devicemanager/state" "github.com/hashicorp/nomad/client/dynamicplugins" driverstate "github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state" + "github.com/hashicorp/nomad/client/serviceregistration/checks" "github.com/hashicorp/nomad/nomad/structs" ) @@ -80,8 +81,6 @@ func (m *ErrDB) PutDynamicPluginRegistryState(state *dynamicplugins.RegistryStat return fmt.Errorf("Error!") } -// GetDevicePluginState stores the device manager's plugin state or returns an -// error. 
func (m *ErrDB) GetDevicePluginState() (*dmstate.PluginState, error) { return nil, fmt.Errorf("Error!") } @@ -94,9 +93,22 @@ func (m *ErrDB) PutDriverPluginState(ps *driverstate.PluginState) error { return fmt.Errorf("Error!") } -func (m *ErrDB) Close() error { +func (m *ErrDB) PutCheckResult(allocID string, qr *structs.CheckQueryResult) error { + return fmt.Errorf("Error!") +} + +func (m *ErrDB) GetCheckResults() (checks.ClientResults, error) { + return nil, fmt.Errorf("Error!") +} + +func (m *ErrDB) DeleteCheckResults(allocID string, checkIDs []structs.CheckID) error { return fmt.Errorf("Error!") } -// Ensure *ErrDB implements StateDB -var _ StateDB = (*ErrDB)(nil) +func (m *ErrDB) PurgeCheckResults(allocID string) error { + return fmt.Errorf("Error!") +} + +func (m *ErrDB) Close() error { + return fmt.Errorf("Error!") +} diff --git a/client/state/memdb.go b/client/state/db_mem.go similarity index 81% rename from client/state/memdb.go rename to client/state/db_mem.go index 1e281410979e..e22e8726e326 100644 --- a/client/state/memdb.go +++ b/client/state/db_mem.go @@ -3,11 +3,13 @@ package state import ( "sync" - hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/allocrunner/taskrunner/state" dmstate "github.com/hashicorp/nomad/client/devicemanager/state" "github.com/hashicorp/nomad/client/dynamicplugins" driverstate "github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state" + "github.com/hashicorp/nomad/client/serviceregistration/checks" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" ) @@ -27,6 +29,9 @@ type MemDB struct { localTaskState map[string]map[string]*state.LocalState taskState map[string]map[string]*structs.TaskState + // alloc_id -> check_id -> result + checks checks.ClientResults + // devicemanager -> plugin-state devManagerPs *dmstate.PluginState @@ -49,6 +54,7 @@ func NewMemDB(logger hclog.Logger) *MemDB { networkStatus: make(map[string]*structs.AllocNetworkStatus), localTaskState: make(map[string]map[string]*state.LocalState), taskState: make(map[string]map[string]*structs.TaskState), + checks: make(checks.ClientResults), logger: logger, } } @@ -73,7 +79,7 @@ func (m *MemDB) GetAllAllocations() ([]*structs.Allocation, map[string]error, er return allocs, map[string]error{}, nil } -func (m *MemDB) PutAllocation(alloc *structs.Allocation, opts ...WriteOption) error { +func (m *MemDB) PutAllocation(alloc *structs.Allocation, _ ...WriteOption) error { m.mu.Lock() defer m.mu.Unlock() m.allocs[alloc.ID] = alloc @@ -99,7 +105,7 @@ func (m *MemDB) GetNetworkStatus(allocID string) (*structs.AllocNetworkStatus, e return m.networkStatus[allocID], nil } -func (m *MemDB) PutNetworkStatus(allocID string, ns *structs.AllocNetworkStatus, opts ...WriteOption) error { +func (m *MemDB) PutNetworkStatus(allocID string, ns *structs.AllocNetworkStatus, _ ...WriteOption) error { m.mu.Lock() m.networkStatus[allocID] = ns defer m.mu.Unlock() @@ -175,7 +181,7 @@ func (m *MemDB) DeleteTaskBucket(allocID, taskName string) error { return nil } -func (m *MemDB) DeleteAllocationBucket(allocID string, opts ...WriteOption) error { +func (m *MemDB) DeleteAllocationBucket(allocID string, _ ...WriteOption) error { m.mu.Lock() defer m.mu.Unlock() @@ -227,6 +233,40 @@ func (m *MemDB) PutDynamicPluginRegistryState(ps *dynamicplugins.RegistryState) return nil } +func (m *MemDB) PutCheckResult(allocID string, qr *structs.CheckQueryResult) error { + m.mu.Lock() + defer m.mu.Unlock() + + if _, exists := 
m.checks[allocID]; !exists { + m.checks[allocID] = make(checks.AllocationResults) + } + + m.checks[allocID][qr.ID] = qr + return nil +} + +func (m *MemDB) GetCheckResults() (checks.ClientResults, error) { + m.mu.Lock() + defer m.mu.Unlock() + return helper.CopyMap(m.checks), nil +} + +func (m *MemDB) DeleteCheckResults(allocID string, checkIDs []structs.CheckID) error { + m.mu.Lock() + defer m.mu.Unlock() + for _, id := range checkIDs { + delete(m.checks[allocID], id) + } + return nil +} + +func (m *MemDB) PurgeCheckResults(allocID string) error { + m.mu.Lock() + defer m.mu.Unlock() + delete(m.checks, allocID) + return nil +} + func (m *MemDB) Close() error { m.mu.Lock() defer m.mu.Unlock() diff --git a/client/state/noopdb.go b/client/state/db_noop.go similarity index 84% rename from client/state/noopdb.go rename to client/state/db_noop.go index e199a7affab4..0b433e1498b0 100644 --- a/client/state/noopdb.go +++ b/client/state/db_noop.go @@ -5,6 +5,7 @@ import ( dmstate "github.com/hashicorp/nomad/client/devicemanager/state" "github.com/hashicorp/nomad/client/dynamicplugins" driverstate "github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state" + "github.com/hashicorp/nomad/client/serviceregistration/checks" "github.com/hashicorp/nomad/nomad/structs" ) @@ -87,6 +88,22 @@ func (n NoopDB) GetDynamicPluginRegistryState() (*dynamicplugins.RegistryState, return nil, nil } +func (n NoopDB) PutCheckResult(allocID string, qr *structs.CheckQueryResult) error { + return nil +} + +func (n NoopDB) GetCheckResults() (checks.ClientResults, error) { + return nil, nil +} + +func (n NoopDB) DeleteCheckResults(allocID string, checkIDs []structs.CheckID) error { + return nil +} + +func (n NoopDB) PurgeCheckResults(allocID string) error { + return nil +} + func (n NoopDB) Close() error { return nil } diff --git a/client/state/db_test.go b/client/state/db_test.go index ad928878a157..542c219007a5 100644 --- a/client/state/db_test.go +++ b/client/state/db_test.go @@ -16,23 +16,32 @@ import ( "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/kr/pretty" + "github.com/shoenig/test/must" "github.com/stretchr/testify/require" ) +// assert each implementation satisfies StateDB interface +var ( + _ StateDB = (*BoltStateDB)(nil) + _ StateDB = (*MemDB)(nil) + _ StateDB = (*NoopDB)(nil) + _ StateDB = (*ErrDB)(nil) +) + func setupBoltStateDB(t *testing.T) *BoltStateDB { dir := t.TempDir() db, err := NewBoltStateDB(testlog.HCLogger(t), dir) if err != nil { - if err := os.RemoveAll(dir); err != nil { - t.Logf("error removing boltdb dir: %v", err) + if rmErr := os.RemoveAll(dir); rmErr != nil { + t.Logf("error removing boltdb dir: %v", rmErr) } t.Fatalf("error creating boltdb: %v", err) } t.Cleanup(func() { - if err := db.Close(); err != nil { - t.Errorf("error closing boltdb: %v", err) + if closeErr := db.Close(); closeErr != nil { + t.Errorf("error closing boltdb: %v", closeErr) } }) @@ -40,14 +49,12 @@ func setupBoltStateDB(t *testing.T) *BoltStateDB { } func testDB(t *testing.T, f func(*testing.T, StateDB)) { - boltdb := setupBoltStateDB(t) - - memdb := NewMemDB(testlog.HCLogger(t)) - - impls := []StateDB{boltdb, memdb} + dbs := []StateDB{ + setupBoltStateDB(t), + NewMemDB(testlog.HCLogger(t)), + } - for _, db := range impls { - db := db + for _, db := range dbs { t.Run(db.Name(), func(t *testing.T) { f(t, db) }) @@ -374,6 +381,79 @@ func TestStateDB_DynamicRegistry(t *testing.T) { }) } +func TestStateDB_CheckResult_keyForCheck(t *testing.T) { + ci.Parallel(t) + + 
allocID := "alloc1" + checkID := structs.CheckID("id1") + result := keyForCheck(allocID, checkID) + exp := allocID + "_" + string(checkID) + must.Eq(t, exp, string(result)) +} + +func TestStateDB_CheckResult(t *testing.T) { + ci.Parallel(t) + + qr := func(id string) *structs.CheckQueryResult { + return &structs.CheckQueryResult{ + ID: structs.CheckID(id), + Mode: "healthiness", + Status: "passing", + Output: "nomad: tcp ok", + Timestamp: 1, + Group: "group", + Task: "task", + Service: "service", + Check: "check", + } + } + + testDB(t, func(t *testing.T, db StateDB) { + t.Run("put and get", func(t *testing.T) { + err := db.PutCheckResult("alloc1", qr("abc123")) + must.NoError(t, err) + results, err := db.GetCheckResults() + must.NoError(t, err) + must.MapContainsKeys(t, results, []string{"alloc1"}) + must.MapContainsKeys(t, results["alloc1"], []structs.CheckID{"abc123"}) + }) + }) + + testDB(t, func(t *testing.T, db StateDB) { + t.Run("delete", func(t *testing.T) { + must.NoError(t, db.PutCheckResult("alloc1", qr("id1"))) + must.NoError(t, db.PutCheckResult("alloc1", qr("id2"))) + must.NoError(t, db.PutCheckResult("alloc1", qr("id3"))) + must.NoError(t, db.PutCheckResult("alloc1", qr("id4"))) + must.NoError(t, db.PutCheckResult("alloc2", qr("id5"))) + err := db.DeleteCheckResults("alloc1", []structs.CheckID{"id2", "id3"}) + must.NoError(t, err) + results, err := db.GetCheckResults() + must.NoError(t, err) + must.MapContainsKeys(t, results, []string{"alloc1", "alloc2"}) + must.MapContainsKeys(t, results["alloc1"], []structs.CheckID{"id1", "id4"}) + must.MapContainsKeys(t, results["alloc2"], []structs.CheckID{"id5"}) + }) + }) + + testDB(t, func(t *testing.T, db StateDB) { + t.Run("purge", func(t *testing.T) { + must.NoError(t, db.PutCheckResult("alloc1", qr("id1"))) + must.NoError(t, db.PutCheckResult("alloc1", qr("id2"))) + must.NoError(t, db.PutCheckResult("alloc1", qr("id3"))) + must.NoError(t, db.PutCheckResult("alloc1", qr("id4"))) + must.NoError(t, db.PutCheckResult("alloc2", qr("id5"))) + err := db.PurgeCheckResults("alloc1") + must.NoError(t, err) + results, err := db.GetCheckResults() + must.NoError(t, err) + must.MapContainsKeys(t, results, []string{"alloc2"}) + must.MapContainsKeys(t, results["alloc2"], []structs.CheckID{"id5"}) + }) + }) + +} + // TestStateDB_Upgrade asserts calling Upgrade on new databases always // succeeds. func TestStateDB_Upgrade(t *testing.T) { diff --git a/client/state/interface.go b/client/state/interface.go index 908fe6712d43..f1250350fdc9 100644 --- a/client/state/interface.go +++ b/client/state/interface.go @@ -5,6 +5,7 @@ import ( dmstate "github.com/hashicorp/nomad/client/devicemanager/state" "github.com/hashicorp/nomad/client/dynamicplugins" driverstate "github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state" + "github.com/hashicorp/nomad/client/serviceregistration/checks" "github.com/hashicorp/nomad/nomad/structs" ) @@ -28,14 +29,16 @@ type StateDB interface { // not be stored. PutAllocation(*structs.Allocation, ...WriteOption) error - // Get/Put DeploymentStatus get and put the allocation's deployment - // status. It may be nil. + // GetDeploymentStatus gets the allocation's deployment status. It may be nil. GetDeploymentStatus(allocID string) (*structs.AllocDeploymentStatus, error) + + // PutDeploymentStatus sets the allocation's deployment status. It may be nil. PutDeploymentStatus(allocID string, ds *structs.AllocDeploymentStatus) error - // Get/Put NetworkStatus get and put the allocation's network - // status. It may be nil. 
+ // GetNetworkStatus gets the allocation's network status. It may be nil. GetNetworkStatus(allocID string) (*structs.AllocNetworkStatus, error) + + // PutNetworkStatus puts the allocation's network status. It may be nil. PutNetworkStatus(allocID string, ns *structs.AllocNetworkStatus, opts ...WriteOption) error // GetTaskRunnerState returns the LocalState and TaskState for a @@ -43,7 +46,7 @@ type StateDB interface { // error is encountered only the error will be non-nil. GetTaskRunnerState(allocID, taskName string) (*state.LocalState, *structs.TaskState, error) - // PutTaskRunnerLocalTask stores the LocalState for a TaskRunner or + // PutTaskRunnerLocalState stores the LocalState for a TaskRunner or // returns an error. PutTaskRunnerLocalState(allocID, taskName string, val *state.LocalState) error @@ -81,6 +84,18 @@ type StateDB interface { // PutDynamicPluginRegistryState is used to store the dynamic plugin manager's state. PutDynamicPluginRegistryState(state *dynamicplugins.RegistryState) error + // PutCheckResult sets the query result for the check implied in qr. + PutCheckResult(allocID string, qr *structs.CheckQueryResult) error + + // DeleteCheckResults removes the given set of check results. + DeleteCheckResults(allocID string, checkIDs []structs.CheckID) error + + // PurgeCheckResults removes all check results of the given allocation. + PurgeCheckResults(allocID string) error + + // GetCheckResults is used to restore the set of check results on this Client. + GetCheckResults() (checks.ClientResults, error) + // Close the database. Unsafe for further use after calling regardless // of return value. Close() error diff --git a/client/structs/broadcaster.go b/client/structs/broadcaster.go index 555779695823..407894e71c91 100644 --- a/client/structs/broadcaster.go +++ b/client/structs/broadcaster.go @@ -4,7 +4,7 @@ import ( "errors" "sync" - hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/nomad/structs" ) @@ -32,7 +32,7 @@ type AllocBroadcaster struct { // nextId is the next id to assign in listener map nextId int - // closed is true if broadcsater is closed + // closed is true if broadcaster is closed. closed bool // last alloc sent to prime new listeners diff --git a/client/structs/structs.go b/client/structs/structs.go index 77a2c71c0ef6..d3b2ab8d09f4 100644 --- a/client/structs/structs.go +++ b/client/structs/structs.go @@ -182,6 +182,20 @@ type AllocExecRequest struct { structs.QueryOptions } +// AllocChecksRequest is used to request the latest nomad service discovery +// check status information of a given allocation. +type AllocChecksRequest struct { + structs.QueryOptions + AllocID string +} + +// AllocChecksResponse is used to return the latest nomad service discovery +// check status information of a given allocation. 
+type AllocChecksResponse struct { + structs.QueryMeta + Results map[structs.CheckID]*structs.CheckQueryResult +} + // AllocStatsRequest is used to request the resource usage of a given // allocation, potentially filtering by task type AllocStatsRequest struct { diff --git a/command/agent/alloc_endpoint.go b/command/agent/alloc_endpoint.go index d5bfdf151ccb..a714ebb7e7f6 100644 --- a/command/agent/alloc_endpoint.go +++ b/command/agent/alloc_endpoint.go @@ -84,6 +84,8 @@ func (s *HTTPServer) AllocSpecificRequest(resp http.ResponseWriter, req *http.Re } switch tokens[1] { + case "checks": + return s.allocChecks(allocID, resp, req) case "stop": return s.allocStop(allocID, resp, req) case "services": @@ -213,6 +215,8 @@ func (s *HTTPServer) ClientAllocRequest(resp http.ResponseWriter, req *http.Requ } allocID := tokens[0] switch tokens[1] { + case "checks": + return s.allocChecks(allocID, resp, req) case "stats": return s.allocStats(allocID, resp, req) case "exec": @@ -436,6 +440,39 @@ func (s *HTTPServer) allocStats(allocID string, resp http.ResponseWriter, req *h return reply.Stats, rpcErr } +func (s *HTTPServer) allocChecks(allocID string, resp http.ResponseWriter, req *http.Request) (any, error) { + // Build the request and parse the ACL token + args := cstructs.AllocChecksRequest{ + AllocID: allocID, + } + s.parse(resp, req, &args.QueryOptions.Region, &args.QueryOptions) + + // Determine the handler to use + useLocalClient, useClientRPC, useServerRPC := s.rpcHandlerForAlloc(allocID) + + // Make the RPC + var reply cstructs.AllocChecksResponse + var rpcErr error + switch { + case useLocalClient: + rpcErr = s.agent.Client().ClientRPC("Allocations.Checks", &args, &reply) + case useClientRPC: + rpcErr = s.agent.Client().RPC("Allocations.Checks", &args, &reply) + case useServerRPC: + rpcErr = s.agent.Server().RPC("Allocations.Checks", &args, &reply) + default: + rpcErr = CodedError(400, "No local Node and node_id not provided") + } + + if rpcErr != nil { + if structs.IsErrNoNodeConn(rpcErr) || structs.IsErrUnknownAllocation(rpcErr) { + rpcErr = CodedError(404, rpcErr.Error()) + } + } + + return reply.Results, rpcErr +} + func (s *HTTPServer) allocExec(allocID string, resp http.ResponseWriter, req *http.Request) (interface{}, error) { // Build the request and parse the ACL token task := req.URL.Query().Get("task") diff --git a/command/job_init.bindata_assetfs.go b/command/job_init.bindata_assetfs.go index f20461f32c50..bad35e0a339e 100644 --- a/command/job_init.bindata_assetfs.go +++ b/command/job_init.bindata_assetfs.go @@ -87,7 +87,7 @@ func commandAssetsConnectShortNomad() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "command/assets/connect-short.nomad", size: 1071, mode: os.FileMode(420), modTime: time.Unix(1654712611, 0)} + info := bindataFileInfo{name: "command/assets/connect-short.nomad", size: 1071, mode: os.FileMode(436), modTime: time.Unix(1656778128, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -107,7 +107,7 @@ func commandAssetsConnectNomad() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "command/assets/connect.nomad", size: 18126, mode: os.FileMode(420), modTime: time.Unix(1654712611, 0)} + info := bindataFileInfo{name: "command/assets/connect.nomad", size: 18126, mode: os.FileMode(436), modTime: time.Unix(1656778128, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -127,7 +127,7 @@ func commandAssetsExampleShortNomad() (*asset, error) { return nil, err } - info := bindataFileInfo{name: 
"command/assets/example-short.nomad", size: 369, mode: os.FileMode(420), modTime: time.Unix(1654554902, 0)} + info := bindataFileInfo{name: "command/assets/example-short.nomad", size: 369, mode: os.FileMode(436), modTime: time.Unix(1656778128, 0)} a := &asset{bytes: bytes, info: info} return a, nil } @@ -147,7 +147,7 @@ func commandAssetsExampleNomad() (*asset, error) { return nil, err } - info := bindataFileInfo{name: "command/assets/example.nomad", size: 16351, mode: os.FileMode(420), modTime: time.Unix(1654554902, 0)} + info := bindataFileInfo{name: "command/assets/example.nomad", size: 16351, mode: os.FileMode(436), modTime: time.Unix(1656778128, 0)} a := &asset{bytes: bytes, info: info} return a, nil } diff --git a/helper/boltdd/boltdd.go b/helper/boltdd/boltdd.go index 86273afd769c..5b492e097764 100644 --- a/helper/boltdd/boltdd.go +++ b/helper/boltdd/boltdd.go @@ -356,6 +356,8 @@ func (b *Bucket) Get(key []byte, obj interface{}) error { // Iterate iterates each key in Bucket b that starts with prefix. fn is called on // the key and msg-pack decoded value. If prefix is empty or nil, all keys in the // bucket are iterated. +// +// b must already exist. func Iterate[T any](b *Bucket, prefix []byte, fn func([]byte, T)) error { c := b.boltBucket.Cursor() for k, data := c.Seek(prefix); k != nil && bytes.HasPrefix(k, prefix); k, data = c.Next() { @@ -371,6 +373,8 @@ func Iterate[T any](b *Bucket, prefix []byte, fn func([]byte, T)) error { // DeletePrefix removes all keys starting with prefix from the bucket. If no keys // with prefix exist then nothing is done and a nil error is returned. Returns an // error if the bucket was created from a read-only transaction. +// +// b must already exist. func (b *Bucket) DeletePrefix(prefix []byte) error { c := b.boltBucket.Cursor() for k, _ := c.Seek(prefix); k != nil && bytes.HasPrefix(k, prefix); k, _ = c.Next() { diff --git a/helper/funcs.go b/helper/funcs.go index 665017222f22..d379a142de42 100644 --- a/helper/funcs.go +++ b/helper/funcs.go @@ -245,6 +245,8 @@ func SliceStringIsSubset(larger, smaller []string) (bool, []string) { } // SliceStringContains returns whether item exists at least once in list. +// +// Deprecated; use slices.Contains instead. func SliceStringContains(list []string, item string) bool { for _, s := range list { if s == item { diff --git a/nomad/client_alloc_endpoint.go b/nomad/client_alloc_endpoint.go index 92912f96a19b..ee57e137342d 100644 --- a/nomad/client_alloc_endpoint.go +++ b/nomad/client_alloc_endpoint.go @@ -10,14 +10,13 @@ import ( metrics "github.com/armon/go-metrics" log "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-msgpack/codec" + "github.com/hashicorp/nomad/acl" cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/helper" - - "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/nomad/structs" ) -// ClientAllocations is used to forward RPC requests to the targed Nomad client's +// ClientAllocations is used to forward RPC requests to the targeted Nomad client's // Allocation endpoint. type ClientAllocations struct { srv *Server diff --git a/nomad/mock/checks.go b/nomad/mock/checks.go new file mode 100644 index 000000000000..211edea22a0d --- /dev/null +++ b/nomad/mock/checks.go @@ -0,0 +1,30 @@ +package mock + +import ( + "github.com/hashicorp/nomad/nomad/structs" +) + +// CheckShim is a mock implementation of checkstore.Shim +// +// So far the implementation does nothing. 
+type CheckShim struct{} + +func (s *CheckShim) Set(allocID string, result *structs.CheckQueryResult) error { + return nil +} + +func (s *CheckShim) List(allocID string) map[structs.CheckID]*structs.CheckQueryResult { + return nil +} + +func (s *CheckShim) Difference(allocID string, ids []structs.CheckID) []structs.CheckID { + return nil +} + +func (s *CheckShim) Remove(allocID string, ids []structs.CheckID) error { + return nil +} + +func (s *CheckShim) Purge(allocID string) error { + return nil +} diff --git a/nomad/mock/network.go b/nomad/mock/network.go new file mode 100644 index 000000000000..ced50d73452e --- /dev/null +++ b/nomad/mock/network.go @@ -0,0 +1,19 @@ +package mock + +import ( + "github.com/hashicorp/nomad/nomad/structs" +) + +// NetworkStatus is a mock implementation of structs.NetworkStatus +type NetworkStatus struct { + address string +} + +// NewNetworkStatus creates a mock NetworkStatus based on address. +func NewNetworkStatus(address string) structs.NetworkStatus { + return &NetworkStatus{address: address} +} + +func (ns *NetworkStatus) NetworkStatus() *structs.AllocNetworkStatus { + return &structs.AllocNetworkStatus{Address: ns.address} +} diff --git a/nomad/structs/check_test.go b/nomad/structs/check_test.go new file mode 100644 index 000000000000..a562f62e4c7d --- /dev/null +++ b/nomad/structs/check_test.go @@ -0,0 +1,104 @@ +package structs + +import ( + "testing" + "time" + + "github.com/hashicorp/nomad/ci" + "github.com/shoenig/test/must" +) + +func TestChecks_NomadCheckID(t *testing.T) { + ci.Parallel(t) + + orig := ServiceCheck{ + Name: "c1", + Type: "http", + Path: "/health", + Protocol: "https", + PortLabel: "web", + AddressMode: "host", + Interval: 1 * time.Minute, + Timeout: 10 * time.Second, + Method: "GET", + TaskName: "t1", + OnUpdate: "ignore", + } + + different := func(a, b ServiceCheck) bool { + idA := NomadCheckID("id", "group", &a) + idB := NomadCheckID("id", "group", &b) + return idA != idB + } + + t.Run("same", func(t *testing.T) { + c := orig + must.False(t, different(orig, c)) + }) + + t.Run("different name", func(t *testing.T) { + c := orig + c.Name = "c2" + must.True(t, different(orig, c)) + }) + + t.Run("different type", func(t *testing.T) { + c := orig + c.Type = "tcp" + must.True(t, different(orig, c)) + }) + + t.Run("different path", func(t *testing.T) { + c := orig + c.Path = "/metrics" + must.True(t, different(orig, c)) + }) + + t.Run("different protocol", func(t *testing.T) { + c := orig + c.Protocol = "http" + must.True(t, different(orig, c)) + }) + + t.Run("different port label", func(t *testing.T) { + c := orig + c.PortLabel = "ingress" + must.True(t, different(orig, c)) + }) + + t.Run("different address mode", func(t *testing.T) { + c := orig + c.AddressMode = "bridge" + must.True(t, different(orig, c)) + }) + + t.Run("different interval", func(t *testing.T) { + c := orig + c.Interval = 1 * time.Second + must.True(t, different(orig, c)) + }) + + t.Run("different timeout", func(t *testing.T) { + c := orig + c.Timeout = 5 * time.Second + must.True(t, different(orig, c)) + }) + + t.Run("different method", func(t *testing.T) { + c := orig + c.Method = "POST" + must.True(t, different(orig, c)) + }) + + t.Run("different task", func(t *testing.T) { + c := orig + c.TaskName = "task2" + must.True(t, different(orig, c)) + }) + + t.Run("different on update", func(t *testing.T) { + c := orig + c.OnUpdate = "checks" + must.True(t, different(orig, c)) + }) +} diff --git a/nomad/structs/checks.go b/nomad/structs/checks.go new file mode 100644 
index 000000000000..35257d5bf10b --- /dev/null +++ b/nomad/structs/checks.go @@ -0,0 +1,92 @@ +package structs + +import ( + "crypto/md5" + "fmt" +) + +// The CheckMode of a Nomad check is either Healthiness or Readiness. +type CheckMode string + +const ( + // A Healthiness check is useful in the context of ensuring a service + // is capable of performing its duties. This is an indicator that a check's + // on_update configuration is set to "check_result", implying that Deployments + // will not move forward while the check is failing. + Healthiness CheckMode = "healthiness" + + // A Readiness check is useful in the context of ensuring a service + // should be performing its duties (regardless of healthiness). This is an + // indicator that the check's on_update configuration is set to "ignore", + // implying that Deployments will move forward regardless of whether the check + // is failing. + Readiness CheckMode = "readiness" +) + +// GetCheckMode determines whether the check is readiness or healthiness. +func GetCheckMode(c *ServiceCheck) CheckMode { + if c != nil && c.OnUpdate == "ignore" { + return Readiness + } + return Healthiness +} + +// A CheckID is unique to a check. +type CheckID string + +// A CheckQueryResult represents the outcome of a single execution of a Nomad service +// check. It records the result, the output, and when the execution took place. +// Advanced check math (e.g. success_before_passing) is left to the calling +// context. +type CheckQueryResult struct { + ID CheckID + Mode CheckMode + Status CheckStatus + StatusCode int `json:",omitempty"` + Output string + Timestamp int64 + + // check coordinates + Group string + Task string `json:",omitempty"` + Service string + Check string +} + +func (r *CheckQueryResult) String() string { + return fmt.Sprintf("(%s %s %s %v)", r.ID, r.Mode, r.Status, r.Timestamp) +} + +// A CheckStatus is the result of executing a check. The status of a query is +// ternary - success, failure, or pending (not yet executed). Deployments treat +// pending and failure as the same - a deployment does not continue until a check +// is passing (unless on_update=ignore). +type CheckStatus string + +const ( + CheckSuccess CheckStatus = "success" + CheckFailure CheckStatus = "failure" + CheckPending CheckStatus = "pending" +) + +// NomadCheckID returns an ID unique to the nomad service check. +// +// Checks of group-level services have no task. +func NomadCheckID(allocID, group string, c *ServiceCheck) CheckID { + sum := md5.New() + hashString(sum, allocID) + hashString(sum, group) + hashString(sum, c.TaskName) + hashString(sum, c.Name) + hashString(sum, c.Type) + hashString(sum, c.PortLabel) + hashString(sum, c.OnUpdate) + hashString(sum, c.AddressMode) + hashDuration(sum, c.Interval) + hashDuration(sum, c.Timeout) + hashString(sum, c.Protocol) + hashString(sum, c.Path) + hashString(sum, c.Method) + h := sum.Sum(nil) + return CheckID(fmt.Sprintf("%x", h)) +} diff --git a/nomad/structs/services.go b/nomad/structs/services.go index 93ac0c96c4d2..95cb98efe2ce 100644 --- a/nomad/structs/services.go +++ b/nomad/structs/services.go @@ -2,6 +2,7 @@ package structs import ( "crypto/sha1" + "encoding/binary" "errors" "fmt" "hash" @@ -19,6 +20,7 @@ import ( "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/args" "github.com/mitchellh/copystructure" + "golang.org/x/exp/slices" ) const ( @@ -41,7 +43,7 @@ const ( // ServiceCheck represents the Consul health check. 
type ServiceCheck struct { - Name string // Name of the check, defaults to id + Name string // Name of the check, defaults to a generated label Type string // Type of the check - tcp, http, docker and script Command string // Command is the command to run for script checks Args []string // Args is a list of arguments for script checks @@ -66,6 +68,12 @@ type ServiceCheck struct { OnUpdate string } +// IsReadiness returns whether the configuration of the ServiceCheck is effectively +// a readiness check - i.e. check failures do not affect a deployment. +func (sc *ServiceCheck) IsReadiness() bool { + return sc != nil && sc.OnUpdate == "ignore" +} + // Copy the stanza recursively. Returns nil if nil. func (sc *ServiceCheck) Copy() *ServiceCheck { if sc == nil { @@ -207,48 +215,49 @@ func (sc *ServiceCheck) Canonicalize(serviceName string) { } } -// validate a Service's ServiceCheck -func (sc *ServiceCheck) validate() error { - // Validate Type +// validateCommon validates the parts of ServiceCheck shared across providers. +func (sc *ServiceCheck) validateCommon(allowableTypes []string) error { + // validate the type is allowable (different between nomad, consul checks) checkType := strings.ToLower(sc.Type) + if !slices.Contains(allowableTypes, checkType) { + s := strings.Join(allowableTypes, ", ") + return fmt.Errorf(`invalid check type (%q), must be one of %s`, checkType, s) + } + + // validate specific check types switch checkType { - case ServiceCheckGRPC: - case ServiceCheckTCP: case ServiceCheckHTTP: if sc.Path == "" { - return fmt.Errorf("http type must have a valid http path") + return fmt.Errorf("http type must have http path") } - checkPath, err := url.Parse(sc.Path) - if err != nil { - return fmt.Errorf("http type must have a valid http path") + checkPath, pathErr := url.Parse(sc.Path) + if pathErr != nil { + return fmt.Errorf("http type must have valid http path") } if checkPath.IsAbs() { - return fmt.Errorf("http type must have a relative http path") + return fmt.Errorf("http type must have relative http path") } - case ServiceCheckScript: if sc.Command == "" { return fmt.Errorf("script type must have a valid script path") } - - default: - return fmt.Errorf(`invalid type (%+q), must be one of "http", "tcp", or "script" type`, sc.Type) } - // Validate interval and timeout + // validate interval if sc.Interval == 0 { return fmt.Errorf("missing required value interval. Interval cannot be less than %v", minCheckInterval) } else if sc.Interval < minCheckInterval { return fmt.Errorf("interval (%v) cannot be lower than %v", sc.Interval, minCheckInterval) } + // validate timeout if sc.Timeout == 0 { return fmt.Errorf("missing required value timeout. 
Timeout cannot be less than %v", minCheckInterval) } else if sc.Timeout < minCheckTimeout { return fmt.Errorf("timeout (%v) is lower than required minimum timeout %v", sc.Timeout, minCheckInterval) } - // Validate InitialStatus + // validate the initial status switch sc.InitialStatus { case "": case api.HealthPassing: @@ -256,10 +265,9 @@ func (sc *ServiceCheck) validate() error { case api.HealthCritical: default: return fmt.Errorf(`invalid initial check state (%s), must be one of %q, %q, %q or empty`, sc.InitialStatus, api.HealthPassing, api.HealthWarning, api.HealthCritical) - } - // Validate AddressMode + // validate address_mode switch sc.AddressMode { case "", AddressModeHost, AddressModeDriver, AddressModeAlloc: // Ok @@ -269,7 +277,7 @@ func (sc *ServiceCheck) validate() error { return fmt.Errorf("invalid address_mode %q", sc.AddressMode) } - // Validate OnUpdate + // validate on_update switch sc.OnUpdate { case "", OnUpdateIgnore, OnUpdateRequireHealthy, OnUpdateIgnoreWarn: // OK @@ -277,9 +285,101 @@ func (sc *ServiceCheck) validate() error { return fmt.Errorf("on_update must be %q, %q, or %q; got %q", OnUpdateRequireHealthy, OnUpdateIgnoreWarn, OnUpdateIgnore, sc.OnUpdate) } + // validate check_restart and on_update do not conflict + if sc.CheckRestart != nil { + // CheckRestart and OnUpdate Ignore are incompatible. If OnUpdate treats + // an error as healthy, and the deployment succeeds followed by check + // restart restarting failing checks, the deployment is left in an odd + // state. + if sc.OnUpdate == OnUpdateIgnore { + return fmt.Errorf("on_update value %q is not compatible with check_restart", sc.OnUpdate) + } + // CheckRestart IgnoreWarnings must be true if a check has defined OnUpdate + // ignore_warnings + if !sc.CheckRestart.IgnoreWarnings && sc.OnUpdate == OnUpdateIgnoreWarn { + return fmt.Errorf("on_update value %q not supported with check_restart ignore_warnings value %q", sc.OnUpdate, strconv.FormatBool(sc.CheckRestart.IgnoreWarnings)) + } + } + + // validate check_restart + if err := sc.CheckRestart.Validate(); err != nil { + return err + } + + return nil +} + +// validate a Service's ServiceCheck in the context of the Nomad provider. 
+func (sc *ServiceCheck) validateNomad() error { + allowable := []string{ServiceCheckTCP, ServiceCheckHTTP} + if err := sc.validateCommon(allowable); err != nil { + return err + } + + // expose is connect (consul) specific + if sc.Expose { + return fmt.Errorf("expose may only be set for Consul service checks") + } + + // nomad checks do not have warnings + if sc.OnUpdate == "ignore_warnings" { + return fmt.Errorf("on_update may only be set to ignore_warnings for Consul service checks") + } + + // below are temporary limitations on checks in nomad + // https://github.com/hashicorp/team-nomad/issues/354 + + // check_restart not yet supported on nomad + if sc.CheckRestart != nil { + return fmt.Errorf("check_restart may only be set for Consul service checks") + } + + // address_mode="driver" not yet supported on nomad + if sc.AddressMode == "driver" { + return fmt.Errorf("address_mode = driver may only be set for Consul service checks") + } + + if sc.Type == "http" { + if sc.Method != "GET" { + return fmt.Errorf("http checks may only use GET method in Nomad services") + } + + if len(sc.Header) > 0 { + return fmt.Errorf("http checks may not set headers in Nomad services") + } + + if len(sc.Body) > 0 { + return fmt.Errorf("http checks may not set Body in Nomad services") + } + } + + // success_before_passing is consul only + if sc.SuccessBeforePassing != 0 { + return fmt.Errorf("success_before_passing may only be set for Consul service checks") + } + + // failures_before_critical is consul only + if sc.FailuresBeforeCritical != 0 { + return fmt.Errorf("failures_before_critical may only be set for Consul service checks") + } + + return nil +} + +// validate a Service's ServiceCheck in the context of the Consul provider. +func (sc *ServiceCheck) validateConsul() error { + allowable := []string{ServiceCheckGRPC, ServiceCheckTCP, ServiceCheckHTTP, ServiceCheckScript} + if err := sc.validateCommon(allowable); err != nil { + return err + } + + checkType := strings.ToLower(sc.Type) + // Note that we cannot completely validate the Expose field yet - we do not // know whether this ServiceCheck belongs to a connect-enabled group-service. // Instead, such validation will happen in a job admission controller. + // + // Consul only. if sc.Expose { // We can however immediately ensure expose is configured only for HTTP // and gRPC checks. @@ -292,6 +392,8 @@ func (sc *ServiceCheck) validate() error { // passFailCheckTypes are intersection of check types supported by both Consul // and Nomad when using the pass/fail check threshold features. + // + // Consul only. 
passFailCheckTypes := []string{"tcp", "http", "grpc"} if sc.SuccessBeforePassing < 0 { @@ -306,23 +408,7 @@ func (sc *ServiceCheck) validate() error { return fmt.Errorf("failures_before_critical not supported for check of type %q", sc.Type) } - // Check that CheckRestart and OnUpdate do not conflict - if sc.CheckRestart != nil { - // CheckRestart and OnUpdate Ignore are incompatible If OnUpdate treats - // an error has healthy, and the deployment succeeds followed by check - // restart restarting erroring checks, the deployment is left in an odd - // state - if sc.OnUpdate == OnUpdateIgnore { - return fmt.Errorf("on_update value %q is not compatible with check_restart", sc.OnUpdate) - } - // CheckRestart IgnoreWarnings must be true if a check has defined OnUpdate - // ignore_warnings - if !sc.CheckRestart.IgnoreWarnings && sc.OnUpdate == OnUpdateIgnoreWarn { - return fmt.Errorf("on_update value %q not supported with check_restart ignore_warnings value %q", sc.OnUpdate, strconv.FormatBool(sc.CheckRestart.IgnoreWarnings)) - } - } - - return sc.CheckRestart.Validate() + return nil } // RequiresPort returns whether the service check requires the task has a port. @@ -401,6 +487,10 @@ func hashIntIfNonZero(h hash.Hash, name string, i int) { } } +func hashDuration(h hash.Hash, dur time.Duration) { + _ = binary.Write(h, binary.LittleEndian, dur) +} + func hashHeader(h hash.Hash, m map[string][]string) { // maintain backwards compatibility for ID stability // using the %v formatter on a map with string keys produces consistent @@ -621,14 +711,21 @@ func (s *Service) Validate() error { return mErr.ErrorOrNil() } +func (s *Service) validateCheckPort(c *ServiceCheck) error { + if s.PortLabel == "" && c.PortLabel == "" && c.RequiresPort() { + return fmt.Errorf("Check %s invalid: check requires a port but neither check nor service %+q have a port", c.Name, s.Name) + } + return nil +} + // validateConsulService performs validation on a service which is using the // consul provider. func (s *Service) validateConsulService(mErr *multierror.Error) { - // check checks for _, c := range s.Checks { - if s.PortLabel == "" && c.PortLabel == "" && c.RequiresPort() { - mErr.Errors = append(mErr.Errors, fmt.Errorf("Check %s invalid: check requires a port but neither check nor service %+q have a port", c.Name, s.Name)) + // validate the check port + if err := s.validateCheckPort(c); err != nil { + mErr.Errors = append(mErr.Errors, err) continue } @@ -640,7 +737,8 @@ func (s *Service) validateConsulService(mErr *multierror.Error) { continue } - if err := c.validate(); err != nil { + // validate the consul check + if err := c.validateConsul(); err != nil { mErr.Errors = append(mErr.Errors, fmt.Errorf("Check %s invalid: %v", c.Name, err)) } } @@ -662,12 +760,18 @@ func (s *Service) validateConsulService(mErr *multierror.Error) { // validateNomadService performs validation on a service which is using the // nomad provider. func (s *Service) validateNomadService(mErr *multierror.Error) { + // check checks + for _, c := range s.Checks { + // validate the check port + if err := s.validateCheckPort(c); err != nil { + mErr.Errors = append(mErr.Errors, err) + continue + } - // Service blocks for the Nomad provider do not support checks. We perform - // a nil check, as an empty check list is nilled within the service - // canonicalize function. 
- if s.Checks != nil { - mErr.Errors = append(mErr.Errors, errors.New("Service with provider nomad cannot include Check blocks")) + // validate the nomad check + if err := c.validateNomad(); err != nil { + mErr.Errors = append(mErr.Errors, err) + } } // Services using the Nomad provider do not support Consul connect. diff --git a/nomad/structs/services_test.go b/nomad/structs/services_test.go index 1044352c1104..97b225dc4b04 100644 --- a/nomad/structs/services_test.go +++ b/nomad/structs/services_test.go @@ -8,6 +8,7 @@ import ( "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/helper" + "github.com/shoenig/test/must" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -68,7 +69,7 @@ func TestServiceCheck_validate_PassingTypes(t *testing.T) { Interval: 1 * time.Second, Timeout: 2 * time.Second, SuccessBeforePassing: 3, - }).validate() + }).validateConsul() require.NoError(t, err) } }) @@ -81,7 +82,7 @@ func TestServiceCheck_validate_PassingTypes(t *testing.T) { Interval: 1 * time.Second, Timeout: 2 * time.Second, SuccessBeforePassing: 3, - }).validate() + }).validateConsul() require.EqualError(t, err, `success_before_passing not supported for check of type "script"`) }) } @@ -98,7 +99,7 @@ func TestServiceCheck_validate_FailingTypes(t *testing.T) { Interval: 1 * time.Second, Timeout: 2 * time.Second, FailuresBeforeCritical: 3, - }).validate() + }).validateConsul() require.NoError(t, err) } }) @@ -112,7 +113,7 @@ func TestServiceCheck_validate_FailingTypes(t *testing.T) { Timeout: 2 * time.Second, SuccessBeforePassing: 0, FailuresBeforeCritical: 3, - }).validate() + }).validateConsul() require.EqualError(t, err, `failures_before_critical not supported for check of type "script"`) }) } @@ -129,7 +130,7 @@ func TestServiceCheck_validate_PassFailZero_on_scripts(t *testing.T) { Timeout: 2 * time.Second, SuccessBeforePassing: 0, // script checks should still pass validation FailuresBeforeCritical: 0, // script checks should still pass validation - }).validate() + }).validateConsul() require.NoError(t, err) }) } @@ -150,7 +151,7 @@ func TestServiceCheck_validate_OnUpdate_CheckRestart_Conflict(t *testing.T) { Grace: 5 * time.Second, }, OnUpdate: "ignore_warnings", - }).validate() + }).validateConsul() require.EqualError(t, err, `on_update value "ignore_warnings" not supported with check_restart ignore_warnings value "false"`) }) @@ -167,7 +168,7 @@ func TestServiceCheck_validate_OnUpdate_CheckRestart_Conflict(t *testing.T) { Grace: 5 * time.Second, }, OnUpdate: "ignore", - }).validate() + }).validateConsul() require.EqualError(t, err, `on_update value "ignore" is not compatible with check_restart`) }) @@ -184,11 +185,129 @@ func TestServiceCheck_validate_OnUpdate_CheckRestart_Conflict(t *testing.T) { Grace: 5 * time.Second, }, OnUpdate: "ignore_warnings", - }).validate() + }).validateConsul() require.NoError(t, err) }) } +func TestServiceCheck_validateNomad(t *testing.T) { + ci.Parallel(t) + + testCases := []struct { + name string + sc *ServiceCheck + exp string + }{ + {name: "grpc", sc: &ServiceCheck{Type: ServiceCheckGRPC}, exp: `invalid check type ("grpc"), must be one of tcp, http`}, + {name: "script", sc: &ServiceCheck{Type: ServiceCheckScript}, exp: `invalid check type ("script"), must be one of tcp, http`}, + { + name: "expose", + sc: &ServiceCheck{ + Type: ServiceCheckTCP, + Expose: true, // consul only + Interval: 3 * time.Second, + Timeout: 1 * time.Second, + }, + exp: `expose may only be set for Consul service 
checks`, + }, { + name: "on_update ignore_warnings", + sc: &ServiceCheck{ + Type: ServiceCheckTCP, + Interval: 3 * time.Second, + Timeout: 1 * time.Second, + OnUpdate: "ignore_warnings", + }, + exp: `on_update may only be set to ignore_warnings for Consul service checks`, + }, + { + name: "success_before_passing", + sc: &ServiceCheck{ + Type: ServiceCheckTCP, + SuccessBeforePassing: 3, // consul only + Interval: 3 * time.Second, + Timeout: 1 * time.Second, + }, + exp: `success_before_passing may only be set for Consul service checks`, + }, + { + name: "failures_before_critical", + sc: &ServiceCheck{ + Type: ServiceCheckTCP, + FailuresBeforeCritical: 3, // consul only + Interval: 3 * time.Second, + Timeout: 1 * time.Second, + }, + exp: `failures_before_critical may only be set for Consul service checks`, + }, + { + name: "check_restart", + sc: &ServiceCheck{ + Type: ServiceCheckTCP, + Interval: 3 * time.Second, + Timeout: 1 * time.Second, + CheckRestart: new(CheckRestart), + }, + exp: `check_restart may only be set for Consul service checks`, + }, + { + name: "address mode driver", + sc: &ServiceCheck{ + Type: ServiceCheckTCP, + Interval: 3 * time.Second, + Timeout: 1 * time.Second, + AddressMode: "driver", + }, + exp: `address_mode = driver may only be set for Consul service checks`, + }, + { + name: "http non GET", + sc: &ServiceCheck{ + Type: ServiceCheckHTTP, + Interval: 3 * time.Second, + Timeout: 1 * time.Second, + Path: "/health", + Method: "HEAD", + }, + exp: `http checks may only use GET method in Nomad services`, + }, + { + name: "http with headers", + sc: &ServiceCheck{ + Type: ServiceCheckHTTP, + Interval: 3 * time.Second, + Timeout: 1 * time.Second, + Path: "/health", + Method: "GET", + Header: map[string][]string{"foo": {"bar"}}, + }, + exp: `http checks may not set headers in Nomad services`, + }, + { + name: "http with body", + sc: &ServiceCheck{ + Type: ServiceCheckHTTP, + Interval: 3 * time.Second, + Timeout: 1 * time.Second, + Path: "/health", + Method: "GET", + Body: "blah", + }, + exp: `http checks may not set Body in Nomad services`, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.name, func(t *testing.T) { + err := testCase.sc.validateNomad() + if testCase.exp == "" { + must.NoError(t, err) + } else { + must.EqError(t, err, testCase.exp) + } + }) + } +} + func TestService_Hash(t *testing.T) { ci.Parallel(t) @@ -1551,6 +1670,30 @@ func TestService_Validate(t *testing.T) { expErrStr: "Consul Connect must be exclusively native", name: "Native Connect with Sidecar", }, + { + input: &Service{ + Name: "testservice", + Provider: "nomad", + PortLabel: "port", + Checks: []*ServiceCheck{ + { + Name: "servicecheck", + Type: "http", + Path: "/", + Interval: 1 * time.Second, + Timeout: 3 * time.Second, + }, + { + Name: "servicecheck", + Type: "tcp", + Interval: 1 * time.Second, + Timeout: 3 * time.Second, + }, + }, + }, + expErr: false, + name: "provider nomad with checks", + }, { input: &Service{ Name: "testservice", @@ -1558,12 +1701,12 @@ func TestService_Validate(t *testing.T) { Checks: []*ServiceCheck{ { Name: "servicecheck", + Type: "script", }, }, }, - expErr: true, - expErrStr: "Service with provider nomad cannot include Check blocks", - name: "provider nomad with checks", + expErr: true, + name: "provider nomad with invalid check type", }, { input: &Service{ @@ -1602,6 +1745,8 @@ func TestService_Validate(t *testing.T) { } func TestService_Validate_Address(t *testing.T) { + ci.Parallel(t) + try := func(mode, advertise string, exp error) { s := 
&Service{Name: "s1", Provider: "consul", AddressMode: mode, Address: advertise} result := s.Validate() @@ -1711,13 +1856,17 @@ func TestService_validateNomadService(t *testing.T) { PortLabel: "http", Namespace: "default", Provider: "nomad", - Checks: []*ServiceCheck{ - {Name: "some-check"}, - }, + Checks: []*ServiceCheck{{ + Name: "webapp", + Type: ServiceCheckHTTP, + Path: "/health", + Interval: 3 * time.Second, + Timeout: 1 * time.Second, + }}, }, inputErr: &multierror.Error{}, - expectedOutputErrors: []error{errors.New("Service with provider nomad cannot include Check blocks")}, - name: "invalid service due to checks", + expectedOutputErrors: []error{}, + name: "valid service with checks", }, { inputService: &Service{ @@ -1739,19 +1888,15 @@ func TestService_validateNomadService(t *testing.T) { PortLabel: "http", Namespace: "default", Provider: "nomad", - Connect: &ConsulConnect{ - Native: true, - }, Checks: []*ServiceCheck{ {Name: "some-check"}, }, }, inputErr: &multierror.Error{}, expectedOutputErrors: []error{ - errors.New("Service with provider nomad cannot include Check blocks"), - errors.New("Service with provider nomad cannot include Connect blocks"), + errors.New(`invalid check type (""), must be one of tcp, http`), }, - name: "invalid service due to checks and connect", + name: "bad nomad check", }, } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 9ce2fd72d45e..d93ae43c7fb0 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -25,9 +25,6 @@ import ( "time" jwt "github.com/golang-jwt/jwt/v4" - "github.com/hashicorp/nomad/helper/escapingfs" - "golang.org/x/crypto/blake2b" - "github.com/hashicorp/cronexpr" "github.com/hashicorp/go-msgpack/codec" "github.com/hashicorp/go-multierror" @@ -38,12 +35,14 @@ import ( "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/args" "github.com/hashicorp/nomad/helper/constraints/semver" + "github.com/hashicorp/nomad/helper/escapingfs" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/lib/cpuset" "github.com/hashicorp/nomad/lib/kheap" psstructs "github.com/hashicorp/nomad/plugins/shared/structs" "github.com/miekg/dns" "github.com/mitchellh/copystructure" + "golang.org/x/crypto/blake2b" ) var ( @@ -4818,7 +4817,7 @@ func (jc *JobChildrenSummary) Copy() *JobChildrenSummary { return njc } -// TaskGroup summarizes the state of all the allocations of a particular +// TaskGroupSummary summarizes the state of all the allocations of a particular // TaskGroup type TaskGroupSummary struct { Queued int @@ -4909,9 +4908,9 @@ func (u *UpdateStrategy) Copy() *UpdateStrategy { return nil } - copy := new(UpdateStrategy) - *copy = *u - return copy + c := new(UpdateStrategy) + *c = *u + return c } func (u *UpdateStrategy) Validate() error { @@ -6314,6 +6313,25 @@ func (tg *TaskGroup) Canonicalize(job *Job) { } } +// NomadServices returns a list of all group and task - level services in tg that +// are making use of the nomad service provider. 
+func (tg *TaskGroup) NomadServices() []*Service { + var services []*Service + for _, service := range tg.Services { + if service.Provider == ServiceProviderNomad { + services = append(services, service) + } + } + for _, task := range tg.Tasks { + for _, service := range task.Services { + if service.Provider == ServiceProviderNomad { + services = append(services, service) + } + } + } + return services +} + // Validate is used to check a task group for reasonable configuration func (tg *TaskGroup) Validate(j *Job) error { var mErr multierror.Error @@ -7817,7 +7835,7 @@ func (wc *WaitConfig) Validate() error { return nil } -// AllocState records a single event that changes the state of the whole allocation +// AllocStateField records a single event that changes the state of the whole allocation type AllocStateField uint8 const ( @@ -10652,6 +10670,12 @@ func (a *AllocNetworkStatus) Copy() *AllocNetworkStatus { } } +// NetworkStatus is an interface satisfied by alloc runner, for acquiring the +// network status of an allocation. +type NetworkStatus interface { + NetworkStatus() *AllocNetworkStatus +} + // AllocDeploymentStatus captures the status of the allocation as part of the // deployment. This can include things like if the allocation has been marked as // healthy. diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 3765016706c9..02d0fe1be373 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -1999,7 +1999,7 @@ func TestTask_Validate_Service_Check(t *testing.T) { Interval: 10 * time.Second, } - err := invalidCheck.validate() + err := invalidCheck.validateConsul() if err == nil || !strings.Contains(err.Error(), "Timeout cannot be less") { t.Fatalf("expected a timeout validation error but received: %q", err) } @@ -2011,12 +2011,12 @@ func TestTask_Validate_Service_Check(t *testing.T) { Timeout: 2 * time.Second, } - if err := check1.validate(); err != nil { + if err := check1.validateConsul(); err != nil { t.Fatalf("err: %v", err) } check1.InitialStatus = "foo" - err = check1.validate() + err = check1.validateConsul() if err == nil { t.Fatal("Expected an error") } @@ -2026,19 +2026,19 @@ func TestTask_Validate_Service_Check(t *testing.T) { } check1.InitialStatus = api.HealthCritical - err = check1.validate() + err = check1.validateConsul() if err != nil { t.Fatalf("err: %v", err) } check1.InitialStatus = api.HealthPassing - err = check1.validate() + err = check1.validateConsul() if err != nil { t.Fatalf("err: %v", err) } check1.InitialStatus = "" - err = check1.validate() + err = check1.validateConsul() if err != nil { t.Fatalf("err: %v", err) } @@ -2051,22 +2051,22 @@ func TestTask_Validate_Service_Check(t *testing.T) { Path: "/foo/bar", } - err = check2.validate() + err = check2.validateConsul() if err != nil { t.Fatalf("err: %v", err) } check2.Path = "" - err = check2.validate() + err = check2.validateConsul() if err == nil { t.Fatal("Expected an error") } - if !strings.Contains(err.Error(), "valid http path") { + if !strings.Contains(err.Error(), "http type must have http path") { t.Fatalf("err: %v", err) } check2.Path = "http://www.example.com" - err = check2.validate() + err = check2.validateConsul() if err == nil { t.Fatal("Expected an error") } @@ -2082,7 +2082,7 @@ func TestTask_Validate_Service_Check(t *testing.T) { Timeout: 1 * time.Second, Path: "/health", Expose: true, - }).validate()) + }).validateConsul()) }) t.Run("type tcp", func(t *testing.T) { require.EqualError(t, (&ServiceCheck{ @@ -2090,7 +2090,7 @@ func 
TestTask_Validate_Service_Check(t *testing.T) { Interval: 1 * time.Second, Timeout: 1 * time.Second, Expose: true, - }).validate(), "expose may only be set on HTTP or gRPC checks") + }).validateConsul(), "expose may only be set on HTTP or gRPC checks") }) }) }
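
Reviewer note: taken together, the Allocations.Checks RPC and the "checks" route added in command/agent/alloc_endpoint.go expose per-allocation Nomad service check results over the agent HTTP API. The sketch below shows how a consumer might read that map; it is not part of this change set. The path (/v1/client/allocation/<alloc-id>/checks) is inferred from the ClientAllocRequest switch above, 127.0.0.1:4646 is the default agent address, and the allocation ID is a placeholder. The result fields mirror structs.CheckQueryResult.

// checks_example.go - hedged sketch, not part of this patch.
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

// checkResult mirrors the JSON shape of structs.CheckQueryResult as returned
// by the allocChecks HTTP handler (reply.Results: check ID -> result).
type checkResult struct {
	ID        string
	Mode      string
	Status    string
	Output    string
	Timestamp int64
	Group     string
	Task      string
	Service   string
	Check     string
}

func main() {
	// Placeholder allocation ID; substitute a real one from `nomad job status`.
	allocID := "11111111-2222-3333-4444-555555555555"
	url := fmt.Sprintf("http://127.0.0.1:4646/v1/client/allocation/%s/checks", allocID)

	resp, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The handler returns the Results map directly, keyed by check ID.
	var results map[string]*checkResult
	if err := json.NewDecoder(resp.Body).Decode(&results); err != nil {
		panic(err)
	}

	for id, r := range results {
		fmt.Printf("%s: %s/%s service=%s check=%s output=%q\n",
			id, r.Mode, r.Status, r.Service, r.Check, r.Output)
	}
}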