Skip to content

Commit

Permalink
services: always set deregister flag after deregistration of group
Browse files Browse the repository at this point in the history
This PR fixes a bug where the group service hook's deregister flag was
not set in some cases, causing the hook to attempt deregistrations twice
during job updates (alloc replacement).
  • Loading branch information
shoenig committed Mar 16, 2023
1 parent 8684183 commit 6507686
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 26 deletions.
3 changes: 3 additions & 0 deletions .changelog/16289.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
services: Fixed a bug where a service would be deregistered twice
```
59 changes: 34 additions & 25 deletions client/allocrunner/group_service_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"sync"
"time"

log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/client/serviceregistration"
"github.com/hashicorp/nomad/client/serviceregistration/wrapper"
Expand Down Expand Up @@ -39,7 +39,7 @@ type groupServiceHook struct {
// and check registration and deregistration.
serviceRegWrapper *wrapper.HandlerWrapper

logger log.Logger
logger hclog.Logger

// The following fields may be updated
canary bool
Expand All @@ -60,7 +60,7 @@ type groupServiceHookConfig struct {
taskEnvBuilder *taskenv.Builder
networkStatus structs.NetworkStatus
shutdownDelayCtx context.Context
logger log.Logger
logger hclog.Logger

// providerNamespace is the Nomad or Consul namespace in which service
// registrations will be made.
Expand Down Expand Up @@ -118,23 +118,26 @@ func (h *groupServiceHook) Prerun() error {
h.prerun = true
h.mu.Unlock()
}()
return h.prerunLocked()
return h.preRunLocked()
}

func (h *groupServiceHook) prerunLocked() error {
// caller must hold h.lock
func (h *groupServiceHook) preRunLocked() error {
if len(h.services) == 0 {
return nil
}

services := h.getWorkloadServices()
services := h.getWorkloadServicesLocked()
return h.serviceRegWrapper.RegisterWorkload(services)
}

// Update is run when a job submitter modifies service(s) (but not much else -
// otherwise a full alloc replacement would occur).
func (h *groupServiceHook) Update(req *interfaces.RunnerUpdateRequest) error {
h.mu.Lock()
defer h.mu.Unlock()

oldWorkloadServices := h.getWorkloadServices()
oldWorkloadServices := h.getWorkloadServicesLocked()

// Store new updated values out of request
canary := false
Expand Down Expand Up @@ -166,7 +169,7 @@ func (h *groupServiceHook) Update(req *interfaces.RunnerUpdateRequest) error {
h.providerNamespace = req.Alloc.ServiceProviderNamespace()

// Create new task services struct with those new values
newWorkloadServices := h.getWorkloadServices()
newWorkloadServices := h.getWorkloadServicesLocked()

if !h.prerun {
// Update called before Prerun. Update alloc and exit to allow
Expand All @@ -186,21 +189,20 @@ func (h *groupServiceHook) PreTaskRestart() error {
}()

h.preKillLocked()
return h.prerunLocked()
return h.preRunLocked()
}

func (h *groupServiceHook) PreKill() {
h.mu.Lock()
defer h.mu.Unlock()
h.preKillLocked()
helper.WithLock(&h.mu, h.preKillLocked)
}

// implements the PreKill hook but requires the caller hold the lock
// implements the PreKill hook
//
// caller must hold h.lock
func (h *groupServiceHook) preKillLocked() {
// If we have a shutdown delay deregister group services and then wait
// before continuing to kill tasks.
h.deregister()
h.deregistered = true
h.deregisterLocked()

if h.delay == 0 {
return
Expand All @@ -220,24 +222,31 @@ func (h *groupServiceHook) preKillLocked() {
}

func (h *groupServiceHook) Postrun() error {
h.mu.Lock()
defer h.mu.Unlock()

if !h.deregistered {
h.deregister()
}
helper.WithLock(&h.mu, h.deregisterLocked)
return nil
}

// deregister services from Consul/Nomad service provider.
func (h *groupServiceHook) deregister() {
// deregisterLocked will deregister services from Consul/Nomad service provider.
//
// caller must hold h.lock
func (h *groupServiceHook) deregisterLocked() {
if h.deregistered {
return
}

if len(h.services) > 0 {
workloadServices := h.getWorkloadServices()
workloadServices := h.getWorkloadServicesLocked()
h.serviceRegWrapper.RemoveWorkload(workloadServices)
}

h.deregistered = true
}

func (h *groupServiceHook) getWorkloadServices() *serviceregistration.WorkloadServices {
// getWorkloadServicesLocked returns the set of workload services currently
// on the hook.
//
// caller must hold h.lock
func (h *groupServiceHook) getWorkloadServicesLocked() *serviceregistration.WorkloadServices {
// Interpolate with the task's environment
interpolatedServices := taskenv.InterpolateServices(h.taskEnvBuilder.Build(), h.services)

Expand Down
2 changes: 1 addition & 1 deletion client/allocrunner/group_service_hook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,6 @@ func TestGroupServiceHook_getWorkloadServices(t *testing.T) {
logger: logger,
})

services := h.getWorkloadServices()
services := h.getWorkloadServicesLocked()
require.Len(t, services.Services, 1)
}
2 changes: 2 additions & 0 deletions client/serviceregistration/nsd/nsd.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ func (s *ServiceRegistrationHandler) removeWorkload(
defer wg.Done()

// Stop check watcher
//
// todo(shoenig) - shouldn't we only unwatch checks for the given serviceSpec ?
for _, service := range workload.Services {
for _, check := range service.Checks {
checkID := string(structs.NomadCheckID(workload.AllocInfo.AllocID, workload.AllocInfo.Group, check))
Expand Down

0 comments on commit 6507686

Please sign in to comment.