Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

services: un-mark group services as deregistered if restart hook runs #16905

Merged
merged 4 commits into from
Apr 24, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/16905.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
services: Fixed a bug preventing group service deregistrations after alloc restarts
```
8 changes: 6 additions & 2 deletions client/allocrunner/group_service_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,14 @@ func (h *groupServiceHook) Prerun() error {
defer func() {
// Mark prerun as true to unblock Updates
h.prerun = true
// Mark deregistered as false to allow de-registration
h.deregistered = false
h.mu.Unlock()
}()
return h.preRunLocked()
}

// caller must hold h.lock
// caller must hold h.mu
func (h *groupServiceHook) preRunLocked() error {
if len(h.services) == 0 {
return nil
Expand Down Expand Up @@ -188,6 +190,8 @@ func (h *groupServiceHook) PreTaskRestart() error {
defer func() {
// Mark prerun as true to unblock Updates
h.prerun = true
// Mark deregistered as false to allow de-registration
h.deregistered = false
h.mu.Unlock()
}()

Expand All @@ -201,7 +205,7 @@ func (h *groupServiceHook) PreKill() {

// implements the PreKill hook
//
// caller must hold h.lock
// caller must hold h.mu
func (h *groupServiceHook) preKillLocked() {
// If we have a shutdown delay deregister group services and then wait
// before continuing to kill tasks.
Expand Down
59 changes: 59 additions & 0 deletions e2e/consul/alloc_restart_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package consul

import (
"context"
"testing"

"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/shoenig/test/must"
)

func testAllocRestart(t *testing.T) {
nc := e2eutil.NomadClient(t)
cc := e2eutil.ConsulClient(t).Catalog()

const jobFile = "./input/alloc_restart.hcl"
jobID := "alloc-restart-" + uuid.Short()
jobIDs := []string{jobID}

ctx, cancel := context.WithCancel(context.Background())
defer cancel()
defer e2eutil.CleanupJobsAndGCWithContext(t, ctx, &jobIDs)

// register our job
err := e2eutil.Register(jobID, jobFile)
must.NoError(t, err)

// wait for our allocation to be running
allocID := e2eutil.SingleAllocID(t, jobID, "", 0)
e2eutil.WaitForAllocRunning(t, nc, allocID)

// make sure our service is registered
services, _, err := cc.Service("alloc-restart-http", "", nil)
must.NoError(t, err)
must.Len(t, 1, services)

// restart the alloc
stderr, err := e2eutil.Command("nomad", "alloc", "restart", allocID)
must.NoError(t, err, must.Sprintf("stderr: %s", stderr))

// wait for alloc running again
e2eutil.WaitForAllocRunning(t, nc, allocID)

// make sure our service is still registered
services, _, err = cc.Service("alloc-restart-http", "", nil)
must.NoError(t, err)
must.Len(t, 1, services)

err = e2eutil.StopJob(jobID)
must.NoError(t, err)

// make sure our service is no longer registered
services, _, err = cc.Service("alloc-restart-http", "", nil)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty sure by the time the alloc is marked running the service will be registered. But service deregistration isn't synchronous with the Job.Deregister RPC, right? I think we need to must.Wait for this API call.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, will add that

must.NoError(t, err)
must.SliceEmpty(t, services)
}
19 changes: 19 additions & 0 deletions e2e/consul/consul_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package consul

import (
"testing"

"github.com/hashicorp/nomad/e2e/e2eutil"
)

func TestConsul(t *testing.T) {
// todo: migrate the remaining consul tests

nomad := e2eutil.NomadClient(t)

e2eutil.WaitForLeader(t, nomad)
e2eutil.WaitForNodesReady(t, nomad, 1)

t.Run("testServiceReversion", testServiceReversion)
t.Run("testAllocRestart", testAllocRestart)
}
33 changes: 33 additions & 0 deletions e2e/consul/input/alloc_restart.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
job "alloc-restart" {
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}

group "group" {
network {
port "http" {
to = 8080
}
}

service {
name = "alloc-restart-http"
port = "http"
}

task "python" {
driver = "raw_exec"

config {
command = "python3"
args = ["-m", "http.server", "8080", "--directory", "/tmp"]
}

resources {
cpu = 16
memory = 32
}
}
}
}
11 changes: 0 additions & 11 deletions e2e/consul/service_revert_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,6 @@ import (
"github.com/shoenig/test/must"
)

func TestConsul(t *testing.T) {
// todo: migrate the remaining consul tests

nomad := e2eutil.NomadClient(t)

e2eutil.WaitForLeader(t, nomad)
e2eutil.WaitForNodesReady(t, nomad, 1)

t.Run("testServiceReversion", testServiceReversion)
}

// testServiceReversion asserts we can
// - submit a job with a service
// - update that job and modify service
Expand Down