E2E tests for node drain
* re-enable the previously disabled tests
* port off the old framework to use the stdlib runner
tgross committed Apr 12, 2023
1 parent 8ad02b7 · commit c81e7b9
Showing 4 changed files with 75 additions and 189 deletions.
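For orientation before the diff: tests on the stdlib runner are plain Go subtests, registered from a single exported test function and selectable with go test -run. The sketch below is only a distillation of the shape visible in the changes that follow; the real tests use the e2eutil API client and cleanup helpers, which are elided here.

package nodedrain

import "testing"

// TestNodeDrain is the single entry point; each scenario is an ordinary
// subtest, so one case can be selected with, for example:
//
//	go test ./e2e/nodedrain/ -run 'TestNodeDrain/DeadlineFlag'
func TestNodeDrain(t *testing.T) {
	t.Run("DeadlineFlag", testDeadlineFlag)
	t.Run("ForceFlag", testForceFlag)
}

func testDeadlineFlag(t *testing.T) {
	t.Cleanup(func() { /* undo drain state, stop test jobs */ })
	// register a job, drain its node with -deadline, then assert that a
	// replacement allocation appears within the expected window
}

func testForceFlag(t *testing.T) {
	t.Cleanup(func() { /* undo drain state, stop test jobs */ })
	// same flow, but with -force so allocations are killed immediately
}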
e2e/e2e_test.go: 2 changes (1 addition, 1 deletion)
@@ -24,7 +24,6 @@ import (
_ "github.com/hashicorp/nomad/e2e/lifecycle"
_ "github.com/hashicorp/nomad/e2e/metrics"
_ "github.com/hashicorp/nomad/e2e/networking"
_ "github.com/hashicorp/nomad/e2e/nodedrain"
_ "github.com/hashicorp/nomad/e2e/nomadexec"
_ "github.com/hashicorp/nomad/e2e/oversubscription"
_ "github.com/hashicorp/nomad/e2e/parameterized"
@@ -45,6 +44,7 @@ import (
// we get a quick check that they compile on every commit
_ "github.com/hashicorp/nomad/e2e/disconnectedclients"
_ "github.com/hashicorp/nomad/e2e/namespaces"
_ "github.com/hashicorp/nomad/e2e/nodedrain"
_ "github.com/hashicorp/nomad/e2e/volumes"
)

e2e/nodedrain/input/drain_deadline.nomad: 1 change (0 additions, 1 deletion)
@@ -2,7 +2,6 @@
# SPDX-License-Identifier: MPL-2.0

job "drain_deadline" {
datacenters = ["dc1", "dc2"]

constraint {
attribute = "${attr.kernel.name}"
e2e/nodedrain/node_drain_test.go: 78 changes (74 additions, 4 deletions)
@@ -30,6 +30,8 @@ func TestNodeDrain(t *testing.T) {
t.Run("IgnoreSystem", testIgnoreSystem)
t.Run("EphemeralMigrate", testEphemeralMigrate)
t.Run("KeepIneligible", testKeepIneligible)
t.Run("DeadlineFlag", testDeadlineFlag)
t.Run("ForceFlag", testForceFlag)
}

// testIgnoreSystem tests that system jobs are left behind when the
@@ -67,7 +69,8 @@ func testIgnoreSystem(t *testing.T) {
must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))

// The service job should be drained
newAllocs := waitForAllocDrain(t, nomadClient, serviceJobID, oldAllocID, oldNodeID)
newAllocs := waitForAllocDrain(t, nomadClient, serviceJobID,
oldAllocID, oldNodeID, time.Second*120)
must.Len(t, 1, newAllocs, must.Sprint("expected 1 new service job alloc"))

// The system job should not have been drained
@@ -117,7 +120,8 @@ func testEphemeralMigrate(t *testing.T) {
out, err := e2eutil.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", oldNodeID)
must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))

newAllocs := waitForAllocDrain(t, nomadClient, jobID, oldAllocID, oldNodeID)
newAllocs := waitForAllocDrain(t, nomadClient, jobID,
oldAllocID, oldNodeID, time.Second*120)
must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))
newAllocID := newAllocs[0].ID
newNodeID := newAllocs[0].NodeID
@@ -176,6 +180,72 @@ func testKeepIneligible(t *testing.T) {
}
}

// testDeadlineFlag tests the enforcement of the node drain deadline so
// that allocations are terminated even if they haven't gracefully exited.
func testDeadlineFlag(t *testing.T) {

nomadClient := e2eutil.NomadClient(t)
t.Cleanup(cleanupDrainState(t))

jobID := "test-node-drain-" + uuid.Generate()[0:8]

allocs := registerAndWaitForRunning(t, nomadClient, jobID, "./input/drain_deadline.nomad", 1)
t.Cleanup(cleanupJobState(t, jobID))
oldAllocID := allocs[0].ID
oldNodeID := allocs[0].NodeID

t.Logf("draining node %v", oldNodeID)
out, err := e2eutil.Command(
"nomad", "node", "drain",
"-deadline", "5s",
"-enable", "-yes", "-detach", oldNodeID)
must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID, err, out))

// the job's kill_timeout is 2m, but our drain deadline is 5s, so we expect
// the allocation to be force-killed after 5s. But we can't guarantee that
// it's instantly terminated at 5s because we have to wait for the client's
// Client.GetAllocs and Node.UpdateAlloc calls to be made. So set a 30s
// timeout on this test to give us plenty of time to finish up but still be
// well under the 2m kill_timeout.
newAllocs := waitForAllocDrain(t, nomadClient, jobID,
oldAllocID, oldNodeID, time.Second*30)
must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))
}

// testForceFlag tests the enforcement of the node drain -force flag so that
// allocations are terminated immediately.
func testForceFlag(t *testing.T) {

nomadClient := e2eutil.NomadClient(t)
t.Cleanup(cleanupDrainState(t))

jobID := "test-node-drain-" + uuid.Generate()[0:8]
must.NoError(t, e2eutil.Register(jobID, "./input/drain_deadline.nomad"))
t.Cleanup(cleanupJobState(t, jobID))

allocs := registerAndWaitForRunning(t, nomadClient, jobID, "./input/drain_deadline.nomad", 1)
t.Cleanup(cleanupJobState(t, jobID))
oldAllocID := allocs[0].ID
oldNodeID := allocs[0].NodeID

t.Logf("draining node %v", oldNodeID)
out, err := e2eutil.Command(
"nomad", "node", "drain",
"-force",
"-enable", "-yes", "-detach", oldNodeID)
must.NoError(t, err, must.Sprintf("'nomad node drain' failed: %v\n%v", err, out))

// the job's kill_timeout is 2m, but we've passed -force, so we expect
// the allocation to be immediately force-killed. But we can't guarantee that
// it's instantly terminated because we have to wait for the client's
// Client.GetAllocs and Node.UpdateAlloc calls to be made. So set a 30s
// timeout on this test to give us plenty of time to finish up but still be
// well under the 2m kill_timeout.
newAllocs := waitForAllocDrain(t, nomadClient, jobID,
oldAllocID, oldNodeID, time.Second*30)
must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))
}

// registerAndWaitForRunning registers a job and waits for the expected number
// of allocations to be in a running state. Returns the allocations.
func registerAndWaitForRunning(t *testing.T, nomadClient *api.Client, jobID, jobSpec string, expectedCount int) []*api.AllocationListStub {
@@ -211,7 +281,7 @@ func registerAndWaitForRunning(t *testing.T, nomadClient *api.Client, jobID, job
// migrating:
// - the old alloc should be stopped
// - the new alloc should be running
func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID, oldNodeID string) []*api.AllocationListStub {
func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID, oldNodeID string, deadline time.Duration) []*api.AllocationListStub {

t.Helper()
newAllocs := set.From([]*api.AllocationListStub{})
@@ -243,7 +313,7 @@ func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID,
oldNodeID[:8], time.Now().Sub(start))
return nil
}),
wait.Timeout(120*time.Second),
wait.Timeout(deadline),
wait.Gap(500*time.Millisecond),
))

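The drain-wait helper's timeout, previously hard-coded at 120 seconds, is now a parameter threaded into wait.Timeout, which lets the new deadline and force tests use a tighter 30-second bound. Below is a minimal, self-contained sketch of that polling pattern, assuming the upstream import paths github.com/shoenig/test/must and github.com/shoenig/test/wait for the helpers this file already uses; the condition being polled is a stand-in, not the real drain check.

package nodedrain

import (
	"fmt"
	"testing"
	"time"

	"github.com/shoenig/test/must"
	"github.com/shoenig/test/wait"
)

// waitFor polls check every 500ms until it returns nil, failing the test
// if the caller-supplied deadline elapses first.
func waitFor(t *testing.T, check func() error, deadline time.Duration) {
	t.Helper()
	must.Wait(t, wait.InitialSuccess(
		wait.ErrorFunc(check),
		wait.Timeout(deadline), // 30s for the deadline/force tests, 120s elsewhere
		wait.Gap(500*time.Millisecond),
	))
}

func TestWaitForExample(t *testing.T) {
	start := time.Now()
	waitFor(t, func() error {
		if time.Since(start) < time.Second {
			return fmt.Errorf("not ready yet")
		}
		return nil
	}, 30*time.Second)
}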
e2e/nodedrain/nodedrain.go: 183 changes (0 additions, 183 deletions)

This file was deleted.

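The new tests also lean on t.Cleanup with helpers (cleanupJobState, cleanupDrainState) whose bodies are not part of this diff. As an illustration only, and not the actual implementation, such a helper plausibly returns the closure that t.Cleanup registers, reusing the same e2eutil.Command wrapper the tests already call:

package nodedrain

import (
	"testing"

	"github.com/hashicorp/nomad/e2e/e2eutil"
)

// cleanupJobStateSketch is a hypothetical stand-in for cleanupJobState: it
// returns a closure for t.Cleanup that stops and purges the test job.
// Failures are logged rather than fatal, because cleanup runs after the
// test verdict has already been decided.
func cleanupJobStateSketch(t *testing.T, jobID string) func() {
	return func() {
		out, err := e2eutil.Command("nomad", "job", "stop", "-purge", jobID)
		if err != nil {
			t.Logf("could not stop job %q: %v\n%v", jobID, err, out)
		}
	}
}

cleanupDrainState presumably follows the same pattern, turning the drain back off (for example with nomad node drain -disable -yes on the affected node) so later tests start from an undrained cluster.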