diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go
index b8d62309be68..674920ced047 100644
--- a/e2e/e2e_test.go
+++ b/e2e/e2e_test.go
@@ -24,7 +24,6 @@ import (
 	_ "github.com/hashicorp/nomad/e2e/lifecycle"
 	_ "github.com/hashicorp/nomad/e2e/metrics"
 	_ "github.com/hashicorp/nomad/e2e/networking"
-	_ "github.com/hashicorp/nomad/e2e/nodedrain"
 	_ "github.com/hashicorp/nomad/e2e/nomadexec"
 	_ "github.com/hashicorp/nomad/e2e/oversubscription"
 	_ "github.com/hashicorp/nomad/e2e/parameterized"
@@ -45,6 +44,7 @@ import (
 	// we get a quick check that they compile on every commit
 	_ "github.com/hashicorp/nomad/e2e/disconnectedclients"
 	_ "github.com/hashicorp/nomad/e2e/namespaces"
+	_ "github.com/hashicorp/nomad/e2e/nodedrain"
 	_ "github.com/hashicorp/nomad/e2e/volumes"
 )
 
diff --git a/e2e/nodedrain/input/drain_deadline.nomad b/e2e/nodedrain/input/drain_deadline.nomad
index b04aba70cfd5..17ea4efe1bcb 100644
--- a/e2e/nodedrain/input/drain_deadline.nomad
+++ b/e2e/nodedrain/input/drain_deadline.nomad
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: MPL-2.0
 
 job "drain_deadline" {
-  datacenters = ["dc1", "dc2"]
 
   constraint {
     attribute = "${attr.kernel.name}"
diff --git a/e2e/nodedrain/node_drain_test.go b/e2e/nodedrain/node_drain_test.go
index fbbc1f6a9f9f..27c56be1279f 100644
--- a/e2e/nodedrain/node_drain_test.go
+++ b/e2e/nodedrain/node_drain_test.go
@@ -30,6 +30,8 @@ func TestNodeDrain(t *testing.T) {
 	t.Run("IgnoreSystem", testIgnoreSystem)
 	t.Run("EphemeralMigrate", testEphemeralMigrate)
 	t.Run("KeepIneligible", testKeepIneligible)
+	t.Run("DeadlineFlag", testDeadlineFlag)
+	t.Run("ForceFlag", testForceFlag)
 }
 
 // testIgnoreSystem tests that system jobs are left behind when the
@@ -67,7 +69,8 @@ func testIgnoreSystem(t *testing.T) {
 	must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))
 
 	// The service job should be drained
-	newAllocs := waitForAllocDrain(t, nomadClient, serviceJobID, oldAllocID, oldNodeID)
+	newAllocs := waitForAllocDrain(t, nomadClient, serviceJobID,
+		oldAllocID, oldNodeID, time.Second*120)
 	must.Len(t, 1, newAllocs, must.Sprint("expected 1 new service job alloc"))
 
 	// The system job should not have been drained
@@ -117,7 +120,8 @@ func testEphemeralMigrate(t *testing.T) {
 	out, err := e2eutil.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", oldNodeID)
 	must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))
 
-	newAllocs := waitForAllocDrain(t, nomadClient, jobID, oldAllocID, oldNodeID)
+	newAllocs := waitForAllocDrain(t, nomadClient, jobID,
+		oldAllocID, oldNodeID, time.Second*120)
 	must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))
 	newAllocID := newAllocs[0].ID
 	newNodeID := newAllocs[0].NodeID
@@ -176,6 +180,72 @@ func testKeepIneligible(t *testing.T) {
 	}
 }
 
+// testDeadlineFlag tests the enforcement of the node drain deadline so
+// that allocations are terminated even if they haven't gracefully exited.
+func testDeadlineFlag(t *testing.T) {
+
+	nomadClient := e2eutil.NomadClient(t)
+	t.Cleanup(cleanupDrainState(t))
+
+	jobID := "test-node-drain-" + uuid.Generate()[0:8]
+
+	allocs := registerAndWaitForRunning(t, nomadClient, jobID, "./input/drain_deadline.nomad", 1)
+	t.Cleanup(cleanupJobState(t, jobID))
+	oldAllocID := allocs[0].ID
+	oldNodeID := allocs[0].NodeID
+
+	t.Logf("draining node %v", oldNodeID)
+	out, err := e2eutil.Command(
+		"nomad", "node", "drain",
+		"-deadline", "5s",
+		"-enable", "-yes", "-detach", oldNodeID)
+	must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID, err, out))
+
+	// the job's kill_timeout is 2m, but our drain deadline is 5s, so we expect
+	// the allocation to be force-killed after 5s. But we can't guarantee that
+	// it's instantly terminated at 5s because we have to wait for the client's
+	// Client.GetAllocs and Node.UpdateAlloc calls to be made. So set a 30s
+	// timeout on this test to give us plenty of time to finish up but still be
+	// well under the 2m kill_timeout.
+	newAllocs := waitForAllocDrain(t, nomadClient, jobID,
+		oldAllocID, oldNodeID, time.Second*30)
+	must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))
+}
+
+// testForceFlag tests the enforcement of the node drain -force flag so that
+// allocations are terminated immediately.
+func testForceFlag(t *testing.T) {
+
+	nomadClient := e2eutil.NomadClient(t)
+	t.Cleanup(cleanupDrainState(t))
+
+	jobID := "test-node-drain-" + uuid.Generate()[0:8]
+	must.NoError(t, e2eutil.Register(jobID, "./input/drain_deadline.nomad"))
+	t.Cleanup(cleanupJobState(t, jobID))
+
+	allocs := registerAndWaitForRunning(t, nomadClient, jobID, "./input/drain_deadline.nomad", 1)
+	t.Cleanup(cleanupJobState(t, jobID))
+	oldAllocID := allocs[0].ID
+	oldNodeID := allocs[0].NodeID
+
+	t.Logf("draining node %v", oldNodeID)
+	out, err := e2eutil.Command(
+		"nomad", "node", "drain",
+		"-force",
+		"-enable", "-yes", "-detach", oldNodeID)
+	must.NoError(t, err, must.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
+
+	// the job's kill_timeout is 2m, but we've passed -force, so we expect
+	// the allocation to be immediately force-killed. But we can't guarantee that
+	// it's instantly terminated because we have to wait for the client's
+	// Client.GetAllocs and Node.UpdateAlloc calls to be made. So set a 30s
+	// timeout on this test to give us plenty of time to finish up but still be
+	// well under the 2m kill_timeout.
+	newAllocs := waitForAllocDrain(t, nomadClient, jobID,
+		oldAllocID, oldNodeID, time.Second*30)
+	must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))
+}
+
 // registerAndWaitForRunning registers a job and waits for the expected number
 // of allocations to be in a running state. Returns the allocations.
 func registerAndWaitForRunning(t *testing.T, nomadClient *api.Client, jobID, jobSpec string, expectedCount int) []*api.AllocationListStub {
@@ -211,7 +281,7 @@ func registerAndWaitForRunning(t *testing.T, nomadClient *api.Client, jobID, job
 // migrating:
 // - the old alloc should be stopped
 // - the new alloc should be running
-func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID, oldNodeID string) []*api.AllocationListStub {
+func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID, oldNodeID string, deadline time.Duration) []*api.AllocationListStub {
 	t.Helper()
 
 	newAllocs := set.From([]*api.AllocationListStub{})
@@ -243,7 +313,7 @@ func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID,
 				oldNodeID[:8], time.Now().Sub(start))
 			return nil
 		}),
-		wait.Timeout(120*time.Second),
+		wait.Timeout(deadline),
 		wait.Gap(500*time.Millisecond),
 	))
 
diff --git a/e2e/nodedrain/nodedrain.go b/e2e/nodedrain/nodedrain.go
deleted file mode 100644
index d7c01d867bc9..000000000000
--- a/e2e/nodedrain/nodedrain.go
+++ /dev/null
@@ -1,183 +0,0 @@
-// Copyright (c) HashiCorp, Inc.
-// SPDX-License-Identifier: MPL-2.0
-
-package nodedrain
-
-import (
-	"fmt"
-	"os"
-	"time"
-
-	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
-	"github.com/hashicorp/nomad/e2e/framework"
-	"github.com/hashicorp/nomad/helper/uuid"
-	"github.com/hashicorp/nomad/testutil"
-)
-
-const ns = ""
-
-type NodeDrainE2ETest struct {
-	framework.TC
-	jobIDs  []string
-	nodeIDs []string
-}
-
-func init() {
-	framework.AddSuites(&framework.TestSuite{
-		Component:   "NodeDrain",
-		CanRunLocal: true,
-		Consul:      true,
-		Cases: []framework.TestCase{
-			new(NodeDrainE2ETest),
-		},
-	})
-
-}
-
-func (tc *NodeDrainE2ETest) BeforeAll(f *framework.F) {
-	e2e.WaitForLeader(f.T(), tc.Nomad())
-	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 2) // needs at least 2 to test migration
-}
-
-func (tc *NodeDrainE2ETest) AfterEach(f *framework.F) {
-	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
-		return
-	}
-
-	for _, id := range tc.jobIDs {
-		_, err := e2e.Command("nomad", "job", "stop", "-purge", id)
-		f.Assert().NoError(err)
-	}
-	tc.jobIDs = []string{}
-
-	for _, id := range tc.nodeIDs {
-		_, err := e2e.Command("nomad", "node", "drain", "-disable", "-yes", id)
-		f.Assert().NoError(err)
-		_, err = e2e.Command("nomad", "node", "eligibility", "-enable", id)
-		f.Assert().NoError(err)
-	}
-	tc.nodeIDs = []string{}
-
-	_, err := e2e.Command("nomad", "system", "gc")
-	f.Assert().NoError(err)
-}
-
-func nodesForJob(jobID string) ([]string, error) {
-	allocs, err := e2e.AllocsForJob(jobID, ns)
-	if err != nil {
-		return nil, err
-	}
-	if len(allocs) < 1 {
-		return nil, fmt.Errorf("no allocs found for job: %v", jobID)
-	}
-	nodes := []string{}
-	for _, alloc := range allocs {
-		nodes = append(nodes, alloc["Node ID"])
-	}
-	return nodes, nil
-}
-
-// waitForNodeDrain is a convenience wrapper that polls 'node status'
-// until the comparison function over the state of the job's allocs on that
-// node returns true
-func waitForNodeDrain(nodeID string, comparison func([]map[string]string) bool, wc *e2e.WaitConfig) error {
-	var got []map[string]string
-	var err error
-	interval, retries := wc.OrDefault()
-	testutil.WaitForResultRetries(retries, func() (bool, error) {
-		time.Sleep(interval)
-		got, err = e2e.AllocsForNode(nodeID)
-		if err != nil {
-			return false, err
-		}
-		return comparison(got), nil
-	}, func(e error) {
-		err = fmt.Errorf("node drain status check failed: %v\n%#v", e, got)
-	})
-	return err
-}
-
-// TestNodeDrainDeadline tests the enforcement of the node drain deadline so
-// that allocations are terminated even if they haven't gracefully exited.
-func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) {
-	f.T().Skip("The behavior is unclear and test assertions don't capture intent. Issue 9902")
-
-	jobID := "test-node-drain-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad"))
-	tc.jobIDs = append(tc.jobIDs, jobID)
-
-	expected := []string{"running"}
-	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")
-
-	nodes, err := nodesForJob(jobID)
-	f.NoError(err, "could not get nodes for job")
-	f.Len(nodes, 1, "could not get nodes for job")
-	nodeID := nodes[0]
-
-	f.T().Logf("draining node %v", nodeID)
-	out, err := e2e.Command(
-		"nomad", "node", "drain",
-		"-deadline", "5s",
-		"-enable", "-yes", "-detach", nodeID)
-	f.NoError(err, fmt.Sprintf("'nomad node drain %v' failed: %v\n%v", nodeID, err, out))
-	tc.nodeIDs = append(tc.nodeIDs, nodeID)
-
-	// the deadline is 40s but we can't guarantee its instantly terminated at
-	// that point, so we give it 30s which is well under the 2m kill_timeout in
-	// the job.
-	// deadline here needs to account for scheduling and propagation delays.
-	f.NoError(waitForNodeDrain(nodeID,
-		func(got []map[string]string) bool {
-			// FIXME: check the drain job alloc specifically. test
-			// may pass if client had another completed alloc
-			for _, alloc := range got {
-				if alloc["Status"] == "complete" {
-					return true
-				}
-			}
-			return false
-		}, &e2e.WaitConfig{Interval: time.Second, Retries: 40},
-	), "node did not drain immediately following deadline")
-}
-
-// TestNodeDrainForce tests the enforcement of the node drain -force flag so
-// that allocations are terminated immediately.
-func (tc *NodeDrainE2ETest) TestNodeDrainForce(f *framework.F) {
-	f.T().Skip("The behavior is unclear and test assertions don't capture intent. Issue 9902")
-
-	jobID := "test-node-drain-" + uuid.Generate()[0:8]
-	f.NoError(e2e.Register(jobID, "nodedrain/input/drain_deadline.nomad"))
-	tc.jobIDs = append(tc.jobIDs, jobID)
-
-	expected := []string{"running"}
-	f.NoError(e2e.WaitForAllocStatusExpected(jobID, ns, expected), "job should be running")
-
-	nodes, err := nodesForJob(jobID)
-	f.NoError(err, "could not get nodes for job")
-	f.Len(nodes, 1, "could not get nodes for job")
-	nodeID := nodes[0]
-
-	out, err := e2e.Command(
-		"nomad", "node", "drain",
-		"-force",
-		"-enable", "-yes", "-detach", nodeID)
-	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
-	tc.nodeIDs = append(tc.nodeIDs, nodeID)
-
-	// we've passed -force but we can't guarantee its instantly terminated at
-	// that point, so we give it 30s which is under the 2m kill_timeout in
-	// the job
-	f.NoError(waitForNodeDrain(nodeID,
-		func(got []map[string]string) bool {
-			// FIXME: check the drain job alloc specifically. test
-			// may pass if client had another completed alloc
-			for _, alloc := range got {
-				if alloc["Status"] == "complete" {
-					return true
-				}
-			}
-			return false
-		}, &e2e.WaitConfig{Interval: time.Second, Retries: 40},
-	), "node did not drain immediately when forced")
-
-}