E2E: update subset of node drain tests off the old framework (#16823)
While working on several open drain issues, I'm fixing up the E2E tests. The tests refactored in this subset are existing ones that already work. I'm shipping them as their own PR to keep review sizes manageable when I push up PRs in the next few days for #9902, #12314, and #12915.
tgross committed Apr 12, 2023
1 parent 20c11f3 commit c9d1fc0
Showing 5 changed files with 212 additions and 108 deletions.
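For readers unfamiliar with the new layout, the refactor replaces the old framework.F methods and WaitConfig polling with standard Go subtests plus the shoenig/test must and wait helpers. Below is a minimal, self-contained sketch of that shape, condensed from the node_drain_test.go added in this diff; the test name TestNodeDrainExample, the helper waitForAnyAlloc, and the job ID are illustrative placeholders, not part of this commit.

package nodedrain

import (
	"fmt"
	"testing"
	"time"

	"github.com/shoenig/test/must"
	"github.com/shoenig/test/wait"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/e2e/e2eutil"
)

// TestNodeDrainExample shows the new-framework shape: one standard Go test
// entry point, shared cluster-readiness checks, and wait/must polling in
// place of the old framework.F assertions. (Illustrative only; the real
// tests follow in node_drain_test.go below.)
func TestNodeDrainExample(t *testing.T) {
	nomadClient := e2eutil.NomadClient(t)
	e2eutil.WaitForLeader(t, nomadClient)
	e2eutil.WaitForNodesReady(t, nomadClient, 2)

	t.Run("WaitsForAllocs", func(t *testing.T) {
		waitForAnyAlloc(t, nomadClient, "example-job-id") // hypothetical job ID
	})
}

// waitForAnyAlloc polls until the job has at least one allocation, using the
// same wait.InitialSuccess idiom the refactored tests rely on.
func waitForAnyAlloc(t *testing.T, nomadClient *api.Client, jobID string) {
	t.Helper()
	must.Wait(t, wait.InitialSuccess(
		wait.ErrorFunc(func() error {
			allocs, _, err := nomadClient.Jobs().Allocations(jobID, false, nil)
			if err != nil {
				return fmt.Errorf("could not list allocs: %w", err)
			}
			if len(allocs) == 0 {
				return fmt.Errorf("no allocs for job %q yet", jobID)
			}
			return nil
		}),
		wait.Timeout(60*time.Second),
		wait.Gap(500*time.Millisecond),
	))
}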
4 changes: 4 additions & 0 deletions e2e/nodedrain/doc.go
@@ -0,0 +1,4 @@
package nodedrain

// This package contains only tests, so this is a placeholder file to
// make sure builds don't fail with "no non-test Go files in" errors
1 change: 0 additions & 1 deletion e2e/nodedrain/input/drain_ignore_system.nomad
@@ -1,5 +1,4 @@
job "drain_ignore_system_service" {
datacenters = ["dc1", "dc2"]

type = "system"

1 change: 0 additions & 1 deletion e2e/nodedrain/input/drain_simple.nomad
@@ -1,5 +1,4 @@
job "drain_simple" {
datacenters = ["dc1", "dc2"]

constraint {
attribute = "${attr.kernel.name}"
208 changes: 208 additions & 0 deletions e2e/nodedrain/node_drain_test.go
@@ -0,0 +1,208 @@
package nodedrain

import (
"fmt"
"os"
"testing"
"time"

"github.com/hashicorp/go-set"
"github.com/shoenig/test"
"github.com/shoenig/test/must"
"github.com/shoenig/test/wait"

"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/structs"
)

func TestNodeDrain(t *testing.T) {

nomadClient := e2eutil.NomadClient(t)
e2eutil.WaitForLeader(t, nomadClient)
e2eutil.WaitForNodesReady(t, nomadClient, 2) // needs at least 2 to test migration

t.Run("IgnoreSystem", testIgnoreSystem)
t.Run("KeepIneligible", testKeepIneligible)
}

// testIgnoreSystem tests that system jobs are left behind when the
// -ignore-system flag is used.
func testIgnoreSystem(t *testing.T) {

t.Cleanup(cleanupDrainState(t))
nomadClient := e2eutil.NomadClient(t)

// Figure out how many system allocs we'll expect to see
nodes, err := e2eutil.NodeStatusListFiltered(
func(section string) bool {
kernelName, err := e2eutil.GetField(section, "kernel.name")
return err == nil && kernelName == "linux"
})
must.NoError(t, err, must.Sprint("could not get node status listing"))
count := len(nodes)

// Run a system job, which will not be moved when we drain the node
systemJobID := "test-node-drain-system-" + uuid.Short()
t.Cleanup(cleanupJobState(t, systemJobID))
registerAndWaitForRunning(t, nomadClient, systemJobID, "./input/drain_ignore_system.nomad", count)

// Also run a service job so we can verify when the drain is done
serviceJobID := "test-node-drain-service-" + uuid.Short()
t.Cleanup(cleanupJobState(t, serviceJobID))
serviceAllocs := registerAndWaitForRunning(t, nomadClient, serviceJobID, "./input/drain_simple.nomad", 1)
oldAllocID := serviceAllocs[0].ID
oldNodeID := serviceAllocs[0].NodeID

// Drain the node with -ignore-system
out, err := e2eutil.Command(
"nomad", "node", "drain",
"-ignore-system", "-enable", "-yes", "-detach", oldNodeID)
must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))

// The service job should be drained
newAllocs := waitForAllocDrain(t, nomadClient, serviceJobID, oldAllocID, oldNodeID)
must.Len(t, 1, newAllocs, must.Sprint("expected 1 new service job alloc"))

// The system job should not have been drained
got, err := e2eutil.AllocsForJob(systemJobID, structs.DefaultNamespace)
must.NoError(t, err, must.Sprintf("could not read allocs for system job: %v", got))
must.Len(t, count, got, must.Sprintf("expected %d system allocs", count))

for _, systemAlloc := range got {
must.Eq(t, "running", systemAlloc["Status"],
must.Sprint("expected all system allocs to be left client=running"))
must.Eq(t, "run", systemAlloc["Desired"],
must.Sprint("expected all system allocs to be left desired=run"))
}
}

// testKeepIneligible tests that nodes can be kept ineligible for scheduling after
// disabling drain.
func testKeepIneligible(t *testing.T) {

nodes, err := e2eutil.NodeStatusList()
must.NoError(t, err, must.Sprint("expected no error when listing nodes"))

nodeID := nodes[0]["ID"]

t.Cleanup(cleanupDrainState(t))

out, err := e2eutil.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
must.NoError(t, err, must.Sprintf("expected no error when marking node for drain: %v", out))

out, err = e2eutil.Command(
"nomad", "node", "drain",
"-disable", "-keep-ineligible", "-yes", nodeID)
must.NoError(t, err, must.Sprintf("expected no error when disabling drain for node: %v", out))

nodes, err = e2eutil.NodeStatusList()
must.NoError(t, err, must.Sprint("expected no error when listing nodes"))

for _, node := range nodes {
if node["ID"] == nodeID {
must.Eq(t, "ineligible", node["Eligibility"])
must.Eq(t, "false", node["Drain"])
}
}
}

// registerAndWaitForRunning registers a job and waits for the expected number
// of allocations to be in a running state. Returns the allocations.
func registerAndWaitForRunning(t *testing.T, nomadClient *api.Client, jobID, jobSpec string, expectedCount int) []*api.AllocationListStub {
t.Helper()

var allocs []*api.AllocationListStub
var err error
must.NoError(t, e2eutil.Register(jobID, jobSpec))

must.Wait(t, wait.InitialSuccess(
wait.ErrorFunc(func() error {
allocs, _, err = nomadClient.Jobs().Allocations(jobID, false, nil)
if err != nil {
return fmt.Errorf("expected no error listing allocs: %v", err)
}
if len(allocs) != expectedCount {
return fmt.Errorf("expected %d allocs but found %d", expectedCount, len(allocs))
}
for _, alloc := range allocs {
if alloc.ClientStatus != structs.AllocClientStatusRunning {
return fmt.Errorf("alloc %q was %q, not running", alloc.ID, alloc.ClientStatus)
}
}
return nil
}),
wait.Timeout(60*time.Second),
wait.Gap(500*time.Millisecond),
))
return allocs
}

// waitForAllocDrain polls the allocation statuses for a job until we've finished
// migrating:
// - the old alloc should be stopped
// - the new alloc should be running
func waitForAllocDrain(t *testing.T, nomadClient *api.Client, jobID, oldAllocID, oldNodeID string) []*api.AllocationListStub {

t.Helper()
newAllocs := set.From([]*api.AllocationListStub{})
start := time.Now()

must.Wait(t, wait.InitialSuccess(
wait.ErrorFunc(func() error {
allocs, _, err := nomadClient.Jobs().Allocations(jobID, false, nil)
if err != nil {
return fmt.Errorf("could not read allocations for node: %w", err)
}
if len(allocs) == 1 {
// only the original alloc is present; the replacement hasn't started yet
return fmt.Errorf("no new alloc started")
}

for _, alloc := range allocs {
if alloc.ID == oldAllocID {
if alloc.ClientStatus != structs.AllocClientStatusComplete {
return fmt.Errorf("old alloc was not marked complete")
}
} else {
if alloc.ClientStatus != structs.AllocClientStatusRunning {
return fmt.Errorf("new alloc was not marked running")
}
newAllocs.Insert(alloc)
}
}
t.Logf("alloc has drained from node=%s after %v",
oldNodeID, time.Now().Sub(start))
return nil
}),
wait.Timeout(120*time.Second),
wait.Gap(500*time.Millisecond),
))

return newAllocs.Slice()
}

func cleanupJobState(t *testing.T, jobID string) func() {
return func() {
_, err := e2eutil.Command("nomad", "job", "stop", "-purge", jobID)
test.NoError(t, err)
}
}

func cleanupDrainState(t *testing.T) func() {
return func() {
if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
return
}

nomadClient := e2eutil.NomadClient(t)
nodes, _, err := nomadClient.Nodes().List(nil)
must.NoError(t, err, must.Sprint("expected no error when listing nodes"))
for _, node := range nodes {
_, err := e2eutil.Command("nomad", "node", "drain", "-disable", "-yes", node.ID)
test.NoError(t, err)
_, err = e2eutil.Command("nomad", "node", "eligibility", "-enable", node.ID)
test.NoError(t, err)
}
}
}
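With the scenarios now hanging off a standard Go test entry point, this group can be run on its own against a running test cluster (assuming the usual E2E environment configuration, such as NOMAD_ADDR, is in place), for example:

go test -v ./e2e/nodedrain -run 'TestNodeDrain/IgnoreSystem'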
106 changes: 0 additions & 106 deletions e2e/nodedrain/nodedrain.go
@@ -162,87 +162,6 @@ func (tc *NodeDrainE2ETest) TestNodeDrainEphemeralMigrate(f *framework.F) {
f.Equal(oldAllocID, strings.TrimSpace(got), "node drained but migration failed")
}

// TestNodeDrainIgnoreSystem tests that system jobs are left behind when the
// -ignore-system flag is used.
func (tc *NodeDrainE2ETest) TestNodeDrainIgnoreSystem(f *framework.F) {

nodes, err := e2e.NodeStatusListFiltered(
func(section string) bool {
kernelName, err := e2e.GetField(section, "kernel.name")
return err == nil && kernelName == "linux"
})
f.NoError(err, "could not get node status listing")

serviceJobID := "test-node-drain-service-" + uuid.Generate()[0:8]
systemJobID := "test-node-drain-system-" + uuid.Generate()[0:8]

f.NoError(e2e.Register(serviceJobID, "nodedrain/input/drain_simple.nomad"))
tc.jobIDs = append(tc.jobIDs, serviceJobID)

f.NoError(e2e.WaitForAllocStatusExpected(serviceJobID, ns, []string{"running"}))

allocs, err := e2e.AllocsForJob(serviceJobID, ns)
f.NoError(err, "could not get allocs for service job")
f.Len(allocs, 1, "could not get allocs for service job")
oldAllocID := allocs[0]["ID"]

f.NoError(e2e.Register(systemJobID, "nodedrain/input/drain_ignore_system.nomad"))
tc.jobIDs = append(tc.jobIDs, systemJobID)

expected := []string{"running"}
f.NoError(e2e.WaitForAllocStatusExpected(serviceJobID, ns, expected),
"service job should be running")

// can't just give it a static list because the number of nodes can vary
f.NoError(
e2e.WaitForAllocStatusComparison(
func() ([]string, error) { return e2e.AllocStatuses(systemJobID, ns) },
func(got []string) bool {
if len(got) != len(nodes) {
return false
}
for _, status := range got {
if status != "running" {
return false
}
}
return true
}, nil,
),
"system job should be running on every node",
)

jobNodes, err := nodesForJob(serviceJobID)
f.NoError(err, "could not get nodes for job")
f.Len(jobNodes, 1, "could not get nodes for job")
nodeID := jobNodes[0]

out, err := e2e.Command(
"nomad", "node", "drain",
"-ignore-system", "-enable", "-yes", "-detach", nodeID)
f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
tc.nodeIDs = append(tc.nodeIDs, nodeID)

f.NoError(waitForNodeDrain(nodeID,
func(got []map[string]string) bool {
for _, alloc := range got {
if alloc["ID"] == oldAllocID && alloc["Status"] == "complete" {
return true
}
}
return false
}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 500},
), "node did not drain")

allocs, err = e2e.AllocsForJob(systemJobID, ns)
f.NoError(err, "could not query allocs for system job")
f.Equal(len(nodes), len(allocs), "system job should still be running on every node")
for _, alloc := range allocs {
f.Equal("run", alloc["Desired"], "no system allocs should be draining")
f.Equal("running", alloc["Status"], "no system allocs should be draining")
}
}

// TestNodeDrainDeadline tests the enforcement of the node drain deadline so
// that allocations are terminated even if they haven't gracefully exited.
func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) {
@@ -327,28 +246,3 @@ func (tc *NodeDrainE2ETest) TestNodeDrainForce(f *framework.F) {
), "node did not drain immediately when forced")

}

// TestNodeDrainKeepIneligible tests that nodes can be kept ineligible for
// scheduling after disabling drain.
func (tc *NodeDrainE2ETest) TestNodeDrainKeepIneligible(f *framework.F) {

nodes, err := e2e.NodeStatusList()
f.NoError(err, "could not get node status listing")

nodeID := nodes[0]["ID"]

out, err := e2e.Command("nomad", "node", "drain", "-enable", "-yes", "-detach", nodeID)
f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
tc.nodeIDs = append(tc.nodeIDs, nodeID)

_, err = e2e.Command(
"nomad", "node", "drain",
"-disable", "-keep-ineligible", "-yes", nodeID)
f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))

nodes, err = e2e.NodeStatusList()
f.NoError(err, "could not get updated node status listing")

f.Equal("ineligible", nodes[0]["Eligibility"])
f.Equal("false", nodes[0]["Drain"])
}
