E2E: extend CSI test to cover create and snapshot workflows
Split the EBS and EFS tests out into their own test cases so that we can do
cleanup differently when we have controller plugins.
tgross committed Apr 8, 2021
1 parent 83a559f commit d24f886
Showing 9 changed files with 429 additions and 308 deletions.
316 changes: 46 additions & 270 deletions e2e/csi/csi.go
@@ -13,29 +13,20 @@ import (
"strings"
"time"

"github.com/stretchr/testify/require"

"github.com/hashicorp/nomad/api"
e2e "github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/e2e/framework"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/testutil"
)

type CSIVolumesTest struct {
framework.TC
testJobIDs []string
volumeIDs []string
pluginJobIDs []string
}

func init() {
framework.AddSuites(&framework.TestSuite{
Component: "CSI",
CanRunLocal: true,
Consul: false,
Cases: []framework.TestCase{
new(CSIVolumesTest),
new(CSIControllerPluginEBSTest), // see ebs.go
new(CSINodeOnlyPluginEFSTest), // see efs.go
},
})
}
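
The two cases registered above are defined in ebs.go and efs.go, which are not part of this file's diff. A minimal sketch of the shape such a case might take, assuming it keeps the same job/volume bookkeeping the old combined CSIVolumesTest used and reuses this file's e2e/framework imports; the field names and cleanup body are illustrative, not copied from the new files:

// Hypothetical sketch only: the real definition lives in ebs.go.
type CSIControllerPluginEBSTest struct {
	framework.TC
	testJobIDs   []string
	volumeIDs    []string
	pluginJobIDs []string
}

// Splitting the cases lets cleanup differ per plugin: an EBS case with a
// controller plugin can delete volumes it created, while a node-only EFS
// case only deregisters them.
func (tc *CSIControllerPluginEBSTest) AfterEach(f *framework.F) {
	// stop test jobs first so their volume claims are released...
	for _, id := range tc.testJobIDs {
		out, err := e2e.Command("nomad", "job", "stop", "-purge", id)
		assertNoErrorElseDump(f, err, out, tc.pluginJobIDs)
	}
	// ...then deregister (or delete) volumes and purge the plugin jobs.
}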
@@ -45,269 +36,27 @@ const ns = ""
var pluginWait = &e2e.WaitConfig{Interval: 5 * time.Second, Retries: 36} // 3min
var reapWait = &e2e.WaitConfig{Interval: 5 * time.Second, Retries: 36} // 3min

func (tc *CSIVolumesTest) BeforeAll(f *framework.F) {
t := f.T()

_, err := os.Stat("csi/input/volume-ebs.hcl")
if err != nil {
t.Skip("skipping CSI test because EBS volume spec file missing:", err)
}

_, err = os.Stat("csi/input/volume-efs.hcl")
// assertNoErrorElseDump calls a non-halting assert on the error and dumps the
// plugin logs if it fails.
func assertNoErrorElseDump(f *framework.F, err error, msg string, pluginJobIDs []string) {
if err != nil {
t.Skip("skipping CSI test because EFS volume spec file missing:", err)
dumpLogs(pluginJobIDs)
f.Assert().NoError(err, fmt.Sprintf("%v: %v", msg, err))
}

// Ensure cluster has leader and at least two client
// nodes in a ready state before running tests
e2e.WaitForLeader(t, tc.Nomad())
e2e.WaitForNodesReady(t, tc.Nomad(), 2)
}

// TestEBSVolumeClaim launches AWS EBS plugins and registers an EBS volume
// as a Nomad CSI volume. We then deploy a job that writes to the volume,
// stop that job, and reuse the volume for another job which should be able
// to read the data written by the first job.
func (tc *CSIVolumesTest) TestEBSVolumeClaim(f *framework.F) {
t := f.T()
require := require.New(t)
nomadClient := tc.Nomad()
uuid := uuid.Generate()
pluginID := "aws-ebs0"

// deploy the controller plugin job
controllerJobID := "aws-ebs-plugin-controller-" + uuid[0:8]
f.NoError(e2e.Register(controllerJobID, "csi/input/plugin-aws-ebs-controller.nomad"))
tc.pluginJobIDs = append(tc.pluginJobIDs, controllerJobID)
expected := []string{"running", "running"}
f.NoError(
e2e.WaitForAllocStatusExpected(controllerJobID, ns, expected),
"job should be running")

// deploy the node plugins job
nodesJobID := "aws-ebs-plugin-nodes-" + uuid[0:8]
f.NoError(e2e.Register(nodesJobID, "csi/input/plugin-aws-ebs-nodes.nomad"))
tc.pluginJobIDs = append(tc.pluginJobIDs, nodesJobID)

f.NoError(e2e.WaitForAllocStatusComparison(
func() ([]string, error) { return e2e.AllocStatuses(nodesJobID, ns) },
func(got []string) bool {
for _, status := range got {
if status != "running" {
return false
}
}
return true
}, nil,
))

f.NoError(waitForPluginStatusControllerCount(pluginID, 2, pluginWait),
"aws-ebs0 controller plugins did not become healthy")
f.NoError(waitForPluginStatusMinNodeCount(pluginID, 2, pluginWait),
"aws-ebs0 node plugins did not become healthy")

// register a volume
volID := "ebs-vol[0]"
err := volumeRegister(volID, "csi/input/volume-ebs.hcl")
require.NoError(err)
tc.volumeIDs = append(tc.volumeIDs, volID)

// deploy a job that writes to the volume
writeJobID := "write-ebs-" + uuid[0:8]
f.NoError(e2e.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
f.NoError(
e2e.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
"job should be running")

allocs, err := e2e.AllocsForJob(writeJobID, ns)
f.NoError(err, "could not get allocs for write job")
f.Len(allocs, 1, "could not get allocs for write job")
writeAllocID := allocs[0]["ID"]

// read data from volume and assert the writer wrote a file to it
expectedPath := "/task/test/" + writeAllocID
_, err = readFile(nomadClient, writeAllocID, expectedPath)
require.NoError(err)

// Shutdown (and purge) the writer so we can run a reader.
// we could mount the EBS volume with multi-attach, but we
// want this test to exercise the unpublish workflow.
_, err = e2e.Command("nomad", "job", "stop", "-purge", writeJobID)
require.NoError(err)

// wait for the volume unpublish workflow to complete
require.NoError(waitForVolumeClaimRelease(volID, reapWait),
"write-ebs alloc claim was not released")

// deploy a job so we can read from the volume
readJobID := "read-ebs-" + uuid[0:8]
tc.testJobIDs = append(tc.testJobIDs, readJobID) // ensure failed tests clean up
f.NoError(e2e.Register(readJobID, "csi/input/use-ebs-volume.nomad"))
f.NoError(
e2e.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}),
"job should be running")

allocs, err = e2e.AllocsForJob(readJobID, ns)
f.NoError(err, "could not get allocs for read job")
f.Len(allocs, 1, "could not get allocs for read job")
readAllocID := allocs[0]["ID"]

// read data from volume and assert we can read the file the writer wrote
expectedPath = "/task/test/" + readAllocID
_, err = readFile(nomadClient, readAllocID, expectedPath)
require.NoError(err)

}

// TestEFSVolumeClaim launches AWS EFS plugins and registers an EFS volume
// as a Nomad CSI volume. We then deploy a job that writes to the volume,
// and share the volume with another job which should be able to read the
// data written by the first job.
func (tc *CSIVolumesTest) TestEFSVolumeClaim(f *framework.F) {
t := f.T()
require := require.New(t)
nomadClient := tc.Nomad()
uuid := uuid.Generate()
pluginID := "aws-efs0"

// deploy the node plugins job (no need for a controller for EFS)
nodesJobID := "aws-efs-plugin-nodes-" + uuid[0:8]
f.NoError(e2e.Register(nodesJobID, "csi/input/plugin-aws-efs-nodes.nomad"))
tc.pluginJobIDs = append(tc.pluginJobIDs, nodesJobID)

f.NoError(e2e.WaitForAllocStatusComparison(
func() ([]string, error) { return e2e.AllocStatuses(nodesJobID, ns) },
func(got []string) bool {
for _, status := range got {
if status != "running" {
return false
}
}
return true
}, nil,
))

f.NoError(waitForPluginStatusMinNodeCount(pluginID, 2, pluginWait),
"aws-efs0 node plugins did not become healthy")

// register a volume
volID := "efs-vol0"
err := volumeRegister(volID, "csi/input/volume-efs.hcl")
require.NoError(err)
tc.volumeIDs = append(tc.volumeIDs, volID)

// deploy a job that writes to the volume
writeJobID := "write-efs-" + uuid[0:8]
tc.testJobIDs = append(tc.testJobIDs, writeJobID) // ensure failed tests clean up
f.NoError(e2e.Register(writeJobID, "csi/input/use-efs-volume-write.nomad"))
f.NoError(
e2e.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
"job should be running")

allocs, err := e2e.AllocsForJob(writeJobID, ns)
f.NoError(err, "could not get allocs for write job")
f.Len(allocs, 1, "could not get allocs for write job")
writeAllocID := allocs[0]["ID"]

// read data from volume and assert the writer wrote a file to it
expectedPath := "/task/test/" + writeAllocID
_, err = readFile(nomadClient, writeAllocID, expectedPath)
require.NoError(err)

// Shutdown the writer so we can run a reader.
// although EFS should support multiple readers, the plugin
// does not.
_, err = e2e.Command("nomad", "job", "stop", writeJobID)
require.NoError(err)

// wait for the volume unpublish workflow to complete
require.NoError(waitForVolumeClaimRelease(volID, reapWait),
"write-efs alloc claim was not released")

// deploy a job that reads from the volume
readJobID := "read-efs-" + uuid[0:8]
tc.testJobIDs = append(tc.testJobIDs, readJobID) // ensure failed tests clean up
f.NoError(e2e.Register(readJobID, "csi/input/use-efs-volume-read.nomad"))
f.NoError(
e2e.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}),
"job should be running")

allocs, err = e2e.AllocsForJob(readJobID, ns)
f.NoError(err, "could not get allocs for read job")
f.Len(allocs, 1, "could not get allocs for read job")
readAllocID := allocs[0]["ID"]

// read data from volume and assert the writer wrote a file to it
require.NoError(err)
_, err = readFile(nomadClient, readAllocID, expectedPath)
require.NoError(err)
}

func (tc *CSIVolumesTest) AfterEach(f *framework.F) {

// Stop all jobs in test
for _, id := range tc.testJobIDs {
out, err := e2e.Command("nomad", "job", "stop", "-purge", id)
f.Assert().NoError(err, out)
}
tc.testJobIDs = []string{}

// Deregister all volumes in test
for _, id := range tc.volumeIDs {
// make sure all the test jobs have finished unpublishing claims
err := waitForVolumeClaimRelease(id, reapWait)
f.Assert().NoError(err, "volume claims were not released")

out, err := e2e.Command("nomad", "volume", "deregister", id)
if err != nil {
fmt.Println("could not deregister volume, dumping allocation logs")
f.Assert().NoError(tc.dumpLogs())
}
f.Assert().NoError(err, out)
}
tc.volumeIDs = []string{}

// Deregister all plugin jobs in test
for _, id := range tc.pluginJobIDs {
out, err := e2e.Command("nomad", "job", "stop", "-purge", id)
f.Assert().NoError(err, out)
// requireNoErrorElseDump calls a halting assert on the error and dumps the
// plugin logs if it fails.
func requireNoErrorElseDump(f *framework.F, err error, msg string, pluginJobIDs []string) {
if err != nil {
dumpLogs(pluginJobIDs)
f.NoError(err, fmt.Sprintf("%v: %v", msg, err))
}
tc.pluginJobIDs = []string{}

// Garbage collect
out, err := e2e.Command("nomad", "system", "gc")
f.Assert().NoError(err, out)
}
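
A sketch of how the new non-halting and halting helpers might be called; a hedged example only, where pluginID, volID, and tc.pluginJobIDs are assumed names rather than lines from this diff:

// Halting variant: stop the test if the plugins never become healthy,
// dumping their logs first.
err := waitForPluginStatusMinNodeCount(pluginID, 2, pluginWait)
requireNoErrorElseDump(f, err, "node plugins did not become healthy", tc.pluginJobIDs)

// Non-halting variant: keep cleaning up even if deregistration fails.
out, err := e2e.Command("nomad", "volume", "deregister", volID)
assertNoErrorElseDump(f, err,
	fmt.Sprintf("could not deregister volume %q: %v", volID, out), tc.pluginJobIDs)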

// waitForVolumeClaimRelease makes sure we don't try to re-claim a volume
// that's in the process of being unpublished. we can't just wait for allocs
// to stop, but need to wait for their claims to be released
func waitForVolumeClaimRelease(volID string, wc *e2e.WaitConfig) error {
var out string
var err error
testutil.WaitForResultRetries(wc.Retries, func() (bool, error) {
time.Sleep(wc.Interval)
out, err = e2e.Command("nomad", "volume", "status", volID)
if err != nil {
return false, err
}
section, err := e2e.GetSection(out, "Allocations")
if err != nil {
return false, err
}
return strings.Contains(section, "No allocations placed"), nil
}, func(e error) {
if e == nil {
err = nil
}
err = fmt.Errorf("alloc claim was not released: %v\n%s", e, out)
})
return err
}

func (tc *CSIVolumesTest) dumpLogs() error {
func dumpLogs(pluginIDs []string) error {

for _, id := range tc.pluginJobIDs {
for _, id := range pluginIDs {
allocs, err := e2e.AllocsForJob(id, ns)
if err != nil {
return fmt.Errorf("could not find allocs for plugin: %v", err)
@@ -340,6 +89,32 @@ func (tc *CSIVolumesTest) dumpLogs() error {
return nil
}

// waitForVolumeClaimRelease makes sure we don't try to re-claim a volume
// that's in the process of being unpublished. we can't just wait for allocs
// to stop, but need to wait for their claims to be released
func waitForVolumeClaimRelease(volID string, wc *e2e.WaitConfig) error {
var out string
var err error
testutil.WaitForResultRetries(wc.Retries, func() (bool, error) {
time.Sleep(wc.Interval)
out, err = e2e.Command("nomad", "volume", "status", volID)
if err != nil {
return false, err
}
section, err := e2e.GetSection(out, "Allocations")
if err != nil {
return false, err
}
return strings.Contains(section, "No allocations placed"), nil
}, func(e error) {
if e == nil {
err = nil
}
err = fmt.Errorf("alloc claim was not released: %v\n%s", e, out)
})
return err
}
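
Why this matters in practice: the writer job has to release its claim before a reader can reuse the volume. A sketch of that handoff using the helpers above, with writeJobID, volID, readJobID, and tc.pluginJobIDs assumed for illustration:

// Stop (and purge) the writer, wait for the unpublish workflow to finish,
// then place a reader against the same volume.
_, err := e2e.Command("nomad", "job", "stop", "-purge", writeJobID)
requireNoErrorElseDump(f, err, "could not stop writer job", tc.pluginJobIDs)

requireNoErrorElseDump(f, waitForVolumeClaimRelease(volID, reapWait),
	"writer alloc claim was not released", tc.pluginJobIDs)

f.NoError(e2e.Register(readJobID, "csi/input/use-ebs-volume.nomad"))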

// TODO(tgross): replace this w/ AllocFS().Stat() after
// https://github.com/hashicorp/nomad/issues/7365 is fixed
func readFile(client *api.Client, allocID string, path string) (bytes.Buffer, error) {
@@ -434,11 +209,12 @@ func waitForPluginStatusCompare(pluginID string, compare func(got string) (bool,
return err
}

// VolumeRegister registers a jobspec from a file but with a unique ID.
// The caller is responsible for recording that ID for later cleanup.
func volumeRegister(volID, volFilePath string) error {
// volumeRegister creates or registers a volume spec from a file but with a
// unique ID. The caller is responsible for recording that ID for later
// cleanup.
func volumeRegister(volID, volFilePath, createOrRegister string) error {

cmd := exec.Command("nomad", "volume", "register", "-")
cmd := exec.Command("nomad", "volume", createOrRegister, "-")
stdin, err := cmd.StdinPipe()
if err != nil {
return fmt.Errorf("could not open stdin?: %w", err)
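With the extra argument, the same helper now drives both workflows named in the commit title. A hypothetical pair of calls, reusing the volume spec paths from the old tests; whether a given spec goes through create or register is an assumption here:

// Register a pre-existing volume with the cluster...
requireNoErrorElseDump(f,
	volumeRegister("efs-vol0", "csi/input/volume-efs.hcl", "register"),
	"could not register volume", tc.pluginJobIDs)

// ...or have the controller plugin create it from the spec, which is the
// path the new create/snapshot coverage exercises.
requireNoErrorElseDump(f,
	volumeRegister("ebs-vol0", "csi/input/volume-ebs.hcl", "create"),
	"could not create volume", tc.pluginJobIDs)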
