From b3a33e0d468b17b1672239d617f2106ddba754dc Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Wed, 20 May 2020 15:22:51 -0400
Subject: [PATCH] csi: don't pass volume claim releases thru GC eval (#8021)

Following the new volumewatcher in #7794 and performance improvements
to it that landed afterwards, there's no particular reason we should be
threading claim releases through the GC eval rather than writing an
empty `CSIVolumeClaimRequest` with the mode set to
`CSIVolumeClaimRelease`, just as the GC evaluation would do.

Also, by batching up these raft messages, we can reduce the number of
raft writes by 1 and cross-server RPCs by 1 per volume we release
claims on.
---
 nomad/csi_batch.go          |  76 ++++++++++++++++++++++++++
 nomad/csi_batch_test.go     |  34 ++++++++++++
 nomad/job_endpoint.go       | 105 +++++++++++++++--------------------
 nomad/node_endpoint.go      |  40 +++++---------
 nomad/node_endpoint_test.go |  65 ++++++++++++++--------
 5 files changed, 210 insertions(+), 110 deletions(-)
 create mode 100644 nomad/csi_batch.go
 create mode 100644 nomad/csi_batch_test.go

diff --git a/nomad/csi_batch.go b/nomad/csi_batch.go
new file mode 100644
index 000000000000..50e36e7a6c40
--- /dev/null
+++ b/nomad/csi_batch.go
@@ -0,0 +1,76 @@
+package nomad
+
+import (
+	log "github.com/hashicorp/go-hclog"
+	multierror "github.com/hashicorp/go-multierror"
+	"github.com/hashicorp/nomad/nomad/structs"
+)
+
+// csiBatchRelease is a helper for any time we need to release a bunch
+// of volume claims at once. It de-duplicates the volumes and batches
+// the raft messages into manageable chunks. Intended for use by RPCs
+// that have already been forwarded to the leader.
+type csiBatchRelease struct {
+	srv    *Server
+	logger log.Logger
+
+	maxBatchSize int
+	seen         map[string]struct{}
+	batches      []*structs.CSIVolumeClaimBatchRequest
+}
+
+func newCSIBatchRelease(srv *Server, logger log.Logger, max int) *csiBatchRelease {
+	return &csiBatchRelease{
+		srv:          srv,
+		logger:       logger,
+		maxBatchSize: max,
+		seen:         map[string]struct{}{},
+		batches:      []*structs.CSIVolumeClaimBatchRequest{{}},
+	}
+}
+
+// add the volume ID + namespace to the deduplicated batches
+func (c *csiBatchRelease) add(vol, namespace string) {
+	id := vol + "\x00" + namespace
+
+	// ignore duplicates
+	_, seen := c.seen[id]
+	if seen {
+		return
+	}
+	c.seen[id] = struct{}{}
+
+	req := structs.CSIVolumeClaimRequest{
+		VolumeID: vol,
+		Claim:    structs.CSIVolumeClaimRelease,
+	}
+	req.Namespace = namespace
+	req.Region = c.srv.config.Region
+
+	for _, batch := range c.batches {
+		// append to the first non-full batch
+		if len(batch.Claims) < c.maxBatchSize {
+			batch.Claims = append(batch.Claims, req)
+			return
+		}
+	}
+	// no non-full batch found, make a new one
+	newBatch := &structs.CSIVolumeClaimBatchRequest{
+		Claims: []structs.CSIVolumeClaimRequest{req}}
+	c.batches = append(c.batches, newBatch)
+}
+
+// apply flushes the batches to raft
+func (c *csiBatchRelease) apply() error {
+	var result *multierror.Error
+	for _, batch := range c.batches {
+		if len(batch.Claims) > 0 {
+			_, _, err := c.srv.raftApply(structs.CSIVolumeClaimBatchRequestType, batch)
+			if err != nil {
+				c.logger.Error("csi raft apply failed", "error", err, "method", "claim")
+				result = multierror.Append(result, err)
+			}
+		}
+	}
+	return result.ErrorOrNil()
+}
diff --git a/nomad/csi_batch_test.go b/nomad/csi_batch_test.go
new file mode 100644
index 000000000000..565fef591e30
--- /dev/null
+++ b/nomad/csi_batch_test.go
@@ -0,0 +1,34 @@
+package nomad
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestCSI_Batcher(t *testing.T) {
+	t.Parallel()
+	srv, shutdown := TestServer(t, func(c *Config) {
+		c.NumSchedulers = 0 // Prevent automatic dequeue
+	})
+	defer shutdown()
+
+	batcher := newCSIBatchRelease(srv, nil, 5)
+
+	batcher.add("vol0", "global")
+	batcher.add("vol", "0global")
+	batcher.add("vol1", "global")
+	batcher.add("vol1", "global")
+	batcher.add("vol2", "global")
+	batcher.add("vol2", "other")
+	batcher.add("vol3", "global")
+	batcher.add("vol4", "global")
+	batcher.add("vol5", "global")
+	batcher.add("vol6", "global")
+
+	require.Len(t, batcher.batches, 2)
+	require.Len(t, batcher.batches[0].Claims, 5, "first batch")
+	require.Equal(t, batcher.batches[0].Claims[4].VolumeID, "vol2")
+	require.Equal(t, batcher.batches[0].Claims[4].Namespace, "other")
+	require.Len(t, batcher.batches[1].Claims, 4, "second batch")
+}
diff --git a/nomad/job_endpoint.go b/nomad/job_endpoint.go
index 536a051ee050..ce18222b69c7 100644
--- a/nomad/job_endpoint.go
+++ b/nomad/job_endpoint.go
@@ -707,19 +707,20 @@ func (j *Job) Deregister(args *structs.JobDeregisterRequest, reply *structs.JobD
 		return err
 	}
 
-	// For a job with volumes, find its volumes before deleting the job
-	volumesToGC := make(map[string]*structs.VolumeRequest)
+	// For a job with volumes, find its volumes before deleting the job.
+	// Later we'll apply this via raft.
+	volumesToGC := newCSIBatchRelease(j.srv, j.logger, 100)
 	if job != nil {
 		for _, tg := range job.TaskGroups {
 			for _, vol := range tg.Volumes {
 				if vol.Type == structs.VolumeTypeCSI {
-					volumesToGC[vol.Source] = vol
+					volumesToGC.add(vol.Source, job.Namespace)
 				}
 			}
 		}
 	}
 
-	// Commit this update via Raft
+	// Commit the job update via Raft
 	_, index, err := j.srv.raftApply(structs.JobDeregisterRequestType, args)
 	if err != nil {
 		j.logger.Error("deregister failed", "error", err)
@@ -729,70 +730,52 @@ func (j *Job) Deregister(args *structs.JobDeregisterRequest, reply *structs.JobD
 	// Populate the reply with job information
 	reply.JobModifyIndex = index
 
-	evals := []*structs.Evaluation{}
-	now := time.Now().UTC().UnixNano()
-
-	// Add an evaluation for garbage collecting the the CSI volume claims
-	// of terminal allocs
-	for _, vol := range volumesToGC {
-		// we have to build this eval by hand rather than calling srv.CoreJob
-		// here because we need to use the volume's namespace
-		eval := &structs.Evaluation{
-			ID:          uuid.Generate(),
-			Namespace:   job.Namespace,
-			Priority:    structs.CoreJobPriority,
-			Type:        structs.JobTypeCore,
-			TriggeredBy: structs.EvalTriggerAllocStop,
-			JobID:       structs.CoreJobCSIVolumeClaimGC + ":" + vol.Source,
-			LeaderACL:   j.srv.getLeaderAcl(),
-			Status:      structs.EvalStatusPending,
-			CreateTime:  now,
-			ModifyTime:  now,
-		}
-		evals = append(evals, eval)
+	// Make a raft apply to release the CSI volume claims of terminal allocs.
+	var result *multierror.Error
+	err = volumesToGC.apply()
+	if err != nil {
+		result = multierror.Append(result, err)
 	}
 
-	// If the job is periodic or parameterized, we don't create an eval
-	// for the job, but might still need one for the volumes
-	if job == nil || !(job.IsPeriodic() || job.IsParameterized()) {
-		// Create a new evaluation
-		// XXX: The job priority / type is strange for this, since it's not a high
-		// priority even if the job was.
-		eval := &structs.Evaluation{
-			ID:             uuid.Generate(),
-			Namespace:      args.RequestNamespace(),
-			Priority:       structs.JobDefaultPriority,
-			Type:           structs.JobTypeService,
-			TriggeredBy:    structs.EvalTriggerJobDeregister,
-			JobID:          args.JobID,
-			JobModifyIndex: index,
-			Status:         structs.EvalStatusPending,
-			CreateTime:     now,
-			ModifyTime:     now,
-		}
-		evals = append(evals, eval)
+	// If the job is periodic or parameterized, we don't create an eval.
+	if job != nil && (job.IsPeriodic() || job.IsParameterized()) {
+		return nil
 	}
 
-	if len(evals) > 0 {
-		update := &structs.EvalUpdateRequest{
-			Evals:        evals,
-			WriteRequest: structs.WriteRequest{Region: args.Region},
-		}
-
-		// Commit this evaluation via Raft
-		_, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update)
-		if err != nil {
-			j.logger.Error("eval create failed", "error", err, "method", "deregister")
-			return err
-		}
+	// Create a new evaluation
+	// XXX: The job priority / type is strange for this, since it's not a high
+	// priority even if the job was.
+	now := time.Now().UTC().UnixNano()
+	eval := &structs.Evaluation{
+		ID:             uuid.Generate(),
+		Namespace:      args.RequestNamespace(),
+		Priority:       structs.JobDefaultPriority,
+		Type:           structs.JobTypeService,
+		TriggeredBy:    structs.EvalTriggerJobDeregister,
+		JobID:          args.JobID,
+		JobModifyIndex: index,
+		Status:         structs.EvalStatusPending,
+		CreateTime:     now,
+		ModifyTime:     now,
+	}
+	update := &structs.EvalUpdateRequest{
+		Evals:        []*structs.Evaluation{eval},
+		WriteRequest: structs.WriteRequest{Region: args.Region},
+	}
 
-		// Populate the reply with eval information
-		reply.EvalID = evals[len(evals)-1].ID
-		reply.EvalCreateIndex = evalIndex
-		reply.Index = evalIndex
+	// Commit this evaluation via Raft
+	_, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update)
+	if err != nil {
+		result = multierror.Append(result, err)
+		j.logger.Error("eval create failed", "error", err, "method", "deregister")
+		return result.ErrorOrNil()
 	}
 
-	return nil
+	// Populate the reply with eval information
+	reply.EvalID = eval.ID
+	reply.EvalCreateIndex = evalIndex
+	reply.Index = evalIndex
+	return result.ErrorOrNil()
 }
 
 // BatchDeregister is used to remove a set of jobs from the cluster.
diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go
index e2b4d6940cc7..d613f62b0025 100644
--- a/nomad/node_endpoint.go
+++ b/nomad/node_endpoint.go
@@ -1081,9 +1081,9 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene
 	now := time.Now()
 	var evals []*structs.Evaluation
 
-	// A set of de-duplicated volumes that need volume claim GC.
-	// Later we'll create a gc eval for each volume.
-	volumesToGC := make(map[string][]string) // ID+namespace -> [id, namespace]
+	// A set of de-duplicated volumes that need their volume claims released.
+	// Later we'll apply this via raft.
+	volumesToGC := newCSIBatchRelease(n.srv, n.logger, 100)
 
 	for _, allocToUpdate := range args.Alloc {
 		allocToUpdate.ModifyTime = now.UTC().UnixNano()
@@ -1113,11 +1113,11 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene
 			continue
 		}
 
-		// If the terminal alloc has CSI volumes, add its job to the list
-		// of jobs we're going to call volume claim GC on.
+		// If the terminal alloc has CSI volumes, add the volumes to the batch
+		// of volumes whose claims we'll release.
 			for _, vol := range taskGroup.Volumes {
 				if vol.Type == structs.VolumeTypeCSI {
-					volumesToGC[vol.Source+alloc.Namespace] = []string{vol.Source, alloc.Namespace}
+					volumesToGC.add(vol.Source, alloc.Namespace)
 				}
 			}
 
@@ -1138,24 +1138,11 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene
 		}
 	}
 
-	// Add an evaluation for garbage collecting the the CSI volume claims
-	// of terminal allocs
-	for _, volAndNamespace := range volumesToGC {
-		// we have to build this eval by hand rather than calling srv.CoreJob
-		// here because we need to use the volume's namespace
-		eval := &structs.Evaluation{
-			ID:          uuid.Generate(),
-			Namespace:   volAndNamespace[1],
-			Priority:    structs.CoreJobPriority,
-			Type:        structs.JobTypeCore,
-			TriggeredBy: structs.EvalTriggerAllocStop,
-			JobID:       structs.CoreJobCSIVolumeClaimGC + ":" + volAndNamespace[0],
-			LeaderACL:   n.srv.getLeaderAcl(),
-			Status:      structs.EvalStatusPending,
-			CreateTime:  now.UTC().UnixNano(),
-			ModifyTime:  now.UTC().UnixNano(),
-		}
-		evals = append(evals, eval)
+	// Make a raft apply to release the CSI volume claims of terminal allocs.
+	var result *multierror.Error
+	err := volumesToGC.apply()
+	if err != nil {
+		result = multierror.Append(result, err)
 	}
 
 	// Add this to the batch
@@ -1188,12 +1175,13 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene
 
 	// Wait for the future
 	if err := future.Wait(); err != nil {
-		return err
+		result = multierror.Append(result, err)
+		return result.ErrorOrNil()
 	}
 
 	// Setup the response
 	reply.Index = future.Index()
-	return nil
+	return result.ErrorOrNil()
 }
 
 // batchUpdate is used to update all the allocations
diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go
index 76f8ebcf3242..8d56bf2c95e3 100644
--- a/nomad/node_endpoint_test.go
+++ b/nomad/node_endpoint_test.go
@@ -2321,6 +2321,8 @@ func TestClientEndpoint_UpdateAlloc_UnclaimVolumes(t *testing.T) {
 	codec := rpcClient(t, srv)
 	state := srv.fsm.State()
+
+	index := uint64(0)
 	ws := memdb.NewWatchSet()
 
 	// Create a client node, plugin, and volume
@@ -2333,7 +2335,8 @@ func TestClientEndpoint_UpdateAlloc_UnclaimVolumes(t *testing.T) {
 			ControllerInfo: &structs.CSIControllerInfo{},
 		},
 	}
-	err := state.UpsertNode(99, node)
+	index++
+	err := state.UpsertNode(index, node)
 	require.NoError(t, err)
 	volId0 := uuid.Generate()
 	ns := structs.DefaultNamespace
@@ -2344,7 +2347,8 @@ func TestClientEndpoint_UpdateAlloc_UnclaimVolumes(t *testing.T) {
 		AccessMode:     structs.CSIVolumeAccessModeMultiNodeSingleWriter,
 		AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
 	}}
-	err = state.CSIVolumeRegister(100, vols)
+	index++
+	err = state.CSIVolumeRegister(index, vols)
 	require.NoError(t, err)
 	vol, err := state.CSIVolumeByID(ws, ns, volId0)
 	require.NoError(t, err)
@@ -2361,39 +2365,51 @@ func TestClientEndpoint_UpdateAlloc_UnclaimVolumes(t *testing.T) {
 			ReadOnly: false,
 		},
 	}
-	err = state.UpsertJob(101, job)
+	index++
+	err = state.UpsertJob(index, job)
 	require.NoError(t, err)
 
 	alloc1 := mock.Alloc()
 	alloc1.JobID = job.ID
 	alloc1.NodeID = node.ID
-	err = state.UpsertJobSummary(102, mock.JobSummary(alloc1.JobID))
+	index++
+	err = state.UpsertJobSummary(index, mock.JobSummary(alloc1.JobID))
 	require.NoError(t, err)
 	alloc1.TaskGroup = job.TaskGroups[0].Name
 
 	alloc2 := mock.Alloc()
 	alloc2.JobID = job.ID
 	alloc2.NodeID = node.ID
-	err = state.UpsertJobSummary(103, mock.JobSummary(alloc2.JobID))
+	index++
+	err = state.UpsertJobSummary(index, mock.JobSummary(alloc2.JobID))
 	require.NoError(t, err)
 	alloc2.TaskGroup = job.TaskGroups[0].Name
 
-	err = state.UpsertAllocs(104, []*structs.Allocation{alloc1, alloc2})
+	index++
+	err = state.UpsertAllocs(index, []*structs.Allocation{alloc1, alloc2})
 	require.NoError(t, err)
 
-	// Claim the volumes and verify the claims were set
-	err = state.CSIVolumeClaim(105, ns, volId0, &structs.CSIVolumeClaim{
-		AllocationID: alloc1.ID,
-		NodeID:       alloc1.NodeID,
-		Mode:         structs.CSIVolumeClaimWrite,
-	})
-	require.NoError(t, err)
-	err = state.CSIVolumeClaim(106, ns, volId0, &structs.CSIVolumeClaim{
-		AllocationID: alloc2.ID,
-		NodeID:       alloc2.NodeID,
-		Mode:         structs.CSIVolumeClaimRead,
-	})
+	// Claim the volumes and verify the claims were set. We need to
+	// apply this through the FSM so that the index is properly
+	// updated, which we verify later in the test.
+	batch := &structs.CSIVolumeClaimBatchRequest{
+		Claims: []structs.CSIVolumeClaimRequest{
+			{
+				VolumeID:     volId0,
+				AllocationID: alloc1.ID,
+				NodeID:       alloc1.NodeID,
+				Claim:        structs.CSIVolumeClaimWrite,
+			},
+			{
+				VolumeID:     volId0,
+				AllocationID: alloc2.ID,
+				NodeID:       alloc2.NodeID,
+				Claim:        structs.CSIVolumeClaimRead,
+			},
+		}}
+	_, lastIndex, err := srv.raftApply(structs.CSIVolumeClaimBatchRequestType, batch)
 	require.NoError(t, err)
+
 	vol, err = state.CSIVolumeByID(ws, ns, volId0)
 	require.NoError(t, err)
 	require.Len(t, vol.ReadAllocs, 1)
@@ -2413,11 +2429,14 @@ func TestClientEndpoint_UpdateAlloc_UnclaimVolumes(t *testing.T) {
 	require.NoError(t, err)
 	require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
 
-	// Verify the eval for the claim GC was emitted
-	// Lookup the evaluations
-	eval, err := state.EvalsByJob(ws, job.Namespace, structs.CoreJobCSIVolumeClaimGC+":"+volId0)
-	require.NotNil(t, eval)
-	require.Nil(t, err)
+	// Verify the index has been updated to trigger a volume claim release
+
+	req := &structs.CSIVolumeGetRequest{ID: volId0}
+	req.Region = "global"
+	getResp := &structs.CSIVolumeGetResponse{}
+	err = msgpackrpc.CallWithCodec(codec, "CSIVolume.Get", req, getResp)
+	require.NoError(t, err)
+	require.Greater(t, getResp.Volume.ModifyIndex, lastIndex)
 }
 
 func TestClientEndpoint_CreateNodeEvals(t *testing.T) {
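
Usage sketch (illustrative only, not part of the patch): the helper added in nomad/csi_batch.go is meant to be driven from leader-side RPC handlers, as the job and node endpoints above do. The sketch below shows that calling pattern in a hypothetical helper; releaseClaimsForAllocs and its arguments are invented for illustration, while csiBatchRelease, newCSIBatchRelease, add, and apply come from the new file in this diff.

package nomad

import (
	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/nomad/nomad/structs"
)

// releaseClaimsForAllocs is a hypothetical example of the intended calling
// pattern: de-duplicate (volume, namespace) pairs from terminal allocations,
// then flush them to raft in batches of up to 100 claims per
// CSIVolumeClaimBatchRequest.
func releaseClaimsForAllocs(srv *Server, logger log.Logger, allocs []*structs.Allocation) error {
	releases := newCSIBatchRelease(srv, logger, 100)
	for _, alloc := range allocs {
		if alloc.Job == nil {
			continue
		}
		tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
		if tg == nil {
			continue
		}
		for _, vol := range tg.Volumes {
			if vol.Type == structs.VolumeTypeCSI {
				// each (volume ID, namespace) pair is de-duplicated by add()
				releases.add(vol.Source, alloc.Namespace)
			}
		}
	}
	// apply() writes one CSIVolumeClaimBatchRequest raft message per batch
	// and aggregates any errors with multierror.
	return releases.apply()
}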