
Commit

Merge remote-tracking branch 'origin/main' into f-verify-rpc-cert-origin
lgfa29 committed Feb 1, 2022
2 parents d1950c4 + 4d5bc23 commit dd59dc5
Showing 66 changed files with 1,844 additions and 422 deletions.
3 changes: 3 additions & 0 deletions .changelog/11878.txt
@@ -0,0 +1,3 @@
```release-note:bug
core: Fixed auto-promotion of canaries in jobs with at least one task group without canaries.
```
3 changes: 3 additions & 0 deletions .changelog/11890.txt
@@ -0,0 +1,3 @@
```release-note:bug
csi: Fixed a bug where garbage collected allocations could block new claims on a volume
```
3 changes: 3 additions & 0 deletions .changelog/11892.txt
@@ -0,0 +1,3 @@
```release-note:bug
csi: Unmount volumes from the client before sending unpublish RPC
```
27 changes: 27 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,12 @@
## 1.2.5 (February 1, 2022)

BUG FIXES:

* csi: Fixed a bug where garbage collected allocations could block new claims on a volume [[GH-11890](https://github.com/hashicorp/nomad/issues/11890)]
* csi: Fixed a bug where releasing volume claims would fail with ACL errors after leadership transitions. [[GH-11891](https://github.com/hashicorp/nomad/issues/11891)]
* csi: Unmount volumes from the client before sending unpublish RPC [[GH-11892](https://github.com/hashicorp/nomad/issues/11892)]
* template: Fixed a bug where client template configuration that did not include any of the new 1.2.4 configuration options could result in none of the configuration getting set. [[GH-11902](https://github.com/hashicorp/nomad/issues/11902)]

## 1.2.4 (January 18, 2022)

FEATURES:
@@ -151,6 +160,15 @@ BUG FIXES:
* server: Fixed a panic on arm64 platform when dispatching a job with a payload [[GH-11396](https://github.com/hashicorp/nomad/issues/11396)]
* server: Fixed a panic that may occur when preempting multiple allocations on the same node [[GH-11346](https://github.com/hashicorp/nomad/issues/11346)]

## 1.1.11 (February 1, 2022)

BUG FIXES:

* csi: Fixed a bug where garbage collected allocations could block new claims on a volume [[GH-11890](https://github.com/hashicorp/nomad/issues/11890)]
* csi: Fixed a bug where releasing volume claims would fail with ACL errors after leadership transitions. [[GH-11891](https://github.com/hashicorp/nomad/issues/11891)]
* csi: Fixed a bug where volume claim releases that were not fully processed before a leadership transition would be ignored [[GH-11776](https://github.com/hashicorp/nomad/issues/11776)]
* csi: Unmount volumes from the client before sending unpublish RPC [[GH-11892](https://github.com/hashicorp/nomad/issues/11892)]

## 1.1.10 (January 18, 2022)

BUG FIXES:
@@ -434,6 +452,15 @@ BUG FIXES:
* server: Fixed a panic that may arise on submission of jobs containing invalid service checks [[GH-10154](https://github.com/hashicorp/nomad/issues/10154)]
* ui: Fixed the rendering of interstitial components shown after processing a dynamic application sizing recommendation. [[GH-10094](https://github.com/hashicorp/nomad/pull/10094)]

## 1.0.17 (February 1, 2022)

BUG FIXES:

* csi: Fixed a bug where garbage collected allocations could block new claims on a volume [[GH-11890](https://github.com/hashicorp/nomad/issues/11890)]
* csi: Fixed a bug where releasing volume claims would fail with ACL errors after leadership transitions. [[GH-11891](https://github.com/hashicorp/nomad/issues/11891)]
* csi: Fixed a bug where volume claim releases that were not fully processed before a leadership transition would be ignored [[GH-11776](https://github.com/hashicorp/nomad/issues/11776)]
* csi: Unmount volumes from the client before sending unpublish RPC [[GH-11892](https://github.com/hashicorp/nomad/issues/11892)]

## 1.0.16 (January 18, 2022)

BUG FIXES:
2 changes: 1 addition & 1 deletion GNUmakefile
@@ -39,7 +39,7 @@ PROTO_COMPARE_TAG ?= v1.0.3$(if $(findstring ent,$(GO_TAGS)),+ent,)

# LAST_RELEASE is the git sha of the latest release corresponding to this branch. main should have the latest
# published release, but backport branches should point to the parent tag (e.g. 1.0.8 in release-1.0.9 after 1.1.0 is cut).
LAST_RELEASE ?= v1.2.4
LAST_RELEASE ?= v1.2.5

default: help

160 changes: 130 additions & 30 deletions client/allocrunner/csi_hook.go
@@ -3,6 +3,8 @@ package allocrunner
import (
"context"
"fmt"
"sync"
"time"

hclog "github.com/hashicorp/go-hclog"
multierror "github.com/hashicorp/go-multierror"
@@ -24,7 +26,9 @@ type csiHook struct {
updater hookResourceSetter
nodeSecret string

volumeRequests map[string]*volumeAndRequest
volumeRequests map[string]*volumeAndRequest
maxBackoffInterval time.Duration
maxBackoffDuration time.Duration
}

// implemented by allocrunner
@@ -42,6 +46,8 @@ func newCSIHook(alloc *structs.Allocation, logger hclog.Logger, csi csimanager.M
updater: updater,
nodeSecret: nodeSecret,
volumeRequests: map[string]*volumeAndRequest{},
maxBackoffInterval: time.Minute,
maxBackoffDuration: time.Hour * 24,
}
}

@@ -103,41 +109,43 @@ func (c *csiHook) Postrun() error {
return nil
}

var mErr *multierror.Error
var wg sync.WaitGroup
errs := make(chan error, len(c.volumeRequests))

for _, pair := range c.volumeRequests {
wg.Add(1)

// CSI RPCs can potentially fail for a very long time if a
// node plugin has failed. split the work into goroutines so
// that operators could potentially reuse one of a set of
// volumes even if this hook is stuck waiting on the others
go func(pair *volumeAndRequest) {
defer wg.Done()

// we can recover an unmount failure if the operator
// brings the plugin back up, so retry every few minutes
// but eventually give up
err := c.unmountWithRetry(pair)
if err != nil {
errs <- err
return
}

mode := structs.CSIVolumeClaimRead
if !pair.request.ReadOnly {
mode = structs.CSIVolumeClaimWrite
}
// we can't recover from this RPC error client-side; the
// volume claim GC job will have to clean up for us once
// the allocation is marked terminal
errs <- c.unpublish(pair)
}(pair)
}

source := pair.request.Source
if pair.request.PerAlloc {
// NOTE: PerAlloc can't be set if we have canaries
source = source + structs.AllocSuffix(c.alloc.Name)
}
wg.Wait()
close(errs) // so we don't block waiting if there were no errors

req := &structs.CSIVolumeUnpublishRequest{
VolumeID: source,
Claim: &structs.CSIVolumeClaim{
AllocationID: c.alloc.ID,
NodeID: c.alloc.NodeID,
Mode: mode,
State: structs.CSIVolumeClaimStateUnpublishing,
},
WriteRequest: structs.WriteRequest{
Region: c.alloc.Job.Region,
Namespace: c.alloc.Job.Namespace,
AuthToken: c.nodeSecret,
},
}
err := c.rpcClient.RPC("CSIVolume.Unpublish",
req, &structs.CSIVolumeUnpublishResponse{})
if err != nil {
mErr = multierror.Append(mErr, err)
}
var mErr *multierror.Error
for err := range errs {
mErr = multierror.Append(mErr, err)
}

return mErr.ErrorOrNil()
}

@@ -231,3 +239,95 @@ func (c *csiHook) shouldRun() bool {

return false
}

func (c *csiHook) unpublish(pair *volumeAndRequest) error {

mode := structs.CSIVolumeClaimRead
if !pair.request.ReadOnly {
mode = structs.CSIVolumeClaimWrite
}

source := pair.request.Source
if pair.request.PerAlloc {
// NOTE: PerAlloc can't be set if we have canaries
source = source + structs.AllocSuffix(c.alloc.Name)
}

req := &structs.CSIVolumeUnpublishRequest{
VolumeID: source,
Claim: &structs.CSIVolumeClaim{
AllocationID: c.alloc.ID,
NodeID: c.alloc.NodeID,
Mode: mode,
State: structs.CSIVolumeClaimStateUnpublishing,
},
WriteRequest: structs.WriteRequest{
Region: c.alloc.Job.Region,
Namespace: c.alloc.Job.Namespace,
AuthToken: c.nodeSecret,
},
}

return c.rpcClient.RPC("CSIVolume.Unpublish",
req, &structs.CSIVolumeUnpublishResponse{})

}

// unmountWithRetry tries to unmount/unstage the volume, retrying with
// exponential backoff capped to a maximum interval
func (c *csiHook) unmountWithRetry(pair *volumeAndRequest) error {

// note: allocrunner hooks don't have access to the client's
// shutdown context, just the allocrunner's shutdown; if we make
// it available in the future we should thread it through here so
// that retry can exit gracefully instead of dropping the
// in-flight goroutine
ctx, cancel := context.WithTimeout(context.TODO(), c.maxBackoffDuration)
defer cancel()
var err error
backoff := time.Second
ticker := time.NewTicker(backoff)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return err
case <-ticker.C:
}

err = c.unmountImpl(pair)
if err == nil {
break
}

if backoff < c.maxBackoffInterval {
backoff = backoff * 2
if backoff > c.maxBackoffInterval {
backoff = c.maxBackoffInterval
}
}
ticker.Reset(backoff)
}
return nil
}

// unmountImpl implements the call to the CSI plugin manager to
// unmount the volume. Each retry will write an "Unmount volume"
// NodeEvent
func (c *csiHook) unmountImpl(pair *volumeAndRequest) error {

mounter, err := c.csimanager.MounterForPlugin(context.TODO(), pair.volume.PluginID)
if err != nil {
return err
}

usageOpts := &csimanager.UsageOptions{
ReadOnly: pair.request.ReadOnly,
AttachmentMode: pair.request.AttachmentMode,
AccessMode: pair.request.AccessMode,
MountOptions: pair.request.MountOptions,
}

return mounter.UnmountVolume(context.TODO(),
pair.volume.ID, pair.volume.RemoteID(), c.alloc.ID, usageOpts)
}
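
For readers skimming the diff: the new Postrun path fans unmount work out to one goroutine per volume and retries UnmountVolume with exponential backoff before sending the unpublish RPC. The standalone sketch below illustrates just the backoff schedule (a one-second wait that doubles up to maxBackoffInterval, with the whole loop bounded by a context, mirroring unmountWithRetry above). The retryWithBackoff function and the main driver here are hypothetical illustrations, not code from this commit.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// retryWithBackoff is a hypothetical stand-in for csiHook.unmountWithRetry:
// it retries fn on a ticker, doubling the wait up to maxInterval, until fn
// succeeds or ctx expires (the hook bounds ctx with maxBackoffDuration).
func retryWithBackoff(ctx context.Context, maxInterval time.Duration, fn func() error) error {
	backoff := time.Second
	ticker := time.NewTicker(backoff)
	defer ticker.Stop()

	var err error
	for {
		select {
		case <-ctx.Done():
			return err // give up and surface the last attempt's error
		case <-ticker.C:
		}

		if err = fn(); err == nil {
			return nil
		}

		// double the wait, capped at maxInterval
		if backoff < maxInterval {
			backoff *= 2
			if backoff > maxInterval {
				backoff = maxInterval
			}
		}
		ticker.Reset(backoff)
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	attempts := 0
	err := retryWithBackoff(ctx, 2*time.Second, func() error {
		attempts++
		if attempts < 3 {
			return errors.New("plugin not ready")
		}
		return nil
	})
	fmt.Println(attempts, err) // 3 <nil>
}
```

The csi_hook_test.go change below shortens both tunables (100ms interval, 2s total) so the retry loop completes quickly under test.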
10 changes: 7 additions & 3 deletions client/allocrunner/csi_hook_test.go
@@ -5,6 +5,7 @@ import (
"fmt"
"path/filepath"
"testing"
"time"

"github.com/stretchr/testify/require"

@@ -59,7 +60,7 @@ func TestCSIHook(t *testing.T) {
"test-alloc-dir/%s/testvolume0/ro-file-system-single-node-reader-only", alloc.ID)},
},
expectedMountCalls: 1,
expectedUnmountCalls: 0, // not until this is done client-side
expectedUnmountCalls: 1,
expectedClaimCalls: 1,
expectedUnpublishCalls: 1,
},
@@ -83,7 +84,7 @@ func TestCSIHook(t *testing.T) {
"test-alloc-dir/%s/testvolume0/ro-file-system-single-node-reader-only", alloc.ID)},
},
expectedMountCalls: 1,
expectedUnmountCalls: 0, // not until this is done client-side
expectedUnmountCalls: 1,
expectedClaimCalls: 1,
expectedUnpublishCalls: 1,
},
@@ -122,7 +123,7 @@ func TestCSIHook(t *testing.T) {
// "test-alloc-dir/%s/testvolume0/ro-file-system-multi-node-reader-only", alloc.ID)},
// },
// expectedMountCalls: 1,
// expectedUnmountCalls: 0, // not until this is done client-side
// expectedUnmountCalls: 1,
// expectedClaimCalls: 1,
// expectedUnpublishCalls: 1,
// },
@@ -144,6 +145,9 @@ func TestCSIHook(t *testing.T) {
},
}
hook := newCSIHook(alloc, logger, mgr, rpcer, ar, ar, "secret")
hook.maxBackoffInterval = 100 * time.Millisecond
hook.maxBackoffDuration = 2 * time.Second

require.NotNil(t, hook)

require.NoError(t, hook.Prerun())
6 changes: 3 additions & 3 deletions client/allocrunner/taskrunner/envoy_bootstrap_hook.go
@@ -18,10 +18,10 @@ import (
"github.com/hashicorp/nomad/client/taskenv"
agentconsul "github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/exptime"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/nomad/structs/config"
"github.com/pkg/errors"
"oss.indeed.com/go/libtime/decay"
)

const envoyBootstrapHookName = "envoy_bootstrap"
@@ -277,7 +277,7 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart

// Since Consul services are registered asynchronously with this task
// hook running, retry until timeout or success.
if backoffErr := exptime.Backoff(func() (bool, error) {
if backoffErr := decay.Backoff(func() (bool, error) {

// If hook is killed, just stop.
select {
@@ -324,7 +324,7 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart
_ = os.Remove(bootstrapFilePath)

return true, cmdErr
}, exptime.BackoffOptions{
}, decay.BackoffOptions{
MaxSleepTime: h.envoyBootstrapWaitTime,
InitialGapSize: h.envoyBoostrapInitialGap,
MaxJitterSize: h.envoyBootstrapMaxJitter,
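
The only functional change in this file is swapping Nomad's in-tree helper/exptime backoff helper for the upstream package it mirrors, oss.indeed.com/go/libtime/decay; the call shape stays the same. A rough usage sketch follows, inferred purely from the call site above: the meaning of the callback's boolean (treated here as "keep retrying") and of each BackoffOptions field is an assumption based on naming, not taken from the library's documentation.

```go
package main

import (
	"errors"
	"fmt"
	"time"

	"oss.indeed.com/go/libtime/decay"
)

func main() {
	attempts := 0

	// Mirrors the envoy_bootstrap hook's call shape. The boolean return is
	// assumed to mean "keep retrying", and the option fields are interpreted
	// from their names; both are assumptions, not documented behavior.
	err := decay.Backoff(func() (bool, error) {
		attempts++
		if attempts < 3 {
			// not ready yet: keep retrying, reporting why
			return true, errors.New("consul service not registered yet")
		}
		// success: stop retrying
		return false, nil
	}, decay.BackoffOptions{
		MaxSleepTime:   10 * time.Second,       // assumed: overall retry budget
		InitialGapSize: 100 * time.Millisecond, // assumed: first wait between attempts
		MaxJitterSize:  25 * time.Millisecond,  // assumed: random jitter added per attempt
	})

	fmt.Println(attempts, err)
}
```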
25 changes: 20 additions & 5 deletions client/lib/cgutil/cpuset_manager_linux_test.go
@@ -57,27 +57,31 @@ func TestCpusetManager_Init(t *testing.T) {
require.DirExists(t, filepath.Join(manager.cgroupParentPath, ReservedCpusetCgroupName))
}

func TestCpusetManager_AddAlloc(t *testing.T) {
func TestCpusetManager_AddAlloc_single(t *testing.T) {
manager, cleanup := tmpCpusetManager(t)
defer cleanup()
require.NoError(t, manager.Init())

alloc := mock.Alloc()
alloc.AllocatedResources.Tasks["web"].Cpu.ReservedCores = manager.parentCpuset.ToSlice()
// reserve just one core (the 0th core, which probably exists)
alloc.AllocatedResources.Tasks["web"].Cpu.ReservedCores = cpuset.New(0).ToSlice()
manager.AddAlloc(alloc)

// force reconcile
manager.reconcileCpusets()

// check that no more cores exist in the shared cgroup
// check that the 0th core is no longer available in the shared group
// actual contents of shared group depends on machine core count
require.DirExists(t, filepath.Join(manager.cgroupParentPath, SharedCpusetCgroupName))
require.FileExists(t, filepath.Join(manager.cgroupParentPath, SharedCpusetCgroupName, "cpuset.cpus"))
sharedCpusRaw, err := ioutil.ReadFile(filepath.Join(manager.cgroupParentPath, SharedCpusetCgroupName, "cpuset.cpus"))
require.NoError(t, err)
sharedCpus, err := cpuset.Parse(string(sharedCpusRaw))
require.NoError(t, err)
require.Empty(t, sharedCpus.ToSlice())
require.NotEmpty(t, sharedCpus.ToSlice())
require.NotContains(t, sharedCpus.ToSlice(), uint16(0))

// check that all cores are allocated to reserved cgroup
// check that the 0th core is allocated to reserved cgroup
require.DirExists(t, filepath.Join(manager.cgroupParentPath, ReservedCpusetCgroupName))
reservedCpusRaw, err := ioutil.ReadFile(filepath.Join(manager.cgroupParentPath, ReservedCpusetCgroupName, "cpuset.cpus"))
require.NoError(t, err)
@@ -100,6 +104,17 @@ func TestCpusetManager_AddAlloc(t *testing.T) {
require.Exactly(t, alloc.AllocatedResources.Tasks["web"].Cpu.ReservedCores, taskCpus.ToSlice())
}

func TestCpusetManager_AddAlloc_subset(t *testing.T) {
t.Skip("todo: add test for #11933")
}

func TestCpusetManager_AddAlloc_all(t *testing.T) {
// cgroupsv2 changes behavior of writing empty cpuset.cpu, which is what
// happens to the /shared group when one or more allocs consume all available
// cores.
t.Skip("todo: add test for #11933")
}

func TestCpusetManager_RemoveAlloc(t *testing.T) {
manager, cleanup := tmpCpusetManager(t)
defer cleanup()
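
The reworked test reserves only core 0 and then checks the shared and reserved cgroups by reading their cpuset.cpus files and parsing them with cpuset.Parse. For context, cpuset.cpus uses the kernel's list format: comma-separated cores and dash-separated ranges such as "0-3,6". The stdlib-only parser below is a hypothetical illustration of that format, not the cpuset package the test actually uses.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseCpusetList is a hypothetical helper illustrating the kernel's cpuset
// list format read from cpuset.cpus, e.g. "0-3,6" -> [0 1 2 3 6]. It is not
// the parser used by Nomad's client/lib/cpuset package.
func parseCpusetList(s string) ([]int, error) {
	s = strings.TrimSpace(s)
	if s == "" {
		return nil, nil // an empty file means no CPUs in the set
	}
	var cores []int
	for _, part := range strings.Split(s, ",") {
		if lo, hi, ok := strings.Cut(part, "-"); ok {
			start, err := strconv.Atoi(lo)
			if err != nil {
				return nil, err
			}
			end, err := strconv.Atoi(hi)
			if err != nil {
				return nil, err
			}
			for c := start; c <= end; c++ {
				cores = append(cores, c)
			}
		} else {
			c, err := strconv.Atoi(part)
			if err != nil {
				return nil, err
			}
			cores = append(cores, c)
		}
	}
	return cores, nil
}

func main() {
	cores, err := parseCpusetList("0-3,6")
	fmt.Println(cores, err) // [0 1 2 3 6] <nil>
}
```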