From 3aedd9b95cc2cfd336827ec630546aaad6f3d605 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 23 Feb 2022 14:59:43 -0500 Subject: [PATCH 1/4] csi: remove redundant schedulable check in `FreeWriteClaims` If a volume has been created but not yet claimed, its capabilities will be checked in `WriteSchedulable` at both scheduling time and claim time. We don't need to also check them in the `FreeWriteClaims` method. --- nomad/structs/csi.go | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/nomad/structs/csi.go b/nomad/structs/csi.go index 7da1877b856e..c155d1de4341 100644 --- a/nomad/structs/csi.go +++ b/nomad/structs/csi.go @@ -422,24 +422,13 @@ func (v *CSIVolume) HasFreeWriteClaims() bool { // which is checked in WriteSchedulable return true case CSIVolumeAccessModeUnknown: - // this volume was created but not yet claimed, so we check what it's - // capable of, not what it's been assigned - if len(v.RequestedCapabilities) == 0 { - // COMPAT: a volume that was registered before 1.1.0 and has not - // had a change in claims could have no requested caps. It will - // get corrected on the first claim. - return true - } - for _, cap := range v.RequestedCapabilities { - switch cap.AccessMode { - case CSIVolumeAccessModeSingleNodeWriter, CSIVolumeAccessModeMultiNodeSingleWriter: - return len(v.WriteClaims) == 0 - case CSIVolumeAccessModeMultiNodeMultiWriter: - return true - } - } + // This volume was created but not yet claimed, so its + // capabilities have been checked in WriteSchedulable + return true + default: + // Reader modes never have free write claims + return false } - return false } // InUse tests whether any allocations are actively using the volume From 31287103aec4286897bc45c1f94e413698b80033 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 23 Feb 2022 15:01:38 -0500 Subject: [PATCH 2/4] csi: running allocations for the same job should block claims When the scheduler checks feasibility for CSI volumes, the check is fairly loose: earlier versions of the same job are not counted as active claims. This allows the scheduler to place new allocations for the new version of a job, under the assumption that we'll replace the existing allocations and their volume claims. But when the alloc runner claims the volume, we need to enforce the active claims even if they're for allocations of an earlier version of the job. Otherwise we'll try to mount a volume that's currently being unmounted, and this will cause replacement allocations to frequently fail. This commit correctly enforces maximum volume claims for writers. --- nomad/csi_endpoint_test.go | 8 +++++--- nomad/structs/csi.go | 7 +------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/nomad/csi_endpoint_test.go b/nomad/csi_endpoint_test.go index 6e60688beb50..0d06a85910ae 100644 --- a/nomad/csi_endpoint_test.go +++ b/nomad/csi_endpoint_test.go @@ -243,9 +243,11 @@ func TestCSIVolumeEndpoint_Claim(t *testing.T) { // Create an initial volume claim request; we expect it to fail // because there's no such volume yet. claimReq := &structs.CSIVolumeClaimRequest{ - VolumeID: id0, - AllocationID: alloc.ID, - Claim: structs.CSIVolumeClaimWrite, + VolumeID: id0, + AllocationID: alloc.ID, + Claim: structs.CSIVolumeClaimWrite, + AccessMode: structs.CSIVolumeAccessModeMultiNodeSingleWriter, + AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem, WriteRequest: structs.WriteRequest{ Region: "global", Namespace: structs.DefaultNamespace, diff --git a/nomad/structs/csi.go b/nomad/structs/csi.go index c155d1de4341..c948499e04ba 100644 --- a/nomad/structs/csi.go +++ b/nomad/structs/csi.go @@ -554,12 +554,7 @@ func (v *CSIVolume) claimWrite(claim *CSIVolumeClaim, alloc *Allocation) error { } if !v.HasFreeWriteClaims() { - // Check the blocking allocations to see if they belong to this job - for _, a := range v.WriteAllocs { - if a != nil && (a.Namespace != alloc.Namespace || a.JobID != alloc.JobID) { - return ErrCSIVolumeMaxClaims - } - } + return ErrCSIVolumeMaxClaims } // Allocations are copy on write, so we want to keep the id but don't need the From 85c086e1ca24947c2856c486a5c1eb08c6fe2a92 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 23 Feb 2022 15:07:04 -0500 Subject: [PATCH 3/4] csi: enforce single-node reader check for read-only volumes When the alloc runner makes a claim for a read-only volume, we only check that the volume is potentially schedulable and not that it actually has free read claims. --- nomad/structs/csi.go | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/nomad/structs/csi.go b/nomad/structs/csi.go index c948499e04ba..9540d8445b26 100644 --- a/nomad/structs/csi.go +++ b/nomad/structs/csi.go @@ -411,6 +411,26 @@ func (v *CSIVolume) WriteSchedulable() bool { return false } +// HasFreeReadClaims determines if there are any free read claims available +func (v *CSIVolume) HasFreeReadClaims() bool { + switch v.AccessMode { + case CSIVolumeAccessModeSingleNodeReader: + return len(v.ReadClaims) == 0 + case CSIVolumeAccessModeSingleNodeWriter: + return len(v.ReadClaims) == 0 && len(v.WriteClaims) == 0 + case CSIVolumeAccessModeUnknown: + // This volume was created but not yet claimed, so its + // capabilities have been checked in ReadSchedulable + return true + default: + // For multi-node AccessModes, the CSI spec doesn't allow for + // setting a max number of readers we track node resource + // exhaustion through v.ResourceExhausted which is checked in + // ReadSchedulable + return true + } +} + // HasFreeWriteClaims determines if there are any free write claims available func (v *CSIVolume) HasFreeWriteClaims() bool { switch v.AccessMode { @@ -527,6 +547,10 @@ func (v *CSIVolume) claimRead(claim *CSIVolumeClaim, alloc *Allocation) error { return ErrCSIVolumeUnschedulable } + if !v.HasFreeReadClaims() { + return ErrCSIVolumeMaxClaims + } + // Allocations are copy on write, so we want to keep the id but don't need the // pointer. We'll get it from the db in denormalize. v.ReadAllocs[claim.AllocationID] = nil From 7200e88b46b602baa2f04269faa85161022b6be9 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 23 Feb 2022 15:16:23 -0500 Subject: [PATCH 4/4] changelog entry --- .changelog/12112.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/12112.txt diff --git a/.changelog/12112.txt b/.changelog/12112.txt new file mode 100644 index 000000000000..e88723f5c4bd --- /dev/null +++ b/.changelog/12112.txt @@ -0,0 +1,3 @@ +```release-note:bug +csi: Fixed a bug where the maximum number of volume claims was incorrectly enforced when an allocation claims a volume +```