CSI: enforce usage at claim time (#12112)
* Remove the redundant schedulable check in `HasFreeWriteClaims`. If a volume
  has been created but not yet claimed, its capabilities will be checked
  in `WriteSchedulable` at both scheduling time and claim time. We don't
  need to also check them in the `HasFreeWriteClaims` method.

* Enforce maximum volume claims for writers.

  When the scheduler checks feasibility for CSI volumes, the check is
  fairly loose: earlier versions of the same job are not counted as
  active claims. This allows the scheduler to place new allocations
  for the new version of a job, under the assumption that we'll replace
  the existing allocations and their volume claims.

  But when the alloc runner claims the volume, we need to enforce the
  active claims even if they belong to allocations of an earlier version
  of the job. Otherwise we'll try to mount a volume that's currently
  being unmounted, and replacement allocations will frequently fail. (A
  simplified sketch of this claim-time check appears after the list of
  changed files below.)

* Enforce the single-node reader check for read-only volumes. Previously,
  when the alloc runner made a claim for a read-only volume, we only
  checked that the volume was potentially schedulable, not whether it
  actually had any free read claims.
tgross committed Feb 24, 2022
1 parent 0ae76b1 commit 6b6b827
Showing 3 changed files with 39 additions and 26 deletions.
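
The core of the fix is the claim-time write check: if a single-writer volume already has an active write claim, a new claim is rejected with `ErrCSIVolumeMaxClaims`, even when the existing claim belongs to an allocation from an earlier version of the same job. Below is a minimal, self-contained sketch of that check; the names mirror the diff that follows, but the types are simplified stand-ins rather than Nomad's actual structs.

```go
package main

import (
	"errors"
	"fmt"
)

// Simplified stand-ins for the access modes in nomad/structs/csi.go.
type accessMode string

const (
	singleNodeWriter      accessMode = "single-node-writer"
	multiNodeSingleWriter accessMode = "multi-node-single-writer"
	multiNodeMultiWriter  accessMode = "multi-node-multi-writer"
	modeUnknown           accessMode = ""
)

var errMaxClaims = errors.New("volume max claims reached")

type volume struct {
	accessMode  accessMode
	writeClaims map[string]struct{} // keyed by allocation ID
}

// hasFreeWriteClaims mirrors the claim-time check: single-writer modes
// allow at most one active write claim, even if the existing claim
// belongs to an allocation from an earlier version of the same job.
func (v *volume) hasFreeWriteClaims() bool {
	switch v.accessMode {
	case singleNodeWriter, multiNodeSingleWriter:
		return len(v.writeClaims) == 0
	case multiNodeMultiWriter, modeUnknown:
		// Multi-writer modes have no claim limit; an unknown mode means
		// the volume has never been claimed, so the claim can proceed.
		return true
	default:
		// Reader-only modes never have free write claims.
		return false
	}
}

func (v *volume) claimWrite(allocID string) error {
	if !v.hasFreeWriteClaims() {
		return errMaxClaims
	}
	v.writeClaims[allocID] = struct{}{}
	return nil
}

func main() {
	vol := &volume{
		accessMode:  multiNodeSingleWriter,
		writeClaims: map[string]struct{}{"old-alloc": {}},
	}
	// The replacement allocation must wait until the old claim is released.
	fmt.Println(vol.claimWrite("new-alloc")) // volume max claims reached
}
```

Multi-writer modes return true unconditionally because the CSI spec sets no claim limit for them; in the real code, capacity pressure for those modes is tracked separately via `ResourceExhausted` in `WriteSchedulable`.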
3 changes: 3 additions & 0 deletions .changelog/12112.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+csi: Fixed a bug where the maximum number of volume claims was incorrectly enforced when an allocation claims a volume
+```
8 changes: 5 additions & 3 deletions nomad/csi_endpoint_test.go
@@ -243,9 +243,11 @@ func TestCSIVolumeEndpoint_Claim(t *testing.T) {
     // Create an initial volume claim request; we expect it to fail
     // because there's no such volume yet.
     claimReq := &structs.CSIVolumeClaimRequest{
-        VolumeID:     id0,
-        AllocationID: alloc.ID,
-        Claim:        structs.CSIVolumeClaimWrite,
+        VolumeID:       id0,
+        AllocationID:   alloc.ID,
+        Claim:          structs.CSIVolumeClaimWrite,
+        AccessMode:     structs.CSIVolumeAccessModeMultiNodeSingleWriter,
+        AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
         WriteRequest: structs.WriteRequest{
             Region:    "global",
             Namespace: structs.DefaultNamespace,
54 changes: 31 additions & 23 deletions nomad/structs/csi.go
@@ -411,6 +411,26 @@ func (v *CSIVolume) WriteSchedulable() bool {
     return false
 }
 
+// HasFreeReadClaims determines if there are any free read claims available
+func (v *CSIVolume) HasFreeReadClaims() bool {
+    switch v.AccessMode {
+    case CSIVolumeAccessModeSingleNodeReader:
+        return len(v.ReadClaims) == 0
+    case CSIVolumeAccessModeSingleNodeWriter:
+        return len(v.ReadClaims) == 0 && len(v.WriteClaims) == 0
+    case CSIVolumeAccessModeUnknown:
+        // This volume was created but not yet claimed, so its
+        // capabilities have been checked in ReadSchedulable
+        return true
+    default:
+        // For multi-node AccessModes, the CSI spec doesn't allow for
+        // setting a max number of readers; we track node resource
+        // exhaustion through v.ResourceExhausted, which is checked in
+        // ReadSchedulable
+        return true
+    }
+}
+
 // HasFreeWriteClaims determines if there are any free write claims available
 func (v *CSIVolume) HasFreeWriteClaims() bool {
     switch v.AccessMode {
@@ -422,24 +442,13 @@ func (v *CSIVolume) HasFreeWriteClaims() bool {
         // which is checked in WriteSchedulable
         return true
     case CSIVolumeAccessModeUnknown:
-        // this volume was created but not yet claimed, so we check what it's
-        // capable of, not what it's been assigned
-        if len(v.RequestedCapabilities) == 0 {
-            // COMPAT: a volume that was registered before 1.1.0 and has not
-            // had a change in claims could have no requested caps. It will
-            // get corrected on the first claim.
-            return true
-        }
-        for _, cap := range v.RequestedCapabilities {
-            switch cap.AccessMode {
-            case CSIVolumeAccessModeSingleNodeWriter, CSIVolumeAccessModeMultiNodeSingleWriter:
-                return len(v.WriteClaims) == 0
-            case CSIVolumeAccessModeMultiNodeMultiWriter:
-                return true
-            }
-        }
+        // This volume was created but not yet claimed, so its
+        // capabilities have been checked in WriteSchedulable
+        return true
+    default:
+        // Reader modes never have free write claims
+        return false
     }
-    return false
 }
 
 // InUse tests whether any allocations are actively using the volume
@@ -538,6 +547,10 @@ func (v *CSIVolume) claimRead(claim *CSIVolumeClaim, alloc *Allocation) error {
         return ErrCSIVolumeUnschedulable
     }
 
+    if !v.HasFreeReadClaims() {
+        return ErrCSIVolumeMaxClaims
+    }
+
     // Allocations are copy on write, so we want to keep the id but don't need the
     // pointer. We'll get it from the db in denormalize.
     v.ReadAllocs[claim.AllocationID] = nil
@@ -565,12 +578,7 @@ func (v *CSIVolume) claimWrite(claim *CSIVolumeClaim, alloc *Allocation) error {
     }
 
     if !v.HasFreeWriteClaims() {
-        // Check the blocking allocations to see if they belong to this job
-        for _, a := range v.WriteAllocs {
-            if a != nil && (a.Namespace != alloc.Namespace || a.JobID != alloc.JobID) {
-                return ErrCSIVolumeMaxClaims
-            }
-        }
+        return ErrCSIVolumeMaxClaims
     }
 
     // Allocations are copy on write, so we want to keep the id but don't need the
     // pointer. We'll get it from the db in denormalize.
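
The read path gets the same treatment through the new `HasFreeReadClaims` check: a single-node volume with an existing claim now rejects further read claims instead of relying on schedulability alone. Here is a matching self-contained sketch; again the names mirror the diff above, but the types are simplified stand-ins rather than Nomad's real structs.

```go
package main

import (
	"errors"
	"fmt"
)

// Simplified stand-ins for the access modes and volume struct in
// nomad/structs/csi.go.
type accessMode string

const (
	singleNodeReader accessMode = "single-node-reader-only"
	singleNodeWriter accessMode = "single-node-writer"
	modeUnknown      accessMode = ""
)

var errMaxClaims = errors.New("volume max claims reached")

type volume struct {
	accessMode  accessMode
	readClaims  map[string]struct{} // keyed by allocation ID
	writeClaims map[string]struct{}
}

// hasFreeReadClaims mirrors the new check: single-node modes allow at most
// one active claim on the volume, while multi-node modes (the default
// branch) place no limit on the number of readers.
func (v *volume) hasFreeReadClaims() bool {
	switch v.accessMode {
	case singleNodeReader:
		return len(v.readClaims) == 0
	case singleNodeWriter:
		return len(v.readClaims) == 0 && len(v.writeClaims) == 0
	case modeUnknown:
		// Never claimed yet; capabilities were already vetted at
		// scheduling time.
		return true
	default:
		return true
	}
}

func (v *volume) claimRead(allocID string) error {
	if !v.hasFreeReadClaims() {
		return errMaxClaims
	}
	if v.readClaims == nil {
		v.readClaims = map[string]struct{}{}
	}
	v.readClaims[allocID] = struct{}{}
	return nil
}

func main() {
	vol := &volume{
		accessMode: singleNodeReader,
		readClaims: map[string]struct{}{"old-alloc": {}},
	}
	fmt.Println(vol.claimRead("new-alloc")) // volume max claims reached

	// Once the old allocation's claim is released, the replacement succeeds.
	delete(vol.readClaims, "old-alloc")
	fmt.Println(vol.claimRead("new-alloc")) // <nil>
}
```

The `delete` call stands in for Nomad releasing the old allocation's claim; until that release happens, the replacement allocation's claim is rejected rather than attempting to mount a volume that is still being unmounted.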
