Skip to content

Commit

Permalink
csi: make feasibility check errors more understandable
Browse files Browse the repository at this point in the history
When the feasibility checker finds we have no free write claims, it
checks to see if any of those claims are for the job we're currently
scheduling (so that earlier versions of a job can't block claims for
new versions) and reports a conflict if the volume can't be scheduled
so that the user can fix their claims. But when the checker hits a
claim whose allocation has already been garbage collected (GC'd), the
state is recoverable by the server once claim reaping completes, and no
user intervention is required; the blocked eval should complete on its
own. Differentiate the scheduler errors produced by these two conditions.
  • Loading branch information
tgross committed Jan 24, 2022
1 parent 6353ea7 commit e5057ab
Showing 1 changed file with 25 additions and 14 deletions.
39 changes: 25 additions & 14 deletions scheduler/feasible.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,19 @@ import (
)

const (
FilterConstraintHostVolumes = "missing compatible host volumes"
FilterConstraintCSIPluginTemplate = "CSI plugin %s is missing from client %s"
FilterConstraintCSIPluginUnhealthyTemplate = "CSI plugin %s is unhealthy on client %s"
FilterConstraintCSIPluginMaxVolumesTemplate = "CSI plugin %s has the maximum number of volumes on client %s"
FilterConstraintCSIVolumesLookupFailed = "CSI volume lookup failed"
FilterConstraintCSIVolumeNotFoundTemplate = "missing CSI Volume %s"
FilterConstraintCSIVolumeNoReadTemplate = "CSI volume %s is unschedulable or has exhausted its available reader claims"
FilterConstraintCSIVolumeNoWriteTemplate = "CSI volume %s is unschedulable or is read-only"
FilterConstraintCSIVolumeInUseTemplate = "CSI volume %s has exhausted its available writer claims" //
FilterConstraintDrivers = "missing drivers"
FilterConstraintDevices = "missing devices"
FilterConstraintHostVolumes = "missing compatible host volumes"
FilterConstraintCSIPluginTemplate = "CSI plugin %s is missing from client %s"
FilterConstraintCSIPluginUnhealthyTemplate = "CSI plugin %s is unhealthy on client %s"
FilterConstraintCSIPluginMaxVolumesTemplate = "CSI plugin %s has the maximum number of volumes on client %s"
FilterConstraintCSIVolumesLookupFailed = "CSI volume lookup failed"
FilterConstraintCSIVolumeNotFoundTemplate = "missing CSI Volume %s"
FilterConstraintCSIVolumeNoReadTemplate = "CSI volume %s is unschedulable or has exhausted its available reader claims"
FilterConstraintCSIVolumeNoWriteTemplate = "CSI volume %s is unschedulable or is read-only"
FilterConstraintCSIVolumeInUseTemplate = "CSI volume %s has exhausted its available writer claims"
FilterConstraintCSIVolumeGCdAllocationTemplate = "CSI volume %s is claimed by a garbage collected allocation %s"

FilterConstraintDrivers = "missing drivers"
FilterConstraintDevices = "missing devices"
)

var (
Expand Down Expand Up @@ -320,11 +322,20 @@ func (c *CSIVolumeChecker) isFeasible(n *structs.Node) (bool, string) {
return false, fmt.Sprintf(FilterConstraintCSIVolumeNoWriteTemplate, vol.ID)
}
if !vol.WriteFreeClaims() {
// Check the blocking allocations to see if they belong to this job
for id := range vol.WriteAllocs {
a, err := c.ctx.State().AllocByID(ws, id)
if err != nil || a == nil ||
a.Namespace != c.namespace || a.JobID != c.jobID {
// the alloc for this blocking claim has been
// garbage collected but the volumewatcher hasn't
// finished releasing the claim (and possibly
// detaching the volume), so we need to block
// until it can be scheduled
if err != nil || a == nil {
return false, fmt.Sprintf(
FilterConstraintCSIVolumeGCdAllocationTemplate, vol.ID, id)
} else if a.Namespace != c.namespace || a.JobID != c.jobID {
// the blocking claim is for another live job
// so it's legitimately blocking more write
// claims
return false, fmt.Sprintf(
FilterConstraintCSIVolumeInUseTemplate, vol.ID)
}
Expand Down

0 comments on commit e5057ab

Please sign in to comment.