Skip to content

Commit

Permalink
Fixing unfeasible scheduling keys bug (#182)
Browse files Browse the repository at this point in the history
* Fixing unfeasible scheduling keys bug

* Renaming test parameter to be more consistent with other parameters

* Renaming constraints
  • Loading branch information
mustafai-gr authored and GitHub Enterprise committed Jul 29, 2024
1 parent 89bd9e8 commit 778e936
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 73 deletions.
17 changes: 10 additions & 7 deletions internal/scheduler/constraints/constraints.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,22 @@ const (
GangExceedsGlobalBurstSizeUnschedulableReason = "gang cardinality too large: exceeds global max burst size"
GangExceedsQueueBurstSizeUnschedulableReason = "gang cardinality too large: exceeds queue max burst size"

// Indicates that jobs cannot be scheduled due to the current executor state.
GangDoesNotFitUnschedulableReason = "unable to schedule gang since minimum cardinality not met"
JobDoesNotFitUnschedulableReason = "job does not fit on any node"

UnschedulableReasonMaximumResourcesExceeded = "resource limit exceeded"
)

// UnschedulableReasonIsPropertyOfGang reports whether reason is one of the
// unschedulable reasons this package attributes to the gang itself: the gang
// exceeding the global max burst size, a single job not fitting on any node,
// or a gang whose minimum cardinality could not be met.
func UnschedulableReasonIsPropertyOfGang(reason string) bool {
	switch reason {
	case GangExceedsGlobalBurstSizeUnschedulableReason,
		JobDoesNotFitUnschedulableReason,
		GangDoesNotFitUnschedulableReason:
		return true
	default:
		// Every other reason (including the empty string) is not matched.
		return false
	}
}

// IsTerminalUnschedulableReason returns true if reason indicates
// it's not possible to schedule any more jobs in this round.
//
// A reason is terminal when it equals either
// MaximumResourcesScheduledUnschedulableReason or
// GlobalRateLimitExceededUnschedulableReason; any other value yields false.
func IsTerminalUnschedulableReason(reason string) bool {
	return reason == MaximumResourcesScheduledUnschedulableReason ||
		reason == GlobalRateLimitExceededUnschedulableReason
}

// IsTerminalQueueUnschedulableReason returns true if reason indicates
Expand Down
10 changes: 6 additions & 4 deletions internal/scheduler/gang_scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,13 @@ func (sch *GangScheduler) updateGangSchedulingContextOnFailure(gctx *schedulerco
return err
}

// Register unfeasible scheduling keys.
globallyUnschedulable := schedulerconstraints.UnschedulableReasonIsPropertyOfGang(unschedulableReason)

// Register globally unfeasible scheduling keys.
//
// Only record unfeasible scheduling keys for single-job gangs,
// since a gang may be unschedulable even if all its members are individually schedulable.
if !sch.skipUnsuccessfulSchedulingKeyCheck && gctx.Cardinality() == 1 {
if !sch.skipUnsuccessfulSchedulingKeyCheck && gctx.Cardinality() == 1 && globallyUnschedulable {
jctx := gctx.JobSchedulingContexts[0]
schedulingKey, ok := jctx.SchedulingKey()
if ok && schedulingKey != schedulerobjects.EmptySchedulingKey {
Expand Down Expand Up @@ -233,9 +235,9 @@ func (sch *GangScheduler) tryScheduleGangWithTxn(_ *armadacontext.Context, txn *
if ok, err = sch.nodeDb.ScheduleManyWithTxn(txn, gctx); err == nil {
if !ok {
if gctx.Cardinality() > 1 {
unschedulableReason = "unable to schedule gang since minimum cardinality not met"
unschedulableReason = schedulerconstraints.GangDoesNotFitUnschedulableReason
} else {
unschedulableReason = "job does not fit on any node"
unschedulableReason = schedulerconstraints.JobDoesNotFitUnschedulableReason
}
}
return
Expand Down
Loading

0 comments on commit 778e936

Please sign in to comment.