Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing unfeasible scheduling keys bug (#182) #3845

Merged
merged 2 commits into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions internal/scheduler/constraints/constraints.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,22 @@ const (
GangExceedsGlobalBurstSizeUnschedulableReason = "gang cardinality too large: exceeds global max burst size"
GangExceedsQueueBurstSizeUnschedulableReason = "gang cardinality too large: exceeds queue max burst size"

// Indicates that jobs cannot be scheduled due to the current executor state.
GangDoesNotFitUnschedulableReason = "unable to schedule gang since minimum cardinality not met"
JobDoesNotFitUnschedulableReason = "job does not fit on any node"

UnschedulableReasonMaximumResourcesExceeded = "resource limit exceeded"
)

// UnschedulableReasonIsPropertyOfGang reports whether reason is one of the
// unschedulable reasons treated here as a property of the gang itself:
// the gang exceeding the global max burst size, a job not fitting on any
// node, or a gang not meeting its minimum cardinality.
// Any other reason yields false.
func UnschedulableReasonIsPropertyOfGang(reason string) bool {
	switch reason {
	case GangExceedsGlobalBurstSizeUnschedulableReason,
		JobDoesNotFitUnschedulableReason,
		GangDoesNotFitUnschedulableReason:
		return true
	default:
		return false
	}
}

// IsTerminalUnschedulableReason returns true if reason indicates
// it's not possible to schedule any more jobs in this round.
//
// Exactly two reasons are terminal: MaximumResourcesScheduledUnschedulableReason
// and GlobalRateLimitExceededUnschedulableReason. Every other reason,
// including the empty string, returns false.
func IsTerminalUnschedulableReason(reason string) bool {
	// Single boolean expression; the earlier if/return chain left a second,
	// unreachable return statement behind.
	return reason == MaximumResourcesScheduledUnschedulableReason ||
		reason == GlobalRateLimitExceededUnschedulableReason
}

// IsTerminalQueueUnschedulableReason returns true if reason indicates
Expand Down
10 changes: 6 additions & 4 deletions internal/scheduler/gang_scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,13 @@ func (sch *GangScheduler) updateGangSchedulingContextOnFailure(gctx *schedulerco
return err
}

// Register unfeasible scheduling keys.
globallyUnschedulable := schedulerconstraints.UnschedulableReasonIsPropertyOfGang(unschedulableReason)

// Register globally unfeasible scheduling keys.
//
// Only record unfeasible scheduling keys for single-job gangs,
// since a gang may be unschedulable even if all its members are individually schedulable.
if !sch.skipUnsuccessfulSchedulingKeyCheck && gctx.Cardinality() == 1 {
if !sch.skipUnsuccessfulSchedulingKeyCheck && gctx.Cardinality() == 1 && globallyUnschedulable {
jctx := gctx.JobSchedulingContexts[0]
schedulingKey, ok := jctx.SchedulingKey()
if ok && schedulingKey != schedulerobjects.EmptySchedulingKey {
Expand Down Expand Up @@ -233,9 +235,9 @@ func (sch *GangScheduler) tryScheduleGangWithTxn(_ *armadacontext.Context, txn *
if ok, err = sch.nodeDb.ScheduleManyWithTxn(txn, gctx); err == nil {
if !ok {
if gctx.Cardinality() > 1 {
unschedulableReason = "unable to schedule gang since minimum cardinality not met"
unschedulableReason = schedulerconstraints.GangDoesNotFitUnschedulableReason
} else {
unschedulableReason = "job does not fit on any node"
unschedulableReason = schedulerconstraints.JobDoesNotFitUnschedulableReason
}
}
return
Expand Down
Loading