Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scheduler: Fix bug where the would treat multiregion jobs as paused for job types that don't use deployments #14659

Merged
merged 4 commits into from
Sep 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/14659.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
scheduler: Fixed bug where the scheduler would treat multiregion jobs as paused for job types that don't use deployments
```
11 changes: 11 additions & 0 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -5746,6 +5746,17 @@ func (j *Job) GetScalingPolicies() []*ScalingPolicy {
return ret
}

// UsesDeployments returns a boolean indicating whether the job configuration
// results in a deployment during scheduling.
func (j *Job) UsesDeployments() bool {
switch j.Type {
case JobTypeService:
return true
default:
return false
}
Comment on lines +5752 to +5757
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this not be simplified?

Suggested change
switch j.Type {
case JobTypeService:
return true
default:
return false
}
return j.Type == JobTypeService

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It could, but I intentionally did this since a goal is to support Deployments for more job types in the future. Maybe this is premature? I was hoping the team would discuss so thanks for taking the bait!

}

// ScalingPolicyListStub is used to return a subset of scaling policy information
// for the scaling policy list
type ScalingPolicyListStub struct {
Expand Down
18 changes: 14 additions & 4 deletions scheduler/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,17 +265,27 @@ func (a *allocReconciler) computeDeploymentUpdates(deploymentComplete bool) {
}
}

// computeDeploymentPaused is responsible for setting flags on the
// allocReconciler that indicate the state of the deployment if one
// is required. The flags that are managed are:
// 1. deploymentFailed: Did the current deployment fail just as named.
// 2. deploymentPaused: Multiregion job types that use deployments run
// the deployments later during the fan-out stage. When the deployment
// is created it will be in a pending state. If an invariant violation
// is detected by the deploymentWatcher during it will enter a paused
// state. This flag tells Compute we're paused or pending, so we should
// not make placements on the deployment.
func (a *allocReconciler) computeDeploymentPaused() {
if a.deployment != nil {
a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused ||
a.deployment.Status == structs.DeploymentStatusPending
a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
}
if a.deployment == nil {
// When we create the deployment later, it will be in a pending
// state. But we also need to tell Compute we're paused, otherwise we
// make placements on the paused deployment.
if a.job.IsMultiregion() && !(a.job.IsPeriodic() || a.job.IsParameterized()) {
if a.job.IsMultiregion() &&
a.job.UsesDeployments() &&
!(a.job.IsPeriodic() || a.job.IsParameterized()) {

a.deploymentPaused = true
}
}
Expand Down
106 changes: 106 additions & 0 deletions scheduler/reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5995,3 +5995,109 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) {
})
}
}

// Tests the reconciler properly handles the logic for computeDeploymentPaused
// for various job types.
func TestReconciler_ComputeDeploymentPaused(t *testing.T) {
ci.Parallel(t)

type testCase struct {
name string
jobType string
isMultiregion bool
isPeriodic bool
isParameterized bool
expected bool
}

multiregionCfg := mock.MultiregionJob().Multiregion
periodicCfg := mock.PeriodicJob().Periodic
parameterizedCfg := &structs.ParameterizedJobConfig{
Payload: structs.DispatchPayloadRequired,
}

testCases := []testCase{
{
name: "single region service is not paused",
jobType: structs.JobTypeService,
isMultiregion: false,
isPeriodic: false,
isParameterized: false,
expected: false,
},
{
name: "multiregion service is paused",
jobType: structs.JobTypeService,
isMultiregion: true,
isPeriodic: false,
isParameterized: false,
expected: true,
},
{
name: "multiregion periodic service is not paused",
jobType: structs.JobTypeService,
isMultiregion: true,
isPeriodic: true,
isParameterized: false,
expected: false,
},
{
name: "multiregion parameterized service is not paused",
jobType: structs.JobTypeService,
isMultiregion: true,
isPeriodic: false,
isParameterized: true,
expected: false,
},
{
name: "single region batch job is not paused",
jobType: structs.JobTypeBatch,
isMultiregion: false,
isPeriodic: false,
isParameterized: false,
expected: false,
},
{
name: "multiregion batch job is not paused",
jobType: structs.JobTypeBatch,
isMultiregion: false,
isPeriodic: false,
isParameterized: false,
expected: false,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
var job *structs.Job

if tc.jobType == structs.JobTypeService {
job = mock.Job()
} else if tc.jobType == structs.JobTypeBatch {
job = mock.BatchJob()
}

require.NotNil(t, job, "invalid job type", tc.jobType)

if tc.isMultiregion {
job.Multiregion = multiregionCfg
}

if tc.isPeriodic {
job.Periodic = periodicCfg
}

if tc.isParameterized {
job.ParameterizedJob = parameterizedCfg
}

reconciler := NewAllocReconciler(
testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
nil, nil, nil, "", job.Priority, true)

_ = reconciler.Compute()

require.Equal(t, tc.expected, reconciler.deploymentPaused)
})
}
}