diff --git a/.changelog/14659.txt b/.changelog/14659.txt new file mode 100644 index 000000000000..a8de1aba1fb0 --- /dev/null +++ b/.changelog/14659.txt @@ -0,0 +1,3 @@ +```release-note:bug +scheduler: Fixed bug where the scheduler would treat multiregion jobs as paused for job types that don't use deployments +``` diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 819b378e7fb4..4fb61a825275 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -5746,6 +5746,17 @@ func (j *Job) GetScalingPolicies() []*ScalingPolicy { return ret } +// UsesDeployments returns a boolean indicating whether the job configuration +// results in a deployment during scheduling. +func (j *Job) UsesDeployments() bool { + switch j.Type { + case JobTypeService: + return true + default: + return false + } +} + // ScalingPolicyListStub is used to return a subset of scaling policy information // for the scaling policy list type ScalingPolicyListStub struct { diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index 4266597e4ec9..50a819cf806e 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -265,6 +265,16 @@ func (a *allocReconciler) computeDeploymentUpdates(deploymentComplete bool) { } } +// computeDeploymentPaused is responsible for setting flags on the +// allocReconciler that indicate the state of the deployment if one +// is required. The flags that are managed are: +// 1. deploymentFailed: Did the current deployment fail just as named. +// 2. deploymentPaused: Multiregion job types that use deployments run +// the deployments later during the fan-out stage. When the deployment +// is created it will be in a pending state. If an invariant violation +// is detected by the deploymentWatcher during it will enter a paused +// state. This flag tells Compute we're paused or pending, so we should +// not make placements on the deployment. func (a *allocReconciler) computeDeploymentPaused() { if a.deployment != nil { a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused || @@ -272,10 +282,10 @@ func (a *allocReconciler) computeDeploymentPaused() { a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed } if a.deployment == nil { - // When we create the deployment later, it will be in a pending - // state. But we also need to tell Compute we're paused, otherwise we - // make placements on the paused deployment. - if a.job.IsMultiregion() && !(a.job.IsPeriodic() || a.job.IsParameterized()) { + if a.job.IsMultiregion() && + a.job.UsesDeployments() && + !(a.job.IsPeriodic() || a.job.IsParameterized()) { + a.deploymentPaused = true } } diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index cb986ded62be..de72faff0bfc 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -5995,3 +5995,109 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { }) } } + +// Tests the reconciler properly handles the logic for computeDeploymentPaused +// for various job types. +func TestReconciler_ComputeDeploymentPaused(t *testing.T) { + ci.Parallel(t) + + type testCase struct { + name string + jobType string + isMultiregion bool + isPeriodic bool + isParameterized bool + expected bool + } + + multiregionCfg := mock.MultiregionJob().Multiregion + periodicCfg := mock.PeriodicJob().Periodic + parameterizedCfg := &structs.ParameterizedJobConfig{ + Payload: structs.DispatchPayloadRequired, + } + + testCases := []testCase{ + { + name: "single region service is not paused", + jobType: structs.JobTypeService, + isMultiregion: false, + isPeriodic: false, + isParameterized: false, + expected: false, + }, + { + name: "multiregion service is paused", + jobType: structs.JobTypeService, + isMultiregion: true, + isPeriodic: false, + isParameterized: false, + expected: true, + }, + { + name: "multiregion periodic service is not paused", + jobType: structs.JobTypeService, + isMultiregion: true, + isPeriodic: true, + isParameterized: false, + expected: false, + }, + { + name: "multiregion parameterized service is not paused", + jobType: structs.JobTypeService, + isMultiregion: true, + isPeriodic: false, + isParameterized: true, + expected: false, + }, + { + name: "single region batch job is not paused", + jobType: structs.JobTypeBatch, + isMultiregion: false, + isPeriodic: false, + isParameterized: false, + expected: false, + }, + { + name: "multiregion batch job is not paused", + jobType: structs.JobTypeBatch, + isMultiregion: false, + isPeriodic: false, + isParameterized: false, + expected: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var job *structs.Job + + if tc.jobType == structs.JobTypeService { + job = mock.Job() + } else if tc.jobType == structs.JobTypeBatch { + job = mock.BatchJob() + } + + require.NotNil(t, job, "invalid job type", tc.jobType) + + if tc.isMultiregion { + job.Multiregion = multiregionCfg + } + + if tc.isPeriodic { + job.Periodic = periodicCfg + } + + if tc.isParameterized { + job.ParameterizedJob = parameterizedCfg + } + + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, + nil, nil, nil, "", job.Priority, true) + + _ = reconciler.Compute() + + require.Equal(t, tc.expected, reconciler.deploymentPaused) + }) + } +}