From 85a4df5e5875752fd3439de206815ef255ead247 Mon Sep 17 00:00:00 2001 From: Derek Strickland <1111455+DerekStrickland@users.noreply.github.com> Date: Thu, 22 Sep 2022 14:31:27 -0400 Subject: [PATCH] scheduler: Fix bug where the would treat multiregion jobs as paused for job types that don't use deployments (#14659) * scheduler: Fix bug where the scheduler would treat multiregion jobs as paused for job types that don't use deployments Co-authored-by: Tim Gross Co-authored-by: Tim Gross --- .changelog/14659.txt | 3 + nomad/structs/structs.go | 11 ++++ scheduler/reconcile.go | 18 ++++-- scheduler/reconcile_test.go | 106 ++++++++++++++++++++++++++++++++++++ 4 files changed, 134 insertions(+), 4 deletions(-) create mode 100644 .changelog/14659.txt diff --git a/.changelog/14659.txt b/.changelog/14659.txt new file mode 100644 index 000000000000..a8de1aba1fb0 --- /dev/null +++ b/.changelog/14659.txt @@ -0,0 +1,3 @@ +```release-note:bug +scheduler: Fixed bug where the scheduler would treat multiregion jobs as paused for job types that don't use deployments +``` diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 54d7d934883e..cacec36d12ac 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -5747,6 +5747,17 @@ func (j *Job) GetScalingPolicies() []*ScalingPolicy { return ret } +// UsesDeployments returns a boolean indicating whether the job configuration +// results in a deployment during scheduling. +func (j *Job) UsesDeployments() bool { + switch j.Type { + case JobTypeService: + return true + default: + return false + } +} + // ScalingPolicyListStub is used to return a subset of scaling policy information // for the scaling policy list type ScalingPolicyListStub struct { diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index 4266597e4ec9..50a819cf806e 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -265,6 +265,16 @@ func (a *allocReconciler) computeDeploymentUpdates(deploymentComplete bool) { } } +// computeDeploymentPaused is responsible for setting flags on the +// allocReconciler that indicate the state of the deployment if one +// is required. The flags that are managed are: +// 1. deploymentFailed: Did the current deployment fail just as named. +// 2. deploymentPaused: Multiregion job types that use deployments run +// the deployments later during the fan-out stage. When the deployment +// is created it will be in a pending state. If an invariant violation +// is detected by the deploymentWatcher during it will enter a paused +// state. This flag tells Compute we're paused or pending, so we should +// not make placements on the deployment. func (a *allocReconciler) computeDeploymentPaused() { if a.deployment != nil { a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused || @@ -272,10 +282,10 @@ func (a *allocReconciler) computeDeploymentPaused() { a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed } if a.deployment == nil { - // When we create the deployment later, it will be in a pending - // state. But we also need to tell Compute we're paused, otherwise we - // make placements on the paused deployment. - if a.job.IsMultiregion() && !(a.job.IsPeriodic() || a.job.IsParameterized()) { + if a.job.IsMultiregion() && + a.job.UsesDeployments() && + !(a.job.IsPeriodic() || a.job.IsParameterized()) { + a.deploymentPaused = true } } diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index cb986ded62be..de72faff0bfc 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -5995,3 +5995,109 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { }) } } + +// Tests the reconciler properly handles the logic for computeDeploymentPaused +// for various job types. +func TestReconciler_ComputeDeploymentPaused(t *testing.T) { + ci.Parallel(t) + + type testCase struct { + name string + jobType string + isMultiregion bool + isPeriodic bool + isParameterized bool + expected bool + } + + multiregionCfg := mock.MultiregionJob().Multiregion + periodicCfg := mock.PeriodicJob().Periodic + parameterizedCfg := &structs.ParameterizedJobConfig{ + Payload: structs.DispatchPayloadRequired, + } + + testCases := []testCase{ + { + name: "single region service is not paused", + jobType: structs.JobTypeService, + isMultiregion: false, + isPeriodic: false, + isParameterized: false, + expected: false, + }, + { + name: "multiregion service is paused", + jobType: structs.JobTypeService, + isMultiregion: true, + isPeriodic: false, + isParameterized: false, + expected: true, + }, + { + name: "multiregion periodic service is not paused", + jobType: structs.JobTypeService, + isMultiregion: true, + isPeriodic: true, + isParameterized: false, + expected: false, + }, + { + name: "multiregion parameterized service is not paused", + jobType: structs.JobTypeService, + isMultiregion: true, + isPeriodic: false, + isParameterized: true, + expected: false, + }, + { + name: "single region batch job is not paused", + jobType: structs.JobTypeBatch, + isMultiregion: false, + isPeriodic: false, + isParameterized: false, + expected: false, + }, + { + name: "multiregion batch job is not paused", + jobType: structs.JobTypeBatch, + isMultiregion: false, + isPeriodic: false, + isParameterized: false, + expected: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var job *structs.Job + + if tc.jobType == structs.JobTypeService { + job = mock.Job() + } else if tc.jobType == structs.JobTypeBatch { + job = mock.BatchJob() + } + + require.NotNil(t, job, "invalid job type", tc.jobType) + + if tc.isMultiregion { + job.Multiregion = multiregionCfg + } + + if tc.isPeriodic { + job.Periodic = periodicCfg + } + + if tc.isParameterized { + job.ParameterizedJob = parameterizedCfg + } + + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, + nil, nil, nil, "", job.Priority, true) + + _ = reconciler.Compute() + + require.Equal(t, tc.expected, reconciler.deploymentPaused) + }) + } +}