From 6eec337ea016d49aa602d3924d1f25675da64547 Mon Sep 17 00:00:00 2001
From: Seth Hoenig
Date: Fri, 9 Oct 2020 16:31:38 -0500
Subject: [PATCH] core: implement system batch scheduler

This PR implements a new "System Batch" scheduler type. Jobs can
make use of this new scheduler by setting their type to 'sysbatch'.

As the name implies, sysbatch can be thought of as a hybrid between
system and batch jobs - it is for running short-lived jobs intended
to run on every compatible node in the cluster.

As with batch jobs, sysbatch jobs can also be periodic and/or
parameterized dispatch jobs. A sysbatch job is considered complete
when it has run to a terminal state (success, or failure after
retries) on every compatible node.

Feasibility and preemption are governed the same as with system jobs.
In this PR, the update stanza is not yet supported. The update stanza
is still limited in functionality for the underlying system scheduler,
and is not useful yet for sysbatch jobs. Further work in #4740 will
improve support for the update stanza and deployments.

Closes #2527
---
 CHANGELOG.md | 1 +
 api/operator.go | 7 +-
 .../taskrunner/restarts/restarts.go | 18 +-
 command/agent/operator_endpoint.go | 7 +-
 command/agent/operator_endpoint_test.go | 8 +
 e2e/e2e_test.go | 3 +-
 e2e/e2eutil/utils.go | 24 +
 .../input/sysbatch_dispatch.nomad | 30 +
 .../input/sysbatch_job_fast.nomad | 25 +
 .../input/sysbatch_job_slow.nomad | 25 +
 .../input/sysbatch_periodic.nomad | 30 +
 e2e/scheduler_sysbatch/sysbatch.go | 269 +++
 .../input/system_job0.nomad | 0
 .../input/system_job1.nomad | 0
 .../systemsched.go | 26 +-
 helper/uuid/uuid.go | 6 +
 nomad/config.go | 11 +-
 nomad/core_sched.go | 5 +-
 nomad/mock/mock.go | 84 +-
 nomad/state/schema.go | 13 +-
 nomad/state/state_store.go | 12 +-
 nomad/structs/funcs.go | 69 +-
 nomad/structs/funcs_test.go | 4 +-
 nomad/structs/operator.go | 3 +
 nomad/structs/structs.go | 27 +-
 scheduler/generic_sched.go | 2 +-
 scheduler/rank.go | 4 +-
 scheduler/scheduler.go | 7 +-
 scheduler/stack.go | 28 +-
 scheduler/stack_test.go | 16 +-
 .../{system_sched.go => system_scheduler.go} | 101 +-
 scheduler/system_sysbatch_test.go | 1623 +++++++++++++++++
 ...em_sched_test.go => system_system_test.go} | 610 +++----
 scheduler/util.go | 95 +-
 scheduler/util_test.go | 160 +-
 .../hashicorp/nomad/api/operator.go | 7 +-
 website/pages/docs/configuration/server.mdx | 7 +-
 website/pages/docs/job-specification/job.mdx | 2 +-
 .../docs/job-specification/reschedule.mdx | 4 +-
 .../pages/docs/job-specification/restart.mdx | 12 +-
 website/pages/docs/schedulers.mdx | 30 +-
 41 files changed, 2800 insertions(+), 615 deletions(-)
 create mode 100644 e2e/scheduler_sysbatch/input/sysbatch_dispatch.nomad
 create mode 100644 e2e/scheduler_sysbatch/input/sysbatch_job_fast.nomad
 create mode 100644 e2e/scheduler_sysbatch/input/sysbatch_job_slow.nomad
 create mode 100644 e2e/scheduler_sysbatch/input/sysbatch_periodic.nomad
 create mode 100644 e2e/scheduler_sysbatch/sysbatch.go
 rename e2e/{systemsched => scheduler_system}/input/system_job0.nomad (100%)
 rename e2e/{systemsched => scheduler_system}/input/system_job1.nomad (100%)
 rename e2e/{systemsched => scheduler_system}/systemsched.go (87%)
 rename scheduler/{system_sched.go => system_scheduler.go} (85%)
 create mode 100644 scheduler/system_sysbatch_test.go
 rename scheduler/{system_sched_test.go => system_system_test.go} (84%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a07114b0d319..c50f5f5936da 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@ FEATURES:
 * **Event Stream**:
Subscribe to change events as they occur in real time. [[GH-9013](https://github.com/hashicorp/nomad/issues/9013)] * **Namespaces OSS**: Namespaces are now available in open source Nomad. [[GH-9135](https://github.com/hashicorp/nomad/issues/9135)] * **Topology Visualization**: See all of the clients and allocations in a cluster at once. [[GH-9077](https://github.com/hashicorp/nomad/issues/9077)] +* **System Batch Scheduling**: New `sysbatch` scheduler type for running short lived jobs across all nodes. [[GH-9160](https://github.com/hashicorp/nomad/pull/9160)] IMPROVEMENTS: * core: Improved job deregistration error logging. [[GH-8745](https://github.com/hashicorp/nomad/issues/8745)] diff --git a/api/operator.go b/api/operator.go index d5bc5d061d56..de57bffef4b2 100644 --- a/api/operator.go +++ b/api/operator.go @@ -159,9 +159,10 @@ const ( // PreemptionConfig specifies whether preemption is enabled based on scheduler type type PreemptionConfig struct { - SystemSchedulerEnabled bool - BatchSchedulerEnabled bool - ServiceSchedulerEnabled bool + SystemSchedulerEnabled bool + SysBatchSchedulerEnabled bool + BatchSchedulerEnabled bool + ServiceSchedulerEnabled bool } // SchedulerGetConfiguration is used to query the current Scheduler configuration. diff --git a/client/allocrunner/taskrunner/restarts/restarts.go b/client/allocrunner/taskrunner/restarts/restarts.go index 6ee0056ccd8b..429ee07a0384 100644 --- a/client/allocrunner/taskrunner/restarts/restarts.go +++ b/client/allocrunner/taskrunner/restarts/restarts.go @@ -14,15 +14,19 @@ const ( // jitter is the percent of jitter added to restart delays. jitter = 0.25 - ReasonNoRestartsAllowed = "Policy allows no restarts" - ReasonUnrecoverableErrror = "Error was unrecoverable" - ReasonWithinPolicy = "Restart within policy" - ReasonDelay = "Exceeded allowed attempts, applying a delay" + ReasonNoRestartsAllowed = "Policy allows no restarts" + ReasonUnrecoverableError = "Error was unrecoverable" + ReasonWithinPolicy = "Restart within policy" + ReasonDelay = "Exceeded allowed attempts, applying a delay" ) func NewRestartTracker(policy *structs.RestartPolicy, jobType string, tlc *structs.TaskLifecycleConfig) *RestartTracker { - // Batch jobs should not restart if they exit successfully - onSuccess := jobType != structs.JobTypeBatch + onSuccess := true + + // Batch & SysBatch jobs should not restart if they exit successfully + if jobType == structs.JobTypeBatch || jobType == structs.JobTypeSysBatch { + onSuccess = false + } // Prestart sidecars should get restarted on success if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPrestart { @@ -196,7 +200,7 @@ func (r *RestartTracker) GetState() (string, time.Duration) { if r.startErr != nil { // If the error is not recoverable, do not restart. 
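 		// (a recoverable start error instead falls through to the
 		// attempt-count and delay checks further below)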
if !structs.IsRecoverable(r.startErr) { - r.reason = ReasonUnrecoverableErrror + r.reason = ReasonUnrecoverableError return structs.TaskNotRestarting, 0 } } else if r.exitRes != nil { diff --git a/command/agent/operator_endpoint.go b/command/agent/operator_endpoint.go index ed4a3c4cb732..e008cd506357 100644 --- a/command/agent/operator_endpoint.go +++ b/command/agent/operator_endpoint.go @@ -261,9 +261,10 @@ func (s *HTTPServer) schedulerUpdateConfig(resp http.ResponseWriter, req *http.R args.Config = structs.SchedulerConfiguration{ SchedulerAlgorithm: structs.SchedulerAlgorithm(conf.SchedulerAlgorithm), PreemptionConfig: structs.PreemptionConfig{ - SystemSchedulerEnabled: conf.PreemptionConfig.SystemSchedulerEnabled, - BatchSchedulerEnabled: conf.PreemptionConfig.BatchSchedulerEnabled, - ServiceSchedulerEnabled: conf.PreemptionConfig.ServiceSchedulerEnabled}, + SystemSchedulerEnabled: conf.PreemptionConfig.SystemSchedulerEnabled, + SysBatchSchedulerEnabled: conf.PreemptionConfig.SysBatchSchedulerEnabled, + BatchSchedulerEnabled: conf.PreemptionConfig.BatchSchedulerEnabled, + ServiceSchedulerEnabled: conf.PreemptionConfig.ServiceSchedulerEnabled}, } if err := args.Config.Validate(); err != nil { diff --git a/command/agent/operator_endpoint_test.go b/command/agent/operator_endpoint_test.go index 8814fad4fa25..316c16ca3659 100644 --- a/command/agent/operator_endpoint_test.go +++ b/command/agent/operator_endpoint_test.go @@ -282,6 +282,7 @@ func TestOperator_SchedulerGetConfiguration(t *testing.T) { // Only system jobs can preempt other jobs by default. require.True(out.SchedulerConfig.PreemptionConfig.SystemSchedulerEnabled) + require.False(out.SchedulerConfig.PreemptionConfig.SysBatchSchedulerEnabled) require.False(out.SchedulerConfig.PreemptionConfig.BatchSchedulerEnabled) require.False(out.SchedulerConfig.PreemptionConfig.ServiceSchedulerEnabled) }) @@ -314,6 +315,8 @@ func TestOperator_SchedulerSetConfiguration(t *testing.T) { err = s.RPC("Operator.SchedulerGetConfiguration", &args, &reply) require.Nil(err) require.True(reply.SchedulerConfig.PreemptionConfig.SystemSchedulerEnabled) + require.False(reply.SchedulerConfig.PreemptionConfig.SysBatchSchedulerEnabled) + require.False(reply.SchedulerConfig.PreemptionConfig.BatchSchedulerEnabled) require.True(reply.SchedulerConfig.PreemptionConfig.ServiceSchedulerEnabled) }) } @@ -324,6 +327,7 @@ func TestOperator_SchedulerCASConfiguration(t *testing.T) { require := require.New(t) body := bytes.NewBuffer([]byte(`{"PreemptionConfig": { "SystemSchedulerEnabled": true, + "SysBatchSchedulerEnabled":true, "BatchSchedulerEnabled":true }}`)) req, _ := http.NewRequest("PUT", "/v1/operator/scheduler/configuration", body) @@ -346,7 +350,9 @@ func TestOperator_SchedulerCASConfiguration(t *testing.T) { t.Fatalf("err: %v", err) } require.True(reply.SchedulerConfig.PreemptionConfig.SystemSchedulerEnabled) + require.True(reply.SchedulerConfig.PreemptionConfig.SysBatchSchedulerEnabled) require.True(reply.SchedulerConfig.PreemptionConfig.BatchSchedulerEnabled) + require.False(reply.SchedulerConfig.PreemptionConfig.ServiceSchedulerEnabled) // Create a CAS request, bad index { @@ -387,7 +393,9 @@ func TestOperator_SchedulerCASConfiguration(t *testing.T) { t.Fatalf("err: %v", err) } require.False(reply.SchedulerConfig.PreemptionConfig.SystemSchedulerEnabled) + require.False(reply.SchedulerConfig.PreemptionConfig.SysBatchSchedulerEnabled) require.False(reply.SchedulerConfig.PreemptionConfig.BatchSchedulerEnabled) + 
require.False(reply.SchedulerConfig.PreemptionConfig.ServiceSchedulerEnabled) }) } diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index 2e6c6db2ce8d..a534d8f38b4e 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -27,8 +27,9 @@ import ( _ "github.com/hashicorp/nomad/e2e/podman" _ "github.com/hashicorp/nomad/e2e/quotas" _ "github.com/hashicorp/nomad/e2e/rescheduling" + _ "github.com/hashicorp/nomad/e2e/scheduler_sysbatch" + _ "github.com/hashicorp/nomad/e2e/scheduler_system" _ "github.com/hashicorp/nomad/e2e/spread" - _ "github.com/hashicorp/nomad/e2e/systemsched" _ "github.com/hashicorp/nomad/e2e/taskevents" _ "github.com/hashicorp/nomad/e2e/vaultsecrets" _ "github.com/hashicorp/nomad/e2e/volumes" diff --git a/e2e/e2eutil/utils.go b/e2e/e2eutil/utils.go index 6cf10d574f42..d042c1743b1d 100644 --- a/e2e/e2eutil/utils.go +++ b/e2e/e2eutil/utils.go @@ -201,6 +201,30 @@ func WaitForAllocStopped(t *testing.T, nomadClient *api.Client, allocID string) }) } +func WaitForAllocStatus(t *testing.T, nomadClient *api.Client, allocID string, status string) { + testutil.WaitForResultRetries(retries, func() (bool, error) { + time.Sleep(time.Millisecond * 100) + alloc, _, err := nomadClient.Allocations().Info(allocID, nil) + if err != nil { + return false, err + } + switch alloc.ClientStatus { + case status: + return true, nil + default: + return false, fmt.Errorf("expected %s alloc, but was: %s", status, alloc.ClientStatus) + } + }, func(err error) { + t.Fatalf("failed to wait on alloc: %v", err) + }) +} + +func WaitForAllocsStatus(t *testing.T, nomadClient *api.Client, allocIDs []string, status string) { + for _, allocID := range allocIDs { + WaitForAllocStatus(t, nomadClient, allocID, status) + } +} + func AllocIDsFromAllocationListStubs(allocs []*api.AllocationListStub) []string { allocIDs := make([]string, 0, len(allocs)) for _, alloc := range allocs { diff --git a/e2e/scheduler_sysbatch/input/sysbatch_dispatch.nomad b/e2e/scheduler_sysbatch/input/sysbatch_dispatch.nomad new file mode 100644 index 000000000000..fcc369efdb6d --- /dev/null +++ b/e2e/scheduler_sysbatch/input/sysbatch_dispatch.nomad @@ -0,0 +1,30 @@ +job "sysbatchjob" { + datacenters = ["dc1"] + + type = "sysbatch" + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + parameterized { + payload = "forbidden" + meta_required = ["KEY"] + } + + group "sysbatch_job_group" { + count = 1 + + task "sysbatch_task" { + driver = "docker" + + config { + image = "bash:5" + + command = "bash" + args = ["-c", "ping -c 10 example.com"] + } + } + } +} diff --git a/e2e/scheduler_sysbatch/input/sysbatch_job_fast.nomad b/e2e/scheduler_sysbatch/input/sysbatch_job_fast.nomad new file mode 100644 index 000000000000..5aaba9072ba1 --- /dev/null +++ b/e2e/scheduler_sysbatch/input/sysbatch_job_fast.nomad @@ -0,0 +1,25 @@ +job "sysbatchjob" { + datacenters = ["dc1"] + + type = "sysbatch" + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "sysbatch_job_group" { + count = 1 + + task "sysbatch_task" { + driver = "docker" + + config { + image = "bash:5" + + command = "bash" + args = ["-c", "ping -c 10 example.com"] + } + } + } +} diff --git a/e2e/scheduler_sysbatch/input/sysbatch_job_slow.nomad b/e2e/scheduler_sysbatch/input/sysbatch_job_slow.nomad new file mode 100644 index 000000000000..3a0b667eb25f --- /dev/null +++ b/e2e/scheduler_sysbatch/input/sysbatch_job_slow.nomad @@ -0,0 +1,25 @@ +job "sysbatchjob" { + datacenters = ["dc1"] + + type = "sysbatch" + + constraint { + attribute = "${attr.kernel.name}" 
+    value     = "linux"
+  }
+
+  group "sysbatch_job_group" {
+    count = 1
+
+    task "sysbatch_task" {
+      driver = "docker"
+
+      config {
+        image = "bash:5"
+
+        command = "bash"
+        args    = ["-c", "ping -c 100000 example.com"]
+      }
+    }
+  }
+}
diff --git a/e2e/scheduler_sysbatch/input/sysbatch_periodic.nomad b/e2e/scheduler_sysbatch/input/sysbatch_periodic.nomad
new file mode 100644
index 000000000000..d3521a5355f3
--- /dev/null
+++ b/e2e/scheduler_sysbatch/input/sysbatch_periodic.nomad
@@ -0,0 +1,30 @@
+job "sysbatchjob" {
+  datacenters = ["dc1"]
+
+  type = "sysbatch"
+
+  constraint {
+    attribute = "${attr.kernel.name}"
+    value     = "linux"
+  }
+
+  periodic {
+    cron             = "*/15 * * * * *"
+    prohibit_overlap = true
+  }
+
+  group "sysbatch_job_group" {
+    count = 1
+
+    task "sysbatch_task" {
+      driver = "docker"
+
+      config {
+        image = "bash:5"
+
+        command = "bash"
+        args    = ["-c", "ping -c 10 example.com"]
+      }
+    }
+  }
+}
diff --git a/e2e/scheduler_sysbatch/sysbatch.go b/e2e/scheduler_sysbatch/sysbatch.go
new file mode 100644
index 000000000000..26bc979c04e1
--- /dev/null
+++ b/e2e/scheduler_sysbatch/sysbatch.go
@@ -0,0 +1,269 @@
+package scheduler_sysbatch
+
+import (
+	"strings"
+	"time"
+
+	"github.com/hashicorp/nomad/api"
+	"github.com/hashicorp/nomad/e2e/e2eutil"
+	"github.com/hashicorp/nomad/e2e/framework"
+	"github.com/hashicorp/nomad/nomad/structs"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+type SysBatchSchedulerTest struct {
+	framework.TC
+	jobIDs []string
+}
+
+func init() {
+	framework.AddSuites(&framework.TestSuite{
+		Component:   "SysBatchScheduler",
+		CanRunLocal: true,
+		Cases: []framework.TestCase{
+			new(SysBatchSchedulerTest),
+		},
+	})
+}
+
+func (tc *SysBatchSchedulerTest) BeforeAll(f *framework.F) {
+	// Ensure cluster has leader before running tests
+	e2eutil.WaitForLeader(f.T(), tc.Nomad())
+	e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 4)
+}
+
+func (tc *SysBatchSchedulerTest) TestJobRunBasic(f *framework.F) {
+	t := f.T()
+	nomadClient := tc.Nomad()
+
+	// submit a fast sysbatch job
+	jobID := "sysbatch_run_basic"
+	tc.jobIDs = append(tc.jobIDs, jobID)
+	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_fast.nomad", jobID, "")
+
+	// get our allocations for this sysbatch job
+	jobs := nomadClient.Jobs()
+	allocs, _, err := jobs.Allocations(jobID, true, nil)
+	require.NoError(t, err)
+
+	// make sure this job is being run on "all" the linux clients
+	// in the future, might be nice to have a way to query that information
+	// during test run time, to create more accurate assertions
+	require.True(t, len(allocs) >= 3)
+
+	// wait for every alloc to reach completion
+	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
+	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete)
+}
+
+func (tc *SysBatchSchedulerTest) TestJobStopEarly(f *framework.F) {
+	t := f.T()
+	nomadClient := tc.Nomad()
+
+	// submit a slow sysbatch job
+	jobID := "sysbatch_stop_early"
+	tc.jobIDs = append(tc.jobIDs, jobID)
+	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_slow.nomad", jobID, "")
+
+	// get our allocations for this sysbatch job
+	jobs := nomadClient.Jobs()
+	allocs, _, err := jobs.Allocations(jobID, true, nil)
+	require.NoError(t, err)
+
+	// make sure this job is being run on "all" the linux clients
+	// in the future, might be nice to have a way to query that information
+	// during test run time, to create more accurate assertions
+	require.True(t, len(allocs) >= 3)
+
+	// wait for every alloc to reach running status
+	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
+	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusRunning)
+
+	// stop the job before allocs reach completion
+	_, _, err = jobs.Deregister(jobID, false, nil)
+	require.NoError(t, err)
+}
+
+func (tc *SysBatchSchedulerTest) TestJobReplaceRunning(f *framework.F) {
+	t := f.T()
+	nomadClient := tc.Nomad()
+
+	// submit a slow sysbatch job
+	jobID := "sysbatch_replace_running"
+	tc.jobIDs = append(tc.jobIDs, jobID)
+	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_slow.nomad", jobID, "")
+
+	// get our allocations for this sysbatch job
+	jobs := nomadClient.Jobs()
+	allocs, _, err := jobs.Allocations(jobID, true, nil)
+	require.NoError(t, err)
+
+	// make sure this job is being run on "all" the linux clients
+	require.True(t, len(allocs) >= 3)
+
+	// wait for every alloc to reach running status
+	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
+	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusRunning)
+
+	// replace the slow job with the fast job
+	intermediate := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_fast.nomad", jobID, "")
+
+	// get the allocs for the new updated job
+	var updated []*api.AllocationListStub
+	for _, alloc := range intermediate {
+		if alloc.JobVersion == 1 {
+			updated = append(updated, alloc)
+		}
+	}
+
+	// collect the IDs of the new allocs
+	newAllocIDs := e2eutil.AllocIDsFromAllocationListStubs(updated)
+
+	// make sure this new job is being run on "all" the linux clients
+	require.True(t, len(updated) >= 3)
+
+	// wait for the allocs of the fast job to complete
+	e2eutil.WaitForAllocsStatus(t, nomadClient, newAllocIDs, structs.AllocClientStatusComplete)
+}
+
+func (tc *SysBatchSchedulerTest) TestJobReplaceDead(f *framework.F) {
+	t := f.T()
+	nomadClient := tc.Nomad()
+
+	// submit a fast sysbatch job
+	jobID := "sysbatch_replace_dead"
+	tc.jobIDs = append(tc.jobIDs, jobID)
+	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_fast.nomad", jobID, "")
+
+	// get the allocations for this sysbatch job
+	jobs := nomadClient.Jobs()
+	allocs, _, err := jobs.Allocations(jobID, true, nil)
+	require.NoError(t, err)
+
+	// make sure this job is being run on "all" the linux clients
+	require.True(t, len(allocs) >= 3)
+
+	// wait for every alloc to reach complete status
+	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
+	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete)
+
+	// replace the fast job with the slow job
+	intermediate := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_slow.nomad", jobID, "")
+
+	// get the allocs for the new updated job
+	var updated []*api.AllocationListStub
+	for _, alloc := range intermediate {
+		if alloc.JobVersion == 1 {
+			updated = append(updated, alloc)
+		}
+	}
+
+	// collect the IDs of the updated allocs
+	upAllocIDs := e2eutil.AllocIDsFromAllocationListStubs(updated)
+
+	// make sure this new job is being run on "all" the linux clients
+	require.True(t, len(updated) >= 3)
+
+	// wait for the allocs of the slow job to be running
+	e2eutil.WaitForAllocsStatus(t, nomadClient, upAllocIDs, structs.AllocClientStatusRunning)
+}
+
+func (tc *SysBatchSchedulerTest) TestJobRunPeriodic(f *framework.F) {
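+	// This test force-launches the periodic job, locates the launched child
+	// job by its "periodic-" ID prefix, and waits for its allocs to complete.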
+ t := f.T() + nomadClient := tc.Nomad() + + // submit a fast sysbatch job + jobID := "sysbatch_job_periodic" + tc.jobIDs = append(tc.jobIDs, jobID) + err := e2eutil.Register(jobID, "scheduler_sysbatch/input/sysbatch_periodic.nomad") + require.NoError(t, err) + + // force the cron job to run + jobs := nomadClient.Jobs() + _, _, err = jobs.PeriodicForce(jobID, nil) + require.NoError(t, err) + + // find the cron job that got launched + jobsList, _, err := jobs.List(nil) + require.NoError(t, err) + cronJobID := "" + for _, job := range jobsList { + if strings.HasPrefix(job.Name, "sysbatch_job_periodic/periodic-") { + cronJobID = job.Name + break + } + } + require.NotEmpty(t, cronJobID) + tc.jobIDs = append(tc.jobIDs, cronJobID) + + // wait for allocs of the cron job + var allocs []*api.AllocationListStub + require.True(t, assert.Eventually(t, func() bool { + var err error + allocs, _, err = jobs.Allocations(cronJobID, false, nil) + require.NoError(t, err) + return len(allocs) >= 3 + }, 30*time.Second, time.Second)) + + // wait for every cron job alloc to reach completion + allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs) + e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete) +} + +func (tc *SysBatchSchedulerTest) TestJobRunDispatch(f *framework.F) { + t := f.T() + nomadClient := tc.Nomad() + + // submit a fast sysbatch dispatch job + jobID := "sysbatch_job_dispatch" + tc.jobIDs = append(tc.jobIDs, jobID) + err := e2eutil.Register(jobID, "scheduler_sysbatch/input/sysbatch_dispatch.nomad") + require.NoError(t, err) + + // dispatch the sysbatch job + jobs := nomadClient.Jobs() + result, _, err := jobs.Dispatch(jobID, map[string]string{ + "KEY": "value", + }, nil, nil) + require.NoError(t, err) + + // grab the new dispatched jobID + dispatchID := result.DispatchedJobID + tc.jobIDs = append(tc.jobIDs, dispatchID) + + // wait for allocs of the dispatched job + var allocs []*api.AllocationListStub + require.True(t, assert.Eventually(t, func() bool { + var err error + allocs, _, err = jobs.Allocations(dispatchID, false, nil) + require.NoError(t, err) + return len(allocs) >= 3 + }, 30*time.Second, time.Second)) + + // wait for every dispatch alloc to reach completion + allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs) + e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete) +} + +func (tc *SysBatchSchedulerTest) AfterEach(f *framework.F) { + nomadClient := tc.Nomad() + + // Mark all nodes eligible + nodesAPI := tc.Nomad().Nodes() + nodes, _, _ := nodesAPI.List(nil) + for _, node := range nodes { + _, _ = nodesAPI.ToggleEligibility(node.ID, true, nil) + } + + jobs := nomadClient.Jobs() + + // Stop all jobs in test + for _, id := range tc.jobIDs { + _, _, _ = jobs.Deregister(id, true, nil) + } + tc.jobIDs = []string{} + + // Garbage collect + _ = nomadClient.System().GarbageCollect() +} diff --git a/e2e/systemsched/input/system_job0.nomad b/e2e/scheduler_system/input/system_job0.nomad similarity index 100% rename from e2e/systemsched/input/system_job0.nomad rename to e2e/scheduler_system/input/system_job0.nomad diff --git a/e2e/systemsched/input/system_job1.nomad b/e2e/scheduler_system/input/system_job1.nomad similarity index 100% rename from e2e/systemsched/input/system_job1.nomad rename to e2e/scheduler_system/input/system_job1.nomad diff --git a/e2e/systemsched/systemsched.go b/e2e/scheduler_system/systemsched.go similarity index 87% rename from e2e/systemsched/systemsched.go rename to 
e2e/scheduler_system/systemsched.go
index 09b3f9141b33..5ec17ef28547 100644
--- a/e2e/systemsched/systemsched.go
+++ b/e2e/scheduler_system/systemsched.go
@@ -1,4 +1,4 @@
-package systemsched
+package scheduler_system
 
 import (
 	"github.com/hashicorp/nomad/api"
@@ -35,16 +35,14 @@ func (tc *SystemSchedTest) TestJobUpdateOnIneligbleNode(f *framework.F) {
 	jobID := "system_deployment"
 	tc.jobIDs = append(tc.jobIDs, jobID)
 
-	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "systemsched/input/system_job0.nomad", jobID, "")
+	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_system/input/system_job0.nomad", jobID, "")
 
 	jobs := nomadClient.Jobs()
 	allocs, _, err := jobs.Allocations(jobID, true, nil)
 	require.NoError(t, err)
+	require.True(t, len(allocs) >= 3)
 
-	var allocIDs []string
-	for _, alloc := range allocs {
-		allocIDs = append(allocIDs, alloc.ID)
-	}
+	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
 
 	// Wait for allocations to get past initial pending state
 	e2eutil.WaitForAllocsNotPending(t, nomadClient, allocIDs)
@@ -58,13 +56,9 @@ func (tc *SystemSchedTest) TestJobUpdateOnIneligbleNode(f *framework.F) {
 	// Assert all jobs still running
 	jobs = nomadClient.Jobs()
 	allocs, _, err = jobs.Allocations(jobID, true, nil)
-
-	allocIDs = nil
-	for _, alloc := range allocs {
-		allocIDs = append(allocIDs, alloc.ID)
-	}
-
 	require.NoError(t, err)
+
+	allocIDs = e2eutil.AllocIDsFromAllocationListStubs(allocs)
 
 	allocForDisabledNode := make(map[string]*api.AllocationListStub)
 
 	// Wait for allocs to run and collect allocs on ineligible node
@@ -89,19 +83,15 @@ func (tc *SystemSchedTest) TestJobUpdateOnIneligbleNode(f *framework.F) {
 	require.Len(t, allocForDisabledNode, 1)
 
 	// Update job
-	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "systemsched/input/system_job1.nomad", jobID, "")
+	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_system/input/system_job1.nomad", jobID, "")
 
 	// Get updated allocations
 	jobs = nomadClient.Jobs()
 	allocs, _, err = jobs.Allocations(jobID, false, nil)
 	require.NoError(t, err)
 
-	allocIDs = nil
-	for _, alloc := range allocs {
-		allocIDs = append(allocIDs, alloc.ID)
-	}
-
-	// Wait for allocs to start
+	allocIDs = e2eutil.AllocIDsFromAllocationListStubs(allocs)
 	e2eutil.WaitForAllocsNotPending(t, nomadClient, allocIDs)
 
 	// Get latest alloc status now that they are no longer pending
diff --git a/helper/uuid/uuid.go b/helper/uuid/uuid.go
index 145c817803d0..c0eec178ea9d 100644
--- a/helper/uuid/uuid.go
+++ b/helper/uuid/uuid.go
@@ -19,3 +19,9 @@ func Generate() string {
 		buf[8:10],
 		buf[10:16])
 }
+
+// Short is used to generate a random shortened UUID.
+func Short() string {
+	id := Generate()
+	return id[len(id)-8:]
+}
diff --git a/nomad/config.go b/nomad/config.go
index 08e4f562f5c3..9575416c3bde 100644
--- a/nomad/config.go
+++ b/nomad/config.go
@@ -323,8 +323,8 @@ type Config struct {
 	AutopilotInterval time.Duration
 
 	// DefaultSchedulerConfig configures the initial scheduler config to be persisted in Raft.
-	// Once the cluster is bootstrapped, and Raft persists the config (from here or through API),
-	// This value is ignored.
+	// Once the cluster is bootstrapped and Raft persists the config (from here
+	// or through the API), this value is ignored.
 	DefaultSchedulerConfig structs.SchedulerConfiguration `hcl:"default_scheduler_config"`
 
 	// PluginLoader is used to load plugins.
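Like the existing preemption switches, the new sysbatch flag is operator-togglable at
runtime rather than fixed at bootstrap. A minimal sketch of flipping it through the Go
`api` client follows; the client setup and error handling here are illustrative
assumptions, not part of this patch:

```go
package main

import (
	"fmt"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Connect to a local agent (assumes the default address and no ACL token).
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		panic(err)
	}

	// Read the current scheduler configuration.
	resp, _, err := client.Operator().SchedulerGetConfiguration(nil)
	if err != nil {
		panic(err)
	}

	// Enable preemption for sysbatch jobs; it defaults to false
	// (see DefaultConfig below).
	sc := resp.SchedulerConfig
	sc.PreemptionConfig.SysBatchSchedulerEnabled = true

	if _, _, err := client.Operator().SchedulerSetConfiguration(sc, nil); err != nil {
		panic(err)
	}
	fmt.Println("sysbatch preemption enabled")
}
```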
@@ -433,9 +433,10 @@ func DefaultConfig() *Config { DefaultSchedulerConfig: structs.SchedulerConfiguration{ SchedulerAlgorithm: structs.SchedulerAlgorithmBinpack, PreemptionConfig: structs.PreemptionConfig{ - SystemSchedulerEnabled: true, - BatchSchedulerEnabled: false, - ServiceSchedulerEnabled: false, + SystemSchedulerEnabled: true, + SysBatchSchedulerEnabled: false, + BatchSchedulerEnabled: false, + ServiceSchedulerEnabled: false, }, }, } diff --git a/nomad/core_sched.go b/nomad/core_sched.go index 1ac135d0aaea..eb796f66bcaa 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -136,9 +136,7 @@ OUTER: gc, allocs, err := c.gcEval(eval, oldThreshold, true) if err != nil { continue OUTER - } - - if gc { + } else if gc { jobEval = append(jobEval, eval.ID) jobAlloc = append(jobAlloc, allocs...) } else { @@ -160,6 +158,7 @@ OUTER: if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 { return nil } + c.logger.Debug("job GC found eligible objects", "jobs", len(gcJob), "evals", len(gcEval), "allocs", len(gcAlloc)) diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index e8c555a10218..6e1a5f283b8e 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -172,6 +172,46 @@ func HCL() string { ` } +func SystemBatchJob() *structs.Job { + job := &structs.Job{ + Region: "global", + ID: fmt.Sprintf("mock-sysbatch-%s", uuid.Short()), + Name: "my-sysbatch", + Namespace: structs.DefaultNamespace, + Type: structs.JobTypeSysBatch, + Priority: 10, + Datacenters: []string{"dc1"}, + Constraints: []*structs.Constraint{ + { + LTarget: "${attr.kernel.name}", + RTarget: "linux", + Operand: "=", + }, + }, + TaskGroups: []*structs.TaskGroup{{ + Count: 1, + Name: "pinger", + Tasks: []*structs.Task{{ + Name: "ping-example", + Driver: "exec", + Config: map[string]interface{}{ + "command": "/usr/bin/ping", + "args": []string{"-c", "5", "example.com"}, + }, + LogConfig: structs.DefaultLogConfig(), + }}, + }}, + + Status: structs.JobStatusPending, + Version: 0, + CreateIndex: 42, + ModifyIndex: 99, + JobModifyIndex: 99, + } + job.Canonicalize() + return job +} + func Job() *structs.Job { job := &structs.Job{ Region: "global", @@ -895,7 +935,7 @@ func Eval() *structs.Evaluation { } func JobSummary(jobID string) *structs.JobSummary { - js := &structs.JobSummary{ + return &structs.JobSummary{ JobID: jobID, Namespace: structs.DefaultNamespace, Summary: map[string]structs.TaskGroupSummary{ @@ -905,7 +945,19 @@ func JobSummary(jobID string) *structs.JobSummary { }, }, } - return js +} + +func JobSysBatchSummary(jobID string) *structs.JobSummary { + return &structs.JobSummary{ + JobID: jobID, + Namespace: structs.DefaultNamespace, + Summary: map[string]structs.TaskGroupSummary{ + "pinger": { + Queued: 0, + Starting: 0, + }, + }, + } } func Alloc() *structs.Allocation { @@ -1191,6 +1243,34 @@ func BatchAlloc() *structs.Allocation { return alloc } +func SysBatchAlloc() *structs.Allocation { + job := SystemBatchJob() + return &structs.Allocation{ + ID: uuid.Generate(), + EvalID: uuid.Generate(), + NodeID: "12345678-abcd-efab-cdef-123456789abc", + Namespace: structs.DefaultNamespace, + TaskGroup: "pinger", + AllocatedResources: &structs.AllocatedResources{ + Tasks: map[string]*structs.AllocatedTaskResources{ + "ping-example": { + Cpu: structs.AllocatedCpuResources{CpuShares: 500}, + Memory: structs.AllocatedMemoryResources{MemoryMB: 256}, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + IP: "192.168.0.100", + }}, + }, + }, + Shared: structs.AllocatedSharedResources{DiskMB: 150}, + }, + Job: job, + 
JobID: job.ID, + DesiredStatus: structs.AllocDesiredStatusRun, + ClientStatus: structs.AllocClientStatusPending, + } +} + func SystemAlloc() *structs.Allocation { alloc := &structs.Allocation{ ID: uuid.Generate(), diff --git a/nomad/state/schema.go b/nomad/state/schema.go index 923b44617139..8178ec515f34 100644 --- a/nomad/state/schema.go +++ b/nomad/state/schema.go @@ -271,13 +271,16 @@ func jobIsGCable(obj interface{}) (bool, error) { return true, nil } - // Otherwise, only batch jobs are eligible because they complete on their - // own without a user stopping them. - if j.Type != structs.JobTypeBatch { + switch j.Type { + // Otherwise, batch and sysbatch jobs are eligible because they complete on + // their own without a user stopping them. + case structs.JobTypeBatch, structs.JobTypeSysBatch: + return true, nil + + default: + // other job types may not be GC until stopped return false, nil } - - return true, nil } // jobIsPeriodic satisfies the ConditionalIndexFunc interface and creates an index diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index d5e5f12f9b0b..f6acd476b6f7 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -1973,7 +1973,7 @@ func (s *StateStore) JobsByScheduler(ws memdb.WatchSet, schedulerType string) (m return iter, nil } -// JobsByGC returns an iterator over all jobs eligible or uneligible for garbage +// JobsByGC returns an iterator over all jobs eligible or ineligible for garbage // collection. func (s *StateStore) JobsByGC(ws memdb.WatchSet, gc bool) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() @@ -4462,13 +4462,15 @@ func (s *StateStore) setJobStatus(index uint64, txn *txn, } func (s *StateStore) getJobStatus(txn *txn, job *structs.Job, evalDelete bool) (string, error) { - // System, Periodic and Parameterized jobs are running until explicitly - // stopped - if job.Type == structs.JobTypeSystem || job.IsParameterized() || job.IsPeriodic() { + // System, SysBatch, Periodic and Parameterized jobs are running until + // explicitly stopped. + if job.Type == structs.JobTypeSysBatch || + job.Type == structs.JobTypeSystem || + job.IsParameterized() || + job.IsPeriodic() { if job.Stop { return structs.JobStatusDead, nil } - return structs.JobStatusRunning, nil } diff --git a/nomad/structs/funcs.go b/nomad/structs/funcs.go index 7d5398133ff4..0c693eae904a 100644 --- a/nomad/structs/funcs.go +++ b/nomad/structs/funcs.go @@ -70,10 +70,11 @@ func RemoveAllocs(alloc []*Allocation, remove []*Allocation) []*Allocation { } // FilterTerminalAllocs filters out all allocations in a terminal state and -// returns the latest terminal allocations +// returns the latest terminal allocations. func FilterTerminalAllocs(allocs []*Allocation) ([]*Allocation, map[string]*Allocation) { terminalAllocsByName := make(map[string]*Allocation) n := len(allocs) + for i := 0; i < n; i++ { if allocs[i].TerminalStatus() { @@ -91,9 +92,75 @@ func FilterTerminalAllocs(allocs []*Allocation) ([]*Allocation, map[string]*Allo n-- } } + return allocs[:n], terminalAllocsByName } +// SplitTerminalAllocs splits allocs into non-terminal and terminal allocs, with +// the terminal allocs indexed by node->alloc.name. 
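+//
+// Unlike FilterTerminalAllocs above, which keeps only the latest terminal
+// alloc per name, this index is kept per node, letting the sysbatch scheduler
+// decide for each node whether the job has already run there to completion.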
+func SplitTerminalAllocs(allocs []*Allocation) ([]*Allocation, TerminalByNodeByName) {
+	var alive []*Allocation
+	var terminal = make(TerminalByNodeByName)
+
+	for _, alloc := range allocs {
+		if alloc.TerminalStatus() {
+			terminal.Set(alloc)
+		} else {
+			alive = append(alive, alloc)
+		}
+	}
+
+	return alive, terminal
+}
+
+// TerminalByNodeByName is a map of NodeID->Allocation.Name->Allocation used by
+// the sysbatch scheduler for locating the most up-to-date terminal allocations.
+type TerminalByNodeByName map[string]map[string]*Allocation
+
+func (a TerminalByNodeByName) Set(allocation *Allocation) {
+	node := allocation.NodeID
+	name := allocation.Name
+
+	if _, exists := a[node]; !exists {
+		a[node] = make(map[string]*Allocation)
+	}
+
+	if previous, exists := a[node][name]; !exists {
+		a[node][name] = allocation
+	} else {
+		// keep the newest version of the terminal alloc for this (node, name) coordinate
+		if previous.CreateIndex < allocation.CreateIndex {
+			a[node][name] = allocation
+		}
+	}
+}
+
+func (a TerminalByNodeByName) Get(nodeID, name string) (*Allocation, bool) {
+	if _, exists := a[nodeID]; !exists {
+		return nil, false
+	}
+
+	if _, exists := a[nodeID][name]; !exists {
+		return nil, false
+	}
+
+	return a[nodeID][name], true
+}
+
+// Any returns an allocation with the given name, if one exists. Used by the
+// system scheduler to substitute a missing allocation that will be updated
+// later.
+func (a TerminalByNodeByName) Any(name string) *Allocation {
+	for _, names := range a {
+		for aName := range names {
+			if name == aName {
+				return names[name]
+			}
+		}
+	}
+	return nil
+}
+
 // AllocsFit checks if a given set of allocations will fit on a node.
 // The netIdx can optionally be provided if it's already been computed.
 // If the netIdx is provided, it is assumed that the client has already
diff --git a/nomad/structs/funcs_test.go b/nomad/structs/funcs_test.go
index 504cc3a8e486..d802274f6f38 100644
--- a/nomad/structs/funcs_test.go
+++ b/nomad/structs/funcs_test.go
@@ -335,8 +335,8 @@ func TestAllocsFit(t *testing.T) {
 			DiskMB: 5000,
 			Networks: Networks{
 				{
-					Mode: "host",
-					IP:   "10.0.0.1",
+					Mode:          "host",
+					IP:            "10.0.0.1",
 					ReservedPorts: []Port{{"main", 8000, 0, ""}},
 				},
 			},
diff --git a/nomad/structs/operator.go b/nomad/structs/operator.go
index 8a3afef9f154..4960369219ec 100644
--- a/nomad/structs/operator.go
+++ b/nomad/structs/operator.go
@@ -205,6 +205,9 @@ type PreemptionConfig struct {
 	// SystemSchedulerEnabled specifies if preemption is enabled for system jobs
 	SystemSchedulerEnabled bool `hcl:"system_scheduler_enabled"`
 
+	// SysBatchSchedulerEnabled specifies if preemption is enabled for sysbatch jobs
+	SysBatchSchedulerEnabled bool `hcl:"sysbatch_scheduler_enabled"`
+
 	// BatchSchedulerEnabled specifies if preemption is enabled for batch jobs
 	BatchSchedulerEnabled bool `hcl:"batch_scheduler_enabled"`
 
diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go
index 481b157ba038..b8b93c568c4e 100644
--- a/nomad/structs/structs.go
+++ b/nomad/structs/structs.go
@@ -3761,10 +3761,11 @@ func (c *ComparableResources) NetIndex(n *NetworkResource) int {
 const (
 	// JobTypeNomad is reserved for internal system tasks and is
 	// always handled by the CoreScheduler.
- JobTypeCore = "_core" - JobTypeService = "service" - JobTypeBatch = "batch" - JobTypeSystem = "system" + JobTypeCore = "_core" + JobTypeService = "service" + JobTypeBatch = "batch" + JobTypeSystem = "system" + JobTypeSysBatch = "sysbatch" ) const ( @@ -4027,7 +4028,7 @@ func (j *Job) Validate() error { mErr.Errors = append(mErr.Errors, errors.New("Job must be in a namespace")) } switch j.Type { - case JobTypeCore, JobTypeService, JobTypeBatch, JobTypeSystem: + case JobTypeCore, JobTypeService, JobTypeBatch, JobTypeSystem, JobTypeSysBatch: case "": mErr.Errors = append(mErr.Errors, errors.New("Missing job type")) default: @@ -4119,11 +4120,12 @@ func (j *Job) Validate() error { } } - // Validate periodic is only used with batch jobs. + // Validate periodic is only used with batch or sysbatch jobs. if j.IsPeriodic() && j.Periodic.Enabled { - if j.Type != JobTypeBatch { - mErr.Errors = append(mErr.Errors, - fmt.Errorf("Periodic can only be used with %q scheduler", JobTypeBatch)) + if j.Type != JobTypeBatch && j.Type != JobTypeSysBatch { + mErr.Errors = append(mErr.Errors, fmt.Errorf( + "Periodic can only be used with %q or %q scheduler", JobTypeBatch, JobTypeSysBatch, + )) } if err := j.Periodic.Validate(); err != nil { @@ -4132,9 +4134,10 @@ func (j *Job) Validate() error { } if j.IsParameterized() { - if j.Type != JobTypeBatch { - mErr.Errors = append(mErr.Errors, - fmt.Errorf("Parameterized job can only be used with %q scheduler", JobTypeBatch)) + if j.Type != JobTypeBatch && j.Type != JobTypeSysBatch { + mErr.Errors = append(mErr.Errors, fmt.Errorf( + "Parameterized job can only be used with %q or %q scheduler", JobTypeBatch, JobTypeSysBatch, + )) } if err := j.ParameterizedJob.Validate(); err != nil { diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index c67eafad870a..b933deb1eb21 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -36,7 +36,7 @@ const ( // allocInPlace is the status used when speculating on an in-place update allocInPlace = "alloc updating in-place" - // allocNodeTainted is the status used when stopping an alloc because it's + // allocNodeTainted is the status used when stopping an alloc because its // node is tainted. allocNodeTainted = "alloc not needed as node is tainted" diff --git a/scheduler/rank.go b/scheduler/rank.go index 1653d9cf9067..ec4b2635d423 100644 --- a/scheduler/rank.go +++ b/scheduler/rank.go @@ -24,7 +24,7 @@ type RankedNode struct { TaskLifecycles map[string]*structs.TaskLifecycleConfig AllocResources *structs.AllocatedSharedResources - // Allocs is used to cache the proposed allocations on the + // Proposed is used to cache the proposed allocations on the // node. This can be shared between iterators that require it. Proposed []*structs.Allocation @@ -60,7 +60,7 @@ func (r *RankedNode) SetTaskResources(task *structs.Task, r.TaskLifecycles[task.Name] = task.Lifecycle } -// RankFeasibleIterator is used to iteratively yield nodes along +// RankIterator is used to iteratively yield nodes along // with ranking metadata. The iterators may manage some state for // performance optimizations. 
type RankIterator interface { diff --git a/scheduler/scheduler.go b/scheduler/scheduler.go index a950690db44f..d1bbfa4c3e41 100644 --- a/scheduler/scheduler.go +++ b/scheduler/scheduler.go @@ -21,9 +21,10 @@ const ( // BuiltinSchedulers contains the built in registered schedulers // which are available var BuiltinSchedulers = map[string]Factory{ - "service": NewServiceScheduler, - "batch": NewBatchScheduler, - "system": NewSystemScheduler, + "service": NewServiceScheduler, + "batch": NewBatchScheduler, + "system": NewSystemScheduler, + "sysbatch": NewSysBatchScheduler, } // NewScheduler is used to instantiate and return a new scheduler diff --git a/scheduler/stack.go b/scheduler/stack.go index bccabc7899ab..cf01c2992afe 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -198,8 +198,12 @@ type SystemStack struct { scoreNorm *ScoreNormalizationIterator } -// NewSystemStack constructs a stack used for selecting system job placements. -func NewSystemStack(ctx Context) *SystemStack { +// NewSystemStack constructs a stack used for selecting system and sysbatch +// job placements. +// +// sysbatch is used to determine which scheduler config option is used to +// control the use of preemption. +func NewSystemStack(sysbatch bool, ctx Context) *SystemStack { // Create a new stack s := &SystemStack{ctx: ctx} @@ -237,10 +241,13 @@ func NewSystemStack(ctx Context) *SystemStack { // previously been marked as eligible or ineligible. Generally this will be // checks that only needs to examine the single node to determine feasibility. jobs := []FeasibilityChecker{s.jobConstraint} - tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint, + tgs := []FeasibilityChecker{ + s.taskGroupDrivers, + s.taskGroupConstraint, s.taskGroupHostVolumes, s.taskGroupDevices, - s.taskGroupNetwork} + s.taskGroupNetwork, + } avail := []FeasibilityChecker{s.taskGroupCSIVolumes} s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs, avail) @@ -257,9 +264,14 @@ func NewSystemStack(ctx Context) *SystemStack { schedulerAlgorithm := schedConfig.EffectiveSchedulerAlgorithm() enablePreemption := true if schedConfig != nil { - enablePreemption = schedConfig.PreemptionConfig.SystemSchedulerEnabled + if sysbatch { + enablePreemption = schedConfig.PreemptionConfig.SysBatchSchedulerEnabled + } else { + enablePreemption = schedConfig.PreemptionConfig.SystemSchedulerEnabled + } } + // Create binpack iterator s.binPack = NewBinPackIterator(ctx, rankSource, enablePreemption, 0, schedulerAlgorithm) // Apply score normalization @@ -360,11 +372,13 @@ func NewGenericStack(batch bool, ctx Context) *GenericStack { // previously been marked as eligible or ineligible. Generally this will be // checks that only needs to examine the single node to determine feasibility. 
jobs := []FeasibilityChecker{s.jobConstraint} - tgs := []FeasibilityChecker{s.taskGroupDrivers, + tgs := []FeasibilityChecker{ + s.taskGroupDrivers, s.taskGroupConstraint, s.taskGroupHostVolumes, s.taskGroupDevices, - s.taskGroupNetwork} + s.taskGroupNetwork, + } avail := []FeasibilityChecker{s.taskGroupCSIVolumes} s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs, avail) diff --git a/scheduler/stack_test.go b/scheduler/stack_test.go index 4650546d32f1..b45d91bc5d16 100644 --- a/scheduler/stack_test.go +++ b/scheduler/stack_test.go @@ -389,7 +389,7 @@ func TestServiceStack_Select_BinPack_Overflow(t *testing.T) { func TestSystemStack_SetNodes(t *testing.T) { _, ctx := testContext(t) - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) nodes := []*structs.Node{ mock.Node(), @@ -411,7 +411,7 @@ func TestSystemStack_SetNodes(t *testing.T) { func TestSystemStack_SetJob(t *testing.T) { _, ctx := testContext(t) - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) job := mock.Job() stack.SetJob(job) @@ -427,7 +427,7 @@ func TestSystemStack_SetJob(t *testing.T) { func TestSystemStack_Select_Size(t *testing.T) { _, ctx := testContext(t) nodes := []*structs.Node{mock.Node()} - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) stack.SetNodes(nodes) job := mock.Job() @@ -455,7 +455,7 @@ func TestSystemStack_Select_MetricsReset(t *testing.T) { mock.Node(), mock.Node(), } - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) stack.SetNodes(nodes) job := mock.Job() @@ -491,7 +491,7 @@ func TestSystemStack_Select_DriverFilter(t *testing.T) { zero := nodes[0] zero.Attributes["driver.foo"] = "1" - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) stack.SetNodes(nodes) job := mock.Job() @@ -513,7 +513,7 @@ func TestSystemStack_Select_DriverFilter(t *testing.T) { t.Fatalf("ComputedClass() failed: %v", err) } - stack = NewSystemStack(ctx) + stack = NewSystemStack(false, ctx) stack.SetNodes(nodes) stack.SetJob(job) node = stack.Select(job.TaskGroups[0], selectOptions) @@ -534,7 +534,7 @@ func TestSystemStack_Select_ConstraintFilter(t *testing.T) { t.Fatalf("ComputedClass() failed: %v", err) } - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) stack.SetNodes(nodes) job := mock.Job() @@ -577,7 +577,7 @@ func TestSystemStack_Select_BinPack_Overflow(t *testing.T) { } one := nodes[1] - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) stack.SetNodes(nodes) job := mock.Job() diff --git a/scheduler/system_sched.go b/scheduler/system_scheduler.go similarity index 85% rename from scheduler/system_sched.go rename to scheduler/system_scheduler.go index 4b1e5c8cbfaa..53e4b4eefbb0 100644 --- a/scheduler/system_sched.go +++ b/scheduler/system_scheduler.go @@ -14,15 +14,21 @@ const ( // we will attempt to schedule if we continue to hit conflicts for system // jobs. maxSystemScheduleAttempts = 5 + + // maxSysBatchScheduleAttempts is used to limit the number of times we will + // attempt to schedule if we continue to hit conflicts for sysbatch jobs. + maxSysBatchScheduleAttempts = 2 ) -// SystemScheduler is used for 'system' jobs. This scheduler is -// designed for services that should be run on every client. -// One for each job, containing an allocation for each node +// SystemScheduler is used for 'system' and 'sysbatch' jobs. This scheduler is +// designed for jobs that should be run on every client. 
The 'system' mode
+// will ensure those jobs continuously run regardless of successful task exits,
+// whereas 'sysbatch' considers the task complete on success.
 type SystemScheduler struct {
-	logger  log.Logger
-	state   State
-	planner Planner
+	logger   log.Logger
+	state    State
+	planner  Planner
+	sysbatch bool
 
 	eval       *structs.Evaluation
 	job        *structs.Job
@@ -30,8 +36,9 @@ type SystemScheduler struct {
 	planResult *structs.PlanResult
 	ctx        *EvalContext
 	stack      *SystemStack
-	nodes      []*structs.Node
-	nodesByDC  map[string]int
+
+	nodes     []*structs.Node
+	nodesByDC map[string]int
 
 	limitReached bool
 	nextEval     *structs.Evaluation
@@ -44,14 +51,25 @@
 // scheduler.
 func NewSystemScheduler(logger log.Logger, state State, planner Planner) Scheduler {
 	return &SystemScheduler{
-		logger:  logger.Named("system_sched"),
-		state:   state,
-		planner: planner,
+		logger:   logger.Named("system_sched"),
+		state:    state,
+		planner:  planner,
+		sysbatch: false,
+	}
+}
+
+// NewSysBatchScheduler is a factory function to instantiate a new sysbatch
+// scheduler.
+func NewSysBatchScheduler(logger log.Logger, state State, planner Planner) Scheduler {
+	return &SystemScheduler{
+		logger:   logger.Named("sysbatch_sched"),
+		state:    state,
+		planner:  planner,
+		sysbatch: true,
 	}
 }
 
 // Process is used to handle a single evaluation.
 func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
+
 	// Store the evaluation
 	s.eval = eval
 
@@ -59,21 +77,20 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
 	s.logger = s.logger.With("eval_id", eval.ID, "job_id", eval.JobID, "namespace", eval.Namespace)
 
 	// Verify the evaluation trigger reason is understood
-	switch eval.TriggeredBy {
-	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, structs.EvalTriggerFailedFollowUp,
-		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate, structs.EvalTriggerPreemption,
-		structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerNodeDrain, structs.EvalTriggerAllocStop,
-		structs.EvalTriggerQueuedAllocs, structs.EvalTriggerScaling:
-	default:
-		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
-			eval.TriggeredBy)
+	if !s.canHandle(eval.TriggeredBy) {
+		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy)
 		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs,
 			structs.EvalStatusFailed, desc, s.queuedAllocs, "")
 	}
 
+	limit := maxSystemScheduleAttempts
+	if s.sysbatch {
+		limit = maxSysBatchScheduleAttempts
+	}
+
 	// Retry up to the scheduler-specific limit and reset if progress is made.
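 	// (progressMade treats a plan that submitted any allocation placements or
 	// updates as progress)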
progress := func() bool { return progressMade(s.planResult) } - if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil { + if err := retryMax(limit, s.process, progress); err != nil { if statusErr, ok := err.(*SetStatusError); ok { return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, statusErr.EvalStatus, err.Error(), s.queuedAllocs, "") @@ -94,9 +111,9 @@ func (s *SystemScheduler) process() (bool, error) { ws := memdb.NewWatchSet() s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID) if err != nil { - return false, fmt.Errorf("failed to get job '%s': %v", - s.eval.JobID, err) + return false, fmt.Errorf("failed to get job '%s': %v", s.eval.JobID, err) } + numTaskGroups := 0 if !s.job.Stopped() { numTaskGroups = len(s.job.TaskGroups) @@ -121,7 +138,7 @@ func (s *SystemScheduler) process() (bool, error) { s.ctx = NewEvalContext(s.state, s.plan, s.logger) // Construct the placement stack - s.stack = NewSystemStack(s.ctx) + s.stack = NewSystemStack(s.sysbatch, s.ctx) if !s.job.Stopped() { s.stack.SetJob(s.job) } @@ -185,26 +202,24 @@ func (s *SystemScheduler) computeJobAllocs() error { ws := memdb.NewWatchSet() allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true) if err != nil { - return fmt.Errorf("failed to get allocs for job '%s': %v", - s.eval.JobID, err) + return fmt.Errorf("failed to get allocs for job '%s': %v", s.eval.JobID, err) } // Determine the tainted nodes containing job allocs tainted, err := taintedNodes(s.state, allocs) if err != nil { - return fmt.Errorf("failed to get tainted nodes for job '%s': %v", - s.eval.JobID, err) + return fmt.Errorf("failed to get tainted nodes for job '%s': %v", s.eval.JobID, err) } // Update the allocations which are in pending/running state on tainted - // nodes to lost + // nodes to lost. 
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)
 
-	// Filter out the allocations in a terminal state
-	allocs, terminalAllocs := structs.FilterTerminalAllocs(allocs)
+	// Split out terminal allocations
+	live, term := structs.SplitTerminalAllocs(allocs)
 
 	// Diff the required and existing allocations
-	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs, terminalAllocs)
+	diff := diffSystemAllocs(s.job, s.nodes, tainted, live, term)
 
 	s.logger.Debug("reconciled current state with desired state",
 		"place", len(diff.place), "update", len(diff.update),
 		"migrate", len(diff.migrate), "stop", len(diff.stop),
@@ -423,3 +438,27 @@ func (s *SystemScheduler) addBlocked(node *structs.Node) error {
 
 	return s.planner.CreateEval(blocked)
 }
+
+func (s *SystemScheduler) canHandle(trigger string) bool {
+	switch trigger {
+	case structs.EvalTriggerJobRegister:
+	case structs.EvalTriggerNodeUpdate:
+	case structs.EvalTriggerFailedFollowUp:
+	case structs.EvalTriggerJobDeregister:
+	case structs.EvalTriggerRollingUpdate:
+	case structs.EvalTriggerPreemption:
+	case structs.EvalTriggerDeploymentWatcher:
+	case structs.EvalTriggerNodeDrain:
+	case structs.EvalTriggerAllocStop:
+	case structs.EvalTriggerQueuedAllocs:
+	case structs.EvalTriggerScaling:
+	default:
+		// only the sysbatch scheduler can additionally handle periodic evaluations
+		if s.sysbatch {
+			return trigger == structs.EvalTriggerPeriodicJob
+		}
+		return false
+	}
+	return true
+}
diff --git a/scheduler/system_sysbatch_test.go b/scheduler/system_sysbatch_test.go
new file mode 100644
index 000000000000..1bbfffc02698
--- /dev/null
+++ b/scheduler/system_sysbatch_test.go
@@ -0,0 +1,1623 @@
+package scheduler
+
+import (
+	"fmt"
+	"sort"
+	"testing"
+
+	"github.com/hashicorp/go-memdb"
+	"github.com/hashicorp/nomad/helper"
+	"github.com/hashicorp/nomad/helper/uuid"
+	"github.com/hashicorp/nomad/nomad/mock"
+	"github.com/hashicorp/nomad/nomad/structs"
+	"github.com/stretchr/testify/require"
+)
+
+func TestSysBatch_JobRegister(t *testing.T) {
+	h := NewHarness(t)
+
+	// Create some nodes
+	_ = createNodes(t, h, 10)
+
+	// Create a job
+	job := mock.SystemBatchJob()
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create a mock evaluation to register the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure a single plan
+	require.Len(t, h.Plans, 1)
+	plan := h.Plans[0]
+
+	// Ensure the plan does not have annotations
+	require.Nil(t, plan.Annotations, "expected no annotations")
+
+	// Ensure the plan allocated
+	var planned []*structs.Allocation
+	for _, allocList := range plan.NodeAllocation {
+		planned = append(planned, allocList...)
+ } + require.Len(t, planned, 10) + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + require.NoError(t, err) + + // Ensure all allocations placed + require.Len(t, out, 10) + + // Check the available nodes + count, ok := out[0].Metrics.NodesAvailable["dc1"] + require.True(t, ok) + require.Equal(t, 10, count, "bad metrics %#v:", out[0].Metrics) + + // Ensure no allocations are queued + queued := h.Evals[0].QueuedAllocations["my-sysbatch"] + require.Equal(t, 0, queued, "unexpected queued allocations") + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_JobRegister_AddNode_Running(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + nodes := createNodes(t, h, 10) + + // Generate a fake sysbatch job with allocations + job := mock.SystemBatchJob() + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + var allocs []*structs.Allocation + for _, node := range nodes { + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + alloc.ClientStatus = structs.AllocClientStatusRunning + allocs = append(allocs, alloc) + } + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + // Add a new node. + node := mock.Node() + require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + + // Create a mock evaluation to deal with the node update + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan had no node updates + var update []*structs.Allocation + for _, updateList := range plan.NodeUpdate { + update = append(update, updateList...) + } + require.Empty(t, update) + + // Ensure the plan allocated on the new node + var planned []*structs.Allocation + for _, allocList := range plan.NodeAllocation { + planned = append(planned, allocList...) 
+ } + require.Len(t, planned, 1) + + // Ensure it allocated on the right node + _, ok := plan.NodeAllocation[node.ID] + require.True(t, ok, "allocated on wrong node: %#v", plan) + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + require.NoError(t, err) + + // Ensure all allocations placed + out, _ = structs.FilterTerminalAllocs(out) + require.Len(t, out, 11) + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_JobRegister_AddNode_Dead(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + nodes := createNodes(t, h, 10) + + // Generate a dead sysbatch job with complete allocations + job := mock.SystemBatchJob() + job.Status = structs.JobStatusDead // job is dead but not stopped + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + var allocs []*structs.Allocation + for _, node := range nodes { + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + alloc.ClientStatus = structs.AllocClientStatusComplete + allocs = append(allocs, alloc) + } + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + // Add a new node. + node := mock.Node() + require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + + // Create a mock evaluation to deal with the node update + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan has no node update + var update []*structs.Allocation + for _, updateList := range plan.NodeUpdate { + update = append(update, updateList...) + } + require.Len(t, update, 0) + + // Ensure the plan allocates on the new node + var planned []*structs.Allocation + for _, allocList := range plan.NodeAllocation { + planned = append(planned, allocList...) 
+	}
+	require.Len(t, planned, 1)
+
+	// Ensure it allocated on the right node
+	_, ok := plan.NodeAllocation[node.ID]
+	require.True(t, ok, "allocated on wrong node: %#v", plan)
+
+	// Lookup the allocations by JobID
+	ws := memdb.NewWatchSet()
+	out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
+	require.NoError(t, err)
+
+	// Ensure 1 non-terminal allocation
+	live, _ := structs.FilterTerminalAllocs(out)
+	require.Len(t, live, 1)
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
+
+func TestSysBatch_JobModify(t *testing.T) {
+	h := NewHarness(t)
+
+	// Create some nodes
+	nodes := createNodes(t, h, 10)
+
+	// Generate a fake sysbatch job with allocations
+	job := mock.SystemBatchJob()
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	var allocs []*structs.Allocation
+	for _, node := range nodes {
+		alloc := mock.SysBatchAlloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = node.ID
+		alloc.Name = "my-sysbatch.pinger[0]"
+		alloc.ClientStatus = structs.AllocClientStatusPending
+		allocs = append(allocs, alloc)
+	}
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs))
+
+	// Add a few terminal-status allocations; these should be reinstated
+	var terminal []*structs.Allocation
+	for i := 0; i < 5; i++ {
+		alloc := mock.SysBatchAlloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = nodes[i].ID
+		alloc.Name = "my-sysbatch.pinger[0]"
+		alloc.ClientStatus = structs.AllocClientStatusComplete
+		terminal = append(terminal, alloc)
+	}
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), terminal))
+
+	// Update the job
+	job2 := mock.SystemBatchJob()
+	job2.ID = job.ID
+
+	// Update the task such that the change cannot be done in-place
+	job2.TaskGroups[0].Tasks[0].Config["command"] = "/bin/other"
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job2))
+
+	// Create a mock evaluation to deal with the job update
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure a single plan
+	require.Len(t, h.Plans, 1)
+	plan := h.Plans[0]
+
+	// Ensure the plan evicted all allocs
+	var update []*structs.Allocation
+	for _, updateList := range plan.NodeUpdate {
+		update = append(update, updateList...)
+	}
+	require.Equal(t, len(allocs), len(update))
+
+	// Ensure the plan allocated
+	var planned []*structs.Allocation
+	for _, allocList := range plan.NodeAllocation {
+		planned = append(planned, allocList...)
+ } + require.Len(t, planned, 10) + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + require.NoError(t, err) + + // Ensure all allocations placed + out, _ = structs.FilterTerminalAllocs(out) + require.Len(t, out, 10) + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_JobModify_InPlace(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + nodes := createNodes(t, h, 10) + + job := mock.SystemBatchJob() + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + var allocs []*structs.Allocation + for _, node := range nodes { + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + allocs = append(allocs, alloc) + } + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + // Update the job + job2 := mock.SystemBatchJob() + job2.ID = job.ID + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job2)) + + // Create a mock evaluation to deal with update + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan did not evict any allocs + var update []*structs.Allocation + for _, updateList := range plan.NodeUpdate { + update = append(update, updateList...) + } + require.Empty(t, update) + + // Ensure the plan updated the existing allocs + var planned []*structs.Allocation + for _, allocList := range plan.NodeAllocation { + planned = append(planned, allocList...) 
+ } + require.Len(t, planned, 10) + + for _, p := range planned { + require.Equal(t, job2, p.Job, "should update job") + } + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + require.NoError(t, err) + + // Ensure all allocations placed + require.Len(t, out, 10) + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_JobDeregister_Purged(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + nodes := createNodes(t, h, 10) + + // Create a sysbatch job + job := mock.SystemBatchJob() + + var allocs []*structs.Allocation + for _, node := range nodes { + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + allocs = append(allocs, alloc) + } + for _, alloc := range allocs { + require.NoError(t, h.State.UpsertJobSummary(h.NextIndex(), mock.JobSysBatchSummary(alloc.JobID))) + } + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + // Create a mock evaluation to deregister the job + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerJobDeregister, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan evicted the job from all nodes. + for _, node := range nodes { + require.Len(t, plan.NodeUpdate[node.ID], 1) + } + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + require.NoError(t, err) + + // Ensure no remaining allocations + out, _ = structs.FilterTerminalAllocs(out) + require.Empty(t, out) + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_JobDeregister_Stopped(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + nodes := createNodes(t, h, 10) + + // Generate a stopped sysbatch job with allocations + job := mock.SystemBatchJob() + job.Stop = true + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + var allocs []*structs.Allocation + for _, node := range nodes { + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + allocs = append(allocs, alloc) + } + for _, alloc := range allocs { + require.NoError(t, h.State.UpsertJobSummary(h.NextIndex(), mock.JobSysBatchSummary(alloc.JobID))) + } + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + // Create a mock evaluation to deregister the job + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerJobDeregister, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan evicted the job from all nodes. 
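+	// (a stopped job should be drained of its allocations just like a purged one)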
+	for _, node := range nodes {
+		require.Len(t, plan.NodeUpdate[node.ID], 1)
+	}
+
+	// Lookup the allocations by JobID
+	ws := memdb.NewWatchSet()
+	out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
+	require.NoError(t, err)
+
+	// Ensure no remaining allocations
+	out, _ = structs.FilterTerminalAllocs(out)
+	require.Empty(t, out)
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
+
+func TestSysBatch_NodeDown(t *testing.T) {
+	h := NewHarness(t)
+
+	// Register a down node
+	node := mock.Node()
+	node.Status = structs.NodeStatusDown
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// Generate a sysbatch job allocated on that node
+	job := mock.SystemBatchJob()
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	alloc := mock.SysBatchAlloc()
+	alloc.Job = job
+	alloc.JobID = job.ID
+	alloc.NodeID = node.ID
+	alloc.Name = "my-sysbatch.pinger[0]"
+	alloc.DesiredTransition.Migrate = helper.BoolToPtr(true)
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc}))
+
+	// Create a mock evaluation to deal with the node going down
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		NodeID:      node.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure a single plan
+	require.Len(t, h.Plans, 1)
+	plan := h.Plans[0]
+
+	// Ensure the plan evicted all allocs
+	require.Len(t, plan.NodeUpdate[node.ID], 1)
+
+	// Ensure the plan updated the allocation.
+	planned := make([]*structs.Allocation, 0)
+	for _, allocList := range plan.NodeUpdate {
+		planned = append(planned, allocList...)
+	}
+	require.Len(t, planned, 1)
+
+	// Ensure the allocation is stopped
+	p := planned[0]
+	require.Equal(t, structs.AllocDesiredStatusStop, p.DesiredStatus)
+	// note: the old assertion on ClientStatus == lost was incorrect and has
+	// been removed; the actual client status at this point is pending
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
+
+func TestSysBatch_NodeDrain_Down(t *testing.T) {
+	h := NewHarness(t)
+
+	// Register a draining node
+	node := mock.Node()
+	node.Drain = true
+	node.Status = structs.NodeStatusDown
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// Generate a sysbatch job allocated on that node.
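+	// Because the node is down as well as draining, the scheduler should
+	// mark the alloc lost rather than migrate it.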
+	job := mock.SystemBatchJob()
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	alloc := mock.SysBatchAlloc()
+	alloc.Job = job
+	alloc.JobID = job.ID
+	alloc.NodeID = node.ID
+	alloc.Name = "my-sysbatch.pinger[0]"
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc}))
+
+	// Create a mock evaluation to deal with the node update
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		NodeID:      node.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure a single plan
+	require.Len(t, h.Plans, 1)
+	plan := h.Plans[0]
+
+	// Ensure the plan evicted non-terminal allocs
+	require.Len(t, plan.NodeUpdate[node.ID], 1)
+
+	// Ensure that the allocation is marked as lost
+	var lost []string
+	for _, alloc := range plan.NodeUpdate[node.ID] {
+		lost = append(lost, alloc.ID)
+	}
+	require.Equal(t, []string{alloc.ID}, lost)
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
+
+func TestSysBatch_NodeDrain(t *testing.T) {
+	h := NewHarness(t)
+
+	// Register a draining node
+	node := mock.Node()
+	node.Drain = true
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// Generate a sysbatch job allocated on that node.
+	job := mock.SystemBatchJob()
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	alloc := mock.SysBatchAlloc()
+	alloc.Job = job
+	alloc.JobID = job.ID
+	alloc.NodeID = node.ID
+	alloc.Name = "my-sysbatch.pinger[0]"
+	alloc.DesiredTransition.Migrate = helper.BoolToPtr(true)
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc}))
+
+	// Create a mock evaluation to deal with drain
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		NodeID:      node.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure a single plan
+	require.Len(t, h.Plans, 1)
+	plan := h.Plans[0]
+
+	// Ensure the plan evicted all allocs
+	require.Len(t, plan.NodeUpdate[node.ID], 1)
+
+	// Ensure the plan updated the allocation.
+	planned := make([]*structs.Allocation, 0)
+	for _, allocList := range plan.NodeUpdate {
+		planned = append(planned, allocList...)
+	}
+	require.Len(t, planned, 1)
+
+	// Ensure the allocation is stopped
+	require.Equal(t, structs.AllocDesiredStatusStop, planned[0].DesiredStatus)
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
+
+func TestSysBatch_NodeUpdate(t *testing.T) {
+	h := NewHarness(t)
+
+	// Register a node
+	node := mock.Node()
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// Generate a sysbatch job allocated on that node.
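+	// An update for an otherwise healthy node should not queue any new work.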
+	job := mock.SystemBatchJob()
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	alloc := mock.SysBatchAlloc()
+	alloc.Job = job
+	alloc.JobID = job.ID
+	alloc.NodeID = node.ID
+	alloc.Name = "my-sysbatch.pinger[0]"
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc}))
+
+	// Create a mock evaluation to deal with the node update
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		NodeID:      node.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure that queued allocations is zero
+	val, ok := h.Evals[0].QueuedAllocations["pinger"]
+	require.True(t, ok)
+	require.Zero(t, val)
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
+
+func TestSysBatch_RetryLimit(t *testing.T) {
+	h := NewHarness(t)
+	h.Planner = &RejectPlan{h}
+
+	// Create some nodes
+	_ = createNodes(t, h, 10)
+
+	// Create a job
+	job := mock.SystemBatchJob()
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create a mock evaluation to register the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure multiple plans
+	require.NotEmpty(t, h.Plans)
+
+	// Lookup the allocations by JobID
+	ws := memdb.NewWatchSet()
+	out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
+	require.NoError(t, err)
+
+	// Ensure no allocations placed
+	require.Empty(t, out)
+
+	// Should hit the retry limit
+	h.AssertEvalStatus(t, structs.EvalStatusFailed)
+}
+
+// This test ensures that the scheduler doesn't increment the queued allocation
+// count for a task group when allocations can't be created on currently
+// available nodes because of constraint mismatches.
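+// (Here the mismatch is assumed to come from the mock sysbatch job's linux
+// kernel constraint, which the darwin node registered below cannot satisfy.)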
+func TestSysBatch_Queued_With_Constraints(t *testing.T) {
+	h := NewHarness(t)
+
+	// Register a node
+	node := mock.Node()
+	node.Attributes["kernel.name"] = "darwin"
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// Generate a sysbatch job which can't be placed on the node
+	job := mock.SystemBatchJob()
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create a mock evaluation to deal with the node update
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		NodeID:      node.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure that queued allocations is zero
+	val, ok := h.Evals[0].QueuedAllocations["pinger"]
+	require.True(t, ok)
+	require.Zero(t, val)
+}
+
+// This test ensures that the scheduler correctly ignores ineligible
+// nodes when scheduling due to a new node being added. The job has two
+// task groups constrained to a particular node class. The desired behavior
+// should be that the TaskGroup constrained to the newly added node class is
+// added and that the TaskGroup constrained to the ineligible node is ignored.
+func TestSysBatch_JobConstraint_AddNode(t *testing.T) {
+	h := NewHarness(t)
+
+	// Create two nodes
+	node := mock.Node()
+	node.NodeClass = "Class-A"
+	require.NoError(t, node.ComputeClass())
+	require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	nodeB := mock.Node()
+	nodeB.NodeClass = "Class-B"
+	require.NoError(t, nodeB.ComputeClass())
+	require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), nodeB))
+
+	// Make a sysbatch job with two task groups, each constrained to a node class
+	job := mock.SystemBatchJob()
+	tgA := job.TaskGroups[0]
+	tgA.Name = "groupA"
+	tgA.Constraints = []*structs.Constraint{{
+		LTarget: "${node.class}",
+		RTarget: node.NodeClass,
+		Operand: "=",
+	}}
+	tgB := job.TaskGroups[0].Copy()
+	tgB.Name = "groupB"
+	tgB.Constraints = []*structs.Constraint{{
+		LTarget: "${node.class}",
+		RTarget: nodeB.NodeClass,
+		Operand: "=",
+	}}
+
+	// Upsert the job
+	job.TaskGroups = []*structs.TaskGroup{tgA, tgB}
+	require.Nil(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Evaluate the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval))
+	require.Equal(t, "complete", h.Evals[0].Status)
+
+	// QueuedAllocations is drained
+	val, ok := h.Evals[0].QueuedAllocations["groupA"]
+	require.True(t, ok)
+	require.Equal(t, 0, val)
+
+	val, ok = h.Evals[0].QueuedAllocations["groupB"]
+	require.True(t, ok)
+	require.Equal(t, 0, val)
+
+	// Single plan with two NodeAllocations
+	require.Len(t, h.Plans, 1)
+	require.Len(t, h.Plans[0].NodeAllocation, 2)
+
+	// Mark the node as ineligible
+	node.SchedulingEligibility = structs.NodeSchedulingIneligible
+
+	// Evaluate the node update
+	eval2 := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		NodeID:      node.ID,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval2}))
+
+	// Process the 2nd evaluation
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval2))
+	require.Equal(t, "complete", h.Evals[1].Status)
+
+	// Ensure no new plans
+	require.Equal(t, 1, len(h.Plans))
+
+	// Ensure all NodeAllocations are from the first eval
+	for _, allocs := range h.Plans[0].NodeAllocation {
+		require.Len(t, allocs, 1)
+		require.Equal(t, eval.ID, allocs[0].EvalID)
+	}
+
+	// Add a new Class-B node
+	nodeBTwo := mock.Node()
+	nodeBTwo.NodeClass = "Class-B"
+	require.NoError(t, nodeBTwo.ComputeClass())
+	require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), nodeBTwo))
+
+	// Evaluate the new node
+	eval3 := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		NodeID:      nodeBTwo.ID,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+
+	// Ensure the 3rd eval is complete
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval3}))
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval3))
+	require.Equal(t, "complete", h.Evals[2].Status)
+
+	// Ensure no failed TG allocs
+	require.Equal(t, 0, len(h.Evals[2].FailedTGAllocs))
+
+	require.Len(t, h.Plans, 2)
+	require.Len(t, h.Plans[1].NodeAllocation, 1)
+	// Ensure all NodeAllocations are from the third eval
+	for _, allocs := range h.Plans[1].NodeAllocation {
+		require.Len(t, allocs, 1)
+		require.Equal(t, eval3.ID, allocs[0].EvalID)
+	}
+
+	ws := memdb.NewWatchSet()
+
+	allocsNodeOne, err := h.State.AllocsByNode(ws, node.ID)
+	require.NoError(t, err)
+	require.Len(t, allocsNodeOne, 1)
+
+	allocsNodeTwo, err := h.State.AllocsByNode(ws, nodeB.ID)
+	require.NoError(t, err)
+	require.Len(t, allocsNodeTwo, 1)
+
+	allocsNodeThree, err := h.State.AllocsByNode(ws, nodeBTwo.ID)
+	require.NoError(t, err)
+	require.Len(t, allocsNodeThree, 1)
+}
+
+// Ensure no errors are reported when a lack of eligible nodes prevents placement
+func TestSysBatch_ExistingAllocNoNodes(t *testing.T) {
+	h := NewHarness(t)
+
+	// Create a node
+	node := mock.Node()
+	require.NoError(t, node.ComputeClass())
+	require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// Make a sysbatch job
+	job := mock.SystemBatchJob()
+	job.Meta = map[string]string{"version": "1"}
+	require.Nil(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Evaluate the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval))
+	require.Equal(t, "complete", h.Evals[0].Status)
+
+	// QueuedAllocations is drained
+	val, ok := h.Evals[0].QueuedAllocations["pinger"]
+	require.True(t, ok)
+	require.Equal(t, 0, val)
+
+	// A single plan was created
+	require.Equal(t, 1, len(h.Plans))
+
+	// Mark the node as ineligible
+	node.SchedulingEligibility = structs.NodeSchedulingIneligible
+
+	// Evaluate the job
+	eval2 := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		NodeID:      node.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval2}))
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval2))
+	require.Equal(t, "complete", h.Evals[1].Status)
+
+	// Create a new job version and deploy it
+	job2 := job.Copy()
+	job2.Meta["version"] = "2"
+	require.Nil(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job2))
+
+	// Run the evaluation as a plan
+	eval3 := &structs.Evaluation{
+		Namespace:    structs.DefaultNamespace,
+		ID:           uuid.Generate(),
+		Priority:     job2.Priority,
+		TriggeredBy:  structs.EvalTriggerJobRegister,
+		JobID:        job2.ID,
+		Status:       structs.EvalStatusPending,
+		AnnotatePlan: true,
+	}
+
+	// Ensure the new eval is complete
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval3}))
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval3))
+	require.Equal(t, "complete", h.Evals[2].Status)
+
+	// Ensure there are no FailedTGAllocs and nothing queued
+	require.Equal(t, 0, len(h.Evals[2].FailedTGAllocs))
+	require.Equal(t, 0, h.Evals[2].QueuedAllocations["pinger"])
+}
+
+func TestSysBatch_ConstraintErrors(t *testing.T) {
+	h := NewHarness(t)
+
+	// Register some nodes; node is declared outside the loop so the last
+	// one can be marked ineligible below.
+	// The tag "aaaaaa" is hashed so that the nodes are processed
+	// in an order other than good, good, bad
+	var node *structs.Node
+	for _, tag := range []string{"aaaaaa", "foo", "foo", "foo"} {
+		node = mock.Node()
+		node.Meta["tag"] = tag
+		require.NoError(t, node.ComputeClass())
+		require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+	}
+
+	// Mark the last node as ineligible
+	node.SchedulingEligibility = structs.NodeSchedulingIneligible
+
+	// Make a job with a constraint that matches a subset of the nodes
+	job := mock.SystemBatchJob()
+	job.Priority = 100
+	job.Constraints = append(job.Constraints,
+		&structs.Constraint{
+			LTarget: "${meta.tag}",
+			RTarget: "foo",
+			Operand: "=",
+		})
+
+	require.Nil(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Evaluate the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval))
+	require.Equal(t, "complete", h.Evals[0].Status)
+
+	// QueuedAllocations is drained
+	val, ok := h.Evals[0].QueuedAllocations["pinger"]
+	require.True(t, ok)
+	require.Equal(t, 0, val)
+
+	// The single plan has two NodeAllocations
+	require.Equal(t, 1, len(h.Plans))
+	require.Nil(t, h.Plans[0].Annotations)
+	require.Equal(t, 2, len(h.Plans[0].NodeAllocation))
+
+	// Two nodes were allocated and are running
+	ws := memdb.NewWatchSet()
+	as, err := h.State.AllocsByJob(ws, structs.DefaultNamespace, job.ID, false)
+	require.Nil(t, err)
+
+	running := 0
+	for _, a := range as {
+		if a.Job.Status == "running" {
+			running++
+		}
+	}
+
+	require.Equal(t, 2, len(as))
+	require.Equal(t, 2, running)
+
+	// Ensure there are no failed allocations
+	require.Equal(t, 0, len(h.Evals[0].FailedTGAllocs))
+}
+
+func TestSysBatch_ChainedAlloc(t *testing.T) {
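+	// This test verifies that replacement allocations for an updated sysbatch
+	// job are chained to the originals via the PreviousAllocation field.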
+	h := NewHarness(t)
+
+	// Create some nodes
+	_ = createNodes(t, h, 10)
+
+	// Create a sysbatch job
+	job := mock.SystemBatchJob()
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create a mock evaluation to register the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	var allocIDs []string
+	for _, allocList := range h.Plans[0].NodeAllocation {
+		for _, alloc := range allocList {
+			allocIDs = append(allocIDs, alloc.ID)
+		}
+	}
+	sort.Strings(allocIDs)
+
+	// Create a new harness to invoke the scheduler again
+	h1 := NewHarnessWithState(t, h.State)
+	job1 := mock.SystemBatchJob()
+	job1.ID = job.ID
+	job1.TaskGroups[0].Tasks[0].Env = make(map[string]string)
+	job1.TaskGroups[0].Tasks[0].Env["foo"] = "bar"
+	require.NoError(t, h1.State.UpsertJob(structs.MsgTypeTestSetup, h1.NextIndex(), job1))
+
+	// Insert two more nodes
+	for i := 0; i < 2; i++ {
+		node := mock.Node()
+		require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+	}
+
+	// Create a mock evaluation to update the job
+	eval1 := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job1.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job1.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval1}))
+
+	// Process the evaluation
+	err = h1.Process(NewSysBatchScheduler, eval1)
+	require.NoError(t, err)
+
+	require.Len(t, h1.Plans, 1)
+	plan := h1.Plans[0]
+
+	// Collect all the chained allocation ids and the new allocations which
+	// don't have any chained allocations
+	var prevAllocs []string
+	var newAllocs []string
+	for _, allocList := range plan.NodeAllocation {
+		for _, alloc := range allocList {
+			if alloc.PreviousAllocation == "" {
+				newAllocs = append(newAllocs, alloc.ID)
+				continue
+			}
+			prevAllocs = append(prevAllocs, alloc.PreviousAllocation)
+		}
+	}
+	sort.Strings(prevAllocs)
+
+	// Ensure that the new allocations have their corresponding original
+	// allocation ids
+	require.Equal(t, allocIDs, prevAllocs)
+
+	// Ensure the two new allocations don't have any chained allocations
+	require.Len(t, newAllocs, 2)
+}
+
+func TestSysBatch_PlanWithDrainedNode(t *testing.T) {
+	h := NewHarness(t)
+
+	// Register two nodes with two different classes
+	node := mock.Node()
+	node.NodeClass = "green"
+	node.Drain = true
+	require.NoError(t, node.ComputeClass())
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	node2 := mock.Node()
+	node2.NodeClass = "blue"
+	require.NoError(t, node2.ComputeClass())
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node2))
+
+	// Create a sysbatch job with two task groups, each constrained on node class
+	job := mock.SystemBatchJob()
+	tg1 := job.TaskGroups[0]
+	tg1.Constraints = append(tg1.Constraints,
+		&structs.Constraint{
+			LTarget: "${node.class}",
+			RTarget: "green",
+			Operand: "==",
+		})
+
+	tg2 := tg1.Copy()
+	tg2.Name = "pinger2"
+	tg2.Constraints[0].RTarget = "blue"
+	job.TaskGroups = append(job.TaskGroups, tg2)
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create an allocation on each node
+	alloc := mock.SysBatchAlloc()
+	alloc.Job = job
+	alloc.JobID = job.ID
+	alloc.NodeID = node.ID
+	alloc.Name = "my-sysbatch.pinger[0]"
+	alloc.DesiredTransition.Migrate = helper.BoolToPtr(true)
+	alloc.TaskGroup = "pinger"
+
+	alloc2 := mock.SysBatchAlloc()
+	alloc2.Job = job
+	alloc2.JobID = job.ID
+	alloc2.NodeID = node2.ID
+	alloc2.Name = "my-sysbatch.pinger2[0]"
+	alloc2.TaskGroup = "pinger2"
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc, alloc2}))
+
+	// Create a mock evaluation to deal with drain
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		NodeID:      node.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure a single plan
+	require.Len(t, h.Plans, 1)
+	plan := h.Plans[0]
+
+	// Ensure the plan evicted the alloc on the drained node
+	planned := plan.NodeUpdate[node.ID]
+	require.Len(t, plan.NodeUpdate[node.ID], 1)
+
+	// Ensure the plan didn't place
+	require.Empty(t, plan.NodeAllocation)
+
+	// Ensure the allocation is stopped
+	require.Equal(t, structs.AllocDesiredStatusStop, planned[0].DesiredStatus)
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
+
+func TestSysBatch_QueuedAllocsMultTG(t *testing.T) {
+	h := NewHarness(t)
+
+	// Register two nodes with two different classes
+	node := mock.Node()
+	node.NodeClass = "green"
+	require.NoError(t, node.ComputeClass())
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	node2 := mock.Node()
+	node2.NodeClass = "blue"
+	require.NoError(t, node2.ComputeClass())
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node2))
+
+	// Create a sysbatch job with two task groups, each constrained on node class
+	job := mock.SystemBatchJob()
+	tg1 := job.TaskGroups[0]
+	tg1.Constraints = append(tg1.Constraints,
+		&structs.Constraint{
+			LTarget: "${node.class}",
+			RTarget: "green",
+			Operand: "==",
+		})
+
+	tg2 := tg1.Copy()
+	tg2.Name = "pinger2"
+	tg2.Constraints[0].RTarget = "blue"
+	job.TaskGroups = append(job.TaskGroups, tg2)
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create a mock evaluation to deal with the node update
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		NodeID:      node.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure a single plan
+	require.Len(t, h.Plans, 1)
+
+	qa := h.Evals[0].QueuedAllocations
+	require.Zero(t, qa["pinger"])
+	require.Zero(t, qa["pinger2"])
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
+
+func TestSysBatch_Preemption(t *testing.T) {
+	h := NewHarness(t)
+
+	// Create nodes
+	nodes := make([]*structs.Node, 0)
+	for i := 0; i < 2; i++ {
+		node := mock.Node()
+		// Populate both the legacy Resources struct and the newer NodeResources struct.
+		// TODO: remove in 0.11
+		
node.Resources = &structs.Resources{ + CPU: 3072, + MemoryMB: 5034, + DiskMB: 20 * 1024, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + CIDR: "192.168.0.100/32", + MBits: 1000, + }}, + } + node.NodeResources = &structs.NodeResources{ + Cpu: structs.NodeCpuResources{CpuShares: 3072}, + Memory: structs.NodeMemoryResources{MemoryMB: 5034}, + Disk: structs.NodeDiskResources{DiskMB: 20 * 1024}, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + CIDR: "192.168.0.100/32", + MBits: 1000, + }}, + NodeNetworks: []*structs.NodeNetworkResource{{ + Mode: "host", + Device: "eth0", + Addresses: []structs.NodeNetworkAddress{{ + Family: structs.NodeNetworkAF_IPv4, + Alias: "default", + Address: "192.168.0.100", + }}, + }}, + } + require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + nodes = append(nodes, node) + } + + // Enable Preemption + err := h.State.SchedulerSetConfig(h.NextIndex(), &structs.SchedulerConfiguration{ + PreemptionConfig: structs.PreemptionConfig{ + SysBatchSchedulerEnabled: true, + }, + }) + require.NoError(t, err) + + // Create some low priority batch jobs and allocations for them + // One job uses a reserved port + job1 := mock.BatchJob() + job1.Type = structs.JobTypeBatch + job1.Priority = 20 + job1.TaskGroups[0].Tasks[0].Resources = &structs.Resources{ + CPU: 512, + MemoryMB: 1024, + Networks: []*structs.NetworkResource{{ + MBits: 200, + ReservedPorts: []structs.Port{{ + Label: "web", + Value: 80, + }}, + }}, + } + + alloc1 := mock.Alloc() + alloc1.Job = job1 + alloc1.JobID = job1.ID + alloc1.NodeID = nodes[0].ID + alloc1.Name = "my-job[0]" + alloc1.TaskGroup = job1.TaskGroups[0].Name + alloc1.AllocatedResources = &structs.AllocatedResources{ + Tasks: map[string]*structs.AllocatedTaskResources{ + "web": { + Cpu: structs.AllocatedCpuResources{CpuShares: 512}, + Memory: structs.AllocatedMemoryResources{MemoryMB: 1024}, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + IP: "192.168.0.100", + ReservedPorts: []structs.Port{{Label: "web", Value: 80}}, + MBits: 200, + }}, + }, + }, + Shared: structs.AllocatedSharedResources{DiskMB: 5 * 1024}, + } + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job1)) + + job2 := mock.BatchJob() + job2.Type = structs.JobTypeBatch + job2.Priority = 20 + job2.TaskGroups[0].Tasks[0].Resources = &structs.Resources{ + CPU: 512, + MemoryMB: 1024, + Networks: []*structs.NetworkResource{{MBits: 200}}, + } + + alloc2 := mock.Alloc() + alloc2.Job = job2 + alloc2.JobID = job2.ID + alloc2.NodeID = nodes[0].ID + alloc2.Name = "my-job[2]" + alloc2.TaskGroup = job2.TaskGroups[0].Name + alloc2.AllocatedResources = &structs.AllocatedResources{ + Tasks: map[string]*structs.AllocatedTaskResources{ + "web": { + Cpu: structs.AllocatedCpuResources{CpuShares: 512}, + Memory: structs.AllocatedMemoryResources{MemoryMB: 1024}, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + IP: "192.168.0.100", + MBits: 200, + }}, + }, + }, + Shared: structs.AllocatedSharedResources{DiskMB: 5 * 1024}, + } + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job2)) + + job3 := mock.Job() + job3.Type = structs.JobTypeBatch + job3.Priority = 40 + job3.TaskGroups[0].Tasks[0].Resources = &structs.Resources{ + CPU: 1024, + MemoryMB: 2048, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + MBits: 400, + }}, + } + + alloc3 := mock.Alloc() + alloc3.Job = job3 + alloc3.JobID = job3.ID + alloc3.NodeID = nodes[0].ID + alloc3.Name = "my-job[0]" + alloc3.TaskGroup = 
job3.TaskGroups[0].Name
+	alloc3.AllocatedResources = &structs.AllocatedResources{
+		Tasks: map[string]*structs.AllocatedTaskResources{
+			"web": {
+				Cpu:    structs.AllocatedCpuResources{CpuShares: 1024},
+				Memory: structs.AllocatedMemoryResources{MemoryMB: 25},
+				Networks: []*structs.NetworkResource{{
+					Device:        "eth0",
+					IP:            "192.168.0.100",
+					ReservedPorts: []structs.Port{{Label: "web", Value: 80}},
+					MBits:         400,
+				}},
+			},
+		},
+		Shared: structs.AllocatedSharedResources{DiskMB: 5 * 1024},
+	}
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc1, alloc2, alloc3}))
+
+	// Create a high priority job and allocs for it
+	// These allocs should not be preempted
+
+	job4 := mock.BatchJob()
+	job4.Type = structs.JobTypeBatch
+	job4.Priority = 100
+	job4.TaskGroups[0].Tasks[0].Resources = &structs.Resources{
+		CPU:      1024,
+		MemoryMB: 2048,
+		Networks: []*structs.NetworkResource{{MBits: 100}},
+	}
+
+	alloc4 := mock.Alloc()
+	alloc4.Job = job4
+	alloc4.JobID = job4.ID
+	alloc4.NodeID = nodes[0].ID
+	alloc4.Name = "my-job4[0]"
+	alloc4.TaskGroup = job4.TaskGroups[0].Name
+	alloc4.AllocatedResources = &structs.AllocatedResources{
+		Tasks: map[string]*structs.AllocatedTaskResources{
+			"web": {
+				Cpu: structs.AllocatedCpuResources{
+					CpuShares: 1024,
+				},
+				Memory: structs.AllocatedMemoryResources{
+					MemoryMB: 2048,
+				},
+				Networks: []*structs.NetworkResource{
+					{
+						Device:        "eth0",
+						IP:            "192.168.0.100",
+						ReservedPorts: []structs.Port{{Label: "web", Value: 80}},
+						MBits:         100,
+					},
+				},
+			},
+		},
+		Shared: structs.AllocatedSharedResources{
+			DiskMB: 2 * 1024,
+		},
+	}
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job4))
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc4}))
+
+	// Create a sysbatch job such that it would need to preempt both allocs to succeed
+	job := mock.SystemBatchJob()
+	job.Priority = 100
+	job.TaskGroups[0].Tasks[0].Resources = &structs.Resources{
+		CPU:      1948,
+		MemoryMB: 256,
+		Networks: []*structs.NetworkResource{{
+			MBits:        800,
+			DynamicPorts: []structs.Port{{Label: "http"}},
+		}},
+	}
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create a mock evaluation to register the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err = h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure a single plan
+	require.Equal(t, 1, len(h.Plans))
+	plan := h.Plans[0]
+
+	// Ensure the plan doesn't have annotations
+	require.Nil(t, plan.Annotations)
+
+	// Ensure the plan allocated on both nodes
+	var planned []*structs.Allocation
+	preemptingAllocId := ""
+	require.Equal(t, 2, len(plan.NodeAllocation))
+
+	// The alloc that got placed on node 1 is the preemptor
+	for _, allocList := range plan.NodeAllocation {
+		planned = append(planned, allocList...)
+		for _, alloc := range allocList {
+			if alloc.NodeID == nodes[0].ID {
+				preemptingAllocId = alloc.ID
+			}
+		}
+	}
+
+	// Lookup the allocations by JobID
+	ws := memdb.NewWatchSet()
+	out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
+	require.NoError(t, err)
+
+	// Ensure all allocations placed
+	require.Equal(t, 2, len(out))
+
+	// Verify that one node has preempted allocs
+	require.NotNil(t, plan.NodePreemptions[nodes[0].ID])
+	preemptedAllocs := plan.NodePreemptions[nodes[0].ID]
+
+	// Verify that three jobs have preempted allocs
+	require.Equal(t, 3, len(preemptedAllocs))
+
+	expectedPreemptedJobIDs := []string{job1.ID, job2.ID, job3.ID}
+
+	// We expect job1, job2, and job3 to have preempted allocations;
+	// job4 should not have any allocs preempted
+	for _, alloc := range preemptedAllocs {
+		require.Contains(t, expectedPreemptedJobIDs, alloc.JobID)
+	}
+
+	// Look up the preempted allocs by job ID
+	ws = memdb.NewWatchSet()
+
+	for _, jobId := range expectedPreemptedJobIDs {
+		out, err = h.State.AllocsByJob(ws, structs.DefaultNamespace, jobId, false)
+		require.NoError(t, err)
+		for _, alloc := range out {
+			require.Equal(t, structs.AllocDesiredStatusEvict, alloc.DesiredStatus)
+			require.Equal(t, fmt.Sprintf("Preempted by alloc ID %v", preemptingAllocId), alloc.DesiredDescription)
+		}
+	}
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
+
+func TestSysBatch_canHandle(t *testing.T) {
+	s := SystemScheduler{sysbatch: true}
+	t.Run("sysbatch register", func(t *testing.T) {
+		require.True(t, s.canHandle(structs.EvalTriggerJobRegister))
+	})
+	t.Run("sysbatch scheduled", func(t *testing.T) {
+		require.False(t, s.canHandle(structs.EvalTriggerScheduled))
+	})
+	t.Run("sysbatch periodic", func(t *testing.T) {
+		require.True(t, s.canHandle(structs.EvalTriggerPeriodicJob))
+	})
+}
+
+// createNodes registers n mock nodes with the harness state and returns them.
+func createNodes(t *testing.T, h *Harness, n int) []*structs.Node {
+	nodes := make([]*structs.Node, n)
+	for i := 0; i < n; i++ {
+		node := mock.Node()
+		nodes[i] = node
+		require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+	}
+	return nodes
+}
diff --git a/scheduler/system_sched_test.go b/scheduler/system_system_test.go
similarity index 84%
rename from scheduler/system_sched_test.go
rename to scheduler/system_system_test.go
index 35ed1ce5189d..e3cff0e646b7 100644
--- a/scheduler/system_sched_test.go
+++ b/scheduler/system_system_test.go
@@ -19,10 +19,7 @@ func TestSystemSched_JobRegister(t *testing.T) {
 	h := NewHarness(t)
 
 	// Create some nodes
-	for i := 0; i < 10; i++ {
-		node := mock.Node()
-		require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
-	}
+	_ = createNodes(t, h, 10)
 
 	// Create a job
 	job := mock.SystemJob()
@@ -41,29 +38,21 @@ func TestSystemSched_JobRegister(t *testing.T) {
 
 	// Process the evaluation
 	err := h.Process(NewSystemScheduler, eval)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	require.NoError(t, err)
 
 	// Ensure a single plan
-	if len(h.Plans) != 1 {
-		t.Fatalf("bad: %#v", h.Plans)
-	}
+	require.Len(t, h.Plans, 1)
 	plan := h.Plans[0]
 
-	// Ensure the plan doesn't have annotations.
-	if plan.Annotations != nil {
-		t.Fatalf("expected no annotations")
-	}
+	// Ensure the plan does not have annotations
+	require.Nil(t, plan.Annotations, "expected no annotations")
 
 	// Ensure the plan allocated
 	var planned []*structs.Allocation
 	for _, allocList := range plan.NodeAllocation {
 		planned = append(planned, allocList...)
} - if len(planned) != 10 { - t.Fatalf("bad: %#v", plan) - } + require.Len(t, planned, 10) // Lookup the allocations by JobID ws := memdb.NewWatchSet() @@ -71,20 +60,16 @@ func TestSystemSched_JobRegister(t *testing.T) { require.NoError(t, err) // Ensure all allocations placed - if len(out) != 10 { - t.Fatalf("bad: %#v", out) - } + require.Len(t, out, 10) // Check the available nodes - if count, ok := out[0].Metrics.NodesAvailable["dc1"]; !ok || count != 10 { - t.Fatalf("bad: %#v", out[0].Metrics) - } + count, ok := out[0].Metrics.NodesAvailable["dc1"] + require.True(t, ok) + require.Equal(t, 10, count, "bad metrics %#v:", out[0].Metrics) // Ensure no allocations are queued queued := h.Evals[0].QueuedAllocations["web"] - if queued != 0 { - t.Fatalf("expected queued allocations: %v, actual: %v", 0, queued) - } + require.Equal(t, 0, queued, "unexpected queued allocations") h.AssertEvalStatus(t, structs.EvalStatusComplete) } @@ -93,10 +78,7 @@ func TestSystemSched_JobRegister_StickyAllocs(t *testing.T) { h := NewHarness(t) // Create some nodes - for i := 0; i < 10; i++ { - node := mock.Node() - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + _ = createNodes(t, h, 10) // Create a job job := mock.SystemJob() @@ -168,7 +150,7 @@ func TestSystemSched_JobRegister_StickyAllocs(t *testing.T) { func TestSystemSched_JobRegister_EphemeralDiskConstraint(t *testing.T) { h := NewHarness(t) - // Create a nodes + // Create a node node := mock.Node() require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) @@ -237,7 +219,7 @@ func TestSystemSched_JobRegister_EphemeralDiskConstraint(t *testing.T) { func TestSystemSched_ExhaustResources(t *testing.T) { h := NewHarness(t) - // Create a nodes + // Create a node node := mock.Node() require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) @@ -412,12 +394,7 @@ func TestSystemSched_JobRegister_AddNode(t *testing.T) { h := NewHarness(t) // Create some nodes - var nodes []*structs.Node - for i := 0; i < 10; i++ { - node := mock.Node() - nodes = append(nodes, node) - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + nodes := createNodes(t, h, 10) // Generate a fake job with allocations job := mock.SystemJob() @@ -455,9 +432,7 @@ func TestSystemSched_JobRegister_AddNode(t *testing.T) { } // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } + require.Len(t, h.Plans, 1) plan := h.Plans[0] // Ensure the plan had no node updates @@ -465,19 +440,14 @@ func TestSystemSched_JobRegister_AddNode(t *testing.T) { for _, updateList := range plan.NodeUpdate { update = append(update, updateList...) } - if len(update) != 0 { - t.Log(len(update)) - t.Fatalf("bad: %#v", plan) - } + require.Empty(t, update) // Ensure the plan allocated on the new node var planned []*structs.Allocation for _, allocList := range plan.NodeAllocation { planned = append(planned, allocList...) 
} - if len(planned) != 1 { - t.Fatalf("bad: %#v", plan) - } + require.Len(t, planned, 1) // Ensure it allocated on the right node if _, ok := plan.NodeAllocation[node.ID]; !ok { @@ -534,12 +504,7 @@ func TestSystemSched_JobModify(t *testing.T) { h := NewHarness(t) // Create some nodes - var nodes []*structs.Node - for i := 0; i < 10; i++ { - node := mock.Node() - nodes = append(nodes, node) - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + nodes := createNodes(t, h, 10) // Generate a fake job with allocations job := mock.SystemJob() @@ -590,14 +555,10 @@ func TestSystemSched_JobModify(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } + require.Len(t, h.Plans, 1) plan := h.Plans[0] // Ensure the plan evicted all allocs @@ -605,18 +566,14 @@ func TestSystemSched_JobModify(t *testing.T) { for _, updateList := range plan.NodeUpdate { update = append(update, updateList...) } - if len(update) != len(allocs) { - t.Fatalf("bad: %#v", plan) - } + require.Equal(t, len(allocs), len(update)) // Ensure the plan allocated var planned []*structs.Allocation for _, allocList := range plan.NodeAllocation { planned = append(planned, allocList...) } - if len(planned) != 10 { - t.Fatalf("bad: %#v", plan) - } + require.Len(t, planned, 10) // Lookup the allocations by JobID ws := memdb.NewWatchSet() @@ -625,9 +582,7 @@ func TestSystemSched_JobModify(t *testing.T) { // Ensure all allocations placed out, _ = structs.FilterTerminalAllocs(out) - if len(out) != 10 { - t.Fatalf("bad: %#v", out) - } + require.Len(t, out, 10) h.AssertEvalStatus(t, structs.EvalStatusComplete) } @@ -636,12 +591,7 @@ func TestSystemSched_JobModify_Rolling(t *testing.T) { h := NewHarness(t) // Create some nodes - var nodes []*structs.Node - for i := 0; i < 10; i++ { - node := mock.Node() - nodes = append(nodes, node) - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + nodes := createNodes(t, h, 10) // Generate a fake job with allocations job := mock.SystemJob() @@ -739,12 +689,7 @@ func TestSystemSched_JobModify_InPlace(t *testing.T) { h := NewHarness(t) // Create some nodes - var nodes []*structs.Node - for i := 0; i < 10; i++ { - node := mock.Node() - nodes = append(nodes, node) - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + nodes := createNodes(t, h, 10) // Generate a fake job with allocations job := mock.SystemJob() @@ -766,7 +711,7 @@ func TestSystemSched_JobModify_InPlace(t *testing.T) { job2.ID = job.ID require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job2)) - // Create a mock evaluation to deal with drain + // Create a mock evaluation to deal with update eval := &structs.Evaluation{ Namespace: structs.DefaultNamespace, ID: uuid.Generate(), @@ -779,14 +724,10 @@ func TestSystemSched_JobModify_InPlace(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } + require.Len(t, h.Plans, 1) plan := h.Plans[0] // Ensure the plan did not evict any allocs @@ -794,22 +735,17 @@ func TestSystemSched_JobModify_InPlace(t *testing.T) { for _, updateList := range plan.NodeUpdate { update = append(update, 
updateList...) } - if len(update) != 0 { - t.Fatalf("bad: %#v", plan) - } + require.Empty(t, update) // Ensure the plan updated the existing allocs var planned []*structs.Allocation for _, allocList := range plan.NodeAllocation { planned = append(planned, allocList...) } - if len(planned) != 10 { - t.Fatalf("bad: %#v", plan) - } + require.Len(t, planned, 10) + for _, p := range planned { - if p.Job != job2 { - t.Fatalf("should update job") - } + require.Equal(t, job2, p.Job, "should update job") } // Lookup the allocations by JobID @@ -818,18 +754,14 @@ func TestSystemSched_JobModify_InPlace(t *testing.T) { require.NoError(t, err) // Ensure all allocations placed - if len(out) != 10 { - t.Fatalf("bad: %#v", out) - } + require.Len(t, out, 10) h.AssertEvalStatus(t, structs.EvalStatusComplete) // Verify the network did not change rp := structs.Port{Label: "admin", Value: 5000} for _, alloc := range out { for _, resources := range alloc.TaskResources { - if resources.Networks[0].ReservedPorts[0] != rp { - t.Fatalf("bad: %#v", alloc) - } + require.Equal(t, rp, resources.Networks[0].ReservedPorts[0]) } } } @@ -838,12 +770,7 @@ func TestSystemSched_JobDeregister_Purged(t *testing.T) { h := NewHarness(t) // Create some nodes - var nodes []*structs.Node - for i := 0; i < 10; i++ { - node := mock.Node() - nodes = append(nodes, node) - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + nodes := createNodes(t, h, 10) // Generate a fake job with allocations job := mock.SystemJob() @@ -875,21 +802,15 @@ func TestSystemSched_JobDeregister_Purged(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } + require.Len(t, h.Plans, 1) plan := h.Plans[0] // Ensure the plan evicted the job from all nodes. for _, node := range nodes { - if len(plan.NodeUpdate[node.ID]) != 1 { - t.Fatalf("bad: %#v", plan) - } + require.Len(t, plan.NodeUpdate[node.ID], 1) } // Lookup the allocations by JobID @@ -899,9 +820,7 @@ func TestSystemSched_JobDeregister_Purged(t *testing.T) { // Ensure no remaining allocations out, _ = structs.FilterTerminalAllocs(out) - if len(out) != 0 { - t.Fatalf("bad: %#v", out) - } + require.Empty(t, out) h.AssertEvalStatus(t, structs.EvalStatusComplete) } @@ -910,12 +829,7 @@ func TestSystemSched_JobDeregister_Stopped(t *testing.T) { h := NewHarness(t) // Create some nodes - var nodes []*structs.Node - for i := 0; i < 10; i++ { - node := mock.Node() - nodes = append(nodes, node) - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + nodes := createNodes(t, h, 10) // Generate a fake job with allocations job := mock.SystemJob() @@ -949,21 +863,15 @@ func TestSystemSched_JobDeregister_Stopped(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } + require.Len(t, h.Plans, 1) plan := h.Plans[0] // Ensure the plan evicted the job from all nodes. 
for _, node := range nodes {
-		if len(plan.NodeUpdate[node.ID]) != 1 {
-			t.Fatalf("bad: %#v", plan)
-		}
+		require.Len(t, plan.NodeUpdate[node.ID], 1)
 	}
 
 	// Lookup the allocations by JobID
@@ -973,9 +881,7 @@
 
 	// Ensure no remaining allocations
 	out, _ = structs.FilterTerminalAllocs(out)
-	if len(out) != 0 {
-		t.Fatalf("bad: %#v", out)
-	}
+	require.Empty(t, out)
 
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
@@ -1014,35 +920,27 @@ func TestSystemSched_NodeDown(t *testing.T) {
 
 	// Process the evaluation
 	err := h.Process(NewSystemScheduler, eval)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	require.NoError(t, err)
 
 	// Ensure a single plan
-	if len(h.Plans) != 1 {
-		t.Fatalf("bad: %#v", h.Plans)
-	}
+	require.Len(t, h.Plans, 1)
 	plan := h.Plans[0]
 
 	// Ensure the plan evicted all allocs
-	if len(plan.NodeUpdate[node.ID]) != 1 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	require.Len(t, plan.NodeUpdate[node.ID], 1)
 
 	// Ensure the plan updated the allocation.
-	var planned []*structs.Allocation
+	planned := make([]*structs.Allocation, 0)
 	for _, allocList := range plan.NodeUpdate {
 		planned = append(planned, allocList...)
 	}
-	if len(planned) != 1 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	require.Len(t, planned, 1)
 
 	// Ensure the allocations is stopped
-	if p := planned[0]; p.DesiredStatus != structs.AllocDesiredStatusStop &&
-		p.ClientStatus != structs.AllocClientStatusLost {
-		t.Fatalf("bad: %#v", planned[0])
-	}
+	p := planned[0]
+	require.Equal(t, structs.AllocDesiredStatusStop, p.DesiredStatus)
+	// note: the old assertion on ClientStatus == lost was incorrect and has
+	// been removed; the actual client status at this point is pending
 
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
@@ -1080,32 +978,23 @@ func TestSystemSched_NodeDrain_Down(t *testing.T) {
 	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
 
 	// Process the evaluation
-	err := h.Process(NewServiceScheduler, eval)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	err := h.Process(NewSystemScheduler, eval) // note: previously (incorrectly) used the service scheduler
+	require.NoError(t, err)
 
 	// Ensure a single plan
-	if len(h.Plans) != 1 {
-		t.Fatalf("bad: %#v", h.Plans)
-	}
+	require.Len(t, h.Plans, 1)
 	plan := h.Plans[0]
 
 	// Ensure the plan evicted non terminal allocs
-	if len(plan.NodeUpdate[node.ID]) != 1 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	require.Len(t, plan.NodeUpdate[node.ID], 1)
 
 	// Ensure that the allocation is marked as lost
-	var lostAllocs []string
+	var lost []string
 	for _, alloc := range plan.NodeUpdate[node.ID] {
-		lostAllocs = append(lostAllocs, alloc.ID)
+		lost = append(lost, alloc.ID)
 	}
-	expected := []string{alloc.ID}
+	require.Equal(t, []string{alloc.ID}, lost)
 
-	if !reflect.DeepEqual(lostAllocs, expected) {
-		t.Fatalf("expected: %v, actual: %v", expected, lostAllocs)
-	}
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
@@ -1143,35 +1032,24 @@ func TestSystemSched_NodeDrain(t *testing.T) {
 
 	// Process the evaluation
 	err := h.Process(NewSystemScheduler, eval)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	require.NoError(t, err)
 
 	// Ensure a single plan
-	if len(h.Plans) != 1 {
-		t.Fatalf("bad: %#v", h.Plans)
-	}
+	require.Len(t, h.Plans, 1)
 	plan := h.Plans[0]
 
 	// Ensure the plan evicted all allocs
-	if len(plan.NodeUpdate[node.ID]) != 1 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	require.Len(t, plan.NodeUpdate[node.ID], 1)
 
 	// Ensure the plan updated the allocation.
- var planned []*structs.Allocation + planned := make([]*structs.Allocation, 0) for _, allocList := range plan.NodeUpdate { planned = append(planned, allocList...) } - if len(planned) != 1 { - t.Log(len(planned)) - t.Fatalf("bad: %#v", plan) - } + require.Len(t, planned, 1) // Ensure the allocations is stopped - if planned[0].DesiredStatus != structs.AllocDesiredStatusStop { - t.Fatalf("bad: %#v", planned[0]) - } + require.Equal(t, structs.AllocDesiredStatusStop, planned[0].DesiredStatus) h.AssertEvalStatus(t, structs.EvalStatusComplete) } @@ -1194,7 +1072,7 @@ func TestSystemSched_NodeUpdate(t *testing.T) { alloc.Name = "my-job.web[0]" require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc})) - // Create a mock evaluation to deal + // Create a mock evaluation to deal with the node update eval := &structs.Evaluation{ Namespace: structs.DefaultNamespace, ID: uuid.Generate(), @@ -1208,14 +1086,12 @@ func TestSystemSched_NodeUpdate(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure that queued allocations is zero - if val, ok := h.Evals[0].QueuedAllocations["web"]; !ok || val != 0 { - t.Fatalf("bad queued allocations: %#v", h.Evals[0].QueuedAllocations) - } + val, ok := h.Evals[0].QueuedAllocations["web"] + require.True(t, ok) + require.Zero(t, val) h.AssertEvalStatus(t, structs.EvalStatusComplete) } @@ -1225,16 +1101,13 @@ func TestSystemSched_RetryLimit(t *testing.T) { h.Planner = &RejectPlan{h} // Create some nodes - for i := 0; i < 10; i++ { - node := mock.Node() - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + _ = createNodes(t, h, 10) // Create a job job := mock.SystemJob() require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) - // Create a mock evaluation to deregister the job + // Create a mock evaluation to register the job eval := &structs.Evaluation{ Namespace: structs.DefaultNamespace, ID: uuid.Generate(), @@ -1247,14 +1120,10 @@ func TestSystemSched_RetryLimit(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure multiple plans - if len(h.Plans) == 0 { - t.Fatalf("bad: %#v", h.Plans) - } + require.NotEmpty(t, h.Plans) // Lookup the allocations by JobID ws := memdb.NewWatchSet() @@ -1262,9 +1131,7 @@ func TestSystemSched_RetryLimit(t *testing.T) { require.NoError(t, err) // Ensure no allocations placed - if len(out) != 0 { - t.Fatalf("bad: %#v", out) - } + require.Empty(t, out) // Should hit the retry limit h.AssertEvalStatus(t, structs.EvalStatusFailed) @@ -1272,7 +1139,7 @@ func TestSystemSched_RetryLimit(t *testing.T) { // This test ensures that the scheduler doesn't increment the queued allocation // count for a task group when allocations can't be created on currently -// available nodes because of constrain mismatches. +// available nodes because of constraint mismatches. 
func TestSystemSched_Queued_With_Constraints(t *testing.T) { h := NewHarness(t) @@ -1285,7 +1152,7 @@ func TestSystemSched_Queued_With_Constraints(t *testing.T) { job := mock.SystemJob() require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) - // Create a mock evaluation to deal + // Create a mock evaluation to deal with the node update eval := &structs.Evaluation{ Namespace: structs.DefaultNamespace, ID: uuid.Generate(), @@ -1299,20 +1166,17 @@ func TestSystemSched_Queued_With_Constraints(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure that queued allocations is zero - if val, ok := h.Evals[0].QueuedAllocations["web"]; !ok || val != 0 { - t.Fatalf("bad queued allocations: %#v", h.Evals[0].QueuedAllocations) - } - + val, ok := h.Evals[0].QueuedAllocations["web"] + require.True(t, ok) + require.Zero(t, val) } // This test ensures that the scheduler correctly ignores ineligible // nodes when scheduling due to a new node being added. The job has two -// task groups contrained to a particular node class. The desired behavior +// task groups constrained to a particular node class. The desired behavior // should be that the TaskGroup constrained to the newly added node class is // added and that the TaskGroup constrained to the ineligible node is ignored. func TestSystemSched_JobConstraint_AddNode(t *testing.T) { @@ -1322,13 +1186,13 @@ func TestSystemSched_JobConstraint_AddNode(t *testing.T) { var node *structs.Node node = mock.Node() node.NodeClass = "Class-A" - node.ComputeClass() + require.NoError(t, node.ComputeClass()) require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) var nodeB *structs.Node nodeB = mock.Node() nodeB.NodeClass = "Class-B" - nodeB.ComputeClass() + require.NoError(t, nodeB.ComputeClass()) require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), nodeB)) // Make a job with two task groups, each constraint to a node class @@ -1365,7 +1229,6 @@ func TestSystemSched_JobConstraint_AddNode(t *testing.T) { JobID: job.ID, Status: structs.EvalStatusPending, } - require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) require.Nil(t, h.Process(NewSystemScheduler, eval)) @@ -1414,7 +1277,7 @@ func TestSystemSched_JobConstraint_AddNode(t *testing.T) { // Add a new node Class-B var nodeBTwo *structs.Node nodeBTwo = mock.Node() - nodeBTwo.ComputeClass() + require.NoError(t, nodeBTwo.ComputeClass()) nodeBTwo.NodeClass = "Class-B" require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), nodeBTwo)) @@ -1467,7 +1330,7 @@ func TestSystemSched_ExistingAllocNoNodes(t *testing.T) { var node *structs.Node // Create a node node = mock.Node() - node.ComputeClass() + require.NoError(t, node.ComputeClass()) require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) // Make a job @@ -1498,6 +1361,7 @@ func TestSystemSched_ExistingAllocNoNodes(t *testing.T) { // Mark the node as ineligible node.SchedulingEligibility = structs.NodeSchedulingIneligible + // Evaluate the job eval2 := &structs.Evaluation{ Namespace: structs.DefaultNamespace, @@ -1549,7 +1413,7 @@ func TestSystemSched_ConstraintErrors(t *testing.T) { for _, tag := range []string{"aaaaaa", "foo", "foo", "foo"} { node = mock.Node() node.Meta["tag"] = tag - node.ComputeClass() + require.NoError(t, node.ComputeClass()) require.Nil(t, 
h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
  }
@@ -1614,10 +1478,7 @@ func TestSystemSched_ChainedAlloc(t *testing.T) {
  h := NewHarness(t)

  // Create some nodes
- for i := 0; i < 10; i++ {
-   node := mock.Node()
-   require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
- }
+ _ = createNodes(t, h, 10)

  // Create a job
  job := mock.SystemJob()
@@ -1633,10 +1494,10 @@ func TestSystemSched_ChainedAlloc(t *testing.T) {
    Status: structs.EvalStatusPending,
  }
  require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
  // Process the evaluation
- if err := h.Process(NewSystemScheduler, eval); err != nil {
-   t.Fatalf("err: %v", err)
- }
+ err := h.Process(NewSystemScheduler, eval)
+ require.NoError(t, err)

  var allocIDs []string
  for _, allocList := range h.Plans[0].NodeAllocation {
@@ -1675,6 +1536,7 @@ func TestSystemSched_ChainedAlloc(t *testing.T) {
    t.Fatalf("err: %v", err)
  }

+ require.Len(t, h1.Plans, 1)
  plan := h1.Plans[0]

  // Collect all the chained allocation ids and the new allocations which
@@ -1694,14 +1556,10 @@ func TestSystemSched_ChainedAlloc(t *testing.T) {

  // Ensure that the new allocations has their corresponding original
  // allocation ids
- if !reflect.DeepEqual(prevAllocs, allocIDs) {
-   t.Fatalf("expected: %v, actual: %v", len(allocIDs), len(prevAllocs))
- }
+ require.Equal(t, allocIDs, prevAllocs)

  // Ensuring two new allocations don't have any chained allocations
- if len(newAllocs) != 2 {
-   t.Fatalf("expected: %v, actual: %v", 2, len(newAllocs))
- }
+ require.Len(t, newAllocs, 2)
}

func TestSystemSched_PlanWithDrainedNode(t *testing.T) {
  h := NewHarness(t)

  // Register two nodes with two different classes
  node := mock.Node()
  node.NodeClass = "green"
  node.Drain = true
- node.ComputeClass()
+ require.NoError(t, node.ComputeClass())
  require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))

  node2 := mock.Node()
  node2.NodeClass = "blue"
- node2.ComputeClass()
+ require.NoError(t, node2.ComputeClass())
  require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node2))

  // Create a Job with two task groups, each constrained on node class
@@ -1766,31 +1624,21 @@ func TestSystemSched_PlanWithDrainedNode(t *testing.T) {

  // Process the evaluation
  err := h.Process(NewSystemScheduler, eval)
- if err != nil {
-   t.Fatalf("err: %v", err)
- }
+ require.NoError(t, err)

  // Ensure a single plan
- if len(h.Plans) != 1 {
-   t.Fatalf("bad: %#v", h.Plans)
- }
+ require.Len(t, h.Plans, 1)
  plan := h.Plans[0]

  // Ensure the plan evicted the alloc on the failed node
  planned := plan.NodeUpdate[node.ID]
- if len(planned) != 1 {
-   t.Fatalf("bad: %#v", plan)
- }
+ require.Len(t, plan.NodeUpdate[node.ID], 1)

  // Ensure the plan didn't place
- if len(plan.NodeAllocation) != 0 {
-   t.Fatalf("bad: %#v", plan)
- }
+ require.Empty(t, plan.NodeAllocation)

  // Ensure the allocations is stopped
- if planned[0].DesiredStatus != structs.AllocDesiredStatusStop {
-   t.Fatalf("bad: %#v", planned[0])
- }
+ require.Equal(t, structs.AllocDesiredStatusStop, planned[0].DesiredStatus)

  h.AssertEvalStatus(t, structs.EvalStatusComplete)
}
@@ -1801,12 +1649,12 @@ func TestSystemSched_QueuedAllocsMultTG(t *testing.T) {

  // Register two nodes with two different classes
  node := mock.Node()
  node.NodeClass = "green"
- node.ComputeClass()
+ require.NoError(t, node.ComputeClass())
  require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))

  node2 := mock.Node()
node2.NodeClass = "blue" - node2.ComputeClass() + require.NoError(t, node2.ComputeClass()) require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node2)) // Create a Job with two task groups, each constrained on node class @@ -1839,19 +1687,14 @@ func TestSystemSched_QueuedAllocsMultTG(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } + require.Len(t, h.Plans, 1) qa := h.Evals[0].QueuedAllocations - if qa["web"] != 0 || qa["web2"] != 0 { - t.Fatalf("bad queued allocations %#v", qa) - } + require.Zero(t, qa["pinger"]) + require.Zero(t, qa["pinger2"]) h.AssertEvalStatus(t, structs.EvalStatusComplete) } @@ -1860,63 +1703,50 @@ func TestSystemSched_Preemption(t *testing.T) { h := NewHarness(t) // Create nodes - var nodes []*structs.Node + nodes := make([]*structs.Node, 0) for i := 0; i < 2; i++ { node := mock.Node() - // TODO(preetha): remove in 0.11 + // TODO: remove in 0.11 node.Resources = &structs.Resources{ CPU: 3072, MemoryMB: 5034, DiskMB: 20 * 1024, - Networks: []*structs.NetworkResource{ - { - Device: "eth0", - CIDR: "192.168.0.100/32", - MBits: 1000, - }, - }, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + CIDR: "192.168.0.100/32", + MBits: 1000, + }}, } node.NodeResources = &structs.NodeResources{ - Cpu: structs.NodeCpuResources{ - CpuShares: 3072, - }, - Memory: structs.NodeMemoryResources{ - MemoryMB: 5034, - }, - Disk: structs.NodeDiskResources{ - DiskMB: 20 * 1024, - }, - Networks: []*structs.NetworkResource{ - { - Device: "eth0", - CIDR: "192.168.0.100/32", - MBits: 1000, - }, - }, - NodeNetworks: []*structs.NodeNetworkResource{ - { - Mode: "host", - Device: "eth0", - Addresses: []structs.NodeNetworkAddress{ - { - Family: structs.NodeNetworkAF_IPv4, - Alias: "default", - Address: "192.168.0.100", - }, - }, - }, - }, + Cpu: structs.NodeCpuResources{CpuShares: 3072}, + Memory: structs.NodeMemoryResources{MemoryMB: 5034}, + Disk: structs.NodeDiskResources{DiskMB: 20 * 1024}, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + CIDR: "192.168.0.100/32", + MBits: 1000, + }}, + NodeNetworks: []*structs.NodeNetworkResource{{ + Mode: "host", + Device: "eth0", + Addresses: []structs.NodeNetworkAddress{{ + Family: structs.NodeNetworkAF_IPv4, + Alias: "default", + Address: "192.168.0.100", + }}, + }}, } require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) nodes = append(nodes, node) } // Enable Preemption - h.State.SchedulerSetConfig(h.NextIndex(), &structs.SchedulerConfiguration{ + err := h.State.SchedulerSetConfig(h.NextIndex(), &structs.SchedulerConfiguration{ PreemptionConfig: structs.PreemptionConfig{ SystemSchedulerEnabled: true, }, }) + require.NoError(t, err) // Create some low priority batch jobs and allocations for them // One job uses a reserved port @@ -1926,17 +1756,13 @@ func TestSystemSched_Preemption(t *testing.T) { job1.TaskGroups[0].Tasks[0].Resources = &structs.Resources{ CPU: 512, MemoryMB: 1024, - Networks: []*structs.NetworkResource{ - { - MBits: 200, - ReservedPorts: []structs.Port{ - { - Label: "web", - Value: 80, - }, - }, - }, - }, + Networks: []*structs.NetworkResource{{ + MBits: 200, + ReservedPorts: []structs.Port{{ + Label: "web", + Value: 80, + }}, + }}, } alloc1 := mock.Alloc() @@ -1948,27 +1774,18 @@ func TestSystemSched_Preemption(t *testing.T) { alloc1.AllocatedResources = 
&structs.AllocatedResources{ Tasks: map[string]*structs.AllocatedTaskResources{ "web": { - Cpu: structs.AllocatedCpuResources{ - CpuShares: 512, - }, - Memory: structs.AllocatedMemoryResources{ - MemoryMB: 1024, - }, - Networks: []*structs.NetworkResource{ - { - Device: "eth0", - IP: "192.168.0.100", - ReservedPorts: []structs.Port{{Label: "web", Value: 80}}, - MBits: 200, - }, - }, + Cpu: structs.AllocatedCpuResources{CpuShares: 512}, + Memory: structs.AllocatedMemoryResources{MemoryMB: 1024}, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + IP: "192.168.0.100", + ReservedPorts: []structs.Port{{Label: "web", Value: 80}}, + MBits: 200, + }}, }, }, - Shared: structs.AllocatedSharedResources{ - DiskMB: 5 * 1024, - }, + Shared: structs.AllocatedSharedResources{DiskMB: 5 * 1024}, } - require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job1)) job2 := mock.BatchJob() @@ -1977,11 +1794,7 @@ func TestSystemSched_Preemption(t *testing.T) { job2.TaskGroups[0].Tasks[0].Resources = &structs.Resources{ CPU: 512, MemoryMB: 1024, - Networks: []*structs.NetworkResource{ - { - MBits: 200, - }, - }, + Networks: []*structs.NetworkResource{{MBits: 200}}, } alloc2 := mock.Alloc() @@ -1993,24 +1806,16 @@ func TestSystemSched_Preemption(t *testing.T) { alloc2.AllocatedResources = &structs.AllocatedResources{ Tasks: map[string]*structs.AllocatedTaskResources{ "web": { - Cpu: structs.AllocatedCpuResources{ - CpuShares: 512, - }, - Memory: structs.AllocatedMemoryResources{ - MemoryMB: 1024, - }, - Networks: []*structs.NetworkResource{ - { - Device: "eth0", - IP: "192.168.0.100", - MBits: 200, - }, - }, + Cpu: structs.AllocatedCpuResources{CpuShares: 512}, + Memory: structs.AllocatedMemoryResources{MemoryMB: 1024}, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + IP: "192.168.0.100", + MBits: 200, + }}, }, }, - Shared: structs.AllocatedSharedResources{ - DiskMB: 5 * 1024, - }, + Shared: structs.AllocatedSharedResources{DiskMB: 5 * 1024}, } require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job2)) @@ -2020,12 +1825,10 @@ func TestSystemSched_Preemption(t *testing.T) { job3.TaskGroups[0].Tasks[0].Resources = &structs.Resources{ CPU: 1024, MemoryMB: 2048, - Networks: []*structs.NetworkResource{ - { - Device: "eth0", - MBits: 400, - }, - }, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + MBits: 400, + }}, } alloc3 := mock.Alloc() @@ -2037,25 +1840,17 @@ func TestSystemSched_Preemption(t *testing.T) { alloc3.AllocatedResources = &structs.AllocatedResources{ Tasks: map[string]*structs.AllocatedTaskResources{ "web": { - Cpu: structs.AllocatedCpuResources{ - CpuShares: 1024, - }, - Memory: structs.AllocatedMemoryResources{ - MemoryMB: 25, - }, - Networks: []*structs.NetworkResource{ - { - Device: "eth0", - IP: "192.168.0.100", - ReservedPorts: []structs.Port{{Label: "web", Value: 80}}, - MBits: 400, - }, - }, + Cpu: structs.AllocatedCpuResources{CpuShares: 1024}, + Memory: structs.AllocatedMemoryResources{MemoryMB: 25}, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + IP: "192.168.0.100", + ReservedPorts: []structs.Port{{Label: "web", Value: 80}}, + MBits: 400, + }}, }, }, - Shared: structs.AllocatedSharedResources{ - DiskMB: 5 * 1024, - }, + Shared: structs.AllocatedSharedResources{DiskMB: 5 * 1024}, } require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc1, alloc2, alloc3})) @@ -2068,11 +1863,7 @@ func TestSystemSched_Preemption(t *testing.T) { 
job4.TaskGroups[0].Tasks[0].Resources = &structs.Resources{ CPU: 1024, MemoryMB: 2048, - Networks: []*structs.NetworkResource{ - { - MBits: 100, - }, - }, + Networks: []*structs.NetworkResource{{MBits: 100}}, } alloc4 := mock.Alloc() @@ -2112,12 +1903,10 @@ func TestSystemSched_Preemption(t *testing.T) { job.TaskGroups[0].Tasks[0].Resources = &structs.Resources{ CPU: 1948, MemoryMB: 256, - Networks: []*structs.NetworkResource{ - { - MBits: 800, - DynamicPorts: []structs.Port{{Label: "http"}}, - }, - }, + Networks: []*structs.NetworkResource{{ + MBits: 800, + DynamicPorts: []structs.Port{{Label: "http"}}, + }}, } require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) @@ -2133,21 +1922,20 @@ func TestSystemSched_Preemption(t *testing.T) { require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) // Process the evaluation - err := h.Process(NewSystemScheduler, eval) - require := require.New(t) - require.Nil(err) + err = h.Process(NewSystemScheduler, eval) + require.Nil(t, err) // Ensure a single plan - require.Equal(1, len(h.Plans)) + require.Equal(t, 1, len(h.Plans)) plan := h.Plans[0] - // Ensure the plan doesn't have annotations. - require.Nil(plan.Annotations) + // Ensure the plan doesn't have annotations + require.Nil(t, plan.Annotations) // Ensure the plan allocated on both nodes var planned []*structs.Allocation preemptingAllocId := "" - require.Equal(2, len(plan.NodeAllocation)) + require.Equal(t, 2, len(plan.NodeAllocation)) // The alloc that got placed on node 1 is the preemptor for _, allocList := range plan.NodeAllocation { @@ -2162,37 +1950,49 @@ func TestSystemSched_Preemption(t *testing.T) { // Lookup the allocations by JobID ws := memdb.NewWatchSet() out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) - require.NoError(err) + require.NoError(t, err) // Ensure all allocations placed - require.Equal(2, len(out)) + require.Equal(t, 2, len(out)) // Verify that one node has preempted allocs - require.NotNil(plan.NodePreemptions[nodes[0].ID]) + require.NotNil(t, plan.NodePreemptions[nodes[0].ID]) preemptedAllocs := plan.NodePreemptions[nodes[0].ID] // Verify that three jobs have preempted allocs - require.Equal(3, len(preemptedAllocs)) + require.Equal(t, 3, len(preemptedAllocs)) expectedPreemptedJobIDs := []string{job1.ID, job2.ID, job3.ID} // We expect job1, job2 and job3 to have preempted allocations // job4 should not have any allocs preempted for _, alloc := range preemptedAllocs { - require.Contains(expectedPreemptedJobIDs, alloc.JobID) + require.Contains(t, expectedPreemptedJobIDs, alloc.JobID) } // Look up the preempted allocs by job ID ws = memdb.NewWatchSet() for _, jobId := range expectedPreemptedJobIDs { out, err = h.State.AllocsByJob(ws, structs.DefaultNamespace, jobId, false) - require.NoError(err) + require.NoError(t, err) for _, alloc := range out { - require.Equal(structs.AllocDesiredStatusEvict, alloc.DesiredStatus) - require.Equal(fmt.Sprintf("Preempted by alloc ID %v", preemptingAllocId), alloc.DesiredDescription) + require.Equal(t, structs.AllocDesiredStatusEvict, alloc.DesiredStatus) + require.Equal(t, fmt.Sprintf("Preempted by alloc ID %v", preemptingAllocId), alloc.DesiredDescription) } } h.AssertEvalStatus(t, structs.EvalStatusComplete) +} +func TestSystemSched_canHandle(t *testing.T) { + s := SystemScheduler{sysbatch: false} + t.Run("system register", func(t *testing.T) { + require.True(t, s.canHandle(structs.EvalTriggerJobRegister)) + }) + t.Run("system 
scheduled", func(t *testing.T) { + require.False(t, s.canHandle(structs.EvalTriggerScheduled)) + }) + t.Run("system periodic", func(t *testing.T) { + require.False(t, s.canHandle(structs.EvalTriggerPeriodicJob)) + }) } diff --git a/scheduler/util.go b/scheduler/util.go index 7261f67deb8f..75f291e6410a 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -60,21 +60,19 @@ func (d *diffResult) Append(other *diffResult) { // need to be migrated (node is draining), the allocs that need to be evicted // (no longer required), those that should be ignored and those that are lost // that need to be replaced (running on a lost node). -// -// job is the job whose allocs is going to be diff-ed. -// taintedNodes is an index of the nodes which are either down or in drain mode -// by name. -// required is a set of allocations that must exist. -// allocs is a list of non terminal allocations. -// terminalAllocs is an index of the latest terminal allocations by name. -func diffSystemAllocsForNode(job *structs.Job, nodeID string, - eligibleNodes, taintedNodes map[string]*structs.Node, - required map[string]*structs.TaskGroup, allocs []*structs.Allocation, - terminalAllocs map[string]*structs.Allocation) *diffResult { - result := &diffResult{} +func diffSystemAllocsForNode( + job *structs.Job, // job whose allocs are going to be diff-ed + nodeID string, + eligibleNodes map[string]*structs.Node, + taintedNodes map[string]*structs.Node, // nodes which are down or in drain (by node name) + required map[string]*structs.TaskGroup, // set of allocations that must exist + allocs []*structs.Allocation, // non-terminal allocations that exist + terminal structs.TerminalByNodeByName, // latest terminal allocations (by node, name) +) *diffResult { + result := new(diffResult) // Scan the existing updates - existing := make(map[string]struct{}) + existing := make(map[string]struct{}) // set of alloc names for _, exist := range allocs { // Index the existing node name := exist.Name @@ -102,6 +100,17 @@ func diffSystemAllocsForNode(job *structs.Job, nodeID string, }) continue } + + // If we are a sysbatch job and terminal, ignore (or stop?) the alloc + if job.Type == structs.JobTypeSysBatch && exist.TerminalStatus() { + result.ignore = append(result.ignore, allocTuple{ + Name: name, + TaskGroup: tg, + Alloc: exist, + }) + continue + } + // If we are on a tainted node, we must migrate if we are a service or // if the batch allocation did not finish if node, ok := taintedNodes[exist.NodeID]; ok { @@ -154,14 +163,38 @@ func diffSystemAllocsForNode(job *structs.Job, nodeID string, // Scan the required groups for name, tg := range required { + // Check for an existing allocation - _, ok := existing[name] + if _, ok := existing[name]; !ok { + + // Check for a terminal sysbatch allocation, which should be not placed + // again unless the job has been updated. + if job.Type == structs.JobTypeSysBatch { + if alloc, termExists := terminal.Get(nodeID, name); termExists { + // the alloc is terminal, but now the job has been updated + if job.JobModifyIndex != alloc.Job.JobModifyIndex { + result.update = append(result.update, allocTuple{ + Name: name, + TaskGroup: tg, + Alloc: alloc, + }) + } else { + // alloc is terminal and job unchanged, leave it alone + result.ignore = append(result.ignore, allocTuple{ + Name: name, + TaskGroup: tg, + Alloc: alloc, + }) + } + continue + } + } + + // Require a placement if no existing allocation. 
If there
+     // is an existing allocation, we would have checked for a potential
+     // update or ignore above. Ignore placements for tainted or
+     // ineligible nodes
-   // Require a placement if no existing allocation. If there
-   // is an existing allocation, we would have checked for a potential
-   // update or ignore above. Ignore placements for tainted or
-   // ineligible nodes
-   if !ok {
      // Tainted and ineligible nodes for a non existing alloc
      // should be filtered out and not count towards ignore or place
      if _, tainted := taintedNodes[nodeID]; tainted {
        continue
      }
@@ -171,10 +204,11 @@ func diffSystemAllocsForNode(job *structs.Job, nodeID string,
        continue
      }

+     termOnNode, _ := terminal.Get(nodeID, name)
      allocTuple := allocTuple{
        Name:      name,
        TaskGroup: tg,
-       Alloc:     terminalAllocs[name],
+       Alloc:     termOnNode,
      }

      // If the new allocation isn't annotated with a previous allocation //
@@ -183,6 +217,7 @@ func diffSystemAllocsForNode(job *structs.Job, nodeID string,
      if allocTuple.Alloc == nil || allocTuple.Alloc.NodeID != nodeID {
        allocTuple.Alloc = &structs.Allocation{NodeID: nodeID}
      }
+
      result.place = append(result.place, allocTuple)
    }
  }
@@ -191,15 +226,13 @@
// diffSystemAllocs is like diffSystemAllocsForNode however, the allocations in the
// diffResult contain the specific nodeID they should be allocated on.
-//
-// job is the job whose allocs is going to be diff-ed.
-// nodes is a list of nodes in ready state.
-// taintedNodes is an index of the nodes which are either down or in drain mode
-// by name.
-// allocs is a list of non terminal allocations.
-// terminalAllocs is an index of the latest terminal allocations by name.
-func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]*structs.Node,
-   allocs []*structs.Allocation, terminalAllocs map[string]*structs.Allocation) *diffResult {
+func diffSystemAllocs(
+   job *structs.Job, // job whose allocations are going to be diff-ed
+   nodes []*structs.Node, // list of nodes in the ready state
+   taintedNodes map[string]*structs.Node, // nodes which are down or in drain mode (by name)
+   allocs []*structs.Allocation, // non-terminal allocations
+   terminal structs.TerminalByNodeByName, // latest terminal allocations (by node, name)
+) *diffResult {

  // Build a mapping of nodes to all their allocs.
  nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
@@ -219,9 +252,9 @@ func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[
  // Create the required task groups.
  required := materializeTaskGroups(job)

- result := &diffResult{}
+ result := new(diffResult)
  for nodeID, allocs := range nodeAllocs {
-   diff := diffSystemAllocsForNode(job, nodeID, eligibleNodes, taintedNodes, required, allocs, terminalAllocs)
+   diff := diffSystemAllocsForNode(job, nodeID, eligibleNodes, taintedNodes, required, allocs, terminal)
    result.Append(diff)
  }
diff --git a/scheduler/util_test.go b/scheduler/util_test.go
index 5c783d7e59db..865b343fdbd2 100644
--- a/scheduler/util_test.go
+++ b/scheduler/util_test.go
@@ -27,6 +27,76 @@ func TestMaterializeTaskGroups(t *testing.T) {
  }
}

+func newNode(name string) *structs.Node {
+ n := mock.Node()
+ n.Name = name
+ return n
+}
+
+func TestDiffSystemAllocsForNode_Sysbatch_terminal(t *testing.T) {
+ // For a sysbatch job, the scheduler should not re-place an allocation
+ // that has become terminal, unless the job has been updated.
+ + job := mock.SystemBatchJob() + required := materializeTaskGroups(job) + + eligible := map[string]*structs.Node{ + "node1": newNode("node1"), + } + + var live []*structs.Allocation // empty + + tainted := map[string]*structs.Node(nil) + + t.Run("current job", func(t *testing.T) { + terminal := structs.TerminalByNodeByName{ + "node1": map[string]*structs.Allocation{ + "my-sysbatch.pinger[0]": &structs.Allocation{ + ID: uuid.Generate(), + NodeID: "node1", + Name: "my-sysbatch.pinger[0]", + Job: job, + ClientStatus: structs.AllocClientStatusComplete, + }, + }, + } + + diff := diffSystemAllocsForNode(job, "node1", eligible, tainted, required, live, terminal) + require.Empty(t, diff.place) + require.Empty(t, diff.update) + require.Empty(t, diff.stop) + require.Empty(t, diff.migrate) + require.Empty(t, diff.lost) + require.True(t, len(diff.ignore) == 1 && diff.ignore[0].Alloc == terminal["node1"]["my-sysbatch.pinger[0]"]) + }) + + t.Run("outdated job", func(t *testing.T) { + previousJob := job.Copy() + previousJob.JobModifyIndex -= 1 + terminal := structs.TerminalByNodeByName{ + "node1": map[string]*structs.Allocation{ + "my-sysbatch.pinger[0]": &structs.Allocation{ + ID: uuid.Generate(), + NodeID: "node1", + Name: "my-sysbatch.pinger[0]", + Job: previousJob, + }, + }, + } + + expAlloc := terminal["node1"]["my-sysbatch.pinger[0]"] + expAlloc.NodeID = "node1" + + diff := diffSystemAllocsForNode(job, "node1", eligible, tainted, required, live, terminal) + require.Empty(t, diff.place) + require.Equal(t, 1, len(diff.update)) + require.Empty(t, diff.stop) + require.Empty(t, diff.migrate) + require.Empty(t, diff.lost) + require.Empty(t, diff.ignore) + }) +} + func TestDiffSystemAllocsForNode(t *testing.T) { job := mock.Job() required := materializeTaskGroups(job) @@ -99,28 +169,30 @@ func TestDiffSystemAllocsForNode(t *testing.T) { } // Have three terminal allocs - terminalAllocs := map[string]*structs.Allocation{ - "my-job.web[4]": { - ID: uuid.Generate(), - NodeID: "zip", - Name: "my-job.web[4]", - Job: job, - }, - "my-job.web[5]": { - ID: uuid.Generate(), - NodeID: "zip", - Name: "my-job.web[5]", - Job: job, - }, - "my-job.web[6]": { - ID: uuid.Generate(), - NodeID: "zip", - Name: "my-job.web[6]", - Job: job, + terminal := structs.TerminalByNodeByName{ + "zip": map[string]*structs.Allocation{ + "my-job.web[4]": { + ID: uuid.Generate(), + NodeID: "zip", + Name: "my-job.web[4]", + Job: job, + }, + "my-job.web[5]": { + ID: uuid.Generate(), + NodeID: "zip", + Name: "my-job.web[5]", + Job: job, + }, + "my-job.web[6]": { + ID: uuid.Generate(), + NodeID: "zip", + Name: "my-job.web[6]", + Job: job, + }, }, } - diff := diffSystemAllocsForNode(job, "zip", eligible, tainted, required, allocs, terminalAllocs) + diff := diffSystemAllocsForNode(job, "zip", eligible, tainted, required, allocs, terminal) place := diff.place update := diff.update migrate := diff.migrate @@ -147,12 +219,14 @@ func TestDiffSystemAllocsForNode(t *testing.T) { require.Equal(t, 6, len(place)) // Ensure that the allocations which are replacements of terminal allocs are - // annotated - for name, alloc := range terminalAllocs { - for _, allocTuple := range diff.place { - if name == allocTuple.Name { - require.True(t, reflect.DeepEqual(alloc, allocTuple.Alloc), - "expected: %#v, actual: %#v", alloc, allocTuple.Alloc) + // annotated. 
+ for _, m := range terminal {
+   for _, alloc := range m {
+     for _, tuple := range diff.place {
+       if alloc.Name == tuple.Name {
+         require.True(t, reflect.DeepEqual(alloc, tuple.Alloc),
+           "expected: %#v, actual: %#v", alloc, tuple.Alloc)
+       }
      }
    }
  }
@@ -199,9 +273,9 @@ func TestDiffSystemAllocsForNode_ExistingAllocIneligibleNode(t *testing.T) {
  }

  // No terminal allocs
- terminalAllocs := map[string]*structs.Allocation{}
+ terminal := make(structs.TerminalByNodeByName)

- diff := diffSystemAllocsForNode(job, eligibleNode.ID, eligible, tainted, required, allocs, terminalAllocs)
+ diff := diffSystemAllocsForNode(job, eligibleNode.ID, eligible, tainted, required, allocs, terminal)
  place := diff.place
  update := diff.update
  migrate := diff.migrate
@@ -276,17 +350,19 @@ func TestDiffSystemAllocs(t *testing.T) {
    },
  }

- // Have three terminal allocs
- terminalAllocs := map[string]*structs.Allocation{
-   "my-job.web[0]": {
-     ID:     uuid.Generate(),
-     NodeID: "pipe",
-     Name:   "my-job.web[0]",
-     Job:    job,
+ // Have one terminal alloc
+ terminal := structs.TerminalByNodeByName{
+   "pipe": map[string]*structs.Allocation{
+     "my-job.web[0]": {
+       ID:     uuid.Generate(),
+       NodeID: "pipe",
+       Name:   "my-job.web[0]",
+       Job:    job,
+     },
    },
  }

- diff := diffSystemAllocs(job, nodes, tainted, allocs, terminalAllocs)
+ diff := diffSystemAllocs(job, nodes, tainted, allocs, terminal)
  place := diff.place
  update := diff.update
  migrate := diff.migrate
@@ -313,12 +389,14 @@ func TestDiffSystemAllocs(t *testing.T) {
  require.Equal(t, 2, len(place))

  // Ensure that the allocations which are replacements of terminal allocs are
- // annotated
- for _, alloc := range terminalAllocs {
-   for _, allocTuple := range diff.place {
-     if alloc.NodeID == allocTuple.Alloc.NodeID {
-       require.True(t, reflect.DeepEqual(alloc, allocTuple.Alloc),
-         "expected: %#v, actual: %#v", alloc, allocTuple.Alloc)
+ // annotated.
+ for _, m := range terminal {
+   for _, alloc := range m {
+     for _, tuple := range diff.place {
+       if alloc.NodeID == tuple.Alloc.NodeID {
+         require.True(t, reflect.DeepEqual(alloc, tuple.Alloc),
+           "expected: %#v, actual: %#v", alloc, tuple.Alloc)
+       }
      }
    }
  }
diff --git a/vendor/github.com/hashicorp/nomad/api/operator.go b/vendor/github.com/hashicorp/nomad/api/operator.go
index d5bc5d061d56..de57bffef4b2 100644
--- a/vendor/github.com/hashicorp/nomad/api/operator.go
+++ b/vendor/github.com/hashicorp/nomad/api/operator.go
@@ -159,9 +159,10 @@ const (
// PreemptionConfig specifies whether preemption is enabled based on scheduler type
type PreemptionConfig struct {
- SystemSchedulerEnabled  bool
- BatchSchedulerEnabled   bool
- ServiceSchedulerEnabled bool
+ SystemSchedulerEnabled   bool
+ SysBatchSchedulerEnabled bool
+ BatchSchedulerEnabled    bool
+ ServiceSchedulerEnabled  bool
}

// SchedulerGetConfiguration is used to query the current Scheduler configuration.
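The scheduler and test hunks above consult the new `structs.TerminalByNodeByName` index through its `Get` helper, whose definition does not appear in these hunks. As a rough, self-contained sketch of the shape that usage implies (the trimmed-down `Allocation` type and the `main` driver are illustrative stand-ins, not the real `nomad/structs` definitions):

```go
package main

import "fmt"

// Allocation is a cut-down stand-in for structs.Allocation,
// carrying only the fields this sketch needs.
type Allocation struct {
	ID     string
	NodeID string
	Name   string
}

// TerminalByNodeByName indexes the latest terminal allocation
// first by node ID, then by allocation name, which is how
// diffSystemAllocsForNode looks up prior sysbatch runs.
type TerminalByNodeByName map[string]map[string]*Allocation

// Get returns the terminal allocation for the node/name pair,
// and whether such an allocation exists.
func (t TerminalByNodeByName) Get(nodeID, name string) (*Allocation, bool) {
	byName, ok := t[nodeID]
	if !ok {
		return nil, false
	}
	alloc, ok := byName[name]
	return alloc, ok
}

func main() {
	terminal := TerminalByNodeByName{
		"node1": {
			"my-sysbatch.pinger[0]": {ID: "a1", NodeID: "node1", Name: "my-sysbatch.pinger[0]"},
		},
	}

	// The scheduler ignores a terminal alloc when the job is unchanged
	// and updates or places otherwise; here we only demonstrate the lookup.
	if alloc, ok := terminal.Get("node1", "my-sysbatch.pinger[0]"); ok {
		fmt.Println("latest terminal alloc:", alloc.ID)
	}
}
```

Keying by node before name is what lets sysbatch treat "ran to completion on this node" separately per client, instead of the old flat name-keyed `terminalAllocs` map.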
diff --git a/website/pages/docs/configuration/server.mdx b/website/pages/docs/configuration/server.mdx index 619911b7351f..9bec8ddfbeee 100644 --- a/website/pages/docs/configuration/server.mdx +++ b/website/pages/docs/configuration/server.mdx @@ -291,9 +291,10 @@ server { scheduler_algorithm = "spread" preemption_config { - batch_scheduler_enabled = true - system_scheduler_enabled = true - service_scheduler_enabled = true + batch_scheduler_enabled = true + system_scheduler_enabled = true + service_scheduler_enabled = true + sysbatch_scheduler_enabled = true } } } diff --git a/website/pages/docs/job-specification/job.mdx b/website/pages/docs/job-specification/job.mdx index c12b83320dbc..b73a6f2f8dfe 100644 --- a/website/pages/docs/job-specification/job.mdx +++ b/website/pages/docs/job-specification/job.mdx @@ -114,7 +114,7 @@ job "docs" { node if any of its allocation statuses become "failed". - `type` `(string: "service")` - Specifies the [Nomad scheduler][scheduler] to - use. Nomad provides the `service`, `system` and `batch` schedulers. + use. Nomad provides the `service`, `system`, `batch`, and `sysbatch` schedulers. - `update` ([Update][update]: nil) - Specifies the task's update strategy. When omitted, rolling updates are disabled. diff --git a/website/pages/docs/job-specification/reschedule.mdx b/website/pages/docs/job-specification/reschedule.mdx index 9234ca725eb3..96d340f473ea 100644 --- a/website/pages/docs/job-specification/reschedule.mdx +++ b/website/pages/docs/job-specification/reschedule.mdx @@ -47,8 +47,8 @@ job "docs" { } ``` -~> The reschedule stanza does not apply to `system` jobs because they run on -every node. +~> The reschedule stanza does not apply to `system` or `sysbatch` jobs because +they run on every node. ## `reschedule` Parameters diff --git a/website/pages/docs/job-specification/restart.mdx b/website/pages/docs/job-specification/restart.mdx index 6e9e771db7e6..84b53ce9fa66 100644 --- a/website/pages/docs/job-specification/restart.mdx +++ b/website/pages/docs/job-specification/restart.mdx @@ -14,7 +14,7 @@ description: The "restart" stanza configures a group's behavior on task failure. ]} /> -The `restart` stanza configures a tasks's behavior on task failure. Restarts +The `restart` stanza configures a task's behavior on task failure. Restarts happen on the client that is running the task. ```hcl @@ -36,9 +36,9 @@ For example, assuming that the task group restart policy is: ```hcl restart { - interval = "30m" attempts = 2 delay = "15s" + interval = "30m" mode = "fail" } ``` @@ -55,9 +55,9 @@ then the effective restart policy for the task will be: ```hcl restart { - interval = "30m" attempts = 5 delay = "15s" + interval = "30m" mode = "fail" } ``` @@ -87,7 +87,7 @@ restart { The values for many of the `restart` parameters vary by job type. Here are the defaults by job type: -- The default batch restart policy is: +- The default restart policy for `batch` jobs is: ```hcl restart { @@ -98,13 +98,13 @@ defaults by job type: } ``` -- The default service and system job restart policy is: +- The default restart policy for `service`, `system`, and `sysbatch` jobs is: ```hcl restart { - interval = "30m" attempts = 2 delay = "15s" + interval = "30m" mode = "fail" } ``` diff --git a/website/pages/docs/schedulers.mdx b/website/pages/docs/schedulers.mdx index 304f6d60c241..120530e2f198 100644 --- a/website/pages/docs/schedulers.mdx +++ b/website/pages/docs/schedulers.mdx @@ -7,9 +7,9 @@ description: Learn about Nomad's various schedulers. 
# Schedulers

-Nomad has three scheduler types that can be used when creating your job:
-`service`, `batch` and `system`. Here we will describe the differences between
-each of these schedulers.
+Nomad has four scheduler types that can be used when creating your job:
+`service`, `batch`, `system`, and `sysbatch`. Here we will describe the
+differences between each of these schedulers.

## Service

@@ -61,8 +61,30 @@ Systems jobs are intended to run until explicitly stopped either by an operator
or [preemption]. If a system task exits it is considered a failure and handled
according to the job's [restart] stanza; system jobs do not have rescheduling.

+## System Batch
+
+The `sysbatch` scheduler is used to register jobs that should be run to completion
+on all clients that meet the job's constraints. The `sysbatch` scheduler will
+schedule jobs similarly to the `system` scheduler, but, like a `batch` job, once
+a task exits successfully it is not restarted on that client.
+
+This scheduler type is useful for issuing "one-off" commands to be run on every
+node in the cluster. Sysbatch jobs can also be created as [periodic] and [parameterized]
+jobs. Since these tasks are managed by Nomad, they can take advantage of job
+updating, service discovery, monitoring, and more.
+
+The `sysbatch` scheduler will preempt lower priority tasks running on a node if there
+is not enough capacity to place the job. See [preemption] for details on how the
+tasks to preempt are chosen.
+
+Sysbatch jobs are intended to run until successful completion, until explicitly
+stopped by an operator, or until evicted through [preemption]. Sysbatch tasks that
+exit with an error are handled according to the job's [restart] stanza.
+
[borg]: https://research.google.com/pubs/pub43438.html
-[sparrow]: https://cs.stanford.edu/~matei/papers/2013/sosp_sparrow.pdf
+[parameterized]: /docs/job-specification/parameterized
+[periodic]: /docs/job-specification/periodic
[preemption]: /docs/internals/scheduling/preemption
[restart]: /docs/job-specification/restart
[reschedule]: /docs/job-specification/reschedule
+[sparrow]: https://cs.stanford.edu/~matei/papers/2013/sosp_sparrow.pdf
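For illustration, a minimal `sysbatch` job spec might look like the following sketch; only the `type = "sysbatch"` line selects this scheduler, and the job name, driver, image, and command here are made up for the example rather than taken from this patch:

```hcl
job "one-off" {
  datacenters = ["dc1"]

  # Run to completion once on every eligible client.
  type = "sysbatch"

  group "sweep" {
    task "sweep" {
      driver = "docker"

      config {
        image   = "busybox:1"
        command = "/bin/sh"
        args    = ["-c", "echo running one-off task on $(hostname)"]
      }
    }
  }
}
```

Registering such a job places one short-lived allocation per compatible node; once each allocation reaches a terminal state, the scheduler leaves it alone until the job is updated, as implemented in `diffSystemAllocsForNode` above.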