diff --git a/CHANGELOG.md b/CHANGELOG.md index 5057e1ac9144..7e0c9e4fbb6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ FEATURES: * **Event Stream**: Subscribe to change events as they occur in real time. [[GH-9013](https://github.com/hashicorp/nomad/issues/9013)] * **Namespaces OSS**: Namespaces are now available in open source Nomad. [[GH-9135](https://github.com/hashicorp/nomad/issues/9135)] * **Topology Visualization**: See all of the clients and allocations in a cluster at once. [[GH-9077](https://github.com/hashicorp/nomad/issues/9077)] +* **System Batch Scheduling**: New `sysbatch` scheduler type for running short-lived jobs across all nodes. [[GH-9160](https://github.com/hashicorp/nomad/pull/9160)] IMPROVEMENTS: * core: Improved job deregistration error logging. [[GH-8745](https://github.com/hashicorp/nomad/issues/8745)] diff --git a/api/operator.go b/api/operator.go index d5bc5d061d56..de57bffef4b2 100644 --- a/api/operator.go +++ b/api/operator.go @@ -159,9 +159,10 @@ const ( // PreemptionConfig specifies whether preemption is enabled based on scheduler type type PreemptionConfig struct { - SystemSchedulerEnabled bool - BatchSchedulerEnabled bool - ServiceSchedulerEnabled bool + SystemSchedulerEnabled bool + SysBatchSchedulerEnabled bool + BatchSchedulerEnabled bool + ServiceSchedulerEnabled bool } // SchedulerGetConfiguration is used to query the current Scheduler configuration. diff --git a/client/allocrunner/taskrunner/restarts/restarts.go b/client/allocrunner/taskrunner/restarts/restarts.go index 6ee0056ccd8b..429ee07a0384 100644 --- a/client/allocrunner/taskrunner/restarts/restarts.go +++ b/client/allocrunner/taskrunner/restarts/restarts.go @@ -14,15 +14,19 @@ const ( // jitter is the percent of jitter added to restart delays. jitter = 0.25 - ReasonNoRestartsAllowed = "Policy allows no restarts" - ReasonUnrecoverableErrror = "Error was unrecoverable" - ReasonWithinPolicy = "Restart within policy" - ReasonDelay = "Exceeded allowed attempts, applying a delay" + ReasonNoRestartsAllowed = "Policy allows no restarts" + ReasonUnrecoverableError = "Error was unrecoverable" + ReasonWithinPolicy = "Restart within policy" + ReasonDelay = "Exceeded allowed attempts, applying a delay" ) func NewRestartTracker(policy *structs.RestartPolicy, jobType string, tlc *structs.TaskLifecycleConfig) *RestartTracker { - // Batch jobs should not restart if they exit successfully - onSuccess := jobType != structs.JobTypeBatch + onSuccess := true + + // Batch & SysBatch jobs should not restart if they exit successfully + if jobType == structs.JobTypeBatch || jobType == structs.JobTypeSysBatch { + onSuccess = false + } // Prestart sidecars should get restarted on success if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPrestart { @@ -196,7 +200,7 @@ func (r *RestartTracker) GetState() (string, time.Duration) { if r.startErr != nil { // If the error is not recoverable, do not restart.
if !structs.IsRecoverable(r.startErr) { - r.reason = ReasonUnrecoverableErrror + r.reason = ReasonUnrecoverableError return structs.TaskNotRestarting, 0 } } else if r.exitRes != nil { diff --git a/command/agent/operator_endpoint.go b/command/agent/operator_endpoint.go index ed4a3c4cb732..e008cd506357 100644 --- a/command/agent/operator_endpoint.go +++ b/command/agent/operator_endpoint.go @@ -261,9 +261,10 @@ func (s *HTTPServer) schedulerUpdateConfig(resp http.ResponseWriter, req *http.R args.Config = structs.SchedulerConfiguration{ SchedulerAlgorithm: structs.SchedulerAlgorithm(conf.SchedulerAlgorithm), PreemptionConfig: structs.PreemptionConfig{ - SystemSchedulerEnabled: conf.PreemptionConfig.SystemSchedulerEnabled, - BatchSchedulerEnabled: conf.PreemptionConfig.BatchSchedulerEnabled, - ServiceSchedulerEnabled: conf.PreemptionConfig.ServiceSchedulerEnabled}, + SystemSchedulerEnabled: conf.PreemptionConfig.SystemSchedulerEnabled, + SysBatchSchedulerEnabled: conf.PreemptionConfig.SysBatchSchedulerEnabled, + BatchSchedulerEnabled: conf.PreemptionConfig.BatchSchedulerEnabled, + ServiceSchedulerEnabled: conf.PreemptionConfig.ServiceSchedulerEnabled}, } if err := args.Config.Validate(); err != nil { diff --git a/command/agent/operator_endpoint_test.go b/command/agent/operator_endpoint_test.go index 8814fad4fa25..316c16ca3659 100644 --- a/command/agent/operator_endpoint_test.go +++ b/command/agent/operator_endpoint_test.go @@ -282,6 +282,7 @@ func TestOperator_SchedulerGetConfiguration(t *testing.T) { // Only system jobs can preempt other jobs by default. require.True(out.SchedulerConfig.PreemptionConfig.SystemSchedulerEnabled) + require.False(out.SchedulerConfig.PreemptionConfig.SysBatchSchedulerEnabled) require.False(out.SchedulerConfig.PreemptionConfig.BatchSchedulerEnabled) require.False(out.SchedulerConfig.PreemptionConfig.ServiceSchedulerEnabled) }) @@ -314,6 +315,8 @@ func TestOperator_SchedulerSetConfiguration(t *testing.T) { err = s.RPC("Operator.SchedulerGetConfiguration", &args, &reply) require.Nil(err) require.True(reply.SchedulerConfig.PreemptionConfig.SystemSchedulerEnabled) + require.False(reply.SchedulerConfig.PreemptionConfig.SysBatchSchedulerEnabled) + require.False(reply.SchedulerConfig.PreemptionConfig.BatchSchedulerEnabled) require.True(reply.SchedulerConfig.PreemptionConfig.ServiceSchedulerEnabled) }) } @@ -324,6 +327,7 @@ func TestOperator_SchedulerCASConfiguration(t *testing.T) { require := require.New(t) body := bytes.NewBuffer([]byte(`{"PreemptionConfig": { "SystemSchedulerEnabled": true, + "SysBatchSchedulerEnabled":true, "BatchSchedulerEnabled":true }}`)) req, _ := http.NewRequest("PUT", "/v1/operator/scheduler/configuration", body) @@ -346,7 +350,9 @@ func TestOperator_SchedulerCASConfiguration(t *testing.T) { t.Fatalf("err: %v", err) } require.True(reply.SchedulerConfig.PreemptionConfig.SystemSchedulerEnabled) + require.True(reply.SchedulerConfig.PreemptionConfig.SysBatchSchedulerEnabled) require.True(reply.SchedulerConfig.PreemptionConfig.BatchSchedulerEnabled) + require.False(reply.SchedulerConfig.PreemptionConfig.ServiceSchedulerEnabled) // Create a CAS request, bad index { @@ -387,7 +393,9 @@ func TestOperator_SchedulerCASConfiguration(t *testing.T) { t.Fatalf("err: %v", err) } require.False(reply.SchedulerConfig.PreemptionConfig.SystemSchedulerEnabled) + require.False(reply.SchedulerConfig.PreemptionConfig.SysBatchSchedulerEnabled) require.False(reply.SchedulerConfig.PreemptionConfig.BatchSchedulerEnabled) + 
require.False(reply.SchedulerConfig.PreemptionConfig.ServiceSchedulerEnabled) }) } diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index 2e6c6db2ce8d..a534d8f38b4e 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -27,8 +27,9 @@ import ( _ "github.com/hashicorp/nomad/e2e/podman" _ "github.com/hashicorp/nomad/e2e/quotas" _ "github.com/hashicorp/nomad/e2e/rescheduling" + _ "github.com/hashicorp/nomad/e2e/scheduler_sysbatch" + _ "github.com/hashicorp/nomad/e2e/scheduler_system" _ "github.com/hashicorp/nomad/e2e/spread" - _ "github.com/hashicorp/nomad/e2e/systemsched" _ "github.com/hashicorp/nomad/e2e/taskevents" _ "github.com/hashicorp/nomad/e2e/vaultsecrets" _ "github.com/hashicorp/nomad/e2e/volumes" diff --git a/e2e/e2eutil/utils.go b/e2e/e2eutil/utils.go index 6cf10d574f42..d042c1743b1d 100644 --- a/e2e/e2eutil/utils.go +++ b/e2e/e2eutil/utils.go @@ -201,6 +201,30 @@ func WaitForAllocStopped(t *testing.T, nomadClient *api.Client, allocID string) }) } +func WaitForAllocStatus(t *testing.T, nomadClient *api.Client, allocID string, status string) { + testutil.WaitForResultRetries(retries, func() (bool, error) { + time.Sleep(time.Millisecond * 100) + alloc, _, err := nomadClient.Allocations().Info(allocID, nil) + if err != nil { + return false, err + } + switch alloc.ClientStatus { + case status: + return true, nil + default: + return false, fmt.Errorf("expected %s alloc, but was: %s", status, alloc.ClientStatus) + } + }, func(err error) { + t.Fatalf("failed to wait on alloc: %v", err) + }) +} + +func WaitForAllocsStatus(t *testing.T, nomadClient *api.Client, allocIDs []string, status string) { + for _, allocID := range allocIDs { + WaitForAllocStatus(t, nomadClient, allocID, status) + } +} + func AllocIDsFromAllocationListStubs(allocs []*api.AllocationListStub) []string { allocIDs := make([]string, 0, len(allocs)) for _, alloc := range allocs { diff --git a/e2e/scheduler_sysbatch/input/sysbatch_dispatch.nomad b/e2e/scheduler_sysbatch/input/sysbatch_dispatch.nomad new file mode 100644 index 000000000000..fcc369efdb6d --- /dev/null +++ b/e2e/scheduler_sysbatch/input/sysbatch_dispatch.nomad @@ -0,0 +1,30 @@ +job "sysbatchjob" { + datacenters = ["dc1"] + + type = "sysbatch" + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + parameterized { + payload = "forbidden" + meta_required = ["KEY"] + } + + group "sysbatch_job_group" { + count = 1 + + task "sysbatch_task" { + driver = "docker" + + config { + image = "bash:5" + + command = "bash" + args = ["-c", "ping -c 10 example.com"] + } + } + } +} diff --git a/e2e/scheduler_sysbatch/input/sysbatch_job_fast.nomad b/e2e/scheduler_sysbatch/input/sysbatch_job_fast.nomad new file mode 100644 index 000000000000..5aaba9072ba1 --- /dev/null +++ b/e2e/scheduler_sysbatch/input/sysbatch_job_fast.nomad @@ -0,0 +1,25 @@ +job "sysbatchjob" { + datacenters = ["dc1"] + + type = "sysbatch" + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "sysbatch_job_group" { + count = 1 + + task "sysbatch_task" { + driver = "docker" + + config { + image = "bash:5" + + command = "bash" + args = ["-c", "ping -c 10 example.com"] + } + } + } +} diff --git a/e2e/scheduler_sysbatch/input/sysbatch_job_slow.nomad b/e2e/scheduler_sysbatch/input/sysbatch_job_slow.nomad new file mode 100644 index 000000000000..3a0b667eb25f --- /dev/null +++ b/e2e/scheduler_sysbatch/input/sysbatch_job_slow.nomad @@ -0,0 +1,25 @@ +job "sysbatchjob" { + datacenters = ["dc1"] + + type = "sysbatch" + + constraint { + attribute = "${attr.kernel.name}" 
+ value = "linux" + } + + group "sysbatch_job_group" { + count = 1 + + task "sysbatch_task" { + driver = "docker" + + config { + image = "bash:5" + + command = "bash" + args = ["-c", "ping -c 100000 example.com"] + } + } + } +} diff --git a/e2e/scheduler_sysbatch/input/sysbatch_periodic.nomad b/e2e/scheduler_sysbatch/input/sysbatch_periodic.nomad new file mode 100644 index 000000000000..d3521a5355f3 --- /dev/null +++ b/e2e/scheduler_sysbatch/input/sysbatch_periodic.nomad @@ -0,0 +1,30 @@ +job "sysbatchjob" { + datacenters = ["dc1"] + + type = "sysbatch" + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + periodic { + cron = "*/15 * * * * *" + prohibit_overlap = true + } + + group "sysbatch_job_group" { + count = 1 + + task "sysbatch_task" { + driver = "docker" + + config { + image = "bash:5" + + command = "bash" + args = ["-c", "ping -c 10 example.com"] + } + } + } +} diff --git a/e2e/scheduler_sysbatch/sysbatch.go b/e2e/scheduler_sysbatch/sysbatch.go new file mode 100644 index 000000000000..0d1c8f4dfc3c --- /dev/null +++ b/e2e/scheduler_sysbatch/sysbatch.go @@ -0,0 +1,265 @@ +package scheduler_sysbatch + +import ( + "strings" + "time" + + "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/e2e/framework" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type SysBatchSchedulerTest struct { + framework.TC + jobIDs []string +} + +func init() { + framework.AddSuites(&framework.TestSuite{ + Component: "SysBatchScheduler", + CanRunLocal: true, + Cases: []framework.TestCase{ + new(SysBatchSchedulerTest), + }, + }) +} + +func (tc *SysBatchSchedulerTest) BeforeAll(f *framework.F) { + // Ensure cluster has leader before running tests + e2eutil.WaitForLeader(f.T(), tc.Nomad()) + e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 4) +} + +func (tc *SysBatchSchedulerTest) TestJobRunBasic(f *framework.F) { + t := f.T() + nomadClient := tc.Nomad() + + // submit a fast sysbatch job + jobID := "sysbatch_run_basic" + tc.jobIDs = append(tc.jobIDs, jobID) + e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_fast.nomad", jobID, "") + + // get our allocations for this sysbatch job + jobs := nomadClient.Jobs() + allocs, _, err := jobs.Allocations(jobID, true, nil) + require.NoError(t, err) + + // make sure this is job is being run on "all" the linux clients + require.True(t, len(allocs) >= 3) + + // wait for every alloc to reach completion + allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs) + e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete) +} + +func (tc *SysBatchSchedulerTest) TestJobStopEarly(f *framework.F) { + t := f.T() + nomadClient := tc.Nomad() + + // submit a slow sysbatch job + jobID := "sysbatch_stop_early" + tc.jobIDs = append(tc.jobIDs, jobID) + e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_slow.nomad", jobID, "") + + // get our allocations for this sysbatch job + jobs := nomadClient.Jobs() + allocs, _, err := jobs.Allocations(jobID, true, nil) + require.NoError(t, err) + + // make sure this is job is being run on "all" the linux clients + require.True(t, len(allocs) >= 3) + + // wait for every alloc to reach running status + allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs) + e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusRunning) + + // stop the job before allocs reach 
completion + _, _, err = jobs.Deregister(jobID, false, nil) + require.NoError(t, err) +} + +func (tc *SysBatchSchedulerTest) TestJobReplaceRunning(f *framework.F) { + t := f.T() + nomadClient := tc.Nomad() + + // submit a slow sysbatch job + jobID := "sysbatch_replace_running" + tc.jobIDs = append(tc.jobIDs, jobID) + e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_slow.nomad", jobID, "") + + // get our allocations for this sysbatch job + jobs := nomadClient.Jobs() + allocs, _, err := jobs.Allocations(jobID, true, nil) + require.NoError(t, err) + + // make sure this job is being run on "all" the linux clients + require.True(t, len(allocs) >= 3) + + // wait for every alloc to reach running status + allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs) + e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusRunning) + + // replace the slow job with the fast job + intermediate := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_fast.nomad", jobID, "") + + // get the allocs for the new updated job + var updated []*api.AllocationListStub + for _, alloc := range intermediate { + if alloc.JobVersion == 1 { + updated = append(updated, alloc) + } + } + + // collect the IDs of the new allocs + newAllocIDs := e2eutil.AllocIDsFromAllocationListStubs(updated) + + // make sure this new job is being run on "all" the linux clients + require.True(t, len(updated) >= 3) + + // wait for the allocs of the fast job to complete + e2eutil.WaitForAllocsStatus(t, nomadClient, newAllocIDs, structs.AllocClientStatusComplete) +} + +func (tc *SysBatchSchedulerTest) TestJobReplaceDead(f *framework.F) { + t := f.T() + nomadClient := tc.Nomad() + + // submit a fast sysbatch job + jobID := "sysbatch_replace_dead" + tc.jobIDs = append(tc.jobIDs, jobID) + e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_fast.nomad", jobID, "") + + // get the allocations for this sysbatch job + jobs := nomadClient.Jobs() + allocs, _, err := jobs.Allocations(jobID, true, nil) + require.NoError(t, err) + + // make sure this job is being run on "all" the linux clients + require.True(t, len(allocs) >= 3) + + // wait for every alloc to reach complete status + allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs) + e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete) + + // replace the fast job with the slow job + intermediate := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_slow.nomad", jobID, "") + + // get the allocs for the new updated job + var updated []*api.AllocationListStub + for _, alloc := range intermediate { + if alloc.JobVersion == 1 { + updated = append(updated, alloc) + } + } + + // collect the IDs of the new allocs + upAllocIDs := e2eutil.AllocIDsFromAllocationListStubs(updated) + + // make sure this new job is being run on "all" the linux clients + require.True(t, len(updated) >= 3) + + // wait for the allocs of the slow job to be running + e2eutil.WaitForAllocsStatus(t, nomadClient, upAllocIDs, structs.AllocClientStatusRunning) +} + +func (tc *SysBatchSchedulerTest) TestJobRunPeriodic(f *framework.F) { + t := f.T() + nomadClient := tc.Nomad() + + // submit a fast sysbatch job + jobID := "sysbatch_job_periodic" + tc.jobIDs = append(tc.jobIDs, jobID) + err := e2eutil.Register(jobID, "scheduler_sysbatch/input/sysbatch_periodic.nomad") + require.NoError(t, err) + + // force the
cron job to run + jobs := nomadClient.Jobs() + _, _, err = jobs.PeriodicForce(jobID, nil) + require.NoError(t, err) + + // find the cron job that got launched + jobsList, _, err := jobs.List(nil) + require.NoError(t, err) + cronJobID := "" + for _, job := range jobsList { + if strings.HasPrefix(job.Name, "sysbatch_job_periodic/periodic-") { + cronJobID = job.Name + break + } + } + require.NotEmpty(t, cronJobID) + tc.jobIDs = append(tc.jobIDs, cronJobID) + + // wait for allocs of the cron job + var allocs []*api.AllocationListStub + require.True(t, assert.Eventually(t, func() bool { + var err error + allocs, _, err = jobs.Allocations(cronJobID, false, nil) + require.NoError(t, err) + return len(allocs) >= 3 + }, 30*time.Second, time.Second)) + + // wait for every cron job alloc to reach completion + allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs) + e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete) +} + +func (tc *SysBatchSchedulerTest) TestJobRunDispatch(f *framework.F) { + t := f.T() + nomadClient := tc.Nomad() + + // submit a fast sysbatch dispatch job + jobID := "sysbatch_job_dispatch" + tc.jobIDs = append(tc.jobIDs, jobID) + err := e2eutil.Register(jobID, "scheduler_sysbatch/input/sysbatch_dispatch.nomad") + require.NoError(t, err) + + // dispatch the sysbatch job + jobs := nomadClient.Jobs() + result, _, err := jobs.Dispatch(jobID, map[string]string{ + "KEY": "value", + }, nil, nil) + require.NoError(t, err) + + // grab the new dispatched jobID + dispatchID := result.DispatchedJobID + tc.jobIDs = append(tc.jobIDs, dispatchID) + + // wait for allocs of the dispatched job + var allocs []*api.AllocationListStub + require.True(t, assert.Eventually(t, func() bool { + var err error + allocs, _, err = jobs.Allocations(dispatchID, false, nil) + require.NoError(t, err) + return len(allocs) >= 3 + }, 30*time.Second, time.Second)) + + // wait for every dispatch alloc to reach completion + allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs) + e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete) +} + +func (tc *SysBatchSchedulerTest) AfterEach(f *framework.F) { + nomadClient := tc.Nomad() + + // Mark all nodes eligible + nodesAPI := tc.Nomad().Nodes() + nodes, _, _ := nodesAPI.List(nil) + for _, node := range nodes { + _, _ = nodesAPI.ToggleEligibility(node.ID, true, nil) + } + + jobs := nomadClient.Jobs() + + // Stop all jobs in test + for _, id := range tc.jobIDs { + _, _, _ = jobs.Deregister(id, true, nil) + } + tc.jobIDs = []string{} + + // Garbage collect + _ = nomadClient.System().GarbageCollect() +} diff --git a/e2e/systemsched/input/system_job0.nomad b/e2e/scheduler_system/input/system_job0.nomad similarity index 100% rename from e2e/systemsched/input/system_job0.nomad rename to e2e/scheduler_system/input/system_job0.nomad diff --git a/e2e/systemsched/input/system_job1.nomad b/e2e/scheduler_system/input/system_job1.nomad similarity index 100% rename from e2e/systemsched/input/system_job1.nomad rename to e2e/scheduler_system/input/system_job1.nomad diff --git a/e2e/systemsched/systemsched.go b/e2e/scheduler_system/systemsched.go similarity index 87% rename from e2e/systemsched/systemsched.go rename to e2e/scheduler_system/systemsched.go index 09b3f9141b33..5ec17ef28547 100644 --- a/e2e/systemsched/systemsched.go +++ b/e2e/scheduler_system/systemsched.go @@ -1,4 +1,4 @@ -package systemsched +package scheduler_system import ( "github.com/hashicorp/nomad/api" @@ -35,16 +35,14 @@ func (tc 
*SystemSchedTest) TestJobUpdateOnIneligbleNode(f *framework.F) { jobID := "system_deployment" tc.jobIDs = append(tc.jobIDs, jobID) - e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "systemsched/input/system_job0.nomad", jobID, "") + e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_system/input/system_job0.nomad", jobID, "") jobs := nomadClient.Jobs() allocs, _, err := jobs.Allocations(jobID, true, nil) require.NoError(t, err) + require.True(t, len(allocs) >= 3) - var allocIDs []string - for _, alloc := range allocs { - allocIDs = append(allocIDs, alloc.ID) - } + allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs) // Wait for allocations to get past initial pending state e2eutil.WaitForAllocsNotPending(t, nomadClient, allocIDs) @@ -58,13 +56,9 @@ func (tc *SystemSchedTest) TestJobUpdateOnIneligbleNode(f *framework.F) { // Assert all jobs still running jobs = nomadClient.Jobs() allocs, _, err = jobs.Allocations(jobID, true, nil) - - allocIDs = nil - for _, alloc := range allocs { - allocIDs = append(allocIDs, alloc.ID) - } - require.NoError(t, err) + + allocIDs = e2eutil.AllocIDsFromAllocationListStubs(allocs) allocForDisabledNode := make(map[string]*api.AllocationListStub) // Wait for allocs to run and collect allocs on ineligible node @@ -89,19 +83,15 @@ func (tc *SystemSchedTest) TestJobUpdateOnIneligbleNode(f *framework.F) { require.Len(t, allocForDisabledNode, 1) // Update job - e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "systemsched/input/system_job1.nomad", jobID, "") + e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_system/input/system_job1.nomad", jobID, "") // Get updated allocations jobs = nomadClient.Jobs() allocs, _, err = jobs.Allocations(jobID, false, nil) require.NoError(t, err) - allocIDs = nil - for _, alloc := range allocs { - allocIDs = append(allocIDs, alloc.ID) - } - // Wait for allocs to start + allocIDs = e2eutil.AllocIDsFromAllocationListStubs(allocs) e2eutil.WaitForAllocsNotPending(t, nomadClient, allocIDs) // Get latest alloc status now that they are no longer pending diff --git a/helper/uuid/uuid.go b/helper/uuid/uuid.go index 145c817803d0..c0eec178ea9d 100644 --- a/helper/uuid/uuid.go +++ b/helper/uuid/uuid.go @@ -19,3 +19,9 @@ func Generate() string { buf[8:10], buf[10:16]) } + +// Short is used to generate a random shortened UUID. +func Short() string { + id := Generate() + return id[len(id)-8:] +} diff --git a/nomad/config.go b/nomad/config.go index 08e4f562f5c3..9575416c3bde 100644 --- a/nomad/config.go +++ b/nomad/config.go @@ -323,8 +323,8 @@ type Config struct { AutopilotInterval time.Duration // DefaultSchedulerConfig configures the initial scheduler config to be persisted in Raft. - // Once the cluster is bootstrapped, and Raft persists the config (from here or through API), - // This value is ignored. + // Once the cluster is bootstrapped and Raft persists the config (from here or through the API), + // this value is ignored. DefaultSchedulerConfig structs.SchedulerConfiguration `hcl:"default_scheduler_config"` // PluginLoader is used to load plugins.
@@ -433,9 +433,10 @@ func DefaultConfig() *Config { DefaultSchedulerConfig: structs.SchedulerConfiguration{ SchedulerAlgorithm: structs.SchedulerAlgorithmBinpack, PreemptionConfig: structs.PreemptionConfig{ - SystemSchedulerEnabled: true, - BatchSchedulerEnabled: false, - ServiceSchedulerEnabled: false, + SystemSchedulerEnabled: true, + SysBatchSchedulerEnabled: false, + BatchSchedulerEnabled: false, + ServiceSchedulerEnabled: false, }, }, } diff --git a/nomad/core_sched.go b/nomad/core_sched.go index 1ac135d0aaea..eb796f66bcaa 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -136,9 +136,7 @@ OUTER: gc, allocs, err := c.gcEval(eval, oldThreshold, true) if err != nil { continue OUTER - } - - if gc { + } else if gc { jobEval = append(jobEval, eval.ID) jobAlloc = append(jobAlloc, allocs...) } else { @@ -160,6 +158,7 @@ OUTER: if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 { return nil } + c.logger.Debug("job GC found eligible objects", "jobs", len(gcJob), "evals", len(gcEval), "allocs", len(gcAlloc)) diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index e8c555a10218..6e1a5f283b8e 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -172,6 +172,46 @@ func HCL() string { ` } +func SystemBatchJob() *structs.Job { + job := &structs.Job{ + Region: "global", + ID: fmt.Sprintf("mock-sysbatch-%s", uuid.Short()), + Name: "my-sysbatch", + Namespace: structs.DefaultNamespace, + Type: structs.JobTypeSysBatch, + Priority: 10, + Datacenters: []string{"dc1"}, + Constraints: []*structs.Constraint{ + { + LTarget: "${attr.kernel.name}", + RTarget: "linux", + Operand: "=", + }, + }, + TaskGroups: []*structs.TaskGroup{{ + Count: 1, + Name: "pinger", + Tasks: []*structs.Task{{ + Name: "ping-example", + Driver: "exec", + Config: map[string]interface{}{ + "command": "/usr/bin/ping", + "args": []string{"-c", "5", "example.com"}, + }, + LogConfig: structs.DefaultLogConfig(), + }}, + }}, + + Status: structs.JobStatusPending, + Version: 0, + CreateIndex: 42, + ModifyIndex: 99, + JobModifyIndex: 99, + } + job.Canonicalize() + return job +} + func Job() *structs.Job { job := &structs.Job{ Region: "global", @@ -895,7 +935,7 @@ func Eval() *structs.Evaluation { } func JobSummary(jobID string) *structs.JobSummary { - js := &structs.JobSummary{ + return &structs.JobSummary{ JobID: jobID, Namespace: structs.DefaultNamespace, Summary: map[string]structs.TaskGroupSummary{ @@ -905,7 +945,19 @@ func JobSummary(jobID string) *structs.JobSummary { }, }, } - return js +} + +func JobSysBatchSummary(jobID string) *structs.JobSummary { + return &structs.JobSummary{ + JobID: jobID, + Namespace: structs.DefaultNamespace, + Summary: map[string]structs.TaskGroupSummary{ + "pinger": { + Queued: 0, + Starting: 0, + }, + }, + } } func Alloc() *structs.Allocation { @@ -1191,6 +1243,34 @@ func BatchAlloc() *structs.Allocation { return alloc } +func SysBatchAlloc() *structs.Allocation { + job := SystemBatchJob() + return &structs.Allocation{ + ID: uuid.Generate(), + EvalID: uuid.Generate(), + NodeID: "12345678-abcd-efab-cdef-123456789abc", + Namespace: structs.DefaultNamespace, + TaskGroup: "pinger", + AllocatedResources: &structs.AllocatedResources{ + Tasks: map[string]*structs.AllocatedTaskResources{ + "ping-example": { + Cpu: structs.AllocatedCpuResources{CpuShares: 500}, + Memory: structs.AllocatedMemoryResources{MemoryMB: 256}, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + IP: "192.168.0.100", + }}, + }, + }, + Shared: structs.AllocatedSharedResources{DiskMB: 150}, + }, + Job: job, + 
JobID: job.ID, + DesiredStatus: structs.AllocDesiredStatusRun, + ClientStatus: structs.AllocClientStatusPending, + } +} + func SystemAlloc() *structs.Allocation { alloc := &structs.Allocation{ ID: uuid.Generate(), diff --git a/nomad/state/schema.go b/nomad/state/schema.go index 923b44617139..8178ec515f34 100644 --- a/nomad/state/schema.go +++ b/nomad/state/schema.go @@ -271,13 +271,16 @@ func jobIsGCable(obj interface{}) (bool, error) { return true, nil } - // Otherwise, only batch jobs are eligible because they complete on their - // own without a user stopping them. - if j.Type != structs.JobTypeBatch { + switch j.Type { + // Otherwise, batch and sysbatch jobs are eligible because they complete on + // their own without a user stopping them. + case structs.JobTypeBatch, structs.JobTypeSysBatch: + return true, nil + + default: + // other job types may not be GC'd until stopped + return false, nil } - - return true, nil } // jobIsPeriodic satisfies the ConditionalIndexFunc interface and creates an index diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index 32a873ebe83c..c88573fe8c99 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -1973,7 +1973,7 @@ func (s *StateStore) JobsByScheduler(ws memdb.WatchSet, schedulerType string) (m return iter, nil } -// JobsByGC returns an iterator over all jobs eligible or uneligible for garbage +// JobsByGC returns an iterator over all jobs eligible or ineligible for garbage // collection. func (s *StateStore) JobsByGC(ws memdb.WatchSet, gc bool) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() @@ -4445,12 +4445,13 @@ func (s *StateStore) setJobStatus(index uint64, txn *txn, func (s *StateStore) getJobStatus(txn *txn, job *structs.Job, evalDelete bool) (string, error) { // System, Periodic and Parameterized jobs are running until explicitly - // stopped - if job.Type == structs.JobTypeSystem || job.IsParameterized() || job.IsPeriodic() { + // stopped. + if job.Type == structs.JobTypeSystem || + job.IsParameterized() || + job.IsPeriodic() { if job.Stop { return structs.JobStatusDead, nil } - return structs.JobStatusRunning, nil } diff --git a/nomad/structs/funcs.go b/nomad/structs/funcs.go index 7d5398133ff4..de1e5f242d15 100644 --- a/nomad/structs/funcs.go +++ b/nomad/structs/funcs.go @@ -70,10 +70,11 @@ func RemoveAllocs(alloc []*Allocation, remove []*Allocation) []*Allocation { } // FilterTerminalAllocs filters out all allocations in a terminal state and -// returns the latest terminal allocations +// returns the latest terminal allocations. func FilterTerminalAllocs(allocs []*Allocation) ([]*Allocation, map[string]*Allocation) { terminalAllocsByName := make(map[string]*Allocation) n := len(allocs) + for i := 0; i < n; i++ { if allocs[i].TerminalStatus() { @@ -91,9 +92,61 @@ func FilterTerminalAllocs(allocs []*Allocation) ([]*Allo n-- } } + return allocs[:n], terminalAllocsByName } +// SplitTerminalAllocs splits allocs into non-terminal and terminal allocs, with +// the terminal allocs indexed by node->alloc.name.
+func SplitTerminalAllocs(allocs []*Allocation) ([]*Allocation, TerminalByNodeByName) { + var alive []*Allocation + var terminal = make(TerminalByNodeByName) + + for _, alloc := range allocs { + if alloc.TerminalStatus() { + terminal.Set(alloc) + } else { + alive = append(alive, alloc) + } + } + + return alive, terminal +} + +// TerminalByNodeByName is a map of NodeID->Allocation.Name->Allocation used by +// the sysbatch scheduler for locating the most up-to-date terminal allocations. +type TerminalByNodeByName map[string]map[string]*Allocation + +func (a TerminalByNodeByName) Set(allocation *Allocation) { + node := allocation.NodeID + name := allocation.Name + + if _, exists := a[node]; !exists { + a[node] = make(map[string]*Allocation) + } + + if previous, exists := a[node][name]; !exists { + a[node][name] = allocation + } else { + // keep the newest version of the terminal alloc for the coordinate + if previous.CreateIndex < allocation.CreateIndex { + a[node][name] = allocation + } + } +} + +func (a TerminalByNodeByName) Get(nodeID, name string) (*Allocation, bool) { + if _, exists := a[nodeID]; !exists { + return nil, false + } + + if _, exists := a[nodeID][name]; !exists { + return nil, false + } + + return a[nodeID][name], true +} + // AllocsFit checks if a given set of allocations will fit on a node. // The netIdx can optionally be provided if its already been computed. // If the netIdx is provided, it is assumed that the client has already diff --git a/nomad/structs/funcs_test.go b/nomad/structs/funcs_test.go index 504cc3a8e486..d802274f6f38 100644 --- a/nomad/structs/funcs_test.go +++ b/nomad/structs/funcs_test.go @@ -335,8 +335,8 @@ func TestAllocsFit(t *testing.T) { DiskMB: 5000, Networks: Networks{ { - Mode: "host", - IP: "10.0.0.1", + Mode: "host", + IP: "10.0.0.1", ReservedPorts: []Port{{"main", 8000, 0, ""}}, }, }, diff --git a/nomad/structs/operator.go b/nomad/structs/operator.go index 8a3afef9f154..4960369219ec 100644 --- a/nomad/structs/operator.go +++ b/nomad/structs/operator.go @@ -205,6 +205,9 @@ type PreemptionConfig struct { // SystemSchedulerEnabled specifies if preemption is enabled for system jobs SystemSchedulerEnabled bool `hcl:"system_scheduler_enabled"` + // SysBatchSchedulerEnabled specifies if preemption is enabled for sysbatch jobs + SysBatchSchedulerEnabled bool `hcl:"sysbatch_scheduler_enabled"` + // BatchSchedulerEnabled specifies if preemption is enabled for batch jobs BatchSchedulerEnabled bool `hcl:"batch_scheduler_enabled"` diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 481b157ba038..b8b93c568c4e 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -3761,10 +3761,11 @@ func (c *ComparableResources) NetIndex(n *NetworkResource) int { const ( // JobTypeNomad is reserved for internal system tasks and is // always handled by the CoreScheduler. 
- JobTypeCore = "_core" - JobTypeService = "service" - JobTypeBatch = "batch" - JobTypeSystem = "system" + JobTypeCore = "_core" + JobTypeService = "service" + JobTypeBatch = "batch" + JobTypeSystem = "system" + JobTypeSysBatch = "sysbatch" ) const ( @@ -4027,7 +4028,7 @@ func (j *Job) Validate() error { mErr.Errors = append(mErr.Errors, errors.New("Job must be in a namespace")) } switch j.Type { - case JobTypeCore, JobTypeService, JobTypeBatch, JobTypeSystem: + case JobTypeCore, JobTypeService, JobTypeBatch, JobTypeSystem, JobTypeSysBatch: case "": mErr.Errors = append(mErr.Errors, errors.New("Missing job type")) default: @@ -4119,11 +4120,12 @@ func (j *Job) Validate() error { } } - // Validate periodic is only used with batch jobs. + // Validate periodic is only used with batch or sysbatch jobs. if j.IsPeriodic() && j.Periodic.Enabled { - if j.Type != JobTypeBatch { - mErr.Errors = append(mErr.Errors, - fmt.Errorf("Periodic can only be used with %q scheduler", JobTypeBatch)) + if j.Type != JobTypeBatch && j.Type != JobTypeSysBatch { + mErr.Errors = append(mErr.Errors, fmt.Errorf( + "Periodic can only be used with %q or %q scheduler", JobTypeBatch, JobTypeSysBatch, + )) } if err := j.Periodic.Validate(); err != nil { @@ -4132,9 +4134,10 @@ func (j *Job) Validate() error { } if j.IsParameterized() { - if j.Type != JobTypeBatch { - mErr.Errors = append(mErr.Errors, - fmt.Errorf("Parameterized job can only be used with %q scheduler", JobTypeBatch)) + if j.Type != JobTypeBatch && j.Type != JobTypeSysBatch { + mErr.Errors = append(mErr.Errors, fmt.Errorf( + "Parameterized job can only be used with %q or %q scheduler", JobTypeBatch, JobTypeSysBatch, + )) } if err := j.ParameterizedJob.Validate(); err != nil { diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index c67eafad870a..b933deb1eb21 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -36,7 +36,7 @@ const ( // allocInPlace is the status used when speculating on an in-place update allocInPlace = "alloc updating in-place" - // allocNodeTainted is the status used when stopping an alloc because it's + // allocNodeTainted is the status used when stopping an alloc because its // node is tainted. allocNodeTainted = "alloc not needed as node is tainted" diff --git a/scheduler/rank.go b/scheduler/rank.go index 1653d9cf9067..ec4b2635d423 100644 --- a/scheduler/rank.go +++ b/scheduler/rank.go @@ -24,7 +24,7 @@ type RankedNode struct { TaskLifecycles map[string]*structs.TaskLifecycleConfig AllocResources *structs.AllocatedSharedResources - // Allocs is used to cache the proposed allocations on the + // Proposed is used to cache the proposed allocations on the // node. This can be shared between iterators that require it. Proposed []*structs.Allocation @@ -60,7 +60,7 @@ func (r *RankedNode) SetTaskResources(task *structs.Task, r.TaskLifecycles[task.Name] = task.Lifecycle } -// RankFeasibleIterator is used to iteratively yield nodes along +// RankIterator is used to iteratively yield nodes along // with ranking metadata. The iterators may manage some state for // performance optimizations. 
type RankIterator interface { diff --git a/scheduler/scheduler.go b/scheduler/scheduler.go index a950690db44f..d1bbfa4c3e41 100644 --- a/scheduler/scheduler.go +++ b/scheduler/scheduler.go @@ -21,9 +21,10 @@ const ( // BuiltinSchedulers contains the built in registered schedulers // which are available var BuiltinSchedulers = map[string]Factory{ - "service": NewServiceScheduler, - "batch": NewBatchScheduler, - "system": NewSystemScheduler, + "service": NewServiceScheduler, + "batch": NewBatchScheduler, + "system": NewSystemScheduler, + "sysbatch": NewSysBatchScheduler, } // NewScheduler is used to instantiate and return a new scheduler diff --git a/scheduler/scheduler_sysbatch_test.go b/scheduler/scheduler_sysbatch_test.go new file mode 100644 index 000000000000..ccf1e4b6e007 --- /dev/null +++ b/scheduler/scheduler_sysbatch_test.go @@ -0,0 +1,1624 @@ +package scheduler + +import ( + "fmt" + "sort" + "testing" + + "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" +) + +func TestSysBatch_JobRegister(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + _ = createNodes(t, h, 10) + + // Create a job + job := mock.SystemBatchJob() + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + // Create a mock evaluation to register the job + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: job.Priority, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan does not have annotations + require.Nil(t, plan.Annotations, "expected no annotations") + + // Ensure the plan allocated + var planned []*structs.Allocation + for _, allocList := range plan.NodeAllocation { + planned = append(planned, allocList...)
} + require.Len(t, planned, 10) + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + require.NoError(t, err) + + // Ensure all allocations placed + require.Len(t, out, 10) + + // Check the available nodes + count, ok := out[0].Metrics.NodesAvailable["dc1"] + require.True(t, ok) + require.Equal(t, 10, count, "bad metrics %#v:", out[0].Metrics) + + // Ensure no allocations are queued for the task group + queued := h.Evals[0].QueuedAllocations["pinger"] + require.Equal(t, 0, queued, "unexpected queued allocations") + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_JobRegister_AddNode_Running(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + nodes := createNodes(t, h, 10) + + // Generate a fake sysbatch job with allocations + job := mock.SystemBatchJob() + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + var allocs []*structs.Allocation + for _, node := range nodes { + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + alloc.ClientStatus = structs.AllocClientStatusRunning + allocs = append(allocs, alloc) + } + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + // Add a new node. + node := mock.Node() + require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + + // Create a mock evaluation to deal with the node update + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan had no node updates + var update []*structs.Allocation + for _, updateList := range plan.NodeUpdate { + update = append(update, updateList...) + } + require.Empty(t, update) + + // Ensure the plan allocated on the new node + var planned []*structs.Allocation + for _, allocList := range plan.NodeAllocation { + planned = append(planned, allocList...)
+ } + require.Len(t, planned, 1) + + // Ensure it allocated on the right node + _, ok := plan.NodeAllocation[node.ID] + require.True(t, ok, "allocated on wrong node: %#v", plan) + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + require.NoError(t, err) + + // Ensure all allocations placed + out, _ = structs.FilterTerminalAllocs(out) + require.Len(t, out, 11) + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_JobRegister_AddNode_Dead(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + nodes := createNodes(t, h, 10) + + // Generate a dead sysbatch job with complete allocations + job := mock.SystemBatchJob() + job.Status = structs.JobStatusDead // job is dead but not stopped + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + var allocs []*structs.Allocation + for _, node := range nodes { + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + alloc.ClientStatus = structs.AllocClientStatusComplete + allocs = append(allocs, alloc) + } + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + // Add a new node. + node := mock.Node() + require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + + // Create a mock evaluation to deal with the node update + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan has no node update + var update []*structs.Allocation + for _, updateList := range plan.NodeUpdate { + update = append(update, updateList...) + } + require.Len(t, update, 0) + + // Ensure the plan allocates on the new node + var planned []*structs.Allocation + for _, allocList := range plan.NodeAllocation { + planned = append(planned, allocList...) 
} + require.Len(t, planned, 1) + + // Ensure it allocated on the right node + _, ok := plan.NodeAllocation[node.ID] + require.True(t, ok, "allocated on wrong node: %#v", plan) + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + require.NoError(t, err) + + // Ensure 1 non-terminal allocation + live, _ := structs.FilterTerminalAllocs(out) + require.Len(t, live, 1) + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_JobModify(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + nodes := createNodes(t, h, 10) + + // Generate a fake job with allocations + job := mock.SystemBatchJob() + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + var allocs []*structs.Allocation + for _, node := range nodes { + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + alloc.ClientStatus = structs.AllocClientStatusPending + allocs = append(allocs, alloc) + } + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + // Add a few terminal status allocations; these should be reinstated + var terminal []*structs.Allocation + for i := 0; i < 5; i++ { + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = nodes[i].ID + alloc.Name = "my-sysbatch.pinger[0]" + alloc.ClientStatus = structs.AllocClientStatusComplete + terminal = append(terminal, alloc) + } + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), terminal)) + + // Update the job + job2 := mock.SystemBatchJob() + job2.ID = job.ID + + // Update the task, such that it cannot be done in-place + job2.TaskGroups[0].Tasks[0].Config["command"] = "/bin/other" + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job2)) + + // Create a mock evaluation to deal with the job update + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan evicted all allocs + var update []*structs.Allocation + for _, updateList := range plan.NodeUpdate { + update = append(update, updateList...) + } + require.Equal(t, len(allocs), len(update)) + + // Ensure the plan allocated + var planned []*structs.Allocation + for _, allocList := range plan.NodeAllocation { + planned = append(planned, allocList...)
} + require.Len(t, planned, 10) + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + require.NoError(t, err) + + // Ensure all allocations placed + out, _ = structs.FilterTerminalAllocs(out) + require.Len(t, out, 10) + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_JobModify_InPlace(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + nodes := createNodes(t, h, 10) + + job := mock.SystemBatchJob() + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + var allocs []*structs.Allocation + for _, node := range nodes { + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + allocs = append(allocs, alloc) + } + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + // Update the job + job2 := mock.SystemBatchJob() + job2.ID = job.ID + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job2)) + + // Create a mock evaluation to deal with the update + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan did not evict any allocs + var update []*structs.Allocation + for _, updateList := range plan.NodeUpdate { + update = append(update, updateList...) + } + require.Empty(t, update) + + // Ensure the plan updated the existing allocs + var planned []*structs.Allocation + for _, allocList := range plan.NodeAllocation { + planned = append(planned, allocList...)
+ } + require.Len(t, planned, 10) + + for _, p := range planned { + require.Equal(t, job2, p.Job, "should update job") + } + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + require.NoError(t, err) + + // Ensure all allocations placed + require.Len(t, out, 10) + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_JobDeregister_Purged(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + nodes := createNodes(t, h, 10) + + // Create a sysbatch job + job := mock.SystemBatchJob() + + var allocs []*structs.Allocation + for _, node := range nodes { + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + allocs = append(allocs, alloc) + } + for _, alloc := range allocs { + require.NoError(t, h.State.UpsertJobSummary(h.NextIndex(), mock.JobSysBatchSummary(alloc.JobID))) + } + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + // Create a mock evaluation to deregister the job + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerJobDeregister, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan evicted the job from all nodes. + for _, node := range nodes { + require.Len(t, plan.NodeUpdate[node.ID], 1) + } + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + require.NoError(t, err) + + // Ensure no remaining allocations + out, _ = structs.FilterTerminalAllocs(out) + require.Empty(t, out) + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_JobDeregister_Stopped(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + nodes := createNodes(t, h, 10) + + // Generate a stopped sysbatch job with allocations + job := mock.SystemBatchJob() + job.Stop = true + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + var allocs []*structs.Allocation + for _, node := range nodes { + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + allocs = append(allocs, alloc) + } + for _, alloc := range allocs { + require.NoError(t, h.State.UpsertJobSummary(h.NextIndex(), mock.JobSysBatchSummary(alloc.JobID))) + } + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + // Create a mock evaluation to deregister the job + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerJobDeregister, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan evicted the job from all nodes. 
+ for _, node := range nodes { + require.Len(t, plan.NodeUpdate[node.ID], 1) + } + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + require.NoError(t, err) + + // Ensure no remaining allocations + out, _ = structs.FilterTerminalAllocs(out) + require.Empty(t, out) + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_NodeDown(t *testing.T) { + h := NewHarness(t) + + // Register a down node + node := mock.Node() + node.Status = structs.NodeStatusDown + require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + + // Generate a sysbatch job allocated on that node + job := mock.SystemBatchJob() + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc})) + + // Create a mock evaluation to deal with the node update + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + NodeID: node.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan evicted all allocs + require.Len(t, plan.NodeUpdate[node.ID], 1) + + // Ensure the plan updated the allocation. + planned := make([]*structs.Allocation, 0) + for _, allocList := range plan.NodeUpdate { + planned = append(planned, allocList...) + } + require.Len(t, planned, 1) + + // Ensure the allocation is stopped + p := planned[0] + require.Equal(t, structs.AllocDesiredStatusStop, p.DesiredStatus) + // no assertion on client_status here: the actual client_status + // remains pending rather than being marked lost + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_NodeDrain_Down(t *testing.T) { + h := NewHarness(t) + + // Register a draining node + node := mock.Node() + node.Drain = true + node.Status = structs.NodeStatusDown + require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + + // Generate a sysbatch job allocated on that node.
+ job := mock.SystemBatchJob() + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc})) + + // Create a mock evaluation to deal with the node update + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + NodeID: node.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan evicted non-terminal allocs + require.Len(t, plan.NodeUpdate[node.ID], 1) + + // Ensure that the allocation is marked as lost + var lost []string + for _, alloc := range plan.NodeUpdate[node.ID] { + lost = append(lost, alloc.ID) + } + require.Equal(t, []string{alloc.ID}, lost) + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_NodeDrain(t *testing.T) { + h := NewHarness(t) + + // Register a draining node + node := mock.Node() + node.Drain = true + require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + + // Generate a sysbatch job allocated on that node. + job := mock.SystemBatchJob() + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) + + alloc := mock.SysBatchAlloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-sysbatch.pinger[0]" + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc})) + + // Create a mock evaluation to deal with drain + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + NodeID: node.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewSysBatchScheduler, eval) + require.NoError(t, err) + + // Ensure a single plan + require.Len(t, h.Plans, 1) + plan := h.Plans[0] + + // Ensure the plan evicted all allocs + require.Len(t, plan.NodeUpdate[node.ID], 1) + + // Ensure the plan updated the allocation. + planned := make([]*structs.Allocation, 0) + for _, allocList := range plan.NodeUpdate { + planned = append(planned, allocList...) + } + require.Len(t, planned, 1) + + // Ensure the allocation is stopped + require.Equal(t, structs.AllocDesiredStatusStop, planned[0].DesiredStatus) + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} + +func TestSysBatch_NodeUpdate(t *testing.T) { + h := NewHarness(t) + + // Register a node + node := mock.Node() + require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + + // Generate a sysbatch job allocated on that node.
+
+func TestSysBatch_NodeUpdate(t *testing.T) {
+	h := NewHarness(t)
+
+	// Register a node
+	node := mock.Node()
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// Generate a sysbatch job allocated on that node.
+	job := mock.SystemBatchJob()
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	alloc := mock.SysBatchAlloc()
+	alloc.Job = job
+	alloc.JobID = job.ID
+	alloc.NodeID = node.ID
+	alloc.Name = "my-sysbatch.pinger[0]"
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc}))
+
+	// Create a mock evaluation to deal with the node update
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		NodeID:      node.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure that queued allocations is zero
+	val, ok := h.Evals[0].QueuedAllocations["pinger"]
+	require.True(t, ok)
+	require.Zero(t, val)
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
+
+func TestSysBatch_RetryLimit(t *testing.T) {
+	h := NewHarness(t)
+	h.Planner = &RejectPlan{h}
+
+	// Create some nodes
+	_ = createNodes(t, h, 10)
+
+	// Create a job
+	job := mock.SystemBatchJob()
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create a mock evaluation to register the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure multiple plans were attempted
+	require.NotEmpty(t, h.Plans)
+
+	// Lookup the allocations by JobID
+	ws := memdb.NewWatchSet()
+	out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
+	require.NoError(t, err)
+
+	// Ensure no allocations placed
+	require.Empty(t, out)
+
+	// Should hit the retry limit
+	h.AssertEvalStatus(t, structs.EvalStatusFailed)
+}
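
TestSysBatch_RetryLimit drives the scheduler's retryMax helper to exhaustion by rejecting every plan; later in this diff, sysbatch jobs get a budget of two attempts (maxSysBatchScheduleAttempts). A minimal sketch of that retry pattern, under the assumption that the real helper resets its attempt counter whenever the progress callback reports forward progress:

package main

import (
	"errors"
	"fmt"
)

// retrySketch retries cb up to limit times, resetting the budget whenever
// progress() reports that the last attempt moved state forward.
func retrySketch(limit int, cb func() (bool, error), progress func() bool) error {
	attempts := 0
	for attempts < limit {
		done, err := cb()
		if done || err != nil {
			return err
		}
		attempts++
		if progress() {
			attempts = 0 // progress made: restart the attempt budget
		}
	}
	return errors.New("maximum attempts reached without success")
}

func main() {
	calls := 0
	err := retrySketch(2,
		func() (bool, error) { calls++; return false, nil }, // every plan rejected
		func() bool { return false })                        // and no progress made
	fmt.Println(calls, err) // 2 maximum attempts reached without success
}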
+
+// This test ensures that the scheduler doesn't increment the queued allocation
+// count for a task group when allocations can't be created on currently
+// available nodes because of constraint mismatches.
+func TestSysBatch_Queued_With_Constraints(t *testing.T) {
+	h := NewHarness(t)
+
+	// Register a node
+	node := mock.Node()
+	node.Attributes["kernel.name"] = "darwin"
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// Generate a sysbatch job which can't be placed on the node
+	job := mock.SystemBatchJob()
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create a mock evaluation to deal with the node update
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		NodeID:      node.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure that queued allocations is zero
+	val, ok := h.Evals[0].QueuedAllocations["pinger"]
+	require.True(t, ok)
+	require.Zero(t, val)
+}
+
+// This test ensures that the scheduler correctly ignores ineligible
+// nodes when scheduling due to a new node being added. The job has two
+// task groups constrained to a particular node class. The desired behavior
+// should be that the TaskGroup constrained to the newly added node class is
+// added and that the TaskGroup constrained to the ineligible node is ignored.
+func TestSysBatch_JobConstraint_AddNode(t *testing.T) {
+	h := NewHarness(t)
+
+	// Create two nodes
+	node := mock.Node()
+	node.NodeClass = "Class-A"
+	require.NoError(t, node.ComputeClass())
+	require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	nodeB := mock.Node()
+	nodeB.NodeClass = "Class-B"
+	require.NoError(t, nodeB.ComputeClass())
+	require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), nodeB))
+
+	// Make a sysbatch job with two task groups, each constrained to a node class
+	job := mock.SystemBatchJob()
+	tgA := job.TaskGroups[0]
+	tgA.Name = "groupA"
+	tgA.Constraints = []*structs.Constraint{{
+		LTarget: "${node.class}",
+		RTarget: node.NodeClass,
+		Operand: "=",
+	}}
+	tgB := job.TaskGroups[0].Copy()
+	tgB.Name = "groupB"
+	tgB.Constraints = []*structs.Constraint{{
+		LTarget: "${node.class}",
+		RTarget: nodeB.NodeClass,
+		Operand: "=",
+	}}
+
+	// Upsert Job
+	job.TaskGroups = []*structs.TaskGroup{tgA, tgB}
+	require.Nil(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Evaluate the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval))
+	require.Equal(t, "complete", h.Evals[0].Status)
+
+	// QueuedAllocations is drained
+	val, ok := h.Evals[0].QueuedAllocations["groupA"]
+	require.True(t, ok)
+	require.Equal(t, 0, val)
+
+	val, ok = h.Evals[0].QueuedAllocations["groupB"]
+	require.True(t, ok)
+	require.Equal(t, 0, val)
+
+	// Single plan with two NodeAllocations
+	require.Len(t, h.Plans, 1)
+	require.Len(t, h.Plans[0].NodeAllocation, 2)
+
+	// Mark the node as ineligible
+	node.SchedulingEligibility = structs.NodeSchedulingIneligible
+
+	// Evaluate the node update
+	eval2 := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		NodeID:      node.ID,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval2}))
+
+	// Process the 2nd evaluation
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval2))
+	require.Equal(t, "complete", h.Evals[1].Status)
+
+	// Ensure no new plans
+	require.Equal(t, 1, len(h.Plans))
+
+	// Ensure all NodeAllocations are from the first eval
+	for _, allocs := range h.Plans[0].NodeAllocation {
+		require.Len(t, allocs, 1)
+		require.Equal(t, eval.ID, allocs[0].EvalID)
+	}
+
+	// Add a new node Class-B, setting the class before computing it
+	nodeBTwo := mock.Node()
+	nodeBTwo.NodeClass = "Class-B"
+	require.NoError(t, nodeBTwo.ComputeClass())
+	require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), nodeBTwo))
+
+	// Evaluate the new node
+	eval3 := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		NodeID:      nodeBTwo.ID,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+
+	// Ensure 3rd eval is complete
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval3}))
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval3))
+	require.Equal(t, "complete", h.Evals[2].Status)
+
+	// Ensure no failed TG allocs
+	require.Equal(t, 0, len(h.Evals[2].FailedTGAllocs))
+
+	require.Len(t, h.Plans, 2)
+	require.Len(t, h.Plans[1].NodeAllocation, 1)
+
+	// Ensure all NodeAllocations in the second plan are from the 3rd eval
+	for _, allocs := range h.Plans[1].NodeAllocation {
+		require.Len(t, allocs, 1)
+		require.Equal(t, eval3.ID, allocs[0].EvalID)
+	}
+
+	ws := memdb.NewWatchSet()
+
+	allocsNodeOne, err := h.State.AllocsByNode(ws, node.ID)
+	require.NoError(t, err)
+	require.Len(t, allocsNodeOne, 1)
+
+	allocsNodeTwo, err := h.State.AllocsByNode(ws, nodeB.ID)
+	require.NoError(t, err)
+	require.Len(t, allocsNodeTwo, 1)
+
+	allocsNodeThree, err := h.State.AllocsByNode(ws, nodeBTwo.ID)
+	require.NoError(t, err)
+	require.Len(t, allocsNodeThree, 1)
+}
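
The ordering in the test above matters: a node's computed class is a digest over its class and fingerprinted attributes, so NodeClass must be set before ComputeClass is called or the digest goes stale. A self-contained sketch of the idea, with a hypothetical hash (not Nomad's actual implementation):

package main

import (
	"fmt"
	"hash/fnv"
)

type node struct {
	class         string
	attributes    map[string]string
	computedClass string
}

// computeClass digests the node's scheduling-relevant fields; changing any
// of them afterwards leaves computedClass stale.
func (n *node) computeClass() {
	h := fnv.New64a()
	h.Write([]byte(n.class))
	for k, v := range n.attributes { // note: real code hashes deterministically
		h.Write([]byte(k + "=" + v))
	}
	n.computedClass = fmt.Sprintf("v1:%d", h.Sum64())
}

func main() {
	n := &node{class: "Class-B", attributes: map[string]string{"kernel.name": "linux"}}
	n.computeClass() // set the class first, then compute
	fmt.Println(n.computedClass)
}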
+
+// No errors reported when no available nodes prevent placement
+func TestSysBatch_ExistingAllocNoNodes(t *testing.T) {
+	h := NewHarness(t)
+
+	// Create a node
+	node := mock.Node()
+	require.NoError(t, node.ComputeClass())
+	require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// Make a sysbatch job
+	job := mock.SystemBatchJob()
+	job.Meta = map[string]string{"version": "1"}
+	require.Nil(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Evaluate the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval))
+	require.Equal(t, "complete", h.Evals[0].Status)
+
+	// QueuedAllocations is drained
+	val, ok := h.Evals[0].QueuedAllocations["pinger"]
+	require.True(t, ok)
+	require.Equal(t, 0, val)
+
+	// The plan has one NodeAllocation
+	require.Equal(t, 1, len(h.Plans))
+
+	// Mark the node as ineligible
+	node.SchedulingEligibility = structs.NodeSchedulingIneligible
+
+	// Evaluate the job
+	eval2 := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		NodeID:      node.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval2}))
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval2))
+	require.Equal(t, "complete", h.Evals[1].Status)
+
+	// Create a new job version, deploy
+	job2 := job.Copy()
+	job2.Meta["version"] = "2"
+	require.Nil(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job2))
+
+	// Run evaluation as a plan
+	eval3 := &structs.Evaluation{
+		Namespace:    structs.DefaultNamespace,
+		ID:           uuid.Generate(),
+		Priority:     job2.Priority,
+		TriggeredBy:  structs.EvalTriggerJobRegister,
+		JobID:        job2.ID,
+		Status:       structs.EvalStatusPending,
+		AnnotatePlan: true,
+	}
+
+	// Ensure the new eval is complete
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval3}))
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval3))
+	require.Equal(t, "complete", h.Evals[2].Status)
+
+	// Ensure there are no FailedTGAllocs
+	require.Equal(t, 0, len(h.Evals[2].FailedTGAllocs))
+	require.Equal(t, 0, h.Evals[2].QueuedAllocations[job2.Name])
+}
+
+func TestSysBatch_ConstraintErrors(t *testing.T) {
+	h := NewHarness(t)
+
+	// Register some nodes. The tag "aaaaaa" is hashed so that the nodes are
+	// processed in an order other than good, good, bad.
+	var node *structs.Node
+	for _, tag := range []string{"aaaaaa", "foo", "foo", "foo"} {
+		node = mock.Node()
+		node.Meta["tag"] = tag
+		require.NoError(t, node.ComputeClass())
+		require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+	}
+
+	// Mark the last node as ineligible
+	node.SchedulingEligibility = structs.NodeSchedulingIneligible
+
+	// Make a job with a constraint that matches a subset of the nodes
+	job := mock.SystemBatchJob()
+	job.Priority = 100
+	job.Constraints = append(job.Constraints,
+		&structs.Constraint{
+			LTarget: "${meta.tag}",
+			RTarget: "foo",
+			Operand: "=",
+		})
+
+	require.Nil(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Evaluate the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+
+	require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+	require.Nil(t, h.Process(NewSysBatchScheduler, eval))
+	require.Equal(t, "complete", h.Evals[0].Status)
+
+	// QueuedAllocations is drained
+	val, ok := h.Evals[0].QueuedAllocations["pinger"]
+	require.True(t, ok)
+	require.Equal(t, 0, val)
+
+	// The plan has two NodeAllocations
+	require.Equal(t, 1, len(h.Plans))
+	require.Nil(t, h.Plans[0].Annotations)
+	require.Equal(t, 2, len(h.Plans[0].NodeAllocation))
+
+	// Two nodes were allocated and are pending (unlike system jobs, sysbatch
+	// jobs are not automatically set to running)
+	ws := memdb.NewWatchSet()
+	as, err := h.State.AllocsByJob(ws, structs.DefaultNamespace, job.ID, false)
+	require.Nil(t, err)
+
+	pending := 0
+	for _, a := range as {
+		if a.Job.Status == "pending" {
+			pending++
+		}
+	}
+
+	require.Equal(t, 2, len(as))
+	require.Equal(t, 2, pending)
+
+	// Failed allocations is empty
+	require.Equal(t, 0, len(h.Evals[0].FailedTGAllocs))
+}
+
+func TestSysBatch_ChainedAlloc(t *testing.T) {
+	h := NewHarness(t)
+
+	// Create some nodes
+	_ = createNodes(t, h, 10)
+
+	// Create a sysbatch job
+	job := mock.SystemBatchJob()
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create a mock evaluation to register the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	var allocIDs []string
+	for _, allocList := range h.Plans[0].NodeAllocation {
+		for _, alloc := range allocList {
+			allocIDs = append(allocIDs, alloc.ID)
+		}
+	}
+	sort.Strings(allocIDs)
+
+	// Create a new harness to invoke the scheduler again
+	h1 := NewHarnessWithState(t, h.State)
+	job1 := mock.SystemBatchJob()
+	job1.ID = job.ID
+	job1.TaskGroups[0].Tasks[0].Env = make(map[string]string)
+	job1.TaskGroups[0].Tasks[0].Env["foo"] = "bar"
+	require.NoError(t, h1.State.UpsertJob(structs.MsgTypeTestSetup, h1.NextIndex(), job1))
+
+	// Insert two more nodes
+	for i := 0; i < 2; i++ {
+		node := mock.Node()
+		require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+	}
+
+	// Create a mock evaluation to update the job
+	eval1 := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job1.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job1.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval1}))
+
+	// Process the evaluation
+	err = h1.Process(NewSysBatchScheduler, eval1)
+	require.NoError(t, err)
+
+	require.Len(t, h1.Plans, 1)
+	plan := h1.Plans[0]
+
+	// Collect all the chained allocation ids and the new allocations which
+	// don't have any chained allocations
+	var prevAllocs []string
+	var newAllocs []string
+	for _, allocList := range plan.NodeAllocation {
+		for _, alloc := range allocList {
+			if alloc.PreviousAllocation == "" {
+				newAllocs = append(newAllocs, alloc.ID)
+				continue
+			}
+			prevAllocs = append(prevAllocs, alloc.PreviousAllocation)
+		}
+	}
+	sort.Strings(prevAllocs)
+
+	// Ensure that the new allocations have their corresponding original
+	// allocation ids
+	require.Equal(t, allocIDs, prevAllocs)
+
+	// Ensure the two new allocations don't have any chained allocations
+	require.Len(t, newAllocs, 2)
+}
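
TestSysBatch_ChainedAlloc depends on the scheduler threading PreviousAllocation through replacements when a job is updated in place, which is what lets the test match the original allocation IDs against the new allocations' chains. A minimal sketch of that linkage, with simplified local types:

package main

import "fmt"

type alloc struct {
	id       string
	previous string // ID of the allocation this one replaced, if any
}

// replaceAlloc chains a fresh allocation to the one it supersedes, so
// history stays traceable across job versions.
func replaceAlloc(old alloc, newID string) alloc {
	return alloc{id: newID, previous: old.id}
}

func main() {
	v1 := alloc{id: "alloc-1"}
	v2 := replaceAlloc(v1, "alloc-2")
	fmt.Println(v2.previous) // alloc-1: the chain the test sorts and compares
}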
+
+func TestSysBatch_PlanWithDrainedNode(t *testing.T) {
+	h := NewHarness(t)
+
+	// Register two nodes with two different classes
+	node := mock.Node()
+	node.NodeClass = "green"
+	node.Drain = true
+	require.NoError(t, node.ComputeClass())
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	node2 := mock.Node()
+	node2.NodeClass = "blue"
+	require.NoError(t, node2.ComputeClass())
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node2))
+
+	// Create a sysbatch job with two task groups, each constrained on node class
+	job := mock.SystemBatchJob()
+	tg1 := job.TaskGroups[0]
+	tg1.Constraints = append(tg1.Constraints,
+		&structs.Constraint{
+			LTarget: "${node.class}",
+			RTarget: "green",
+			Operand: "==",
+		})
+
+	tg2 := tg1.Copy()
+	tg2.Name = "pinger2"
+	tg2.Constraints[0].RTarget = "blue"
+	job.TaskGroups = append(job.TaskGroups, tg2)
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create an allocation on each node
+	alloc := mock.SysBatchAlloc()
+	alloc.Job = job
+	alloc.JobID = job.ID
+	alloc.NodeID = node.ID
+	alloc.Name = "my-sysbatch.pinger[0]"
+	alloc.DesiredTransition.Migrate = helper.BoolToPtr(true)
+	alloc.TaskGroup = "pinger"
+
+	alloc2 := mock.SysBatchAlloc()
+	alloc2.Job = job
+	alloc2.JobID = job.ID
+	alloc2.NodeID = node2.ID
+	alloc2.Name = "my-sysbatch.pinger2[0]"
+	alloc2.TaskGroup = "pinger2"
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc, alloc2}))
+
+	// Create a mock evaluation to deal with drain
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		NodeID:      node.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure a single plan
+	require.Len(t, h.Plans, 1)
+	plan := h.Plans[0]
+
+	// Ensure the plan evicted the alloc on the drained node
+	planned := plan.NodeUpdate[node.ID]
+	require.Len(t, plan.NodeUpdate[node.ID], 1)
+
+	// Ensure the plan didn't place
+	require.Empty(t, plan.NodeAllocation)
+
+	// Ensure the allocation is stopped
+	require.Equal(t, structs.AllocDesiredStatusStop, planned[0].DesiredStatus)
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
+
+func TestSysBatch_QueuedAllocsMultTG(t *testing.T) {
+	h := NewHarness(t)
+
+	// Register two nodes with two different classes
+	node := mock.Node()
+	node.NodeClass = "green"
+	require.NoError(t, node.ComputeClass())
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	node2 := mock.Node()
+	node2.NodeClass = "blue"
+	require.NoError(t, node2.ComputeClass())
+	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node2))
+
+	// Create a sysbatch job with two task groups, each constrained on node class
+	job := mock.SystemBatchJob()
+	tg1 := job.TaskGroups[0]
+	tg1.Constraints = append(tg1.Constraints,
+		&structs.Constraint{
+			LTarget: "${node.class}",
+			RTarget: "green",
+			Operand: "==",
+		})
+
+	tg2 := tg1.Copy()
+	tg2.Name = "pinger2"
+	tg2.Constraints[0].RTarget = "blue"
+	job.TaskGroups = append(job.TaskGroups, tg2)
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create a mock evaluation to deal with the node update
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		NodeID:      node.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err := h.Process(NewSysBatchScheduler, eval)
+	require.NoError(t, err)
+
+	// Ensure a single plan
+	require.Len(t, h.Plans, 1)
+
+	qa := h.Evals[0].QueuedAllocations
+	require.Zero(t, qa["pinger"])
+	require.Zero(t, qa["pinger2"])
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
+
+func TestSysBatch_Preemption(t *testing.T) {
+	h := NewHarness(t)
+
+	// Create nodes
+	nodes := make([]*structs.Node, 0)
+	for i := 0; i < 2; i++ {
+		node := mock.Node()
+		// TODO: remove in 0.11
+		node.Resources = &structs.Resources{
+			CPU:      3072,
+			MemoryMB: 5034,
+			DiskMB:   20 * 1024,
+			Networks: []*structs.NetworkResource{{
+				Device: "eth0",
+				CIDR:   "192.168.0.100/32",
+				MBits:  1000,
+			}},
+		}
+		node.NodeResources = &structs.NodeResources{
+			Cpu:    structs.NodeCpuResources{CpuShares: 3072},
+			Memory: structs.NodeMemoryResources{MemoryMB: 5034},
+			Disk:   structs.NodeDiskResources{DiskMB: 20 * 1024},
+			Networks: []*structs.NetworkResource{{
+				Device: "eth0",
+				CIDR:   "192.168.0.100/32",
+				MBits:  1000,
+			}},
+			NodeNetworks: []*structs.NodeNetworkResource{{
+				Mode:   "host",
+				Device: "eth0",
+				Addresses: []structs.NodeNetworkAddress{{
+					Family:  structs.NodeNetworkAF_IPv4,
+					Alias:   "default",
+					Address: "192.168.0.100",
+				}},
+			}},
+		}
+		require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+		nodes = append(nodes, node)
+	}
+
+	// Enable preemption for sysbatch jobs
+	err := h.State.SchedulerSetConfig(h.NextIndex(), &structs.SchedulerConfiguration{
+		PreemptionConfig: structs.PreemptionConfig{
+			SysBatchSchedulerEnabled: true,
+		},
+	})
+	require.NoError(t, err)
+
+	// Create some low priority batch jobs and allocations for them
+	// One job uses a reserved port
+	job1 := mock.BatchJob()
+	job1.Type = structs.JobTypeBatch
+	job1.Priority = 20
+	job1.TaskGroups[0].Tasks[0].Resources = &structs.Resources{
+		CPU:      512,
+		MemoryMB: 1024,
+		Networks: []*structs.NetworkResource{{
+			MBits: 200,
+			ReservedPorts: []structs.Port{{
+				Label: "web",
+				Value: 80,
+			}},
+		}},
+	}
+
+	alloc1 := mock.Alloc()
+	alloc1.Job = job1
+	alloc1.JobID = job1.ID
+	alloc1.NodeID = nodes[0].ID
+	alloc1.Name = "my-job[0]"
+	alloc1.TaskGroup = job1.TaskGroups[0].Name
+	alloc1.AllocatedResources = &structs.AllocatedResources{
+		Tasks: map[string]*structs.AllocatedTaskResources{
+			"web": {
+				Cpu:    structs.AllocatedCpuResources{CpuShares: 512},
+				Memory: structs.AllocatedMemoryResources{MemoryMB: 1024},
+				Networks: []*structs.NetworkResource{{
+					Device:        "eth0",
+					IP:            "192.168.0.100",
+					ReservedPorts: []structs.Port{{Label: "web", Value: 80}},
+					MBits:         200,
+				}},
+			},
+		},
+		Shared: structs.AllocatedSharedResources{DiskMB: 5 * 1024},
+	}
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job1))
+
+	job2 := mock.BatchJob()
+	job2.Type = structs.JobTypeBatch
+	job2.Priority = 20
+	job2.TaskGroups[0].Tasks[0].Resources = &structs.Resources{
+		CPU:      512,
+		MemoryMB: 1024,
+		Networks: []*structs.NetworkResource{{MBits: 200}},
+	}
+
+	alloc2 := mock.Alloc()
+	alloc2.Job = job2
+	alloc2.JobID = job2.ID
+	alloc2.NodeID = nodes[0].ID
+	alloc2.Name = "my-job[2]"
+	alloc2.TaskGroup = job2.TaskGroups[0].Name
+	alloc2.AllocatedResources = &structs.AllocatedResources{
+		Tasks: map[string]*structs.AllocatedTaskResources{
+			"web": {
+				Cpu:    structs.AllocatedCpuResources{CpuShares: 512},
+				Memory: structs.AllocatedMemoryResources{MemoryMB: 1024},
+				Networks: []*structs.NetworkResource{{
+					Device: "eth0",
+					IP:     "192.168.0.100",
+					MBits:  200,
+				}},
+			},
+		},
+		Shared: structs.AllocatedSharedResources{DiskMB: 5 * 1024},
+	}
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job2))
+
+	job3 := mock.Job()
+	job3.Type = structs.JobTypeBatch
+	job3.Priority = 40
+	job3.TaskGroups[0].Tasks[0].Resources = &structs.Resources{
+		CPU:      1024,
+		MemoryMB: 2048,
+		Networks: []*structs.NetworkResource{{
+			Device: "eth0",
+			MBits:  400,
+		}},
+	}
+
+	alloc3 := mock.Alloc()
+	alloc3.Job = job3
+	alloc3.JobID = job3.ID
+	alloc3.NodeID = nodes[0].ID
+	alloc3.Name = "my-job[0]"
+	alloc3.TaskGroup = job3.TaskGroups[0].Name
+	alloc3.AllocatedResources = &structs.AllocatedResources{
+		Tasks: map[string]*structs.AllocatedTaskResources{
+			"web": {
+				Cpu:    structs.AllocatedCpuResources{CpuShares: 1024},
+				Memory: structs.AllocatedMemoryResources{MemoryMB: 25},
+				Networks: []*structs.NetworkResource{{
+					Device:        "eth0",
+					IP:            "192.168.0.100",
+					ReservedPorts: []structs.Port{{Label: "web", Value: 80}},
+					MBits:         400,
+				}},
+			},
+		},
+		Shared: structs.AllocatedSharedResources{DiskMB: 5 * 1024},
+	}
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc1, alloc2, alloc3}))
+
+	// Create a high priority job and allocs for it
+	// These allocs should not be preempted
+	job4 := mock.BatchJob()
+	job4.Type = structs.JobTypeBatch
+	job4.Priority = 100
+	job4.TaskGroups[0].Tasks[0].Resources = &structs.Resources{
+		CPU:      1024,
+		MemoryMB: 2048,
+		Networks: []*structs.NetworkResource{{MBits: 100}},
+	}
+
+	alloc4 := mock.Alloc()
+	alloc4.Job = job4
+	alloc4.JobID = job4.ID
+	alloc4.NodeID = nodes[0].ID
+	alloc4.Name = "my-job4[0]"
+	alloc4.TaskGroup = job4.TaskGroups[0].Name
+	alloc4.AllocatedResources = &structs.AllocatedResources{
+		Tasks: map[string]*structs.AllocatedTaskResources{
+			"web": {
+				Cpu: structs.AllocatedCpuResources{
+					CpuShares: 1024,
+				},
+				Memory: structs.AllocatedMemoryResources{
+					MemoryMB: 2048,
+				},
+				Networks: []*structs.NetworkResource{
+					{
+						Device:        "eth0",
+						IP:            "192.168.0.100",
+						ReservedPorts: []structs.Port{{Label: "web", Value: 80}},
+						MBits:         100,
+					},
+				},
+			},
+		},
+		Shared: structs.AllocatedSharedResources{
+			DiskMB: 2 * 1024,
+		},
+	}
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job4))
+	require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc4}))
+
+	// Create a sysbatch job such that it would need to preempt the lower
+	// priority allocs to succeed
+	job := mock.SystemBatchJob()
+	job.Priority = 100
+	job.TaskGroups[0].Tasks[0].Resources = &structs.Resources{
+		CPU:      1948,
+		MemoryMB: 256,
+		Networks: []*structs.NetworkResource{{
+			MBits:        800,
+			DynamicPorts: []structs.Port{{Label: "http"}},
+		}},
+	}
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create a mock evaluation to register the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	err = h.Process(NewSysBatchScheduler, eval)
+	require.Nil(t, err)
+
+	// Ensure a single plan
+	require.Equal(t, 1, len(h.Plans))
+	plan := h.Plans[0]
+
+	// Ensure the plan doesn't have annotations
+	require.Nil(t, plan.Annotations)
+
+	// Ensure the plan allocated on both nodes
+	var planned []*structs.Allocation
+	preemptingAllocId := ""
+	require.Equal(t, 2, len(plan.NodeAllocation))
+
+	// The alloc that got placed on node 1 is the preemptor
+	for _, allocList := range plan.NodeAllocation {
+		planned = append(planned, allocList...)
+		for _, alloc := range allocList {
+			if alloc.NodeID == nodes[0].ID {
+				preemptingAllocId = alloc.ID
+			}
+		}
+	}
+
+	// Lookup the allocations by JobID
+	ws := memdb.NewWatchSet()
+	out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
+	require.NoError(t, err)
+
+	// Ensure all allocations placed
+	require.Equal(t, 2, len(out))
+
+	// Verify that one node has preempted allocs
+	require.NotNil(t, plan.NodePreemptions[nodes[0].ID])
+	preemptedAllocs := plan.NodePreemptions[nodes[0].ID]
+
+	// Verify that three jobs have preempted allocs
+	require.Equal(t, 3, len(preemptedAllocs))
+
+	expectedPreemptedJobIDs := []string{job1.ID, job2.ID, job3.ID}
+
+	// We expect job1, job2 and job3 to have preempted allocations
+	// job4 should not have any allocs preempted
+	for _, alloc := range preemptedAllocs {
+		require.Contains(t, expectedPreemptedJobIDs, alloc.JobID)
+	}
+
+	// Look up the preempted allocs by job ID
+	ws = memdb.NewWatchSet()
+
+	for _, jobId := range expectedPreemptedJobIDs {
+		out, err = h.State.AllocsByJob(ws, structs.DefaultNamespace, jobId, false)
+		require.NoError(t, err)
+		for _, alloc := range out {
+			require.Equal(t, structs.AllocDesiredStatusEvict, alloc.DesiredStatus)
+			require.Equal(t, fmt.Sprintf("Preempted by alloc ID %v", preemptingAllocId), alloc.DesiredDescription)
+		}
+	}
+
+	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+}
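
The preemption test above flips SysBatchSchedulerEnabled directly in the state store; operators would set the same knob through the operator API. A hedged sketch using the api package's PreemptionConfig from this PR; the exact client call shape (SchedulerSetConfiguration) is an assumption and worth verifying against the api package:

package main

import (
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Allow sysbatch jobs to preempt lower-priority allocations, alongside
	// the default system-scheduler preemption.
	_, err = client.Operator().SchedulerSetConfiguration(&api.SchedulerConfiguration{
		PreemptionConfig: api.PreemptionConfig{
			SystemSchedulerEnabled:   true,
			SysBatchSchedulerEnabled: true,
		},
	}, nil)
	if err != nil {
		log.Fatal(err)
	}
}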
+
+func TestSysBatch_canHandle(t *testing.T) {
+	s := SystemScheduler{sysbatch: true}
+	t.Run("sysbatch register", func(t *testing.T) {
+		require.True(t, s.canHandle(structs.EvalTriggerJobRegister))
+	})
+	t.Run("sysbatch scheduled", func(t *testing.T) {
+		require.False(t, s.canHandle(structs.EvalTriggerScheduled))
+	})
+	t.Run("sysbatch periodic", func(t *testing.T) {
+		require.True(t, s.canHandle(structs.EvalTriggerPeriodicJob))
+	})
+}
+
+func createNodes(t *testing.T, h *Harness, n int) []*structs.Node {
+	nodes := make([]*structs.Node, n)
+	for i := 0; i < n; i++ {
+		node := mock.Node()
+		nodes[i] = node
+		require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+	}
+	return nodes
+}
diff --git a/scheduler/system_sched.go b/scheduler/scheduler_system.go
similarity index 85%
rename from scheduler/system_sched.go
rename to scheduler/scheduler_system.go
index 4b1e5c8cbfaa..53e4b4eefbb0 100644
--- a/scheduler/system_sched.go
+++ b/scheduler/scheduler_system.go
@@ -14,15 +14,21 @@ const (
 	// we will attempt to schedule if we continue to hit conflicts for system
 	// jobs.
 	maxSystemScheduleAttempts = 5
+
+	// maxSysBatchScheduleAttempts is used to limit the number of times we will
+	// attempt to schedule if we continue to hit conflicts for sysbatch jobs.
+	maxSysBatchScheduleAttempts = 2
 )
 
-// SystemScheduler is used for 'system' jobs. This scheduler is
-// designed for services that should be run on every client.
-// One for each job, containing an allocation for each node
+// SystemScheduler is used for 'system' and 'sysbatch' jobs. This scheduler is
+// designed for jobs that should be run on every client. The 'system' mode
+// will ensure those jobs continuously run regardless of successful task exits,
+// whereas 'sysbatch' considers the task complete on success.
type SystemScheduler struct { - logger log.Logger - state State - planner Planner + logger log.Logger + state State + planner Planner + sysbatch bool eval *structs.Evaluation job *structs.Job @@ -30,8 +36,9 @@ type SystemScheduler struct { planResult *structs.PlanResult ctx *EvalContext stack *SystemStack - nodes []*structs.Node - nodesByDC map[string]int + + nodes []*structs.Node + nodesByDC map[string]int limitReached bool nextEval *structs.Evaluation @@ -44,14 +51,25 @@ type SystemScheduler struct { // scheduler. func NewSystemScheduler(logger log.Logger, state State, planner Planner) Scheduler { return &SystemScheduler{ - logger: logger.Named("system_sched"), - state: state, - planner: planner, + logger: logger.Named("system_sched"), + state: state, + planner: planner, + sysbatch: false, + } +} + +func NewSysBatchScheduler(logger log.Logger, state State, planner Planner) Scheduler { + return &SystemScheduler{ + logger: logger.Named("sysbatch_sched"), + state: state, + planner: planner, + sysbatch: true, } } // Process is used to handle a single evaluation. func (s *SystemScheduler) Process(eval *structs.Evaluation) error { + // Store the evaluation s.eval = eval @@ -59,21 +77,20 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) error { s.logger = s.logger.With("eval_id", eval.ID, "job_id", eval.JobID, "namespace", eval.Namespace) // Verify the evaluation trigger reason is understood - switch eval.TriggeredBy { - case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, structs.EvalTriggerFailedFollowUp, - structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate, structs.EvalTriggerPreemption, - structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerNodeDrain, structs.EvalTriggerAllocStop, - structs.EvalTriggerQueuedAllocs, structs.EvalTriggerScaling: - default: - desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", - eval.TriggeredBy) + if !s.canHandle(eval.TriggeredBy) { + desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy) return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusFailed, desc, s.queuedAllocs, "") } + limit := maxSystemScheduleAttempts + if s.sysbatch { + limit = maxSysBatchScheduleAttempts + } + // Retry up to the maxSystemScheduleAttempts and reset if progress is made. 
progress := func() bool { return progressMade(s.planResult) } - if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil { + if err := retryMax(limit, s.process, progress); err != nil { if statusErr, ok := err.(*SetStatusError); ok { return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, statusErr.EvalStatus, err.Error(), s.queuedAllocs, "") @@ -94,9 +111,9 @@ func (s *SystemScheduler) process() (bool, error) { ws := memdb.NewWatchSet() s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID) if err != nil { - return false, fmt.Errorf("failed to get job '%s': %v", - s.eval.JobID, err) + return false, fmt.Errorf("failed to get job '%s': %v", s.eval.JobID, err) } + numTaskGroups := 0 if !s.job.Stopped() { numTaskGroups = len(s.job.TaskGroups) @@ -121,7 +138,7 @@ func (s *SystemScheduler) process() (bool, error) { s.ctx = NewEvalContext(s.state, s.plan, s.logger) // Construct the placement stack - s.stack = NewSystemStack(s.ctx) + s.stack = NewSystemStack(s.sysbatch, s.ctx) if !s.job.Stopped() { s.stack.SetJob(s.job) } @@ -185,26 +202,24 @@ func (s *SystemScheduler) computeJobAllocs() error { ws := memdb.NewWatchSet() allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true) if err != nil { - return fmt.Errorf("failed to get allocs for job '%s': %v", - s.eval.JobID, err) + return fmt.Errorf("failed to get allocs for job '%s': %v", s.eval.JobID, err) } // Determine the tainted nodes containing job allocs tainted, err := taintedNodes(s.state, allocs) if err != nil { - return fmt.Errorf("failed to get tainted nodes for job '%s': %v", - s.eval.JobID, err) + return fmt.Errorf("failed to get tainted nodes for job '%s': %v", s.eval.JobID, err) } // Update the allocations which are in pending/running state on tainted - // nodes to lost + // nodes to lost. 
updateNonTerminalAllocsToLost(s.plan, tainted, allocs)
 
-	// Filter out the allocations in a terminal state
-	allocs, terminalAllocs := structs.FilterTerminalAllocs(allocs)
+	// Split out terminal allocations
+	live, term := structs.SplitTerminalAllocs(allocs)
 
 	// Diff the required and existing allocations
-	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs, terminalAllocs)
+	diff := diffSystemAllocs(s.job, s.nodes, tainted, live, term)
 	s.logger.Debug("reconciled current state with desired state",
 		"place", len(diff.place), "update", len(diff.update),
 		"migrate", len(diff.migrate), "stop", len(diff.stop),
@@ -423,3 +438,25 @@ func (s *SystemScheduler) addBlocked(node *structs.Node) error {
 
 	return s.planner.CreateEval(blocked)
 }
+
+func (s *SystemScheduler) canHandle(trigger string) bool {
+	switch trigger {
+	case structs.EvalTriggerJobRegister:
+	case structs.EvalTriggerNodeUpdate:
+	case structs.EvalTriggerFailedFollowUp:
+	case structs.EvalTriggerJobDeregister:
+	case structs.EvalTriggerRollingUpdate:
+	case structs.EvalTriggerPreemption:
+	case structs.EvalTriggerDeploymentWatcher:
+	case structs.EvalTriggerNodeDrain:
+	case structs.EvalTriggerAllocStop:
+	case structs.EvalTriggerQueuedAllocs:
+	case structs.EvalTriggerScaling:
+	default:
+		if s.sysbatch {
+			return trigger == structs.EvalTriggerPeriodicJob
+		}
+		return false
+	}
+	return true
+}
diff --git a/scheduler/system_sched_test.go b/scheduler/scheduler_system_test.go
similarity index 84%
rename from scheduler/system_sched_test.go
rename to scheduler/scheduler_system_test.go
index 35ed1ce5189d..e3cff0e646b7 100644
--- a/scheduler/system_sched_test.go
+++ b/scheduler/scheduler_system_test.go
@@ -19,10 +19,7 @@ func TestSystemSched_JobRegister(t *testing.T) {
 	h := NewHarness(t)
 
 	// Create some nodes
-	for i := 0; i < 10; i++ {
-		node := mock.Node()
-		require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
-	}
+	_ = createNodes(t, h, 10)
 
 	// Create a job
 	job := mock.SystemJob()
@@ -41,29 +38,21 @@ func TestSystemSched_JobRegister(t *testing.T) {
 
 	// Process the evaluation
 	err := h.Process(NewSystemScheduler, eval)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	require.NoError(t, err)
 
 	// Ensure a single plan
-	if len(h.Plans) != 1 {
-		t.Fatalf("bad: %#v", h.Plans)
-	}
+	require.Len(t, h.Plans, 1)
 	plan := h.Plans[0]
 
-	// Ensure the plan doesn't have annotations.
-	if plan.Annotations != nil {
-		t.Fatalf("expected no annotations")
-	}
+	// Ensure the plan does not have annotations
+	require.Nil(t, plan.Annotations, "expected no annotations")
 
 	// Ensure the plan allocated
 	var planned []*structs.Allocation
 	for _, allocList := range plan.NodeAllocation {
 		planned = append(planned, allocList...)
} - if len(planned) != 10 { - t.Fatalf("bad: %#v", plan) - } + require.Len(t, planned, 10) // Lookup the allocations by JobID ws := memdb.NewWatchSet() @@ -71,20 +60,16 @@ func TestSystemSched_JobRegister(t *testing.T) { require.NoError(t, err) // Ensure all allocations placed - if len(out) != 10 { - t.Fatalf("bad: %#v", out) - } + require.Len(t, out, 10) // Check the available nodes - if count, ok := out[0].Metrics.NodesAvailable["dc1"]; !ok || count != 10 { - t.Fatalf("bad: %#v", out[0].Metrics) - } + count, ok := out[0].Metrics.NodesAvailable["dc1"] + require.True(t, ok) + require.Equal(t, 10, count, "bad metrics %#v:", out[0].Metrics) // Ensure no allocations are queued queued := h.Evals[0].QueuedAllocations["web"] - if queued != 0 { - t.Fatalf("expected queued allocations: %v, actual: %v", 0, queued) - } + require.Equal(t, 0, queued, "unexpected queued allocations") h.AssertEvalStatus(t, structs.EvalStatusComplete) } @@ -93,10 +78,7 @@ func TestSystemSched_JobRegister_StickyAllocs(t *testing.T) { h := NewHarness(t) // Create some nodes - for i := 0; i < 10; i++ { - node := mock.Node() - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + _ = createNodes(t, h, 10) // Create a job job := mock.SystemJob() @@ -168,7 +150,7 @@ func TestSystemSched_JobRegister_StickyAllocs(t *testing.T) { func TestSystemSched_JobRegister_EphemeralDiskConstraint(t *testing.T) { h := NewHarness(t) - // Create a nodes + // Create a node node := mock.Node() require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) @@ -237,7 +219,7 @@ func TestSystemSched_JobRegister_EphemeralDiskConstraint(t *testing.T) { func TestSystemSched_ExhaustResources(t *testing.T) { h := NewHarness(t) - // Create a nodes + // Create a node node := mock.Node() require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) @@ -412,12 +394,7 @@ func TestSystemSched_JobRegister_AddNode(t *testing.T) { h := NewHarness(t) // Create some nodes - var nodes []*structs.Node - for i := 0; i < 10; i++ { - node := mock.Node() - nodes = append(nodes, node) - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + nodes := createNodes(t, h, 10) // Generate a fake job with allocations job := mock.SystemJob() @@ -455,9 +432,7 @@ func TestSystemSched_JobRegister_AddNode(t *testing.T) { } // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } + require.Len(t, h.Plans, 1) plan := h.Plans[0] // Ensure the plan had no node updates @@ -465,19 +440,14 @@ func TestSystemSched_JobRegister_AddNode(t *testing.T) { for _, updateList := range plan.NodeUpdate { update = append(update, updateList...) } - if len(update) != 0 { - t.Log(len(update)) - t.Fatalf("bad: %#v", plan) - } + require.Empty(t, update) // Ensure the plan allocated on the new node var planned []*structs.Allocation for _, allocList := range plan.NodeAllocation { planned = append(planned, allocList...) 
} - if len(planned) != 1 { - t.Fatalf("bad: %#v", plan) - } + require.Len(t, planned, 1) // Ensure it allocated on the right node if _, ok := plan.NodeAllocation[node.ID]; !ok { @@ -534,12 +504,7 @@ func TestSystemSched_JobModify(t *testing.T) { h := NewHarness(t) // Create some nodes - var nodes []*structs.Node - for i := 0; i < 10; i++ { - node := mock.Node() - nodes = append(nodes, node) - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + nodes := createNodes(t, h, 10) // Generate a fake job with allocations job := mock.SystemJob() @@ -590,14 +555,10 @@ func TestSystemSched_JobModify(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } + require.Len(t, h.Plans, 1) plan := h.Plans[0] // Ensure the plan evicted all allocs @@ -605,18 +566,14 @@ func TestSystemSched_JobModify(t *testing.T) { for _, updateList := range plan.NodeUpdate { update = append(update, updateList...) } - if len(update) != len(allocs) { - t.Fatalf("bad: %#v", plan) - } + require.Equal(t, len(allocs), len(update)) // Ensure the plan allocated var planned []*structs.Allocation for _, allocList := range plan.NodeAllocation { planned = append(planned, allocList...) } - if len(planned) != 10 { - t.Fatalf("bad: %#v", plan) - } + require.Len(t, planned, 10) // Lookup the allocations by JobID ws := memdb.NewWatchSet() @@ -625,9 +582,7 @@ func TestSystemSched_JobModify(t *testing.T) { // Ensure all allocations placed out, _ = structs.FilterTerminalAllocs(out) - if len(out) != 10 { - t.Fatalf("bad: %#v", out) - } + require.Len(t, out, 10) h.AssertEvalStatus(t, structs.EvalStatusComplete) } @@ -636,12 +591,7 @@ func TestSystemSched_JobModify_Rolling(t *testing.T) { h := NewHarness(t) // Create some nodes - var nodes []*structs.Node - for i := 0; i < 10; i++ { - node := mock.Node() - nodes = append(nodes, node) - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + nodes := createNodes(t, h, 10) // Generate a fake job with allocations job := mock.SystemJob() @@ -739,12 +689,7 @@ func TestSystemSched_JobModify_InPlace(t *testing.T) { h := NewHarness(t) // Create some nodes - var nodes []*structs.Node - for i := 0; i < 10; i++ { - node := mock.Node() - nodes = append(nodes, node) - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + nodes := createNodes(t, h, 10) // Generate a fake job with allocations job := mock.SystemJob() @@ -766,7 +711,7 @@ func TestSystemSched_JobModify_InPlace(t *testing.T) { job2.ID = job.ID require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job2)) - // Create a mock evaluation to deal with drain + // Create a mock evaluation to deal with update eval := &structs.Evaluation{ Namespace: structs.DefaultNamespace, ID: uuid.Generate(), @@ -779,14 +724,10 @@ func TestSystemSched_JobModify_InPlace(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } + require.Len(t, h.Plans, 1) plan := h.Plans[0] // Ensure the plan did not evict any allocs @@ -794,22 +735,17 @@ func TestSystemSched_JobModify_InPlace(t *testing.T) { for _, updateList := range plan.NodeUpdate { update = append(update, 
updateList...) } - if len(update) != 0 { - t.Fatalf("bad: %#v", plan) - } + require.Empty(t, update) // Ensure the plan updated the existing allocs var planned []*structs.Allocation for _, allocList := range plan.NodeAllocation { planned = append(planned, allocList...) } - if len(planned) != 10 { - t.Fatalf("bad: %#v", plan) - } + require.Len(t, planned, 10) + for _, p := range planned { - if p.Job != job2 { - t.Fatalf("should update job") - } + require.Equal(t, job2, p.Job, "should update job") } // Lookup the allocations by JobID @@ -818,18 +754,14 @@ func TestSystemSched_JobModify_InPlace(t *testing.T) { require.NoError(t, err) // Ensure all allocations placed - if len(out) != 10 { - t.Fatalf("bad: %#v", out) - } + require.Len(t, out, 10) h.AssertEvalStatus(t, structs.EvalStatusComplete) // Verify the network did not change rp := structs.Port{Label: "admin", Value: 5000} for _, alloc := range out { for _, resources := range alloc.TaskResources { - if resources.Networks[0].ReservedPorts[0] != rp { - t.Fatalf("bad: %#v", alloc) - } + require.Equal(t, rp, resources.Networks[0].ReservedPorts[0]) } } } @@ -838,12 +770,7 @@ func TestSystemSched_JobDeregister_Purged(t *testing.T) { h := NewHarness(t) // Create some nodes - var nodes []*structs.Node - for i := 0; i < 10; i++ { - node := mock.Node() - nodes = append(nodes, node) - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + nodes := createNodes(t, h, 10) // Generate a fake job with allocations job := mock.SystemJob() @@ -875,21 +802,15 @@ func TestSystemSched_JobDeregister_Purged(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } + require.Len(t, h.Plans, 1) plan := h.Plans[0] // Ensure the plan evicted the job from all nodes. for _, node := range nodes { - if len(plan.NodeUpdate[node.ID]) != 1 { - t.Fatalf("bad: %#v", plan) - } + require.Len(t, plan.NodeUpdate[node.ID], 1) } // Lookup the allocations by JobID @@ -899,9 +820,7 @@ func TestSystemSched_JobDeregister_Purged(t *testing.T) { // Ensure no remaining allocations out, _ = structs.FilterTerminalAllocs(out) - if len(out) != 0 { - t.Fatalf("bad: %#v", out) - } + require.Empty(t, out) h.AssertEvalStatus(t, structs.EvalStatusComplete) } @@ -910,12 +829,7 @@ func TestSystemSched_JobDeregister_Stopped(t *testing.T) { h := NewHarness(t) // Create some nodes - var nodes []*structs.Node - for i := 0; i < 10; i++ { - node := mock.Node() - nodes = append(nodes, node) - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + nodes := createNodes(t, h, 10) // Generate a fake job with allocations job := mock.SystemJob() @@ -949,21 +863,15 @@ func TestSystemSched_JobDeregister_Stopped(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } + require.Len(t, h.Plans, 1) plan := h.Plans[0] // Ensure the plan evicted the job from all nodes. 
for _, node := range nodes {
-		if len(plan.NodeUpdate[node.ID]) != 1 {
-			t.Fatalf("bad: %#v", plan)
-		}
+		require.Len(t, plan.NodeUpdate[node.ID], 1)
 	}
 
 	// Lookup the allocations by JobID
@@ -973,9 +881,7 @@ func TestSystemSched_JobDeregister_Stopped(t *testing.T) {
 
 	// Ensure no remaining allocations
 	out, _ = structs.FilterTerminalAllocs(out)
-	if len(out) != 0 {
-		t.Fatalf("bad: %#v", out)
-	}
+	require.Empty(t, out)
 
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
@@ -1014,35 +920,27 @@ func TestSystemSched_NodeDown(t *testing.T) {
 
 	// Process the evaluation
 	err := h.Process(NewSystemScheduler, eval)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	require.NoError(t, err)
 
 	// Ensure a single plan
-	if len(h.Plans) != 1 {
-		t.Fatalf("bad: %#v", h.Plans)
-	}
+	require.Len(t, h.Plans, 1)
 	plan := h.Plans[0]
 
 	// Ensure the plan evicted all allocs
-	if len(plan.NodeUpdate[node.ID]) != 1 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	require.Len(t, plan.NodeUpdate[node.ID], 1)
 
 	// Ensure the plan updated the allocation.
-	var planned []*structs.Allocation
+	planned := make([]*structs.Allocation, 0)
 	for _, allocList := range plan.NodeUpdate {
 		planned = append(planned, allocList...)
 	}
-	if len(planned) != 1 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	require.Len(t, planned, 1)
 
 	// Ensure the allocations is stopped
-	if p := planned[0]; p.DesiredStatus != structs.AllocDesiredStatusStop &&
-		p.ClientStatus != structs.AllocClientStatusLost {
-		t.Fatalf("bad: %#v", planned[0])
-	}
+	p := planned[0]
+	require.Equal(t, structs.AllocDesiredStatusStop, p.DesiredStatus)
+	// Deliberately no assertion on client_status == lost: the actual
+	// client status at this point is pending.
 
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
@@ -1080,32 +978,23 @@ func TestSystemSched_NodeDrain_Down(t *testing.T) {
 	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
 
 	// Process the evaluation
-	err := h.Process(NewServiceScheduler, eval)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	err := h.Process(NewSystemScheduler, eval) // previously NewServiceScheduler, by mistake
+	require.NoError(t, err)
 
 	// Ensure a single plan
-	if len(h.Plans) != 1 {
-		t.Fatalf("bad: %#v", h.Plans)
-	}
+	require.Len(t, h.Plans, 1)
 	plan := h.Plans[0]
 
 	// Ensure the plan evicted non terminal allocs
-	if len(plan.NodeUpdate[node.ID]) != 1 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	require.Len(t, plan.NodeUpdate[node.ID], 1)
 
 	// Ensure that the allocation is marked as lost
-	var lostAllocs []string
+	var lost []string
 	for _, alloc := range plan.NodeUpdate[node.ID] {
-		lostAllocs = append(lostAllocs, alloc.ID)
+		lost = append(lost, alloc.ID)
 	}
-	expected := []string{alloc.ID}
+	require.Equal(t, []string{alloc.ID}, lost)
 
-	if !reflect.DeepEqual(lostAllocs, expected) {
-		t.Fatalf("expected: %v, actual: %v", expected, lostAllocs)
-	}
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
@@ -1143,35 +1032,24 @@ func TestSystemSched_NodeDrain(t *testing.T) {
 
 	// Process the evaluation
 	err := h.Process(NewSystemScheduler, eval)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	require.NoError(t, err)
 
 	// Ensure a single plan
-	if len(h.Plans) != 1 {
-		t.Fatalf("bad: %#v", h.Plans)
-	}
+	require.Len(t, h.Plans, 1)
 	plan := h.Plans[0]
 
 	// Ensure the plan evicted all allocs
-	if len(plan.NodeUpdate[node.ID]) != 1 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	require.Len(t, plan.NodeUpdate[node.ID], 1)
 
 	// Ensure the plan updated the allocation.
- var planned []*structs.Allocation + planned := make([]*structs.Allocation, 0) for _, allocList := range plan.NodeUpdate { planned = append(planned, allocList...) } - if len(planned) != 1 { - t.Log(len(planned)) - t.Fatalf("bad: %#v", plan) - } + require.Len(t, planned, 1) // Ensure the allocations is stopped - if planned[0].DesiredStatus != structs.AllocDesiredStatusStop { - t.Fatalf("bad: %#v", planned[0]) - } + require.Equal(t, structs.AllocDesiredStatusStop, planned[0].DesiredStatus) h.AssertEvalStatus(t, structs.EvalStatusComplete) } @@ -1194,7 +1072,7 @@ func TestSystemSched_NodeUpdate(t *testing.T) { alloc.Name = "my-job.web[0]" require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc})) - // Create a mock evaluation to deal + // Create a mock evaluation to deal with the node update eval := &structs.Evaluation{ Namespace: structs.DefaultNamespace, ID: uuid.Generate(), @@ -1208,14 +1086,12 @@ func TestSystemSched_NodeUpdate(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure that queued allocations is zero - if val, ok := h.Evals[0].QueuedAllocations["web"]; !ok || val != 0 { - t.Fatalf("bad queued allocations: %#v", h.Evals[0].QueuedAllocations) - } + val, ok := h.Evals[0].QueuedAllocations["web"] + require.True(t, ok) + require.Zero(t, val) h.AssertEvalStatus(t, structs.EvalStatusComplete) } @@ -1225,16 +1101,13 @@ func TestSystemSched_RetryLimit(t *testing.T) { h.Planner = &RejectPlan{h} // Create some nodes - for i := 0; i < 10; i++ { - node := mock.Node() - require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) - } + _ = createNodes(t, h, 10) // Create a job job := mock.SystemJob() require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) - // Create a mock evaluation to deregister the job + // Create a mock evaluation to register the job eval := &structs.Evaluation{ Namespace: structs.DefaultNamespace, ID: uuid.Generate(), @@ -1247,14 +1120,10 @@ func TestSystemSched_RetryLimit(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure multiple plans - if len(h.Plans) == 0 { - t.Fatalf("bad: %#v", h.Plans) - } + require.NotEmpty(t, h.Plans) // Lookup the allocations by JobID ws := memdb.NewWatchSet() @@ -1262,9 +1131,7 @@ func TestSystemSched_RetryLimit(t *testing.T) { require.NoError(t, err) // Ensure no allocations placed - if len(out) != 0 { - t.Fatalf("bad: %#v", out) - } + require.Empty(t, out) // Should hit the retry limit h.AssertEvalStatus(t, structs.EvalStatusFailed) @@ -1272,7 +1139,7 @@ func TestSystemSched_RetryLimit(t *testing.T) { // This test ensures that the scheduler doesn't increment the queued allocation // count for a task group when allocations can't be created on currently -// available nodes because of constrain mismatches. +// available nodes because of constraint mismatches. 
func TestSystemSched_Queued_With_Constraints(t *testing.T) { h := NewHarness(t) @@ -1285,7 +1152,7 @@ func TestSystemSched_Queued_With_Constraints(t *testing.T) { job := mock.SystemJob() require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) - // Create a mock evaluation to deal + // Create a mock evaluation to deal with the node update eval := &structs.Evaluation{ Namespace: structs.DefaultNamespace, ID: uuid.Generate(), @@ -1299,20 +1166,17 @@ func TestSystemSched_Queued_With_Constraints(t *testing.T) { // Process the evaluation err := h.Process(NewSystemScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(t, err) // Ensure that queued allocations is zero - if val, ok := h.Evals[0].QueuedAllocations["web"]; !ok || val != 0 { - t.Fatalf("bad queued allocations: %#v", h.Evals[0].QueuedAllocations) - } - + val, ok := h.Evals[0].QueuedAllocations["web"] + require.True(t, ok) + require.Zero(t, val) } // This test ensures that the scheduler correctly ignores ineligible // nodes when scheduling due to a new node being added. The job has two -// task groups contrained to a particular node class. The desired behavior +// task groups constrained to a particular node class. The desired behavior // should be that the TaskGroup constrained to the newly added node class is // added and that the TaskGroup constrained to the ineligible node is ignored. func TestSystemSched_JobConstraint_AddNode(t *testing.T) { @@ -1322,13 +1186,13 @@ func TestSystemSched_JobConstraint_AddNode(t *testing.T) { var node *structs.Node node = mock.Node() node.NodeClass = "Class-A" - node.ComputeClass() + require.NoError(t, node.ComputeClass()) require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) var nodeB *structs.Node nodeB = mock.Node() nodeB.NodeClass = "Class-B" - nodeB.ComputeClass() + require.NoError(t, nodeB.ComputeClass()) require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), nodeB)) // Make a job with two task groups, each constraint to a node class @@ -1365,7 +1229,6 @@ func TestSystemSched_JobConstraint_AddNode(t *testing.T) { JobID: job.ID, Status: structs.EvalStatusPending, } - require.Nil(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) require.Nil(t, h.Process(NewSystemScheduler, eval)) @@ -1414,7 +1277,7 @@ func TestSystemSched_JobConstraint_AddNode(t *testing.T) { // Add a new node Class-B var nodeBTwo *structs.Node nodeBTwo = mock.Node() - nodeBTwo.ComputeClass() + require.NoError(t, nodeBTwo.ComputeClass()) nodeBTwo.NodeClass = "Class-B" require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), nodeBTwo)) @@ -1467,7 +1330,7 @@ func TestSystemSched_ExistingAllocNoNodes(t *testing.T) { var node *structs.Node // Create a node node = mock.Node() - node.ComputeClass() + require.NoError(t, node.ComputeClass()) require.Nil(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) // Make a job @@ -1498,6 +1361,7 @@ func TestSystemSched_ExistingAllocNoNodes(t *testing.T) { // Mark the node as ineligible node.SchedulingEligibility = structs.NodeSchedulingIneligible + // Evaluate the job eval2 := &structs.Evaluation{ Namespace: structs.DefaultNamespace, @@ -1549,7 +1413,7 @@ func TestSystemSched_ConstraintErrors(t *testing.T) { for _, tag := range []string{"aaaaaa", "foo", "foo", "foo"} { node = mock.Node() node.Meta["tag"] = tag - node.ComputeClass() + require.NoError(t, node.ComputeClass()) require.Nil(t, 
h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
 	}
 
@@ -1614,10 +1478,7 @@ func TestSystemSched_ChainedAlloc(t *testing.T) {
 	h := NewHarness(t)
 
 	// Create some nodes
-	for i := 0; i < 10; i++ {
-		node := mock.Node()
-		require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
-	}
+	_ = createNodes(t, h, 10)
 
 	// Create a job
 	job := mock.SystemJob()
@@ -1633,10 +1494,10 @@ func TestSystemSched_ChainedAlloc(t *testing.T) {
 		Status:      structs.EvalStatusPending,
 	}
 	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval}))
+
 	// Process the evaluation
-	if err := h.Process(NewSystemScheduler, eval); err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	err := h.Process(NewSystemScheduler, eval)
+	require.NoError(t, err)
 
 	var allocIDs []string
 	for _, allocList := range h.Plans[0].NodeAllocation {
@@ -1675,6 +1536,7 @@ func TestSystemSched_ChainedAlloc(t *testing.T) {
 		t.Fatalf("err: %v", err)
 	}
 
+	require.Len(t, h1.Plans, 1)
 	plan := h1.Plans[0]
 
 	// Collect all the chained allocation ids and the new allocations which
@@ -1694,14 +1556,10 @@ func TestSystemSched_ChainedAlloc(t *testing.T) {
 
 	// Ensure that the new allocations has their corresponding original
 	// allocation ids
-	if !reflect.DeepEqual(prevAllocs, allocIDs) {
-		t.Fatalf("expected: %v, actual: %v", len(allocIDs), len(prevAllocs))
-	}
+	require.Equal(t, allocIDs, prevAllocs)
 
 	// Ensuring two new allocations don't have any chained allocations
-	if len(newAllocs) != 2 {
-		t.Fatalf("expected: %v, actual: %v", 2, len(newAllocs))
-	}
+	require.Len(t, newAllocs, 2)
 }
 
 func TestSystemSched_PlanWithDrainedNode(t *testing.T) {
@@ -1711,12 +1569,12 @@ func TestSystemSched_PlanWithDrainedNode(t *testing.T) {
 	node := mock.Node()
 	node.NodeClass = "green"
 	node.Drain = true
-	node.ComputeClass()
+	require.NoError(t, node.ComputeClass())
 	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
 
 	node2 := mock.Node()
 	node2.NodeClass = "blue"
-	node2.ComputeClass()
+	require.NoError(t, node2.ComputeClass())
 	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node2))
 
 	// Create a Job with two task groups, each constrained on node class
@@ -1766,31 +1624,21 @@ func TestSystemSched_PlanWithDrainedNode(t *testing.T) {
 
 	// Process the evaluation
 	err := h.Process(NewSystemScheduler, eval)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	require.NoError(t, err)
 
 	// Ensure a single plan
-	if len(h.Plans) != 1 {
-		t.Fatalf("bad: %#v", h.Plans)
-	}
+	require.Len(t, h.Plans, 1)
 	plan := h.Plans[0]
 
 	// Ensure the plan evicted the alloc on the failed node
 	planned := plan.NodeUpdate[node.ID]
-	if len(planned) != 1 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	require.Len(t, planned, 1)
 
 	// Ensure the plan didn't place
-	if len(plan.NodeAllocation) != 0 {
-		t.Fatalf("bad: %#v", plan)
-	}
+	require.Empty(t, plan.NodeAllocation)
 
 	// Ensure the allocations is stopped
-	if planned[0].DesiredStatus != structs.AllocDesiredStatusStop {
-		t.Fatalf("bad: %#v", planned[0])
-	}
+	require.Equal(t, structs.AllocDesiredStatusStop, planned[0].DesiredStatus)
 
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
@@ -1801,12 +1649,12 @@ func TestSystemSched_QueuedAllocsMultTG(t *testing.T) {
 	// Register two nodes with two different classes
 	node := mock.Node()
 	node.NodeClass = "green"
-	node.ComputeClass()
+	require.NoError(t, node.ComputeClass())
 	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
 
 	node2 := mock.Node()
node2.NodeClass = "blue"
-	node2.ComputeClass()
+	require.NoError(t, node2.ComputeClass())
 	require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node2))
 
 	// Create a Job with two task groups, each constrained on node class
@@ -1839,19 +1687,14 @@ func TestSystemSched_QueuedAllocsMultTG(t *testing.T) {
 
 	// Process the evaluation
 	err := h.Process(NewSystemScheduler, eval)
-	if err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	require.NoError(t, err)
 
 	// Ensure a single plan
-	if len(h.Plans) != 1 {
-		t.Fatalf("bad: %#v", h.Plans)
-	}
+	require.Len(t, h.Plans, 1)
 
 	qa := h.Evals[0].QueuedAllocations
-	if qa["web"] != 0 || qa["web2"] != 0 {
-		t.Fatalf("bad queued allocations %#v", qa)
-	}
+	require.Zero(t, qa["web"])
+	require.Zero(t, qa["web2"])
 
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
@@ -1860,63 +1703,50 @@ func TestSystemSched_Preemption(t *testing.T) {
 	h := NewHarness(t)
 
 	// Create nodes
-	var nodes []*structs.Node
+	nodes := make([]*structs.Node, 0)
 	for i := 0; i < 2; i++ {
 		node := mock.Node()
-		// TODO(preetha): remove in 0.11
+		// TODO: remove in 0.11
 		node.Resources = &structs.Resources{
 			CPU:      3072,
 			MemoryMB: 5034,
 			DiskMB:   20 * 1024,
-			Networks: []*structs.NetworkResource{
-				{
-					Device: "eth0",
-					CIDR:   "192.168.0.100/32",
-					MBits:  1000,
-				},
-			},
+			Networks: []*structs.NetworkResource{{
+				Device: "eth0",
+				CIDR:   "192.168.0.100/32",
+				MBits:  1000,
+			}},
 		}
 		node.NodeResources = &structs.NodeResources{
-			Cpu: structs.NodeCpuResources{
-				CpuShares: 3072,
-			},
-			Memory: structs.NodeMemoryResources{
-				MemoryMB: 5034,
-			},
-			Disk: structs.NodeDiskResources{
-				DiskMB: 20 * 1024,
-			},
-			Networks: []*structs.NetworkResource{
-				{
-					Device: "eth0",
-					CIDR:   "192.168.0.100/32",
-					MBits:  1000,
-				},
-			},
-			NodeNetworks: []*structs.NodeNetworkResource{
-				{
-					Mode:   "host",
-					Device: "eth0",
-					Addresses: []structs.NodeNetworkAddress{
-						{
-							Family:  structs.NodeNetworkAF_IPv4,
-							Alias:   "default",
-							Address: "192.168.0.100",
-						},
-					},
-				},
-			},
+			Cpu:    structs.NodeCpuResources{CpuShares: 3072},
+			Memory: structs.NodeMemoryResources{MemoryMB: 5034},
+			Disk:   structs.NodeDiskResources{DiskMB: 20 * 1024},
+			Networks: []*structs.NetworkResource{{
+				Device: "eth0",
+				CIDR:   "192.168.0.100/32",
+				MBits:  1000,
+			}},
+			NodeNetworks: []*structs.NodeNetworkResource{{
+				Mode:   "host",
+				Device: "eth0",
+				Addresses: []structs.NodeNetworkAddress{{
+					Family:  structs.NodeNetworkAF_IPv4,
+					Alias:   "default",
+					Address: "192.168.0.100",
+				}},
+			}},
 		}
 		require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
 		nodes = append(nodes, node)
 	}
 
 	// Enable Preemption
-	h.State.SchedulerSetConfig(h.NextIndex(), &structs.SchedulerConfiguration{
+	err := h.State.SchedulerSetConfig(h.NextIndex(), &structs.SchedulerConfiguration{
 		PreemptionConfig: structs.PreemptionConfig{
 			SystemSchedulerEnabled: true,
 		},
 	})
+	require.NoError(t, err)
 
 	// Create some low priority batch jobs and allocations for them
 	// One job uses a reserved port
@@ -1926,17 +1756,13 @@ func TestSystemSched_Preemption(t *testing.T) {
 	job1.TaskGroups[0].Tasks[0].Resources = &structs.Resources{
 		CPU:      512,
 		MemoryMB: 1024,
-		Networks: []*structs.NetworkResource{
-			{
-				MBits: 200,
-				ReservedPorts: []structs.Port{
-					{
-						Label: "web",
-						Value: 80,
-					},
-				},
-			},
-		},
+		Networks: []*structs.NetworkResource{{
+			MBits: 200,
+			ReservedPorts: []structs.Port{{
+				Label: "web",
+				Value: 80,
+			}},
+		}},
 	}
 
 	alloc1 := mock.Alloc()
@@ -1948,27 +1774,18 @@ func TestSystemSched_Preemption(t *testing.T) {
 	alloc1.AllocatedResources =
&structs.AllocatedResources{ Tasks: map[string]*structs.AllocatedTaskResources{ "web": { - Cpu: structs.AllocatedCpuResources{ - CpuShares: 512, - }, - Memory: structs.AllocatedMemoryResources{ - MemoryMB: 1024, - }, - Networks: []*structs.NetworkResource{ - { - Device: "eth0", - IP: "192.168.0.100", - ReservedPorts: []structs.Port{{Label: "web", Value: 80}}, - MBits: 200, - }, - }, + Cpu: structs.AllocatedCpuResources{CpuShares: 512}, + Memory: structs.AllocatedMemoryResources{MemoryMB: 1024}, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + IP: "192.168.0.100", + ReservedPorts: []structs.Port{{Label: "web", Value: 80}}, + MBits: 200, + }}, }, }, - Shared: structs.AllocatedSharedResources{ - DiskMB: 5 * 1024, - }, + Shared: structs.AllocatedSharedResources{DiskMB: 5 * 1024}, } - require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job1)) job2 := mock.BatchJob() @@ -1977,11 +1794,7 @@ func TestSystemSched_Preemption(t *testing.T) { job2.TaskGroups[0].Tasks[0].Resources = &structs.Resources{ CPU: 512, MemoryMB: 1024, - Networks: []*structs.NetworkResource{ - { - MBits: 200, - }, - }, + Networks: []*structs.NetworkResource{{MBits: 200}}, } alloc2 := mock.Alloc() @@ -1993,24 +1806,16 @@ func TestSystemSched_Preemption(t *testing.T) { alloc2.AllocatedResources = &structs.AllocatedResources{ Tasks: map[string]*structs.AllocatedTaskResources{ "web": { - Cpu: structs.AllocatedCpuResources{ - CpuShares: 512, - }, - Memory: structs.AllocatedMemoryResources{ - MemoryMB: 1024, - }, - Networks: []*structs.NetworkResource{ - { - Device: "eth0", - IP: "192.168.0.100", - MBits: 200, - }, - }, + Cpu: structs.AllocatedCpuResources{CpuShares: 512}, + Memory: structs.AllocatedMemoryResources{MemoryMB: 1024}, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + IP: "192.168.0.100", + MBits: 200, + }}, }, }, - Shared: structs.AllocatedSharedResources{ - DiskMB: 5 * 1024, - }, + Shared: structs.AllocatedSharedResources{DiskMB: 5 * 1024}, } require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job2)) @@ -2020,12 +1825,10 @@ func TestSystemSched_Preemption(t *testing.T) { job3.TaskGroups[0].Tasks[0].Resources = &structs.Resources{ CPU: 1024, MemoryMB: 2048, - Networks: []*structs.NetworkResource{ - { - Device: "eth0", - MBits: 400, - }, - }, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + MBits: 400, + }}, } alloc3 := mock.Alloc() @@ -2037,25 +1840,17 @@ func TestSystemSched_Preemption(t *testing.T) { alloc3.AllocatedResources = &structs.AllocatedResources{ Tasks: map[string]*structs.AllocatedTaskResources{ "web": { - Cpu: structs.AllocatedCpuResources{ - CpuShares: 1024, - }, - Memory: structs.AllocatedMemoryResources{ - MemoryMB: 25, - }, - Networks: []*structs.NetworkResource{ - { - Device: "eth0", - IP: "192.168.0.100", - ReservedPorts: []structs.Port{{Label: "web", Value: 80}}, - MBits: 400, - }, - }, + Cpu: structs.AllocatedCpuResources{CpuShares: 1024}, + Memory: structs.AllocatedMemoryResources{MemoryMB: 25}, + Networks: []*structs.NetworkResource{{ + Device: "eth0", + IP: "192.168.0.100", + ReservedPorts: []structs.Port{{Label: "web", Value: 80}}, + MBits: 400, + }}, }, }, - Shared: structs.AllocatedSharedResources{ - DiskMB: 5 * 1024, - }, + Shared: structs.AllocatedSharedResources{DiskMB: 5 * 1024}, } require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Allocation{alloc1, alloc2, alloc3})) @@ -2068,11 +1863,7 @@ func TestSystemSched_Preemption(t *testing.T) { 
job4.TaskGroups[0].Tasks[0].Resources = &structs.Resources{ CPU: 1024, MemoryMB: 2048, - Networks: []*structs.NetworkResource{ - { - MBits: 100, - }, - }, + Networks: []*structs.NetworkResource{{MBits: 100}}, } alloc4 := mock.Alloc() @@ -2112,12 +1903,10 @@ func TestSystemSched_Preemption(t *testing.T) { job.TaskGroups[0].Tasks[0].Resources = &structs.Resources{ CPU: 1948, MemoryMB: 256, - Networks: []*structs.NetworkResource{ - { - MBits: 800, - DynamicPorts: []structs.Port{{Label: "http"}}, - }, - }, + Networks: []*structs.NetworkResource{{ + MBits: 800, + DynamicPorts: []structs.Port{{Label: "http"}}, + }}, } require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job)) @@ -2133,21 +1922,20 @@ func TestSystemSched_Preemption(t *testing.T) { require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) // Process the evaluation - err := h.Process(NewSystemScheduler, eval) - require := require.New(t) - require.Nil(err) + err = h.Process(NewSystemScheduler, eval) + require.Nil(t, err) // Ensure a single plan - require.Equal(1, len(h.Plans)) + require.Equal(t, 1, len(h.Plans)) plan := h.Plans[0] - // Ensure the plan doesn't have annotations. - require.Nil(plan.Annotations) + // Ensure the plan doesn't have annotations + require.Nil(t, plan.Annotations) // Ensure the plan allocated on both nodes var planned []*structs.Allocation preemptingAllocId := "" - require.Equal(2, len(plan.NodeAllocation)) + require.Equal(t, 2, len(plan.NodeAllocation)) // The alloc that got placed on node 1 is the preemptor for _, allocList := range plan.NodeAllocation { @@ -2162,37 +1950,49 @@ func TestSystemSched_Preemption(t *testing.T) { // Lookup the allocations by JobID ws := memdb.NewWatchSet() out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) - require.NoError(err) + require.NoError(t, err) // Ensure all allocations placed - require.Equal(2, len(out)) + require.Equal(t, 2, len(out)) // Verify that one node has preempted allocs - require.NotNil(plan.NodePreemptions[nodes[0].ID]) + require.NotNil(t, plan.NodePreemptions[nodes[0].ID]) preemptedAllocs := plan.NodePreemptions[nodes[0].ID] // Verify that three jobs have preempted allocs - require.Equal(3, len(preemptedAllocs)) + require.Equal(t, 3, len(preemptedAllocs)) expectedPreemptedJobIDs := []string{job1.ID, job2.ID, job3.ID} // We expect job1, job2 and job3 to have preempted allocations // job4 should not have any allocs preempted for _, alloc := range preemptedAllocs { - require.Contains(expectedPreemptedJobIDs, alloc.JobID) + require.Contains(t, expectedPreemptedJobIDs, alloc.JobID) } // Look up the preempted allocs by job ID ws = memdb.NewWatchSet() for _, jobId := range expectedPreemptedJobIDs { out, err = h.State.AllocsByJob(ws, structs.DefaultNamespace, jobId, false) - require.NoError(err) + require.NoError(t, err) for _, alloc := range out { - require.Equal(structs.AllocDesiredStatusEvict, alloc.DesiredStatus) - require.Equal(fmt.Sprintf("Preempted by alloc ID %v", preemptingAllocId), alloc.DesiredDescription) + require.Equal(t, structs.AllocDesiredStatusEvict, alloc.DesiredStatus) + require.Equal(t, fmt.Sprintf("Preempted by alloc ID %v", preemptingAllocId), alloc.DesiredDescription) } } h.AssertEvalStatus(t, structs.EvalStatusComplete) +} +func TestSystemSched_canHandle(t *testing.T) { + s := SystemScheduler{sysbatch: false} + t.Run("system register", func(t *testing.T) { + require.True(t, s.canHandle(structs.EvalTriggerJobRegister)) + }) + t.Run("system 
scheduled", func(t *testing.T) { + require.False(t, s.canHandle(structs.EvalTriggerScheduled)) + }) + t.Run("system periodic", func(t *testing.T) { + require.False(t, s.canHandle(structs.EvalTriggerPeriodicJob)) + }) } diff --git a/scheduler/stack.go b/scheduler/stack.go index bccabc7899ab..cf01c2992afe 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -198,8 +198,12 @@ type SystemStack struct { scoreNorm *ScoreNormalizationIterator } -// NewSystemStack constructs a stack used for selecting system job placements. -func NewSystemStack(ctx Context) *SystemStack { +// NewSystemStack constructs a stack used for selecting system and sysbatch +// job placements. +// +// sysbatch is used to determine which scheduler config option is used to +// control the use of preemption. +func NewSystemStack(sysbatch bool, ctx Context) *SystemStack { // Create a new stack s := &SystemStack{ctx: ctx} @@ -237,10 +241,13 @@ func NewSystemStack(ctx Context) *SystemStack { // previously been marked as eligible or ineligible. Generally this will be // checks that only needs to examine the single node to determine feasibility. jobs := []FeasibilityChecker{s.jobConstraint} - tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint, + tgs := []FeasibilityChecker{ + s.taskGroupDrivers, + s.taskGroupConstraint, s.taskGroupHostVolumes, s.taskGroupDevices, - s.taskGroupNetwork} + s.taskGroupNetwork, + } avail := []FeasibilityChecker{s.taskGroupCSIVolumes} s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs, avail) @@ -257,9 +264,14 @@ func NewSystemStack(ctx Context) *SystemStack { schedulerAlgorithm := schedConfig.EffectiveSchedulerAlgorithm() enablePreemption := true if schedConfig != nil { - enablePreemption = schedConfig.PreemptionConfig.SystemSchedulerEnabled + if sysbatch { + enablePreemption = schedConfig.PreemptionConfig.SysBatchSchedulerEnabled + } else { + enablePreemption = schedConfig.PreemptionConfig.SystemSchedulerEnabled + } } + // Create binpack iterator s.binPack = NewBinPackIterator(ctx, rankSource, enablePreemption, 0, schedulerAlgorithm) // Apply score normalization @@ -360,11 +372,13 @@ func NewGenericStack(batch bool, ctx Context) *GenericStack { // previously been marked as eligible or ineligible. Generally this will be // checks that only needs to examine the single node to determine feasibility. 
jobs := []FeasibilityChecker{s.jobConstraint} - tgs := []FeasibilityChecker{s.taskGroupDrivers, + tgs := []FeasibilityChecker{ + s.taskGroupDrivers, s.taskGroupConstraint, s.taskGroupHostVolumes, s.taskGroupDevices, - s.taskGroupNetwork} + s.taskGroupNetwork, + } avail := []FeasibilityChecker{s.taskGroupCSIVolumes} s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs, avail) diff --git a/scheduler/stack_test.go b/scheduler/stack_test.go index 4650546d32f1..b45d91bc5d16 100644 --- a/scheduler/stack_test.go +++ b/scheduler/stack_test.go @@ -389,7 +389,7 @@ func TestServiceStack_Select_BinPack_Overflow(t *testing.T) { func TestSystemStack_SetNodes(t *testing.T) { _, ctx := testContext(t) - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) nodes := []*structs.Node{ mock.Node(), @@ -411,7 +411,7 @@ func TestSystemStack_SetNodes(t *testing.T) { func TestSystemStack_SetJob(t *testing.T) { _, ctx := testContext(t) - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) job := mock.Job() stack.SetJob(job) @@ -427,7 +427,7 @@ func TestSystemStack_SetJob(t *testing.T) { func TestSystemStack_Select_Size(t *testing.T) { _, ctx := testContext(t) nodes := []*structs.Node{mock.Node()} - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) stack.SetNodes(nodes) job := mock.Job() @@ -455,7 +455,7 @@ func TestSystemStack_Select_MetricsReset(t *testing.T) { mock.Node(), mock.Node(), } - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) stack.SetNodes(nodes) job := mock.Job() @@ -491,7 +491,7 @@ func TestSystemStack_Select_DriverFilter(t *testing.T) { zero := nodes[0] zero.Attributes["driver.foo"] = "1" - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) stack.SetNodes(nodes) job := mock.Job() @@ -513,7 +513,7 @@ func TestSystemStack_Select_DriverFilter(t *testing.T) { t.Fatalf("ComputedClass() failed: %v", err) } - stack = NewSystemStack(ctx) + stack = NewSystemStack(false, ctx) stack.SetNodes(nodes) stack.SetJob(job) node = stack.Select(job.TaskGroups[0], selectOptions) @@ -534,7 +534,7 @@ func TestSystemStack_Select_ConstraintFilter(t *testing.T) { t.Fatalf("ComputedClass() failed: %v", err) } - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) stack.SetNodes(nodes) job := mock.Job() @@ -577,7 +577,7 @@ func TestSystemStack_Select_BinPack_Overflow(t *testing.T) { } one := nodes[1] - stack := NewSystemStack(ctx) + stack := NewSystemStack(false, ctx) stack.SetNodes(nodes) job := mock.Job() diff --git a/scheduler/util.go b/scheduler/util.go index 7261f67deb8f..75f291e6410a 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -60,21 +60,19 @@ func (d *diffResult) Append(other *diffResult) { // need to be migrated (node is draining), the allocs that need to be evicted // (no longer required), those that should be ignored and those that are lost // that need to be replaced (running on a lost node). -// -// job is the job whose allocs is going to be diff-ed. -// taintedNodes is an index of the nodes which are either down or in drain mode -// by name. -// required is a set of allocations that must exist. -// allocs is a list of non terminal allocations. -// terminalAllocs is an index of the latest terminal allocations by name. 
-func diffSystemAllocsForNode(job *structs.Job, nodeID string,
-	eligibleNodes, taintedNodes map[string]*structs.Node,
-	required map[string]*structs.TaskGroup, allocs []*structs.Allocation,
-	terminalAllocs map[string]*structs.Allocation) *diffResult {
-	result := &diffResult{}
+func diffSystemAllocsForNode(
+	job *structs.Job, // job whose allocs are going to be diff-ed
+	nodeID string,
+	eligibleNodes map[string]*structs.Node,
+	taintedNodes map[string]*structs.Node, // nodes which are down or in drain (by node name)
+	required map[string]*structs.TaskGroup, // set of allocations that must exist
+	allocs []*structs.Allocation, // non-terminal allocations that exist
+	terminal structs.TerminalByNodeByName, // latest terminal allocations (by node, name)
+) *diffResult {
+	result := new(diffResult)
 
 	// Scan the existing updates
-	existing := make(map[string]struct{})
+	existing := make(map[string]struct{}) // set of alloc names
 	for _, exist := range allocs {
 		// Index the existing node
 		name := exist.Name
@@ -102,6 +100,17 @@ func diffSystemAllocsForNode(job *structs.Job, nodeID string,
 			})
 			continue
 		}
+
+		// If we are a sysbatch job and the alloc is terminal, ignore it
+		if job.Type == structs.JobTypeSysBatch && exist.TerminalStatus() {
+			result.ignore = append(result.ignore, allocTuple{
+				Name:      name,
+				TaskGroup: tg,
+				Alloc:     exist,
+			})
+			continue
+		}
+
 		// If we are on a tainted node, we must migrate if we are a service or
 		// if the batch allocation did not finish
 		if node, ok := taintedNodes[exist.NodeID]; ok {
@@ -154,14 +163,38 @@ func diffSystemAllocsForNode(job *structs.Job, nodeID string,
 
 	// Scan the required groups
 	for name, tg := range required {
+
 		// Check for an existing allocation
-		_, ok := existing[name]
+		if _, ok := existing[name]; !ok {
+
+			// Check for a terminal sysbatch allocation, which should not be
+			// placed again unless the job has been updated.
+			if job.Type == structs.JobTypeSysBatch {
+				if alloc, termExists := terminal.Get(nodeID, name); termExists {
+					// the alloc is terminal, but now the job has been updated
+					if job.JobModifyIndex != alloc.Job.JobModifyIndex {
+						result.update = append(result.update, allocTuple{
+							Name:      name,
+							TaskGroup: tg,
+							Alloc:     alloc,
+						})
+					} else {
+						// alloc is terminal and job unchanged, leave it alone
+						result.ignore = append(result.ignore, allocTuple{
+							Name:      name,
+							TaskGroup: tg,
+							Alloc:     alloc,
+						})
+					}
+					continue
+				}
+			}
+
+			// Require a placement if no existing allocation. If there
+			// is an existing allocation, we would have checked for a potential
+			// update or ignore above. Ignore placements for tainted or
+			// ineligible nodes
 
-		// Require a placement if no existing allocation. If there
-		// is an existing allocation, we would have checked for a potential
-		// update or ignore above. Ignore placements for tainted or
-		// ineligible nodes
-		if !ok {
 			// Tainted and ineligible nodes for a non existing alloc
 			// should be filtered out and not count towards ignore or place
 			if _, tainted := taintedNodes[nodeID]; tainted {
@@ -171,10 +204,11 @@ func diffSystemAllocsForNode(job *structs.Job, nodeID string,
 				continue
 			}
 
+			termOnNode, _ := terminal.Get(nodeID, name)
 			allocTuple := allocTuple{
 				Name:      name,
 				TaskGroup: tg,
-				Alloc:     terminalAllocs[name],
+				Alloc:     termOnNode,
 			}
 
 			// If the new allocation isn't annotated with a previous allocation
@@ -183,6 +217,7 @@ func diffSystemAllocsForNode(job *structs.Job, nodeID string,
 			if allocTuple.Alloc == nil || allocTuple.Alloc.NodeID != nodeID {
 				allocTuple.Alloc = &structs.Allocation{NodeID: nodeID}
 			}
+
 			result.place = append(result.place, allocTuple)
 		}
 	}
@@ -191,15 +226,13 @@ func diffSystemAllocsForNode(job *structs.Job, nodeID string,
 
 // diffSystemAllocs is like diffSystemAllocsForNode however, the allocations in the
 // diffResult contain the specific nodeID they should be allocated on.
-//
-// job is the job whose allocs is going to be diff-ed.
-// nodes is a list of nodes in ready state.
-// taintedNodes is an index of the nodes which are either down or in drain mode
-// by name.
-// allocs is a list of non terminal allocations.
-// terminalAllocs is an index of the latest terminal allocations by name.
-func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]*structs.Node,
-	allocs []*structs.Allocation, terminalAllocs map[string]*structs.Allocation) *diffResult {
+func diffSystemAllocs(
+	job *structs.Job, // job whose allocations are going to be diff-ed
+	nodes []*structs.Node, // list of nodes in the ready state
+	taintedNodes map[string]*structs.Node, // nodes which are down or in drain mode (by name)
+	allocs []*structs.Allocation, // non-terminal allocations
+	terminal structs.TerminalByNodeByName, // latest terminal allocations (by node, name)
+) *diffResult {
 
 	// Build a mapping of nodes to all their allocs.
 	nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
@@ -219,9 +252,9 @@ func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[
 	// Create the required task groups.
 	required := materializeTaskGroups(job)
 
-	result := &diffResult{}
+	result := new(diffResult)
 	for nodeID, allocs := range nodeAllocs {
-		diff := diffSystemAllocsForNode(job, nodeID, eligibleNodes, taintedNodes, required, allocs, terminalAllocs)
+		diff := diffSystemAllocsForNode(job, nodeID, eligibleNodes, taintedNodes, required, allocs, terminal)
 		result.Append(diff)
 	}
 
diff --git a/scheduler/util_test.go b/scheduler/util_test.go
index 5c783d7e59db..865b343fdbd2 100644
--- a/scheduler/util_test.go
+++ b/scheduler/util_test.go
@@ -27,6 +27,76 @@ func TestMaterializeTaskGroups(t *testing.T) {
 	}
 }
 
+func newNode(name string) *structs.Node {
+	n := mock.Node()
+	n.Name = name
+	return n
+}
+
+func TestDiffSystemAllocsForNode_Sysbatch_terminal(t *testing.T) {
+	// For a sysbatch job, the scheduler should not re-place an allocation
+	// that has become terminal, unless the job has been updated.
+ + job := mock.SystemBatchJob() + required := materializeTaskGroups(job) + + eligible := map[string]*structs.Node{ + "node1": newNode("node1"), + } + + var live []*structs.Allocation // empty + + tainted := map[string]*structs.Node(nil) + + t.Run("current job", func(t *testing.T) { + terminal := structs.TerminalByNodeByName{ + "node1": map[string]*structs.Allocation{ + "my-sysbatch.pinger[0]": &structs.Allocation{ + ID: uuid.Generate(), + NodeID: "node1", + Name: "my-sysbatch.pinger[0]", + Job: job, + ClientStatus: structs.AllocClientStatusComplete, + }, + }, + } + + diff := diffSystemAllocsForNode(job, "node1", eligible, tainted, required, live, terminal) + require.Empty(t, diff.place) + require.Empty(t, diff.update) + require.Empty(t, diff.stop) + require.Empty(t, diff.migrate) + require.Empty(t, diff.lost) + require.True(t, len(diff.ignore) == 1 && diff.ignore[0].Alloc == terminal["node1"]["my-sysbatch.pinger[0]"]) + }) + + t.Run("outdated job", func(t *testing.T) { + previousJob := job.Copy() + previousJob.JobModifyIndex -= 1 + terminal := structs.TerminalByNodeByName{ + "node1": map[string]*structs.Allocation{ + "my-sysbatch.pinger[0]": &structs.Allocation{ + ID: uuid.Generate(), + NodeID: "node1", + Name: "my-sysbatch.pinger[0]", + Job: previousJob, + }, + }, + } + + expAlloc := terminal["node1"]["my-sysbatch.pinger[0]"] + expAlloc.NodeID = "node1" + + diff := diffSystemAllocsForNode(job, "node1", eligible, tainted, required, live, terminal) + require.Empty(t, diff.place) + require.Equal(t, 1, len(diff.update)) + require.Empty(t, diff.stop) + require.Empty(t, diff.migrate) + require.Empty(t, diff.lost) + require.Empty(t, diff.ignore) + }) +} + func TestDiffSystemAllocsForNode(t *testing.T) { job := mock.Job() required := materializeTaskGroups(job) @@ -99,28 +169,30 @@ func TestDiffSystemAllocsForNode(t *testing.T) { } // Have three terminal allocs - terminalAllocs := map[string]*structs.Allocation{ - "my-job.web[4]": { - ID: uuid.Generate(), - NodeID: "zip", - Name: "my-job.web[4]", - Job: job, - }, - "my-job.web[5]": { - ID: uuid.Generate(), - NodeID: "zip", - Name: "my-job.web[5]", - Job: job, - }, - "my-job.web[6]": { - ID: uuid.Generate(), - NodeID: "zip", - Name: "my-job.web[6]", - Job: job, + terminal := structs.TerminalByNodeByName{ + "zip": map[string]*structs.Allocation{ + "my-job.web[4]": { + ID: uuid.Generate(), + NodeID: "zip", + Name: "my-job.web[4]", + Job: job, + }, + "my-job.web[5]": { + ID: uuid.Generate(), + NodeID: "zip", + Name: "my-job.web[5]", + Job: job, + }, + "my-job.web[6]": { + ID: uuid.Generate(), + NodeID: "zip", + Name: "my-job.web[6]", + Job: job, + }, }, } - diff := diffSystemAllocsForNode(job, "zip", eligible, tainted, required, allocs, terminalAllocs) + diff := diffSystemAllocsForNode(job, "zip", eligible, tainted, required, allocs, terminal) place := diff.place update := diff.update migrate := diff.migrate @@ -147,12 +219,14 @@ func TestDiffSystemAllocsForNode(t *testing.T) { require.Equal(t, 6, len(place)) // Ensure that the allocations which are replacements of terminal allocs are - // annotated - for name, alloc := range terminalAllocs { - for _, allocTuple := range diff.place { - if name == allocTuple.Name { - require.True(t, reflect.DeepEqual(alloc, allocTuple.Alloc), - "expected: %#v, actual: %#v", alloc, allocTuple.Alloc) + // annotated. 
+	for _, m := range terminal {
+		for _, alloc := range m {
+			for _, tuple := range diff.place {
+				if alloc.Name == tuple.Name {
+					require.True(t, reflect.DeepEqual(alloc, tuple.Alloc),
+						"expected: %#v, actual: %#v", alloc, tuple.Alloc)
+				}
 			}
 		}
 	}
@@ -199,9 +273,9 @@ func TestDiffSystemAllocsForNode_ExistingAllocIneligibleNode(t *testing.T) {
 	}
 
 	// No terminal allocs
-	terminalAllocs := map[string]*structs.Allocation{}
+	terminal := make(structs.TerminalByNodeByName)
 
-	diff := diffSystemAllocsForNode(job, eligibleNode.ID, eligible, tainted, required, allocs, terminalAllocs)
+	diff := diffSystemAllocsForNode(job, eligibleNode.ID, eligible, tainted, required, allocs, terminal)
 	place := diff.place
 	update := diff.update
 	migrate := diff.migrate
@@ -276,17 +350,19 @@ func TestDiffSystemAllocs(t *testing.T) {
 		},
 	}
 
-	// Have three terminal allocs
-	terminalAllocs := map[string]*structs.Allocation{
-		"my-job.web[0]": {
-			ID:     uuid.Generate(),
-			NodeID: "pipe",
-			Name:   "my-job.web[0]",
-			Job:    job,
+	// Have one terminal alloc
+	terminal := structs.TerminalByNodeByName{
+		"pipe": map[string]*structs.Allocation{
+			"my-job.web[0]": {
+				ID:     uuid.Generate(),
+				NodeID: "pipe",
+				Name:   "my-job.web[0]",
+				Job:    job,
+			},
 		},
 	}
 
-	diff := diffSystemAllocs(job, nodes, tainted, allocs, terminalAllocs)
+	diff := diffSystemAllocs(job, nodes, tainted, allocs, terminal)
 	place := diff.place
 	update := diff.update
 	migrate := diff.migrate
@@ -313,12 +389,14 @@ func TestDiffSystemAllocs(t *testing.T) {
 	require.Equal(t, 2, len(place))
 
 	// Ensure that the allocations which are replacements of terminal allocs are
-	// annotated
-	for _, alloc := range terminalAllocs {
-		for _, allocTuple := range diff.place {
-			if alloc.NodeID == allocTuple.Alloc.NodeID {
-				require.True(t, reflect.DeepEqual(alloc, allocTuple.Alloc),
-					"expected: %#v, actual: %#v", alloc, allocTuple.Alloc)
+	// annotated.
+	for _, m := range terminal {
+		for _, alloc := range m {
+			for _, tuple := range diff.place {
+				if alloc.NodeID == tuple.Alloc.NodeID {
+					require.True(t, reflect.DeepEqual(alloc, tuple.Alloc),
+						"expected: %#v, actual: %#v", alloc, tuple.Alloc)
+				}
 			}
 		}
 	}
diff --git a/vendor/github.com/hashicorp/nomad/api/operator.go b/vendor/github.com/hashicorp/nomad/api/operator.go
index d5bc5d061d56..de57bffef4b2 100644
--- a/vendor/github.com/hashicorp/nomad/api/operator.go
+++ b/vendor/github.com/hashicorp/nomad/api/operator.go
@@ -159,9 +159,10 @@ const (
 
 // PreemptionConfig specifies whether preemption is enabled based on scheduler type
 type PreemptionConfig struct {
-	SystemSchedulerEnabled  bool
-	BatchSchedulerEnabled   bool
-	ServiceSchedulerEnabled bool
+	SystemSchedulerEnabled   bool
+	SysBatchSchedulerEnabled bool
+	BatchSchedulerEnabled    bool
+	ServiceSchedulerEnabled  bool
 }
 
 // SchedulerGetConfiguration is used to query the current Scheduler configuration.
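For context on how the new `SysBatchSchedulerEnabled` field is exercised from client code, here is a minimal sketch (not part of this diff) that flips the option through the Go API's operator client. It assumes a reachable agent via the usual `NOMAD_ADDR` environment handling and the `Operator().SchedulerGetConfiguration`/`SchedulerSetConfiguration` methods as they exist in the vendored `api` package above; treat the exact signatures as an assumption against your vendored version.

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// DefaultConfig honors NOMAD_ADDR and related environment variables.
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Read the scheduler configuration currently stored by the servers.
	resp, _, err := client.Operator().SchedulerGetConfiguration(nil)
	if err != nil {
		log.Fatal(err)
	}
	config := resp.SchedulerConfig

	// Opt sysbatch jobs into preemption; the other scheduler types keep
	// whatever settings the cluster already has.
	config.PreemptionConfig.SysBatchSchedulerEnabled = true

	if _, _, err := client.Operator().SchedulerSetConfiguration(config, nil); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("preemption config: %+v\n", config.PreemptionConfig)
}
```

The same update can be made over HTTP with a `PUT` to `/v1/operator/scheduler/configuration`, as the endpoint tests earlier in this diff demonstrate.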
diff --git a/website/pages/docs/configuration/server.mdx b/website/pages/docs/configuration/server.mdx index 619911b7351f..9bec8ddfbeee 100644 --- a/website/pages/docs/configuration/server.mdx +++ b/website/pages/docs/configuration/server.mdx @@ -291,9 +291,10 @@ server { scheduler_algorithm = "spread" preemption_config { - batch_scheduler_enabled = true - system_scheduler_enabled = true - service_scheduler_enabled = true + batch_scheduler_enabled = true + system_scheduler_enabled = true + service_scheduler_enabled = true + sysbatch_scheduler_enabled = true } } } diff --git a/website/pages/docs/job-specification/job.mdx b/website/pages/docs/job-specification/job.mdx index c12b83320dbc..b73a6f2f8dfe 100644 --- a/website/pages/docs/job-specification/job.mdx +++ b/website/pages/docs/job-specification/job.mdx @@ -114,7 +114,7 @@ job "docs" { node if any of its allocation statuses become "failed". - `type` `(string: "service")` - Specifies the [Nomad scheduler][scheduler] to - use. Nomad provides the `service`, `system` and `batch` schedulers. + use. Nomad provides the `service`, `system`, `batch`, and `sysbatch` schedulers. - `update` ([Update][update]: nil) - Specifies the task's update strategy. When omitted, rolling updates are disabled. diff --git a/website/pages/docs/job-specification/reschedule.mdx b/website/pages/docs/job-specification/reschedule.mdx index 9234ca725eb3..96d340f473ea 100644 --- a/website/pages/docs/job-specification/reschedule.mdx +++ b/website/pages/docs/job-specification/reschedule.mdx @@ -47,8 +47,8 @@ job "docs" { } ``` -~> The reschedule stanza does not apply to `system` jobs because they run on -every node. +~> The reschedule stanza does not apply to `system` or `sysbatch` jobs because +they run on every node. ## `reschedule` Parameters diff --git a/website/pages/docs/job-specification/restart.mdx b/website/pages/docs/job-specification/restart.mdx index 6e9e771db7e6..84b53ce9fa66 100644 --- a/website/pages/docs/job-specification/restart.mdx +++ b/website/pages/docs/job-specification/restart.mdx @@ -14,7 +14,7 @@ description: The "restart" stanza configures a group's behavior on task failure. ]} /> -The `restart` stanza configures a tasks's behavior on task failure. Restarts +The `restart` stanza configures a task's behavior on task failure. Restarts happen on the client that is running the task. ```hcl @@ -36,9 +36,9 @@ For example, assuming that the task group restart policy is: ```hcl restart { - interval = "30m" attempts = 2 delay = "15s" + interval = "30m" mode = "fail" } ``` @@ -55,9 +55,9 @@ then the effective restart policy for the task will be: ```hcl restart { - interval = "30m" attempts = 5 delay = "15s" + interval = "30m" mode = "fail" } ``` @@ -87,7 +87,7 @@ restart { The values for many of the `restart` parameters vary by job type. Here are the defaults by job type: -- The default batch restart policy is: +- The default restart policy for `batch` jobs is: ```hcl restart { @@ -98,13 +98,13 @@ defaults by job type: } ``` -- The default service and system job restart policy is: +- The default restart policy for `service`, `system`, and `sysbatch` jobs is: ```hcl restart { - interval = "30m" attempts = 2 delay = "15s" + interval = "30m" mode = "fail" } ``` diff --git a/website/pages/docs/schedulers.mdx b/website/pages/docs/schedulers.mdx index 304f6d60c241..120530e2f198 100644 --- a/website/pages/docs/schedulers.mdx +++ b/website/pages/docs/schedulers.mdx @@ -7,9 +7,9 @@ description: Learn about Nomad's various schedulers. 
 # Schedulers
 
-Nomad has three scheduler types that can be used when creating your job:
-`service`, `batch` and `system`. Here we will describe the differences between
-each of these schedulers.
+Nomad has four scheduler types that can be used when creating your job:
+`service`, `batch`, `system`, and `sysbatch`. Here we will describe the
+differences between each of these schedulers.
 
 ## Service
 
@@ -61,8 +61,30 @@ Systems jobs are intended to run until explicitly stopped either by an operator
 or [preemption]. If a system task exits it is considered a failure and handled
 according to the job's [restart] stanza; system jobs do not have rescheduling.
 
+## System Batch
+
+The `sysbatch` scheduler is used to register jobs that should be run to
+completion on all clients that meet the job's constraints. The `sysbatch`
+scheduler will schedule jobs similarly to the `system` scheduler, but, like a
+`batch` job, once a task exits successfully it is not restarted on that client.
+
+This scheduler type is useful for issuing "one-off" commands to be run on every
+node in the cluster. Sysbatch jobs can also be created as [periodic] and
+[parameterized] jobs. Since these tasks are managed by Nomad, they can take
+advantage of job updating, service discovery, monitoring, and more.
+
+The `sysbatch` scheduler will preempt lower-priority tasks running on a node if
+there is not enough capacity to place the job. See [preemption] for details on
+how the tasks to preempt are chosen.
+
+Sysbatch jobs are intended to run until they complete successfully, are
+explicitly stopped by an operator, or are evicted through [preemption]. Sysbatch
+tasks that exit with an error are handled according to the job's [restart]
+stanza.
+
 [borg]: https://research.google.com/pubs/pub43438.html
-[sparrow]: https://cs.stanford.edu/~matei/papers/2013/sosp_sparrow.pdf
+[parameterized]: /docs/job-specification/parameterized
+[periodic]: /docs/job-specification/periodic
 [preemption]: /docs/internals/scheduling/preemption
 [restart]: /docs/job-specification/restart
 [reschedule]: /docs/job-specification/reschedule
+[sparrow]: https://cs.stanford.edu/~matei/papers/2013/sosp_sparrow.pdf
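To make the new scheduler type concrete, here is a hedged sketch (not part of this diff) of registering a one-off `sysbatch` job through the Go API. The job ID, task group name, datacenter, command, and use of the `raw_exec` driver are all illustrative assumptions; `raw_exec` must be enabled on the clients for this particular example to run.

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Small helper for the api package's pointer-typed job fields.
	str := func(s string) *string { return &s }

	// A one-off command intended to run to completion on every eligible node.
	job := &api.Job{
		ID:          str("cluster-uname"), // hypothetical job name
		Name:        str("cluster-uname"),
		Type:        str("sysbatch"), // the scheduler type added by this change
		Datacenters: []string{"dc1"},
		TaskGroups: []*api.TaskGroup{{
			Name: str("sysbatch-group"),
			Tasks: []*api.Task{{
				Name:   "uname",
				Driver: "raw_exec",
				Config: map[string]interface{}{
					"command": "uname",
					"args":    []string{"-a"},
				},
			}},
		}},
	}

	resp, _, err := client.Jobs().Register(job, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("evaluation:", resp.EvalID)
}
```

Because the job type is `sysbatch`, each allocation that exits successfully is left terminal on its node, per the `diffSystemAllocsForNode` changes above, rather than being restarted the way a `system` task would be.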