diff --git a/CHANGELOG.md b/CHANGELOG.md
index e19855fb26bf..218d053bcca2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,10 @@ FEATURES:
 
 * **Task dependencies UI**: task lifecycle charts and details
 
+IMPROVEMENTS:
+
+ * core: Allow spreading allocations as an alternative to binpacking [[GH-7810](https://github.com/hashicorp/nomad/issues/7810)]
+
 BUG FIXES:
 
 * api: autoscaling policies should not be returned for stopped jobs [[GH-7768](https://github.com/hashicorp/nomad/issues/7768)]
diff --git a/api/operator.go b/api/operator.go
index d215326f4407..5d4254442198 100644
--- a/api/operator.go
+++ b/api/operator.go
@@ -112,7 +112,11 @@ func (op *Operator) RaftRemovePeerByID(id string, q *WriteOptions) error {
 	return nil
 }
 
+// SchedulerConfiguration is the config for controlling scheduler behavior
 type SchedulerConfiguration struct {
+	// SchedulerAlgorithm lets you select between available scheduling algorithms.
+	SchedulerAlgorithm SchedulerAlgorithm
+
 	// PreemptionConfig specifies whether to enable eviction of lower
 	// priority jobs to place higher priority jobs.
 	PreemptionConfig PreemptionConfig
@@ -140,6 +144,16 @@ type SchedulerSetConfigurationResponse struct {
 	WriteMeta
 }
 
+// SchedulerAlgorithm is an enum string that encapsulates the valid options for a
+// SchedulerConfiguration stanza's SchedulerAlgorithm. These modes allow the
+// scheduling algorithm to be user-selectable.
+type SchedulerAlgorithm string
+
+const (
+	SchedulerAlgorithmBinpack SchedulerAlgorithm = "binpack"
+	SchedulerAlgorithmSpread  SchedulerAlgorithm = "spread"
+)
+
 // PreemptionConfig specifies whether preemption is enabled based on scheduler type
 type PreemptionConfig struct {
 	SystemSchedulerEnabled bool
diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go
index c5d5e232f81e..da278533b5e9 100644
--- a/command/agent/agent_test.go
+++ b/command/agent/agent_test.go
@@ -184,6 +184,7 @@ func TestAgent_ServerConfig_SchedulerFlags(t *testing.T) {
 			"default case",
 			nil,
 			structs.SchedulerConfiguration{
+				SchedulerAlgorithm: "binpack",
 				PreemptionConfig: structs.PreemptionConfig{
 					SystemSchedulerEnabled: true,
 				},
diff --git a/command/agent/command.go b/command/agent/command.go
index bb1a2cda2a08..7546738e0a94 100644
--- a/command/agent/command.go
+++ b/command/agent/command.go
@@ -377,6 +377,11 @@ func (c *Command) isValidConfig(config, cmdConfig *Config) bool {
 		c.Ui.Error("WARNING: Bootstrap mode enabled! Potentially unsafe operation.")
 	}
 
+	if err := config.Server.DefaultSchedulerConfig.Validate(); err != nil {
+		c.Ui.Error(err.Error())
+		return false
+	}
+
 	return true
 }
 
diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go
index cd1284774d7f..157413ec0335 100644
--- a/command/agent/config_parse_test.go
+++ b/command/agent/config_parse_test.go
@@ -126,6 +126,7 @@ var basicConfig = &Config{
 		RetryMaxAttempts: 3,
 	},
 	DefaultSchedulerConfig: &structs.SchedulerConfiguration{
+		SchedulerAlgorithm: "spread",
 		PreemptionConfig: structs.PreemptionConfig{
 			SystemSchedulerEnabled: true,
 			BatchSchedulerEnabled:  true,
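Note: with the `SchedulerAlgorithm` field and constants exposed through the `api` package above, the algorithm can also be flipped cluster-wide at runtime rather than only via the server config file; as the endpoint change further below shows, invalid values are rejected with a 400. A minimal sketch of that workflow — assuming the `api` package's pre-existing `SchedulerGetConfiguration`/`SchedulerSetConfiguration` helpers and an agent reachable at the default address:

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Read the current scheduler configuration.
	resp, _, err := client.Operator().SchedulerGetConfiguration(nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("current algorithm:", resp.SchedulerConfig.SchedulerAlgorithm)

	// Switch the cluster to spread-based scoring; new placements will
	// favor emptier nodes from this point on.
	resp.SchedulerConfig.SchedulerAlgorithm = api.SchedulerAlgorithmSpread
	if _, _, err := client.Operator().SchedulerSetConfiguration(resp.SchedulerConfig, nil); err != nil {
		log.Fatal(err)
	}
}
```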
diff --git a/command/agent/operator_endpoint.go b/command/agent/operator_endpoint.go
index 7ac48c0fdf6c..64d0ea25874b 100644
--- a/command/agent/operator_endpoint.go
+++ b/command/agent/operator_endpoint.go
@@ -14,6 +14,8 @@ import (
 	"github.com/hashicorp/raft"
 )
 
+// OperatorRequest is used to route operator/raft API requests to the implementing
+// functions.
 func (s *HTTPServer) OperatorRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
 	path := strings.TrimPrefix(req.URL.Path, "/v1/operator/raft/")
 	switch {
@@ -252,12 +254,17 @@ func (s *HTTPServer) schedulerUpdateConfig(resp http.ResponseWriter, req *http.R
 	}
 
 	args.Config = structs.SchedulerConfiguration{
+		SchedulerAlgorithm: structs.SchedulerAlgorithm(conf.SchedulerAlgorithm),
 		PreemptionConfig: structs.PreemptionConfig{
 			SystemSchedulerEnabled:  conf.PreemptionConfig.SystemSchedulerEnabled,
 			BatchSchedulerEnabled:   conf.PreemptionConfig.BatchSchedulerEnabled,
 			ServiceSchedulerEnabled: conf.PreemptionConfig.ServiceSchedulerEnabled},
 	}
 
+	if err := args.Config.Validate(); err != nil {
+		return nil, CodedError(http.StatusBadRequest, err.Error())
+	}
+
 	// Check for cas value
 	params := req.URL.Query()
 	if _, ok := params["cas"]; ok {
diff --git a/command/agent/testdata/basic.hcl b/command/agent/testdata/basic.hcl
index a4edcfef6559..78d8d63eff8a 100644
--- a/command/agent/testdata/basic.hcl
+++ b/command/agent/testdata/basic.hcl
@@ -135,6 +135,8 @@ server {
   }
 
   default_scheduler_config {
+    scheduler_algorithm = "spread"
+
     preemption_config {
       batch_scheduler_enabled  = true
       system_scheduler_enabled = true
diff --git a/command/agent/testdata/basic.json b/command/agent/testdata/basic.json
index 0052b68622d2..b02e0db8868a 100644
--- a/command/agent/testdata/basic.json
+++ b/command/agent/testdata/basic.json
@@ -297,6 +297,7 @@
       "2.2.2.2"
     ],
     "default_scheduler_config": [{
+      "scheduler_algorithm": "spread",
       "preemption_config": [{
         "batch_scheduler_enabled": true,
         "system_scheduler_enabled": true,
diff --git a/nomad/config.go b/nomad/config.go
index 8a0fb7a181b7..8b4fc1c2568a 100644
--- a/nomad/config.go
+++ b/nomad/config.go
@@ -403,6 +403,7 @@ func DefaultConfig() *Config {
 		ServerHealthInterval: 2 * time.Second,
 		AutopilotInterval:    10 * time.Second,
 		DefaultSchedulerConfig: structs.SchedulerConfiguration{
+			SchedulerAlgorithm: structs.SchedulerAlgorithmBinpack,
 			PreemptionConfig: structs.PreemptionConfig{
 				SystemSchedulerEnabled: true,
 				BatchSchedulerEnabled:  false,
diff --git a/nomad/structs/funcs.go b/nomad/structs/funcs.go
index cee0dd4067bc..855ab7ff0e10 100644
--- a/nomad/structs/funcs.go
+++ b/nomad/structs/funcs.go
@@ -148,10 +148,7 @@ func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevi
 	return true, "", used, nil
 }
 
-// ScoreFit is used to score the fit based on the Google work published here:
-// http://www.columbia.edu/~cs2035/courses/ieor4405.S13/datacenter_scheduling.ppt
-// This is equivalent to their BestFit v3
-func ScoreFit(node *Node, util *ComparableResources) float64 {
+func computeFreePercentage(node *Node, util *ComparableResources) (freePctCpu, freePctRam float64) {
 	// COMPAT(0.11): Remove in 0.11
 	reserved := node.ComparableReservedResources()
 	res := node.ComparableResources()
@@ -165,8 +162,18 @@ func ScoreFit(node *Node, util *ComparableResources) float64 {
 	}
 
 	// Compute the free percentage
-	freePctCpu := 1 - (float64(util.Flattened.Cpu.CpuShares) / nodeCpu)
-	freePctRam := 1 - (float64(util.Flattened.Memory.MemoryMB) / nodeMem)
+	freePctCpu = 1 - (float64(util.Flattened.Cpu.CpuShares) / nodeCpu)
+	freePctRam = 1 - (float64(util.Flattened.Memory.MemoryMB) / nodeMem)
+	return freePctCpu, freePctRam
+}
+
+// ScoreFitBinPack computes a fit score to achieve binpacking behavior.
+// Score is in [0, 18]
+//
+// It's equivalent to the BestFit v3 algorithm from the Google work published here:
+// http://www.columbia.edu/~cs2035/courses/ieor4405.S13/datacenter_scheduling.ppt
+func ScoreFitBinPack(node *Node, util *ComparableResources) float64 {
+	freePctCpu, freePctRam := computeFreePercentage(node, util)
 
 	// Total will be "maximized" the smaller the value is.
 	// At 100% utilization, the total is 2, while at 0% util it is 20.
@@ -187,6 +194,24 @@ func ScoreFit(node *Node, util *ComparableResources) float64 {
 	return score
 }
 
+// ScoreFitSpread computes a fit score to achieve spread behavior.
+// Score is in [0, 18]
+//
+// This is equivalent to the Worst Fit algorithm from
+// http://www.columbia.edu/~cs2035/courses/ieor4405.S13/datacenter_scheduling.ppt
+func ScoreFitSpread(node *Node, util *ComparableResources) float64 {
+	freePctCpu, freePctRam := computeFreePercentage(node, util)
+	total := math.Pow(10, freePctCpu) + math.Pow(10, freePctRam)
+	score := total - 2
+
+	if score > 18.0 {
+		score = 18.0
+	} else if score < 0 {
+		score = 0
+	}
+	return score
+}
+
 func CopySliceConstraints(s []*Constraint) []*Constraint {
 	l := len(s)
 	if l == 0 {
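Both scoring functions are built from the same base term `10^freePctCpu + 10^freePctRam`, so for any valid utilization the binpack and spread scores sum to 18 and exactly invert each other's ranking. A standalone sketch (not part of the patch) that reproduces the arithmetic behind the updated test fixtures below, where the mid-case fixture leaves 50% of both CPU and memory free:

```go
package main

import (
	"fmt"
	"math"
)

// scoreBinPack mirrors ScoreFitBinPack: high scores for nearly-full nodes.
// The real function also clamps the result to [0, 18].
func scoreBinPack(freePctCpu, freePctRam float64) float64 {
	return 20 - (math.Pow(10, freePctCpu) + math.Pow(10, freePctRam))
}

// scoreSpread mirrors ScoreFitSpread: high scores for nearly-empty nodes.
func scoreSpread(freePctCpu, freePctRam float64) float64 {
	return (math.Pow(10, freePctCpu) + math.Pow(10, freePctRam)) - 2
}

func main() {
	// Mid-case fixture: half of the node's effective CPU and memory is
	// used, so both free fractions are 0.5.
	fmt.Println(scoreBinPack(0.5, 0.5)) // ≈ 13.675
	fmt.Println(scoreSpread(0.5, 0.5))  // ≈ 4.325; the two always sum to 18
}
```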
diff --git a/nomad/structs/funcs_test.go b/nomad/structs/funcs_test.go
index 4302164a61fc..725909abeccf 100644
--- a/nomad/structs/funcs_test.go
+++ b/nomad/structs/funcs_test.go
@@ -506,7 +506,7 @@ func TestAllocsFit_Devices(t *testing.T) {
 }
 
 // COMPAT(0.11): Remove in 0.11
-func TestScoreFit_Old(t *testing.T) {
+func TestScoreFitBinPack_Old(t *testing.T) {
 	node := &Node{}
 	node.Resources = &Resources{
 		CPU:      4096,
@@ -528,7 +528,7 @@ func TestScoreFit_Old(t *testing.T) {
 			},
 		},
 	}
-	score := ScoreFit(node, util)
+	score := ScoreFitBinPack(node, util)
 	if score != 18.0 {
 		t.Fatalf("bad: %v", score)
 	}
@@ -544,7 +544,7 @@ func TestScoreFit_Old(t *testing.T) {
 			},
 		},
 	}
-	score = ScoreFit(node, util)
+	score = ScoreFitBinPack(node, util)
 	if score != 0.0 {
 		t.Fatalf("bad: %v", score)
 	}
@@ -560,13 +560,13 @@ func TestScoreFit_Old(t *testing.T) {
 			},
 		},
 	}
-	score = ScoreFit(node, util)
+	score = ScoreFitBinPack(node, util)
 	if score < 10.0 || score > 16.0 {
 		t.Fatalf("bad: %v", score)
 	}
 }
 
-func TestScoreFit(t *testing.T) {
+func TestScoreFitBinPack(t *testing.T) {
 	node := &Node{}
 	node.NodeResources = &NodeResources{
 		Cpu: NodeCpuResources{
@@ -585,52 +585,53 @@ func TestScoreFit(t *testing.T) {
 		},
 	}
 
-	// Test a perfect fit
-	util := &ComparableResources{
-		Flattened: AllocatedTaskResources{
-			Cpu: AllocatedCpuResources{
-				CpuShares: 2048,
-			},
-			Memory: AllocatedMemoryResources{
-				MemoryMB: 4096,
+	cases := []struct {
+		name         string
+		flattened    AllocatedTaskResources
+		binPackScore float64
+		spreadScore  float64
+	}{
+		{
+			name: "almost filled node, but with just enough room",
+			flattened: AllocatedTaskResources{
+				Cpu:    AllocatedCpuResources{CpuShares: 2048},
+				Memory: AllocatedMemoryResources{MemoryMB: 4096},
 			},
+			binPackScore: 18,
+			spreadScore:  0,
 		},
-	}
-	score := ScoreFit(node, util)
-	if score != 18.0 {
-		t.Fatalf("bad: %v", score)
-	}
-
-	// Test the worst fit
-	util = &ComparableResources{
-		Flattened: AllocatedTaskResources{
-			Cpu: AllocatedCpuResources{
-				CpuShares: 0,
-			},
-			Memory: AllocatedMemoryResources{
-				MemoryMB: 0,
+		{
+			name: "unutilized node",
+			flattened: AllocatedTaskResources{
+				Cpu:    AllocatedCpuResources{CpuShares: 0},
+				Memory: AllocatedMemoryResources{MemoryMB: 0},
 			},
+			binPackScore: 0,
+			spreadScore:  18,
 		},
-	}
-	score = ScoreFit(node, util)
-	if score != 0.0 {
-		t.Fatalf("bad: %v", score)
-	}
-
-	// Test a mid-case scenario
-	util = &ComparableResources{
-		Flattened: AllocatedTaskResources{
-			Cpu: AllocatedCpuResources{
-				CpuShares: 1024,
-			},
-			Memory: AllocatedMemoryResources{
-				MemoryMB: 2048,
+		{
+			name: "mid-case scenario",
+			flattened: AllocatedTaskResources{
+				Cpu:    AllocatedCpuResources{CpuShares: 1024},
+				Memory: AllocatedMemoryResources{MemoryMB: 2048},
 			},
+			binPackScore: 13.675,
+			spreadScore:  4.325,
 		},
 	}
-	score = ScoreFit(node, util)
-	if score < 10.0 || score > 16.0 {
-		t.Fatalf("bad: %v", score)
+
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			util := &ComparableResources{Flattened: c.flattened}
+
+			binPackScore := ScoreFitBinPack(node, util)
+			require.InDelta(t, c.binPackScore, binPackScore, 0.001, "binpack score")
+
+			spreadScore := ScoreFitSpread(node, util)
+			require.InDelta(t, c.spreadScore, spreadScore, 0.001, "spread score")
+
+			require.InDelta(t, 18, binPackScore+spreadScore, 0.001, "score sum")
+		})
 	}
 }
diff --git a/nomad/structs/operator.go b/nomad/structs/operator.go
index 03d8caedb352..38d181176000 100644
--- a/nomad/structs/operator.go
+++ b/nomad/structs/operator.go
@@ -1,6 +1,7 @@
 package structs
 
 import (
+	"fmt"
 	"time"
 
 	"github.com/hashicorp/raft"
@@ -124,8 +125,26 @@ type AutopilotConfig struct {
 	ModifyIndex uint64
 }
 
+// SchedulerAlgorithm is an enum string that encapsulates the valid options for a
+// SchedulerConfiguration stanza's SchedulerAlgorithm. These modes allow the
+// scheduling algorithm to be user-selectable.
+type SchedulerAlgorithm string
+
+const (
+	// SchedulerAlgorithmBinpack indicates that the scheduler should pack
+	// allocations onto as few nodes as possible, maximizing node utilization.
+	SchedulerAlgorithmBinpack SchedulerAlgorithm = "binpack"
+
+	// SchedulerAlgorithmSpread indicates that the scheduler should spread
+	// allocations as evenly as possible over the available hardware.
+	SchedulerAlgorithmSpread SchedulerAlgorithm = "spread"
+)
+
 // SchedulerConfiguration is the config for controlling scheduler behavior
 type SchedulerConfiguration struct {
+	// SchedulerAlgorithm lets you select between available scheduling algorithms.
+	SchedulerAlgorithm SchedulerAlgorithm `hcl:"scheduler_algorithm"`
+
 	// PreemptionConfig specifies whether to enable eviction of lower
 	// priority jobs to place higher priority jobs.
 	PreemptionConfig PreemptionConfig `hcl:"preemption_config"`
@@ -135,6 +154,30 @@ type SchedulerConfiguration struct {
 	ModifyIndex uint64
 }
 
+// EffectiveSchedulerAlgorithm returns the scheduler algorithm to use, defaulting to binpack.
+func (s *SchedulerConfiguration) EffectiveSchedulerAlgorithm() SchedulerAlgorithm {
+	if s == nil || s.SchedulerAlgorithm == "" {
+		return SchedulerAlgorithmBinpack
+	}
+
+	return s.SchedulerAlgorithm
+}
+
+// Validate returns an error if the scheduler algorithm is not a recognized value.
+func (s *SchedulerConfiguration) Validate() error {
+	if s == nil {
+		return nil
+	}
+
+	switch s.SchedulerAlgorithm {
+	case "", SchedulerAlgorithmBinpack, SchedulerAlgorithmSpread:
+	default:
+		return fmt.Errorf("invalid scheduler algorithm: %v", s.SchedulerAlgorithm)
+	}
+
+	return nil
+}
+
 // SchedulerConfigurationResponse is the response object that wraps SchedulerConfiguration
 type SchedulerConfigurationResponse struct {
 	// SchedulerConfig contains scheduler config options
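The nil-safe helpers above preserve behavior on upgrade: configurations persisted before this change carry no `SchedulerAlgorithm`, and both a nil config and the empty string resolve to binpack, while unknown values are rejected before they can reach the scheduler. A hypothetical test (not part of this patch) pinning down those semantics:

```go
package structs_test

import (
	"testing"

	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/stretchr/testify/require"
)

func TestSchedulerConfiguration_AlgorithmDefaults(t *testing.T) {
	// A nil config is valid and falls back to binpack.
	var nilConf *structs.SchedulerConfiguration
	require.NoError(t, nilConf.Validate())
	require.Equal(t, structs.SchedulerAlgorithmBinpack, nilConf.EffectiveSchedulerAlgorithm())

	// An unset algorithm validates and defaults to binpack.
	conf := &structs.SchedulerConfiguration{}
	require.NoError(t, conf.Validate())
	require.Equal(t, structs.SchedulerAlgorithmBinpack, conf.EffectiveSchedulerAlgorithm())

	// Unknown values fail validation.
	conf.SchedulerAlgorithm = "best-fit"
	require.Error(t, conf.Validate())
}
```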
diff --git a/scheduler/preemption_test.go b/scheduler/preemption_test.go
index 798c3985cc94..f8a43d7d90f2 100644
--- a/scheduler/preemption_test.go
+++ b/scheduler/preemption_test.go
@@ -1349,7 +1349,7 @@ func TestPreemption(t *testing.T) {
 				ctx.plan.NodePreemptions[node.ID] = tc.currentPreemptions
 			}
 			static := NewStaticRankIterator(ctx, nodes)
-			binPackIter := NewBinPackIterator(ctx, static, true, tc.jobPriority)
+			binPackIter := NewBinPackIterator(ctx, static, true, tc.jobPriority, structs.SchedulerAlgorithmBinpack)
 			job := mock.Job()
 			job.Priority = tc.jobPriority
 			binPackIter.SetJob(job)
diff --git a/scheduler/rank.go b/scheduler/rank.go
index e8633340f7c0..06ef9c452f98 100644
--- a/scheduler/rank.go
+++ b/scheduler/rank.go
@@ -153,17 +153,26 @@ type BinPackIterator struct {
 	priority  int
 	jobId     *structs.NamespacedID
 	taskGroup *structs.TaskGroup
+	scoreFit  func(*structs.Node, *structs.ComparableResources) float64
 }
 
 // NewBinPackIterator returns a BinPackIterator which tries to fit tasks
-// potentially evicting other tasks based on a given priority.
+// potentially evicting other tasks based on a given priority. The algorithm
+// parameter selects whether node fit is scored with binpack or spread semantics.
+func NewBinPackIterator(ctx Context, source RankIterator, evict bool, priority int, algorithm structs.SchedulerAlgorithm) *BinPackIterator {
+	scoreFn := structs.ScoreFitBinPack
+	if algorithm == structs.SchedulerAlgorithmSpread {
+		scoreFn = structs.ScoreFitSpread
+	}
+
 	iter := &BinPackIterator{
 		ctx:      ctx,
 		source:   source,
 		evict:    evict,
 		priority: priority,
+		scoreFit: scoreFn,
 	}
+	iter.ctx.Logger().Named("binpack").Trace("NewBinPackIterator created", "algorithm", algorithm)
 	return iter
 }
@@ -436,7 +445,7 @@ OUTER:
 		}
 
 		// Score the fit normally otherwise
-		fitness := structs.ScoreFit(option.Node, util)
+		fitness := iter.scoreFit(option.Node, util)
 		normalizedFit := fitness / binPackingMaxFitScore
 		option.Scores = append(option.Scores, normalizedFit)
 		iter.ctx.Metrics().ScoreNode(option.Node, "binpack", normalizedFit)
diff --git a/scheduler/rank_test.go b/scheduler/rank_test.go
index aa1ee01a1f02..7454d075a217 100644
--- a/scheduler/rank_test.go
+++ b/scheduler/rank_test.go
@@ -107,7 +107,7 @@ func TestBinPackIterator_NoExistingAlloc(t *testing.T) {
 			},
 		},
 	}
-	binp := NewBinPackIterator(ctx, static, false, 0)
+	binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
 	binp.SetTaskGroup(taskGroup)
 
 	scoreNorm := NewScoreNormalizationIterator(ctx, binp)
@@ -220,7 +220,7 @@ func TestBinPackIterator_NoExistingAlloc_MixedReserve(t *testing.T) {
 			},
 		},
 	}
-	binp := NewBinPackIterator(ctx, static, false, 0)
+	binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
 	binp.SetTaskGroup(taskGroup)
 
 	scoreNorm := NewScoreNormalizationIterator(ctx, binp)
@@ -471,7 +471,7 @@ func TestBinPackIterator_Network_Failure(t *testing.T) {
 		},
 	}
 
-	binp := NewBinPackIterator(ctx, static, false, 0)
+	binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
 	binp.SetTaskGroup(taskGroup)
 	scoreNorm := NewScoreNormalizationIterator(ctx, binp)
 
@@ -569,7 +569,7 @@ func TestBinPackIterator_PlannedAlloc(t *testing.T) {
 		},
 	}
 
-	binp := NewBinPackIterator(ctx, static, false, 0)
+	binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
 	binp.SetTaskGroup(taskGroup)
 
 	scoreNorm := NewScoreNormalizationIterator(ctx, binp)
@@ -685,7 +685,7 @@ func TestBinPackIterator_ExistingAlloc(t *testing.T) {
 			},
 		},
 	}
-	binp := NewBinPackIterator(ctx, static, false, 0)
+	binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
 	binp.SetTaskGroup(taskGroup)
 
 	scoreNorm := NewScoreNormalizationIterator(ctx, binp)
@@ -805,7 +805,7 @@ func TestBinPackIterator_ExistingAlloc_PlannedEvict(t *testing.T) {
 		},
 	}
 
-	binp := NewBinPackIterator(ctx, static, false, 0)
+	binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
 	binp.SetTaskGroup(taskGroup)
 
 	scoreNorm := NewScoreNormalizationIterator(ctx, binp)
@@ -1110,7 +1110,7 @@ func TestBinPackIterator_Devices(t *testing.T) {
 			}
 			static := NewStaticRankIterator(ctx, []*RankedNode{{Node: c.Node}})
 
-			binp := NewBinPackIterator(ctx, static, false, 0)
+			binp := NewBinPackIterator(ctx, static, false, 0, structs.SchedulerAlgorithmBinpack)
 			binp.SetTaskGroup(c.TaskGroup)
 			out := binp.Next()
diff --git a/scheduler/stack.go b/scheduler/stack.go
index 6fd6df12200f..b9ceb19bb31e 100644
--- a/scheduler/stack.go
+++ b/scheduler/stack.go
@@ -237,11 +237,13 @@ func NewSystemStack(ctx Context) *SystemStack {
 	// Apply the bin packing, this depends on the resources needed
 	// by a particular task group. Enable eviction as system jobs are high
 	// priority.
 	_, schedConfig, _ := s.ctx.State().SchedulerConfig()
+	schedulerAlgorithm := schedConfig.EffectiveSchedulerAlgorithm()
 	enablePreemption := true
 	if schedConfig != nil {
 		enablePreemption = schedConfig.PreemptionConfig.SystemSchedulerEnabled
 	}
-	s.binPack = NewBinPackIterator(ctx, rankSource, enablePreemption, 0)
+
+	s.binPack = NewBinPackIterator(ctx, rankSource, enablePreemption, 0, schedulerAlgorithm)
 
 	// Apply score normalization
 	s.scoreNorm = NewScoreNormalizationIterator(ctx, s.binPack)
diff --git a/scheduler/stack_oss.go b/scheduler/stack_oss.go
index 50ff5a523258..a705b8c7395b 100644
--- a/scheduler/stack_oss.go
+++ b/scheduler/stack_oss.go
@@ -60,7 +60,10 @@ func NewGenericStack(batch bool, ctx Context) *GenericStack {
 
 	// Apply the bin packing, this depends on the resources needed
 	// by a particular task group.
-	s.binPack = NewBinPackIterator(ctx, rankSource, false, 0)
+	_, schedConfig, _ := s.ctx.State().SchedulerConfig()
+	schedulerAlgorithm := schedConfig.EffectiveSchedulerAlgorithm()
+
+	s.binPack = NewBinPackIterator(ctx, rankSource, false, 0, schedulerAlgorithm)
 
 	// Apply the job anti-affinity iterator. This is to avoid placing
 	// multiple allocations on the same node for this job.
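Because `ScoreFitBinPack` and `ScoreFitSpread` share the [0, 18] range, the iterator keeps dividing by the same `binPackingMaxFitScore` constant regardless of algorithm, so normalized node scores stay in [0, 1] for both modes and downstream consumers are unaffected by the choice. A condensed, illustrative sketch of the selection-plus-normalization flow — the authoritative logic is `NewBinPackIterator` and its scoring loop above, and `pickScoreFn`/`normalize` are hypothetical helpers that assume `binPackingMaxFitScore` is 18:

```go
package scheduler

import "github.com/hashicorp/nomad/nomad/structs"

// pickScoreFn mirrors the constructor's switch: binpack is the default and
// spread is opt-in through the operator scheduler configuration.
func pickScoreFn(algorithm structs.SchedulerAlgorithm) func(*structs.Node, *structs.ComparableResources) float64 {
	if algorithm == structs.SchedulerAlgorithmSpread {
		return structs.ScoreFitSpread
	}
	return structs.ScoreFitBinPack
}

// normalize matches the fitness / binPackingMaxFitScore step in the scoring
// loop: an 18 (best possible fit for the chosen algorithm) becomes 1.0.
func normalize(fitness float64) float64 {
	return fitness / 18.0
}
```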