Merge pull request #7730 from hashicorp/b-reserved-scoring
core: fix node reservation scoring
schmichael committed Apr 30, 2020
2 parents f592dd9 + e3cba0c commit cbcd3eb
Showing 5 changed files with 166 additions and 31 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ BUG FIXES:

* api: autoscaling policies should not be returned for stopped jobs [[GH-7768](https://github.com/hashicorp/nomad/issues/7768)]
* core: job scale status endpoint was returning incorrect counts [[GH-7789](https://github.com/hashicorp/nomad/issues/7789)]
* core: Fixed a bug where scores for allocations were biased toward nodes with resource reservations [[GH-7730](https://github.com/hashicorp/nomad/issues/7730)]
* jobspec: autoscaling policy block should return a parsing error when multiple `policy` blocks are provided [[GH-7716](https://github.com/hashicorp/nomad/issues/7716)]
* ui: Fixed a bug where exec popup had incorrect URL for jobs where name ≠ id [[GH-7814](https://github.com/hashicorp/nomad/issues/7814)]

13 changes: 6 additions & 7 deletions nomad/structs/funcs.go
@@ -101,12 +101,9 @@ func FilterTerminalAllocs(allocs []*Allocation) ([]*Allocation, map[string]*Allo
// ensured there are no collisions. If checkDevices is set to true, we check if
// there is a device oversubscription.
func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevices bool) (bool, string, *ComparableResources, error) {
// Compute the utilization from zero
// Compute the allocs' utilization from zero
used := new(ComparableResources)

// Add the reserved resources of the node
used.Add(node.ComparableReservedResources())

// For each alloc, add the resources
for _, alloc := range allocs {
// Do not consider the resource impact of terminal allocations
@@ -117,9 +114,11 @@ func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevi
used.Add(alloc.ComparableResources())
}

// Check that the node resources are a super set of those
// that are being allocated
if superset, dimension := node.ComparableResources().Superset(used); !superset {
// Check that the node resources (after subtracting reserved) are a
// super set of those that are being allocated
available := node.ComparableResources()
available.Subtract(node.ComparableReservedResources())
if superset, dimension := available.Superset(used); !superset {
return false, dimension, used, nil
}

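
The corrected check can be summarized with a minimal, self-contained sketch (the `resources` struct and `allocsFit` function below are illustrative stand-ins, not Nomad's actual types): usage is summed from the allocations alone, and the node's reserved resources reduce the capacity that usage is compared against, instead of being counted as usage themselves.

```go
package main

import "fmt"

// resources is a simplified stand-in for Nomad's ComparableResources,
// tracking only the two dimensions exercised by this change.
type resources struct {
	CPUShares int64
	MemoryMB  int64
}

// allocsFit mirrors the corrected check: usage is summed from the
// allocations alone, and the node's reserved resources reduce the
// capacity that usage is compared against.
func allocsFit(capacity, reserved resources, allocs []resources) (bool, string) {
	var used resources
	for _, a := range allocs {
		used.CPUShares += a.CPUShares
		used.MemoryMB += a.MemoryMB
	}

	available := resources{
		CPUShares: capacity.CPUShares - reserved.CPUShares,
		MemoryMB:  capacity.MemoryMB - reserved.MemoryMB,
	}

	switch {
	case used.CPUShares > available.CPUShares:
		return false, "cpu"
	case used.MemoryMB > available.MemoryMB:
		return false, "memory"
	default:
		return true, ""
	}
}

func main() {
	capacity := resources{CPUShares: 2000, MemoryMB: 2000}
	reserved := resources{CPUShares: 800, MemoryMB: 800}
	alloc := resources{CPUShares: 1000, MemoryMB: 1000}

	fit, _ := allocsFit(capacity, reserved, []resources{alloc})
	fmt.Println(fit) // true: 1000 used <= 1200 available

	fit, dim := allocsFit(capacity, reserved, []resources{alloc, alloc})
	fmt.Println(fit, dim) // false cpu: 2000 used > 1200 available
}
```

With the 2000-share node and 800-share reservation used in the new rank test below, one 1000-share allocation fits and a second identical allocation is rejected on CPU.
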
32 changes: 16 additions & 16 deletions nomad/structs/funcs_test.go
@@ -188,17 +188,17 @@ func TestAllocsFit_Old(t *testing.T) {
require.True(fit)

// Sanity check the used resources
require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
require.EqualValues(1000, used.Flattened.Cpu.CpuShares)
require.EqualValues(1024, used.Flattened.Memory.MemoryMB)

// Should not fit second allocation
fit, _, used, err = AllocsFit(n, []*Allocation{a1, a1}, nil, false)
require.NoError(err)
require.False(fit)

// Sanity check the used resources
require.EqualValues(3000, used.Flattened.Cpu.CpuShares)
require.EqualValues(3072, used.Flattened.Memory.MemoryMB)
require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
}

// COMPAT(0.11): Remove in 0.11
@@ -255,8 +255,8 @@ func TestAllocsFit_TerminalAlloc_Old(t *testing.T) {
require.True(fit)

// Sanity check the used resources
require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
require.EqualValues(1000, used.Flattened.Cpu.CpuShares)
require.EqualValues(1024, used.Flattened.Memory.MemoryMB)

// Should fit second allocation since it is terminal
a2 := a1.Copy()
@@ -266,8 +266,8 @@ func TestAllocsFit_TerminalAlloc_Old(t *testing.T) {
require.True(fit)

// Sanity check the used resources
require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
require.EqualValues(1000, used.Flattened.Cpu.CpuShares)
require.EqualValues(1024, used.Flattened.Memory.MemoryMB)
}

func TestAllocsFit(t *testing.T) {
@@ -340,17 +340,17 @@ func TestAllocsFit(t *testing.T) {
require.True(fit)

// Sanity check the used resources
require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
require.EqualValues(1000, used.Flattened.Cpu.CpuShares)
require.EqualValues(1024, used.Flattened.Memory.MemoryMB)

// Should not fit second allocation
fit, _, used, err = AllocsFit(n, []*Allocation{a1, a1}, nil, false)
require.NoError(err)
require.False(fit)

// Sanity check the used resources
require.EqualValues(3000, used.Flattened.Cpu.CpuShares)
require.EqualValues(3072, used.Flattened.Memory.MemoryMB)
require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
}

func TestAllocsFit_TerminalAlloc(t *testing.T) {
@@ -424,8 +424,8 @@ func TestAllocsFit_TerminalAlloc(t *testing.T) {
require.True(fit)

// Sanity check the used resources
require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
require.EqualValues(1000, used.Flattened.Cpu.CpuShares)
require.EqualValues(1024, used.Flattened.Memory.MemoryMB)

// Should fit second allocation since it is terminal
a2 := a1.Copy()
@@ -435,8 +435,8 @@ func TestAllocsFit_TerminalAlloc(t *testing.T) {
require.True(fit, dim)

// Sanity check the used resources
require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
require.EqualValues(1000, used.Flattened.Cpu.CpuShares)
require.EqualValues(1024, used.Flattened.Memory.MemoryMB)
}

// Tests that AllocsFit detects device collisions
122 changes: 120 additions & 2 deletions scheduler/rank_test.go
@@ -1,6 +1,7 @@
package scheduler

import (
"sort"
"testing"

"github.com/hashicorp/nomad/helper/uuid"
@@ -122,11 +123,128 @@ func TestBinPackIterator_NoExistingAlloc(t *testing.T) {
if out[0].FinalScore != 1.0 {
t.Fatalf("Bad Score: %v", out[0].FinalScore)
}
if out[1].FinalScore < 0.75 || out[1].FinalScore > 0.95 {
if out[1].FinalScore < 0.50 || out[1].FinalScore > 0.60 {
t.Fatalf("Bad Score: %v", out[1].FinalScore)
}
}

// TestBinPackIterator_NoExistingAlloc_MixedReserve asserts that nodes with
// reserved resources are scored as if they simply had a lower amount of
// resources.
func TestBinPackIterator_NoExistingAlloc_MixedReserve(t *testing.T) {
_, ctx := testContext(t)
nodes := []*RankedNode{
{
// Best fit
Node: &structs.Node{
Name: "no-reserved",
NodeResources: &structs.NodeResources{
Cpu: structs.NodeCpuResources{
CpuShares: 1100,
},
Memory: structs.NodeMemoryResources{
MemoryMB: 1100,
},
},
},
},
{
// Not best fit if reserve is calculated properly
Node: &structs.Node{
Name: "reserved",
NodeResources: &structs.NodeResources{
Cpu: structs.NodeCpuResources{
CpuShares: 2000,
},
Memory: structs.NodeMemoryResources{
MemoryMB: 2000,
},
},
ReservedResources: &structs.NodeReservedResources{
Cpu: structs.NodeReservedCpuResources{
CpuShares: 800,
},
Memory: structs.NodeReservedMemoryResources{
MemoryMB: 800,
},
},
},
},
{
// Even worse fit due to reservations
Node: &structs.Node{
Name: "reserved2",
NodeResources: &structs.NodeResources{
Cpu: structs.NodeCpuResources{
CpuShares: 2000,
},
Memory: structs.NodeMemoryResources{
MemoryMB: 2000,
},
},
ReservedResources: &structs.NodeReservedResources{
Cpu: structs.NodeReservedCpuResources{
CpuShares: 500,
},
Memory: structs.NodeReservedMemoryResources{
MemoryMB: 500,
},
},
},
},
{
Node: &structs.Node{
Name: "overloaded",
NodeResources: &structs.NodeResources{
Cpu: structs.NodeCpuResources{
CpuShares: 900,
},
Memory: structs.NodeMemoryResources{
MemoryMB: 900,
},
},
},
},
}
static := NewStaticRankIterator(ctx, nodes)

taskGroup := &structs.TaskGroup{
EphemeralDisk: &structs.EphemeralDisk{},
Tasks: []*structs.Task{
{
Name: "web",
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 1000,
},
},
},
}
binp := NewBinPackIterator(ctx, static, false, 0)
binp.SetTaskGroup(taskGroup)

scoreNorm := NewScoreNormalizationIterator(ctx, binp)

out := collectRanked(scoreNorm)

// Sort descending (highest score to lowest) and log for debugging
sort.Slice(out, func(i, j int) bool { return out[i].FinalScore > out[j].FinalScore })
for i := range out {
t.Logf("Node: %-12s Score: %-1.4f", out[i].Node.Name, out[i].FinalScore)
}

// 3 nodes should be feasible
require.Len(t, out, 3)

// Node without reservations is the best fit
require.Equal(t, nodes[0].Node.Name, out[0].Node.Name)

// Node with the smallest remaining resources ("best fit") should get a
// higher score than the node with more remaining resources ("worse fit")
require.Equal(t, nodes[1].Node.Name, out[1].Node.Name)
require.Equal(t, nodes[2].Node.Name, out[2].Node.Name)
}

// Tests bin packing iterator with network resources at task and task group level
func TestBinPackIterator_Network_Success(t *testing.T) {
_, ctx := testContext(t)
@@ -239,7 +357,7 @@ func TestBinPackIterator_Network_Success(t *testing.T) {
// First node should have a perfect score
require.Equal(1.0, out[0].FinalScore)

if out[1].FinalScore < 0.75 || out[1].FinalScore > 0.95 {
if out[1].FinalScore < 0.50 || out[1].FinalScore > 0.60 {
t.Fatalf("Bad Score: %v", out[1].FinalScore)
}

29 changes: 23 additions & 6 deletions website/pages/docs/upgrade/upgrade-specific.mdx
@@ -15,6 +15,22 @@ details provided for their upgrades as a result of new features or changed
behavior. This page is used to document those details separately from the
standard upgrade flow.

## Nomad 0.11.2

### Scheduler Scoring Changes

Prior to Nomad 0.11.2 the scheduler used a [node's reserved
resources][reserved] incorrectly during scoring. As a result, scoring was
biased in favor of nodes with reserved resources over nodes without them.

Placements will be more correct in v0.11.2, but may differ slightly from
earlier versions of Nomad. Operators do *not* need to take any action, as the
bug fix only minimally affects scoring.

Feasibility (whether a node is capable of running a job at all) is *not*
affected.
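
As an illustration (round numbers chosen for clarity, not taken from a real
cluster): consider a node with 2000 MHz of CPU, 800 MHz of which is reserved,
and a task asking for 1000 MHz. Previously the scheduler counted the
reservation as if it were an existing allocation, treating the node as
1800 MHz used out of 2000; the node looked nearly full and was therefore more
attractive to the bin-packing algorithm. With the fix, the reservation reduces
the node's capacity instead, so the node is scored as a 1200 MHz node with
1000 MHz used, the same way an equivalent unreserved node would be.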

## Nomad 0.11.0

### client.template: `vault_grace` deprecation
@@ -466,21 +482,22 @@ draining the node so no tasks are running on it. This can be verified by running
state. Once that is done the client can be killed, the `data_dir` should be
deleted and then Nomad 0.3.0 can be launched.

[dangling-containers]: /docs/drivers/docker#dangling-containers
[drain-api]: /api-docs/nodes#drain-node
[drain-cli]: /docs/commands/node/drain
[dangling-containers]: /docs/drivers/docker#dangling-containers
[gh-6787]: https://github.com/hashicorp/nomad/issues/6787
[hcl2]: https://github.com/hashicorp/hcl2
[limits]: /docs/configuration#limits
[lxc]: /docs/drivers/external/lxc
[migrate]: /docs/job-specification/migrate
[plugins]: /docs/drivers/external
[plugin-stanza]: /docs/configuration/plugin
[preemption]: /docs/internals/scheduling/preemption
[plugins]: /docs/drivers/external
[preemption-api]: /api-docs/operator#update-scheduler-configuration
[preemption]: /docs/internals/scheduling/preemption
[reserved]: /docs/configuration/client#reserved-parameters
[task-config]: /docs/job-specification/task#config
[validate]: /docs/commands/job/validate
[vault_grace]: /docs/job-specification/template
[update]: /docs/job-specification/update
[tls-guide]: https://learn.hashicorp.com/nomad/transport-security/enable-tls
[tls-vault-guide]: https://learn.hashicorp.com/nomad/vault-integration/vault-pki-nomad
[update]: /docs/job-specification/update
[validate]: /docs/commands/job/validate
[vault_grace]: /docs/job-specification/template
