From f02c834652a6848ba69570c33a2b4defda6666d7 Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Wed, 15 Apr 2020 14:24:47 -0700
Subject: [PATCH] core: fix node reservation scoring

The BinPackIterator accounted for node reservations twice when scoring
nodes, which could bias scores toward nodes with reservations.

Pseudo-code for the previous algorithm:

```
proposed  = reservedResources + sum(allocResources)
available = nodeResources - reservedResources
score     = 1 - (proposed / available)
```

The node's reserved resources are added to the total resources used by
allocations, and then the node's reserved resources are later
subtracted from the node's overall resources.

The new algorithm is:

```
proposed  = sum(allocResources)
available = nodeResources - reservedResources
score     = 1 - (proposed / available)
```

The node's reserved resources are no longer added to the total
resources used by allocations.

My guess as to how this bug happened: the resource utilization variable
(`util`) is calculated and returned by the `AllocsFit` function, which
needs to take reserved resources into account as a basic feasibility
check. To avoid re-calculating alloc resource usage (because there may
be a large number of allocs), we reused `util` in the `ScoreFit`
function. `ScoreFit` properly accounts for reserved resources by
subtracting them from the node's overall resources. However, since
`util` _also_ took reserved resources into account, the score was
incorrect.

Prior to the fix, the added test output:

```
Node: reserved     Score: 1.0000
Node: reserved2    Score: 1.0000
Node: no-reserved  Score: 0.9741
```

The scores being 1.0 for *both* nodes with reserved resources is a good
hint something is wrong, as they should receive different scores. Upon
further inspection, the double accounting of reserved resources caused
their scores to be >1.0, which was then clamped.

After the fix, the added test outputs:

```
Node: no-reserved  Score: 0.9741
Node: reserved     Score: 0.9480
Node: reserved2    Score: 0.8717
```
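For reference, the `1 - (proposed / available)` line above is a
simplification: `ScoreFit` feeds each dimension's free percentage into
an exponential fit metric anchored at 20 and normalized by 18. The
standalone sketch below is not part of this patch (the `score` helper
is illustrative); it approximates that metric closely enough to
reproduce the scores logged by the new test, and shows how folding
reserved resources into `proposed` clamped both reserved nodes to 1.0:

```go
// sketch.go - NOT part of this patch; a minimal approximation of
// ScoreFit's fit metric, used to sanity-check the scores above.
package main

import (
	"fmt"
	"math"
)

// score approximates ScoreFit: each dimension's free percentage feeds
// an exponential fit metric; the raw total is anchored at 20, clamped
// to [0, 18], and normalized so a perfect fit scores 1.0.
func score(nodeCPU, nodeMem, resvCPU, resvMem, propCPU, propMem float64) float64 {
	availCPU := nodeCPU - resvCPU // available = nodeResources - reservedResources
	availMem := nodeMem - resvMem
	freeCPU := 1 - propCPU/availCPU
	freeMem := 1 - propMem/availMem
	raw := 20 - (math.Pow(10, freeCPU) + math.Pow(10, freeMem))
	if raw > 18 { // proposed exceeds available: overfit, clamp high
		raw = 18
	} else if raw < 0 {
		raw = 0
	}
	return raw / 18
}

func main() {
	// New algorithm: proposed = sum(allocResources) = 1000 CPU / 1000 MB
	fmt.Printf("no-reserved:     %.4f\n", score(1100, 1100, 0, 0, 1000, 1000))     // 0.9741
	fmt.Printf("reserved:        %.4f\n", score(2000, 2000, 800, 800, 1000, 1000)) // 0.9480
	fmt.Printf("reserved2:       %.4f\n", score(2000, 2000, 500, 500, 1000, 1000)) // 0.8717

	// Old algorithm: proposed also included the reserved resources, so
	// "reserved" appeared to use 1800 of 1200 available and clamped.
	fmt.Printf("reserved (old):  %.4f\n", score(2000, 2000, 800, 800, 1800, 1800)) // 1.0000
	fmt.Printf("reserved2 (old): %.4f\n", score(2000, 2000, 500, 500, 1500, 1500)) // 1.0000
}
```

Under the old accounting, `reserved2` hits the clamp exactly (proposed
equals available), which is why both reserved nodes logged identical
1.0000 scores.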
---
 nomad/structs/funcs.go |  13 +++--
 scheduler/rank_test.go | 114 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+), 7 deletions(-)

diff --git a/nomad/structs/funcs.go b/nomad/structs/funcs.go
index da449a1eca67..cee0dd4067bc 100644
--- a/nomad/structs/funcs.go
+++ b/nomad/structs/funcs.go
@@ -101,12 +101,9 @@ func FilterTerminalAllocs(allocs []*Allocation) ([]*Allocation, map[string]*Allo
 // ensured there are no collisions. If checkDevices is set to true, we check if
 // there is a device oversubscription.
 func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevices bool) (bool, string, *ComparableResources, error) {
-	// Compute the utilization from zero
+	// Compute the allocs' utilization from zero
 	used := new(ComparableResources)
 
-	// Add the reserved resources of the node
-	used.Add(node.ComparableReservedResources())
-
 	// For each alloc, add the resources
 	for _, alloc := range allocs {
 		// Do not consider the resource impact of terminal allocations
@@ -117,9 +114,11 @@ func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevi
 		used.Add(alloc.ComparableResources())
 	}
 
-	// Check that the node resources are a super set of those
-	// that are being allocated
-	if superset, dimension := node.ComparableResources().Superset(used); !superset {
+	// Check that the node resources (after subtracting reserved) are a
+	// super set of those that are being allocated
+	available := node.ComparableResources()
+	available.Subtract(node.ComparableReservedResources())
+	if superset, dimension := available.Superset(used); !superset {
 		return false, dimension, used, nil
 	}
 
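To see the shape of the change without Nomad's types, here is a
hypothetical, simplified model (plain ints stand in for
`ComparableResources`; `fitsOld`/`fitsNew` are illustrative names, not
functions from this codebase). Both versions accept the same
allocations, since `allocs <= node - reserved` is equivalent to
`reserved + allocs <= node`; the bug was only that the old, inflated
`used` value was reused for scoring:

```go
// Hypothetical, simplified model of the accounting change; NOT Nomad
// code. It contrasts what each version reports as "used".
package main

import "fmt"

type res struct{ cpu, mem int }

// fitsOld folds reserved into used (the pre-fix AllocsFit): the
// feasibility answer is correct, but the inflated used value biased
// any downstream consumer such as scoring.
func fitsOld(node, reserved, allocs res) (bool, res) {
	used := res{reserved.cpu + allocs.cpu, reserved.mem + allocs.mem}
	return used.cpu <= node.cpu && used.mem <= node.mem, used
}

// fitsNew subtracts reserved from capacity instead, so the returned
// utilization reflects only the allocations themselves.
func fitsNew(node, reserved, allocs res) (bool, res) {
	available := res{node.cpu - reserved.cpu, node.mem - reserved.mem}
	return allocs.cpu <= available.cpu && allocs.mem <= available.mem, allocs
}

func main() {
	node, reserved, allocs := res{2000, 2000}, res{800, 800}, res{1000, 1000}
	okOld, usedOld := fitsOld(node, reserved, allocs)
	okNew, usedNew := fitsNew(node, reserved, allocs)
	// Same feasibility answer, different utilization handed to scoring:
	fmt.Println(okOld, usedOld) // true {1800 1800} <- reserved double-counted downstream
	fmt.Println(okNew, usedNew) // true {1000 1000}
}
```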
diff --git a/scheduler/rank_test.go b/scheduler/rank_test.go
index 2cae33087f68..93546bc9c6ac 100644
--- a/scheduler/rank_test.go
+++ b/scheduler/rank_test.go
@@ -1,6 +1,7 @@
 package scheduler
 
 import (
+	"sort"
 	"testing"
 
 	"github.com/hashicorp/nomad/helper/uuid"
@@ -127,6 +128,119 @@ func TestBinPackIterator_NoExistingAlloc(t *testing.T) {
 	}
 }
 
+// TestBinPackIterator_NoExistingAlloc_MixedReserve asserts that nodes with
+// reserved resources are scored equivalently to nodes that simply have a
+// lower amount of total resources.
+func TestBinPackIterator_NoExistingAlloc_MixedReserve(t *testing.T) {
+	_, ctx := testContext(t)
+	nodes := []*RankedNode{
+		{
+			// Best fit
+			Node: &structs.Node{
+				Name: "no-reserved",
+				NodeResources: &structs.NodeResources{
+					Cpu: structs.NodeCpuResources{
+						CpuShares: 1100,
+					},
+					Memory: structs.NodeMemoryResources{
+						MemoryMB: 1100,
+					},
+				},
+			},
+		},
+		{
+			// Not best fit if reserve is calculated properly
+			Node: &structs.Node{
+				Name: "reserved",
+				NodeResources: &structs.NodeResources{
+					Cpu: structs.NodeCpuResources{
+						CpuShares: 2000,
+					},
+					Memory: structs.NodeMemoryResources{
+						MemoryMB: 2000,
+					},
+				},
+				ReservedResources: &structs.NodeReservedResources{
+					Cpu: structs.NodeReservedCpuResources{
+						CpuShares: 800,
+					},
+					Memory: structs.NodeReservedMemoryResources{
+						MemoryMB: 800,
+					},
+				},
+			},
+		},
+		{
+			// Even worse fit due to reservations
+			Node: &structs.Node{
+				Name: "reserved2",
+				NodeResources: &structs.NodeResources{
+					Cpu: structs.NodeCpuResources{
+						CpuShares: 2000,
+					},
+					Memory: structs.NodeMemoryResources{
+						MemoryMB: 2000,
+					},
+				},
+				ReservedResources: &structs.NodeReservedResources{
+					Cpu: structs.NodeReservedCpuResources{
+						CpuShares: 500,
+					},
+					Memory: structs.NodeReservedMemoryResources{
+						MemoryMB: 500,
+					},
+				},
+			},
+		},
+		{
+			Node: &structs.Node{
+				Name: "overloaded",
+				NodeResources: &structs.NodeResources{
+					Cpu: structs.NodeCpuResources{
+						CpuShares: 900,
+					},
+					Memory: structs.NodeMemoryResources{
+						MemoryMB: 900,
+					},
+				},
+			},
+		},
+	}
+	static := NewStaticRankIterator(ctx, nodes)
+
+	taskGroup := &structs.TaskGroup{
+		EphemeralDisk: &structs.EphemeralDisk{},
+		Tasks: []*structs.Task{
+			{
+				Name: "web",
+				Resources: &structs.Resources{
+					CPU:      1000,
+					MemoryMB: 1000,
+				},
+			},
+		},
+	}
+	binp := NewBinPackIterator(ctx, static, false, 0)
+	binp.SetTaskGroup(taskGroup)
+
+	scoreNorm := NewScoreNormalizationIterator(ctx, binp)
+
+	out := collectRanked(scoreNorm)
+
+	// Sort descending (highest score to lowest) and log for debugging
+	sort.Slice(out, func(i, j int) bool { return out[i].FinalScore > out[j].FinalScore })
+	for i := range out {
+		t.Logf("Node: %-12s Score: %-1.4f", out[i].Node.Name, out[i].FinalScore)
+	}
+
+	// 3 nodes should be feasible
+	require.Len(t, out, 3)
+
+	// Node without reservations is the best fit ("reserved" bin-packs tighter than "reserved2")
+	require.Equal(t, nodes[0].Node.Name, out[0].Node.Name)
+	require.Equal(t, nodes[1].Node.Name, out[1].Node.Name)
+}
+
 // Tests bin packing iterator with network resources at task and task group level
 func TestBinPackIterator_Network_Success(t *testing.T) {
 	_, ctx := testContext(t)