From f02c834652a6848ba69570c33a2b4defda6666d7 Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Wed, 15 Apr 2020 14:24:47 -0700
Subject: [PATCH] core: fix node reservation scoring

The BinPackIterator accounted for node reservations twice when scoring
nodes, which could bias scores toward nodes with reservations.

Pseudo-code for the previous algorithm:

```
proposed  = reservedResources + sum(allocResources)
available = nodeResources - reservedResources
score     = 1 - (proposed / available)
```

The node's reserved resources are added to the total resources used by
allocations, and then the node's reserved resources are later
subtracted from the node's overall resources.

The new algorithm is:

```
proposed  = sum(allocResources)
available = nodeResources - reservedResources
score     = 1 - (proposed / available)
```

The node's reserved resources are no longer added to the total
resources used by allocations.

My guess as to how this bug happened: the resource utilization variable
(`util`) is calculated and returned by the `AllocsFit` function, which
needs to take reserved resources into account as a basic feasibility
check. To avoid re-calculating alloc resource usage (because there may
be a large number of allocs), we reused `util` in the `ScoreFit`
function. `ScoreFit` properly accounts for reserved resources by
subtracting them from the node's overall resources. However, since
`util` _also_ took reserved resources into account, the score was
incorrect.

Prior to the fix, the added test output:

```
Node: reserved     Score: 1.0000
Node: reserved2    Score: 1.0000
Node: no-reserved  Score: 0.9741
```

The scores being 1.0 for *both* nodes with reserved resources is a good
hint something is wrong, as they should receive different scores. Upon
further inspection, the double accounting of reserved resources caused
their scores to be >1.0, which was then clamped.

After the fix, the added test outputs:

```
Node: no-reserved  Score: 0.9741
Node: reserved     Score: 0.9480
Node: reserved2    Score: 0.8717
```
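For reference, the `1 - (proposed / available)` line above is a
simplification: `ScoreFit` feeds each dimension's free percentage into
an exponential fit metric anchored at 20 and normalized by 18. The
standalone sketch below is not part of this patch (the `score` helper
is illustrative); it approximates that metric closely enough to
reproduce the scores logged by the new test, and shows how folding
reserved resources into `proposed` clamped both reserved nodes to 1.0:

```go
// sketch.go - NOT part of this patch; a minimal approximation of
// ScoreFit's fit metric, used to sanity-check the scores above.
package main

import (
	"fmt"
	"math"
)

// score approximates ScoreFit: each dimension's free percentage feeds
// an exponential fit metric; the raw total is anchored at 20, clamped
// to [0, 18], and normalized so a perfect fit scores 1.0.
func score(nodeCPU, nodeMem, resvCPU, resvMem, propCPU, propMem float64) float64 {
	availCPU := nodeCPU - resvCPU // available = nodeResources - reservedResources
	availMem := nodeMem - resvMem
	freeCPU := 1 - propCPU/availCPU
	freeMem := 1 - propMem/availMem
	raw := 20 - (math.Pow(10, freeCPU) + math.Pow(10, freeMem))
	if raw > 18 { // proposed exceeds available: overfit, clamp high
		raw = 18
	} else if raw < 0 {
		raw = 0
	}
	return raw / 18
}

func main() {
	// New algorithm: proposed = sum(allocResources) = 1000 CPU / 1000 MB
	fmt.Printf("no-reserved:     %.4f\n", score(1100, 1100, 0, 0, 1000, 1000))     // 0.9741
	fmt.Printf("reserved:        %.4f\n", score(2000, 2000, 800, 800, 1000, 1000)) // 0.9480
	fmt.Printf("reserved2:       %.4f\n", score(2000, 2000, 500, 500, 1000, 1000)) // 0.8717

	// Old algorithm: proposed also included the reserved resources, so
	// "reserved" appeared to use 1800 of 1200 available and clamped.
	fmt.Printf("reserved (old):  %.4f\n", score(2000, 2000, 800, 800, 1800, 1800)) // 1.0000
	fmt.Printf("reserved2 (old): %.4f\n", score(2000, 2000, 500, 500, 1500, 1500)) // 1.0000
}
```

Under the old accounting, `reserved2` hits the clamp exactly (proposed
equals available), which is why both reserved nodes logged identical
1.0000 scores.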
---
 nomad/structs/funcs.go |  13 +++--
 scheduler/rank_test.go | 114 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+), 7 deletions(-)

diff --git a/nomad/structs/funcs.go b/nomad/structs/funcs.go
index da449a1eca67..cee0dd4067bc 100644
--- a/nomad/structs/funcs.go
+++ b/nomad/structs/funcs.go
@@ -101,12 +101,9 @@ func FilterTerminalAllocs(allocs []*Allocation) ([]*Allocation, map[string]*Allo
 // ensured there are no collisions. If checkDevices is set to true, we check if
 // there is a device oversubscription.
 func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevices bool) (bool, string, *ComparableResources, error) {
-	// Compute the utilization from zero
+	// Compute the allocs' utilization from zero
 	used := new(ComparableResources)
 
-	// Add the reserved resources of the node
-	used.Add(node.ComparableReservedResources())
-
 	// For each alloc, add the resources
 	for _, alloc := range allocs {
 		// Do not consider the resource impact of terminal allocations
@@ -117,9 +114,11 @@ func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevi
 		used.Add(alloc.ComparableResources())
 	}
 
-	// Check that the node resources are a super set of those
-	// that are being allocated
-	if superset, dimension := node.ComparableResources().Superset(used); !superset {
+	// Check that the node resources (after subtracting reserved) are a
+	// super set of those that are being allocated
+	available := node.ComparableResources()
+	available.Subtract(node.ComparableReservedResources())
+	if superset, dimension := available.Superset(used); !superset {
 		return false, dimension, used, nil
 	}
 
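To see the shape of the change without Nomad's types, here is a
hypothetical, simplified model (plain ints stand in for
`ComparableResources`; `fitsOld`/`fitsNew` are illustrative names, not
functions from this codebase). Both versions accept the same
allocations, since `allocs <= node - reserved` is equivalent to
`reserved + allocs <= node`; the bug was only that the old, inflated
`used` value was reused for scoring:

```go
// Hypothetical, simplified model of the accounting change; NOT Nomad
// code. It contrasts what each version reports as "used".
package main

import "fmt"

type res struct{ cpu, mem int }

// fitsOld folds reserved into used (the pre-fix AllocsFit): the
// feasibility answer is correct, but the inflated used value biased
// any downstream consumer such as scoring.
func fitsOld(node, reserved, allocs res) (bool, res) {
	used := res{reserved.cpu + allocs.cpu, reserved.mem + allocs.mem}
	return used.cpu <= node.cpu && used.mem <= node.mem, used
}

// fitsNew subtracts reserved from capacity instead, so the returned
// utilization reflects only the allocations themselves.
func fitsNew(node, reserved, allocs res) (bool, res) {
	available := res{node.cpu - reserved.cpu, node.mem - reserved.mem}
	return allocs.cpu <= available.cpu && allocs.mem <= available.mem, allocs
}

func main() {
	node, reserved, allocs := res{2000, 2000}, res{800, 800}, res{1000, 1000}
	okOld, usedOld := fitsOld(node, reserved, allocs)
	okNew, usedNew := fitsNew(node, reserved, allocs)
	// Same feasibility answer, different utilization handed to scoring:
	fmt.Println(okOld, usedOld) // true {1800 1800} <- reserved double-counted downstream
	fmt.Println(okNew, usedNew) // true {1000 1000}
}
```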
diff --git a/scheduler/rank_test.go b/scheduler/rank_test.go
index 2cae33087f68..93546bc9c6ac 100644
--- a/scheduler/rank_test.go
+++ b/scheduler/rank_test.go
@@ -1,6 +1,7 @@
 package scheduler
 
 import (
+	"sort"
 	"testing"
 
 	"github.com/hashicorp/nomad/helper/uuid"
@@ -127,6 +128,119 @@ func TestBinPackIterator_NoExistingAlloc(t *testing.T) {
 	}
 }
 
+// TestBinPackIterator_NoExistingAlloc_MixedReserve asserts that nodes with
+// reserved resources are scored equivalently to nodes that simply have a
+// lower amount of total resources.
+func TestBinPackIterator_NoExistingAlloc_MixedReserve(t *testing.T) {
+	_, ctx := testContext(t)
+	nodes := []*RankedNode{
+		{
+			// Best fit
+			Node: &structs.Node{
+				Name: "no-reserved",
+				NodeResources: &structs.NodeResources{
+					Cpu: structs.NodeCpuResources{
+						CpuShares: 1100,
+					},
+					Memory: structs.NodeMemoryResources{
+						MemoryMB: 1100,
+					},
+				},
+			},
+		},
+		{
+			// Not best fit if reserve is calculated properly
+			Node: &structs.Node{
+				Name: "reserved",
+				NodeResources: &structs.NodeResources{
+					Cpu: structs.NodeCpuResources{
+						CpuShares: 2000,
+					},
+					Memory: structs.NodeMemoryResources{
+						MemoryMB: 2000,
+					},
+				},
+				ReservedResources: &structs.NodeReservedResources{
+					Cpu: structs.NodeReservedCpuResources{
+						CpuShares: 800,
+					},
+					Memory: structs.NodeReservedMemoryResources{
+						MemoryMB: 800,
+					},
+				},
+			},
+		},
+		{
+			// Even worse fit due to reservations
+			Node: &structs.Node{
+				Name: "reserved2",
+				NodeResources: &structs.NodeResources{
+					Cpu: structs.NodeCpuResources{
+						CpuShares: 2000,
+					},
+					Memory: structs.NodeMemoryResources{
+						MemoryMB: 2000,
+					},
+				},
+				ReservedResources: &structs.NodeReservedResources{
+					Cpu: structs.NodeReservedCpuResources{
+						CpuShares: 500,
+					},
+					Memory: structs.NodeReservedMemoryResources{
+						MemoryMB: 500,
+					},
+				},
+			},
+		},
+		{
+			Node: &structs.Node{
+				Name: "overloaded",
+				NodeResources: &structs.NodeResources{
+					Cpu: structs.NodeCpuResources{
+						CpuShares: 900,
+					},
+					Memory: structs.NodeMemoryResources{
+						MemoryMB: 900,
+					},
+				},
+			},
+		},
+	}
+	static := NewStaticRankIterator(ctx, nodes)
+
+	taskGroup := &structs.TaskGroup{
+		EphemeralDisk: &structs.EphemeralDisk{},
+		Tasks: []*structs.Task{
+			{
+				Name: "web",
+				Resources: &structs.Resources{
+					CPU:      1000,
+					MemoryMB: 1000,
+				},
+			},
+		},
+	}
+	binp := NewBinPackIterator(ctx, static, false, 0)
+	binp.SetTaskGroup(taskGroup)
+
+	scoreNorm := NewScoreNormalizationIterator(ctx, binp)
+
+	out := collectRanked(scoreNorm)
+
+	// Sort descending (highest score to lowest) and log for debugging
+	sort.Slice(out, func(i, j int) bool { return out[i].FinalScore > out[j].FinalScore })
+	for i := range out {
+		t.Logf("Node: %-12s Score: %-1.4f", out[i].Node.Name, out[i].FinalScore)
+	}
+
+	// 3 nodes should be feasible
+	require.Len(t, out, 3)
+
+	// Node without reservations is the best fit ("reserved" bin-packs tighter than "reserved2")
+	require.Equal(t, nodes[0].Node.Name, out[0].Node.Name)
+	require.Equal(t, nodes[1].Node.Name, out[1].Node.Name)
+}
+
 // Tests bin packing iterator with network resources at task and task group level
 func TestBinPackIterator_Network_Success(t *testing.T) {
 	_, ctx := testContext(t)