Skip to content

Commit

Permalink
scheduler: prevent -Inf in spread scoring (#17198)
Browse files Browse the repository at this point in the history
When spread targets have a percent value of zero it's possible for them to
return -Inf scoring because of a float divide by zero. This is very hard for
operators to debug because the string "-Inf" is returned in the API and that
breaks the presentation of debugging data.

Most scoring iterators are bracketed to -1/+1, but spread iterators do not so
that they can handle greatly unbalanced scoring so we can't simply return a -1
score without generating a score that might be greater than the negative scores
set by other spread targets. Instead, track the lowest-seen spread boost and use
that as the spread boost for any cases where we'd divide by zero.

Fixes: #8863
  • Loading branch information
tgross committed May 16, 2023
1 parent 960b120 commit 421636b
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .changelog/17198.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
scheduler: Fixed a bug where scores for spread scheduling could be -Inf
```
15 changes: 14 additions & 1 deletion scheduler/spread.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ type SpreadIterator struct {
// blocks
sumSpreadWeights int32

// lowestSpreadBoost tracks the lowest spread boost across all spread blocks
lowestSpreadBoost float64

// hasSpread is used to early return when the job/task group
// does not have spread configured
hasSpread bool
Expand All @@ -53,6 +56,7 @@ func NewSpreadIterator(ctx Context, source RankIterator) *SpreadIterator {
source: source,
groupPropertySets: make(map[string][]*propertySet),
tgSpreadInfo: make(map[string]spreadAttributeMap),
lowestSpreadBoost: -1.0,
}
return iter
}
Expand Down Expand Up @@ -114,6 +118,7 @@ func (iter *SpreadIterator) hasSpreads() bool {
}

func (iter *SpreadIterator) Next() *RankedNode {

for {
option := iter.source.Next()

Expand Down Expand Up @@ -162,7 +167,7 @@ func (iter *SpreadIterator) Next() *RankedNode {
desiredCount, ok = spreadDetails.desiredCounts[implicitTarget]
if !ok {
// The desired count for this attribute is zero if it gets here
// so use the maximum possible penalty for this node
// so use the default negative penalty for this node
totalSpreadScore -= 1.0
continue
}
Expand All @@ -171,12 +176,20 @@ func (iter *SpreadIterator) Next() *RankedNode {
// Calculate the relative weight of this specific spread attribute
spreadWeight := float64(spreadDetails.weight) / float64(iter.sumSpreadWeights)

if desiredCount == 0 {
totalSpreadScore += iter.lowestSpreadBoost
continue
}

// Score Boost is proportional the difference between current and desired count
// It is negative when the used count is greater than the desired count
// It is multiplied with the spread weight to account for cases where the job has
// more than one spread attribute
scoreBoost := ((desiredCount - float64(usedCount)) / desiredCount) * spreadWeight
totalSpreadScore += scoreBoost
if scoreBoost < iter.lowestSpreadBoost {
iter.lowestSpreadBoost = scoreBoost
}
}
}

Expand Down
100 changes: 100 additions & 0 deletions scheduler/spread_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import (
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/shoenig/test"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -558,6 +560,104 @@ func TestSpreadIterator_MaxPenalty(t *testing.T) {

}

func TestSpreadIterator_NoInfinity(t *testing.T) {
ci.Parallel(t)

store, ctx := testContext(t)
var nodes []*RankedNode

// Add 3 nodes in different DCs to the state store
for i := 1; i < 4; i++ {
node := mock.Node()
node.Datacenter = fmt.Sprintf("dc%d", i)
must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, uint64(100+i), node))
nodes = append(nodes, &RankedNode{Node: node})
}

static := NewStaticRankIterator(ctx, nodes)

job := mock.Job()
tg := job.TaskGroups[0]
job.TaskGroups[0].Count = 8

// Create spread target of 50% in dc1, 50% in dc2, and 0% in the implicit target
spread := &structs.Spread{
Weight: 100,
Attribute: "${node.datacenter}",
SpreadTarget: []*structs.SpreadTarget{
{
Value: "dc1",
Percent: 50,
},
{
Value: "dc2",
Percent: 50,
},
{
Value: "*",
Percent: 0,
},
},
}
tg.Spreads = []*structs.Spread{spread}
spreadIter := NewSpreadIterator(ctx, static)
spreadIter.SetJob(job)
spreadIter.SetTaskGroup(tg)

scoreNorm := NewScoreNormalizationIterator(ctx, spreadIter)

out := collectRanked(scoreNorm)

// Scores should be even between dc1 and dc2 nodes, without an -Inf on dc3
must.Len(t, 3, out)
test.Eq(t, 0.75, out[0].FinalScore)
test.Eq(t, 0.75, out[1].FinalScore)
test.Eq(t, -1, out[2].FinalScore)

// Reset scores
for _, node := range nodes {
node.Scores = nil
node.FinalScore = 0
}

// Create very unbalanced spread target to force large negative scores
spread = &structs.Spread{
Weight: 100,
Attribute: "${node.datacenter}",
SpreadTarget: []*structs.SpreadTarget{
{
Value: "dc1",
Percent: 99,
},
{
Value: "dc2",
Percent: 1,
},
{
Value: "*",
Percent: 0,
},
},
}
tg.Spreads = []*structs.Spread{spread}
static = NewStaticRankIterator(ctx, nodes)
spreadIter = NewSpreadIterator(ctx, static)
spreadIter.SetJob(job)
spreadIter.SetTaskGroup(tg)

scoreNorm = NewScoreNormalizationIterator(ctx, spreadIter)

out = collectRanked(scoreNorm)

// Scores should heavily favor dc1, with an -Inf on dc3
must.Len(t, 3, out)
desired := 8 * 0.99 // 8 allocs * 99%
test.Eq(t, (desired-1)/desired, out[0].FinalScore)
test.Eq(t, -11.5, out[1].FinalScore)
test.LessEq(t, out[1].FinalScore, out[2].FinalScore,
test.Sprintf("expected implicit dc3 to be <= dc2"))
}

func Test_evenSpreadScoreBoost(t *testing.T) {
ci.Parallel(t)

Expand Down

0 comments on commit 421636b

Please sign in to comment.