From 27102af6d354b240280659bb796d6708c298ddc6 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 4 Feb 2022 09:46:23 -0500 Subject: [PATCH] scheduler: seed random shuffle nodes with eval ID Processing an evaluation is nearly a pure function over the state snapshot, but we randomly shuffle the nodes. This means that developers can't take a given state snapshot and pass an evaluation through it and be guaranteed the same plan results. But the evaluation ID is already random, so if we use this as the seed for shuffling the nodes we can greatly reduce the sources of non-determinism. Unfortunately golang map iteration uses a global source of randomness and not a goroutine-local one, but arguably if the scheduler behavior is impacted by this, that's a bug in the iteration. --- .changelog/12008.txt | 3 +++ scheduler/context_test.go | 1 + scheduler/feasible.go | 3 ++- scheduler/scheduler.go | 3 +++ scheduler/stack.go | 3 ++- scheduler/util.go | 21 ++++++++++++++++++--- scheduler/util_test.go | 11 ++++++++++- 7 files changed, 39 insertions(+), 6 deletions(-) create mode 100644 .changelog/12008.txt diff --git a/.changelog/12008.txt b/.changelog/12008.txt new file mode 100644 index 000000000000..710fdc465977 --- /dev/null +++ b/.changelog/12008.txt @@ -0,0 +1,3 @@ +```release-note:improvement +scheduler: Seed node shuffling with the evaluation ID to make the order reproducible +``` diff --git a/scheduler/context_test.go b/scheduler/context_test.go index 75bc7ed36868..8187e9cbf903 100644 --- a/scheduler/context_test.go +++ b/scheduler/context_test.go @@ -14,6 +14,7 @@ import ( func testContext(t testing.TB) (*state.StateStore, *EvalContext) { state := state.TestStateStore(t) plan := &structs.Plan{ + EvalID: uuid.Generate(), NodeUpdate: make(map[string][]*structs.Allocation), NodeAllocation: make(map[string][]*structs.Allocation), NodePreemptions: make(map[string][]*structs.Allocation), diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 3b10331c5c88..4bb0a325361f 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -122,7 +122,8 @@ func (iter *StaticIterator) SetNodes(nodes []*structs.Node) { // is applied in-place func NewRandomIterator(ctx Context, nodes []*structs.Node) *StaticIterator { // shuffle with the Fisher-Yates algorithm - shuffleNodes(nodes) + idx, _ := ctx.State().LatestIndex() + shuffleNodes(ctx.Plan(), idx, nodes) // Create a static iterator return NewStaticIterator(ctx, nodes) diff --git a/scheduler/scheduler.go b/scheduler/scheduler.go index b472041e059b..11d66a054b6e 100644 --- a/scheduler/scheduler.go +++ b/scheduler/scheduler.go @@ -107,6 +107,9 @@ type State interface { // CSIVolumeByID fetch CSI volumes, containing controller jobs CSIVolumesByNodeID(memdb.WatchSet, string, string) (memdb.ResultIterator, error) + + // LatestIndex returns the greatest index value for all indexes. + LatestIndex() (uint64, error) } // Planner interface is used to submit a task allocation plan. diff --git a/scheduler/stack.go b/scheduler/stack.go index d2b546107f73..b71916af0a99 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -70,7 +70,8 @@ type GenericStack struct { func (s *GenericStack) SetNodes(baseNodes []*structs.Node) { // Shuffle base nodes - shuffleNodes(baseNodes) + idx, _ := s.ctx.State().LatestIndex() + shuffleNodes(s.ctx.Plan(), idx, baseNodes) // Update the set of base nodes s.source.SetNodes(baseNodes) diff --git a/scheduler/util.go b/scheduler/util.go index 869442b78f97..28113afba68e 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -1,6 +1,7 @@ package scheduler import ( + "encoding/binary" "fmt" "math/rand" "reflect" @@ -376,11 +377,25 @@ func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*struct return out, nil } -// shuffleNodes randomizes the slice order with the Fisher-Yates algorithm -func shuffleNodes(nodes []*structs.Node) { +// shuffleNodes randomizes the slice order with the Fisher-Yates +// algorithm. We seed the random source with the eval ID (which is +// random) to aid in postmortem debugging of specific evaluations and +// state snapshots. +func shuffleNodes(plan *structs.Plan, index uint64, nodes []*structs.Node) { + + // use the last 4 bytes because those are the random bits + // if we have sortable IDs + buf := []byte(plan.EvalID) + seed := binary.BigEndian.Uint64(buf[len(buf)-8:]) + + // for retried plans the index is the plan result's RefreshIndex + // so that we don't retry with the exact same shuffle + seed ^= index + r := rand.New(rand.NewSource(int64(seed >> 2))) + n := len(nodes) for i := n - 1; i > 0; i-- { - j := rand.Intn(i + 1) + j := r.Intn(i + 1) nodes[i], nodes[j] = nodes[j], nodes[i] } } diff --git a/scheduler/util_test.go b/scheduler/util_test.go index 1b5ec8d9a30a..f03114ba78a6 100644 --- a/scheduler/util_test.go +++ b/scheduler/util_test.go @@ -507,8 +507,17 @@ func TestShuffleNodes(t *testing.T) { } orig := make([]*structs.Node, len(nodes)) copy(orig, nodes) - shuffleNodes(nodes) + eval := mock.Eval() // will have random EvalID + plan := eval.MakePlan(mock.Job()) + shuffleNodes(plan, 1000, nodes) require.False(t, reflect.DeepEqual(nodes, orig)) + + nodes2 := make([]*structs.Node, len(nodes)) + copy(nodes2, orig) + shuffleNodes(plan, 1000, nodes2) + + require.True(t, reflect.DeepEqual(nodes, nodes2)) + } func TestTaskUpdatedAffinity(t *testing.T) {