From 27102af6d354b240280659bb796d6708c298ddc6 Mon Sep 17 00:00:00 2001
From: Tim Gross <tgross@hashicorp.com>
Date: Fri, 4 Feb 2022 09:46:23 -0500
Subject: [PATCH] scheduler: seed random shuffle nodes with eval ID

Processing an evaluation is nearly a pure function over the state
snapshot, but we randomly shuffle the nodes. This means that
developers can't take a given state snapshot and pass an evaluation
through it and be guaranteed the same plan results.

But the evaluation ID is already random, so if we use this as the seed
for shuffling the nodes we can greatly reduce the sources of
non-determinism. Unfortunately golang map iteration uses a global
source of randomness and not a goroutine-local one, but arguably
if the scheduler behavior is impacted by this, that's a bug in the
iteration.
---
 .changelog/12008.txt      |  3 +++
 scheduler/context_test.go |  1 +
 scheduler/feasible.go     |  3 ++-
 scheduler/scheduler.go    |  3 +++
 scheduler/stack.go        |  3 ++-
 scheduler/util.go         | 21 ++++++++++++++++++---
 scheduler/util_test.go    | 11 ++++++++++-
 7 files changed, 39 insertions(+), 6 deletions(-)
 create mode 100644 .changelog/12008.txt

diff --git a/.changelog/12008.txt b/.changelog/12008.txt
new file mode 100644
index 000000000000..710fdc465977
--- /dev/null
+++ b/.changelog/12008.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+scheduler: Seed node shuffling with the evaluation ID to make the order reproducible
+```
diff --git a/scheduler/context_test.go b/scheduler/context_test.go
index 75bc7ed36868..8187e9cbf903 100644
--- a/scheduler/context_test.go
+++ b/scheduler/context_test.go
@@ -14,6 +14,7 @@ import (
 func testContext(t testing.TB) (*state.StateStore, *EvalContext) {
 	state := state.TestStateStore(t)
 	plan := &structs.Plan{
+		EvalID:          uuid.Generate(),
 		NodeUpdate:      make(map[string][]*structs.Allocation),
 		NodeAllocation:  make(map[string][]*structs.Allocation),
 		NodePreemptions: make(map[string][]*structs.Allocation),
diff --git a/scheduler/feasible.go b/scheduler/feasible.go
index 3b10331c5c88..4bb0a325361f 100644
--- a/scheduler/feasible.go
+++ b/scheduler/feasible.go
@@ -122,7 +122,8 @@ func (iter *StaticIterator) SetNodes(nodes []*structs.Node) {
 // is applied in-place
 func NewRandomIterator(ctx Context, nodes []*structs.Node) *StaticIterator {
 	// shuffle with the Fisher-Yates algorithm
-	shuffleNodes(nodes)
+	idx, _ := ctx.State().LatestIndex()
+	shuffleNodes(ctx.Plan(), idx, nodes)
 
 	// Create a static iterator
 	return NewStaticIterator(ctx, nodes)
diff --git a/scheduler/scheduler.go b/scheduler/scheduler.go
index b472041e059b..11d66a054b6e 100644
--- a/scheduler/scheduler.go
+++ b/scheduler/scheduler.go
@@ -107,6 +107,9 @@ type State interface {
 
 	// CSIVolumeByID fetch CSI volumes, containing controller jobs
 	CSIVolumesByNodeID(memdb.WatchSet, string, string) (memdb.ResultIterator, error)
+
+	// LatestIndex returns the greatest index value for all indexes.
+	LatestIndex() (uint64, error)
 }
 
 // Planner interface is used to submit a task allocation plan.
diff --git a/scheduler/stack.go b/scheduler/stack.go
index d2b546107f73..b71916af0a99 100644
--- a/scheduler/stack.go
+++ b/scheduler/stack.go
@@ -70,7 +70,8 @@ type GenericStack struct {
 
 func (s *GenericStack) SetNodes(baseNodes []*structs.Node) {
 	// Shuffle base nodes
-	shuffleNodes(baseNodes)
+	idx, _ := s.ctx.State().LatestIndex()
+	shuffleNodes(s.ctx.Plan(), idx, baseNodes)
 
 	// Update the set of base nodes
 	s.source.SetNodes(baseNodes)
diff --git a/scheduler/util.go b/scheduler/util.go
index 869442b78f97..28113afba68e 100644
--- a/scheduler/util.go
+++ b/scheduler/util.go
@@ -1,6 +1,7 @@
 package scheduler
 
 import (
+	"encoding/binary"
 	"fmt"
 	"math/rand"
 	"reflect"
@@ -376,11 +377,25 @@ func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*struct
 	return out, nil
 }
 
-// shuffleNodes randomizes the slice order with the Fisher-Yates algorithm
-func shuffleNodes(nodes []*structs.Node) {
+// shuffleNodes randomizes the slice order with the Fisher-Yates
+// algorithm. We seed the random source with the eval ID (which is
+// random) to aid in postmortem debugging of specific evaluations and
+// state snapshots.
+func shuffleNodes(plan *structs.Plan, index uint64, nodes []*structs.Node) {
+
+	// use the last 4 bytes because those are the random bits
+	// if we have sortable IDs
+	buf := []byte(plan.EvalID)
+	seed := binary.BigEndian.Uint64(buf[len(buf)-8:])
+
+	// for retried plans the index is the plan result's RefreshIndex
+	// so that we don't retry with the exact same shuffle
+	seed ^= index
+	r := rand.New(rand.NewSource(int64(seed >> 2)))
+
 	n := len(nodes)
 	for i := n - 1; i > 0; i-- {
-		j := rand.Intn(i + 1)
+		j := r.Intn(i + 1)
 		nodes[i], nodes[j] = nodes[j], nodes[i]
 	}
 }
diff --git a/scheduler/util_test.go b/scheduler/util_test.go
index 1b5ec8d9a30a..f03114ba78a6 100644
--- a/scheduler/util_test.go
+++ b/scheduler/util_test.go
@@ -507,8 +507,17 @@ func TestShuffleNodes(t *testing.T) {
 	}
 	orig := make([]*structs.Node, len(nodes))
 	copy(orig, nodes)
-	shuffleNodes(nodes)
+	eval := mock.Eval() // will have random EvalID
+	plan := eval.MakePlan(mock.Job())
+	shuffleNodes(plan, 1000, nodes)
 	require.False(t, reflect.DeepEqual(nodes, orig))
+
+	nodes2 := make([]*structs.Node, len(nodes))
+	copy(nodes2, orig)
+	shuffleNodes(plan, 1000, nodes2)
+
+	require.True(t, reflect.DeepEqual(nodes, nodes2))
+
 }
 
 func TestTaskUpdatedAffinity(t *testing.T) {