From e1bf8c32dac41ddfcea076c8ef97cf885220073e Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Tue, 7 Jun 2022 13:31:10 -0400
Subject: [PATCH] CSI: no early return when feasibility check fails on eligible nodes (#13274)

As a performance optimization in the scheduler, feasibility checks
that apply to an entire class are run only once for all nodes of that
class. Other feasibility checks are "available" checks because they
rely on more ephemeral characteristics and don't contribute to the
hash for the node class. This currently includes only CSI.

We have a separate fast path for "available" checks when the node has
already been marked eligible on the basis of class. This fast path
has a bug where it returns early rather than continuing the loop,
which causes the entire task group to be rejected.

Fix the bug by not returning early in the fast path and instead
jumping to the top of the loop like all the other code paths in this
method. Includes a new test exercising topology at the
whole-scheduler level and a fix for an existing test that should've
caught this previously.
---
 .changelog/13274.txt            |  3 ++
 scheduler/feasible.go           |  5 +-
 scheduler/generic_sched_test.go | 89 ++++++++++++++++++++++++++++++++-
 3 files changed, 93 insertions(+), 4 deletions(-)
 create mode 100644 .changelog/13274.txt

diff --git a/.changelog/13274.txt b/.changelog/13274.txt
new file mode 100644
index 000000000000..dc5f84f6087d
--- /dev/null
+++ b/.changelog/13274.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+csi: Fixed a scheduler bug where failed feasibility checks would return early and prevent processing additional nodes
+```
diff --git a/scheduler/feasible.go b/scheduler/feasible.go
index 2fcf1a325a6d..18021d3edb68 100644
--- a/scheduler/feasible.go
+++ b/scheduler/feasible.go
@@ -1135,9 +1135,8 @@ OUTER:
 			if w.available(option) {
 				return option
 			}
-			// We match the class but are temporarily unavailable, the eval
-			// should be blocked
-			return nil
+			// We match the class but are temporarily unavailable
+			continue OUTER
 		case EvalComputedClassEscaped:
 			tgEscaped = true
 		case EvalComputedClassUnknown:
diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go
index f76e03162b71..7b47768a4eee 100644
--- a/scheduler/generic_sched_test.go
+++ b/scheduler/generic_sched_test.go
@@ -6353,7 +6353,7 @@ func TestServiceSched_CSIVolumesPerAlloc(t *testing.T) {
 
 	require.NotEqual("", h.Evals[1].BlockedEval, "expected a blocked eval to be spawned")
 	require.Equal(2, h.Evals[1].QueuedAllocations["web"], "expected 2 queued allocs")
-	require.Equal(1, h.Evals[1].FailedTGAllocs["web"].
+	require.Equal(5, h.Evals[1].FailedTGAllocs["web"].
 		ConstraintFiltered["missing CSI Volume volume-unique[3]"])
 
 	// Upsert 2 more per-alloc volumes
@@ -6397,6 +6397,93 @@ func TestServiceSched_CSIVolumesPerAlloc(t *testing.T) {
 
 }
 
+func TestServiceSched_CSITopology(t *testing.T) {
+	ci.Parallel(t)
+	h := NewHarness(t)
+
+	zones := []string{"zone-0", "zone-1", "zone-2", "zone-3"}
+
+	// Create some nodes, each running a CSI plugin with topology for
+	// a different "zone"
+	for i := 0; i < 12; i++ {
+		node := mock.Node()
+		node.Datacenter = zones[i%4]
+		node.CSINodePlugins = map[string]*structs.CSIInfo{
+			"test-plugin-" + zones[i%4]: {
+				PluginID: "test-plugin-" + zones[i%4],
+				Healthy:  true,
+				NodeInfo: &structs.CSINodeInfo{
+					MaxVolumes: 3,
+					AccessibleTopology: &structs.CSITopology{
+						Segments: map[string]string{"zone": zones[i%4]}},
+				},
+			},
+		}
+		require.NoError(t, h.State.UpsertNode(
+			structs.MsgTypeTestSetup, h.NextIndex(), node))
+	}
+
+	// create 2 per-alloc volumes for those zones
+	vol0 := structs.NewCSIVolume("myvolume[0]", 0)
+	vol0.PluginID = "test-plugin-zone-0"
+	vol0.Namespace = structs.DefaultNamespace
+	vol0.AccessMode = structs.CSIVolumeAccessModeSingleNodeWriter
+	vol0.AttachmentMode = structs.CSIVolumeAttachmentModeFilesystem
+	vol0.RequestedTopologies = &structs.CSITopologyRequest{
+		Required: []*structs.CSITopology{{
+			Segments: map[string]string{"zone": "zone-0"},
+		}},
+	}
+
+	vol1 := vol0.Copy()
+	vol1.ID = "myvolume[1]"
+	vol1.PluginID = "test-plugin-zone-1"
+	vol1.RequestedTopologies.Required[0].Segments["zone"] = "zone-1"
+
+	require.NoError(t, h.State.UpsertCSIVolume(
+		h.NextIndex(), []*structs.CSIVolume{vol0, vol1}))
+
+	// Create a job that uses those volumes
+	job := mock.Job()
+	job.Datacenters = zones
+	job.TaskGroups[0].Count = 2
+	job.TaskGroups[0].Volumes = map[string]*structs.VolumeRequest{
+		"myvolume": {
+			Type:     "csi",
+			Name:     "unique",
+			Source:   "myvolume",
+			PerAlloc: true,
+		},
+	}
+
+	require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))
+
+	// Create a mock evaluation to register the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+
+	require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup,
+		h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation and expect a single plan without annotations
+	err := h.Process(NewServiceScheduler, eval)
+	require.NoError(t, err)
+	require.Len(t, h.Plans, 1, "expected one plan")
+	require.Nil(t, h.Plans[0].Annotations, "expected no annotations")
+
+	// Expect the eval has not spawned a blocked eval
+	require.Equal(t, len(h.CreateEvals), 0)
+	require.Equal(t, "", h.Evals[0].BlockedEval, "did not expect a blocked eval")
+	require.Equal(t, structs.EvalStatusComplete, h.Evals[0].Status)
+
+}
+
 // TestPropagateTaskState asserts that propagateTaskState only copies state
 // when the previous allocation is lost or draining.
 func TestPropagateTaskState(t *testing.T) {
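
Illustrative sketch (not part of the patch): the one-line change in scheduler/feasible.go replaces an early `return nil` with a labeled `continue OUTER`, so a node that matches its class but fails the ephemeral "available" check (currently only CSI) is skipped instead of ending iteration for the whole task group. The self-contained Go program below shows only that control-flow pattern; the `node` type and `nextFeasible` function are hypothetical stand-ins, not the scheduler's real iterator types.

```go
package main

import "fmt"

// node is a hypothetical stand-in for a scheduler node option.
type node struct {
	name      string
	eligible  bool // class-level eligibility (hashed per node class)
	available bool // ephemeral "available" check, e.g. CSI
}

// nextFeasible mirrors the shape of the fixed loop: a per-node
// availability failure skips that node and keeps iterating, rather
// than returning nil and rejecting the whole task group.
func nextFeasible(options []node) *node {
OUTER:
	for i := range options {
		option := &options[i]
		if !option.eligible {
			continue OUTER
		}
		// Fast path for nodes already eligible on the basis of class.
		if option.available {
			return option
		}
		// Matches the class but is temporarily unavailable: move on to
		// the next candidate instead of returning nil (the old bug).
		continue OUTER
	}
	return nil
}

func main() {
	options := []node{
		// Before the fix, this node's failed availability check would
		// have ended iteration for the entire task group.
		{name: "node-0", eligible: true, available: false},
		{name: "node-1", eligible: true, available: true},
	}
	if opt := nextFeasible(options); opt != nil {
		fmt.Println("placed on", opt.name)
	} else {
		fmt.Println("no feasible node; eval would be blocked")
	}
}
```

That early `return nil` is also why the existing test expectation changes from 1 to 5 constraint-filtered nodes in TestServiceSched_CSIVolumesPerAlloc: with the fix, every infeasible node is now visited and counted, and the new TestServiceSched_CSITopology exercises the same path end to end through the scheduler.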