From 4c9822c56570ebb70dcedbafac08b89f7321f00d Mon Sep 17 00:00:00 2001
From: wangjianyu
Date: Thu, 11 Apr 2024 13:40:34 +0800
Subject: [PATCH] scheduler: sort gang of same gangGroup by gangId (#1997)

Signed-off-by: wangjianyu.wjy
Co-authored-by: wangjianyu.wjy
Signed-off-by: george
---
 .../plugins/coscheduling/core/gang.go         |  9 ++--
 .../plugins/coscheduling/coscheduling.go      | 21 ++++-----
 .../plugins/coscheduling/coscheduling_test.go | 43 ++++++++++++++++---
 3 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/pkg/scheduler/plugins/coscheduling/core/gang.go b/pkg/scheduler/plugins/coscheduling/core/gang.go
index 0a7271d82..6e64a281e 100644
--- a/pkg/scheduler/plugins/coscheduling/core/gang.go
+++ b/pkg/scheduler/plugins/coscheduling/core/gang.go
@@ -390,12 +390,9 @@ func (gang *Gang) setScheduleCycleValid(valid bool) {
 
 	if !valid && !gang.ScheduleCycleValid {
 		/*
-			let's explain why by following example, there are three gang of one group: A, B and C, every gang group have min-member of 10 pod, noted as A1-A10, B1-B10, C1-C10.
-			1. A1-A5 assumed in gangA scheduleCycle 1, B1-B5 assumed in gangA scheduleCycle 1, C1-C5 assumed in gangA scheduleCycle 1, C6 filter failed due to insufficient resource, then A6-10 failed due to cycle invalid, C7-C10 failed due to cycle invalid, B6-B9 failed due to cycle invalid
-			2. A1-A5 assumed in gangA scheduleCycle 2, C1-C5 assumed in gangA scheduleCycle 2, C6 failed due to insufficient resource and gangA\B\C cycle set to invalid,
-			3. then A6-A10,C7-C10 failed due to cycle invalid
-			4. then B10 failed due to cycle invalid, B1 comes and find its gang scheduling cycle 2 can be valid, so it's assumed.
-			5. however all it's sibling will come until next round of all gangs, these gangs will face insufficient resource due to pre-allocated of B10
+			let's explain why by the following example: there are two gangs in one gang group, F and G, each with a min-member of 10 pods, noted as F1-F10 and G1-G10.
+			1. F1 failed due to insufficient resource, F2-F10 failed due to cycle invalid,
+			2. then G1-G10 are assumed; however, all their sibling pods will only come in the next round of all gangs, and these gangs will face insufficient resource due to the pre-allocated G1-G10
 			TODO this logic can be optimized by give gangs of same group same scheduling cycle
 		*/
 
diff --git a/pkg/scheduler/plugins/coscheduling/coscheduling.go b/pkg/scheduler/plugins/coscheduling/coscheduling.go
index 46531cf3f..0768cbd2f 100644
--- a/pkg/scheduler/plugins/coscheduling/coscheduling.go
+++ b/pkg/scheduler/plugins/coscheduling/coscheduling.go
@@ -140,20 +140,15 @@ func (cs *Coscheduling) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool {
 	}
 	gangId1 := util.GetId(podInfo1.Pod.Namespace, util.GetGangNameByPod(podInfo1.Pod))
 	gangId2 := util.GetId(podInfo2.Pod.Namespace, util.GetGangNameByPod(podInfo2.Pod))
-	// for member gang of same gangGroup, the gang that haven’t been satisfied yet take precedence
+
 	if gangId1 != gangId2 {
-		isGang1Satisfied := cs.pgMgr.IsGangMinSatisfied(podInfo1.Pod)
-		isGang2Satisfied := cs.pgMgr.IsGangMinSatisfied(podInfo2.Pod)
-		if isGang1Satisfied != isGang2Satisfied {
-			return !isGang1Satisfied
-		}
-	} else {
-		// for member pod of same gang, the pod with the smaller scheduling cycle take precedence so that gang scheduling cycle can be valid and iterated
-		childScheduleCycle1 := cs.pgMgr.GetChildScheduleCycle(podInfo1.Pod)
-		childScheduleCycle2 := cs.pgMgr.GetChildScheduleCycle(podInfo2.Pod)
-		if childScheduleCycle1 != childScheduleCycle2 {
-			return childScheduleCycle1 < childScheduleCycle2
-		}
+		return gangId1 < gangId2
+	}
+	// for member pod of same gang, the pod with the smaller scheduling cycle take precedence so that gang scheduling cycle can be valid and iterated
+	childScheduleCycle1 := cs.pgMgr.GetChildScheduleCycle(podInfo1.Pod)
+	childScheduleCycle2 := cs.pgMgr.GetChildScheduleCycle(podInfo2.Pod)
+	if childScheduleCycle1 != childScheduleCycle2 {
+		return childScheduleCycle1 < childScheduleCycle2
 	}
 	return podInfo1.Timestamp.Before(podInfo2.Timestamp)
 }
diff --git a/pkg/scheduler/plugins/coscheduling/coscheduling_test.go b/pkg/scheduler/plugins/coscheduling/coscheduling_test.go
index a36460905..69b4a5fc8 100644
--- a/pkg/scheduler/plugins/coscheduling/coscheduling_test.go
+++ b/pkg/scheduler/plugins/coscheduling/coscheduling_test.go
@@ -243,6 +243,7 @@ func TestLess(t *testing.T) {
 	var lowSubPriority, highSubPriority = "111", "222"
 	var gangA_ns, gangB_ns = "namespace1", "namespace2"
 	gangC_ns := "namespace3"
+	gangGroupNS := "namespace4"
 	now := time.Now()
 	earltTime := now.Add(1 * time.Second)
 	lateTime := now.Add(3 * time.Second)
@@ -282,10 +283,16 @@ func TestLess(t *testing.T) {
 	pg2 := makePg("gangC", gangC_ns, 1, &gangCCreatTime, nil)
 	// GangD by PodGroup
 	pg3 := makePg("gangD", gangC_ns, 1, &gangCCreatTime, nil)
+	gangGroup := []string{"default/gangD", "default/gangE"}
+	rawGangGroup, err := json.Marshal(gangGroup)
+	assert.NoError(t, err)
+	pg4 := makePg("gang4", gangGroupNS, 0, nil, nil)
+	pg5 := makePg("gang5", gangGroupNS, 0, nil, nil)
+	pg4.Annotations = map[string]string{extension.AnnotationGangGroups: string(rawGangGroup)}
 	suit.start()
 
 	// create gangA and gangB
-	err := retry.OnError(
+	err = retry.OnError(
 		retry.DefaultRetry,
 		errors.IsTooManyRequests,
 		func() error {
@@ -329,6 +336,28 @@ func TestLess(t *testing.T) {
 	if err != nil {
 		t.Errorf("retry pgClient create pg err: %v", err)
 	}
+	err = retry.OnError(
+		retry.DefaultRetry,
+		errors.IsTooManyRequests,
+		func() error {
+			var err error
+			_, err = pgClientSet.SchedulingV1alpha1().PodGroups(gangGroupNS).Create(context.TODO(), pg4, metav1.CreateOptions{})
+			return err
+		})
+	if err != nil {
+		t.Errorf("retry pgClient create pg err: %v", err)
+	}
+	err = retry.OnError(
+		retry.DefaultRetry,
+		errors.IsTooManyRequests,
+		func() error {
+			var err error
+			_, err = pgClientSet.SchedulingV1alpha1().PodGroups(gangGroupNS).Create(context.TODO(), pg5, metav1.CreateOptions{})
+			return err
+		})
+	if err != nil {
+		t.Errorf("retry pgClient create pg err: %v", err)
+	}
 	err = retry.OnError(
 		retry.DefaultRetry,
 		errors.IsTooManyRequests,
@@ -472,13 +501,13 @@ func TestLess(t *testing.T) {
 			expected: false, // p1 should be ahead of p2 in the queue
 		},
 		{
-			name: "equal priority and creation time, p1 belongs to gang that has been satisfied",
+			name: "equal priority, p1 belongs to different gangs of one gangGroup, sort by gangID",
 			p1: &framework.QueuedPodInfo{
-				PodInfo:   framework.NewPodInfo(st.MakePod().Namespace(gangC_ns).Name("pod1").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangD").Obj()),
+				PodInfo:   framework.NewPodInfo(st.MakePod().Namespace(gangGroupNS).Name("pod1").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gang4").Obj()),
 				Timestamp: earltTime,
 			},
 			p2: &framework.QueuedPodInfo{
-				PodInfo:   framework.NewPodInfo(st.MakePod().Namespace(gangC_ns).Name("pod2").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangC").Obj()),
+				PodInfo:   framework.NewPodInfo(st.MakePod().Namespace(gangGroupNS).Name("pod2").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gang5").Obj()),
 				Timestamp: earltTime,
 			},
 			expected: true,
 		},
@@ -915,7 +944,7 @@ func TestFairness(t *testing.T) {
 		"gangJ": {"default/gangH", "default/gangI", "default/gangJ"},
 	}
 	gangMinRequiredNums := []int{20, 10, 32, 20, 20, 18, 43, 20, 30, 20}
-	gangInjectFilterError := []bool{false, false, true, false, false, true, true, false, false, true}
+	gangInjectFilterError := []bool{false, false, true, false, false, true, false, false, false, true}
 	var gangInjectFilterErrorIndex []int
 	for _, gangMinRequiredNum := range gangMinRequiredNums {
 		gangInjectFilterErrorIndex = append(gangInjectFilterErrorIndex, rand.Intn(gangMinRequiredNum))
 	}
@@ -996,8 +1025,8 @@ func TestFairness(t *testing.T) {
 		ctx.Done(),
 		scheduler.WithProfiles(cfg.Profiles...),
 		scheduler.WithFrameworkOutOfTreeRegistry(registry),
-		scheduler.WithPodInitialBackoffSeconds(1),
-		scheduler.WithPodMaxBackoffSeconds(1),
+		scheduler.WithPodInitialBackoffSeconds(0),
+		scheduler.WithPodMaxBackoffSeconds(0),
 		scheduler.WithPodMaxInUnschedulablePodsDuration(0),
 	)
 	assert.NoError(t, err)
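Illustration (not part of the patch): a minimal, standalone sketch of the queue ordering that the patched Less applies once two pods already compare equal on priority and sub-priority — first by gangId, then by the gang's child schedule cycle, then by enqueue timestamp. The pseudoPod type, the less helper, and all literal values below are hypothetical names invented for this example; only the comparison order is taken from the patch.

package main

import (
	"fmt"
	"sort"
	"time"
)

// pseudoPod is a hypothetical stand-in for a queued pod plus the gang
// metadata that the plugin looks up through its PodGroup manager.
type pseudoPod struct {
	name               string
	gangId             string // "namespace/gangName", as built by util.GetId in the plugin
	childScheduleCycle int
	timestamp          time.Time
}

// less mirrors the tie-break order introduced by the patch:
// gangId first, then child schedule cycle, then enqueue timestamp.
func less(a, b pseudoPod) bool {
	if a.gangId != b.gangId {
		return a.gangId < b.gangId
	}
	if a.childScheduleCycle != b.childScheduleCycle {
		return a.childScheduleCycle < b.childScheduleCycle
	}
	return a.timestamp.Before(b.timestamp)
}

func main() {
	now := time.Now()
	pods := []pseudoPod{
		{name: "g5-pod", gangId: "ns/gang5", childScheduleCycle: 1, timestamp: now},
		{name: "g4-pod-cycle2", gangId: "ns/gang4", childScheduleCycle: 2, timestamp: now},
		{name: "g4-pod-cycle1", gangId: "ns/gang4", childScheduleCycle: 1, timestamp: now.Add(time.Second)},
	}
	sort.Slice(pods, func(i, j int) bool { return less(pods[i], pods[j]) })
	for _, p := range pods {
		fmt.Println(p.name) // prints g4-pod-cycle1, g4-pod-cycle2, g5-pod
	}
}

Because gangId is compared before the per-gang schedule cycle, all queued pods of one gang in a gangGroup are drained before any pod of the next gang, which is what the new TestLess case asserts by expecting the gang4 pod to sort ahead of the gang5 pod.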