From 4c9822c56570ebb70dcedbafac08b89f7321f00d Mon Sep 17 00:00:00 2001
From: wangjianyu
Date: Thu, 11 Apr 2024 13:40:34 +0800
Subject: [PATCH] scheduler: sort gang of same gangGroup by gangId (#1997)

Signed-off-by: wangjianyu.wjy
Co-authored-by: wangjianyu.wjy
Signed-off-by: george
---
 .../plugins/coscheduling/core/gang.go         |  9 ++--
 .../plugins/coscheduling/coscheduling.go      | 21 ++++-----
 .../plugins/coscheduling/coscheduling_test.go | 43 ++++++++++++++++---
 3 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/pkg/scheduler/plugins/coscheduling/core/gang.go b/pkg/scheduler/plugins/coscheduling/core/gang.go
index 0a7271d82..6e64a281e 100644
--- a/pkg/scheduler/plugins/coscheduling/core/gang.go
+++ b/pkg/scheduler/plugins/coscheduling/core/gang.go
@@ -390,12 +390,9 @@ func (gang *Gang) setScheduleCycleValid(valid bool) {
 
 	if !valid && !gang.ScheduleCycleValid {
 		/*
-			let's explain why by following example, there are three gang of one group: A, B and C, every gang group have min-member of 10 pod, noted as A1-A10, B1-B10, C1-C10.
-			1. A1-A5 assumed in gangA scheduleCycle 1, B1-B5 assumed in gangA scheduleCycle 1, C1-C5 assumed in gangA scheduleCycle 1, C6 filter failed due to insufficient resource, then A6-10 failed due to cycle invalid, C7-C10 failed due to cycle invalid, B6-B9 failed due to cycle invalid
-			2. A1-A5 assumed in gangA scheduleCycle 2, C1-C5 assumed in gangA scheduleCycle 2, C6 failed due to insufficient resource and gangA\B\C cycle set to invalid,
-			3. then A6-A10,C7-C10 failed due to cycle invalid
-			4. then B10 failed due to cycle invalid, B1 comes and find its gang scheduling cycle 2 can be valid, so it's assumed.
-			5. however all it's sibling will come until next round of all gangs, these gangs will face insufficient resource due to pre-allocated of B10
+			let's explain why by the following example: there are two gangs in one gang group, F and G, each with a min-member of 10 pods, noted as F1-F10 and G1-G10.
+			1. F1 failed due to insufficient resource, F2-F10 failed due to cycle invalid,
+			2. then G1-G10 are assumed; however, all their sibling pods will only come in the next round of all gangs, and these gangs will face insufficient resource due to the pre-allocated G1-G10
 			TODO this logic can be optimized by give gangs of same group same scheduling cycle
 		*/
 
diff --git a/pkg/scheduler/plugins/coscheduling/coscheduling.go b/pkg/scheduler/plugins/coscheduling/coscheduling.go
index 46531cf3f..0768cbd2f 100644
--- a/pkg/scheduler/plugins/coscheduling/coscheduling.go
+++ b/pkg/scheduler/plugins/coscheduling/coscheduling.go
@@ -140,20 +140,15 @@ func (cs *Coscheduling) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool {
 	}
 	gangId1 := util.GetId(podInfo1.Pod.Namespace, util.GetGangNameByPod(podInfo1.Pod))
 	gangId2 := util.GetId(podInfo2.Pod.Namespace, util.GetGangNameByPod(podInfo2.Pod))
-	// for member gang of same gangGroup, the gang that haven’t been satisfied yet take precedence
+
 	if gangId1 != gangId2 {
-		isGang1Satisfied := cs.pgMgr.IsGangMinSatisfied(podInfo1.Pod)
-		isGang2Satisfied := cs.pgMgr.IsGangMinSatisfied(podInfo2.Pod)
-		if isGang1Satisfied != isGang2Satisfied {
-			return !isGang1Satisfied
-		}
-	} else {
-		// for member pod of same gang, the pod with the smaller scheduling cycle take precedence so that gang scheduling cycle can be valid and iterated
-		childScheduleCycle1 := cs.pgMgr.GetChildScheduleCycle(podInfo1.Pod)
-		childScheduleCycle2 := cs.pgMgr.GetChildScheduleCycle(podInfo2.Pod)
-		if childScheduleCycle1 != childScheduleCycle2 {
-			return childScheduleCycle1 < childScheduleCycle2
-		}
+		return gangId1 < gangId2
+	}
+	// for member pod of same gang, the pod with the smaller scheduling cycle take precedence so that gang scheduling cycle can be valid and iterated
+	childScheduleCycle1 := cs.pgMgr.GetChildScheduleCycle(podInfo1.Pod)
+	childScheduleCycle2 := cs.pgMgr.GetChildScheduleCycle(podInfo2.Pod)
+	if childScheduleCycle1 != childScheduleCycle2 {
+		return childScheduleCycle1 < childScheduleCycle2
 	}
 	return podInfo1.Timestamp.Before(podInfo2.Timestamp)
 }
diff --git a/pkg/scheduler/plugins/coscheduling/coscheduling_test.go b/pkg/scheduler/plugins/coscheduling/coscheduling_test.go
index a36460905..69b4a5fc8 100644
--- a/pkg/scheduler/plugins/coscheduling/coscheduling_test.go
+++ b/pkg/scheduler/plugins/coscheduling/coscheduling_test.go
@@ -243,6 +243,7 @@ func TestLess(t *testing.T) {
 	var lowSubPriority, highSubPriority = "111", "222"
 	var gangA_ns, gangB_ns = "namespace1", "namespace2"
 	gangC_ns := "namespace3"
+	gangGroupNS := "namespace4"
 	now := time.Now()
 	earltTime := now.Add(1 * time.Second)
 	lateTime := now.Add(3 * time.Second)
@@ -282,10 +283,16 @@ func TestLess(t *testing.T) {
 	pg2 := makePg("gangC", gangC_ns, 1, &gangCCreatTime, nil)
 	// GangD by PodGroup
 	pg3 := makePg("gangD", gangC_ns, 1, &gangCCreatTime, nil)
+	gangGroup := []string{"default/gangD", "default/gangE"}
+	rawGangGroup, err := json.Marshal(gangGroup)
+	assert.NoError(t, err)
+	pg4 := makePg("gang4", gangGroupNS, 0, nil, nil)
+	pg5 := makePg("gang5", gangGroupNS, 0, nil, nil)
+	pg4.Annotations = map[string]string{extension.AnnotationGangGroups: string(rawGangGroup)}
 	suit.start()
 
 	// create gangA and gangB
-	err := retry.OnError(
+	err = retry.OnError(
 		retry.DefaultRetry,
 		errors.IsTooManyRequests,
 		func() error {
@@ -329,6 +336,28 @@ func TestLess(t *testing.T) {
 	if err != nil {
 		t.Errorf("retry pgClient create pg err: %v", err)
 	}
+	err = retry.OnError(
+		retry.DefaultRetry,
+		errors.IsTooManyRequests,
+		func() error {
+			var err error
+			_, err = pgClientSet.SchedulingV1alpha1().PodGroups(gangGroupNS).Create(context.TODO(), pg4, metav1.CreateOptions{})
+			return err
+		})
+	if err != nil {
+		t.Errorf("retry pgClient create pg err: %v", err)
+	}
+	err = retry.OnError(
+		retry.DefaultRetry,
+		errors.IsTooManyRequests,
+		func() error {
+			var err error
+			_, err = pgClientSet.SchedulingV1alpha1().PodGroups(gangGroupNS).Create(context.TODO(), pg5, metav1.CreateOptions{})
+			return err
+		})
+	if err != nil {
+		t.Errorf("retry pgClient create pg err: %v", err)
+	}
 	err = retry.OnError(
 		retry.DefaultRetry,
 		errors.IsTooManyRequests,
@@ -472,13 +501,13 @@ func TestLess(t *testing.T) {
 			expected: false, // p1 should be ahead of p2 in the queue
 		},
 		{
-			name: "equal priority and creation time, p1 belongs to gang that has been satisfied",
+			name: "equal priority, p1 belongs to different gangs of one gangGroup, sort by gangID",
 			p1: &framework.QueuedPodInfo{
-				PodInfo:   framework.NewPodInfo(st.MakePod().Namespace(gangC_ns).Name("pod1").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangD").Obj()),
+				PodInfo:   framework.NewPodInfo(st.MakePod().Namespace(gangGroupNS).Name("pod1").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gang4").Obj()),
 				Timestamp: earltTime,
 			},
 			p2: &framework.QueuedPodInfo{
-				PodInfo:   framework.NewPodInfo(st.MakePod().Namespace(gangC_ns).Name("pod2").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangC").Obj()),
+				PodInfo:   framework.NewPodInfo(st.MakePod().Namespace(gangGroupNS).Name("pod2").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gang5").Obj()),
 				Timestamp: earltTime,
 			},
 			expected: true,
 		},
@@ -915,7 +944,7 @@ func TestFairness(t *testing.T) {
 		"gangJ": {"default/gangH", "default/gangI", "default/gangJ"},
 	}
 	gangMinRequiredNums := []int{20, 10, 32, 20, 20, 18, 43, 20, 30, 20}
-	gangInjectFilterError := []bool{false, false, true, false, false, true, true, false, false, true}
+	gangInjectFilterError := []bool{false, false, true, false, false, true, false, false, false, true}
 	var gangInjectFilterErrorIndex []int
 	for _, gangMinRequiredNum := range gangMinRequiredNums {
 		gangInjectFilterErrorIndex = append(gangInjectFilterErrorIndex, rand.Intn(gangMinRequiredNum))
 	}
@@ -996,8 +1025,8 @@ func TestFairness(t *testing.T) {
 		ctx.Done(),
 		scheduler.WithProfiles(cfg.Profiles...),
 		scheduler.WithFrameworkOutOfTreeRegistry(registry),
-		scheduler.WithPodInitialBackoffSeconds(1),
-		scheduler.WithPodMaxBackoffSeconds(1),
+		scheduler.WithPodInitialBackoffSeconds(0),
+		scheduler.WithPodMaxBackoffSeconds(0),
 		scheduler.WithPodMaxInUnschedulablePodsDuration(0),
 	)
 	assert.NoError(t, err)
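Illustration (not part of the patch): a minimal, standalone sketch of the queue ordering that the patched Less applies once two pods already compare equal on priority and sub-priority — first by gangId, then by the gang's child schedule cycle, then by enqueue timestamp. The pseudoPod type, the less helper, and all literal values below are hypothetical names invented for this example; only the comparison order is taken from the patch.

package main

import (
	"fmt"
	"sort"
	"time"
)

// pseudoPod is a hypothetical stand-in for a queued pod plus the gang
// metadata that the plugin looks up through its PodGroup manager.
type pseudoPod struct {
	name               string
	gangId             string // "namespace/gangName", as built by util.GetId in the plugin
	childScheduleCycle int
	timestamp          time.Time
}

// less mirrors the tie-break order introduced by the patch:
// gangId first, then child schedule cycle, then enqueue timestamp.
func less(a, b pseudoPod) bool {
	if a.gangId != b.gangId {
		return a.gangId < b.gangId
	}
	if a.childScheduleCycle != b.childScheduleCycle {
		return a.childScheduleCycle < b.childScheduleCycle
	}
	return a.timestamp.Before(b.timestamp)
}

func main() {
	now := time.Now()
	pods := []pseudoPod{
		{name: "g5-pod", gangId: "ns/gang5", childScheduleCycle: 1, timestamp: now},
		{name: "g4-pod-cycle2", gangId: "ns/gang4", childScheduleCycle: 2, timestamp: now},
		{name: "g4-pod-cycle1", gangId: "ns/gang4", childScheduleCycle: 1, timestamp: now.Add(time.Second)},
	}
	sort.Slice(pods, func(i, j int) bool { return less(pods[i], pods[j]) })
	for _, p := range pods {
		fmt.Println(p.name) // prints g4-pod-cycle1, g4-pod-cycle2, g5-pod
	}
}

Because gangId is compared before the per-gang schedule cycle, all queued pods of one gang in a gangGroup are drained before any pod of the next gang, which is what the new TestLess case asserts by expecting the gang4 pod to sort ahead of the gang5 pod.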