diff --git a/pkg/scheduler/plugins/coscheduling/core/core.go b/pkg/scheduler/plugins/coscheduling/core/core.go index b114ed4f1..c0f2a11d5 100644 --- a/pkg/scheduler/plugins/coscheduling/core/core.go +++ b/pkg/scheduler/plugins/coscheduling/core/core.go @@ -73,6 +73,7 @@ type Manager interface { GetGangSummaries() map[string]*GangSummary IsGangMinSatisfied(*corev1.Pod) bool GetChildScheduleCycle(*corev1.Pod) int + GetGangGroupWaitingBoundPodNum(pod *corev1.Pod) int } // PodGroupManager defines the scheduling operation called @@ -552,3 +553,19 @@ func (pgMgr *PodGroupManager) GetChildScheduleCycle(pod *corev1.Pod) int { return gang.getChildScheduleCycle(pod) } + +func (pgMgr *PodGroupManager) GetGangGroupWaitingBoundPodNum(pod *corev1.Pod) int { + gang := pgMgr.GetGangByPod(pod) + if gang == nil { + return 0 + } + gangGroup := gang.GangGroup + waitingPodNum := 0 + for _, memberGangID := range gangGroup { + memberGang := pgMgr.cache.getGangFromCacheByGangId(memberGangID, false) + if memberGang != nil { + waitingPodNum += memberGang.getGangWaitingPods() + } + } + return waitingPodNum +} diff --git a/pkg/scheduler/plugins/coscheduling/coscheduling.go b/pkg/scheduler/plugins/coscheduling/coscheduling.go index 7b29f066b..f59a12eb8 100644 --- a/pkg/scheduler/plugins/coscheduling/coscheduling.go +++ b/pkg/scheduler/plugins/coscheduling/coscheduling.go @@ -135,28 +135,29 @@ func (cs *Coscheduling) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool { group1, _ := cs.pgMgr.GetGroupId(podInfo1.Pod) group2, _ := cs.pgMgr.GetGroupId(podInfo2.Pod) - if group1 != group2 { - return group1 < group2 - } - - isgang1satisfied := cs.pgMgr.IsGangMinSatisfied(podInfo1.Pod) - isgang2satisfied := cs.pgMgr.IsGangMinSatisfied(podInfo2.Pod) - if isgang1satisfied != isgang2satisfied { - return !isgang1satisfied - } - childScheduleCycle1 := cs.pgMgr.GetChildScheduleCycle(podInfo1.Pod) - childScheduleCycle2 := cs.pgMgr.GetChildScheduleCycle(podInfo2.Pod) - if childScheduleCycle1 != 
childScheduleCycle2 { - return childScheduleCycle1 < childScheduleCycle2 - } - - creationTime1 := cs.pgMgr.GetCreatTime(podInfo1) - creationTime2 := cs.pgMgr.GetCreatTime(podInfo2) - if creationTime1.Equal(creationTime2) { - return util.GetId(podInfo1.Pod.Namespace, podInfo1.Pod.Name) < util.GetId(podInfo2.Pod.Namespace, podInfo2.Pod.Name) + waitingBoundPodNum1 := cs.pgMgr.GetGangGroupWaitingBoundPodNum(podInfo1.Pod) + waitingBoundPodNum2 := cs.pgMgr.GetGangGroupWaitingBoundPodNum(podInfo2.Pod) + if waitingBoundPodNum1 != 0 || waitingBoundPodNum2 != 0 { + // At any one time, only member pods of one podGroup should be assumed, so we prefer the pod that already has siblings assumed; then they can succeed together. + if waitingBoundPodNum1 == 0 || waitingBoundPodNum2 == 0 { + return waitingBoundPodNum1 != 0 + } + /* + Two gang groups may both already have some assumed sibling pods. + For example: + 1. GroupA has submitted 6 members, and has 5 already assumed; + 2. then the sixth has been deleted; + 3. then GroupB submitted its pods and has 3 already assumed; + 4. GroupA submits the sixth pod + In this case, waitingPodNum will make no sense, so just sort by group id to give a fixed order. + Because no matter whether the former succeeds or fails, its waitingPodNum will be zeroed. And the deadlock will be avoided + */ + return group1 < group2 } - return creationTime1.Before(creationTime2) + // If no pod has succeeded, we will schedule all pods by RoundRobin to assure fairness. + // If some time-consuming member pod of one gang failed, then its siblings will fail soon (because the scheduling cycle is invalid), so there is no need to assure that all siblings fail together.
+ return podInfo1.Timestamp.Before(podInfo2.Timestamp) } // PreFilter diff --git a/pkg/scheduler/plugins/coscheduling/coscheduling_test.go b/pkg/scheduler/plugins/coscheduling/coscheduling_test.go index 8299e1aae..b3535131a 100644 --- a/pkg/scheduler/plugins/coscheduling/coscheduling_test.go +++ b/pkg/scheduler/plugins/coscheduling/coscheduling_test.go @@ -382,12 +382,12 @@ func TestLess(t *testing.T) { { name: "equal priority, but p1 is added to schedulingQ earlier than p2", p1: &framework.QueuedPodInfo{ - PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangA_ns).Name("pod1").Priority(highPriority).Label(extension.LabelPodPriority, lowSubPriority).Obj()), - InitialAttemptTimestamp: earltTime, + PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangA_ns).Name("pod1").Priority(highPriority).Label(extension.LabelPodPriority, lowSubPriority).Obj()), + Timestamp: earltTime, }, p2: &framework.QueuedPodInfo{ - PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangB_ns).Name("pod2").Priority(highPriority).Label(extension.LabelPodPriority, lowSubPriority).Obj()), - InitialAttemptTimestamp: lateTime, + PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangB_ns).Name("pod2").Priority(highPriority).Label(extension.LabelPodPriority, lowSubPriority).Obj()), + Timestamp: lateTime, }, expected: true, // p1 should be ahead of p2 in the queue }, @@ -404,12 +404,12 @@ func TestLess(t *testing.T) { { name: "equal priority, p1 is added to schedulingQ earlier than p2", p1: &framework.QueuedPodInfo{ - PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangB_ns).Name("pod1").Priority(highPriority).Obj()), - InitialAttemptTimestamp: earltTime, + PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangB_ns).Name("pod1").Priority(highPriority).Obj()), + Timestamp: earltTime, }, p2: &framework.QueuedPodInfo{ - PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangA_ns).Name("pod2").Priority(highPriority).Obj()), - InitialAttemptTimestamp: lateTime, + PodInfo: 
framework.NewPodInfo(st.MakePod().Namespace(gangA_ns).Name("pod2").Priority(highPriority).Obj()), + Timestamp: lateTime, }, expected: true, // p1 should be ahead of p2 in the queue }, @@ -439,27 +439,27 @@ func TestLess(t *testing.T) { { name: "equal priority. p2 is added to schedulingQ earlier than p1, p1 belongs to gangA and p2 belongs to gangB", p1: &framework.QueuedPodInfo{ - PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangA_ns).Name("pod1").Priority(highPriority).Obj()), - InitialAttemptTimestamp: lateTime, + PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangA_ns).Name("pod1").Priority(highPriority).Obj()), + Timestamp: lateTime, }, annotations: map[string]string{extension.AnnotationGangName: "gangA"}, p2: &framework.QueuedPodInfo{ - PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangB_ns).Name("pod2").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangB").Obj()), - InitialAttemptTimestamp: earltTime, + PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangB_ns).Name("pod2").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangB").Obj()), + Timestamp: earltTime, }, - expected: true, // p1 should be ahead of p2 in the queue + expected: false, // p2 should be ahead of p1 in the queue }, { name: "equal priority and creation time, both belongs to gangB", p1: &framework.QueuedPodInfo{ - PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangB_ns).Name("pod1").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangB").Obj()), - InitialAttemptTimestamp: lateTime, + PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangB_ns).Name("pod1").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangB").Obj()), + Timestamp: lateTime, }, p2: &framework.QueuedPodInfo{ - PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangB_ns).Name("pod2").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangB").Obj()), - InitialAttemptTimestamp: earltTime, + PodInfo:
framework.NewPodInfo(st.MakePod().Namespace(gangB_ns).Name("pod2").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangB").Obj()), + Timestamp: earltTime, }, - expected: true, // p1 should be ahead of p2 in the queue + expected: false, }, { name: "equal priority and creation time, both belongs to gangB, childScheduleCycle not equal", @@ -478,14 +478,14 @@ func TestLess(t *testing.T) { { name: "equal priority and creation time, p1 belongs to gangA that has been satisfied", p1: &framework.QueuedPodInfo{ - PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangB_ns).Name("pod1").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangD").Obj()), - InitialAttemptTimestamp: lateTime, + PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangB_ns).Name("pod1").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangD").Obj()), + Timestamp: lateTime, }, p2: &framework.QueuedPodInfo{ - PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangC_ns).Name("pod2").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangC").Obj()), - InitialAttemptTimestamp: earltTime, + PodInfo: framework.NewPodInfo(st.MakePod().Namespace(gangC_ns).Name("pod2").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangC").Obj()), + Timestamp: earltTime, }, - expected: true, // p1 should be ahead of p2 in the queue + expected: false, }, } { t.Run(tt.name, func(t *testing.T) {