scheduler: sort gang of same gangGroup by gangId (#1997)
Signed-off-by: wangjianyu.wjy <wangjianyu.wjy@alibaba-inc.com>
Co-authored-by: wangjianyu.wjy <wangjianyu.wjy@alibaba-inc.com>
ZiMengSheng and wangjianyu.wjy committed Apr 11, 2024
1 parent 10567b8 commit 252c29d
Showing 3 changed files with 47 additions and 26 deletions.
9 changes: 3 additions & 6 deletions pkg/scheduler/plugins/coscheduling/core/gang.go
@@ -390,12 +390,9 @@ func (gang *Gang) setScheduleCycleValid(valid bool) {

 	if !valid && !gang.ScheduleCycleValid {
 		/*
-			let's explain why by following example, there are three gang of one group: A, B and C, every gang group have min-member of 10 pod, noted as A1-A10, B1-B10, C1-C10.
-			1. A1-A5 assumed in gangA scheduleCycle 1, B1-B5 assumed in gangA scheduleCycle 1, C1-C5 assumed in gangA scheduleCycle 1, C6 filter failed due to insufficient resource, then A6-10 failed due to cycle invalid, C7-C10 failed due to cycle invalid, B6-B9 failed due to cycle invalid
-			2. A1-A5 assumed in gangA scheduleCycle 2, C1-C5 assumed in gangA scheduleCycle 2, C6 failed due to insufficient resource and gangA\B\C cycle set to invalid,
-			3. then A6-A10,C7-C10 failed due to cycle invalid
-			4. then B10 failed due to cycle invalid, B1 comes and find its gang scheduling cycle 2 can be valid, so it's assumed.
-			5. however all it's sibling will come until next round of all gangs, these gangs will face insufficient resource due to pre-allocated of B10
+			let's explain why with the following example: there are two gangs in one group, F and G, each with a min-member of 10 pods, noted as F1-F10 and G1-G10.
+			1. F1 fails due to insufficient resources, so F2-F10 fail because the schedule cycle is now invalid,
+			2. then G1-G10 are assumed; however, F's remaining pods won't return until the next round of all gangs, and by then these gangs will face insufficient resources because G1-G10 are pre-allocated
 			TODO this logic can be optimized by give gangs of same group same scheduling cycle
 		*/
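To make the comment's scenario concrete, here is a minimal, self-contained Go sketch of the fast-fail behavior it describes; the toy gang type and tryPod helper are illustrative stand-ins for the plugin's real Gang state and Filter path, not its actual API:

package main

import "fmt"

// gang is a toy stand-in for the plugin's Gang type: it only tracks
// whether the current schedule cycle is still valid.
type gang struct {
	name       string
	cycleValid bool
}

// tryPod mimics one member pod's attempt: if a sibling already
// invalidated the gang's cycle, fail fast; if this pod itself does not
// fit, invalidate the cycle for the remaining siblings.
func (g *gang) tryPod(pod string, fits bool) bool {
	if !g.cycleValid {
		fmt.Println(pod, "rejected: cycle invalid")
		return false
	}
	if !fits {
		g.cycleValid = false
		fmt.Println(pod, "rejected: insufficient resource, cycle invalidated")
		return false
	}
	fmt.Println(pod, "assumed")
	return true
}

func main() {
	f := &gang{name: "F", cycleValid: true}
	g := &gang{name: "G", cycleValid: true}
	// F1 fails on resources, so F2-F10 fail fast on the invalid cycle...
	f.tryPod("F1", false)
	for i := 2; i <= 10; i++ {
		f.tryPod(fmt.Sprintf("F%d", i), true)
	}
	// ...while G1-G10 are all assumed, pre-allocating the capacity
	// that F's retry will need in the next round.
	for i := 1; i <= 10; i++ {
		g.tryPod(fmt.Sprintf("G%d", i), true)
	}
}

Running it shows F2-F10 rejected on the invalidated cycle while G1-G10 are all assumed, which is exactly the pre-allocation problem the new gangId ordering in coscheduling.go avoids.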
21 changes: 8 additions & 13 deletions pkg/scheduler/plugins/coscheduling/coscheduling.go
@@ -140,20 +140,15 @@ func (cs *Coscheduling) Less(podInfo1, podInfo2 *framework.QueuedPodInfo) bool {
 	}
 	gangId1 := util.GetId(podInfo1.Pod.Namespace, util.GetGangNameByPod(podInfo1.Pod))
 	gangId2 := util.GetId(podInfo2.Pod.Namespace, util.GetGangNameByPod(podInfo2.Pod))
-	// for member gang of same gangGroup, the gang that haven’t been satisfied yet take precedence
+
 	if gangId1 != gangId2 {
-		isGang1Satisfied := cs.pgMgr.IsGangMinSatisfied(podInfo1.Pod)
-		isGang2Satisfied := cs.pgMgr.IsGangMinSatisfied(podInfo2.Pod)
-		if isGang1Satisfied != isGang2Satisfied {
-			return !isGang1Satisfied
-		}
-	} else {
-		// for member pod of same gang, the pod with the smaller scheduling cycle take precedence so that gang scheduling cycle can be valid and iterated
-		childScheduleCycle1 := cs.pgMgr.GetChildScheduleCycle(podInfo1.Pod)
-		childScheduleCycle2 := cs.pgMgr.GetChildScheduleCycle(podInfo2.Pod)
-		if childScheduleCycle1 != childScheduleCycle2 {
-			return childScheduleCycle1 < childScheduleCycle2
-		}
+		return gangId1 < gangId2
 	}
+	// for member pods of the same gang, the pod with the smaller scheduling cycle takes precedence so that the gang scheduling cycle can become valid and iterate
+	childScheduleCycle1 := cs.pgMgr.GetChildScheduleCycle(podInfo1.Pod)
+	childScheduleCycle2 := cs.pgMgr.GetChildScheduleCycle(podInfo2.Pod)
+	if childScheduleCycle1 != childScheduleCycle2 {
+		return childScheduleCycle1 < childScheduleCycle2
+	}
 	return podInfo1.Timestamp.Before(podInfo2.Timestamp)
 }
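In isolation, the tie-breaking chain that Less now implements looks like the following sketch; queuedPod is a simplified stand-in for framework.QueuedPodInfo plus the gang metadata looked up through pgMgr, and the real Less compares priority and gang creation time before ever reaching this chain:

package main

import (
	"fmt"
	"sort"
	"time"
)

// queuedPod is a simplified stand-in for a queued pod and its gang
// metadata: "namespace/gangName" ID, child schedule cycle, enqueue time.
type queuedPod struct {
	name          string
	gangID        string
	scheduleCycle int
	timestamp     time.Time
}

// less mirrors the patched tie-breaking chain: gangId first, so pods of
// different gangs in one gangGroup are never interleaved in the queue;
// then child schedule cycle, so a gang's cycle can become valid and
// advance; finally the enqueue timestamp.
func less(p1, p2 queuedPod) bool {
	if p1.gangID != p2.gangID {
		return p1.gangID < p2.gangID
	}
	if p1.scheduleCycle != p2.scheduleCycle {
		return p1.scheduleCycle < p2.scheduleCycle
	}
	return p1.timestamp.Before(p2.timestamp)
}

func main() {
	now := time.Now()
	pods := []queuedPod{
		{"G1", "ns/gangG", 1, now},
		{"F2", "ns/gangF", 1, now.Add(time.Second)},
		{"F1", "ns/gangF", 1, now},
	}
	sort.Slice(pods, func(i, j int) bool { return less(pods[i], pods[j]) })
	for _, p := range pods {
		fmt.Println(p.name) // prints F1, F2, G1
	}
}

Because gangId is compared before the per-pod tie-breakers, all queued pods of gangF drain ahead of gangG, so two gangs of one gangGroup can no longer interleave mid-cycle.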
43 changes: 36 additions & 7 deletions pkg/scheduler/plugins/coscheduling/coscheduling_test.go
@@ -243,6 +243,7 @@ func TestLess(t *testing.T) {
 	var lowSubPriority, highSubPriority = "111", "222"
 	var gangA_ns, gangB_ns = "namespace1", "namespace2"
 	gangC_ns := "namespace3"
+	gangGroupNS := "namespace4"
 	now := time.Now()
 	earltTime := now.Add(1 * time.Second)
 	lateTime := now.Add(3 * time.Second)
@@ -282,10 +283,16 @@ func TestLess(t *testing.T) {
 	pg2 := makePg("gangC", gangC_ns, 1, &gangCCreatTime, nil)
 	// GangD by PodGroup
 	pg3 := makePg("gangD", gangC_ns, 1, &gangCCreatTime, nil)
+	gangGroup := []string{"default/gangD", "default/gangE"}
+	rawGangGroup, err := json.Marshal(gangGroup)
+	assert.NoError(t, err)
+	pg4 := makePg("gang4", gangGroupNS, 0, nil, nil)
+	pg5 := makePg("gang5", gangGroupNS, 0, nil, nil)
+	pg4.Annotations = map[string]string{extension.AnnotationGangGroups: string(rawGangGroup)}
 	suit.start()
 	// create gangA and gangB
 
-	err := retry.OnError(
+	err = retry.OnError(
 		retry.DefaultRetry,
 		errors.IsTooManyRequests,
 		func() error {
@@ -329,6 +336,28 @@ func TestLess(t *testing.T) {
 	if err != nil {
 		t.Errorf("retry pgClient create pg err: %v", err)
 	}
+	err = retry.OnError(
+		retry.DefaultRetry,
+		errors.IsTooManyRequests,
+		func() error {
+			var err error
+			_, err = pgClientSet.SchedulingV1alpha1().PodGroups(gangGroupNS).Create(context.TODO(), pg4, metav1.CreateOptions{})
+			return err
+		})
+	if err != nil {
+		t.Errorf("retry pgClient create pg err: %v", err)
+	}
+	err = retry.OnError(
+		retry.DefaultRetry,
+		errors.IsTooManyRequests,
+		func() error {
+			var err error
+			_, err = pgClientSet.SchedulingV1alpha1().PodGroups(gangGroupNS).Create(context.TODO(), pg5, metav1.CreateOptions{})
+			return err
+		})
+	if err != nil {
+		t.Errorf("retry pgClient create pg err: %v", err)
+	}
 	err = retry.OnError(
 		retry.DefaultRetry,
 		errors.IsTooManyRequests,
@@ -472,13 +501,13 @@ func TestLess(t *testing.T) {
 			expected: false, // p1 should be ahead of p2 in the queue
 		},
 		{
-			name: "equal priority and creation time, p1 belongs to gang that has been satisfied",
+			name: "equal priority, p1 and p2 belong to different gangs of one gangGroup, sorted by gangId",
 			p1: &framework.QueuedPodInfo{
-				PodInfo:   framework.NewPodInfo(st.MakePod().Namespace(gangC_ns).Name("pod1").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangD").Obj()),
+				PodInfo:   framework.NewPodInfo(st.MakePod().Namespace(gangGroupNS).Name("pod1").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gang4").Obj()),
 				Timestamp: earltTime,
 			},
 			p2: &framework.QueuedPodInfo{
-				PodInfo:   framework.NewPodInfo(st.MakePod().Namespace(gangC_ns).Name("pod2").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gangC").Obj()),
+				PodInfo:   framework.NewPodInfo(st.MakePod().Namespace(gangGroupNS).Name("pod2").Priority(highPriority).Label(v1alpha1.PodGroupLabel, "gang5").Obj()),
 				Timestamp: earltTime,
 			},
 			expected: true,
@@ -915,7 +944,7 @@ func TestFairness(t *testing.T) {
"gangJ": {"default/gangH", "default/gangI", "default/gangJ"},
}
gangMinRequiredNums := []int{20, 10, 32, 20, 20, 18, 43, 20, 30, 20}
gangInjectFilterError := []bool{false, false, true, false, false, true, true, false, false, true}
gangInjectFilterError := []bool{false, false, true, false, false, true, false, false, false, true}
var gangInjectFilterErrorIndex []int
for _, gangMinRequiredNum := range gangMinRequiredNums {
gangInjectFilterErrorIndex = append(gangInjectFilterErrorIndex, rand.Intn(gangMinRequiredNum))
@@ -996,8 +1025,8 @@ func TestFairness(t *testing.T) {
 		ctx.Done(),
 		scheduler.WithProfiles(cfg.Profiles...),
 		scheduler.WithFrameworkOutOfTreeRegistry(registry),
-		scheduler.WithPodInitialBackoffSeconds(1),
-		scheduler.WithPodMaxBackoffSeconds(1),
+		scheduler.WithPodInitialBackoffSeconds(0),
+		scheduler.WithPodMaxBackoffSeconds(0),
 		scheduler.WithPodMaxInUnschedulablePodsDuration(0),
 	)
 	assert.NoError(t, err)
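For context on the fixture above: gangs are joined into one gangGroup through a JSON-encoded list of gang IDs stored in a PodGroup annotation. A minimal sketch of building that value follows; the literal annotation key is assumed here for illustration, since the test only references it via the extension.AnnotationGangGroups constant:

package main

import (
	"encoding/json"
	"fmt"
)

// AnnotationGangGroups stands in for koordinator's extension constant;
// the string value here is an assumption for illustration only.
const AnnotationGangGroups = "gang.scheduling.koordinator.sh/groups"

func main() {
	// A gangGroup is a JSON-encoded list of "namespace/name" gang IDs,
	// mirroring the pg4 setup in the test above.
	gangGroup := []string{"default/gangD", "default/gangE"}
	raw, err := json.Marshal(gangGroup)
	if err != nil {
		panic(err)
	}
	annotations := map[string]string{AnnotationGangGroups: string(raw)}
	fmt.Println(annotations)
}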
