Skip to content

Commit

Permalink
scheduler: fix panic of podGroup and pod delete order issue (koordinator-sh#2012)
Browse files Browse the repository at this point in the history

Signed-off-by: xingbao.zy <xingbao.zy@alibaba-inc.com>
Co-authored-by: xingbao.zy <xingbao.zy@alibaba-inc.com>
  • Loading branch information
buptcozy and xingbao.zy committed Apr 19, 2024
1 parent 6b9aeba commit a931cf2
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 20 deletions.
9 changes: 7 additions & 2 deletions pkg/scheduler/plugins/coscheduling/core/gang.go
Original file line number Diff line number Diff line change
Expand Up @@ -253,8 +253,13 @@ func (gang *Gang) deletePod(pod *v1.Pod) bool {
delete(gang.Children, podId)
delete(gang.WaitingForBindChildren, podId)
delete(gang.BoundChildren, podId)
gang.GangGroupInfo.deleteChildScheduleCycle(podId)
gang.GangGroupInfo.deletePodLastScheduleTime(podId)
if gang.GangGroupInfo != nil {
//t0: podGroup deleted, the gang and gangGroupInfo all deleted
//t1: pod updated, create a new fakeGang and nil gangGroupInfo
//t2: pod deleted
gang.GangGroupInfo.deleteChildScheduleCycle(podId)
gang.GangGroupInfo.deletePodLastScheduleTime(podId)
}
if gang.GangFrom == GangFromPodAnnotation {
if len(gang.Children) == 0 {
return true
Expand Down
8 changes: 8 additions & 0 deletions pkg/scheduler/plugins/coscheduling/core/gang_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ func (gangCache *GangCache) onPodAdd(obj interface{}) {
gang.addBoundPod(pod)
gang.setResourceSatisfied()
}

klog.Infof("watch pod created, Name:%v, pgLabel:%v", pod.Name, pod.Labels[v1alpha1.PodGroupLabel])
}

func (gangCache *GangCache) onPodUpdate(oldObj, newObj interface{}) {
Expand Down Expand Up @@ -202,6 +204,8 @@ func (gangCache *GangCache) onPodDelete(obj interface{}) {
gangCache.deleteGangGroupInfo(gang.GangGroupInfo.GangGroupId)
}
}

klog.Infof("watch pod deleted, Name:%v, pgLabel:%v", pod.Name, pod.Labels[v1alpha1.PodGroupLabel])
}

func (gangCache *GangCache) onPodGroupAdd(obj interface{}) {
Expand All @@ -222,6 +226,8 @@ func (gangCache *GangCache) onPodGroupAdd(obj interface{}) {
gang.SetGangGroupInfo(gangGroupInfo)
//reset already connected pods lastScheduleTime
gang.initAllChildrenPodLastScheduleTime()

klog.Infof("watch podGroup created, Name:%v", pg.Name)
}

func (gangCache *GangCache) onPodGroupUpdate(oldObj interface{}, newObj interface{}) {
Expand Down Expand Up @@ -271,4 +277,6 @@ func (gangCache *GangCache) onPodGroupDelete(obj interface{}) {
if allGangDeleted {
gangCache.deleteGangGroupInfo(gang.GangGroupInfo.GangGroupId)
}

klog.Infof("watch podGroup deleted, Name:%v", pg.Name)
}
50 changes: 32 additions & 18 deletions pkg/scheduler/plugins/coscheduling/core/gang_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,24 +31,38 @@ func TestGangGroupInfo_SetGangGroupInfo(t *testing.T) {
}

// TestDeletePod checks that Gang.deletePod removes the pod's entries from the
// attached GangGroupInfo, and that calling it on a gang with no GangGroupInfo
// does not panic.
func TestDeletePod(t *testing.T) {
// NOTE(review): this unscoped section repeats the first braced case below
// statement for statement; it looks like the pre-change side of the diff
// rendered inline — confirm before treating it as live code.
gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"})
// Seed one schedule-round entry and one last-schedule-time entry for "test/pod1".
gangGroupInfo.ChildrenScheduleRoundMap["test/pod1"] = 1
gangGroupInfo.ChildrenLastScheduleTime["test/pod1"] = time.Now()

gang := &Gang{}
gang.Name = "aa"
gang.TotalChildrenNum = 2
gang.SetGangGroupInfo(gangGroupInfo)

// Presumably deletePod derives the ID "test/pod1" from this namespace and
// name — the assertions below rely on it matching the seeded key.
pod := &corev1.Pod{}
pod.Namespace = "test"
pod.Name = "pod1"

// Both maps hold exactly the seeded entry before the delete and none after.
assert.Equal(t, 1, len(gangGroupInfo.ChildrenScheduleRoundMap))
assert.Equal(t, 1, len(gangGroupInfo.ChildrenLastScheduleTime))
gang.deletePod(pod)
assert.Equal(t, 0, len(gangGroupInfo.ChildrenScheduleRoundMap))
assert.Equal(t, 0, len(gangGroupInfo.ChildrenLastScheduleTime))
{
// Case 1: the gang has a GangGroupInfo; deleting the pod must clear its
// schedule-round and last-schedule-time entries.
gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"})
gangGroupInfo.ChildrenScheduleRoundMap["test/pod1"] = 1
gangGroupInfo.ChildrenLastScheduleTime["test/pod1"] = time.Now()

gang := &Gang{}
gang.Name = "aa"
gang.TotalChildrenNum = 2
gang.SetGangGroupInfo(gangGroupInfo)

pod := &corev1.Pod{}
pod.Namespace = "test"
pod.Name = "pod1"

assert.Equal(t, 1, len(gangGroupInfo.ChildrenScheduleRoundMap))
assert.Equal(t, 1, len(gangGroupInfo.ChildrenLastScheduleTime))
gang.deletePod(pod)
assert.Equal(t, 0, len(gangGroupInfo.ChildrenScheduleRoundMap))
assert.Equal(t, 0, len(gangGroupInfo.ChildrenLastScheduleTime))
}
{
// Case 2: SetGangGroupInfo is never called on this gang, so deletePod runs
// without a GangGroupInfo. There are no assertions; the case passes as long
// as the call does not panic.
gang := &Gang{}
gang.Name = "aa"
gang.TotalChildrenNum = 2

pod := &corev1.Pod{}
pod.Namespace = "test"
pod.Name = "pod1"

gang.deletePod(pod)
}
}

func TestIsScheduleCycleValid_GetScheduleCycle_GetChildScheduleCycle_SetChildScheduleCycle(t *testing.T) {
Expand Down

0 comments on commit a931cf2

Please sign in to comment.