From f85aefca017cbfbd82ba3a96a698f93f3976280d Mon Sep 17 00:00:00 2001 From: buptcozy <70878006+buptcozy@users.noreply.github.com> Date: Mon, 22 Apr 2024 08:40:43 +0800 Subject: [PATCH] fix panic of podGroup and pod delete order issue2 (#2013) Signed-off-by: xingbao.zy Co-authored-by: xingbao.zy --- .../plugins/coscheduling/core/gang.go | 12 ++-- .../plugins/coscheduling/core/gang_cache.go | 1 + .../coscheduling/core/gang_cache_test.go | 36 +++++++---- .../plugins/coscheduling/core/gang_test.go | 60 +++++++++---------- .../plugins/coscheduling/core/ganggroup.go | 55 ++++++++++++++++- .../coscheduling/core/ganggroup_test.go | 3 + 6 files changed, 116 insertions(+), 51 deletions(-) diff --git a/pkg/scheduler/plugins/coscheduling/core/gang.go b/pkg/scheduler/plugins/coscheduling/core/gang.go index a0dfdcff1..e95b67de4 100644 --- a/pkg/scheduler/plugins/coscheduling/core/gang.go +++ b/pkg/scheduler/plugins/coscheduling/core/gang.go @@ -88,6 +88,7 @@ func NewGang(gangName string) *Gang { BoundChildren: make(map[string]*v1.Pod), GangFrom: GangFromPodAnnotation, HasGangInit: false, + GangGroupInfo: NewGangGroupInfo("", nil), } } @@ -230,7 +231,7 @@ func (gang *Gang) SetGangGroupInfo(gangGroupInfo *GangGroupInfo) { gang.lock.Lock() defer gang.lock.Unlock() - if gang.GangGroupInfo == nil { + if !gang.GangGroupInfo.IsInitialized() { gang.GangGroupInfo = gangGroupInfo klog.Infof("SetGangGroupInfo done, gangName: %v, groupSlice: %v, gangGroupId: %v", gang.Name, gang.GangGroup, gang.GangGroupId) @@ -253,13 +254,8 @@ func (gang *Gang) deletePod(pod *v1.Pod) bool { delete(gang.Children, podId) delete(gang.WaitingForBindChildren, podId) delete(gang.BoundChildren, podId) - if gang.GangGroupInfo != nil { - //t0: podGroup deleted, the gang and gangGroupInfo all deleted - //t1: pod updated, create a new fakeGang and nil gangGroupInfo - //t2: pod deleted - gang.GangGroupInfo.deleteChildScheduleCycle(podId) - gang.GangGroupInfo.deletePodLastScheduleTime(podId) - } + gang.GangGroupInfo.deleteChildScheduleCycle(podId) + gang.GangGroupInfo.deletePodLastScheduleTime(podId) if gang.GangFrom == GangFromPodAnnotation { if len(gang.Children) == 0 { return true diff --git a/pkg/scheduler/plugins/coscheduling/core/gang_cache.go b/pkg/scheduler/plugins/coscheduling/core/gang_cache.go index a81572972..7332f66e5 100644 --- a/pkg/scheduler/plugins/coscheduling/core/gang_cache.go +++ b/pkg/scheduler/plugins/coscheduling/core/gang_cache.go @@ -61,6 +61,7 @@ func (gangCache *GangCache) getGangGroupInfo(gangGroupId string, gangGroup []str if gangCache.gangGroupInfoMap[gangGroupId] == nil { if createIfNotExist { gangGroupInfo = NewGangGroupInfo(gangGroupId, gangGroup) + gangGroupInfo.SetInitialized() gangCache.gangGroupInfoMap[gangGroupId] = gangGroupInfo klog.Infof("add gangGroupInfo to cache, gangGroupId: %v", gangGroupId) } diff --git a/pkg/scheduler/plugins/coscheduling/core/gang_cache_test.go b/pkg/scheduler/plugins/coscheduling/core/gang_cache_test.go index 740ab7e9d..e8c7ee0f8 100644 --- a/pkg/scheduler/plugins/coscheduling/core/gang_cache_test.go +++ b/pkg/scheduler/plugins/coscheduling/core/gang_cache_test.go @@ -100,6 +100,7 @@ func TestGangCache_OnPodAdd(t *testing.T) { WaitTime: 0, GangGroupId: "default/test", GangGroup: []string{"default/test"}, + GangGroupInfo: NewGangGroupInfo("", nil), Mode: extension.GangModeStrict, GangFrom: GangFromPodAnnotation, GangMatchPolicy: extension.GangMatchPolicyOnceSatisfied, @@ -519,9 +520,13 @@ func TestGangCache_OnPodAdd(t *testing.T) { continue } - gang.GangGroupInfo.GangTotalChildrenNumMap[gang.Name] = gang.TotalChildrenNum - gang.GangGroupInfo.ChildrenLastScheduleTime[util.GetId(pod.Namespace, pod.Name)] = - gang.GangGroupInfo.LastScheduleTime + if gangCache.gangItems[gangId].GangGroupInfo.IsInitialized() { + gang.GangGroupInfo.SetInitialized() + + gang.GangGroupInfo.GangTotalChildrenNumMap[gang.Name] = gang.TotalChildrenNum + gang.GangGroupInfo.ChildrenLastScheduleTime[util.GetId(pod.Namespace, pod.Name)] = + gang.GangGroupInfo.LastScheduleTime + } } assert.Equal(t, tt.wantCache, gangCache.gangItems) @@ -676,9 +681,13 @@ func TestGangCache_OnPodUpdate(t *testing.T) { continue } - gang.GangGroupInfo.GangTotalChildrenNumMap[gang.Name] = gang.TotalChildrenNum - gang.GangGroupInfo.ChildrenLastScheduleTime[util.GetId(pod.Namespace, pod.Name)] = - gang.GangGroupInfo.LastScheduleTime + if gangCache.gangItems[gangId].GangGroupInfo.IsInitialized() { + gang.GangGroupInfo.SetInitialized() + + gang.GangGroupInfo.GangTotalChildrenNumMap[gang.Name] = gang.TotalChildrenNum + gang.GangGroupInfo.ChildrenLastScheduleTime[util.GetId(pod.Namespace, pod.Name)] = + gang.GangGroupInfo.LastScheduleTime + } } assert.Equal(t, tt.wantCache, gangCache.gangItems) @@ -854,11 +863,9 @@ func TestGangCache_OnPodDelete(t *testing.T) { } gang.GangGroupInfo.GangTotalChildrenNumMap[gang.Name] = gang.TotalChildrenNum - } - for _, gang := range tt.wantCache { - if gangCache.gangItems[gang.Name].GangGroupInfo == nil { - gangCache.gangItems[gang.Name].GangGroupInfo = NewGangGroupInfo("", nil) + if gangCache.gangItems[gangId].GangGroupInfo.IsInitialized() { + gang.GangGroupInfo.SetInitialized() } } @@ -1004,6 +1011,10 @@ func TestGangCache_OnPodGroupAdd(t *testing.T) { if gang.GangGroupInfo != nil { gang.GangGroupInfo.GangTotalChildrenNumMap[gang.Name] = gang.TotalChildrenNum } + + if gangCache.gangItems[gang.Name].GangGroupInfo.IsInitialized() { + gang.GangGroupInfo.SetInitialized() + } } for k, v := range tt.wantCache { @@ -1122,6 +1133,11 @@ func TestGangCache_OnGangDelete(t *testing.T) { cacheGang := cache.getGangFromCacheByGangId("default/gangb", false) wantedGang.GangGroupId = util.GetGangGroupId(wantedGang.GangGroup) + + if cacheGang.GangGroupInfo.IsInitialized() { + wantedGang.GangGroupInfo.SetInitialized() + } + assert.Equal(t, wantedGang, cacheGang) } diff --git a/pkg/scheduler/plugins/coscheduling/core/gang_test.go b/pkg/scheduler/plugins/coscheduling/core/gang_test.go index f5fb46a0b..a69bffdb5 100644 --- a/pkg/scheduler/plugins/coscheduling/core/gang_test.go +++ b/pkg/scheduler/plugins/coscheduling/core/gang_test.go @@ -10,6 +10,7 @@ import ( func TestGangGroupInfo_SetGangGroupInfo(t *testing.T) { gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"}) + gangGroupInfo.SetInitialized() assert.Equal(t, "aa_bb", gangGroupInfo.GangGroupId) assert.Equal(t, 2, len(gangGroupInfo.GangGroup)) assert.Equal(t, 1, gangGroupInfo.ScheduleCycle) @@ -24,6 +25,7 @@ func TestGangGroupInfo_SetGangGroupInfo(t *testing.T) { assert.Equal(t, 0, len(gangGroupInfo.ChildrenLastScheduleTime)) gang := &Gang{} + gang.GangGroupInfo = NewGangGroupInfo("", nil) gang.Name = "aa" gang.TotalChildrenNum = 2 gang.SetGangGroupInfo(gangGroupInfo) @@ -31,46 +33,36 @@ func TestGangGroupInfo_SetGangGroupInfo(t *testing.T) { } func TestDeletePod(t *testing.T) { - { - gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"}) - gangGroupInfo.ChildrenScheduleRoundMap["test/pod1"] = 1 - gangGroupInfo.ChildrenLastScheduleTime["test/pod1"] = time.Now() - - gang := &Gang{} - gang.Name = "aa" - gang.TotalChildrenNum = 2 - gang.SetGangGroupInfo(gangGroupInfo) - - pod := &corev1.Pod{} - pod.Namespace = "test" - pod.Name = "pod1" - - assert.Equal(t, 1, len(gangGroupInfo.ChildrenScheduleRoundMap)) - assert.Equal(t, 1, len(gangGroupInfo.ChildrenLastScheduleTime)) - gang.deletePod(pod) - assert.Equal(t, 0, len(gangGroupInfo.ChildrenScheduleRoundMap)) - assert.Equal(t, 0, len(gangGroupInfo.ChildrenLastScheduleTime)) - } - { - //won't panic - gang := &Gang{} - gang.Name = "aa" - gang.TotalChildrenNum = 2 - - pod := &corev1.Pod{} - pod.Namespace = "test" - pod.Name = "pod1" - - gang.deletePod(pod) - } + gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"}) + gangGroupInfo.SetInitialized() + gangGroupInfo.ChildrenScheduleRoundMap["test/pod1"] = 1 + gangGroupInfo.ChildrenLastScheduleTime["test/pod1"] = time.Now() + + gang := &Gang{} + gang.GangGroupInfo = NewGangGroupInfo("", nil) + gang.Name = "aa" + gang.TotalChildrenNum = 2 + gang.SetGangGroupInfo(gangGroupInfo) + + pod := &corev1.Pod{} + pod.Namespace = "test" + pod.Name = "pod1" + + assert.Equal(t, 1, len(gangGroupInfo.ChildrenScheduleRoundMap)) + assert.Equal(t, 1, len(gangGroupInfo.ChildrenLastScheduleTime)) + gang.deletePod(pod) + assert.Equal(t, 0, len(gangGroupInfo.ChildrenScheduleRoundMap)) + assert.Equal(t, 0, len(gangGroupInfo.ChildrenLastScheduleTime)) } func TestIsScheduleCycleValid_GetScheduleCycle_GetChildScheduleCycle_SetChildScheduleCycle(t *testing.T) { gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"}) + gangGroupInfo.SetInitialized() gangGroupInfo.ChildrenScheduleRoundMap["test/pod1"] = 1 gangGroupInfo.ScheduleCycle = 2 gang := &Gang{} + gang.GangGroupInfo = NewGangGroupInfo("", nil) gang.SetGangGroupInfo(gangGroupInfo) pod := &corev1.Pod{} @@ -90,9 +82,11 @@ func TestIsScheduleCycleValid_GetScheduleCycle_GetChildScheduleCycle_SetChildSch func TestInitPodLastScheduleTime_GetPodLastScheduleTime_ResetPodLastScheduleTime(t *testing.T) { gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"}) + gangGroupInfo.SetInitialized() gangGroupInfo.LastScheduleTime = time.Now() gang := &Gang{} + gang.GangGroupInfo = NewGangGroupInfo("", nil) gang.Children = make(map[string]*corev1.Pod) gang.SetGangGroupInfo(gangGroupInfo) @@ -138,9 +132,11 @@ func TestInitPodLastScheduleTime_GetPodLastScheduleTime_ResetPodLastScheduleTime func TestScheduleCycleRelated(t *testing.T) { gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"}) + gangGroupInfo.SetInitialized() gangGroupInfo.LastScheduleTime = time.Now() gang := &Gang{} + gang.GangGroupInfo = NewGangGroupInfo("", nil) gang.Name = "aa" gang.SetGangGroupInfo(gangGroupInfo) gangGroupInfo.GangTotalChildrenNumMap["aa"] = 1 diff --git a/pkg/scheduler/plugins/coscheduling/core/ganggroup.go b/pkg/scheduler/plugins/coscheduling/core/ganggroup.go index 514e08ca3..755e3434f 100644 --- a/pkg/scheduler/plugins/coscheduling/core/ganggroup.go +++ b/pkg/scheduler/plugins/coscheduling/core/ganggroup.go @@ -11,7 +11,9 @@ import ( ) type GangGroupInfo struct { - lock sync.Mutex + lock sync.Mutex + + Initialized bool GangGroupId string GangGroup []string @@ -32,6 +34,7 @@ type GangGroupInfo struct { func NewGangGroupInfo(gangGroupId string, gangGroup []string) *GangGroupInfo { gangGroupInfo := &GangGroupInfo{ + Initialized: false, GangGroupId: gangGroupId, GangGroup: gangGroup, ScheduleCycle: 1, @@ -49,6 +52,20 @@ func NewGangGroupInfo(gangGroupId string, gangGroup []string) *GangGroupInfo { return gangGroupInfo } +func (gg *GangGroupInfo) SetInitialized() { + gg.lock.Lock() + defer gg.lock.Unlock() + + gg.Initialized = true +} + +func (gg *GangGroupInfo) IsInitialized() bool { + gg.lock.Lock() + defer gg.lock.Unlock() + + return gg.Initialized +} + func (gg *GangGroupInfo) GetScheduleCycle() int { gg.lock.Lock() defer gg.lock.Unlock() @@ -60,6 +77,10 @@ func (gg *GangGroupInfo) setScheduleCycleInvalid() { gg.lock.Lock() defer gg.lock.Unlock() + if !gg.Initialized { + return + } + if gg.ScheduleCycleValid { gg.ScheduleCycleValid = false klog.Infof("setScheduleCycleInvalid, gangGroupName: %v, valid: %v", gg.GangGroupId) @@ -70,6 +91,10 @@ func (gg *GangGroupInfo) IsScheduleCycleValid() bool { gg.lock.Lock() defer gg.lock.Unlock() + if !gg.Initialized { + return false + } + return gg.ScheduleCycleValid } @@ -77,6 +102,10 @@ func (gg *GangGroupInfo) trySetScheduleCycleValid() { gg.lock.Lock() defer gg.lock.Unlock() + if !gg.Initialized { + return + } + num := 0 for _, childScheduleCycle := range gg.ChildrenScheduleRoundMap { if childScheduleCycle == gg.ScheduleCycle { @@ -102,6 +131,10 @@ func (gg *GangGroupInfo) setChildScheduleCycle(pod *corev1.Pod, childCycle int) gg.lock.Lock() defer gg.lock.Unlock() + if !gg.Initialized { + return + } + podId := util.GetId(pod.Namespace, pod.Name) gg.ChildrenScheduleRoundMap[podId] = childCycle klog.Infof("setChildScheduleCycle, pod: %v, childCycle: %v", podId, childCycle) @@ -119,6 +152,10 @@ func (gg *GangGroupInfo) deleteChildScheduleCycle(podId string) { gg.lock.Lock() defer gg.lock.Unlock() + if !gg.Initialized { + return + } + delete(gg.ChildrenScheduleRoundMap, podId) } @@ -126,6 +163,10 @@ func (gg *GangGroupInfo) SetGangTotalChildrenNum(gangName string, totalChildrenN gg.lock.Lock() defer gg.lock.Unlock() + if !gg.Initialized { + return + } + gg.GangTotalChildrenNumMap[gangName] = totalChildrenNum } @@ -133,6 +174,10 @@ func (gg *GangGroupInfo) initPodLastScheduleTime(pod *corev1.Pod) { gg.lock.Lock() defer gg.lock.Unlock() + if !gg.Initialized { + return + } + podId := util.GetId(pod.Namespace, pod.Name) gg.ChildrenLastScheduleTime[podId] = gg.LastScheduleTime } @@ -149,6 +194,10 @@ func (gg *GangGroupInfo) deletePodLastScheduleTime(podId string) { gg.lock.Lock() defer gg.lock.Unlock() + if !gg.Initialized { + return + } + delete(gg.ChildrenLastScheduleTime, podId) } @@ -156,6 +205,10 @@ func (gg *GangGroupInfo) resetPodLastScheduleTime(pod *corev1.Pod) { gg.lock.Lock() defer gg.lock.Unlock() + if !gg.Initialized { + return + } + num := 0 for _, childLastScheduleTime := range gg.ChildrenLastScheduleTime { if childLastScheduleTime.Equal(gg.LastScheduleTime) { diff --git a/pkg/scheduler/plugins/coscheduling/core/ganggroup_test.go b/pkg/scheduler/plugins/coscheduling/core/ganggroup_test.go index 8858b1fa2..906b397a7 100644 --- a/pkg/scheduler/plugins/coscheduling/core/ganggroup_test.go +++ b/pkg/scheduler/plugins/coscheduling/core/ganggroup_test.go @@ -10,12 +10,14 @@ import ( func TestGangGroupInfo(t *testing.T) { { gg := NewGangGroupInfo("aa", []string{"aa"}) + gg.SetInitialized() assert.Equal(t, 1, gg.ScheduleCycle) assert.Equal(t, true, gg.ScheduleCycleValid) assert.True(t, !gg.LastScheduleTime.IsZero()) } { gg := NewGangGroupInfo("aa", []string{"aa"}) + gg.SetInitialized() gg.setScheduleCycleInvalid() gg.SetGangTotalChildrenNum("aa", 1) @@ -56,6 +58,7 @@ func TestGangGroupInfo(t *testing.T) { } { gg := NewGangGroupInfo("aa", []string{"aa"}) + gg.SetInitialized() pod1 := &corev1.Pod{} pod1.Namespace = "test"