Skip to content

Commit

Permalink
fix panic of podGroup and pod delete order issue2 (#2013)
Browse files Browse the repository at this point in the history
Signed-off-by: xingbao.zy <xingbao.zy@alibaba-inc.com>
Co-authored-by: xingbao.zy <xingbao.zy@alibaba-inc.com>
  • Loading branch information
buptcozy and xingbao.zy committed Apr 22, 2024
1 parent a931cf2 commit f85aefc
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 51 deletions.
12 changes: 4 additions & 8 deletions pkg/scheduler/plugins/coscheduling/core/gang.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ func NewGang(gangName string) *Gang {
BoundChildren: make(map[string]*v1.Pod),
GangFrom: GangFromPodAnnotation,
HasGangInit: false,
GangGroupInfo: NewGangGroupInfo("", nil),
}
}

Expand Down Expand Up @@ -230,7 +231,7 @@ func (gang *Gang) SetGangGroupInfo(gangGroupInfo *GangGroupInfo) {
gang.lock.Lock()
defer gang.lock.Unlock()

if gang.GangGroupInfo == nil {
if !gang.GangGroupInfo.IsInitialized() {
gang.GangGroupInfo = gangGroupInfo
klog.Infof("SetGangGroupInfo done, gangName: %v, groupSlice: %v, gangGroupId: %v",
gang.Name, gang.GangGroup, gang.GangGroupId)
Expand All @@ -253,13 +254,8 @@ func (gang *Gang) deletePod(pod *v1.Pod) bool {
delete(gang.Children, podId)
delete(gang.WaitingForBindChildren, podId)
delete(gang.BoundChildren, podId)
if gang.GangGroupInfo != nil {
//t0: podGroup deleted, the gang and gangGroupInfo all deleted
//t1: pod updated, create a new fakeGang and nil gangGroupInfo
//t2: pod deleted
gang.GangGroupInfo.deleteChildScheduleCycle(podId)
gang.GangGroupInfo.deletePodLastScheduleTime(podId)
}
gang.GangGroupInfo.deleteChildScheduleCycle(podId)
gang.GangGroupInfo.deletePodLastScheduleTime(podId)
if gang.GangFrom == GangFromPodAnnotation {
if len(gang.Children) == 0 {
return true
Expand Down
1 change: 1 addition & 0 deletions pkg/scheduler/plugins/coscheduling/core/gang_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ func (gangCache *GangCache) getGangGroupInfo(gangGroupId string, gangGroup []str
if gangCache.gangGroupInfoMap[gangGroupId] == nil {
if createIfNotExist {
gangGroupInfo = NewGangGroupInfo(gangGroupId, gangGroup)
gangGroupInfo.SetInitialized()
gangCache.gangGroupInfoMap[gangGroupId] = gangGroupInfo
klog.Infof("add gangGroupInfo to cache, gangGroupId: %v", gangGroupId)
}
Expand Down
36 changes: 26 additions & 10 deletions pkg/scheduler/plugins/coscheduling/core/gang_cache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ func TestGangCache_OnPodAdd(t *testing.T) {
WaitTime: 0,
GangGroupId: "default/test",
GangGroup: []string{"default/test"},
GangGroupInfo: NewGangGroupInfo("", nil),
Mode: extension.GangModeStrict,
GangFrom: GangFromPodAnnotation,
GangMatchPolicy: extension.GangMatchPolicyOnceSatisfied,
Expand Down Expand Up @@ -519,9 +520,13 @@ func TestGangCache_OnPodAdd(t *testing.T) {
continue
}

gang.GangGroupInfo.GangTotalChildrenNumMap[gang.Name] = gang.TotalChildrenNum
gang.GangGroupInfo.ChildrenLastScheduleTime[util.GetId(pod.Namespace, pod.Name)] =
gang.GangGroupInfo.LastScheduleTime
if gangCache.gangItems[gangId].GangGroupInfo.IsInitialized() {
gang.GangGroupInfo.SetInitialized()

gang.GangGroupInfo.GangTotalChildrenNumMap[gang.Name] = gang.TotalChildrenNum
gang.GangGroupInfo.ChildrenLastScheduleTime[util.GetId(pod.Namespace, pod.Name)] =
gang.GangGroupInfo.LastScheduleTime
}
}

assert.Equal(t, tt.wantCache, gangCache.gangItems)
Expand Down Expand Up @@ -676,9 +681,13 @@ func TestGangCache_OnPodUpdate(t *testing.T) {
continue
}

gang.GangGroupInfo.GangTotalChildrenNumMap[gang.Name] = gang.TotalChildrenNum
gang.GangGroupInfo.ChildrenLastScheduleTime[util.GetId(pod.Namespace, pod.Name)] =
gang.GangGroupInfo.LastScheduleTime
if gangCache.gangItems[gangId].GangGroupInfo.IsInitialized() {
gang.GangGroupInfo.SetInitialized()

gang.GangGroupInfo.GangTotalChildrenNumMap[gang.Name] = gang.TotalChildrenNum
gang.GangGroupInfo.ChildrenLastScheduleTime[util.GetId(pod.Namespace, pod.Name)] =
gang.GangGroupInfo.LastScheduleTime
}
}

assert.Equal(t, tt.wantCache, gangCache.gangItems)
Expand Down Expand Up @@ -854,11 +863,9 @@ func TestGangCache_OnPodDelete(t *testing.T) {
}

gang.GangGroupInfo.GangTotalChildrenNumMap[gang.Name] = gang.TotalChildrenNum
}

for _, gang := range tt.wantCache {
if gangCache.gangItems[gang.Name].GangGroupInfo == nil {
gangCache.gangItems[gang.Name].GangGroupInfo = NewGangGroupInfo("", nil)
if gangCache.gangItems[gangId].GangGroupInfo.IsInitialized() {
gang.GangGroupInfo.SetInitialized()
}
}

Expand Down Expand Up @@ -1004,6 +1011,10 @@ func TestGangCache_OnPodGroupAdd(t *testing.T) {
if gang.GangGroupInfo != nil {
gang.GangGroupInfo.GangTotalChildrenNumMap[gang.Name] = gang.TotalChildrenNum
}

if gangCache.gangItems[gang.Name].GangGroupInfo.IsInitialized() {
gang.GangGroupInfo.SetInitialized()
}
}

for k, v := range tt.wantCache {
Expand Down Expand Up @@ -1122,6 +1133,11 @@ func TestGangCache_OnGangDelete(t *testing.T) {

cacheGang := cache.getGangFromCacheByGangId("default/gangb", false)
wantedGang.GangGroupId = util.GetGangGroupId(wantedGang.GangGroup)

if cacheGang.GangGroupInfo.IsInitialized() {
wantedGang.GangGroupInfo.SetInitialized()
}

assert.Equal(t, wantedGang, cacheGang)
}

Expand Down
60 changes: 28 additions & 32 deletions pkg/scheduler/plugins/coscheduling/core/gang_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

func TestGangGroupInfo_SetGangGroupInfo(t *testing.T) {
gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"})
gangGroupInfo.SetInitialized()
assert.Equal(t, "aa_bb", gangGroupInfo.GangGroupId)
assert.Equal(t, 2, len(gangGroupInfo.GangGroup))
assert.Equal(t, 1, gangGroupInfo.ScheduleCycle)
Expand All @@ -24,53 +25,44 @@ func TestGangGroupInfo_SetGangGroupInfo(t *testing.T) {
assert.Equal(t, 0, len(gangGroupInfo.ChildrenLastScheduleTime))

gang := &Gang{}
gang.GangGroupInfo = NewGangGroupInfo("", nil)
gang.Name = "aa"
gang.TotalChildrenNum = 2
gang.SetGangGroupInfo(gangGroupInfo)
assert.Equal(t, gang.GangGroupInfo.GangTotalChildrenNumMap["aa"], 2)
}

func TestDeletePod(t *testing.T) {
{
gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"})
gangGroupInfo.ChildrenScheduleRoundMap["test/pod1"] = 1
gangGroupInfo.ChildrenLastScheduleTime["test/pod1"] = time.Now()

gang := &Gang{}
gang.Name = "aa"
gang.TotalChildrenNum = 2
gang.SetGangGroupInfo(gangGroupInfo)

pod := &corev1.Pod{}
pod.Namespace = "test"
pod.Name = "pod1"

assert.Equal(t, 1, len(gangGroupInfo.ChildrenScheduleRoundMap))
assert.Equal(t, 1, len(gangGroupInfo.ChildrenLastScheduleTime))
gang.deletePod(pod)
assert.Equal(t, 0, len(gangGroupInfo.ChildrenScheduleRoundMap))
assert.Equal(t, 0, len(gangGroupInfo.ChildrenLastScheduleTime))
}
{
//won't panic
gang := &Gang{}
gang.Name = "aa"
gang.TotalChildrenNum = 2

pod := &corev1.Pod{}
pod.Namespace = "test"
pod.Name = "pod1"

gang.deletePod(pod)
}
gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"})
gangGroupInfo.SetInitialized()
gangGroupInfo.ChildrenScheduleRoundMap["test/pod1"] = 1
gangGroupInfo.ChildrenLastScheduleTime["test/pod1"] = time.Now()

gang := &Gang{}
gang.GangGroupInfo = NewGangGroupInfo("", nil)
gang.Name = "aa"
gang.TotalChildrenNum = 2
gang.SetGangGroupInfo(gangGroupInfo)

pod := &corev1.Pod{}
pod.Namespace = "test"
pod.Name = "pod1"

assert.Equal(t, 1, len(gangGroupInfo.ChildrenScheduleRoundMap))
assert.Equal(t, 1, len(gangGroupInfo.ChildrenLastScheduleTime))
gang.deletePod(pod)
assert.Equal(t, 0, len(gangGroupInfo.ChildrenScheduleRoundMap))
assert.Equal(t, 0, len(gangGroupInfo.ChildrenLastScheduleTime))
}

func TestIsScheduleCycleValid_GetScheduleCycle_GetChildScheduleCycle_SetChildScheduleCycle(t *testing.T) {
gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"})
gangGroupInfo.SetInitialized()
gangGroupInfo.ChildrenScheduleRoundMap["test/pod1"] = 1
gangGroupInfo.ScheduleCycle = 2

gang := &Gang{}
gang.GangGroupInfo = NewGangGroupInfo("", nil)
gang.SetGangGroupInfo(gangGroupInfo)

pod := &corev1.Pod{}
Expand All @@ -90,9 +82,11 @@ func TestIsScheduleCycleValid_GetScheduleCycle_GetChildScheduleCycle_SetChildSch

func TestInitPodLastScheduleTime_GetPodLastScheduleTime_ResetPodLastScheduleTime(t *testing.T) {
gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"})
gangGroupInfo.SetInitialized()
gangGroupInfo.LastScheduleTime = time.Now()

gang := &Gang{}
gang.GangGroupInfo = NewGangGroupInfo("", nil)
gang.Children = make(map[string]*corev1.Pod)
gang.SetGangGroupInfo(gangGroupInfo)

Expand Down Expand Up @@ -138,9 +132,11 @@ func TestInitPodLastScheduleTime_GetPodLastScheduleTime_ResetPodLastScheduleTime

func TestScheduleCycleRelated(t *testing.T) {
gangGroupInfo := NewGangGroupInfo("aa_bb", []string{"aa", "bb"})
gangGroupInfo.SetInitialized()
gangGroupInfo.LastScheduleTime = time.Now()

gang := &Gang{}
gang.GangGroupInfo = NewGangGroupInfo("", nil)
gang.Name = "aa"
gang.SetGangGroupInfo(gangGroupInfo)
gangGroupInfo.GangTotalChildrenNumMap["aa"] = 1
Expand Down
55 changes: 54 additions & 1 deletion pkg/scheduler/plugins/coscheduling/core/ganggroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ import (
)

type GangGroupInfo struct {
lock sync.Mutex
lock sync.Mutex

Initialized bool
GangGroupId string
GangGroup []string

Expand All @@ -32,6 +34,7 @@ type GangGroupInfo struct {

func NewGangGroupInfo(gangGroupId string, gangGroup []string) *GangGroupInfo {
gangGroupInfo := &GangGroupInfo{
Initialized: false,
GangGroupId: gangGroupId,
GangGroup: gangGroup,
ScheduleCycle: 1,
Expand All @@ -49,6 +52,20 @@ func NewGangGroupInfo(gangGroupId string, gangGroup []string) *GangGroupInfo {
return gangGroupInfo
}

func (gg *GangGroupInfo) SetInitialized() {
gg.lock.Lock()
defer gg.lock.Unlock()

gg.Initialized = true
}

func (gg *GangGroupInfo) IsInitialized() bool {
gg.lock.Lock()
defer gg.lock.Unlock()

return gg.Initialized
}

func (gg *GangGroupInfo) GetScheduleCycle() int {
gg.lock.Lock()
defer gg.lock.Unlock()
Expand All @@ -60,6 +77,10 @@ func (gg *GangGroupInfo) setScheduleCycleInvalid() {
gg.lock.Lock()
defer gg.lock.Unlock()

if !gg.Initialized {
return
}

if gg.ScheduleCycleValid {
gg.ScheduleCycleValid = false
klog.Infof("setScheduleCycleInvalid, gangGroupName: %v, valid: %v", gg.GangGroupId)
Expand All @@ -70,13 +91,21 @@ func (gg *GangGroupInfo) IsScheduleCycleValid() bool {
gg.lock.Lock()
defer gg.lock.Unlock()

if !gg.Initialized {
return false
}

return gg.ScheduleCycleValid
}

func (gg *GangGroupInfo) trySetScheduleCycleValid() {
gg.lock.Lock()
defer gg.lock.Unlock()

if !gg.Initialized {
return
}

num := 0
for _, childScheduleCycle := range gg.ChildrenScheduleRoundMap {
if childScheduleCycle == gg.ScheduleCycle {
Expand All @@ -102,6 +131,10 @@ func (gg *GangGroupInfo) setChildScheduleCycle(pod *corev1.Pod, childCycle int)
gg.lock.Lock()
defer gg.lock.Unlock()

if !gg.Initialized {
return
}

podId := util.GetId(pod.Namespace, pod.Name)
gg.ChildrenScheduleRoundMap[podId] = childCycle
klog.Infof("setChildScheduleCycle, pod: %v, childCycle: %v", podId, childCycle)
Expand All @@ -119,20 +152,32 @@ func (gg *GangGroupInfo) deleteChildScheduleCycle(podId string) {
gg.lock.Lock()
defer gg.lock.Unlock()

if !gg.Initialized {
return
}

delete(gg.ChildrenScheduleRoundMap, podId)
}

func (gg *GangGroupInfo) SetGangTotalChildrenNum(gangName string, totalChildrenNum int) {
gg.lock.Lock()
defer gg.lock.Unlock()

if !gg.Initialized {
return
}

gg.GangTotalChildrenNumMap[gangName] = totalChildrenNum
}

func (gg *GangGroupInfo) initPodLastScheduleTime(pod *corev1.Pod) {
gg.lock.Lock()
defer gg.lock.Unlock()

if !gg.Initialized {
return
}

podId := util.GetId(pod.Namespace, pod.Name)
gg.ChildrenLastScheduleTime[podId] = gg.LastScheduleTime
}
Expand All @@ -149,13 +194,21 @@ func (gg *GangGroupInfo) deletePodLastScheduleTime(podId string) {
gg.lock.Lock()
defer gg.lock.Unlock()

if !gg.Initialized {
return
}

delete(gg.ChildrenLastScheduleTime, podId)
}

func (gg *GangGroupInfo) resetPodLastScheduleTime(pod *corev1.Pod) {
gg.lock.Lock()
defer gg.lock.Unlock()

if !gg.Initialized {
return
}

num := 0
for _, childLastScheduleTime := range gg.ChildrenLastScheduleTime {
if childLastScheduleTime.Equal(gg.LastScheduleTime) {
Expand Down
3 changes: 3 additions & 0 deletions pkg/scheduler/plugins/coscheduling/core/ganggroup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@ import (
func TestGangGroupInfo(t *testing.T) {
{
gg := NewGangGroupInfo("aa", []string{"aa"})
gg.SetInitialized()
assert.Equal(t, 1, gg.ScheduleCycle)
assert.Equal(t, true, gg.ScheduleCycleValid)
assert.True(t, !gg.LastScheduleTime.IsZero())
}
{
gg := NewGangGroupInfo("aa", []string{"aa"})
gg.SetInitialized()
gg.setScheduleCycleInvalid()

gg.SetGangTotalChildrenNum("aa", 1)
Expand Down Expand Up @@ -56,6 +58,7 @@ func TestGangGroupInfo(t *testing.T) {
}
{
gg := NewGangGroupInfo("aa", []string{"aa"})
gg.SetInitialized()

pod1 := &corev1.Pod{}
pod1.Namespace = "test"
Expand Down

0 comments on commit f85aefc

Please sign in to comment.