Skip to content

Commit

Permalink
queue value shouldn't be reset once it has been set
Browse files Browse the repository at this point in the history
Signed-off-by: Weiyu Yen <ckyuto@gmail.com>
  • Loading branch information
ckyuto committed May 3, 2024
1 parent 3d0f8f0 commit b885e4d
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 26 deletions.
19 changes: 11 additions & 8 deletions pkg/controller.v1/common/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -279,33 +279,36 @@ func (jc *JobController) ReconcileJobs(
var pgSpecFill FillPodGroupSpecFunc
switch jc.Config.GangScheduling {
case GangSchedulerVolcano:
pgSpecFill = func(pg metav1.Object) (metav1.Object, error) {
pgSpecFill = func(pg metav1.Object) error {
volcanoPodGroup, match := pg.(*volcanov1beta1.PodGroup)
volcanoPodGroup = volcanoPodGroup.DeepCopy()
if !match {
return nil, fmt.Errorf("unable to recognize PodGroup: %v", klog.KObj(pg))
return fmt.Errorf("unable to recognize PodGroup: %v", klog.KObj(pg))
}

if q := volcanoPodGroup.Spec.Queue; len(q) > 0 {
queue = q
}

volcanoPodGroup.Spec = volcanov1beta1.PodGroupSpec{
MinMember: minMember,
Queue: queue,
PriorityClassName: priorityClass,
MinResources: minResources,
}
return volcanoPodGroup, nil
return nil
}
default:
pgSpecFill = func(pg metav1.Object) (metav1.Object, error) {
pgSpecFill = func(pg metav1.Object) error {
schedulerPluginsPodGroup, match := pg.(*schedulerpluginsv1alpha1.PodGroup)
schedulerPluginsPodGroup = schedulerPluginsPodGroup.DeepCopy()
if !match {
return nil, fmt.Errorf("unable to recognize PodGroup: %v", klog.KObj(pg))
return fmt.Errorf("unable to recognize PodGroup: %v", klog.KObj(pg))
}
schedulerPluginsPodGroup.Spec = schedulerpluginsv1alpha1.PodGroupSpec{
MinMember: minMember,
MinResources: *minResources,
ScheduleTimeoutSeconds: schedulerTimeout,
}
return schedulerPluginsPodGroup, nil
return nil
}
}

Expand Down
28 changes: 10 additions & 18 deletions pkg/controller.v1/common/scheduling.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,15 @@ package common
import (
"fmt"

volcanov1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"

"github.com/google/go-cmp/cmp"
log "github.com/sirupsen/logrus"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"
"sigs.k8s.io/controller-runtime/pkg/client"
)

type FillPodGroupSpecFunc func(object metav1.Object) (metav1.Object, error)
type FillPodGroupSpecFunc func(object metav1.Object) error

func (jc *JobController) SyncPodGroup(job metav1.Object, specFunc FillPodGroupSpecFunc) (metav1.Object, error) {
pgctl := jc.PodGroupControl
Expand All @@ -37,20 +36,14 @@ func (jc *JobController) SyncPodGroup(job metav1.Object, specFunc FillPodGroupSp
podGroup, err := pgctl.GetPodGroup(job.GetNamespace(), job.GetName())
if err == nil {
// update podGroup for gang scheduling
updatedSpecPodGroup, err := specFunc(podGroup)
if err != nil {
oldPodGroup := &podGroup
if err = specFunc(podGroup); err != nil {
return nil, fmt.Errorf("unable to fill the spec of PodGroup, '%v': %v", klog.KObj(podGroup), err)
}

existVolcanoPodGroup := podGroup.(*volcanov1beta1.PodGroup)
updatedSpecVolcanoPodGroup := updatedSpecPodGroup.(*volcanov1beta1.PodGroup)
// The hpa-controller may update the num of replicas
// https://github.com/kubeflow/common/pull/207
if existVolcanoPodGroup.Spec.MinMember != updatedSpecVolcanoPodGroup.Spec.MinMember {
// The queue name should not be changed after the pg is created
updatedSpecVolcanoPodGroup.Spec.Queue = existVolcanoPodGroup.Spec.Queue
return updatedSpecPodGroup, pgctl.UpdatePodGroup(updatedSpecPodGroup.(client.Object))
if diff := cmp.Diff(oldPodGroup, podGroup); len(diff) != 0 {
return podGroup, pgctl.UpdatePodGroup(podGroup.(client.Object))
}
return podGroup, nil
} else if client.IgnoreNotFound(err) != nil {
return nil, fmt.Errorf("unable to get a PodGroup: %v", err)
} else {
Expand All @@ -60,14 +53,13 @@ func (jc *JobController) SyncPodGroup(job metav1.Object, specFunc FillPodGroupSp
newPodGroup.SetNamespace(job.GetNamespace())
newPodGroup.SetAnnotations(job.GetAnnotations())
newPodGroup.SetOwnerReferences([]metav1.OwnerReference{*jc.GenOwnerReference(job)})
updatedSpecPodGroup, err := specFunc(newPodGroup)
if err != nil {
if err = specFunc(newPodGroup); err != nil {
return nil, fmt.Errorf("unable to fill the spec of PodGroup, '%v': %v", klog.KObj(newPodGroup), err)
}

err = pgctl.CreatePodGroup(updatedSpecPodGroup.(client.Object))
err = pgctl.CreatePodGroup(newPodGroup)
if err != nil {
return updatedSpecPodGroup, fmt.Errorf("unable to create PodGroup: %v", err)
return podGroup, fmt.Errorf("unable to create PodGroup: %v", err)
}
createdPodGroupsCount.Inc()
}
Expand Down

0 comments on commit b885e4d

Please sign in to comment.