-
Notifications
You must be signed in to change notification settings - Fork 71
Add job suspend semantics #196
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -95,7 +95,11 @@ func (jc *JobController) ReconcileJobs( | |
} | ||
|
||
oldStatus := jobStatus.DeepCopy() | ||
if commonutil.IsSucceeded(jobStatus) || commonutil.IsFailed(jobStatus) { | ||
jobSuspended, err := jc.Controller.JobSuspended(job) | ||
if err != nil { | ||
return err | ||
} | ||
if commonutil.IsSucceeded(jobStatus) || commonutil.IsFailed(jobStatus) || (jobSuspended != nil && *jobSuspended) { | ||
// If the Job is succeed or failed, delete all pods and services. | ||
if err := jc.DeletePodsAndServices(runPolicy, job, pods); err != nil { | ||
return err | ||
|
@@ -357,3 +361,8 @@ func (jc *JobController) CleanupJob(runPolicy *apiv1.RunPolicy, jobStatus apiv1. | |
func (jc *JobController) calcPGMinResources(minMember int32, replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec) *v1.ResourceList { | ||
return CalcPGMinResources(minMember, replicas, jc.PriorityClassLister.Get) | ||
} | ||
|
||
func (jc *JobController) JobSuspended(job interface{}) (*bool, error) { | ||
log.Infof("Not implemented.") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am wondering if we should merge this since the feature is not complete. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a base-class default function in case Job subclasses (TFJob, MPIJob, etc.) do not implement this method. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually, it will be overridden in any Job subclass that supports job suspend. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If |
||
return nil, nil | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My concern here is that `Suspend` is just a transition state: should we delete all the pods, or just the active ones, leaving the completed pods (succeeded/failed) as they are?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If anything, it should have the same semantics as a Kubernetes Job, where we delete the running pods.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the current implementation is consistent with batch/job.