Skip to content

Commit

Permalink
Merge pull request #1 from djwhatle/v1.3.0-enable-partial-gang-schedule-cherrypick-exitcode-fix
Browse files Browse the repository at this point in the history

fix: tfjob with restartPolicy=ExitCode not work (kubeflow#1562)
  • Loading branch information
djwhatle authored Jul 26, 2022
2 parents abab340 + eee8d6b commit b157f01
Showing 1 changed file with 22 additions and 7 deletions.
29 changes: 22 additions & 7 deletions pkg/controller.v1/tensorflow/tfjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,20 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1
r.WorkQueue.AddAfter(tfJobKey, time.Duration(*tfJob.Spec.RunPolicy.ActiveDeadlineSeconds)*time.Second)
}
}

// When jobStatus already has a restarting condition and a running condition is appended,
// the restarting condition is removed from jobStatus by commonv1.filterOutCondition(),
// so we record the existing restarting condition here for later use.
var existingRestartingCondition *commonv1.JobCondition
for _, condition := range jobStatus.Conditions {
if condition.Type == commonv1.JobRestarting {
existingRestartingCondition = &commonv1.JobCondition{
Reason: condition.Reason,
Message: condition.Message,
}
}
}

// iterate the replica spec based on this order
allTypes := []commonv1.ReplicaType{
tensorflowv1.TFReplicaTypeChief,
Expand Down Expand Up @@ -504,14 +518,15 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1
}

if failed > 0 {
restart := false
for _, condition := range jobStatus.Conditions {
if condition.Type == commonv1.JobRestarting {
restart = true
// When jobStatus already has a restarting condition and a new running condition is appended,
// the restarting condition is removed from jobStatus by commonv1.filterOutCondition(),
// so we need to append the restarting condition back to jobStatus.
if existingRestartingCondition != nil {
err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, existingRestartingCondition.Reason, existingRestartingCondition.Message)
if err != nil {
commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err)
return err
}
}

if restart {
// The job is restarting, so there is no need to mark it as failed.
// We know this because the status condition is updated when reconciling the replicas.
distributed := trainingoperatorcommon.MapBoolToDistributed(isDistributed(tfJob))
Expand Down

0 comments on commit b157f01

Please sign in to comment.