Skip to content

Commit

Permalink
fix: tfjob with restartPolicy=ExitCode not work (#1562)
Browse files Browse the repository at this point in the history
  • Loading branch information
cheimu authored Jul 25, 2022
1 parent f5c5dea commit 9cc1cc9
Showing 1 changed file with 22 additions and 7 deletions.
29 changes: 22 additions & 7 deletions pkg/controller.v1/tensorflow/tfjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,20 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1
r.WorkQueue.AddAfter(tfJobKey, time.Duration(*tfJob.Spec.RunPolicy.ActiveDeadlineSeconds)*time.Second)
}
}

// When jobStatus already holds a restarting condition and a running condition is appended,
// the restarting condition is removed from jobStatus by commonv1.filterOutCondition(),
// so we record the existing restarting condition here for later use.
var existingRestartingCondition *commonv1.JobCondition
for _, condition := range jobStatus.Conditions {
if condition.Type == commonv1.JobRestarting {
existingRestartingCondition = &commonv1.JobCondition{
Reason: condition.Reason,
Message: condition.Message,
}
}
}

// iterate the replica spec based on this order
allTypes := []commonv1.ReplicaType{
kubeflowv1.TFJobReplicaTypeChief,
Expand Down Expand Up @@ -506,14 +520,15 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1
}

if failed > 0 {
restart := false
for _, condition := range jobStatus.Conditions {
if condition.Type == commonv1.JobRestarting {
restart = true
// When jobStatus already held a restarting condition and a new running condition was appended,
// the restarting condition was removed from jobStatus by commonv1.filterOutCondition(),
// so we need to append the restarting condition back to jobStatus.
if existingRestartingCondition != nil {
err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, existingRestartingCondition.Reason, existingRestartingCondition.Message)
if err != nil {
commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err)
return err
}
}

if restart {
// job is restarting, no need to set it failed
// we know it because we update the status condition when reconciling the replicas
trainingoperatorcommon.RestartedJobsCounterInc(tfJob.Namespace, kubeflowv1.TFJobFrameworkName)
Expand Down

0 comments on commit 9cc1cc9

Please sign in to comment.