From eee8d6bebff324cb65b26dd2bb81ec249595702d Mon Sep 17 00:00:00 2001 From: Cheimu <yimo@xiaohongshu.com> Date: Tue, 26 Jul 2022 06:34:13 +0800 Subject: [PATCH] fix: tfjob with restartPolicy=ExitCode not work (#1562) (cherry picked from commit 9cc1cc96ba6370621c56cba60e8643604b3139e9) --- .../tensorflow/tfjob_controller.go | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/pkg/controller.v1/tensorflow/tfjob_controller.go b/pkg/controller.v1/tensorflow/tfjob_controller.go index b0577e6132..eab638a73b 100644 --- a/pkg/controller.v1/tensorflow/tfjob_controller.go +++ b/pkg/controller.v1/tensorflow/tfjob_controller.go @@ -410,6 +410,20 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 r.WorkQueue.AddAfter(tfJobKey, time.Duration(*tfJob.Spec.RunPolicy.ActiveDeadlineSeconds)*time.Second) } } + + // When jobStatus already has a restarting condition and a running condition is appended, + // the restarting condition is removed from jobStatus by commonv1.filterOutCondition(), + // so we need to record the existing restarting condition for later use. 
+ var existingRestartingCondition *commonv1.JobCondition + for _, condition := range jobStatus.Conditions { + if condition.Type == commonv1.JobRestarting { + existingRestartingCondition = &commonv1.JobCondition{ + Reason: condition.Reason, + Message: condition.Message, + } + } + } + // iterate the replica spec based on this order allTypes := []commonv1.ReplicaType{ tensorflowv1.TFReplicaTypeChief, @@ -504,14 +518,15 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 } if failed > 0 { - restart := false - for _, condition := range jobStatus.Conditions { - if condition.Type == commonv1.JobRestarting { - restart = true + // When jobStatus already has a restarting condition and a new running condition is appended, + // the restarting condition is removed from jobStatus by commonv1.filterOutCondition(), + // so we need to append the restarting condition back to jobStatus. + if existingRestartingCondition != nil { + err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, existingRestartingCondition.Reason, existingRestartingCondition.Message) + if err != nil { + commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err) + return err } - } - - if restart { // job is restarting, no need to set it failed // we know it because we update the status condition when reconciling the replicas distributed := trainingoperatorcommon.MapBoolToDistributed(isDistributed(tfJob))