From eee8d6bebff324cb65b26dd2bb81ec249595702d Mon Sep 17 00:00:00 2001
From: Cheimu <yimo@xiaohongshu.com>
Date: Tue, 26 Jul 2022 06:34:13 +0800
Subject: [PATCH] fix: tfjob with restartPolicy=ExitCode does not work (#1562)

(cherry picked from commit 9cc1cc96ba6370621c56cba60e8643604b3139e9)
---
 .../tensorflow/tfjob_controller.go            | 29 ++++++++++++++-----
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/pkg/controller.v1/tensorflow/tfjob_controller.go b/pkg/controller.v1/tensorflow/tfjob_controller.go
index b0577e6132..eab638a73b 100644
--- a/pkg/controller.v1/tensorflow/tfjob_controller.go
+++ b/pkg/controller.v1/tensorflow/tfjob_controller.go
@@ -410,6 +410,20 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1
 			r.WorkQueue.AddAfter(tfJobKey, time.Duration(*tfJob.Spec.RunPolicy.ActiveDeadlineSeconds)*time.Second)
 		}
 	}
+
+	// When jobStatus already has a restarting condition and a running condition is appended,
+	// the restarting condition is removed from jobStatus by commonv1.filterOutCondition(),
+	// so record the existing restarting condition here for later use.
+	var existingRestartingCondition *commonv1.JobCondition
+	for _, condition := range jobStatus.Conditions {
+		if condition.Type == commonv1.JobRestarting {
+			existingRestartingCondition = &commonv1.JobCondition{
+				Reason:  condition.Reason,
+				Message: condition.Message,
+			}
+		}
+	}
+
 	// iterate the replica spec based on this order
 	allTypes := []commonv1.ReplicaType{
 		tensorflowv1.TFReplicaTypeChief,
@@ -504,14 +518,15 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1
 		}
 
 		if failed > 0 {
-			restart := false
-			for _, condition := range jobStatus.Conditions {
-				if condition.Type == commonv1.JobRestarting {
-					restart = true
+			// When jobStatus had a restarting condition and a new running condition was appended,
+			// the restarting condition was removed from jobStatus by commonv1.filterOutCondition(),
+			// so append the recorded restarting condition back to jobStatus.
+			if existingRestartingCondition != nil {
+				err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, existingRestartingCondition.Reason, existingRestartingCondition.Message)
+				if err != nil {
+					commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err)
+					return err
 				}
-			}
-
-			if restart {
 				// job is restarting, no need to set it failed
 				// we know it because we update the status condition when reconciling the replicas
 				distributed := trainingoperatorcommon.MapBoolToDistributed(isDistributed(tfJob))