From 2ba6a99f4713304c363b0725dcae5d3433f70001 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 30 Aug 2024 10:37:47 +0200 Subject: [PATCH 1/3] No cleaning up a job if the job is suspended. Signed-off-by: Michal Szadkowski --- pkg/controller.v1/common/job.go | 2 +- pkg/controller.v1/tensorflow/job_test.go | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pkg/controller.v1/common/job.go b/pkg/controller.v1/common/job.go index 825afdf8b1..d8a494dd0c 100644 --- a/pkg/controller.v1/common/job.go +++ b/pkg/controller.v1/common/job.go @@ -421,7 +421,7 @@ func (jc *JobController) CleanupJob(runPolicy *apiv1.RunPolicy, jobStatus apiv1. currentTime := time.Now() metaObject, _ := job.(metav1.Object) ttl := runPolicy.TTLSecondsAfterFinished - if ttl == nil { + if ttl == nil || trainutil.IsJobSuspended(runPolicy) { return nil } duration := time.Second * time.Duration(*ttl) diff --git a/pkg/controller.v1/tensorflow/job_test.go b/pkg/controller.v1/tensorflow/job_test.go index c7e5a43c45..09df461f11 100644 --- a/pkg/controller.v1/tensorflow/job_test.go +++ b/pkg/controller.v1/tensorflow/job_test.go @@ -663,6 +663,18 @@ var _ = Describe("Test for controller.v1/common", func() { wantTFJobIsRemoved: false, wantErr: false, }), + Entry("No error with completionTime is nil if suspended", &cleanUpCases{ + tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, nil), + runPolicy: &kubeflowv1.RunPolicy{ + TTLSecondsAfterFinished: nil, + Suspend: ptr.To(true), + }, + jobStatus: kubeflowv1.JobStatus{ + CompletionTime: nil, + }, + wantTFJobIsRemoved: false, + wantErr: false, + }), Entry("Error is occurred since completionTime is nil", &cleanUpCases{ tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, ptr.To[int32](10)), runPolicy: &kubeflowv1.RunPolicy{ From 6a7748afb6c09da539519baa001c7ca609a421fd Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 30 Aug 2024 11:57:00 +0200 Subject: [PATCH 2/3] run fmt Signed-off-by: Michal Szadkowski --- pkg/controller.v1/tensorflow/job_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller.v1/tensorflow/job_test.go b/pkg/controller.v1/tensorflow/job_test.go index 09df461f11..3c36887f6b 100644 --- a/pkg/controller.v1/tensorflow/job_test.go +++ b/pkg/controller.v1/tensorflow/job_test.go @@ -667,7 +667,7 @@ var _ = Describe("Test for controller.v1/common", func() { tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, nil), runPolicy: &kubeflowv1.RunPolicy{ TTLSecondsAfterFinished: nil, - Suspend: ptr.To(true), + Suspend: ptr.To(true), }, jobStatus: kubeflowv1.JobStatus{ CompletionTime: nil, From cbc84567dd57e2177371a02c840d2d9dd269627c Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 30 Aug 2024 12:58:10 +0200 Subject: [PATCH 3/3] Another test case Signed-off-by: Michal Szadkowski --- pkg/controller.v1/tensorflow/job_test.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pkg/controller.v1/tensorflow/job_test.go b/pkg/controller.v1/tensorflow/job_test.go index 3c36887f6b..df146ef15a 100644 --- a/pkg/controller.v1/tensorflow/job_test.go +++ b/pkg/controller.v1/tensorflow/job_test.go @@ -675,6 +675,18 @@ var _ = Describe("Test for controller.v1/common", func() { wantTFJobIsRemoved: false, wantErr: false, }), + Entry("No error with TTL is set and completionTime is nil, if suspended", &cleanUpCases{ + tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, ptr.To[int32](10)), + runPolicy: &kubeflowv1.RunPolicy{ + TTLSecondsAfterFinished: ptr.To[int32](10), + Suspend: ptr.To(true), + }, + jobStatus: kubeflowv1.JobStatus{ + CompletionTime: nil, + }, + wantTFJobIsRemoved: false, + wantErr: false, + }), Entry("Error is occurred since completionTime is nil", &cleanUpCases{ tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, ptr.To[int32](10)), runPolicy: &kubeflowv1.RunPolicy{