From c8fcd86cc8379ffcbff94674c8e0cf915a88a209 Mon Sep 17 00:00:00 2001 From: Tina Wu Date: Wed, 14 Aug 2024 02:52:01 -0700 Subject: [PATCH 1/4] log error and create event --- ray-operator/controllers/ray/rayjob_controller.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index a84194e14f..c223060a01 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -508,6 +508,8 @@ func (r *RayJobReconciler) createNewK8sJob(ctx context.Context, rayJobInstance * // Create the Kubernetes Job if err := r.Client.Create(ctx, job); err != nil { + logger.Error(err, "Failed to create new Kubernetes Job") + r.Recorder.Eventf(rayJobInstance, corev1.EventTypeWarning, "k8sJobCreationFailed", "Failed to create Kubernetes Job %s: %v", job.Name, err) return err } logger.Info("Kubernetes Job created", "RayJob", rayJobInstance.Name, "Kubernetes Job", job.Name) From bee5864b29c74357fddc0b19f902e4d76d59dec1 Mon Sep 17 00:00:00 2001 From: Tina Wu Date: Wed, 14 Aug 2024 17:19:39 -0700 Subject: [PATCH 2/4] add createNewK8sJob unit test --- .../controllers/ray/rayjob_controller.go | 2 +- .../ray/rayjob_controller_unit_test.go | 60 +++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index c223060a01..1b911df0f1 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -509,7 +509,7 @@ func (r *RayJobReconciler) createNewK8sJob(ctx context.Context, rayJobInstance * // Create the Kubernetes Job if err := r.Client.Create(ctx, job); err != nil { logger.Error(err, "Failed to create new Kubernetes Job") - r.Recorder.Eventf(rayJobInstance, corev1.EventTypeWarning, "k8sJobCreationFailed", "Failed to create Kubernetes Job %s: %v", job.Name, err) + r.Recorder.Eventf(rayJobInstance, corev1.EventTypeWarning, "k8sJobCreationFailed", "Failed to create new Kubernetes Job %s: %v", job.Name, err) return err } logger.Info("Kubernetes Job created", "RayJob", rayJobInstance.Name, "Kubernetes Job", job.Name) diff --git a/ray-operator/controllers/ray/rayjob_controller_unit_test.go b/ray-operator/controllers/ray/rayjob_controller_unit_test.go index b82656d902..55a7cd2813 100644 --- a/ray-operator/controllers/ray/rayjob_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_unit_test.go @@ -2,6 +2,7 @@ package ray import ( "context" + "strings" "testing" "github.com/stretchr/testify/assert" @@ -12,10 +13,13 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" clientFake "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" utils "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" + "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/scheme" ) func TestCreateK8sJobIfNeed(t *testing.T) { @@ -370,3 +374,59 @@ func TestValidateRayJobSpec(t *testing.T) { }) assert.Error(t, err, "The RayJob is invalid because the backoffLimit must be a positive integer.") } + +func TestFailedCreatek8sJob(t *testing.T) { + + rayJob := &rayv1.RayJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-rayjob", + Namespace: "default", + }, + } + + submitterTemplate := corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-submit-pod", + Namespace: "default", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "ray-submit", + Image: "rayproject/ray:latest", + }, + }, + }, + } + + fakeClient := clientFake.NewClientBuilder().WithInterceptorFuncs(interceptor.Funcs{ + Create: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.CreateOption) error { + return utils.ErrFailedCreateWorkerPod + }, + }).WithScheme(scheme.Scheme).Build() + + recorder := record.NewFakeRecorder(100) + + reconciler := &RayJobReconciler{ + Client: fakeClient, + Recorder: recorder, + Scheme: scheme.Scheme, + } + + err := reconciler.createNewK8sJob(context.Background(), rayJob, submitterTemplate) + + assert.NotNil(t, err, "Expected error due to simulated job creation failure") + + var foundFailureEvent bool + events := []string{} + for len(recorder.Events) > 0 { + event := <-recorder.Events + if strings.Contains(event, "Failed to create new Kubernetes Job") { + foundFailureEvent = true + break + } + events = append(events, event) + } + + assert.Truef(t, foundFailureEvent, "Expected event to be generated for job creation failure, got events: %s", strings.Join(events, "\n")) +} \ No newline at end of file From 727805b4117f46826d5a837c2031f5d6b5ee67be Mon Sep 17 00:00:00 2001 From: Tina Wu Date: Wed, 14 Aug 2024 22:34:17 -0700 Subject: [PATCH 3/4] fix lint error --- .../controllers/ray/rayjob_controller_unit_test.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ray-operator/controllers/ray/rayjob_controller_unit_test.go b/ray-operator/controllers/ray/rayjob_controller_unit_test.go index 55a7cd2813..21e6aa4f68 100644 --- a/ray-operator/controllers/ray/rayjob_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_unit_test.go @@ -376,7 +376,6 @@ func TestValidateRayJobSpec(t *testing.T) { } func TestFailedCreatek8sJob(t *testing.T) { - rayJob := &rayv1.RayJob{ ObjectMeta: metav1.ObjectMeta{ Name: "test-rayjob", @@ -386,7 +385,7 @@ func TestFailedCreatek8sJob(t *testing.T) { submitterTemplate := corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-submit-pod", + Name: "test-submit-pod", Namespace: "default", }, Spec: corev1.PodSpec{ @@ -398,7 +397,7 @@ func TestFailedCreatek8sJob(t *testing.T) { }, }, } - + fakeClient := clientFake.NewClientBuilder().WithInterceptorFuncs(interceptor.Funcs{ Create: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.CreateOption) error { return utils.ErrFailedCreateWorkerPod @@ -408,9 +407,9 @@ func TestFailedCreatek8sJob(t *testing.T) { recorder := record.NewFakeRecorder(100) reconciler := &RayJobReconciler{ - Client: fakeClient, + Client: fakeClient, Recorder: recorder, - Scheme: scheme.Scheme, + Scheme: scheme.Scheme, } err := reconciler.createNewK8sJob(context.Background(), rayJob, submitterTemplate) @@ -429,4 +428,4 @@ func TestFailedCreatek8sJob(t *testing.T) { } assert.Truef(t, foundFailureEvent, "Expected event to be generated for job creation failure, got events: %s", strings.Join(events, "\n")) -} \ No newline at end of file +} From a395d691873897c0e89a6069d667b30bea66790a Mon Sep 17 00:00:00 2001 From: Tina Wu Date: Mon, 19 Aug 2024 00:30:58 -0500 Subject: [PATCH 4/4] rename event reason --- ray-operator/controllers/ray/rayjob_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 1b911df0f1..d20f1b9b0b 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -513,7 +513,7 @@ func (r *RayJobReconciler) createNewK8sJob(ctx context.Context, rayJobInstance * return err } logger.Info("Kubernetes Job created", "RayJob", rayJobInstance.Name, "Kubernetes Job", job.Name) - r.Recorder.Eventf(rayJobInstance, corev1.EventTypeNormal, "Created", "Created Kubernetes Job %s", job.Name) + r.Recorder.Eventf(rayJobInstance, corev1.EventTypeNormal, "k8sJobCreationCreated", "Created Kubernetes Job %s", job.Name) return nil }