diff --git a/Makefile b/Makefile index 29078c60..bcacd6cd 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ VERSION ?= v0.0.0-dev BUNDLE_VERSION ?= $(VERSION:v%=%) # APPWRAPPER_VERSION defines the default version of the AppWrapper controller -APPWRAPPER_VERSION ?= v0.21.1 +APPWRAPPER_VERSION ?= v0.22.0 APPWRAPPER_REPO ?= github.com/project-codeflare/appwrapper APPWRAPPER_CRD ?= ${APPWRAPPER_REPO}/config/crd?ref=${APPWRAPPER_VERSION} diff --git a/README.md b/README.md index 5b186e6c..0ce51b59 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,9 @@ CodeFlare Stack Compatibility Matrix | Component | Version | |------------------------------|---------------------------------------------------------------------------------------------------| -| CodeFlare Operator | [v1.5.0](https://github.com/project-codeflare/codeflare-operator/releases/tag/v1.5.0) | -| CodeFlare-SDK | [v0.17.0](https://github.com/project-codeflare/codeflare-sdk/releases/tag/v0.17.0) | -| AppWrapper | [v0.20.2](https://github.com/project-codeflare/appwrapper/releases/tag/v0.20.2) | +| CodeFlare Operator | [v1.6.0](https://github.com/project-codeflare/codeflare-operator/releases/tag/v1.6.0) | +| CodeFlare-SDK | [v0.18.0](https://github.com/project-codeflare/codeflare-sdk/releases/tag/v0.18.0) | +| AppWrapper | [v0.22.0](https://github.com/project-codeflare/appwrapper/releases/tag/v0.22.0) | | KubeRay | [v1.1.0](https://github.com/opendatahub-io/kuberay/releases/tag/v1.1.0) | | Kueue | [v0.7.0](https://github.com/opendatahub-io/kueue/releases/tag/v0.7.0) | diff --git a/config/crd/appwrapper/kustomization.yaml b/config/crd/appwrapper/kustomization.yaml index 4a83e2ee..d8c3d2e5 100644 --- a/config/crd/appwrapper/kustomization.yaml +++ b/config/crd/appwrapper/kustomization.yaml @@ -1,4 +1,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: -- github.com/project-codeflare/appwrapper/config/crd?ref=v0.21.1 +- github.com/project-codeflare/appwrapper/config/crd?ref=v0.22.0 diff --git a/config/manager/params.env b/config/manager/params.env index f591a890..351479ef 100644 --- a/config/manager/params.env +++ b/config/manager/params.env @@ -1,2 +1,2 @@ -codeflare-operator-controller-image=quay.io/opendatahub/codeflare-operator:v1.5.0 +codeflare-operator-controller-image=quay.io/opendatahub/codeflare-operator:v1.6.0 namespace=opendatahub diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 7009a421..879d6bc8 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -14,6 +14,14 @@ rules: - patch - update - watch +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch - apiGroups: - "" resources: diff --git a/go.mod b/go.mod index 3c32d1d8..a36f1c4b 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/opendatahub-io/opendatahub-operator/v2 v2.10.0 github.com/openshift/api v0.0.0-20230823114715-5fdd7511b790 github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c - github.com/project-codeflare/appwrapper v0.21.1 + github.com/project-codeflare/appwrapper v0.22.0 github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e github.com/ray-project/kuberay/ray-operator v1.1.1 go.uber.org/zap v1.27.0 @@ -35,7 +35,7 @@ replace go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp => go.open replace github.com/jackc/pgx/v4 => github.com/jackc/pgx/v5 v5.5.4 // These replace directives support the backlevel go version required by ODH build -replace github.com/project-codeflare/appwrapper v0.21.1 => github.com/project-codeflare/appwrapper v0.21.2-0.20240712173553-5b007c947b37 +replace github.com/project-codeflare/appwrapper v0.22.0 => github.com/project-codeflare/appwrapper v0.22.1-0.20240719212005-aab106b2126e replace sigs.k8s.io/kueue v0.7.1 => github.com/opendatahub-io/kueue v0.7.0-odh-test diff --git a/go.sum b/go.sum index fa550784..33662f83 100644 --- a/go.sum +++ b/go.sum @@ -246,8 +246,8 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/project-codeflare/appwrapper v0.21.2-0.20240712173553-5b007c947b37 h1:x4qdbN98B9gtaU7pseJWABZzwoDawXLC5QMlx0idXxc= -github.com/project-codeflare/appwrapper v0.21.2-0.20240712173553-5b007c947b37/go.mod h1:gKjO+iRtMIdBvIBYmN+VciL9kzWmkfwgk/+24wCLhSM= +github.com/project-codeflare/appwrapper v0.22.1-0.20240719212005-aab106b2126e h1:cIsCTtAZaT2fsQG/QGUm4/wvJnobYawCPZwTwVE2DGo= +github.com/project-codeflare/appwrapper v0.22.1-0.20240719212005-aab106b2126e/go.mod h1:gKjO+iRtMIdBvIBYmN+VciL9kzWmkfwgk/+24wCLhSM= github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e h1:juFd1dQyioeMxbVE6F0YD25ozm/jiqJE+MpDhu8p22k= github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y= github.com/prometheus/client_golang v1.18.0 h1:HzFfmkOzH5Q8L8G+kSJKUx5dtG87sewO+FoDDqP5Tbk= diff --git a/pkg/controllers/appwrapper_controller.go b/pkg/controllers/appwrapper_controller.go index cab8ef37..f15a4ef2 100644 --- a/pkg/controllers/appwrapper_controller.go +++ b/pkg/controllers/appwrapper_controller.go @@ -41,3 +41,6 @@ package controllers // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/finalizers,verbs=update // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=resourceflavors,verbs=get;list;watch // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloadpriorityclasses,verbs=get;list;watch + +// permission to watch nodes for Autopilot integration +//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch diff --git a/pkg/controllers/raycluster_controller.go b/pkg/controllers/raycluster_controller.go index d0c3cbab..5691a92f 100644 --- a/pkg/controllers/raycluster_controller.go +++ b/pkg/controllers/raycluster_controller.go @@ -213,6 +213,10 @@ func (r *RayClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{RequeueAfter: requeueTime}, err } + if err := r.deleteHeadPodIfMissingImagePullSecrets(ctx, cluster); err != nil { + return ctrl.Result{RequeueAfter: requeueTime}, err + } + _, err = r.kubeClient.RbacV1().ClusterRoleBindings().Apply(ctx, desiredOAuthClusterRoleBinding(cluster), metav1.ApplyOptions{FieldManager: controllerName, Force: true}) if err != nil { logger.Error(err, "Failed to update OAuth ClusterRoleBinding") @@ -470,6 +474,7 @@ func generateCACertificate() ([]byte, []byte, error) { return privateKeyPem, certPem, nil } + func desiredWorkersNetworkPolicy(cluster *rayv1.RayCluster) *networkingv1ac.NetworkPolicyApplyConfiguration { return networkingv1ac.NetworkPolicy(cluster.Name+"-workers", cluster.Namespace). WithLabels(map[string]string{RayClusterNameLabel: cluster.Name}). @@ -486,6 +491,7 @@ func desiredWorkersNetworkPolicy(cluster *rayv1.RayCluster) *networkingv1ac.Netw metav1ac.OwnerReference().WithUID(cluster.UID).WithName(cluster.Name).WithKind(cluster.Kind).WithAPIVersion(cluster.APIVersion).WithController(true), ) } + func desiredHeadNetworkPolicy(cluster *rayv1.RayCluster, cfg *config.KubeRayConfiguration, kubeRayNamespaces []string) *networkingv1ac.NetworkPolicyApplyConfiguration { allSecuredPorts := []*networkingv1ac.NetworkPolicyPortApplyConfiguration{ networkingv1ac.NetworkPolicyPort().WithProtocol(corev1.ProtocolTCP).WithPort(intstr.FromInt(8443)), @@ -544,6 +550,49 @@ func desiredHeadNetworkPolicy(cluster *rayv1.RayCluster, cfg *config.KubeRayConf ) } +func (r *RayClusterReconciler) deleteHeadPodIfMissingImagePullSecrets(ctx context.Context, cluster *rayv1.RayCluster) error { + serviceAccount, err := r.kubeClient.CoreV1().ServiceAccounts(cluster.Namespace).Get(ctx, oauthServiceAccountNameFromCluster(cluster), metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get OAuth ServiceAccount: %w", err) + } + + headPod, err := getHeadPod(ctx, r, cluster) + if err != nil { + return fmt.Errorf("failed to get head pod: %w", err) + } + + if headPod == nil { + return nil + } + + missingSecrets := map[string]bool{} + for _, secret := range serviceAccount.ImagePullSecrets { + missingSecrets[secret.Name] = true + } + for _, secret := range headPod.Spec.ImagePullSecrets { + delete(missingSecrets, secret.Name) + } + if len(missingSecrets) > 0 { + if err := r.kubeClient.CoreV1().Pods(headPod.Namespace).Delete(ctx, headPod.Name, metav1.DeleteOptions{}); err != nil { + return fmt.Errorf("failed to delete head pod: %w", err) + } + } + return nil +} + +func getHeadPod(ctx context.Context, r *RayClusterReconciler, cluster *rayv1.RayCluster) (*corev1.Pod, error) { + podList, err := r.kubeClient.CoreV1().Pods(cluster.Namespace).List(ctx, metav1.ListOptions{ + LabelSelector: fmt.Sprintf("ray.io/node-type=head,ray.io/cluster=%s", cluster.Name), + }) + if err != nil { + return nil, err + } + if len(podList.Items) > 0 { + return &podList.Items[0], nil + } + return nil, nil +} + // SetupWithManager sets up the controller with the Manager. func (r *RayClusterReconciler) SetupWithManager(mgr ctrl.Manager) error { r.kubeClient = kubernetes.NewForConfigOrDie(mgr.GetConfig()) @@ -577,7 +626,8 @@ func (r *RayClusterReconciler) SetupWithManager(mgr ctrl.Manager) error { NamespacedName: client.ObjectKey{ Name: name, Namespace: namespace, - }}} + }, + }} }), ) if r.IsOpenShift { diff --git a/pkg/controllers/raycluster_controller_test.go b/pkg/controllers/raycluster_controller_test.go index 94958b33..e0a7e969 100644 --- a/pkg/controllers/raycluster_controller_test.go +++ b/pkg/controllers/raycluster_controller_test.go @@ -33,7 +33,7 @@ import ( var _ = Describe("RayCluster controller", func() { Context("RayCluster controller test", func() { - var rayClusterName = "test-raycluster" + rayClusterName := "test-raycluster" var namespaceName string BeforeEach(func(ctx SpecContext) { By("Creating a namespace for running the tests.") @@ -145,6 +145,53 @@ var _ = Describe("RayCluster controller", func() { }).WithTimeout(time.Second * 10).Should(WithTransform(OwnerReferenceName, Equal(foundRayCluster.Name))) }) + It("should delete the head pod if missing image pull secrets", func(ctx SpecContext) { + foundRayCluster, err := rayClient.RayV1().RayClusters(namespaceName).Get(ctx, rayClusterName, metav1.GetOptions{}) + Expect(err).To(Not(HaveOccurred())) + + Eventually(func() (*corev1.ServiceAccount, error) { + return k8sClient.CoreV1().ServiceAccounts(namespaceName).Get(ctx, oauthServiceAccountNameFromCluster(foundRayCluster), metav1.GetOptions{}) + }).WithTimeout(time.Second * 10).Should(WithTransform(OwnerReferenceKind, Equal("RayCluster"))) + + headPodName := "head-pod" + headPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: headPodName, + Namespace: namespaceName, + Labels: map[string]string{ + "ray.io/node-type": "head", + "ray.io/cluster": foundRayCluster.Name, + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "head-container", + Image: "busybox", + }, + }, + }, + } + _, err = k8sClient.CoreV1().Pods(namespaceName).Create(ctx, headPod, metav1.CreateOptions{}) + Expect(err).To(Not(HaveOccurred())) + + Eventually(func() (*corev1.Pod, error) { + return k8sClient.CoreV1().Pods(namespaceName).Get(ctx, headPodName, metav1.GetOptions{}) + }).WithTimeout(time.Second * 10).ShouldNot(BeNil()) + + sa, err := k8sClient.CoreV1().ServiceAccounts(namespaceName).Get(ctx, oauthServiceAccountNameFromCluster(foundRayCluster), metav1.GetOptions{}) + Expect(err).To(Not(HaveOccurred())) + + sa.ImagePullSecrets = append(sa.ImagePullSecrets, corev1.LocalObjectReference{Name: "test-image-pull-secret"}) + _, err = k8sClient.CoreV1().ServiceAccounts(namespaceName).Update(ctx, sa, metav1.UpdateOptions{}) + Expect(err).To(Not(HaveOccurred())) + + Eventually(func() error { + _, err := k8sClient.CoreV1().Pods(namespaceName).Get(ctx, headPodName, metav1.GetOptions{}) + return err + }).WithTimeout(time.Second * 10).Should(Satisfy(errors.IsNotFound)) + }) + It("should remove CRB when the RayCluster is deleted", func(ctx SpecContext) { foundRayCluster, err := rayClient.RayV1().RayClusters(namespaceName).Get(ctx, rayClusterName, metav1.GetOptions{}) Expect(err).To(Not(HaveOccurred())) @@ -157,7 +204,6 @@ var _ = Describe("RayCluster controller", func() { return err }).WithTimeout(time.Second * 10).Should(Satisfy(errors.IsNotFound)) }) - }) }) diff --git a/pkg/controllers/raycluster_webhook_test.go b/pkg/controllers/raycluster_webhook_test.go index d8e4f8c4..8e0c375a 100644 --- a/pkg/controllers/raycluster_webhook_test.go +++ b/pkg/controllers/raycluster_webhook_test.go @@ -309,16 +309,15 @@ func TestValidateCreate(t *testing.T) { test.Expect(err).ShouldNot(HaveOccurred(), "Expected no errors on call to ValidateCreate function") }) - // Negative Test: Invalid RayCluster with EnableIngress set to true - invalidRayCluster := validRayCluster.DeepCopy() - t.Run("Negative: Expected errors on call to ValidateCreate function due to EnableIngress set to True", func(t *testing.T) { + invalidRayCluster := validRayCluster.DeepCopy() invalidRayCluster.Spec.HeadGroupSpec.EnableIngress = support.Ptr(true) _, err := rcWebhook.ValidateCreate(test.Ctx(), runtime.Object(invalidRayCluster)) test.Expect(err).Should(HaveOccurred(), "Expected errors on call to ValidateCreate function due to EnableIngress set to True") }) t.Run("Negative: Expected errors on call to ValidateCreate function due to manipulated OAuth Proxy Container", func(t *testing.T) { + invalidRayCluster := validRayCluster.DeepCopy() for i, headContainer := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Containers { if headContainer.Name == oauthProxyContainerName { invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Containers[i].Args = []string{"--invalid-arg"} @@ -330,6 +329,7 @@ func TestValidateCreate(t *testing.T) { }) t.Run("Negative: Expected errors on call to ValidateCreate function due to manipulated OAuth Proxy Volume", func(t *testing.T) { + invalidRayCluster := validRayCluster.DeepCopy() for i, headVolume := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Volumes { if headVolume.Name == oauthProxyVolumeName { invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Volumes[i].Secret.SecretName = "invalid-secret-name" @@ -341,6 +341,7 @@ func TestValidateCreate(t *testing.T) { }) t.Run("Negative: Expected errors on call to ValidateCreate function due to manipulated head group service account name", func(t *testing.T) { + invalidRayCluster := validRayCluster.DeepCopy() invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName = "invalid-service-account-name" _, err = rcWebhook.ValidateCreate(test.Ctx(), runtime.Object(invalidRayCluster)) test.Expect(err).Should(HaveOccurred(), "Expected errors on call to ValidateCreate function due to manipulated head group service account name") @@ -535,17 +536,15 @@ func TestValidateUpdate(t *testing.T) { test.Expect(err).ShouldNot(HaveOccurred(), "Expected no errors on call to ValidateUpdate function") }) - // Negative Test Cases - trueBool := true - invalidRayCluster := validRayCluster.DeepCopy() - t.Run("Negative: Expected errors on call to ValidateUpdate function due to EnableIngress set to True", func(t *testing.T) { - invalidRayCluster.Spec.HeadGroupSpec.EnableIngress = &trueBool + invalidRayCluster := validRayCluster.DeepCopy() + invalidRayCluster.Spec.HeadGroupSpec.EnableIngress = support.Ptr(true) _, err := rcWebhook.ValidateUpdate(test.Ctx(), runtime.Object(validRayCluster), runtime.Object(invalidRayCluster)) test.Expect(err).Should(HaveOccurred(), "Expected errors on call to ValidateUpdate function due to EnableIngress set to True") }) t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated OAuth Proxy Container", func(t *testing.T) { + invalidRayCluster := validRayCluster.DeepCopy() for i, headContainer := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Containers { if headContainer.Name == oauthProxyContainerName { invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Containers[i].Args = []string{"--invalid-arg"} @@ -557,6 +556,7 @@ func TestValidateUpdate(t *testing.T) { }) t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated OAuth Proxy Volume", func(t *testing.T) { + invalidRayCluster := validRayCluster.DeepCopy() for i, headVolume := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Volumes { if headVolume.Name == oauthProxyVolumeName { invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Volumes[i].Secret.SecretName = "invalid-secret-name" @@ -568,12 +568,14 @@ func TestValidateUpdate(t *testing.T) { }) t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated head group service account name", func(t *testing.T) { + invalidRayCluster := validRayCluster.DeepCopy() invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName = "invalid-service-account-name" _, err := rcWebhook.ValidateUpdate(test.Ctx(), runtime.Object(validRayCluster), runtime.Object(invalidRayCluster)) test.Expect(err).Should(HaveOccurred(), "Expected errors on call to ValidateUpdate function due to manipulated head group service account name") }) t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated Init Container in the head group", func(t *testing.T) { + invalidRayCluster := validRayCluster.DeepCopy() for i, headInitContainer := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.InitContainers { if headInitContainer.Name == "create-cert" { invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.InitContainers[i].Command = []string{"manipulated command"} @@ -585,6 +587,7 @@ func TestValidateUpdate(t *testing.T) { }) t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated Init Container in the worker group", func(t *testing.T) { + invalidRayCluster := validRayCluster.DeepCopy() for _, workerGroup := range invalidRayCluster.Spec.WorkerGroupSpecs { for i, workerInitContainer := range workerGroup.Template.Spec.InitContainers { if workerInitContainer.Name == "create-cert" { @@ -598,6 +601,7 @@ func TestValidateUpdate(t *testing.T) { }) t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated Volume in the head group", func(t *testing.T) { + invalidRayCluster := validRayCluster.DeepCopy() for i, headVolume := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Volumes { if headVolume.Name == "ca-vol" { invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Volumes[i].Secret.SecretName = "invalid-secret-name" @@ -609,6 +613,7 @@ func TestValidateUpdate(t *testing.T) { }) t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated Volume in the worker group", func(t *testing.T) { + invalidRayCluster := validRayCluster.DeepCopy() for _, workerGroup := range invalidRayCluster.Spec.WorkerGroupSpecs { for i, workerVolume := range workerGroup.Template.Spec.Volumes { if workerVolume.Name == "ca-vol" { @@ -622,6 +627,7 @@ func TestValidateUpdate(t *testing.T) { }) t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated env vars in the head group", func(t *testing.T) { + invalidRayCluster := validRayCluster.DeepCopy() for i, headEnvVar := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Containers[0].Env { if headEnvVar.Name == "RAY_USE_TLS" { invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Containers[0].Env[i].Value = "invalid-value" @@ -633,6 +639,7 @@ func TestValidateUpdate(t *testing.T) { }) t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated env vars in the worker group", func(t *testing.T) { + invalidRayCluster := validRayCluster.DeepCopy() for _, workerGroup := range invalidRayCluster.Spec.WorkerGroupSpecs { for i, workerEnvVar := range workerGroup.Template.Spec.Containers[0].Env { if workerEnvVar.Name == "RAY_USE_TLS" {