Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristianZaccaria committed Jul 26, 2024
2 parents 28a02a0 + 3658620 commit ef081f9
Show file tree
Hide file tree
Showing 11 changed files with 135 additions and 21 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ VERSION ?= v0.0.0-dev
BUNDLE_VERSION ?= $(VERSION:v%=%)

# APPWRAPPER_VERSION defines the default version of the AppWrapper controller
APPWRAPPER_VERSION ?= v0.21.1
APPWRAPPER_VERSION ?= v0.22.0
APPWRAPPER_REPO ?= github.com/project-codeflare/appwrapper
APPWRAPPER_CRD ?= ${APPWRAPPER_REPO}/config/crd?ref=${APPWRAPPER_VERSION}

Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ CodeFlare Stack Compatibility Matrix

| Component | Version |
|------------------------------|---------------------------------------------------------------------------------------------------|
| CodeFlare Operator | [v1.5.0](https://github.com/project-codeflare/codeflare-operator/releases/tag/v1.5.0) |
| CodeFlare-SDK | [v0.17.0](https://github.com/project-codeflare/codeflare-sdk/releases/tag/v0.17.0) |
| AppWrapper | [v0.20.2](https://github.com/project-codeflare/appwrapper/releases/tag/v0.20.2) |
| CodeFlare Operator | [v1.6.0](https://github.com/project-codeflare/codeflare-operator/releases/tag/v1.6.0) |
| CodeFlare-SDK | [v0.18.0](https://github.com/project-codeflare/codeflare-sdk/releases/tag/v0.18.0) |
| AppWrapper | [v0.22.0](https://github.com/project-codeflare/appwrapper/releases/tag/v0.22.0) |
| KubeRay | [v1.1.0](https://github.com/opendatahub-io/kuberay/releases/tag/v1.1.0) |
| Kueue | [v0.7.0](https://github.com/opendatahub-io/kueue/releases/tag/v0.7.0) |
<!-- Compatibility Matrix end -->
Expand Down
2 changes: 1 addition & 1 deletion config/crd/appwrapper/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- github.com/project-codeflare/appwrapper/config/crd?ref=v0.21.1
- github.com/project-codeflare/appwrapper/config/crd?ref=v0.22.0
2 changes: 1 addition & 1 deletion config/manager/params.env
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
codeflare-operator-controller-image=quay.io/opendatahub/codeflare-operator:v1.5.0
codeflare-operator-controller-image=quay.io/opendatahub/codeflare-operator:v1.6.0
namespace=opendatahub
8 changes: 8 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ rules:
- patch
- update
- watch
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ require (
github.com/opendatahub-io/opendatahub-operator/v2 v2.10.0
github.com/openshift/api v0.0.0-20230823114715-5fdd7511b790
github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c
github.com/project-codeflare/appwrapper v0.21.1
github.com/project-codeflare/appwrapper v0.22.0
github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e
github.com/ray-project/kuberay/ray-operator v1.1.1
go.uber.org/zap v1.27.0
Expand All @@ -35,7 +35,7 @@ replace go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp => go.open
replace github.com/jackc/pgx/v4 => github.com/jackc/pgx/v5 v5.5.4

// These replace directives support the backlevel go version required by ODH build
replace github.com/project-codeflare/appwrapper v0.21.1 => github.com/project-codeflare/appwrapper v0.21.2-0.20240712173553-5b007c947b37
replace github.com/project-codeflare/appwrapper v0.22.0 => github.com/project-codeflare/appwrapper v0.22.1-0.20240719212005-aab106b2126e

replace sigs.k8s.io/kueue v0.7.1 => github.com/opendatahub-io/kueue v0.7.0-odh-test

Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,8 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/project-codeflare/appwrapper v0.21.2-0.20240712173553-5b007c947b37 h1:x4qdbN98B9gtaU7pseJWABZzwoDawXLC5QMlx0idXxc=
github.com/project-codeflare/appwrapper v0.21.2-0.20240712173553-5b007c947b37/go.mod h1:gKjO+iRtMIdBvIBYmN+VciL9kzWmkfwgk/+24wCLhSM=
github.com/project-codeflare/appwrapper v0.22.1-0.20240719212005-aab106b2126e h1:cIsCTtAZaT2fsQG/QGUm4/wvJnobYawCPZwTwVE2DGo=
github.com/project-codeflare/appwrapper v0.22.1-0.20240719212005-aab106b2126e/go.mod h1:gKjO+iRtMIdBvIBYmN+VciL9kzWmkfwgk/+24wCLhSM=
github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e h1:juFd1dQyioeMxbVE6F0YD25ozm/jiqJE+MpDhu8p22k=
github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
github.com/prometheus/client_golang v1.18.0 h1:HzFfmkOzH5Q8L8G+kSJKUx5dtG87sewO+FoDDqP5Tbk=
Expand Down
3 changes: 3 additions & 0 deletions pkg/controllers/appwrapper_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,6 @@ package controllers
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/finalizers,verbs=update
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=resourceflavors,verbs=get;list;watch
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloadpriorityclasses,verbs=get;list;watch

// permission to watch nodes for Autopilot integration
//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
52 changes: 51 additions & 1 deletion pkg/controllers/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,10 @@ func (r *RayClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return ctrl.Result{RequeueAfter: requeueTime}, err
}

if err := r.deleteHeadPodIfMissingImagePullSecrets(ctx, cluster); err != nil {
return ctrl.Result{RequeueAfter: requeueTime}, err
}

_, err = r.kubeClient.RbacV1().ClusterRoleBindings().Apply(ctx, desiredOAuthClusterRoleBinding(cluster), metav1.ApplyOptions{FieldManager: controllerName, Force: true})
if err != nil {
logger.Error(err, "Failed to update OAuth ClusterRoleBinding")
Expand Down Expand Up @@ -470,6 +474,7 @@ func generateCACertificate() ([]byte, []byte, error) {

return privateKeyPem, certPem, nil
}

func desiredWorkersNetworkPolicy(cluster *rayv1.RayCluster) *networkingv1ac.NetworkPolicyApplyConfiguration {
return networkingv1ac.NetworkPolicy(cluster.Name+"-workers", cluster.Namespace).
WithLabels(map[string]string{RayClusterNameLabel: cluster.Name}).
Expand All @@ -486,6 +491,7 @@ func desiredWorkersNetworkPolicy(cluster *rayv1.RayCluster) *networkingv1ac.Netw
metav1ac.OwnerReference().WithUID(cluster.UID).WithName(cluster.Name).WithKind(cluster.Kind).WithAPIVersion(cluster.APIVersion).WithController(true),
)
}

func desiredHeadNetworkPolicy(cluster *rayv1.RayCluster, cfg *config.KubeRayConfiguration, kubeRayNamespaces []string) *networkingv1ac.NetworkPolicyApplyConfiguration {
allSecuredPorts := []*networkingv1ac.NetworkPolicyPortApplyConfiguration{
networkingv1ac.NetworkPolicyPort().WithProtocol(corev1.ProtocolTCP).WithPort(intstr.FromInt(8443)),
Expand Down Expand Up @@ -544,6 +550,49 @@ func desiredHeadNetworkPolicy(cluster *rayv1.RayCluster, cfg *config.KubeRayConf
)
}

func (r *RayClusterReconciler) deleteHeadPodIfMissingImagePullSecrets(ctx context.Context, cluster *rayv1.RayCluster) error {
serviceAccount, err := r.kubeClient.CoreV1().ServiceAccounts(cluster.Namespace).Get(ctx, oauthServiceAccountNameFromCluster(cluster), metav1.GetOptions{})
if err != nil {
return fmt.Errorf("failed to get OAuth ServiceAccount: %w", err)
}

headPod, err := getHeadPod(ctx, r, cluster)
if err != nil {
return fmt.Errorf("failed to get head pod: %w", err)
}

if headPod == nil {
return nil
}

missingSecrets := map[string]bool{}
for _, secret := range serviceAccount.ImagePullSecrets {
missingSecrets[secret.Name] = true
}
for _, secret := range headPod.Spec.ImagePullSecrets {
delete(missingSecrets, secret.Name)
}
if len(missingSecrets) > 0 {
if err := r.kubeClient.CoreV1().Pods(headPod.Namespace).Delete(ctx, headPod.Name, metav1.DeleteOptions{}); err != nil {
return fmt.Errorf("failed to delete head pod: %w", err)
}
}
return nil
}

func getHeadPod(ctx context.Context, r *RayClusterReconciler, cluster *rayv1.RayCluster) (*corev1.Pod, error) {
podList, err := r.kubeClient.CoreV1().Pods(cluster.Namespace).List(ctx, metav1.ListOptions{
LabelSelector: fmt.Sprintf("ray.io/node-type=head,ray.io/cluster=%s", cluster.Name),
})
if err != nil {
return nil, err
}
if len(podList.Items) > 0 {
return &podList.Items[0], nil
}
return nil, nil
}

// SetupWithManager sets up the controller with the Manager.
func (r *RayClusterReconciler) SetupWithManager(mgr ctrl.Manager) error {
r.kubeClient = kubernetes.NewForConfigOrDie(mgr.GetConfig())
Expand Down Expand Up @@ -577,7 +626,8 @@ func (r *RayClusterReconciler) SetupWithManager(mgr ctrl.Manager) error {
NamespacedName: client.ObjectKey{
Name: name,
Namespace: namespace,
}}}
},
}}
}),
)
if r.IsOpenShift {
Expand Down
50 changes: 48 additions & 2 deletions pkg/controllers/raycluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import (

var _ = Describe("RayCluster controller", func() {
Context("RayCluster controller test", func() {
var rayClusterName = "test-raycluster"
rayClusterName := "test-raycluster"
var namespaceName string
BeforeEach(func(ctx SpecContext) {
By("Creating a namespace for running the tests.")
Expand Down Expand Up @@ -145,6 +145,53 @@ var _ = Describe("RayCluster controller", func() {
}).WithTimeout(time.Second * 10).Should(WithTransform(OwnerReferenceName, Equal(foundRayCluster.Name)))
})

It("should delete the head pod if missing image pull secrets", func(ctx SpecContext) {
foundRayCluster, err := rayClient.RayV1().RayClusters(namespaceName).Get(ctx, rayClusterName, metav1.GetOptions{})
Expect(err).To(Not(HaveOccurred()))

Eventually(func() (*corev1.ServiceAccount, error) {
return k8sClient.CoreV1().ServiceAccounts(namespaceName).Get(ctx, oauthServiceAccountNameFromCluster(foundRayCluster), metav1.GetOptions{})
}).WithTimeout(time.Second * 10).Should(WithTransform(OwnerReferenceKind, Equal("RayCluster")))

headPodName := "head-pod"
headPod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: headPodName,
Namespace: namespaceName,
Labels: map[string]string{
"ray.io/node-type": "head",
"ray.io/cluster": foundRayCluster.Name,
},
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "head-container",
Image: "busybox",
},
},
},
}
_, err = k8sClient.CoreV1().Pods(namespaceName).Create(ctx, headPod, metav1.CreateOptions{})
Expect(err).To(Not(HaveOccurred()))

Eventually(func() (*corev1.Pod, error) {
return k8sClient.CoreV1().Pods(namespaceName).Get(ctx, headPodName, metav1.GetOptions{})
}).WithTimeout(time.Second * 10).ShouldNot(BeNil())

sa, err := k8sClient.CoreV1().ServiceAccounts(namespaceName).Get(ctx, oauthServiceAccountNameFromCluster(foundRayCluster), metav1.GetOptions{})
Expect(err).To(Not(HaveOccurred()))

sa.ImagePullSecrets = append(sa.ImagePullSecrets, corev1.LocalObjectReference{Name: "test-image-pull-secret"})
_, err = k8sClient.CoreV1().ServiceAccounts(namespaceName).Update(ctx, sa, metav1.UpdateOptions{})
Expect(err).To(Not(HaveOccurred()))

Eventually(func() error {
_, err := k8sClient.CoreV1().Pods(namespaceName).Get(ctx, headPodName, metav1.GetOptions{})
return err
}).WithTimeout(time.Second * 10).Should(Satisfy(errors.IsNotFound))
})

It("should remove CRB when the RayCluster is deleted", func(ctx SpecContext) {
foundRayCluster, err := rayClient.RayV1().RayClusters(namespaceName).Get(ctx, rayClusterName, metav1.GetOptions{})
Expect(err).To(Not(HaveOccurred()))
Expand All @@ -157,7 +204,6 @@ var _ = Describe("RayCluster controller", func() {
return err
}).WithTimeout(time.Second * 10).Should(Satisfy(errors.IsNotFound))
})

})
})

Expand Down
23 changes: 15 additions & 8 deletions pkg/controllers/raycluster_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -309,16 +309,15 @@ func TestValidateCreate(t *testing.T) {
test.Expect(err).ShouldNot(HaveOccurred(), "Expected no errors on call to ValidateCreate function")
})

// Negative Test: Invalid RayCluster with EnableIngress set to true
invalidRayCluster := validRayCluster.DeepCopy()

t.Run("Negative: Expected errors on call to ValidateCreate function due to EnableIngress set to True", func(t *testing.T) {
invalidRayCluster := validRayCluster.DeepCopy()
invalidRayCluster.Spec.HeadGroupSpec.EnableIngress = support.Ptr(true)
_, err := rcWebhook.ValidateCreate(test.Ctx(), runtime.Object(invalidRayCluster))
test.Expect(err).Should(HaveOccurred(), "Expected errors on call to ValidateCreate function due to EnableIngress set to True")
})

t.Run("Negative: Expected errors on call to ValidateCreate function due to manipulated OAuth Proxy Container", func(t *testing.T) {
invalidRayCluster := validRayCluster.DeepCopy()
for i, headContainer := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Containers {
if headContainer.Name == oauthProxyContainerName {
invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Containers[i].Args = []string{"--invalid-arg"}
Expand All @@ -330,6 +329,7 @@ func TestValidateCreate(t *testing.T) {
})

t.Run("Negative: Expected errors on call to ValidateCreate function due to manipulated OAuth Proxy Volume", func(t *testing.T) {
invalidRayCluster := validRayCluster.DeepCopy()
for i, headVolume := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Volumes {
if headVolume.Name == oauthProxyVolumeName {
invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Volumes[i].Secret.SecretName = "invalid-secret-name"
Expand All @@ -341,6 +341,7 @@ func TestValidateCreate(t *testing.T) {
})

t.Run("Negative: Expected errors on call to ValidateCreate function due to manipulated head group service account name", func(t *testing.T) {
invalidRayCluster := validRayCluster.DeepCopy()
invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName = "invalid-service-account-name"
_, err = rcWebhook.ValidateCreate(test.Ctx(), runtime.Object(invalidRayCluster))
test.Expect(err).Should(HaveOccurred(), "Expected errors on call to ValidateCreate function due to manipulated head group service account name")
Expand Down Expand Up @@ -535,17 +536,15 @@ func TestValidateUpdate(t *testing.T) {
test.Expect(err).ShouldNot(HaveOccurred(), "Expected no errors on call to ValidateUpdate function")
})

// Negative Test Cases
trueBool := true
invalidRayCluster := validRayCluster.DeepCopy()

t.Run("Negative: Expected errors on call to ValidateUpdate function due to EnableIngress set to True", func(t *testing.T) {
invalidRayCluster.Spec.HeadGroupSpec.EnableIngress = &trueBool
invalidRayCluster := validRayCluster.DeepCopy()
invalidRayCluster.Spec.HeadGroupSpec.EnableIngress = support.Ptr(true)
_, err := rcWebhook.ValidateUpdate(test.Ctx(), runtime.Object(validRayCluster), runtime.Object(invalidRayCluster))
test.Expect(err).Should(HaveOccurred(), "Expected errors on call to ValidateUpdate function due to EnableIngress set to True")
})

t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated OAuth Proxy Container", func(t *testing.T) {
invalidRayCluster := validRayCluster.DeepCopy()
for i, headContainer := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Containers {
if headContainer.Name == oauthProxyContainerName {
invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Containers[i].Args = []string{"--invalid-arg"}
Expand All @@ -557,6 +556,7 @@ func TestValidateUpdate(t *testing.T) {
})

t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated OAuth Proxy Volume", func(t *testing.T) {
invalidRayCluster := validRayCluster.DeepCopy()
for i, headVolume := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Volumes {
if headVolume.Name == oauthProxyVolumeName {
invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Volumes[i].Secret.SecretName = "invalid-secret-name"
Expand All @@ -568,12 +568,14 @@ func TestValidateUpdate(t *testing.T) {
})

t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated head group service account name", func(t *testing.T) {
invalidRayCluster := validRayCluster.DeepCopy()
invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName = "invalid-service-account-name"
_, err := rcWebhook.ValidateUpdate(test.Ctx(), runtime.Object(validRayCluster), runtime.Object(invalidRayCluster))
test.Expect(err).Should(HaveOccurred(), "Expected errors on call to ValidateUpdate function due to manipulated head group service account name")
})

t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated Init Container in the head group", func(t *testing.T) {
invalidRayCluster := validRayCluster.DeepCopy()
for i, headInitContainer := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.InitContainers {
if headInitContainer.Name == "create-cert" {
invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.InitContainers[i].Command = []string{"manipulated command"}
Expand All @@ -585,6 +587,7 @@ func TestValidateUpdate(t *testing.T) {
})

t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated Init Container in the worker group", func(t *testing.T) {
invalidRayCluster := validRayCluster.DeepCopy()
for _, workerGroup := range invalidRayCluster.Spec.WorkerGroupSpecs {
for i, workerInitContainer := range workerGroup.Template.Spec.InitContainers {
if workerInitContainer.Name == "create-cert" {
Expand All @@ -598,6 +601,7 @@ func TestValidateUpdate(t *testing.T) {
})

t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated Volume in the head group", func(t *testing.T) {
invalidRayCluster := validRayCluster.DeepCopy()
for i, headVolume := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Volumes {
if headVolume.Name == "ca-vol" {
invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Volumes[i].Secret.SecretName = "invalid-secret-name"
Expand All @@ -609,6 +613,7 @@ func TestValidateUpdate(t *testing.T) {
})

t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated Volume in the worker group", func(t *testing.T) {
invalidRayCluster := validRayCluster.DeepCopy()
for _, workerGroup := range invalidRayCluster.Spec.WorkerGroupSpecs {
for i, workerVolume := range workerGroup.Template.Spec.Volumes {
if workerVolume.Name == "ca-vol" {
Expand All @@ -622,6 +627,7 @@ func TestValidateUpdate(t *testing.T) {
})

t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated env vars in the head group", func(t *testing.T) {
invalidRayCluster := validRayCluster.DeepCopy()
for i, headEnvVar := range invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Containers[0].Env {
if headEnvVar.Name == "RAY_USE_TLS" {
invalidRayCluster.Spec.HeadGroupSpec.Template.Spec.Containers[0].Env[i].Value = "invalid-value"
Expand All @@ -633,6 +639,7 @@ func TestValidateUpdate(t *testing.T) {
})

t.Run("Negative: Expected errors on call to ValidateUpdate function due to manipulated env vars in the worker group", func(t *testing.T) {
invalidRayCluster := validRayCluster.DeepCopy()
for _, workerGroup := range invalidRayCluster.Spec.WorkerGroupSpecs {
for i, workerEnvVar := range workerGroup.Template.Spec.Containers[0].Env {
if workerEnvVar.Name == "RAY_USE_TLS" {
Expand Down

0 comments on commit ef081f9

Please sign in to comment.