kubeflow · google-oss-prow · May 6, 2024 · May 1, 2024
diff --git a/README.md b/README.md
@@ -44,7 +44,7 @@ This repo periodically syncs all official Kubeflow components from their respect
 
 | Component | Local Manifests Path | Upstream Revision |
 | - | - | - |
-| Training Operator | apps/training-operator/upstream | [v1.7.0](https://github.com/kubeflow/training-operator/tree/v1.7.0/manifests) |
+| Training Operator | apps/training-operator/upstream | [v1.8.0-rc.0](https://github.com/kubeflow/training-operator/tree/v1.8.0-rc.0/manifests) |
 | Notebook Controller | apps/jupyter/notebook-controller/upstream | [v1.8.0](https://github.com/kubeflow/kubeflow/tree/v1.8.0/components/notebook-controller/config) |
 | PVC Viewer Controller | apps/pvcviewer-roller/upstream | [v1.8.0](https://github.com/kubeflow/kubeflow/tree/v1.8.0/components/pvcviewer-controller/config) |
 | Tensorboard Controller | apps/tensorboard/tensorboard-controller/upstream | [v1.8.0](https://github.com/kubeflow/kubeflow/tree/v1.8.0/components/tensorboard-controller/config) |

diff --git a/apps/training-operator/upstream/base/crds/kubeflow.org_mpijobs.yaml b/apps/training-operator/upstream/base/crds/kubeflow.org_mpijobs.yaml
diff --git a/apps/training-operator/upstream/base/crds/kubeflow.org_mxjobs.yaml b/apps/training-operator/upstream/base/crds/kubeflow.org_mxjobs.yaml
diff --git a/apps/training-operator/upstream/base/crds/kubeflow.org_paddlejobs.yaml b/apps/training-operator/upstream/base/crds/kubeflow.org_paddlejobs.yaml
diff --git a/apps/training-operator/upstream/base/crds/kubeflow.org_pytorchjobs.yaml b/apps/training-operator/upstream/base/crds/kubeflow.org_pytorchjobs.yaml
diff --git a/apps/training-operator/upstream/base/crds/kubeflow.org_tfjobs.yaml b/apps/training-operator/upstream/base/crds/kubeflow.org_tfjobs.yaml
diff --git a/apps/training-operator/upstream/base/crds/kubeflow.org_xgboostjobs.yaml b/apps/training-operator/upstream/base/crds/kubeflow.org_xgboostjobs.yaml
diff --git a/apps/training-operator/upstream/base/deployment.yaml b/apps/training-operator/upstream/base/deployment.yaml
@@ -23,6 +23,9 @@ spec:
           name: training-operator
           ports:
             - containerPort: 8080
+            - containerPort: 9443
+              name: webhook-server
+              protocol: TCP
           env:
             - name: MY_POD_NAMESPACE
               valueFrom:
@@ -34,6 +37,10 @@ spec:
                   fieldPath: metadata.name
           securityContext:
             allowPrivilegeEscalation: false
+          volumeMounts:
+            - mountPath: /tmp/k8s-webhook-server/serving-certs
+              name: cert
+              readOnly: true
           livenessProbe:
             httpGet:
               path: /healthz
@@ -50,3 +57,8 @@ spec:
             timeoutSeconds: 3
       serviceAccountName: training-operator
       terminationGracePeriodSeconds: 10
+      volumes:
+        - name: cert
+          secret:
+            defaultMode: 420
+            secretName: training-operator-webhook-cert
diff --git a/apps/training-operator/upstream/base/kustomization.yaml b/apps/training-operator/upstream/base/kustomization.yaml
@@ -5,5 +5,6 @@ resources:
   - ./rbac/cluster-role-binding.yaml
   - ./rbac/role.yaml
   - ./rbac/service-account.yaml
+  - ./webhook
   - service.yaml
   - deployment.yaml
diff --git a/apps/training-operator/upstream/base/rbac/role.yaml b/apps/training-operator/upstream/base/rbac/role.yaml
@@ -43,6 +43,15 @@ rules:
   - pods/exec
   verbs:
   - create
+- apiGroups:
+  - ""
+  resources:
+  - secrets
+  verbs:
+  - get
+  - list
+  - update
+  - watch
 - apiGroups:
   - ""
   resources:
@@ -62,6 +71,15 @@ rules:
   - get
   - list
   - watch
+- apiGroups:
+  - admissionregistration.k8s.io
+  resources:
+  - validatingwebhookconfigurations
+  verbs:
+  - get
+  - list
+  - update
+  - watch
 - apiGroups:
   - autoscaling
   resources:

diff --git a/apps/training-operator/upstream/base/service.yaml b/apps/training-operator/upstream/base/service.yaml
@@ -1,4 +1,3 @@
----
 apiVersion: v1
 kind: Service
 metadata:
@@ -11,9 +10,13 @@ metadata:
   name: training-operator
 spec:
   ports:
-  - name: monitoring-port
-    port: 8080
-    targetPort: 8080
+    - name: monitoring-port
+      port: 8080
+      targetPort: 8080
+    - name: webhook-server
+      port: 443
+      protocol: TCP
+      targetPort: 9443
   selector:
     control-plane: kubeflow-training-operator
   type: ClusterIP
diff --git a/apps/training-operator/upstream/base/webhook/kustomization.yaml b/apps/training-operator/upstream/base/webhook/kustomization.yaml
@@ -0,0 +1,15 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - manifests.yaml
+commonLabels:
+  control-plane: kubeflow-training-operator
+patches:
+  - path: patch.yaml
+    target:
+      group: admissionregistration.k8s.io
+      version: v1
+      kind: ValidatingWebhookConfiguration
+
+configurations:
+  - kustomizeconfig.yaml
diff --git a/apps/training-operator/upstream/base/webhook/kustomizeconfig.yaml b/apps/training-operator/upstream/base/webhook/kustomizeconfig.yaml
@@ -0,0 +1,10 @@
+# This config tells kustomize where to set the namespace and where to look when
+# substituting vars. It requires kustomize v2.1.0 or newer to work properly.
+namespace:
+  - kind: ValidatingWebhookConfiguration
+    group: admissionregistration.k8s.io
+    path: webhooks/clientConfig/service/namespace
+    create: true
+
+varReference:
+  - path: metadata/annotations
diff --git a/apps/training-operator/upstream/base/webhook/manifests.yaml b/apps/training-operator/upstream/base/webhook/manifests.yaml
@@ -0,0 +1,107 @@
+---
+apiVersion: admissionregistration.k8s.io/v1
+kind: ValidatingWebhookConfiguration
+metadata:
+  name: validating-webhook-configuration
+webhooks:
+- admissionReviewVersions:
+  - v1
+  clientConfig:
+    service:
+      name: webhook-service
+      namespace: system
+      path: /validate-kubeflow-org-v1-mxjob
+  failurePolicy: Fail
+  name: validator.mxjob.training-operator.kubeflow.org
+  rules:
+  - apiGroups:
+    - kubeflow.org
+    apiVersions:
+    - v1
+    operations:
+    - CREATE
+    - UPDATE
+    - DELETE
+    resources:
+    - mxjobs
+  sideEffects: None
+- admissionReviewVersions:
+  - v1
+  clientConfig:
+    service:
+      name: webhook-service
+      namespace: system
+      path: /validate-kubeflow-org-v1-paddlejob
+  failurePolicy: Fail
+  name: validator.paddlejob.training-operator.kubeflow.org
+  rules:
+  - apiGroups:
+    - kubeflow.org
+    apiVersions:
+    - v1
+    operations:
+    - CREATE
+    - UPDATE
+    resources:
+    - paddlejobs
+  sideEffects: None
+- admissionReviewVersions:
+  - v1
+  clientConfig:
+    service:
+      name: webhook-service
+      namespace: system
+      path: /validate-kubeflow-org-v1-pytorchjob
+  failurePolicy: Fail
+  name: validator.pytorchjob.training-operator.kubeflow.org
+  rules:
+  - apiGroups:
+    - kubeflow.org
+    apiVersions:
+    - v1
+    operations:
+    - CREATE
+    - UPDATE
+    resources:
+    - pytorchjobs
+  sideEffects: None
+- admissionReviewVersions:
+  - v1
+  clientConfig:
+    service:
+      name: webhook-service
+      namespace: system
+      path: /validate-kubeflow-org-v1-tfjob
+  failurePolicy: Fail
+  name: validator.tfjob.training-operator.kubeflow.org
+  rules:
+  - apiGroups:
+    - kubeflow.org
+    apiVersions:
+    - v1
+    operations:
+    - CREATE
+    - UPDATE
+    resources:
+    - tfjobs
+  sideEffects: None
+- admissionReviewVersions:
+  - v1
+  clientConfig:
+    service:
+      name: webhook-service
+      namespace: system
+      path: /validate-kubeflow-org-v1-xgboostjob
+  failurePolicy: Fail
+  name: validator.xgboostjob.training-operator.kubeflow.org
+  rules:
+  - apiGroups:
+    - kubeflow.org
+    apiVersions:
+    - v1
+    operations:
+    - CREATE
+    - UPDATE
+    resources:
+    - xgboostjobs
+  sideEffects: None
diff --git a/apps/training-operator/upstream/base/webhook/patch.yaml b/apps/training-operator/upstream/base/webhook/patch.yaml
@@ -0,0 +1,18 @@
+- op: replace
+  path: /webhooks/0/clientConfig/service/name
+  value: training-operator
+- op: replace
+  path: /webhooks/1/clientConfig/service/name
+  value: training-operator
+- op: replace
+  path: /webhooks/2/clientConfig/service/name
+  value: training-operator
+- op: replace
+  path: /webhooks/3/clientConfig/service/name
+  value: training-operator
+- op: replace
+  path: /webhooks/4/clientConfig/service/name
+  value: training-operator
+- op: replace
+  path: /metadata/name
+  value: validator.training-operator.kubeflow.org
diff --git a/apps/training-operator/upstream/overlays/kubeflow/kubeflow-training-roles.yaml b/apps/training-operator/upstream/overlays/kubeflow/kubeflow-training-roles.yaml
@@ -47,6 +47,24 @@ rules:
       - paddlejobs/status
     verbs:
       - get
+  - apiGroups:
+      - ""
+    resources:
+      - persistentvolumeclaims
+    verbs:
+      - create
+      - delete
+      - get
+      - list
+      - watch
+  - apiGroups:
+      - ""
+    resources:
+      - events
+    verbs:
+      - get
+      - list
+      - watch
 
 ---
 apiVersion: rbac.authorization.k8s.io/v1

diff --git a/apps/training-operator/upstream/overlays/kubeflow/kustomization.yaml b/apps/training-operator/upstream/overlays/kubeflow/kustomization.yaml
@@ -6,4 +6,10 @@ resources:
   - kubeflow-training-roles.yaml
 images:
   - name: kubeflow/training-operator
-    newTag: "v1-855e096"
+    newTag: "v1-f8f7363"
+# TODO (tenzen-y): Once we support cert-manager, we need to remove this secret generation.
+# REF: https://github.com/kubeflow/training-operator/issues/2049
+secretGenerator:
+  - name: training-operator-webhook-cert
+    options:
+      disableNameSuffixHash: true
diff --git a/apps/training-operator/upstream/overlays/standalone/kustomization.yaml b/apps/training-operator/upstream/overlays/standalone/kustomization.yaml
@@ -6,4 +6,8 @@ resources:
   - namespace.yaml
 images:
   - name: kubeflow/training-operator
-    newTag: "v1-855e096"
+    newTag: "v1-f8f7363"
+secretGenerator:
+  - name: training-operator-webhook-cert
+    options:
+      disableNameSuffixHash: true
diff --git a/hack/sync-training-operator-manifests.sh b/hack/sync-training-operator-manifests.sh
@@ -9,11 +9,13 @@
 #
 # Afterwards the developers can submit the PR to the kubeflow/manifests
 # repo, based on that local branch
+# This script must be executed from the directory that contains it.
 
 # strict mode http://redsymbol.net/articles/unofficial-bash-strict-mode/
-set -euo pipefail
+set -euxo pipefail
 IFS=$'\n\t'
 
+COMMIT="v1.8.0-rc.0" # You can use tags as well
 SRC_DIR=${SRC_DIR:=/tmp/kubeflow-training-operator}
 BRANCH=${BRANCH:=sync-kubeflow-training-operator-manifests-${COMMIT?}}
 
@@ -22,34 +24,46 @@ MANIFESTS_DIR=$(dirname $SCRIPT_DIR)
 
 echo "Creating branch: ${BRANCH}"
 
-# DEV: Comment out this if you are testing locally
 if [ -n "$(git status --porcelain)" ]; then
-  # Uncommitted changes
-  echo "WARNING: You have uncommitted changes, exiting..."
-  exit 1
+  echo "WARNING: You have uncommitted changes"
 fi
 
 if [ `git branch --list $BRANCH` ]
 then
-   echo "WARNING: Branch $BRANCH already exists. Exiting..."
-   exit 1
+   echo "WARNING: Branch $BRANCH already exists."
 fi
 
-# DEV: Comment out this checkout command if you are testing locally
-git checkout -b $BRANCH
+# Create the branch in the manifests repository
+if ! git show-ref --verify --quiet refs/heads/$BRANCH; then
+  git checkout -b $BRANCH
+else
+    echo "Branch $BRANCH already exists."
+fi
 
 echo "Checking out in $SRC_DIR to $COMMIT..."
+
+# Clone the Training Operator repository if needed, then check out $COMMIT
+mkdir -p $SRC_DIR
 cd $SRC_DIR
+if [ ! -d "training-operator/.git" ]; then
+    git clone https://github.com/kubeflow/training-operator.git
+fi
+cd $SRC_DIR/training-operator
+if ! git rev-parse --verify --quiet $COMMIT; then
+    git checkout -b $COMMIT
+else
+    git checkout $COMMIT
+fi
+
 if [ -n "$(git status --porcelain)" ]; then
-  # Uncommitted changes
-  echo "WARNING: You have uncommitted changes, exiting..."
-  exit 1
+  echo "WARNING: You have uncommitted changes"
 fi
-git checkout $COMMIT
 
 echo "Copying training-operator manifests..."
 DST_DIR=$MANIFESTS_DIR/apps/training-operator/upstream
-rm -r $DST_DIR
+if [ -d "$DST_DIR" ]; then
+    rm -r "$DST_DIR"
+fi
 cp $SRC_DIR/manifests $DST_DIR -r