Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement webhook validations for the PyTorchJob #2035

Merged
merged 1 commit into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ help: ## Display this help.
##@ Development

manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
$(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=training-operator webhook paths="./pkg/..." output:crd:artifacts:config=manifests/base/crds output:rbac:artifacts:config=manifests/base/rbac
$(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=training-operator webhook paths="./pkg/..." \
output:crd:artifacts:config=manifests/base/crds \
output:rbac:artifacts:config=manifests/base/rbac \
output:webhook:artifacts:config=manifests/base/webhook

generate: controller-gen ## Generate apidoc, sdk and code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
$(CONTROLLER_GEN) object:headerFile="hack/boilerplate/boilerplate.go.txt" paths="./pkg/apis/..."
Expand Down
91 changes: 72 additions & 19 deletions cmd/training-operator.v1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package main
import (
"errors"
"flag"
"net/http"
"os"
"strings"

Expand All @@ -40,9 +41,11 @@ import (
volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned"

kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
"github.com/kubeflow/training-operator/pkg/cert"
"github.com/kubeflow/training-operator/pkg/config"
controllerv1 "github.com/kubeflow/training-operator/pkg/controller.v1"
"github.com/kubeflow/training-operator/pkg/controller.v1/common"
"github.com/kubeflow/training-operator/pkg/webhooks"
//+kubebuilder:scaffold:imports
)

Expand Down Expand Up @@ -72,8 +75,11 @@ func main() {
var enabledSchemes controllerv1.EnabledSchemes
var gangSchedulerName string
var namespace string
var webhookServerPort int
var controllerThreads int
var webhookServerPort int
var webhookServiceName string
var webhookSecretName string

flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
Expand All @@ -86,7 +92,6 @@ func main() {
" Note: If you set another scheduler name, the training-operator assumes it's the scheduler-plugins.")
flag.StringVar(&namespace, "namespace", os.Getenv(EnvKubeflowNamespace), "The namespace to monitor kubeflow jobs. If unset, it monitors all namespaces cluster-wide."+
"If set, it only monitors kubeflow jobs in the given namespace.")
flag.IntVar(&webhookServerPort, "webhook-server-port", 9443, "Endpoint port for the webhook server.")
flag.IntVar(&controllerThreads, "controller-threads", 1, "Number of worker threads used by the controller.")

// PyTorch related flags
Expand All @@ -101,6 +106,11 @@ func main() {
flag.StringVar(&config.Config.MPIKubectlDeliveryImage, "mpi-kubectl-delivery-image",
config.MPIKubectlDeliveryImageDefault, "The image for mpi launcher init container")

// Cert generation flags
flag.IntVar(&webhookServerPort, "webhook-server-port", 9443, "Endpoint port for the webhook server.")
flag.StringVar(&webhookServiceName, "webhook-service-name", "training-operator", "Name of the Service used as part of the DNSName")
flag.StringVar(&webhookSecretName, "webhook-secret-name", "training-operator-webhook-cert", "Name of the Secret to store CA and server certs")

opts := zap.Options{
Development: true,
StacktraceLevel: zapcore.DPanicLevel,
Expand All @@ -124,9 +134,9 @@ func main() {
Metrics: metricsserver.Options{
BindAddress: metricsAddr,
},
WebhookServer: &webhook.DefaultServer{Options: webhook.Options{
WebhookServer: webhook.NewServer(webhook.Options{
Port: webhookServerPort,
}},
}),
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionID,
Expand All @@ -137,30 +147,36 @@ func main() {
os.Exit(1)
}

// Set up controllers using goroutines to start the manager quickly.
go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads)

//+kubebuilder:scaffold:builder

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
os.Exit(1)
certsReady := make(chan struct{})
defer close(certsReady)
certGenerationConfig := cert.Config{
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
}
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up ready check")
if err = cert.ManageCerts(mgr, certGenerationConfig, certsReady); err != nil {
setupLog.Error(err, "Unable to set up cert rotation")
os.Exit(1)
}

setupProbeEndpoints(mgr, certsReady)
// Set up controllers using goroutines to start the manager quickly.
go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads, certsReady)

//+kubebuilder:scaffold:builder

setupLog.Info("starting manager")
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
setupLog.Error(err, "problem running manager")
os.Exit(1)
}
}

func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int) {
setupLog.Info("registering controllers...")
func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int, certsReady <-chan struct{}) {
setupLog.Info("Waiting for certificate generation to complete")
<-certsReady
setupLog.Info("Certs ready")
Comment on lines +175 to +177
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you need to check if certs are ready in setupControllers func ?
Maybe we can check it before we run the func and not pass certsReady to the setupControllers function:

// Set up controllers using goroutines to start the manager quickly.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need this checking since the setupController is started with the go routine.
By doing this, we can improve the start-up time and avoid readiness probe errors (#1841) since the manager can start unless not waiting for the completion of setting up controllers.


setupLog.Info("registering controllers...")
// Prepare GangSchedulingSetupFunc
gangSchedulingSetupFunc := common.GenNonGangSchedulerSetupFunc()
if strings.EqualFold(gangSchedulerName, string(common.GangSchedulerVolcano)) {
Expand All @@ -182,15 +198,52 @@ func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchem
}
errMsg := "failed to set up controllers"
for _, s := range enabledSchemes {
setupFunc, supported := controllerv1.SupportedSchemeReconciler[s]
if !supported {
setupReconcilerFunc, supportedReconciler := controllerv1.SupportedSchemeReconciler[s]
if !supportedReconciler {
setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s)
os.Exit(1)
}
if err := setupFunc(mgr, gangSchedulingSetupFunc, controllerThreads); err != nil {
if err := setupReconcilerFunc(mgr, gangSchedulingSetupFunc, controllerThreads); err != nil {
setupLog.Error(errors.New(errMsg), "unable to create controller", "scheme", s)
os.Exit(1)
}
setupWebhookFunc, supportedWebhook := webhooks.SupportedSchemeWebhook[s]
if !supportedWebhook {
setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s)
os.Exit(1)
}
if err := setupWebhookFunc(mgr); err != nil {
setupLog.Error(errors.New(errMsg), "unable to start webhook server", "scheme", s)
os.Exit(1)
}
}
}

func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}) {
defer setupLog.Info("Probe endpoints are configured on healthz and readyz")

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
os.Exit(1)
}

// Wait for the webhook server to be listening before advertising the
// training-operator replica as ready. This allows users to wait with sending the first
// requests, requiring webhooks, until the training-operator deployment is available, so
// that the early requests are not rejected during the traininig-operator's startup.
// We wrap the call to GetWebhookServer in a closure to delay calling
// the function, otherwise a not fully-initialized webhook server (without
// ready certs) fails the start of the manager.
if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error {
select {
case <-certsReady:
return mgr.GetWebhookServer().StartedChecker()(req)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain about this closure? how is StartedChecker() working with certsReady logic? StartedChecked checks if webhook server is up while certsReady is called here https://github.com/open-policy-agent/cert-controller/blob/master/pkg/rotator/rotator.go#L888

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@johnugeorge If we don't have this extended readiness probe, the controller-runtime readiness server responds "Ready" even if the training operator is still preparing due to certificate generation.
This extended check allows us to check if the training-operator is ready correctly.

Also, you can find the details here: https://github.com/kubernetes-sigs/kueue/pull/1676/files#r1472752662

default:
return errors.New("certificates are not ready")
}
}); err != nil {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
}
}

Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/google/go-cmp v0.6.0
github.com/onsi/ginkgo/v2 v2.14.0
github.com/onsi/gomega v1.30.0
github.com/open-policy-agent/cert-controller v0.10.1
github.com/prometheus/client_golang v1.18.0
github.com/sirupsen/logrus v1.9.0
github.com/stretchr/testify v1.8.4
Expand Down Expand Up @@ -58,6 +59,7 @@ require (
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
go.uber.org/atomic v1.11.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e // indirect
golang.org/x/mod v0.14.0 // indirect
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ github.com/onsi/ginkgo/v2 v2.14.0 h1:vSmGj2Z5YPb9JwCWT6z6ihcUvDhuXLc3sJiqd3jMKAY
github.com/onsi/ginkgo/v2 v2.14.0/go.mod h1:JkUdW7JkN0V6rFvsHcJ478egV3XH9NxpD27Hal/PhZw=
github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8=
github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ=
github.com/open-policy-agent/cert-controller v0.10.1 h1:RXSYoyn8FdCenWecRP//UV5nbVfmstNpj4kHQFkvPK4=
github.com/open-policy-agent/cert-controller v0.10.1/go.mod h1:4uRbBLY5DsPOog+a9pqk3JLxuuhrWsbUedQW65HcLTI=
github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe1e4c4 h1:5dum5SLEz+95JDLkMls7Z7IDPjvSq3UhJSFe4f5einQ=
github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe1e4c4/go.mod h1:54/KzLMvA5ndBVpm7B1OjLeV0cUtTLTz2bZ2OtydLpU=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
Expand Down Expand Up @@ -115,6 +119,8 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
Expand Down Expand Up @@ -205,6 +211,8 @@ k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAE
k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y=
k8s.io/klog/v2 v2.110.1 h1:U/Af64HJf7FcwMcXyKm2RPM22WZzyR7OSpYj5tg3cL0=
k8s.io/klog/v2 v2.110.1/go.mod h1:YGtd1984u+GgbuZ7e08/yBuAfKLSO0+uR1Fhi6ExXjo=
k8s.io/kube-aggregator v0.28.1 h1:rvG4llYnQKHjj6YjjoBPEJxfD1uH0DJwkrJTNKGAaCs=
k8s.io/kube-aggregator v0.28.1/go.mod h1:JaLizMe+AECSpO2OmrWVsvnG0V3dX1RpW+Wq/QHbu18=
k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/AuzbMm96cd3YHRTU83I780=
k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00/go.mod h1:AsvuZPBlUDVuCdzJ87iajxtXuR9oktsTctW/R9wwouA=
k8s.io/utils v0.0.0-20230726121419-3b25d923346b h1:sgn3ZU783SCgtaSJjpcVVlRqd6GSnlTLKgpAAttJvpI=
Expand Down
12 changes: 12 additions & 0 deletions manifests/base/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ spec:
name: training-operator
ports:
- containerPort: 8080
- containerPort: 9443
name: webhook-server
protocol: TCP
env:
- name: MY_POD_NAMESPACE
valueFrom:
Expand All @@ -34,6 +37,10 @@ spec:
fieldPath: metadata.name
securityContext:
allowPrivilegeEscalation: false
volumeMounts:
- mountPath: /tmp/k8s-webhook-server/serving-certs
name: cert
readOnly: true
livenessProbe:
httpGet:
path: /healthz
Expand All @@ -50,3 +57,8 @@ spec:
timeoutSeconds: 3
serviceAccountName: training-operator
terminationGracePeriodSeconds: 10
volumes:
- name: cert
secret:
defaultMode: 420
secretName: training-operator-webhook-cert
1 change: 1 addition & 0 deletions manifests/base/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ resources:
- ./rbac/cluster-role-binding.yaml
- ./rbac/role.yaml
- ./rbac/service-account.yaml
- ./webhook
- service.yaml
- deployment.yaml
18 changes: 18 additions & 0 deletions manifests/base/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@ rules:
- pods/exec
verbs:
- create
- apiGroups:
- ""
resources:
- secrets
verbs:
- get
- list
- update
- watch
- apiGroups:
- ""
resources:
Expand All @@ -62,6 +71,15 @@ rules:
- get
- list
- watch
- apiGroups:
- admissionregistration.k8s.io
resources:
- validatingwebhookconfigurations
verbs:
- get
- list
- update
- watch
- apiGroups:
- autoscaling
resources:
Expand Down
11 changes: 7 additions & 4 deletions manifests/base/service.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
---
apiVersion: v1
kind: Service
metadata:
Expand All @@ -11,9 +10,13 @@ metadata:
name: training-operator
spec:
ports:
- name: monitoring-port
port: 8080
targetPort: 8080
- name: monitoring-port
port: 8080
targetPort: 8080
- name: webhook-server
port: 443
protocol: TCP
targetPort: 9443
selector:
control-plane: kubeflow-training-operator
type: ClusterIP
15 changes: 15 additions & 0 deletions manifests/base/webhook/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- manifests.yaml
commonLabels:
control-plane: kubeflow-training-operator
andreyvelich marked this conversation as resolved.
Show resolved Hide resolved
patches:
- path: patch.yaml
target:
group: admissionregistration.k8s.io
version: v1
kind: ValidatingWebhookConfiguration

configurations:
- kustomizeconfig.yaml
10 changes: 10 additions & 0 deletions manifests/base/webhook/kustomizeconfig.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# the following config is for teaching kustomize where to look at when substituting vars.
# It requires kustomize v2.1.0 or newer to work properly.
namespace:
- kind: ValidatingWebhookConfiguration
group: admissionregistration.k8s.io
path: webhooks/clientConfig/service/namespace
create: true

varReference:
- path: metadata/annotations
26 changes: 26 additions & 0 deletions manifests/base/webhook/manifests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we name this file as webhooks.yaml, similar to Katib: https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/components/webhook/webhooks.yaml and other Training Operator deployment manifest name: https://github.com/kubeflow/training-operator/blob/7cc1365c5aac01beef660709a5139321a5e62675/manifests/base/deployment.yaml.

Or kubebuilder sets this name by default ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or kubebuilder sets this name by default ?

Yea, this name is default value 😓

apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
name: validating-webhook-configuration
andreyvelich marked this conversation as resolved.
Show resolved Hide resolved
webhooks:
- admissionReviewVersions:
- v1
clientConfig:
service:
name: webhook-service
namespace: system
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make it kubeflow as default.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have no way to modify this namespace since this manifest is automatically generated by the controller-gen. But, this namespace is replaced by https://github.com/kubeflow/training-operator/blob/a618d1c3f77f769da7d4f535e25db88d64aadb88/manifests/base/webhook/kustomizeconfig.yaml.

path: /validate-kubeflow-org-v1-pytorchjob
Copy link
Member

@andreyvelich andreyvelich Apr 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need the full path? Maybe just: /validate-pytorchjob ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to follow this form based on the https://book.kubebuilder.io/reference/markers/webhook.
Also, since this path is global, we need to make an effort to avoid conflicts.

failurePolicy: Fail
andreyvelich marked this conversation as resolved.
Show resolved Hide resolved
name: validator.pytorchjob.training-operator.kubeflow.org
rules:
- apiGroups:
- kubeflow.org
apiVersions:
- v1
operations:
- CREATE
- UPDATE
resources:
- pytorchjobs
sideEffects: None
6 changes: 6 additions & 0 deletions manifests/base/webhook/patch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
- op: replace
path: /webhooks/0/clientConfig/service/name
value: training-operator
- op: replace
path: /metadata/name
value: validator.training-operator.kubeflow.org
6 changes: 6 additions & 0 deletions manifests/overlays/kubeflow/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@ resources:
images:
- name: kubeflow/training-operator
newTag: "v1-855e096"
# TODO (tenzen-y): Once we support cert-manager, we need to remove this secret generation.
# REF: https://github.com/kubeflow/training-operator/issues/2049
secretGenerator:
- name: training-operator-webhook-cert
options:
disableNameSuffixHash: true
4 changes: 4 additions & 0 deletions manifests/overlays/standalone/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ resources:
images:
- name: kubeflow/training-operator
newTag: "v1-855e096"
secretGenerator:
- name: training-operator-webhook-cert
options:
disableNameSuffixHash: true
Loading
Loading