Skip to content

Commit

Permalink
Implement webhook validations for the PyTorchJob
Browse files Browse the repository at this point in the history
Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
  • Loading branch information
tenzen-y committed Mar 26, 2024
1 parent 2eff94e commit 56c3669
Show file tree
Hide file tree
Showing 21 changed files with 633 additions and 332 deletions.
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ help: ## Display this help.
##@ Development

manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
$(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=training-operator webhook paths="./pkg/..." output:crd:artifacts:config=manifests/base/crds output:rbac:artifacts:config=manifests/base/rbac
$(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=training-operator webhook paths="./pkg/..." \
output:crd:artifacts:config=manifests/base/crds \
output:rbac:artifacts:config=manifests/base/rbac \
output:webhook:artifacts:config=manifests/base/webhook

generate: controller-gen ## Generate apidoc, sdk and code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
$(CONTROLLER_GEN) object:headerFile="hack/boilerplate/boilerplate.go.txt" paths="./pkg/apis/..."
Expand Down
33 changes: 28 additions & 5 deletions cmd/training-operator.v1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned"

kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
"github.com/kubeflow/training-operator/pkg/cert"
"github.com/kubeflow/training-operator/pkg/config"
controllerv1 "github.com/kubeflow/training-operator/pkg/controller.v1"
"github.com/kubeflow/training-operator/pkg/controller.v1/common"
Expand Down Expand Up @@ -72,8 +73,11 @@ func main() {
var enabledSchemes controllerv1.EnabledSchemes
var gangSchedulerName string
var namespace string
var webhookServerPort int
var controllerThreads int
var webhookServerPort int
var webhookServiceName string
var webhookSecretName string

flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
Expand All @@ -86,7 +90,6 @@ func main() {
" Note: If you set another scheduler name, the training-operator assumes it's the scheduler-plugins.")
flag.StringVar(&namespace, "namespace", os.Getenv(EnvKubeflowNamespace), "The namespace to monitor kubeflow jobs. If unset, it monitors all namespaces cluster-wide."+
"If set, it only monitors kubeflow jobs in the given namespace.")
flag.IntVar(&webhookServerPort, "webhook-server-port", 9443, "Endpoint port for the webhook server.")
flag.IntVar(&controllerThreads, "controller-threads", 1, "Number of worker threads used by the controller.")

// PyTorch related flags
Expand All @@ -101,6 +104,11 @@ func main() {
flag.StringVar(&config.Config.MPIKubectlDeliveryImage, "mpi-kubectl-delivery-image",
config.MPIKubectlDeliveryImageDefault, "The image for mpi launcher init container")

// Cert generation flags
flag.IntVar(&webhookServerPort, "webhook-server-port", 9443, "Endpoint port for the webhook server.")
flag.StringVar(&webhookServiceName, "webhook-service-name", "training-operator-webhook-service", "Name of the Service used as part of the DNSName")
flag.StringVar(&webhookSecretName, "webhook-secret-name", "training-operator-webhook-server-secret", "Name of the Secret to store CA and server certs")

opts := zap.Options{
Development: true,
StacktraceLevel: zapcore.DPanicLevel,
Expand Down Expand Up @@ -137,8 +145,20 @@ func main() {
os.Exit(1)
}

certsReady := make(chan struct{})
defer close(certsReady)
certGenerationConfig := cert.Config{
Namespace: namespace,
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
}
if err = cert.ManageCerts(mgr, certGenerationConfig, certsReady); err != nil {
setupLog.Error(err, "Unable to set up cert rotation")
os.Exit(1)
}

// Set up controllers using goroutines to start the manager quickly.
go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads)
go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads, certsReady)

//+kubebuilder:scaffold:builder

Expand All @@ -158,9 +178,12 @@ func main() {
}
}

func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int) {
setupLog.Info("registering controllers...")
func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int, certsReady <-chan struct{}) {
setupLog.Info("Waiting for certificate generation to complete")
<-certsReady
setupLog.Info("Certs ready")

setupLog.Info("registering controllers...")
// Prepare GangSchedulingSetupFunc
gangSchedulingSetupFunc := common.GenNonGangSchedulerSetupFunc()
if strings.EqualFold(gangSchedulerName, string(common.GangSchedulerVolcano)) {
Expand Down
6 changes: 4 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@ require (
github.com/google/go-cmp v0.5.9
github.com/onsi/ginkgo/v2 v2.11.0
github.com/onsi/gomega v1.27.10
github.com/open-policy-agent/cert-controller v0.10.1
github.com/prometheus/client_golang v1.16.0
github.com/sirupsen/logrus v1.9.0
github.com/stretchr/testify v1.8.2
go.uber.org/zap v1.25.0
golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e
k8s.io/api v0.28.2
k8s.io/apimachinery v0.28.2
k8s.io/client-go v0.28.2
Expand Down Expand Up @@ -58,8 +60,8 @@ require (
github.com/prometheus/common v0.44.0 // indirect
github.com/prometheus/procfs v0.10.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
go.uber.org/atomic v1.11.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e // indirect
golang.org/x/mod v0.10.0 // indirect
golang.org/x/net v0.17.0 // indirect
golang.org/x/oauth2 v0.8.0 // indirect
Expand All @@ -74,7 +76,7 @@ require (
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/apiextensions-apiserver v0.28.0 // indirect
k8s.io/apiextensions-apiserver v0.28.1 // indirect
k8s.io/component-base v0.28.1 // indirect
k8s.io/gengo v0.0.0-20220902162205-c0856e24416d // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
Expand Down
10 changes: 8 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ github.com/onsi/ginkgo/v2 v2.11.0 h1:WgqUCUt/lT6yXoQ8Wef0fsNn5cAuMK7+KT9UFRz2tcU
github.com/onsi/ginkgo/v2 v2.11.0/go.mod h1:ZhrRA5XmEE3x3rhlzamx/JJvujdZoJ2uvgI7kR0iZvM=
github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI=
github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M=
github.com/open-policy-agent/cert-controller v0.10.1 h1:RXSYoyn8FdCenWecRP//UV5nbVfmstNpj4kHQFkvPK4=
github.com/open-policy-agent/cert-controller v0.10.1/go.mod h1:4uRbBLY5DsPOog+a9pqk3JLxuuhrWsbUedQW65HcLTI=
github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe1e4c4 h1:5dum5SLEz+95JDLkMls7Z7IDPjvSq3UhJSFe4f5einQ=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
Expand Down Expand Up @@ -122,6 +125,8 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
go.uber.org/goleak v1.1.11/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ=
go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
Expand Down Expand Up @@ -215,8 +220,8 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
k8s.io/api v0.28.2 h1:9mpl5mOb6vXZvqbQmankOfPIGiudghwCoLl1EYfUZbw=
k8s.io/api v0.28.2/go.mod h1:RVnJBsjU8tcMq7C3iaRSGMeaKt2TWEUXcpIt/90fjEg=
k8s.io/apiextensions-apiserver v0.28.0 h1:CszgmBL8CizEnj4sj7/PtLGey6Na3YgWyGCPONv7E9E=
k8s.io/apiextensions-apiserver v0.28.0/go.mod h1:uRdYiwIuu0SyqJKriKmqEN2jThIJPhVmOWETm8ud1VE=
k8s.io/apiextensions-apiserver v0.28.1 h1:l2ThkBRjrWpw4f24uq0Da2HaEgqJZ7pcgiEUTKSmQZw=
k8s.io/apiextensions-apiserver v0.28.1/go.mod h1:sVvrI+P4vxh2YBBcm8n2ThjNyzU4BQGilCQ/JAY5kGs=
k8s.io/apimachinery v0.28.2 h1:KCOJLrc6gu+wV1BYgwik4AF4vXOlVJPdiqn0yAWWwXQ=
k8s.io/apimachinery v0.28.2/go.mod h1:RdzF87y/ngqk9H4z3EL2Rppv5jj95vGS/HaFXrLDApU=
k8s.io/client-go v0.28.2 h1:DNoYI1vGq0slMBN/SWKMZMw0Rq+0EQW6/AK4v9+3VeY=
Expand All @@ -230,6 +235,7 @@ k8s.io/gengo v0.0.0-20220902162205-c0856e24416d/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAE
k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y=
k8s.io/klog/v2 v2.100.1 h1:7WCHKK6K8fNhTqfBhISHQ97KrnJNFZMcQvKp7gP/tmg=
k8s.io/klog/v2 v2.100.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0=
k8s.io/kube-aggregator v0.28.1 h1:rvG4llYnQKHjj6YjjoBPEJxfD1uH0DJwkrJTNKGAaCs=
k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9 h1:LyMgNKD2P8Wn1iAwQU5OhxCKlKJy0sHc+PcDwFB24dQ=
k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9/go.mod h1:wZK2AVp1uHCp4VamDVgBP2COHZjqD1T68Rf0CM3YjSM=
k8s.io/utils v0.0.0-20230406110748-d93618cff8a2 h1:qY1Ad8PODbnymg2pRbkyMT/ylpTrCM8P2RJ0yroCyIk=
Expand Down
2 changes: 1 addition & 1 deletion hack/swagger/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ require (
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/api v0.28.2 // indirect
k8s.io/apimachinery v0.28.2 // indirect
k8s.io/utils v0.0.0-20230406110748-d93618cff8a2 // indirect
k8s.io/utils v0.0.0-20240310230437-4693a0247e57 // indirect
sigs.k8s.io/controller-runtime v0.16.2 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
Expand Down
1 change: 1 addition & 0 deletions hack/swagger/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9 h1:LyMgNKD2P8Wn1iAwQU5Ohx
k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9/go.mod h1:wZK2AVp1uHCp4VamDVgBP2COHZjqD1T68Rf0CM3YjSM=
k8s.io/utils v0.0.0-20230406110748-d93618cff8a2 h1:qY1Ad8PODbnymg2pRbkyMT/ylpTrCM8P2RJ0yroCyIk=
k8s.io/utils v0.0.0-20230406110748-d93618cff8a2/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
k8s.io/utils v0.0.0-20240310230437-4693a0247e57/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/controller-runtime v0.16.2 h1:mwXAVuEk3EQf478PQwQ48zGOXvW27UJc8NHktQVuIPU=
sigs.k8s.io/controller-runtime v0.16.2/go.mod h1:vpMu3LpI5sYWtujJOa2uPK61nB5rbwlN7BAB8aSLvGU=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
Expand Down
2 changes: 2 additions & 0 deletions manifests/base/internalcert/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
resources:
- secret.yaml
5 changes: 5 additions & 0 deletions manifests/base/internalcert/secret.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
apiVersion: v1
kind: Secret
metadata:
name: webhook-server-cert
namespace: system
2 changes: 2 additions & 0 deletions manifests/base/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,7 @@ resources:
- ./rbac/cluster-role-binding.yaml
- ./rbac/role.yaml
- ./rbac/service-account.yaml
- ./internalcert
- ./webhook
- service.yaml
- deployment.yaml
18 changes: 18 additions & 0 deletions manifests/base/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@ rules:
- pods/exec
verbs:
- create
- apiGroups:
- ""
resources:
- secrets
verbs:
- get
- list
- update
- watch
- apiGroups:
- ""
resources:
Expand All @@ -62,6 +71,15 @@ rules:
- get
- list
- watch
- apiGroups:
- admissionregistration.k8s.io
resources:
- validatingwebhookconfigurations
verbs:
- get
- list
- update
- watch
- apiGroups:
- autoscaling
resources:
Expand Down
8 changes: 8 additions & 0 deletions manifests/base/webhook/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- manifests.yaml
- service.yaml

configurations:
- kustomizeconfig.yaml
21 changes: 21 additions & 0 deletions manifests/base/webhook/kustomizeconfig.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# the following config is for teaching kustomize where to look at when substituting vars.
# It requires kustomize v2.1.0 or newer to work properly.
nameReference:
- kind: Service
version: v1
fieldSpecs:
- kind: ValidatingWebhookConfiguration
group: admissionregistration.k8s.io
path: webhooks/clientConfig/service/name

namespace:
- kind: ValidatingWebhookConfiguration
group: admissionregistration.k8s.io
path: webhooks/clientConfig/service/namespace
create: true

varReference:
- path: metadata/annotations
26 changes: 26 additions & 0 deletions manifests/base/webhook/manifests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
name: validating-webhook-configuration
webhooks:
- admissionReviewVersions:
- v1
clientConfig:
service:
name: webhook-service
namespace: system
path: /validate-kubeflow-org-v1-pytorchjob
failurePolicy: Fail
name: vpytorchjob.kb.io
rules:
- apiGroups:
- kubeflow.org
apiVersions:
- v1beta1
operations:
- CREATE
- UPDATE
resources:
- pytorchjobs
sideEffects: None
10 changes: 10 additions & 0 deletions manifests/base/webhook/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
apiVersion: v1
kind: Service
metadata:
name: webhook-service
namespace: system
spec:
ports:
- port: 443
protocol: TCP
targetPort: 9443
92 changes: 0 additions & 92 deletions pkg/apis/kubeflow.org/v1/pytorch_validation.go

This file was deleted.

Loading

0 comments on commit 56c3669

Please sign in to comment.