From 31f18fe0203740f78a8c44f76445e14b15261fc0 Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Sun, 4 Jun 2023 21:03:22 +0900 Subject: [PATCH] Breaking Changes: Upgrade Scheduler Plugins version to v0.25.7 Signed-off-by: Yuki Iwai --- .github/workflows/integration-tests.yaml | 5 ++++ go.mod | 2 +- go.sum | 4 +-- manifests/base/rbac/role.yaml | 4 +-- pkg/controller.v1/mpi/mpijob_controller.go | 2 +- pkg/controller.v1/mxnet/mxjob_controller.go | 2 +- .../paddlepaddle/paddlepaddle_controller.go | 2 +- .../pytorch/pytorchjob_controller.go | 2 +- .../tensorflow/tfjob_controller.go | 2 +- .../xgboost/xgboostjob_controller.go | 2 +- scripts/gha/setup-training-operator.sh | 27 +++++++------------ 11 files changed, 25 insertions(+), 29 deletions(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index d2f1ca872a..810be72b83 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -54,6 +54,11 @@ jobs: with: python-version: ${{ matrix.python-version }} + - name: Setup Go + uses: actions/setup-go@v3 + with: + go-version-file: go.mod + - name: Create k8s Kind Cluster uses: helm/kind-action@v1.3.0 with: diff --git a/go.mod b/go.mod index 697ea22326..d0ce6c8ebf 100644 --- a/go.mod +++ b/go.mod @@ -20,7 +20,7 @@ require ( k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280 k8s.io/utils v0.0.0-20221128185143-99ec85e7a448 sigs.k8s.io/controller-runtime v0.14.6 - sigs.k8s.io/scheduler-plugins v0.24.9 + sigs.k8s.io/scheduler-plugins v0.25.7 sigs.k8s.io/yaml v1.3.0 volcano.sh/apis v1.2.0-k8s1.19.6 ) diff --git a/go.sum b/go.sum index 6e1aec2662..086c07c244 100644 --- a/go.sum +++ b/go.sum @@ -782,8 +782,8 @@ sigs.k8s.io/controller-runtime v0.14.6 h1:oxstGVvXGNnMvY7TAESYk+lzr6S3V5VFxQ6d92 sigs.k8s.io/controller-runtime v0.14.6/go.mod h1:WqIdsAY6JBsjfc/CqO0CORmNtoCtE4S6qbPc9s68h+0= sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k= sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/scheduler-plugins v0.24.9 h1:9oGtwk6uh7mZMCX8+O+PipQzBiRq9d2+E3xq1cn7zbc= -sigs.k8s.io/scheduler-plugins v0.24.9/go.mod h1:0u2b/0SwY2ozDhOD/f1S3e5IbStoDFLUK8yP5dJTaQ8= +sigs.k8s.io/scheduler-plugins v0.25.7 h1:2qSTXfHmzfFZJF9M9UHLiDXGdDXX+sUs/cn0dHbc4qk= +sigs.k8s.io/scheduler-plugins v0.25.7/go.mod h1:CKgZ1xu9WZdB3CMSzOjro/rtrBY/bQWMf6un2M9VNS4= sigs.k8s.io/structured-merge-diff/v4 v4.0.1/go.mod h1:bJZC9H9iH24zzfZ/41RGcq60oK1F7G282QMXDPYydCw= sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= diff --git a/manifests/base/rbac/role.yaml b/manifests/base/rbac/role.yaml index 64f21b33b5..4c77d2fae6 100644 --- a/manifests/base/rbac/role.yaml +++ b/manifests/base/rbac/role.yaml @@ -249,7 +249,7 @@ rules: - update - watch - apiGroups: - - scheduling.sigs.k8s.io + - scheduling.volcano.sh resources: - podgroups verbs: @@ -261,7 +261,7 @@ rules: - update - watch - apiGroups: - - scheduling.volcano.sh + - scheduling.x-k8s.io resources: - podgroups verbs: diff --git a/pkg/controller.v1/mpi/mpijob_controller.go b/pkg/controller.v1/mpi/mpijob_controller.go index b7c6e23ce0..a1a95067fd 100644 --- a/pkg/controller.v1/mpi/mpijob_controller.go +++ b/pkg/controller.v1/mpi/mpijob_controller.go @@ -120,7 +120,7 @@ type MPIJobReconciler struct { //+kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=rolebindings,verbs=list;watch;create;update //+kubebuilder:rbac:groups="",resources=pods/exec,verbs=create //+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=scheduling.sigs.k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete // Reconcile is part of the main kubernetes reconciliation loop which aims to diff --git a/pkg/controller.v1/mxnet/mxjob_controller.go b/pkg/controller.v1/mxnet/mxjob_controller.go index e3d08503cf..6e901275a4 100644 --- a/pkg/controller.v1/mxnet/mxjob_controller.go +++ b/pkg/controller.v1/mxnet/mxjob_controller.go @@ -118,7 +118,7 @@ type MXJobReconciler struct { //+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete //+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=scheduling.sigs.k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete func (r *MXJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { diff --git a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go index 31e70174e0..858e2a9d7b 100644 --- a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go +++ b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go @@ -110,7 +110,7 @@ type PaddleJobReconciler struct { //+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete //+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=scheduling.sigs.k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete // Reconcile is part of the main kubernetes reconciliation loop which aims to diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller.go b/pkg/controller.v1/pytorch/pytorchjob_controller.go index 88bb9ddf35..cdbd100320 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller.go @@ -111,7 +111,7 @@ type PyTorchJobReconciler struct { //+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete //+kubebuilder:rbac:groups=autoscaling,resources=horizontalpodautoscalers,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=scheduling.sigs.k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete // Reconcile is part of the main kubernetes reconciliation loop which aims to diff --git a/pkg/controller.v1/tensorflow/tfjob_controller.go b/pkg/controller.v1/tensorflow/tfjob_controller.go index 495e26d911..78565ed2a0 100644 --- a/pkg/controller.v1/tensorflow/tfjob_controller.go +++ b/pkg/controller.v1/tensorflow/tfjob_controller.go @@ -134,7 +134,7 @@ type TFJobReconciler struct { //+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete //+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=scheduling.sigs.k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete // Reconcile is part of the main kubernetes reconciliation loop which aims to diff --git a/pkg/controller.v1/xgboost/xgboostjob_controller.go b/pkg/controller.v1/xgboost/xgboostjob_controller.go index cbf7d631a0..8af8c33f96 100644 --- a/pkg/controller.v1/xgboost/xgboostjob_controller.go +++ b/pkg/controller.v1/xgboost/xgboostjob_controller.go @@ -124,7 +124,7 @@ type XGBoostJobReconciler struct { //+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete //+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=scheduling.sigs.k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete // Reconcile reads that state of the cluster for a XGBoostJob object and makes changes based on the state read diff --git a/scripts/gha/setup-training-operator.sh b/scripts/gha/setup-training-operator.sh index e051fad1bc..aa9c98cd74 100755 --- a/scripts/gha/setup-training-operator.sh +++ b/scripts/gha/setup-training-operator.sh @@ -14,10 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This shell script is used to build a cluster and create a namespace from our -# argo workflow - - set -o errexit set -o nounset set -o pipefail @@ -34,20 +30,14 @@ echo "Installing training operator manifests" kustomize build . | kubectl apply -f - if [ "${GANG_SCHEDULER_NAME}" = "scheduler-plugins" ]; then - echo "Installing Scheduler Plugins..." - # We need to use latest helm chart since older helm chart has bugs in RBAC. - git clone https://github.com/kubernetes-sigs/scheduler-plugins.git - pushd scheduler-plugins/manifests/install/charts - - # Since https://github.com/kubernetes-sigs/scheduler-plugins/pull/526, the scheduler-plugins switch the API group to 'x-k8s.io'. - # So we must use the specific commit version to available the older API group, 'sigs.k8s.io'. - # Details: https://github.com/kubeflow/training-operator/issues/1769 - # TODO: Once we support new API group, we should switch the scheduler-plugins version. - git checkout df16b76a226e58b6961b30ba800e5a713d433c44 + SCHEDULER_PLUGINS_VERSION=$(go list -m -f "{{.Version}}" sigs.k8s.io/scheduler-plugins) + git clone https://github.com/kubernetes-sigs/scheduler-plugins.git -b "${SCHEDULER_PLUGINS_VERSION}" - helm install scheduler-plugins as-a-second-scheduler/ - popd - rm -rf scheduler-plugins + echo "Installing Scheduler Plugins ${SCHEDULER_PLUGINS_VERSION}..." + helm install scheduler-plugins scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ --create-namespace \ + --namespace scheduler-plugins \ + --set controller.image="registry.k8s.io/scheduler-plugins/controller:${SCHEDULER_PLUGINS_VERSION}" \ + --set scheduler.image="registry.k8s.io/scheduler-plugins/kube-scheduler:${SCHEDULER_PLUGINS_VERSION}" echo "Configure gang-scheduling using scheduler-plugins to training-operator" kubectl patch -n kubeflow deployments training-operator --type='json' \ @@ -60,7 +50,8 @@ until kubectl get pods -n kubeflow | grep training-operator | grep 1/1 || [[ $TI TIMEOUT=$(( TIMEOUT - 1 )) done if [ "${GANG_SCHEDULER_NAME}" = "scheduler-plugins" ]; then - kubectl wait pods --for=condition=ready -n scheduler-plugins --timeout "${TIMEOUT}s" --all + kubectl wait pods --for=condition=ready -n scheduler-plugins --timeout "${TIMEOUT}s" --all || \ + (kubectl get pods -n scheduler-plugins && kubectl describe pods -n scheduler-plugins; exit 1) fi kubectl version