From 6028a0ecfaaa445ea9a2183748fbaa6865bddc6a Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Wed, 17 Jan 2018 06:30:06 -0800 Subject: [PATCH 1/3] Fix bugs with TfJob operator prototype (#127) * Update the Docker image to get a docker image with the dashboard * Update the flags because the flag --controller_config_file was renamed to --controller-config-file. --- kubeflow/core/prototypes/all.jsonnet | 2 +- kubeflow/core/tf-job.libsonnet | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kubeflow/core/prototypes/all.jsonnet b/kubeflow/core/prototypes/all.jsonnet index 82336f821aa..6b77bba6638 100644 --- a/kubeflow/core/prototypes/all.jsonnet +++ b/kubeflow/core/prototypes/all.jsonnet @@ -6,7 +6,7 @@ // @optionalParam namespace string default Namespace // @optionalParam disks string null Comma separated list of Google persistent disks to attach to jupyter environments. // @optionalParam cloud string null String identifying the cloud to customize the deployment for. -// @optionalParam tfJobImage string gcr.io/tf-on-k8s-dogfood/tf_operator:v20171223-37af20d The image for the TfJob controller. +// @optionalParam tfJobImage string gcr.io/tf-on-k8s-dogfood/tf_operator:v20180117-04425d9-dirty-e3b0c44 The image for the TfJob controller. // @optionalParam tfDefaultImage string null The default image to use for TensorFlow. // @optionalParam tfJobUiServiceType string ClusterIP The service type for the UI. 
diff --git a/kubeflow/core/tf-job.libsonnet b/kubeflow/core/tf-job.libsonnet index d36f9d975cb..d5df9a37d03 100644 --- a/kubeflow/core/tf-job.libsonnet +++ b/kubeflow/core/tf-job.libsonnet @@ -23,9 +23,9 @@ { "command": [ "/opt/mlkube/tf_operator", - "--controller_config_file=/etc/config/controller_config_file.yaml", - "-alsologtostderr", - "-v=1" + "--controller-config-file=/etc/config/controller_config_file.yaml", + "--alsologtostderr", + "-v=1", ], "env": [ { From 8c20ea9af79601ec27beccdbfe2ff31a5f506587 Mon Sep 17 00:00:00 2001 From: Rohit Agarwal Date: Wed, 17 Jan 2018 06:31:38 -0800 Subject: [PATCH 2/3] Add missing namespace to command invocation (#124) --- user_guide.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/user_guide.md b/user_guide.md index 1be7b320f07..e8bc0044970 100644 --- a/user_guide.md +++ b/user_guide.md @@ -82,12 +82,12 @@ ks show ${KF_ENV} -c kubeflow-core The kubeflow-core component deployed JupyterHub and a corresponding load balancer service. You can check its status using the kubectl command line. ```commandline -kubectl get svc +kubectl get svc -n=${NAMESPACE} -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -kubernetes ClusterIP 10.11.240.1 443/TCP 1h -tf-hub-0 ClusterIP None 8000/TCP 1m -tf-hub-lb LoadBalancer 10.11.245.94 xx.yy.zz.ww 80:32481/TCP 1m +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +tf-hub-0 ClusterIP None 8000/TCP 1m +tf-hub-lb LoadBalancer 10.11.245.94 xx.yy.zz.ww 80:32481/TCP 1m +tf-job-dashboard ClusterIP 10.11.240.151 80/TCP 1m ``` If you're using minikube, you can run the following to get the URL for the notebook. 
From 4c9217d2cbfbb7d47bd03a4dbe019b665a274eb4 Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Wed, 17 Jan 2018 20:50:24 -0800 Subject: [PATCH 3/3] Fix TfJob operator roles and TfCNN prototype (#130) * Fix the TFCNN prototype; the termination policy wasn't being properly set * Create service accounts and role bindings for the TfJob operator and UI * Fix #129 TfCnn template doesn't set termination policy correctly * Fix #125 Missing roles for tf-job operator * Fix #95; presubmits/postsubmits need to use the code at the commit we checked out *We do this by replacing the directory in vendor with a symbolic link to where we checked out the source. * It looks like using "--as" with ksonnet leads to strange errors about the server not being able to create the config map * If we don't use "--as" need to fetch credentials a second time or else we get RBAC issues creating the cluster --- kubeflow/core/prototypes/all.jsonnet | 9 +- kubeflow/core/tf-job.libsonnet | 230 +++++++++++++++++- .../prototypes/tf-cnn-benchmarks.jsonnet | 7 +- testing/test-infra/debug_pod.yaml | 31 +++ testing/test_deploy.py | 33 ++- 5 files changed, 294 insertions(+), 16 deletions(-) create mode 100644 testing/test-infra/debug_pod.yaml diff --git a/kubeflow/core/prototypes/all.jsonnet b/kubeflow/core/prototypes/all.jsonnet index 6b77bba6638..411195f13b2 100644 --- a/kubeflow/core/prototypes/all.jsonnet +++ b/kubeflow/core/prototypes/all.jsonnet @@ -71,9 +71,14 @@ std.prune(k.core.v1.list.new([ tfjob.parts(namespace).tfJobDeploy(tfJobImage), tfjob.parts(namespace).configMap(cloud, tfDefaultImage), tfjob.parts(namespace).serviceAccount, + tfjob.parts(namespace).operatorRole, + tfjob.parts(namespace).operatorRoleBinding, // TfJob controll ui - tfjob.parts(namespace).ui(tfJobImage), - tfjob.parts(namespace).uiService(tfJobUiServiceType), + tfjob.parts(namespace).ui(tfJobImage), + tfjob.parts(namespace).uiService(tfJobUiServiceType), + tfjob.parts(namespace).uiServiceAccount, + 
tfjob.parts(namespace).uiRole, + tfjob.parts(namespace).uiRoleBinding, ] + nfsComponents)) diff --git a/kubeflow/core/tf-job.libsonnet b/kubeflow/core/tf-job.libsonnet index d5df9a37d03..f85765264a9 100644 --- a/kubeflow/core/tf-job.libsonnet +++ b/kubeflow/core/tf-job.libsonnet @@ -132,6 +132,114 @@ } }, + operatorRole: { + "apiVersion": "rbac.authorization.k8s.io/v1beta1", + "kind": "ClusterRole", + "metadata": { + "labels": { + "app": "tf-job-operator" + }, + "name": "tf-job-operator" + }, + "rules": [ + { + "apiGroups": [ + "tensorflow.org" + ], + "resources": [ + "tfjobs" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "apiextensions.k8s.io" + ], + "resources": [ + "customresourcedefinitions" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "storage.k8s.io" + ], + "resources": [ + "storageclasses" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "batch" + ], + "resources": [ + "jobs" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "" + ], + "resources": [ + "configmaps", + "pods", + "services", + "endpoints", + "persistentvolumeclaims", + "events" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "apps", + "extensions" + ], + "resources": [ + "deployments" + ], + "verbs": [ + "*" + ] + } + ] + }, // operator-role + + operatorRoleBinding:: { + "apiVersion": "rbac.authorization.k8s.io/v1beta1", + "kind": "ClusterRoleBinding", + "metadata": { + "labels": { + "app": "tf-job-operator" + }, + "name": "tf-job-operator" + }, + "roleRef": { + "apiGroup": "rbac.authorization.k8s.io", + "kind": "ClusterRole", + "name": "tf-job-operator" + }, + "subjects": [ + { + "kind": "ServiceAccount", + "name": "tf-job-operator", + "namespace": namespace, + } + ] + }, // operator-role binding + uiService(serviceType):: { "apiVersion": "v1", "kind": "Service", @@ -153,6 +261,18 @@ } }, // uiService + uiServiceAccount: { + "apiVersion": "v1", + "kind": "ServiceAccount", + "metadata": { + "labels": { + "app": "tf-job-dashboard" + }, + "name": 
"tf-job-dashboard", + "namespace": namespace, + } + }, // uiServiceAccount + ui(image):: { "apiVersion": "extensions/v1beta1", "kind": "Deployment", @@ -181,11 +301,119 @@ } ] } - ] + ], + "serviceAccountName": "tf-job-dashboard", } } }, }, // ui + uiRole:: { + "apiVersion": "rbac.authorization.k8s.io/v1beta1", + "kind": "ClusterRole", + "metadata": { + "labels": { + "app": "tf-job-dashboard" + }, + "name": "tf-job-dashboard" + }, + "rules": [ + { + "apiGroups": [ + "tensorflow.org" + ], + "resources": [ + "tfjobs" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "apiextensions.k8s.io" + ], + "resources": [ + "customresourcedefinitions" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "storage.k8s.io" + ], + "resources": [ + "storageclasses" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "batch" + ], + "resources": [ + "jobs" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "" + ], + "resources": [ + "configmaps", + "pods", + "services", + "endpoints", + "persistentvolumeclaims", + "events" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "apps", + "extensions" + ], + "resources": [ + "deployments" + ], + "verbs": [ + "*" + ] + } + ] + }, // uiRole + + uiRoleBinding:: { + "apiVersion": "rbac.authorization.k8s.io/v1beta1", + "kind": "ClusterRoleBinding", + "metadata": { + "labels": { + "app": "tf-job-dashboard" + }, + "name": "tf-job-dashboard" + }, + "roleRef": { + "apiGroup": "rbac.authorization.k8s.io", + "kind": "ClusterRole", + "name": "tf-job-dashboard" + }, + "subjects": [ + { + "kind": "ServiceAccount", + "name": "tf-job-dashboard", + "namespace": namespace, + } + ] + }, // uiRoleBinding }, } diff --git a/kubeflow/tf-job/prototypes/tf-cnn-benchmarks.jsonnet b/kubeflow/tf-job/prototypes/tf-cnn-benchmarks.jsonnet index 791f18bf3a5..3cafda2d2be 100644 --- a/kubeflow/tf-job/prototypes/tf-cnn-benchmarks.jsonnet +++ b/kubeflow/tf-job/prototypes/tf-cnn-benchmarks.jsonnet @@ -92,8 +92,9 @@ local job = error "num_ps must be >= 1" 
else tfJob.parts.tfJob(name, namespace, replicas) + { - tfImage: image, - terminationPolicy: {chief:{replicaName: "WORKER", replicaIndex: 0 }} - }; + spec+: { + tfImage: image, + terminationPolicy: {chief:{replicaName: "WORKER", replicaIndex: 0 }} + }}; std.prune(k.core.v1.list.new([job])) diff --git a/testing/test-infra/debug_pod.yaml b/testing/test-infra/debug_pod.yaml new file mode 100644 index 00000000000..805404b5308 --- /dev/null +++ b/testing/test-infra/debug_pod.yaml @@ -0,0 +1,31 @@ +# This pod is useful for starting a shell that you can use to interactively debug our tests +apiVersion: batch/v1 +kind: Job +metadata: + name: test-job + namespace: kubeflow-test-infra +spec: + template: + spec: + containers: + - name: test-container + image: gcr.io/mlkube-testing/kubeflow-testing:latest + command: ["tail", "-f", "/dev/null"] + volumeMounts: + - mountPath: /mnt/test-data-volume + name: kubeflow-test-volume + - mountPath: /secret/gcp-credentials + name: gcp-credentials + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /secret/gcp-credentials/key.json + restartPolicy: Never + volumes: + - name: kubeflow-test-volume + persistentVolumeClaim: + claimName: kubeflow-testing + - name: gcp-credentials + secret: + secretName: kubeflow-testing-credentials + + backoffLimit: 4 \ No newline at end of file diff --git a/testing/test_deploy.py b/testing/test_deploy.py index 63c867542fa..1564e52b380 100644 --- a/testing/test_deploy.py +++ b/testing/test_deploy.py @@ -15,6 +15,7 @@ import json import logging import os +import shutil import tempfile import uuid @@ -43,7 +44,8 @@ def _setup_test(api_client, run_label): try: logging.info("Creating namespace %s", namespace.metadata.name) - api.create_namespace(namespace) + namespace = api.create_namespace(namespace) + logging.info("Namespace %s created.", namespace.metadata.name) except rest.ApiException as e: if e.status == 409: logging.info("Namespace %s already exists.", namespace.metadata.name) @@ -96,30 +98,41 @@ def 
run(): app_dir = os.path.join(args.test_dir, app_name) - # TODO(jlewi): In presubmits we probably want to change this so we can - # pull the changes on a branch. Its not clear whether that's well supported - # in Ksonnet yet. kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow" util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir) # Install required packages - # TODO(jlewi): For presubmits how do we pull the package from the desired - # branch at the desired commit. packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"] for p in packages: util.run(["ks", "pkg", "install", p], cwd=app_dir) + # Delete the vendor directory and replace with a symlink to the src + # so that we use the code at the desired commit. + target_dir = os.path.join(app_dir, "vendor", "kubeflow") + + logging.info("Deleting %s", target_dir) + shutil.rmtree(target_dir) + + source = os.path.join(args.test_dir, "src", "kubeflow") + logging.info("Creating link %s -> %s", target_dir, source) + os.symlink(source, target_dir) + # Deploy Kubeflow util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core", "--namespace=" + namespace.metadata.name], cwd=app_dir) + # TODO(jlewi): For reasons I don't understand even though we ran + # configure_kubectl above, if we don't rerun it we get rbac errors + # when we do ks apply; I think because we aren't using the proper service + # account. This might have something to do with the way ksonnet gets + # its credentials; maybe we need to configure credentials after calling + # ks init? 
+ if args.cluster: + util.configure_kubectl(args.project, args.zone, args.cluster) + apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",] - if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"): - with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS")) as hf: - key = json.load(hf) - apply_command.append("--as=" + key["client_email"]) util.run(apply_command, cwd=app_dir) # Verify that the TfJob operator is actually deployed.