From 6028a0ecfaaa445ea9a2183748fbaa6865bddc6a Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Wed, 17 Jan 2018 06:30:06 -0800 Subject: [PATCH 1/3] Fix bugs with TfJob operator prototype (#127) * Update the Docker image to get a docker image with the dashboard * Update the flags because the flag --controller_config_file was renamed to --controller-config-file. --- kubeflow/core/prototypes/all.jsonnet | 2 +- kubeflow/core/tf-job.libsonnet | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kubeflow/core/prototypes/all.jsonnet b/kubeflow/core/prototypes/all.jsonnet index 82336f821aa..6b77bba6638 100644 --- a/kubeflow/core/prototypes/all.jsonnet +++ b/kubeflow/core/prototypes/all.jsonnet @@ -6,7 +6,7 @@ // @optionalParam namespace string default Namespace // @optionalParam disks string null Comma separated list of Google persistent disks to attach to jupyter environments. // @optionalParam cloud string null String identifying the cloud to customize the deployment for. -// @optionalParam tfJobImage string gcr.io/tf-on-k8s-dogfood/tf_operator:v20171223-37af20d The image for the TfJob controller. +// @optionalParam tfJobImage string gcr.io/tf-on-k8s-dogfood/tf_operator:v20180117-04425d9-dirty-e3b0c44 The image for the TfJob controller. // @optionalParam tfDefaultImage string null The default image to use for TensorFlow. // @optionalParam tfJobUiServiceType string ClusterIP The service type for the UI. 
diff --git a/kubeflow/core/tf-job.libsonnet b/kubeflow/core/tf-job.libsonnet index d36f9d975cb..d5df9a37d03 100644 --- a/kubeflow/core/tf-job.libsonnet +++ b/kubeflow/core/tf-job.libsonnet @@ -23,9 +23,9 @@ { "command": [ "/opt/mlkube/tf_operator", - "--controller_config_file=/etc/config/controller_config_file.yaml", - "-alsologtostderr", - "-v=1" + "--controller-config-file=/etc/config/controller_config_file.yaml", + "--alsologtostderr", + "-v=1", ], "env": [ { From 8c20ea9af79601ec27beccdbfe2ff31a5f506587 Mon Sep 17 00:00:00 2001 From: Rohit Agarwal Date: Wed, 17 Jan 2018 06:31:38 -0800 Subject: [PATCH 2/3] Add missing namespace to command invocation (#124) --- user_guide.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/user_guide.md b/user_guide.md index 1be7b320f07..e8bc0044970 100644 --- a/user_guide.md +++ b/user_guide.md @@ -82,12 +82,12 @@ ks show ${KF_ENV} -c kubeflow-core The kubeflow-core component deployed JupyterHub and a corresponding load balancer service. You can check its status using the kubectl command line. ```commandline -kubectl get svc +kubectl get svc -n=${NAMESPACE} -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -kubernetes ClusterIP 10.11.240.1 443/TCP 1h -tf-hub-0 ClusterIP None 8000/TCP 1m -tf-hub-lb LoadBalancer 10.11.245.94 xx.yy.zz.ww 80:32481/TCP 1m +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +tf-hub-0 ClusterIP None 8000/TCP 1m +tf-hub-lb LoadBalancer 10.11.245.94 xx.yy.zz.ww 80:32481/TCP 1m +tf-job-dashboard ClusterIP 10.11.240.151 80/TCP 1m ``` If you're using minikube, you can run the following to get the URL for the notebook. 
From 4c9217d2cbfbb7d47bd03a4dbe019b665a274eb4 Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Wed, 17 Jan 2018 20:50:24 -0800 Subject: [PATCH 3/3] Fix TfJob operator roles and TfCNN prototype (#130) * Fix the TFCNN prototype; the termination policy wasn't being properly set * Create service accounts and role bindings for the TfJob operator and UI * Fix #129 TfCnn template doesn't set termination policy correctly * Fix #125 Missing roles for tf-job operator * Fix #95; presubmits/postsubmits need to use the code at the commit we checked out *We do this by replacing the directory in vendor with a symbolic link to where we checked out the source. * It looks like using "--as" with ksonnet leads to strange errors about the server not being able to create the config map * If we don't use "--as" need to fetch credentials a second time or else we get RBAC issues creating the cluster --- kubeflow/core/prototypes/all.jsonnet | 9 +- kubeflow/core/tf-job.libsonnet | 230 +++++++++++++++++- .../prototypes/tf-cnn-benchmarks.jsonnet | 7 +- testing/test-infra/debug_pod.yaml | 31 +++ testing/test_deploy.py | 33 ++- 5 files changed, 294 insertions(+), 16 deletions(-) create mode 100644 testing/test-infra/debug_pod.yaml diff --git a/kubeflow/core/prototypes/all.jsonnet b/kubeflow/core/prototypes/all.jsonnet index 6b77bba6638..411195f13b2 100644 --- a/kubeflow/core/prototypes/all.jsonnet +++ b/kubeflow/core/prototypes/all.jsonnet @@ -71,9 +71,14 @@ std.prune(k.core.v1.list.new([ tfjob.parts(namespace).tfJobDeploy(tfJobImage), tfjob.parts(namespace).configMap(cloud, tfDefaultImage), tfjob.parts(namespace).serviceAccount, + tfjob.parts(namespace).operatorRole, + tfjob.parts(namespace).operatorRoleBinding, // TfJob controll ui - tfjob.parts(namespace).ui(tfJobImage), - tfjob.parts(namespace).uiService(tfJobUiServiceType), + tfjob.parts(namespace).ui(tfJobImage), + tfjob.parts(namespace).uiService(tfJobUiServiceType), + tfjob.parts(namespace).uiServiceAccount, + 
tfjob.parts(namespace).uiRole, + tfjob.parts(namespace).uiRoleBinding, ] + nfsComponents)) diff --git a/kubeflow/core/tf-job.libsonnet b/kubeflow/core/tf-job.libsonnet index d5df9a37d03..f85765264a9 100644 --- a/kubeflow/core/tf-job.libsonnet +++ b/kubeflow/core/tf-job.libsonnet @@ -132,6 +132,114 @@ } }, + operatorRole: { + "apiVersion": "rbac.authorization.k8s.io/v1beta1", + "kind": "ClusterRole", + "metadata": { + "labels": { + "app": "tf-job-operator" + }, + "name": "tf-job-operator" + }, + "rules": [ + { + "apiGroups": [ + "tensorflow.org" + ], + "resources": [ + "tfjobs" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "apiextensions.k8s.io" + ], + "resources": [ + "customresourcedefinitions" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "storage.k8s.io" + ], + "resources": [ + "storageclasses" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "batch" + ], + "resources": [ + "jobs" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "" + ], + "resources": [ + "configmaps", + "pods", + "services", + "endpoints", + "persistentvolumeclaims", + "events" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "apps", + "extensions" + ], + "resources": [ + "deployments" + ], + "verbs": [ + "*" + ] + } + ] + }, // operator-role + + operatorRoleBinding:: { + "apiVersion": "rbac.authorization.k8s.io/v1beta1", + "kind": "ClusterRoleBinding", + "metadata": { + "labels": { + "app": "tf-job-operator" + }, + "name": "tf-job-operator" + }, + "roleRef": { + "apiGroup": "rbac.authorization.k8s.io", + "kind": "ClusterRole", + "name": "tf-job-operator" + }, + "subjects": [ + { + "kind": "ServiceAccount", + "name": "tf-job-operator", + "namespace": namespace, + } + ] + }, // operator-role binding + uiService(serviceType):: { "apiVersion": "v1", "kind": "Service", @@ -153,6 +261,18 @@ } }, // uiService + uiServiceAccount: { + "apiVersion": "v1", + "kind": "ServiceAccount", + "metadata": { + "labels": { + "app": "tf-job-dashboard" + }, + "name": 
"tf-job-dashboard", + "namespace": namespace, + } + }, // uiServiceAccount + ui(image):: { "apiVersion": "extensions/v1beta1", "kind": "Deployment", @@ -181,11 +301,119 @@ } ] } - ] + ], + "serviceAccountName": "tf-job-dashboard", } } }, }, // ui + uiRole:: { + "apiVersion": "rbac.authorization.k8s.io/v1beta1", + "kind": "ClusterRole", + "metadata": { + "labels": { + "app": "tf-job-dashboard" + }, + "name": "tf-job-dashboard" + }, + "rules": [ + { + "apiGroups": [ + "tensorflow.org" + ], + "resources": [ + "tfjobs" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "apiextensions.k8s.io" + ], + "resources": [ + "customresourcedefinitions" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "storage.k8s.io" + ], + "resources": [ + "storageclasses" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "batch" + ], + "resources": [ + "jobs" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "" + ], + "resources": [ + "configmaps", + "pods", + "services", + "endpoints", + "persistentvolumeclaims", + "events" + ], + "verbs": [ + "*" + ] + }, + { + "apiGroups": [ + "apps", + "extensions" + ], + "resources": [ + "deployments" + ], + "verbs": [ + "*" + ] + } + ] + }, // uiRole + + uiRoleBinding:: { + "apiVersion": "rbac.authorization.k8s.io/v1beta1", + "kind": "ClusterRoleBinding", + "metadata": { + "labels": { + "app": "tf-job-dashboard" + }, + "name": "tf-job-dashboard" + }, + "roleRef": { + "apiGroup": "rbac.authorization.k8s.io", + "kind": "ClusterRole", + "name": "tf-job-dashboard" + }, + "subjects": [ + { + "kind": "ServiceAccount", + "name": "tf-job-dashboard", + "namespace": namespace, + } + ] + }, // uiRoleBinding }, } diff --git a/kubeflow/tf-job/prototypes/tf-cnn-benchmarks.jsonnet b/kubeflow/tf-job/prototypes/tf-cnn-benchmarks.jsonnet index 791f18bf3a5..3cafda2d2be 100644 --- a/kubeflow/tf-job/prototypes/tf-cnn-benchmarks.jsonnet +++ b/kubeflow/tf-job/prototypes/tf-cnn-benchmarks.jsonnet @@ -92,8 +92,9 @@ local job = error "num_ps must be >= 1" 
else tfJob.parts.tfJob(name, namespace, replicas) + { - tfImage: image, - terminationPolicy: {chief:{replicaName: "WORKER", replicaIndex: 0 }} - }; + spec+: { + tfImage: image, + terminationPolicy: {chief:{replicaName: "WORKER", replicaIndex: 0 }} + }}; std.prune(k.core.v1.list.new([job])) diff --git a/testing/test-infra/debug_pod.yaml b/testing/test-infra/debug_pod.yaml new file mode 100644 index 00000000000..805404b5308 --- /dev/null +++ b/testing/test-infra/debug_pod.yaml @@ -0,0 +1,31 @@ +# This pod is useful for starting a shell that you can use to interactively debug our tests +apiVersion: batch/v1 +kind: Job +metadata: + name: test-job + namespace: kubeflow-test-infra +spec: + template: + spec: + containers: + - name: test-container + image: gcr.io/mlkube-testing/kubeflow-testing:latest + command: ["tail", "-f", "/dev/null"] + volumeMounts: + - mountPath: /mnt/test-data-volume + name: kubeflow-test-volume + - mountPath: /secret/gcp-credentials + name: gcp-credentials + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /secret/gcp-credentials/key.json + restartPolicy: Never + volumes: + - name: kubeflow-test-volume + persistentVolumeClaim: + claimName: kubeflow-testing + - name: gcp-credentials + secret: + secretName: kubeflow-testing-credentials + + backoffLimit: 4 \ No newline at end of file diff --git a/testing/test_deploy.py b/testing/test_deploy.py index 63c867542fa..1564e52b380 100644 --- a/testing/test_deploy.py +++ b/testing/test_deploy.py @@ -15,6 +15,7 @@ import json import logging import os +import shutil import tempfile import uuid @@ -43,7 +44,8 @@ def _setup_test(api_client, run_label): try: logging.info("Creating namespace %s", namespace.metadata.name) - api.create_namespace(namespace) + namespace = api.create_namespace(namespace) + logging.info("Namespace %s created.", namespace.metadata.name) except rest.ApiException as e: if e.status == 409: logging.info("Namespace %s already exists.", namespace.metadata.name) @@ -96,30 +98,41 @@ def 
run(): app_dir = os.path.join(args.test_dir, app_name) - # TODO(jlewi): In presubmits we probably want to change this so we can - # pull the changes on a branch. Its not clear whether that's well supported - # in Ksonnet yet. kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow" util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir) # Install required packages - # TODO(jlewi): For presubmits how do we pull the package from the desired - # branch at the desired commit. packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"] for p in packages: util.run(["ks", "pkg", "install", p], cwd=app_dir) + # Delete the vendor directory and replace with a symlink to the src + # so that we use the code at the desired commit. + target_dir = os.path.join(app_dir, "vendor", "kubeflow") + + logging.info("Deleting %s", target_dir) + shutil.rmtree(target_dir) + + source = os.path.join(args.test_dir, "src", "kubeflow") + logging.info("Creating link %s -> %s", target_dir, source) + os.symlink(source, target_dir) + # Deploy Kubeflow util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core", "--namespace=" + namespace.metadata.name], cwd=app_dir) + # TODO(jlewi): For reasons I don't understand even though we ran + # configure_kubectl above, if we don't rerun it we get rbac errors + # when we do ks apply; I think because we aren't using the proper service + # account. This might have something to do with the way ksonnet gets + # its credentials; maybe we need to configure credentials after calling + # ks init? 
+ if args.cluster: + util.configure_kubectl(args.project, args.zone, args.cluster) + apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",] - if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"): - with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS")) as hf: - key = json.load(hf) - apply_command.append("--as=" + key["client_email"]) util.run(apply_command, cwd=app_dir) # Verify that the TfJob operator is actually deployed.