diff --git a/.travis.yml b/.travis.yml index 6d788546..99642b86 100644 --- a/.travis.yml +++ b/.travis.yml @@ -36,6 +36,7 @@ script: # fail fast - set -e - export MAKE_ARGS=--no-print-directory + - export VM_TYPE=minikube # Open SSH # - echo travis:$sshpassword | sudo chpasswd # - sudo sed -i 's/ChallengeResponseAuthentication no/ChallengeResponseAuthentication yes/' /etc/ssh/sshd_config @@ -48,9 +49,14 @@ script: - make $MAKE_ARGS gen-certs - make $MAKE_ARGS build - make $MAKE_ARGS docker-build - - make $MAKE_ARGS create-volumes - # deploy services - - make $MAKE_ARGS deploy + # initialize helm tiller + - helm init + - while ! (kubectl get pods --all-namespaces | grep tiller-deploy | grep '1/1'); do sleep 5; done + # Prepare any local/static volume as the shared file system and deploy all the helper micro-services for ffdl + - helm install docs/helm-charts/ffdl-helper-0.1.1.tgz --set prometheus.deploy=false,localstorage=true --wait + # Deploy all the core ffdl services. + - export IMAGE_TAG=user-$(whoami) + - helm install docs/helm-charts/ffdl-core-0.1.1.tgz --set trainer.version=${IMAGE_TAG},restapi.version=${IMAGE_TAG},lcm.version=${IMAGE_TAG},trainingdata.version=${IMAGE_TAG},databroker.tag=${IMAGE_TAG},databroker.version=${IMAGE_TAG},webui.version=${IMAGE_TAG} --wait # submit a test job - make $MAKE_ARGS test-submit-minikube-ci diff --git a/Chart.yaml b/Chart.yaml deleted file mode 100644 index 6b29a9a5..00000000 --- a/Chart.yaml +++ /dev/null @@ -1,4 +0,0 @@ -name: ffdl -description: Fabric for Deep Learning (FfDL) -version: 0.1.1 -appVersion: 3.3 diff --git a/Makefile b/Makefile index 3fd7b446..333648da 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ WHOAMI ?= $(shell whoami) IMAGE_TAG ?= user-$(WHOAMI) TEST_SAMPLE ?= tf-model # VM_TYPE is "vagrant", "minikube" or "none" -VM_TYPE ?= minikube +VM_TYPE ?= none HAS_STATIC_VOLUMES?=false TEST_USER = test-user SET_LOCAL_ROUTES ?= 0 diff --git a/README.md b/README.md index fc278d57..8da3f10d 100644 --- a/README.md +++ b/README.md @@ -19,10 +19,8 @@ To know more about the architectural details, please read the [design document]( * `helm`: The Kubernetes package manager (https://helm.sh) * `docker`: The Docker command-line interface (https://www.docker.com/) * `S3 CLI`: The [command-line interface](https://aws.amazon.com/cli/) to configure your Object Storage -* An existing Kubernetes cluster (e.g., [Kubeadm-DIND](https://github.com/kubernetes-sigs/kubeadm-dind-cluster#using-preconfigured-scripts) for local testing). +* An existing Kubernetes cluster (e.g., [Kubeadm-DIND](https://github.com/kubernetes-sigs/kubeadm-dind-cluster#using-preconfigured-scripts) for local testing or Follow the appropriate instructions for standing up your Kubernetes cluster using [IBM Cloud Public](https://github.com/IBM/container-journey-template/blob/master/README.md) or [IBM Cloud Private](https://github.com/IBM/deploy-ibm-cloud-private/blob/master/README.md)). The minimum capacity requirement for FfDL is 4GB Memory and 3 CPUs. -* Follow the appropriate instructions for standing up your Kubernetes cluster using [IBM Cloud Public](https://github.com/IBM/container-journey-template/blob/master/README.md) or [IBM Cloud Private](https://github.com/IBM/deploy-ibm-cloud-private/blob/master/README.md) -* The minimum capacity requirement for FfDL is 4GB Memory and 3 CPUs. ## Usage Scenarios @@ -37,8 +35,8 @@ To know more about the architectural details, please read the [design document]( ## Steps 1. 
[Quick Start](#1-quick-start)
-  - 1.1 [Installation using Kubeadm-DIND](#11-installation-using-kubeadm-dind)
-  - 1.2 [Installation using Kubernetes Cluster](#12-installation-using-kubernetes-cluster)
+  - 1.1 [Installation using Kubernetes Cluster](#11-installation-using-kubernetes-cluster)
+  - 1.2 [Installation using Kubeadm-DIND](#12-installation-using-kubeadm-dind)
 2. [Test](#2-test)
 3. [Monitoring](#3-monitoring)
 4. [Development](#4-development)
@@ -48,57 +46,55 @@ To know more about the architectural details, please read the [design document](

 ## 1. Quick Start

-There are multiple installation paths for installing FfDL into an existing Kubernetes cluster. Below are the steps for quick install. If you want to follow more detailed step by step instructions , please visit [the detailed installation guide](docs/detailed-installation-guide.md)
+There are multiple installation paths for installing FfDL into an existing Kubernetes cluster. Below are the steps for a quick install. If you want to follow more detailed step-by-step instructions, please visit [the detailed installation guide](docs/detailed-installation-guide.md).

-> If you are using bash shell, you can modify the necessary environment variables in `env.txt` and export all of them using the following commands
-> ```shell
-> source env.txt
-> export $(cut -d= -f1 env.txt)
-> ```
+* You need to initialize tiller with `helm init` before running the following commands.

-### 1.1 Installation using Kubeadm-DIND
+### 1.1 Installation using Kubernetes Cluster
+
+To install FfDL to any proper Kubernetes cluster, make sure `kubectl` points to the right namespace,
+then deploy the platform services:

-If you have [Kubeadm-DIND](https://github.com/kubernetes-sigs/kubeadm-dind-cluster#using-preconfigured-scripts) installed on your machine, use these commands to deploy the FfDL platform:
 ``` shell
-export VM_TYPE=dind
-export PUBLIC_IP=localhost
-export SHARED_VOLUME_STORAGE_CLASS="";
 export NAMESPACE=default # If your namespace does not exist yet, please create the namespace `kubectl create namespace $NAMESPACE` before running the make commands below
+export SHARED_VOLUME_STORAGE_CLASS="ibmc-file-gold" # Change the storage class to what's available on your Cloud Kubernetes Cluster.

-make deploy-plugin
-make quickstart-deploy
+helm install ibmcloud-object-storage-plugin --name ibmcloud-object-storage-plugin --repo https://ibm.github.io/FfDL/helm-charts --set namespace=$NAMESPACE # Configure s3 driver on the cluster
+helm install ffdl-helper --name ffdl-helper --repo https://ibm.github.io/FfDL/helm-charts --set namespace=$NAMESPACE,shared_volume_storage_class=$SHARED_VOLUME_STORAGE_CLASS --wait # Deploy all the helper micro-services for ffdl
+helm install ffdl-core --name ffdl-core --repo https://ibm.github.io/FfDL/helm-charts --set namespace=$NAMESPACE,lcm.shared_volume_storage_class=$SHARED_VOLUME_STORAGE_CLASS --wait # Deploy all the core ffdl services.
 ```

-### 1.2 Installation using Kubernetes Cluster
-
-To install FfDL to any proper Kubernetes cluster, make sure `kubectl` points to the right namespace,
-then deploy the platform services:
-> Note: For PUBLIC_IP, put down one of your Cluster Public IP that can access your Cluster's NodePorts. For IBM Cloud, you can get your Public IP with `bx cs workers `.
+### 1.2 Installation using Kubeadm-DIND
+If you have [Kubeadm-DIND](https://github.com/kubernetes-sigs/kubeadm-dind-cluster#using-preconfigured-scripts) installed on your machine, use these commands to deploy the FfDL platform:
 ``` shell
-export VM_TYPE=none
-export PUBLIC_IP=
-export NAMESPACE=default # If your namespace does not exist yet, please create the namespace `kubectl create namespace $NAMESPACE` before running the make commands below
+export SHARED_VOLUME_STORAGE_CLASS=""
+export NAMESPACE=default

-# Change the storage class to what's available on your Cloud Kubernetes Cluster.
-export SHARED_VOLUME_STORAGE_CLASS="ibmc-file-gold";
+./bin/s3_driver.sh # Copy the s3 drivers to each of the DIND nodes
+helm install ibmcloud-object-storage-plugin --name ibmcloud-object-storage-plugin --repo https://ibm.github.io/FfDL/helm-charts --set namespace=$NAMESPACE,cloud=false
+helm install ffdl-helper --name ffdl-helper --repo https://ibm.github.io/FfDL/helm-charts --set namespace=$NAMESPACE,shared_volume_storage_class=$SHARED_VOLUME_STORAGE_CLASS,localstorage=true --wait
+helm install ffdl-core --name ffdl-core --repo https://ibm.github.io/FfDL/helm-charts --set namespace=$NAMESPACE,lcm.shared_volume_storage_class=$SHARED_VOLUME_STORAGE_CLASS --wait

-make deploy-plugin
-make quickstart-deploy
+# Forward the necessary microservices from the DIND cluster to your localhost.
+./bin/dind-port-forward.sh
 ```

 ## 2. Test

 To submit a simple example training job that is included in this repo (see `etc/examples` folder):

+> Note: For PUBLIC_IP, put down one of your Cluster Public IPs that can access your Cluster's NodePorts. You can check your Cluster Public IP with `kubectl get nodes -o wide`.
+> For IBM Cloud, you can get your Public IP with `bx cs workers `.
 ``` shell
+export PUBLIC_IP= # Put down localhost if you are running with Kubeadm-DIND
 make test-push-data-s3
 make test-job-submit
 ```

 ## 3. Monitoring

-The platform ships with a simple Grafana monitoring dashboard. The URL is printed out when running the `deploy` make target.
+The platform ships with a simple Grafana monitoring dashboard. The URL is printed out when running the `status` make target.

 ## 4. Development

@@ -107,12 +103,11 @@ Please refer to the [developer guide](docs/developer-guide.md) for more details.

 ## 5. Clean Up
 If you want to remove FfDL from your cluster, simply use the following commands.
 ```shell
-helm delete $(helm list | grep ffdl | awk '{print $1}' | head -n 1)
+helm delete --purge ffdl-core ffdl-helper
 ```
-If you want to remove the storage driver and pvc from your cluster, run:
+If you want to remove the storage driver from your cluster, run:
 ```shell
-kubectl delete pvc static-volume-1
-helm delete $(helm list | grep ibmcloud-object-storage-plugin | awk '{print $1}' | head -n 1)
+helm delete --purge ibmcloud-object-storage-plugin
 ```
 For Kubeadm-DIND, you need to kill your forwarded ports. Note that the below command will kill all the ports that are created with `kubectl`.
```shell diff --git a/bin/s3_driver.sh b/bin/s3_driver.sh index 3b51f358..5b8b8432 100755 --- a/bin/s3_driver.sh +++ b/bin/s3_driver.sh @@ -3,8 +3,8 @@ declare -a arrNodes=($(docker ps --format '{{.Names}}' | grep "kube-node-\|kube-master")) for node in "${arrNodes[@]}" do -docker cp $FFDL_PATH/bin/ibmc-s3fs $node:/root/ibmc-s3fs -docker cp $FFDL_PATH/bin/s3fs $node:/usr/local/bin/s3fs +docker cp ./bin/ibmc-s3fs $node:/root/ibmc-s3fs +docker cp ./bin/s3fs $node:/usr/local/bin/s3fs docker exec -i $node /bin/bash <<_EOF apt-get -y update apt-get -y install s3fs diff --git a/docs/detailed-installation-guide.md b/docs/detailed-installation-guide.md index 64ea45f4..9e340377 100644 --- a/docs/detailed-installation-guide.md +++ b/docs/detailed-installation-guide.md @@ -9,138 +9,76 @@ ## 1. Detailed Installation Instructions -0. If you don't have a Kubernetes Cluster, you can create a [Kubeadm-DIND](https://github.com/kubernetes-sigs/kubeadm-dind-cluster#using-preconfigured-scripts) Kubernetes Cluster on your local machine. We recommend you give at least 4 CPUs and 8GB of memory to your Docker. -> For Mac users, visit the instructions on the [Docker website](https://docs.docker.com/docker-for-mac/#advanced) and learn how to give more memory to your Docker. - -1. First, clone this repository and install the helm tiller on your Kubernetes cluster. +1. First, Install the helm tiller on your Kubernetes cluster. ``` shell helm init - -# Make sure the tiller pod is Running before proceeding to the next step. -kubectl get pods --all-namespaces | grep tiller-deploy -# kube-system tiller-deploy-fb8d7b69c-pcvc2 1/1 Running ``` -2. Define the necessary environment variables. -> If you are using bash shell, you can modify the necessary environment variables in `env.txt` and export all of them using the following commands -> ```shell -> source env.txt -> export $(cut -d= -f1 env.txt) -> ``` +2. Deploy FfDL on Kubernetes - * 2.a. For Kubeadm-DIND Cluster only - ```shell - export FFDL_PATH=$(pwd) - export SHARED_VOLUME_STORAGE_CLASS="" - export VM_TYPE=dind - export PUBLIC_IP=localhost - export NAMESPACE=default # If your namespace does not exist yet, please create the namespace `kubectl create namespace $NAMESPACE` before proceeding to the next step - ``` - * 2.b. For Cloud Kubernetes Cluster - > Note: If you are using IBM Cloud Cluster, you can obtain your k8s public ip using `bx cs workers `. + * 2.a. Installation using Kubernetes Cluster - ```shell - # Change the storage class to what's available on your Cloud Kubernetes Cluster. - export SHARED_VOLUME_STORAGE_CLASS="ibmc-file-gold" - export VM_TYPE=none - export PUBLIC_IP= - export NAMESPACE=default # If your namespace does not exist yet, please create the namespace `kubectl create namespace $NAMESPACE` before proceeding to the next step - ``` + To install FfDL to any proper Kubernetes cluster, make sure `kubectl` points to the right namespace, + then deploy the platform services: -3. Install the Object Storage driver using helm install. - * 3.a. For Kubeadm-DIND Cluster only - ```shell - ./bin/s3_driver.sh - helm install storage-plugin --set dind=true,cloud=false,namespace=$NAMESPACE - ``` + ``` shell + export NAMESPACE=default # If your namespace does not exist yet, please create the namespace `kubectl create namespace $NAMESPACE` before running the make commands below + export SHARED_VOLUME_STORAGE_CLASS="ibmc-file-gold" # Change the storage class to what's available on your Cloud Kubernetes Cluster. - * 3.b. 
For Cloud Kubernetes Cluster - ```shell - helm install storage-plugin --set namespace=$NAMESPACE + # Configure s3 driver on the cluster + helm install ibmcloud-object-storage-plugin --name ibmcloud-object-storage-plugin --repo https://ibm.github.io/FfDL/helm-charts --set namespace=$NAMESPACE + # Deploy all the helper micro-services for ffdl + helm install ffdl-helper --name ffdl-helper --repo https://ibm.github.io/FfDL/helm-charts \ + --set namespace=$NAMESPACE \ + --set shared_volume_storage_class=$SHARED_VOLUME_STORAGE_CLASS \ + --set localstorage=false \ # set to true if your cluster doesn't have any storage class + --set prometheus.deploy=false \ # set to true if you need prometheus logging for ffdl + --wait + # Deploy all the core ffdl services. + helm install ffdl-core --name ffdl-core --repo https://ibm.github.io/FfDL/helm-charts \ + --set namespace=$NAMESPACE \ + --set lcm.shared_volume_storage_class=$SHARED_VOLUME_STORAGE_CLASS \ + --wait ``` -4. Create a static volume to store any metadata from FfDL. - -```shell -pushd bin -./create_static_volumes.sh -./create_static_volumes_config.sh -# Wait while kubectl get pvc shows static-volume-1 in state Pending -popd -``` + * 2.b. Installation using Kubeadm-DIND -5. Now let's install all the necessary FfDL components using helm install. + If you don't have a Kubernetes Cluster, you can create a [Kubeadm-DIND](https://github.com/kubernetes-sigs/kubeadm-dind-cluster#using-preconfigured-scripts) Kubernetes Cluster on your local machine. We recommend you give at least 4 CPUs and 8GB of memory to your Docker. + > For Mac users, visit the instructions on the [Docker website](https://docs.docker.com/docker-for-mac/#advanced) and learn how to give more memory to your Docker. -``` shell -helm install . --set lcm.shared_volume_storage_class=$SHARED_VOLUME_STORAGE_CLASS,namespace=$NAMESPACE -``` -> Note: If you want to upgrade an older version of FfDL, run -> `helm upgrade $(helm list | grep ffdl | awk '{print $1}' | head -n 1) .` + If you have [Kubeadm-DIND](https://github.com/kubernetes-sigs/kubeadm-dind-cluster#using-preconfigured-scripts) installed on your machine, use these commands to deploy the FfDL platform: + ``` shell + export SHARED_VOLUME_STORAGE_CLASS="" + export NAMESPACE=default -Make sure all the FfDL components are installed and running before moving to the next step. 
-``` shell -kubectl config set-context $(kubectl config current-context) --namespace=$NAMESPACE -kubectl get pods -# NAME READY STATUS RESTARTS AGE -# alertmanager-7cf6b988b9-h9q6q 1/1 Running 0 5h -# etcd0 1/1 Running 0 5h -# ffdl-lcm-65bc97bcfd-qqkfc 1/1 Running 0 5h -# ffdl-restapi-8777444f6-7jfcf 1/1 Running 0 5h -# ffdl-trainer-768d7d6b9-4k8ql 1/1 Running 0 5h -# ffdl-trainingdata-866c8f48f5-ng27z 1/1 Running 0 5h -# ffdl-ui-5bf86cc7f5-zsqv5 1/1 Running 0 5h -# mongo-0 1/1 Running 0 5h -# prometheus-5f85fd7695-6dpt8 2/2 Running 0 5h -# pushgateway-7dd8f7c86d-gzr2g 2/2 Running 0 5h -# storage-0 1/1 Running 0 5h - -helm status $(helm list | grep ffdl | awk '{print $1}' | head -n 1) | grep STATUS: -# STATUS: DEPLOYED -``` + ./bin/s3_driver.sh # Copy the s3 drivers to each of the DIND node + helm install ibmcloud-object-storage-plugin --name ibmcloud-object-storage-plugin --repo https://ibm.github.io/FfDL/helm-charts --set namespace=$NAMESPACE,cloud=false + helm install ffdl-helper --name ffdl-helper --repo https://ibm.github.io/FfDL/helm-charts --set namespace=$NAMESPACE,shared_volume_storage_class=$SHARED_VOLUME_STORAGE_CLASS,localstorage=true,prometheus.deploy=false + helm install ffdl-core --name ffdl-core --repo https://ibm.github.io/FfDL/helm-charts --set namespace=$NAMESPACE,lcm.shared_volume_storage_class=$SHARED_VOLUME_STORAGE_CLASS -6. Obtain the necessary port for Grafana, FfDL Web UI, local object storage, and FfDL restapi. -```shell -grafana_port=$(kubectl get service grafana -o jsonpath='{.spec.ports[0].nodePort}') -ui_port=$(kubectl get service ffdl-ui -o jsonpath='{.spec.ports[0].nodePort}') -restapi_port=$(kubectl get service ffdl-restapi -o jsonpath='{.spec.ports[0].nodePort}') -s3_port=$(kubectl get service s3 -o jsonpath='{.spec.ports[0].nodePort}') -``` - -* For Kubeadm-DIND Cluster, we need to run the below script to forward the port to the localhost machine since we don't want to exec into the docker image and install various dependencies. - ```shell + # Forward the necessary microservices from the DIND cluster to your localhost. ./bin/dind-port-forward.sh ``` -7. Run the following commands to configure Grafana for monitoring FfDL using the logging information from prometheus. - ```shell - ./bin/grafana.init.sh - ``` +Congratulation, FfDL is now running on your Cluster. Now you can go to [Step 2](#2-detailed-testing-instructions) to run some sample jobs or go to the [user guide](docs/user-guide.md) to learn about how to run and deploy your custom models. -8. Lastly, run the following commands to obtain your Grafana, FfDL Web UI, and FfDL restapi endpoints. -``` shell -# Note: $(make --no-print-directory kubernetes-ip) simply gets the Public IP for your cluster. -node_ip=$PUBLIC_IP - -# Echo statements to print out Grafana and Web UI URLs. -echo "Monitoring dashboard: http://$node_ip:$grafana_port/ (login: admin/admin)" -echo "Web UI: http://$node_ip:$ui_port/#/login?endpoint=$node_ip:$restapi_port&username=test-user" -``` - -Congratulation, FfDL is now running on your Cluster. Now you can go to [Step 2](#2-detailed-testing-instructions) to run some sample jobs or go to the [user guide](user-guide.md) to learn about how to run and deploy your custom models. ## 2. Detailed Testing Instructions -In this example, we will run some simple jobs to train a convolutional network model using TensorFlow and Caffe. We will download a set of +In this example, we will run some simple jobs to train a convolutional network model using TensorFlow. 
We will download a set of
 MNIST handwritten digit images, store them with Object Storage, and use the FfDL CLI to train a handwritten digit classification model.

+> Note: For PUBLIC_IP, put down one of your Cluster Public IPs that can access your Cluster's NodePorts. You can check your Cluster Public IP with `kubectl get nodes -o wide`.
+> For IBM Cloud, you can get your Public IP with `bx cs workers `.
+
 ### 2.1. Using FfDL Local S3 Based Object Storage

-1. Run the following commands to obtain the object storage endpoint from your cluster.
+1. Clone this repository and run the following commands to obtain the object storage endpoint from your cluster.
 ```shell
-node_ip=$PUBLIC_IP
+PUBLIC_IP= # Put down localhost if you are running with Kubeadm-DIND
 s3_port=$(kubectl get service s3 -o jsonpath='{.spec.ports[0].nodePort}')
-s3_url=http://$node_ip:$s3_port
+s3_url=http://$PUBLIC_IP:$s3_port
 ```

 2. Next, set up the default object storage access ID and KEY. Then create buckets for all the necessary training data and models.
@@ -150,8 +88,6 @@ export AWS_ACCESS_KEY_ID=test; export AWS_SECRET_ACCESS_KEY=test; export AWS_DEF
 s3cmd="aws --endpoint-url=$s3_url s3"
 $s3cmd mb s3://tf_training_data
 $s3cmd mb s3://tf_trained_model
-$s3cmd mb s3://mnist_lmdb_data
-$s3cmd mb s3://dlaas-trained-models
 ```

 3. Now, create a temporary repository, download the necessary images for training and labeling our TensorFlow model, and upload those images
@@ -172,7 +108,7 @@ binary).
 ```shell
 restapi_port=$(kubectl get service ffdl-restapi -o jsonpath='{.spec.ports[0].nodePort}')
-export DLAAS_URL=http://$node_ip:$restapi_port; export DLAAS_USERNAME=test-user; export DLAAS_PASSWORD=test;
+export DLAAS_URL=http://$PUBLIC_IP:$restapi_port; export DLAAS_USERNAME=test-user; export DLAAS_PASSWORD=test;
 ```

 * With the recent changes in DIND, we need update the `node_ip` to its Host IP before proceeding to the below steps.
@@ -183,9 +119,9 @@ export DLAAS_URL=http://$node_ip:$restapi_port; export DLAAS_USERNAME=test-user;
 Replace the default object storage path with your s3_url. You can skip this step if your already modified the object storage path with your s3_url.
 ```shell
 if [ "$(uname)" = "Darwin" ]; then
-  sed -i '' s/s3.default.svc.cluster.local/$node_ip:$s3_port/ etc/examples/tf-model/manifest.yml
+  sed -i '' s/s3.default.svc.cluster.local/$PUBLIC_IP:$s3_port/ etc/examples/tf-model/manifest.yml
 else
-  sed -i s/s3.default.svc.cluster.local/$node_ip:$s3_port/ etc/examples/tf-model/manifest.yml
+  sed -i s/s3.default.svc.cluster.local/$PUBLIC_IP:$s3_port/ etc/examples/tf-model/manifest.yml
 fi
 ```

@@ -195,7 +131,7 @@ CLI_CMD=$(pwd)/cli/bin/ffdl-$(if [ "$(uname)" = "Darwin" ]; then echo 'osx'; els
 $CLI_CMD train etc/examples/tf-model/manifest.yml etc/examples/tf-model
 ```

-Congratulations, you had submitted your first job on FfDL. You can check your FfDL status either from the FfDL UI or simply run `$CLI_CMD list`
+Congratulations, you have submitted your first job on FfDL. You can check your FfDL status either from the FfDL UI or simply run `$CLI_CMD list`. To learn more about your job execution results, you can simply run `$CLI_CMD logs `

 > You can learn about how to create your own model definition files and `manifest.yaml` at [user guild](user-guide.md#2-create-new-models-with-ffdl).
@@ -211,30 +147,6 @@ Then, click `Submit Training Job` to run your job.

 ![ui-example](images/ui-example.png)

-6. 
(Optional) Since it's simple and straightforward to submit jobs with different deep learning framework on FfDL, let's try to run a Caffe Job. Download all the necessary training and testing images in [LMDB format](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database) for our Caffe model -and upload those images to your mnist_lmdb_data bucket. - -```shell -for phase in train test; -do - for file in data.mdb lock.mdb; - do - tmpfile=tmp/$phase.$file - test -e $tmpfile || wget -q -O $tmpfile https://github.com/albarji/caffe-demos/raw/master/mnist/mnist_"$phase"_lmdb/$file - $s3cmd cp $tmpfile s3://mnist_lmdb_data/$phase/$file - done -done -``` - -7. Now train your Caffe Job. - -```shell -$CLI_CMD train etc/examples/caffe-model/manifest.yml etc/examples/caffe-model -``` - -Congratulations, now you know how to deploy jobs with different deep learning framework. To learn more about your job execution results, -you can simply run `$CLI_CMD logs ` - > If you no longer need any of the MNIST dataset we used in this example, you can simply delete the `tmp` repository. ### 2.2. Using Cloud Object Storage diff --git a/docs/developer-guide.md b/docs/developer-guide.md index 5cffe4de..b7d0bee6 100644 --- a/docs/developer-guide.md +++ b/docs/developer-guide.md @@ -32,6 +32,7 @@ export DOCKER_NAMESPACE= # Container Registry Namespace export DOCKER_PULL_POLICY=Always # Keep IfNotPresent if not pushing to registry, e.g. for Minikube export VM_TYPE=none export HAS_STATIC_VOLUMES=True +export IMAGE_TAG=user-$(whoami) export NAMESPACE=default # If your namespace does not exist yet, please create the namespace `kubectl create namespace $NAMESPACE` before proceeding to the next step ``` @@ -52,9 +53,25 @@ Make sure `kubectl` points to the right target context/namespace, then deploy th environment (using `helm`): ```shell kubectl config set-context $(kubectl config current-context) --namespace=$NAMESPACE # Set your current-context to the FfDL namespace -make create-volumes # Create static volumes for sharing across pods -make deploy-plugin # Deploy S3 storage plugin -make deploy # Deploy FfDL + +# Configure s3 driver on the cluster +helm install docs/helm-charts/ibmcloud-object-storage-plugin --name ibmcloud-object-storage-plugin --set namespace=$NAMESPACE +# Deploy all the helper micro-services for ffdl +helm install docs/helm-charts/ffdl-helper --name ffdl-helper \ +--set namespace=$NAMESPACE \ +--set shared_volume_storage_class=$SHARED_VOLUME_STORAGE_CLASS \ +--set localstorage=false \ # set to true if your cluster doesn't have any storage class +--set prometheus.deploy=true \ # set to false if you don't need prometheus logging for ffdl +--wait +# Deploy all the core ffdl services. +helm install docs/helm-charts/ffdl-core --name ffdl-core \ +--set namespace=$NAMESPACE \ +--set lcm.shared_volume_storage_class=$SHARED_VOLUME_STORAGE_CLASS \ +--set docker.registry=$DOCKER_REPO \ +--set docker.namespace=$DOCKER_NAMESPACE \ +--set docker.pullPolicy=$DOCKER_PULL_POLICY \ +--set trainer.version=${IMAGE_TAG},restapi.version=${IMAGE_TAG},lcm.version=${IMAGE_TAG},trainingdata.version=${IMAGE_TAG},databroker.tag=${IMAGE_TAG},databroker.version=${IMAGE_TAG},webui.version=${IMAGE_TAG} \ +--wait ``` ## Troubleshooting @@ -86,10 +103,4 @@ Please refer to the [gpu-guide.md](gpu-guide.md) for more details. 
## Enable custom learner images with development build -Please add the following section under [trainer/trainer/frameworks.go](../trainer/trainer/frameworks.go#L42) and rebuild the trainer image to enable custom learner images from any users. - -``` go -if fwName == "custom" { - return true, "" -} -``` +Custom learner is enabled by default. To use it, simply put `custom` as your framework name and set your custom learner image path as the framework version. diff --git a/docs/helm-charts/ffdl-core-0.1.1.tgz b/docs/helm-charts/ffdl-core-0.1.1.tgz new file mode 100644 index 00000000..1782fb63 Binary files /dev/null and b/docs/helm-charts/ffdl-core-0.1.1.tgz differ diff --git a/docs/helm-charts/ffdl-core/Chart.yaml b/docs/helm-charts/ffdl-core/Chart.yaml new file mode 100644 index 00000000..42033f85 --- /dev/null +++ b/docs/helm-charts/ffdl-core/Chart.yaml @@ -0,0 +1,4 @@ +name: ffdl-core +description: Core service for Fabric for Deep Learning (FfDL) +version: 0.1.1 +appVersion: 3.3 diff --git a/templates/services/lcm-deployment.yml b/docs/helm-charts/ffdl-core/templates/services/lcm-deployment.yml similarity index 95% rename from templates/services/lcm-deployment.yml rename to docs/helm-charts/ffdl-core/templates/services/lcm-deployment.yml index a42f25e9..6642cdb6 100644 --- a/templates/services/lcm-deployment.yml +++ b/docs/helm-charts/ffdl-core/templates/services/lcm-deployment.yml @@ -29,9 +29,6 @@ spec: configMap: name: learner-config {{ if (eq .Values.has_static_volumes true) }} - - name: static-volumes-config-volume - configMap: - name: static-volumes - name: static-volumes-config-volume-v2 configMap: name: static-volumes-v2 @@ -43,10 +40,8 @@ spec: key: DLAAS_ETCD_CERT, path: etcd/etcd.cert }] -{{ if .Values.k8s_1dot8_or_above }} serviceAccount: {{.Values.docker.image_prefix}}lcm serviceAccountName: {{.Values.docker.image_prefix}}lcm -{{ end }} containers: - name: ffdl-lcm-container image: {{.Values.docker.registry}}/{{.Values.docker.namespace}}/{{.Values.docker.image_prefix}}lcm:{{.Values.lcm.version}} @@ -63,8 +58,6 @@ spec: - mountPath: /etc/learner-config name: learner-config-volume {{ if (eq .Values.has_static_volumes true) }} - - mountPath: /etc/static-volumes - name: static-volumes-config-volume - mountPath: /etc/static-volumes-v2 name: static-volumes-config-volume-v2 {{ end }} diff --git a/templates/services/lcm-rbac.yml b/docs/helm-charts/ffdl-core/templates/services/lcm-rbac.yml similarity index 90% rename from templates/services/lcm-rbac.yml rename to docs/helm-charts/ffdl-core/templates/services/lcm-rbac.yml index db3da0ae..e4fb6577 100644 --- a/templates/services/lcm-rbac.yml +++ b/docs/helm-charts/ffdl-core/templates/services/lcm-rbac.yml @@ -1,4 +1,3 @@ -{{ if .Values.k8s_1dot8_or_above }} apiVersion: v1 kind: ServiceAccount metadata: @@ -17,4 +16,3 @@ subjects: - kind: ServiceAccount name: {{.Values.docker.image_prefix}}lcm namespace: {{.Values.namespace}} -{{ end }} diff --git a/templates/services/lcm-secrets.yml b/docs/helm-charts/ffdl-core/templates/services/lcm-secrets.yml similarity index 100% rename from templates/services/lcm-secrets.yml rename to docs/helm-charts/ffdl-core/templates/services/lcm-secrets.yml diff --git a/templates/services/lcm-service.yml b/docs/helm-charts/ffdl-core/templates/services/lcm-service.yml similarity index 100% rename from templates/services/lcm-service.yml rename to docs/helm-charts/ffdl-core/templates/services/lcm-service.yml diff --git a/templates/services/learner-configmap.yml 
b/docs/helm-charts/ffdl-core/templates/services/learner-configmap.yml similarity index 100% rename from templates/services/learner-configmap.yml rename to docs/helm-charts/ffdl-core/templates/services/learner-configmap.yml diff --git a/templates/services/learner-rsa-keys.yml b/docs/helm-charts/ffdl-core/templates/services/learner-rsa-keys.yml similarity index 100% rename from templates/services/learner-rsa-keys.yml rename to docs/helm-charts/ffdl-core/templates/services/learner-rsa-keys.yml diff --git a/templates/services/restapi-deployment.yml b/docs/helm-charts/ffdl-core/templates/services/restapi-deployment.yml similarity index 100% rename from templates/services/restapi-deployment.yml rename to docs/helm-charts/ffdl-core/templates/services/restapi-deployment.yml diff --git a/templates/services/restapi-service.yml b/docs/helm-charts/ffdl-core/templates/services/restapi-service.yml similarity index 100% rename from templates/services/restapi-service.yml rename to docs/helm-charts/ffdl-core/templates/services/restapi-service.yml diff --git a/templates/services/trainer-deployment.yml b/docs/helm-charts/ffdl-core/templates/services/trainer-deployment.yml similarity index 100% rename from templates/services/trainer-deployment.yml rename to docs/helm-charts/ffdl-core/templates/services/trainer-deployment.yml diff --git a/templates/services/trainer-secrets.yml b/docs/helm-charts/ffdl-core/templates/services/trainer-secrets.yml similarity index 100% rename from templates/services/trainer-secrets.yml rename to docs/helm-charts/ffdl-core/templates/services/trainer-secrets.yml diff --git a/templates/services/trainer-service.yml b/docs/helm-charts/ffdl-core/templates/services/trainer-service.yml similarity index 100% rename from templates/services/trainer-service.yml rename to docs/helm-charts/ffdl-core/templates/services/trainer-service.yml diff --git a/templates/services/trainingdata-deployment.yml b/docs/helm-charts/ffdl-core/templates/services/trainingdata-deployment.yml similarity index 100% rename from templates/services/trainingdata-deployment.yml rename to docs/helm-charts/ffdl-core/templates/services/trainingdata-deployment.yml diff --git a/templates/services/trainingdata-secrets.yml b/docs/helm-charts/ffdl-core/templates/services/trainingdata-secrets.yml similarity index 100% rename from templates/services/trainingdata-secrets.yml rename to docs/helm-charts/ffdl-core/templates/services/trainingdata-secrets.yml diff --git a/templates/services/trainingdata-service.yml b/docs/helm-charts/ffdl-core/templates/services/trainingdata-service.yml similarity index 100% rename from templates/services/trainingdata-service.yml rename to docs/helm-charts/ffdl-core/templates/services/trainingdata-service.yml diff --git a/templates/services/web-ui.yml b/docs/helm-charts/ffdl-core/templates/services/web-ui.yml similarity index 100% rename from templates/services/web-ui.yml rename to docs/helm-charts/ffdl-core/templates/services/web-ui.yml diff --git a/values.yaml b/docs/helm-charts/ffdl-core/values.yaml similarity index 95% rename from values.yaml rename to docs/helm-charts/ffdl-core/values.yaml index c704f577..40929dfa 100644 --- a/values.yaml +++ b/docs/helm-charts/ffdl-core/values.yaml @@ -1,6 +1,5 @@ namespace: default env: dev -k8s_1dot8_or_above: true has_static_volumes: true services: expose_node_port: true @@ -50,15 +49,6 @@ webui: port: 0 cpus: 50m memory: 64Mi -mongo: - address: localhost:27017 - username: test - password: test -etcd: - address: http://etcd:2379 - username: test - password: test 
- prefix: test objectstore: type: s3_datastore username: test @@ -68,14 +58,15 @@ elasticsearch: username: test password: test scheme: http -prometheus: - deploy: true - etcd_urls: localhost:2379 - cluster_name: FfDL - port: 9090 - alertmanager_port: 9093 -grafana: - port: 3000 +mongo: + address: localhost:27017 + username: test + password: test +etcd: + address: http://etcd:2379 + username: test + password: test + prefix: test log: level: INFO # Default RSA Creds for Horovod diff --git a/docs/helm-charts/ffdl-helper-0.1.1.tgz b/docs/helm-charts/ffdl-helper-0.1.1.tgz new file mode 100644 index 00000000..5fc25360 Binary files /dev/null and b/docs/helm-charts/ffdl-helper-0.1.1.tgz differ diff --git a/docs/helm-charts/ffdl-helper/Chart.yaml b/docs/helm-charts/ffdl-helper/Chart.yaml new file mode 100644 index 00000000..0306aa48 --- /dev/null +++ b/docs/helm-charts/ffdl-helper/Chart.yaml @@ -0,0 +1,4 @@ +name: ffdl-helper +description: Helper Service for Fabric for Deep Learning (FfDL) +version: 0.1.1 +appVersion: 3.3 diff --git a/docs/helm-charts/ffdl-helper/templates/infrastructure/etcd.yml b/docs/helm-charts/ffdl-helper/templates/infrastructure/etcd.yml new file mode 100644 index 00000000..17f676c8 --- /dev/null +++ b/docs/helm-charts/ffdl-helper/templates/infrastructure/etcd.yml @@ -0,0 +1,82 @@ +apiVersion: v1 +kind: Service +metadata: + name: etcd + namespace: {{.Values.namespace}} +spec: + ports: + - port: 2379 + protocol: TCP + targetPort: 2379 + selector: + app: etcd + +--- +apiVersion: apps/v1beta1 +kind: StatefulSet +metadata: + labels: + app: etcd + etcd_node: etcd0 + name: etcd0 + namespace: {{.Values.namespace}} +spec: + serviceName: etcd + replicas: 1 + template: + metadata: + labels: + app: etcd + etcd_node: etcd0 + name: etcd0 + spec: + containers: + - command: + - /usr/local/bin/etcd + - --name + - etcd0 + - --initial-advertise-peer-urls + - http://etcd0:2380 + - --listen-peer-urls + - http://0.0.0.0:2380 + - --listen-client-urls + - http://0.0.0.0:2379 + - --advertise-client-urls + - http://etcd0:2379 + - --initial-cluster + - etcd0=http://etcd0:2380 + - --initial-cluster-state + - new + image: quay.io/coreos/etcd:latest + imagePullPolicy: IfNotPresent + name: etcd0 + ports: + - containerPort: 2379 + name: client + protocol: TCP + - containerPort: 2380 + name: server + protocol: TCP + restartPolicy: Always + +--- + +apiVersion: v1 +kind: Service +metadata: + labels: + etcd_node: etcd0 + name: etcd0 + namespace: {{.Values.namespace}} +spec: + ports: + - name: client + port: 2379 + protocol: TCP + targetPort: 2379 + - name: server + port: 2380 + protocol: TCP + targetPort: 2380 + selector: + etcd_node: etcd0 diff --git a/templates/infrastructure/mongo.yml b/docs/helm-charts/ffdl-helper/templates/infrastructure/mongo.yml similarity index 100% rename from templates/infrastructure/mongo.yml rename to docs/helm-charts/ffdl-helper/templates/infrastructure/mongo.yml diff --git a/templates/infrastructure/storage.yml b/docs/helm-charts/ffdl-helper/templates/infrastructure/storage.yml similarity index 100% rename from templates/infrastructure/storage.yml rename to docs/helm-charts/ffdl-helper/templates/infrastructure/storage.yml diff --git a/templates/monitoring/alertmanager-configmap.yml b/docs/helm-charts/ffdl-helper/templates/monitoring/alertmanager-configmap.yml similarity index 100% rename from templates/monitoring/alertmanager-configmap.yml rename to docs/helm-charts/ffdl-helper/templates/monitoring/alertmanager-configmap.yml diff --git 
a/templates/monitoring/alertmanager-deployment.yml b/docs/helm-charts/ffdl-helper/templates/monitoring/alertmanager-deployment.yml similarity index 88% rename from templates/monitoring/alertmanager-deployment.yml rename to docs/helm-charts/ffdl-helper/templates/monitoring/alertmanager-deployment.yml index 9b9fc2ad..0e82fe09 100644 --- a/templates/monitoring/alertmanager-deployment.yml +++ b/docs/helm-charts/ffdl-helper/templates/monitoring/alertmanager-deployment.yml @@ -18,9 +18,9 @@ spec: spec: containers: - name: alertmanager - image: quay.io/prometheus/alertmanager:v0.8.0 + image: quay.io/prometheus/alertmanager:v0.15.2 args: - - -config.file=/etc/prometheus/alertmanager.yml + - --config.file=/etc/prometheus/alertmanager.yml ports: - name: alertmanager containerPort: {{.Values.prometheus.alertmanager_port}} diff --git a/templates/monitoring/alertmanager-service.yml b/docs/helm-charts/ffdl-helper/templates/monitoring/alertmanager-service.yml similarity index 100% rename from templates/monitoring/alertmanager-service.yml rename to docs/helm-charts/ffdl-helper/templates/monitoring/alertmanager-service.yml diff --git a/docs/helm-charts/ffdl-helper/templates/monitoring/alertrules-configmap.yml b/docs/helm-charts/ffdl-helper/templates/monitoring/alertrules-configmap.yml new file mode 100644 index 00000000..219f2d51 --- /dev/null +++ b/docs/helm-charts/ffdl-helper/templates/monitoring/alertrules-configmap.yml @@ -0,0 +1,85 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-alertrules + namespace: {{.Values.namespace}} +data: + alert.rules.yml: |- + + groups: + - name: alert.rules + rules: + - alert: ControllerJobFailures + expr: increase(controller_training_failures[2m]) >= 1 + labels: + criticality: S2 + annotations: + description: Controller seems to have a rate > 1 of failed learnings over last + 2 mins + summary: Controller indicates failing learnings + - alert: ControllerETCDFailures + expr: increase(controller_etcd_failures{attempt="4"}[2m]) >= 1 + labels: + criticality: S2 + annotations: + description: Controller seems to have a rate > 1 in connecting to etcd over + last 2 mins after 5 attempts + summary: Controller indicates failure to connect to etcd + - alert: SwiftObjectStoreFailures + expr: increase(databroker_upload_failures{attempt="4",store="swift"}[2m]) > 1 + labels: + criticality: S2 + annotations: + description: Databroker has had more than 1 failures after 5 attempts while + uploading/downloading data + summary: Swift upload/download failures + - alert: S3ObjectStoreFailures + expr: increase(databroker_upload_failures{attempt="4",store="s3"}[2m]) > 1 + labels: + criticality: S2 + annotations: + description: Databroker has had more than 1 failures after 5 attempts while + uploading/downloading data + summary: S3 upload/download failures + - alert: LCMRestarts + expr: increase(lcm_restart_total[2m]) > 1 + labels: + criticality: S2 + annotations: + description: LCM has restarted more than 1 in last 2mins because of failures + summary: LCM restarts because of failures + - alert: LCMTrainingsFailure + expr: sum by(reason, instance, job) (increase(lcm_trainings_launch_failed[2m])) + > 1 + labels: + criticality: S2 + annotations: + description: LCM failed to launch more than 1 trainings for last 2 mins because + of {{`{{$labels.reason}}`}} + summary: LCM failed to launch trainging because of {{`{{$labels.reason}}`}} + - alert: JobMonitorConnectivityFailure + expr: sum by(reason, instance, job) (increase(jobmonitor_connectivity_failures[2m])) + > 1 + labels: + 
criticality: S2 + annotations: + description: JobMonitor failed to launch more than 1 trainings for last 2 mins + because of {{`{{$labels.reason}}`}} + summary: JobMonitor failed to launch training because of {{`{{$labels.reason}}`}} + - alert: JobMonitorK8sAPIFailure + expr: sum by(reason, instance, job) (increase(jobmonitor_k8s_failures[2m])) > + 1 + labels: + criticality: S2 + annotations: + description: JobMonitor failed to launch more than 1 trainings for last 2 mins + because of {{`{{$labels.reason}}`}} + summary: JobMonitor failed to launch training because of {{`{{$labels.reason}}`}} + - alert: TrainerRatelimitingInvoked + expr: increase(trainer_ratelimitinvocations_total[2m]) > 5 + labels: + criticality: S2 + annotations: + description: Trainer is running into rate limiting issues + summary: Invoked rate limiting on trainer for more than 5 times in a 2 minute + window diff --git a/templates/monitoring/prometheus-configmap.yml b/docs/helm-charts/ffdl-helper/templates/monitoring/prometheus-configmap.yml similarity index 79% rename from templates/monitoring/prometheus-configmap.yml rename to docs/helm-charts/ffdl-helper/templates/monitoring/prometheus-configmap.yml index 08164ae7..8d8150ec 100644 --- a/templates/monitoring/prometheus-configmap.yml +++ b/docs/helm-charts/ffdl-helper/templates/monitoring/prometheus-configmap.yml @@ -8,12 +8,16 @@ data: global: scrape_interval: 15s rule_files: - - '/etc/prometheus-rules/alert.rules' + - '/etc/prometheus-rules/alert.rules.yml' + alerting: + alertmanagers: + - static_configs: + - targets: ['alertmanager:{{.Values.prometheus.alertmanager_port}}'] scrape_configs: - # etcd is living outside of our cluster and we configure - # it directly. + # etcd is living outside of our cluster and we configure + # it directly. 
- job_name: 'etcd_compose' - scheme: https + scheme: http tls_config: insecure_skip_verify: true static_configs: diff --git a/templates/monitoring/prometheus-deployment.yml b/docs/helm-charts/ffdl-helper/templates/monitoring/prometheus-deployment.yml similarity index 85% rename from templates/monitoring/prometheus-deployment.yml rename to docs/helm-charts/ffdl-helper/templates/monitoring/prometheus-deployment.yml index 6ad68d2f..50c6a3d0 100644 --- a/templates/monitoring/prometheus-deployment.yml +++ b/docs/helm-charts/ffdl-helper/templates/monitoring/prometheus-deployment.yml @@ -18,12 +18,9 @@ spec: spec: containers: - name: prometheus - image: quay.io/prometheus/prometheus:v1.7.1 + image: quay.io/prometheus/prometheus:v2.3.2 args: - - '-storage.local.retention=72h' - - '-storage.local.memory-chunks=500000' - - '-config.file=/etc/prometheus/prometheus.yml' - - '-alertmanager.url=http://alertmanager:{{.Values.prometheus.alertmanager_port}}' + - '--config.file=/etc/prometheus/prometheus.yml' ports: - name: web containerPort: {{.Values.prometheus.port}} diff --git a/templates/monitoring/prometheus-service.yml b/docs/helm-charts/ffdl-helper/templates/monitoring/prometheus-service.yml similarity index 100% rename from templates/monitoring/prometheus-service.yml rename to docs/helm-charts/ffdl-helper/templates/monitoring/prometheus-service.yml diff --git a/templates/monitoring/pushgateway-configmap.yml b/docs/helm-charts/ffdl-helper/templates/monitoring/pushgateway-configmap.yml similarity index 100% rename from templates/monitoring/pushgateway-configmap.yml rename to docs/helm-charts/ffdl-helper/templates/monitoring/pushgateway-configmap.yml diff --git a/templates/monitoring/pushgateway-deployment.yml b/docs/helm-charts/ffdl-helper/templates/monitoring/pushgateway-deployment.yml similarity index 100% rename from templates/monitoring/pushgateway-deployment.yml rename to docs/helm-charts/ffdl-helper/templates/monitoring/pushgateway-deployment.yml diff --git a/templates/monitoring/pushgateway-service.yml b/docs/helm-charts/ffdl-helper/templates/monitoring/pushgateway-service.yml similarity index 100% rename from templates/monitoring/pushgateway-service.yml rename to docs/helm-charts/ffdl-helper/templates/monitoring/pushgateway-service.yml diff --git a/docs/helm-charts/ffdl-helper/templates/storage-config/config.yaml b/docs/helm-charts/ffdl-helper/templates/storage-config/config.yaml new file mode 100644 index 00000000..cfb64a84 --- /dev/null +++ b/docs/helm-charts/ffdl-helper/templates/storage-config/config.yaml @@ -0,0 +1,28 @@ +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: {{.Values.static_volume_name}} + namespace: {{.Values.namespace}} + annotations: + volume.beta.kubernetes.io/storage-class: {{.Values.shared_volume_storage_class}} + labels: + type: dlaas-static-volume +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 20Gi + +--- +kind: ConfigMap +apiVersion: v1 +data: + PVCs-v2.yaml: | + static-volumes-v2: + - name: {{.Values.static_volume_name}} + zlabel: {{.Values.static_volume_name}} + status: active +metadata: + name: static-volumes-v2 + namespace: {{.Values.namespace}} diff --git a/storage-plugin/templates/volume.yaml b/docs/helm-charts/ffdl-helper/templates/storage-config/volume.yaml similarity index 90% rename from storage-plugin/templates/volume.yaml rename to docs/helm-charts/ffdl-helper/templates/storage-config/volume.yaml index 5bc80fed..88345607 100644 --- a/storage-plugin/templates/volume.yaml +++ 
b/docs/helm-charts/ffdl-helper/templates/storage-config/volume.yaml @@ -1,4 +1,4 @@ -{{ if .Values.dind }} +{{ if .Values.localstorage }} apiVersion: v1 kind: PersistentVolume metadata: diff --git a/docs/helm-charts/ffdl-helper/values.yaml b/docs/helm-charts/ffdl-helper/values.yaml new file mode 100644 index 00000000..c55f8d7c --- /dev/null +++ b/docs/helm-charts/ffdl-helper/values.yaml @@ -0,0 +1,17 @@ +namespace: default +env: dev +static_volume_name: static-volume-1 +localstorage: false +shared_volume_storage_class: "" +services: + expose_node_port: true +docker: + pullPolicy: IfNotPresent +prometheus: + deploy: true + etcd_urls: etcd:2379 + cluster_name: FfDL + port: 9090 + alertmanager_port: 9093 +grafana: + port: 3000 diff --git a/docs/helm-charts/ibmcloud-object-storage-plugin-0.1.tgz b/docs/helm-charts/ibmcloud-object-storage-plugin-0.1.tgz new file mode 100644 index 00000000..4565d24e Binary files /dev/null and b/docs/helm-charts/ibmcloud-object-storage-plugin-0.1.tgz differ diff --git a/storage-plugin/Chart.yaml b/docs/helm-charts/ibmcloud-object-storage-plugin/Chart.yaml similarity index 100% rename from storage-plugin/Chart.yaml rename to docs/helm-charts/ibmcloud-object-storage-plugin/Chart.yaml diff --git a/storage-plugin/templates/deployer.yaml b/docs/helm-charts/ibmcloud-object-storage-plugin/templates/deployer.yaml similarity index 100% rename from storage-plugin/templates/deployer.yaml rename to docs/helm-charts/ibmcloud-object-storage-plugin/templates/deployer.yaml diff --git a/storage-plugin/templates/ibmc-s3fs-standard-StorageClass.yaml b/docs/helm-charts/ibmcloud-object-storage-plugin/templates/ibmc-s3fs-standard-StorageClass.yaml similarity index 100% rename from storage-plugin/templates/ibmc-s3fs-standard-StorageClass.yaml rename to docs/helm-charts/ibmcloud-object-storage-plugin/templates/ibmc-s3fs-standard-StorageClass.yaml diff --git a/storage-plugin/templates/provisioner-sa.yaml b/docs/helm-charts/ibmcloud-object-storage-plugin/templates/provisioner-sa.yaml similarity index 100% rename from storage-plugin/templates/provisioner-sa.yaml rename to docs/helm-charts/ibmcloud-object-storage-plugin/templates/provisioner-sa.yaml diff --git a/storage-plugin/templates/provisioner.yaml b/docs/helm-charts/ibmcloud-object-storage-plugin/templates/provisioner.yaml similarity index 100% rename from storage-plugin/templates/provisioner.yaml rename to docs/helm-charts/ibmcloud-object-storage-plugin/templates/provisioner.yaml diff --git a/storage-plugin/values.yaml b/docs/helm-charts/ibmcloud-object-storage-plugin/values.yaml similarity index 94% rename from storage-plugin/values.yaml rename to docs/helm-charts/ibmcloud-object-storage-plugin/values.yaml index 2f90a7b7..f4bd060e 100644 --- a/storage-plugin/values.yaml +++ b/docs/helm-charts/ibmcloud-object-storage-plugin/values.yaml @@ -5,6 +5,5 @@ image: Build: v0.1 pluginBuild: v0.1 pullPolicy: IfNotPresent -dind: false cloud: true namespace: default diff --git a/docs/helm-charts/index.yaml b/docs/helm-charts/index.yaml new file mode 100644 index 00000000..4e28b6b0 --- /dev/null +++ b/docs/helm-charts/index.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +entries: + ffdl-core: + - appVersion: "3.3" + created: 2018-10-05T11:34:53.083833038-07:00 + description: Core service for Fabric for Deep Learning (FfDL) + digest: 434566c653c8b6bace13546546553e4c9e9d6ece770f065c3cacfb3826680465 + name: ffdl-core + urls: + - https://ibm.github.io/FfDL/helm-charts/ffdl-core-0.1.1.tgz + version: 0.1.1 + ffdl-helper: + - appVersion: "3.3" + created: 
2018-10-05T11:34:53.084302812-07:00 + description: Helper Service for Fabric for Deep Learning (FfDL) + digest: 746a8c90a18733bfda431effa480a4023f27062e9cf6b28064fd1365501e9952 + name: ffdl-helper + urls: + - https://ibm.github.io/FfDL/helm-charts/ffdl-helper-0.1.1.tgz + version: 0.1.1 + ibmcloud-object-storage-plugin: + - apiVersion: v1 + created: 2018-10-05T11:34:53.084553812-07:00 + description: A Helm chart for ibmcloud-object-storage plugin + digest: cf4e71e156937e9840eda00ffc7dc1120599bcf4ea9efaaa5e48631c0b83ae51 + name: ibmcloud-object-storage-plugin + urls: + - https://ibm.github.io/FfDL/helm-charts/ibmcloud-object-storage-plugin-0.1.tgz + version: "0.1" +generated: 2018-10-05T11:34:53.08294046-07:00 diff --git a/templates/infrastructure/etcd.yml b/templates/infrastructure/etcd.yml deleted file mode 100644 index 402f7d6b..00000000 --- a/templates/infrastructure/etcd.yml +++ /dev/null @@ -1,74 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: etcd - namespace: {{.Values.namespace}} -spec: - ports: - - port: 2379 - protocol: TCP - targetPort: 2379 - selector: - app: etcd - ---- - -apiVersion: v1 -kind: Pod -metadata: - labels: - app: etcd - etcd_node: etcd0 - name: etcd0 - namespace: {{.Values.namespace}} -spec: - containers: - - command: - - /usr/local/bin/etcd - - --name - - etcd0 - - --initial-advertise-peer-urls - - http://etcd0:2380 - - --listen-peer-urls - - http://0.0.0.0:2380 - - --listen-client-urls - - http://0.0.0.0:2379 - - --advertise-client-urls - - http://etcd0:2379 - - --initial-cluster - - etcd0=http://etcd0:2380 - - --initial-cluster-state - - new - image: quay.io/coreos/etcd:latest - imagePullPolicy: IfNotPresent - name: etcd0 - ports: - - containerPort: 2379 - name: client - protocol: TCP - - containerPort: 2380 - name: server - protocol: TCP - restartPolicy: Always - ---- - -apiVersion: v1 -kind: Service -metadata: - labels: - etcd_node: etcd0 - name: etcd0 - namespace: {{.Values.namespace}} -spec: - ports: - - name: client - port: 2379 - protocol: TCP - targetPort: 2379 - - name: server - port: 2380 - protocol: TCP - targetPort: 2380 - selector: - etcd_node: etcd0 diff --git a/templates/monitoring/alertrules-configmap.yml b/templates/monitoring/alertrules-configmap.yml deleted file mode 100644 index 52cf87f3..00000000 --- a/templates/monitoring/alertrules-configmap.yml +++ /dev/null @@ -1,80 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus-alertrules - namespace: {{.Values.namespace}} -data: - alert.rules: |- - - ALERT ControllerJobFailures - IF increase(controller_training_failures[2m]) >= 1 - LABELS { criticality = "S2" } - ANNOTATIONS { - summary = "Controller indicates failing learnings", - description = "Controller seems to have a rate > 1 of failed learnings over last 2 mins", - } - - ALERT ControllerETCDFailures - IF increase(controller_etcd_failures{attempt="4"}[2m]) >= 1 - LABELS { criticality = "S2" } - ANNOTATIONS { - summary = "Controller indicates failure to connect to etcd", - description = "Controller seems to have a rate > 1 in connecting to etcd over last 2 mins after 5 attempts", - } - - ALERT SwiftObjectStoreFailures - IF increase(databroker_upload_failures{store="swift", attempt="4"}[2m]) > 1 - LABELS { criticality = "S2" } - ANNOTATIONS { - summary = "Swift upload/download failures", - description = "Databroker has had more than 1 failures after 5 attempts while uploading/downloading data", - } - - ALERT S3ObjectStoreFailures - IF increase(databroker_upload_failures{store="s3", attempt="4"}[2m]) > 1 - LABELS { 
criticality = "S2" } - ANNOTATIONS { - summary = "S3 upload/download failures", - description = "Databroker has had more than 1 failures after 5 attempts while uploading/downloading data", - } - - ALERT LCMRestarts - IF increase(lcm_restart_total[2m]) > 1 - LABELS { criticality = "S2" } - ANNOTATIONS { - summary = "LCM restarts because of failures", - description = "LCM has restarted more than 1 in last 2mins because of failures", - } - - - ALERT LCMTrainingsFailure - IF sum(increase(lcm_trainings_launch_failed[2m])) by (reason, instance, job) > 1 - LABELS { criticality = "S2" } - ANNOTATIONS { - summary = "LCM failed to launch trainging because of {{`{{$labels.reason}}`}}", - description = "LCM failed to launch more than 1 trainings for last 2 mins because of {{`{{$labels.reason}}`}}", - } - - ALERT JobMonitorConnectivityFailure - IF sum(increase(jobmonitor_connectivity_failures[2m])) by (reason, instance, job) > 1 - LABELS { criticality = "S2" } - ANNOTATIONS { - summary = "JobMonitor failed to launch training because of {{`{{$labels.reason}}`}}", - description = "JobMonitor failed to launch more than 1 trainings for last 2 mins because of {{`{{$labels.reason}}`}}", - } - - ALERT JobMonitorK8sAPIFailure - IF sum(increase(jobmonitor_k8s_failures[2m])) by (reason, instance, job) > 1 - LABELS { criticality = "S2" } - ANNOTATIONS { - summary = "JobMonitor failed to launch training because of {{`{{$labels.reason}}`}}", - description = "JobMonitor failed to launch more than 1 trainings for last 2 mins because of {{`{{$labels.reason}}`}}", - } - - ALERT TrainerRatelimitingInvoked - IF increase(trainer_ratelimitinvocations_total[2m]) > 5 - LABELS { criticality = "S2" } - ANNOTATIONS { - summary = "Invoked rate limiting on trainer for more than 5 times in a 2 minute window", - description = "Trainer is running into rate limiting issues", - }
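The final hunks above move Alertmanager and Prometheus to their 2.x images and rewrite the alert rules from the old `ALERT ... IF ...` syntax into the Prometheus 2.x YAML `groups` format. A minimal sketch of how the migrated rules and the repackaged charts could be sanity-checked locally, assuming the rules block is extracted from the `prometheus-alertrules` ConfigMap into a standalone `alert.rules.yml` and that `helm` and a Prometheus 2.x `promtool` are installed:

```shell
# Validate the migrated Prometheus 2.x rule syntax (assumes the rules block was
# saved from the prometheus-alertrules ConfigMap to alert.rules.yml).
promtool check rules alert.rules.yml

# Lint the unpacked charts before repackaging them under docs/helm-charts.
helm lint docs/helm-charts/ffdl-core
helm lint docs/helm-charts/ffdl-helper
```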