diff --git a/examples/mpi/DOCKER.md b/examples/mpi/DOCKER.md index c155200a3e..96c0d9d54f 100644 --- a/examples/mpi/DOCKER.md +++ b/examples/mpi/DOCKER.md @@ -34,13 +34,13 @@ We need to build a Open MPI base image with GPU support to run Open MPI workload ## Advanced environment -You can build runtime TensorFlow or CNTK Docker images based on the MPI base image, -for example, we prepared [TensorFlow mpi Dockerfile](./Dockerfile.example.tensorflow-mpi) and [CNTK mpi Dockerfile](./Dockerfile.example.cntk-mpi) which can be refered to. +You can build runtime CNTK Docker images based on the MPI base image, +for example, we prepared [CNTK mpi Dockerfile](./Dockerfile.example.cntk-mpi) which can be referred to. -Push the Docker image to a Docker registry, we use TensorFlow mpi Docker image as an example: +Push the Docker image to a Docker registry, we use CNTK mpi Docker image as an example: ```bash -$ sudo docker tag pai.example.tensorflow-mpi USER/pai.example.tensorflow-mpi -$ sudo docker push USER/pai.example.tensorflow-mpi +$ sudo docker tag pai.example.cntk USER/pai.example.cntk +$ sudo docker push USER/pai.example.cntk ``` *Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.* diff --git a/examples/mpi/Dockerfile.example.tensorflow-mpi b/examples/mpi/Dockerfile.example.tensorflow-mpi deleted file mode 100644 index 817903c22e..0000000000 --- a/examples/mpi/Dockerfile.example.tensorflow-mpi +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) Microsoft Corporation -# All rights reserved. 
-# -# MIT License -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and -# to permit persons to whom the Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING -# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -# tag: pai.example.tensorflow-mpi -# -# Before building this image you need to build the base image first: -# -# docker build -f Dockerfile.build.mpi -t pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 . - - -FROM pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 - -ENV TENSORFLOW_VERSION=1.4.0 -ENV BAZEL_VERSION=0.5.4 - -#update and upgrade, install git -RUN apt-get update && apt-get -y upgrade && apt-get install -y git - -#install numpy -RUN pip install numpy - -WORKDIR / - -# Install Bazel. 
-RUN echo "startup --batch" >>/etc/bazel.bazelrc -RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \ - >>/etc/bazel.bazelrc -RUN mkdir /bazel && \ - cd /bazel && \ - curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh && \ - chmod +x bazel-*.sh && \ - ./bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh && \ - cd / && \ - rm -f /bazel/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh - -# Download and build TensorFlow. -WORKDIR /tensorflow -RUN git clone -b r1.4 https://github.com/tensorflow/tensorflow.git . && \ - git cherry-pick -n f73d7c -ENV TF_NEED_CUDA=1 \ - TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1 \ - TF_CUDA_VERSION=8.0 \ - TF_CUDNN_VERSION=6 \ - TF_NEED_HDFS=1 \ - TF_NEED_MPI=1 \ - TF_NEED_GCP=0 \ - TF_NEED_OPENCL=0 \ - TF_NEED_JEMALLOC=1 \ - TF_ENABLE_XLA=0 \ - TF_NEED_VERBS=0 \ - TF_CUDA_CLANG=0 \ - TF_NEED_MKL=0 \ - TF_DOWNLOAD_MKL=0 -RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \ - LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \ - bazel clean && \ - ./configure && \ - bazel build --config=opt --config=cuda \ - --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \ - //tensorflow/tools/pip_package:build_pip_package && \ - rm /usr/local/cuda/lib64/stubs/libcuda.so.1 && \ - bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \ - pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \ - rm -rf /tmp/pip && \ - rm -rf /root/.cache - -WORKDIR /root diff --git a/examples/mpi/README.md b/examples/mpi/README.md index f7d6bd40b5..d6664014a0 100644 --- a/examples/mpi/README.md +++ b/examples/mpi/README.md @@ -17,8 +17,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. --> -# Note -Now(27th September, 2018), the mpi examples are still unready. Ignore them! 
# MPI on OpenPAI @@ -30,71 +28,26 @@ The following contents show some basic Open MPI examples, other customized MPI c - [Note](#note) - [MPI on OpenPAI](#mpi-on-openpai) - [Contents](#contents) -- [Open MPI TensorFlow / CNTK CIFAR-10 example](#open-mpi-tensorflow--cntk-cifar-10-example) +- [CNTK CIFAR-10 example](#cntk-cifar-10-example) - [Prepare work](#prepare-work) - - [Open MPI TensorFlow CIFAR-10 example](#open-mpi-tensorflow-cifar-10-example) - - [TensorFlow cifar10 benchmark](#tensorflow-cifar10-benchmark) - [Open MPI CNTK grapheme-to-phoneme conversion example](#open-mpi-cntk-grapheme-to-phoneme-conversion-example) - [CNTK G2P example](#cntk-g2p-example) -# Open MPI TensorFlow / CNTK CIFAR-10 example +# CNTK CIFAR-10 example ### Prepare work 1. Prepare the data: -* TensorFlow: Just go to the [official website](http://www.cs.toronto.edu/~kriz/cifar.html) and download the python version data by the [url](http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz). `wget http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz && tar zxvf cifar-10-python.tar.gz && rm cifar-10-python.tar.gz` -After you downloading the data, upload them to HDFS:`hdfs dfs -put filename hdfs://ip:port/examples/mpi/tensorflow/data` or `hdfs dfs -put filename hdfs://ip:port/examples/tensorflow/distributed-cifar-10/data` -Note that we use the same data as tensorflow distributed cifar-10 example. So, if you have already run that example, just use that data path. * CNTK: Download all files in https://git.io/vbT5A `wget https://github.com/Microsoft/CNTK/raw/master/Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b` and put them up to HDFS:`hdfs dfs -put filename hdfs://ip:port/examples/cntk/data` or `hdfs dfs -put filename hdfs://ip:port/examples/mpi/cntk/data`. Note that we use the same data as cntk example. So, if you have already run that example, just use that data path. 2. Prepare the executable code: -* Tensorflow: We use the same code as tensorflow distributed cifar-10 example. 
You can follow [that document](https://github.com/Microsoft/pai/blob/master/examples/tensorflow/README.md). * cntk: Download the script example from [github](https://github.com/Microsoft/pai/blob/master/examples/mpi/cntk-mpi.sh)`wget https://github.com/Microsoft/pai/raw/master/examples/mpi/cntk-mpi.sh`. Then upload them to HDFS:`hdfs dfs -put filename hdfs://ip:port/examples/mpi/cntk/code/` -3. Prepare a docker image and upload it to docker hub. OpenPAI packaged the docker env required by the job for user to use. User could refer to [DOCKER.md](./DOCKER.md) to customize this example docker env. If user have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.tensorflow-mpi`, `openpai/pai.example.cntk-mp` with your own. +3. Prepare a docker image and upload it to docker hub. OpenPAI packaged the docker env required by the job for user to use. User could refer to [DOCKER.md](./DOCKER.md) to customize this example docker env. If user have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.cntk` with your own. 4. Prepare a job configuration file and submit it through webportal. The config examples are following. **Note** that you can simply run the prepare.sh to do the above preparing work, but you must make sure you can use HDFS client on your local machine. If you can, just run the shell script with a parameter of your HDFS socket! 
`/bin/bash prepare.sh ip:port` -Here're some configuration file examples: - -## Open MPI TensorFlow CIFAR-10 example - -### [TensorFlow cifar10 benchmark](https://git.io/vF4wT) - -```js -{ - "jobName": "tensorflow-mpi", - "image": "openpai/pai.example.tensorflow-mpi", - - // download cifar10 dataset from http://www.cs.toronto.edu/~kriz/cifar.html and upload to hdfs - "dataDir": "$PAI_DEFAULT_FS_URI/path/tensorflow-mpi/data", - // make a new dir for output on hdfs - "outputDir": "$PAI_DEFAULT_FS_URI/path/tensorflow-mpi/output", - // download code from tensorflow benchmark https://git.io/vF4wT and upload to hdfs - "codeDir": "$PAI_DEFAULT_FS_URI/path/tensorflow-mpi/code", - - "taskRoles": [ - { - "name": "ps_server", - "taskNumber": 2, - "cpuNumber": 2, - "memoryMB": 8192, - "gpuNumber": 0, - "command": "pip --quiet install scipy && python code/tf_cnn_benchmarks.py --local_parameter_device=cpu --batch_size=32 --model=resnet20 --variable_update=parameter_server --data_dir=$PAI_DATA_DIR --data_name=cifar10 --train_dir=$PAI_OUTPUT_DIR --ps_hosts=$PAI_TASK_ROLE_ps_server_HOST_LIST --worker_hosts=$PAI_TASK_ROLE_worker_HOST_LIST --job_name=ps --task_index=$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX --server_protocol=grpc+mpi" - }, - { - "name": "worker", - "taskNumber": 2, - "cpuNumber": 2, - "memoryMB": 16384, - "gpuNumber": 4, - "command": "pip --quiet install scipy && python code/tf_cnn_benchmarks.py --local_parameter_device=cpu --batch_size=32 --model=resnet20 --variable_update=parameter_server --data_dir=$PAI_DATA_DIR --data_name=cifar10 --train_dir=$PAI_OUTPUT_DIR --ps_hosts=$PAI_TASK_ROLE_ps_server_HOST_LIST --worker_hosts=$PAI_TASK_ROLE_worker_HOST_LIST --job_name=worker --task_index=$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX --server_protocol=grpc+mpi", - "minSucceededTaskCount": 2 - } - ], - "retryCount": 0 -} -``` +Here's a configuration file examples: ## Open MPI CNTK grapheme-to-phoneme conversion example diff --git a/examples/mpi/prepare.sh 
b/examples/mpi/prepare.sh index 57717ea22c..930a4265e2 100644 --- a/examples/mpi/prepare.sh +++ b/examples/mpi/prepare.sh @@ -71,50 +71,3 @@ fi #delete the files rm -rf cntk-mpi.sh* G2P.cntk* mpi_cntk_data/ echo "Removed local mpi cntk code and data succeeded!" - -#mpi tensorflow cifar-10 prepare -function mpi_tensorflow_prepare_data(){ - #download the data - wget http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz && tar zxvf cifar-10-python.tar.gz && rm cifar-10-python.tar.gz - - #upload the data to HDFS - echo "Uploading cifar-10 data, waiting..." - for i in `ls cifar-10-batches-py` - do - hdfs dfs -put cifar-10-batches-py/$i hdfs://$1/$2/examples/tensorflow/distributed-cifar-10/data - done -} - -function mpi_tensorflow_prepare_code(){ - #download the code - git clone -b tf_benchmark_stage https://github.com/tensorflow/benchmarks.git - - #upload the code to HDFS - echo "Uploading benchmarks code, waiting..." - hdfs dfs -put benchmarks/ hdfs://$1/$2/examples/tensorflow/distributed-cifar-10/code -} - -echo "Make mpi tensorflow directory, waiting..." -hdfs dfs -mkdir -p hdfs://$1/$2/examples/mpi/tensorflow/output -hdfs dfs -mkdir -p hdfs://$1/$2/examples/tensorflow/distributed-cifar-10/code -hdfs dfs -mkdir -p hdfs://$1/$2/examples/tensorflow/distributed-cifar-10/data - -hdfs dfs -test -e hdfs://$1/$2/examples/tensorflow/distributed-cifar-10/code/* -if [ $? -eq 0 ] ;then - echo "Code exists on HDFS!" -else - mpi_tensorflow_prepare_code $1 $2 - echo "Have prepared code!" -fi - -hdfs dfs -test -e hdfs://$1/$2/examples/tensorflow/distributed-cifar-10/data/* -if [ $? -eq 0 ] ;then - echo "Data exists on HDFS!" -else - mpi_tensorflow_prepare_data $1 $2 - echo "Have prepared data" -fi - -rm -r cifar-10-batches-py*/ benchmarks*/ -echo "Removed local cifar-10 code and data succeeded!" -echo "Prepare mpi example based on horovod and tensorflow done!" 
diff --git a/examples/mpi/tensorflow-mpi.json b/examples/mpi/tensorflow-mpi.json deleted file mode 100644 index 7b33c9abbc..0000000000 --- a/examples/mpi/tensorflow-mpi.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "jobName": "tensorflow-mpi", - "image": "openpai/pai.example.tensorflow-mpi", - - // download cifar10 dataset from http://www.cs.toronto.edu/~kriz/cifar.html and upload to hdfs - "dataDir": "$PAI_DEFAULT_FS_URI/$PAI_USERNAME/examples/tensorflow/distributed-cifar-10/data", - // make a new dir for output on hdfs - "outputDir": "$PAI_DEFAULT_FS_URI/$PAI_USERNAME/examples/mpi/tensorflow/output", - // download code from tensorflow benchmark https://git.io/vF4wT and upload to hdfs - "codeDir": "$PAI_DEFAULT_FS_URI/$PAI_USERNAME/examples/tensorflow/distributed-cifar-10/code", - - "taskRoles": [ - { - "name": "ps_server", - "taskNumber": 2, - "cpuNumber": 2, - "memoryMB": 8192, - "gpuNumber": 0, - "command": "pip --quiet install scipy absl-py && python code/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --local_parameter_device=cpu --batch_size=32 --model=resnet20 --variable_update=parameter_server --data_dir=$PAI_DATA_DIR --data_name=cifar10 --train_dir=$PAI_OUTPUT_DIR --ps_hosts=$PAI_TASK_ROLE_ps_server_HOST_LIST --worker_hosts=$PAI_TASK_ROLE_worker_HOST_LIST --job_name=ps --task_index=$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX --server_protocol=grpc+mpi" - }, - { - "name": "worker", - "taskNumber": 2, - "cpuNumber": 2, - "memoryMB": 16384, - "gpuNumber": 4, - "command": "pip --quiet install scipy absl-py && python code/benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --local_parameter_device=cpu --batch_size=32 --model=resnet20 --variable_update=parameter_server --data_dir=$PAI_DATA_DIR --data_name=cifar10 --train_dir=$PAI_OUTPUT_DIR --ps_hosts=$PAI_TASK_ROLE_ps_server_HOST_LIST --worker_hosts=$PAI_TASK_ROLE_worker_HOST_LIST --job_name=worker --task_index=$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX --server_protocol=grpc+mpi", - 
"minSucceededTaskCount": 2 - } - ], - "retryCount": 0 -}