From d6e76905420ca393d46df092c20d702cc3ef0e04 Mon Sep 17 00:00:00 2001 From: "@simulark" Date: Tue, 13 Jun 2023 17:43:33 +0300 Subject: [PATCH 1/3] unintended deleted branch repair commit --- examples/v2beta1/deepspeed/Dockerfile | 31 +++++++ .../v2beta1/deepspeed/deepspeed-config.yaml | 93 +++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 examples/v2beta1/deepspeed/Dockerfile create mode 100644 examples/v2beta1/deepspeed/deepspeed-config.yaml diff --git a/examples/v2beta1/deepspeed/Dockerfile b/examples/v2beta1/deepspeed/Dockerfile new file mode 100644 index 00000000..b741c369 --- /dev/null +++ b/examples/v2beta1/deepspeed/Dockerfile @@ -0,0 +1,31 @@ +# Official PyTorch image with CUDA support +FROM pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime + +# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need to disable UserKnownHostsFile to avoid write permissions. +# Disable StrictModes avoids directory and files read permission checks and update system packages & install dependencies +RUN apt-get update && apt-get install -y \ + git \ + wget \ + build-essential \ + cmake \ + libopenmpi-dev \ + openssh-server \ + && rm -rf /var/lib/apt/lists/* \ + && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \ + && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config + +# Install DeepSpeed library +RUN pip install deepspeed +RUN mkdir /deepspeed + +# Workspace for DeepSpeed examples +WORKDIR "/deepspeed" + +# Clone the DeepSpeedExamples from repository +RUN git clone https://github.com/microsoft/DeepSpeedExamples/ + +# Set the working directory to DeepSpeedExamples/training for models +WORKDIR "/deepspeed/DeepSpeedExamples/training" + +# Set the default command to bash +CMD ["/bin/bash"] \ No newline at end of file diff --git a/examples/v2beta1/deepspeed/deepspeed-config.yaml b/examples/v2beta1/deepspeed/deepspeed-config.yaml new file mode 100644 index 00000000..c4252da1 --- /dev/null +++ b/examples/v2beta1/deepspeed/deepspeed-config.yaml @@ -0,0 +1,93 @@ +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: deepspeed-mpijob +spec: + slotsPerWorker: 1 + runPolicy: + cleanPodPolicy: Running + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + # Container with the DeepSpeed training image built from the provided Dockerfile and the DeepSpeed support + # Change your image name and version in here + - image: : + name: deepspeed-mpijob-container + command: + - mpirun + - --allow-run-as-root + - -np + - "2" + - -bind-to + - none + - -map-by + - slot + - -x + - NCCL_DEBUG=INFO + - -x + - LD_LIBRARY_PATH + - -x + - PATH + - -mca + - pml + - ob1 + - -mca + - btl + - ^openib + - python + - cifar/cifar10_deepspeed.py + - --deepspeed_mpi + - --deepspeed + - --deepspeed_config + - ds_config.json + - $@ + Worker: + replicas: 2 + template: + spec: + # OPTIONAL: Taint toleration for the specific nodepool + # + # Taints and tolerations are used to ensure that the DeepSpeed worker pods + # are scheduled on the desired nodes. By applying taints to nodes, you can + # repel pods that do not have the corresponding tolerations. This is useful + # in situations where you want to reserve nodes with specific resources + # (e.g. GPU nodes) for particular workloads, like the DeepSpeed training + # job. + # + # In this example, the tolerations are set to allow the DeepSpeed worker + # pods to be scheduled on nodes with the specified taints (i.e., the node + # pool with GPU resources). This ensures that the training job can + # utilize the available GPU resources on those nodes, improving the + # efficiency and performance of the training process. + # + # You can remove the taint tolerations if you do not have any taints on your cluster. + tolerations: + # Change the nodepool name in here + - effect: NoSchedule + key: nodepool + operator: Equal + value: nodepool-256ram32cpu2gpu-0 + # Taint toleration effect for GPU nodes + - effect: NoSchedule + key: nvidia.com/gpu + operator: Equal + value: present + containers: + # Container with the DeepSpeed training image built from the provided Dockerfile and the DeepSpeed support + # Change your image name and version in here + - image: : + name: deepspeed-mpijob-container + resources: + limits: + # Optional: varies to nodepool group + cpu: 30 + memory: 230Gi + nvidia.com/gpu: 2 + requests: + # Optional: varies to nodepool group + cpu: 16 + memory: 128Gi + nvidia.com/gpu: 1 \ No newline at end of file From 33a57a7d0638bf798b4271ab0cf957068e9e9157 Mon Sep 17 00:00:00 2001 From: simulark Date: Tue, 13 Jun 2023 18:17:16 +0300 Subject: [PATCH 2/3] update Dockerfile for mpioperator --- examples/v2beta1/deepspeed/Dockerfile | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/v2beta1/deepspeed/Dockerfile b/examples/v2beta1/deepspeed/Dockerfile index b741c369..c79b0054 100644 --- a/examples/v2beta1/deepspeed/Dockerfile +++ b/examples/v2beta1/deepspeed/Dockerfile @@ -1,5 +1,5 @@ -# Official PyTorch image with CUDA support -FROM pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime +# Official MPI Operator Base image +FROM mpioperator/base # mpi-operator mounts the .ssh folder from a Secret. For that to work, we need to disable UserKnownHostsFile to avoid write permissions. # Disable StrictModes avoids directory and files read permission checks and update system packages & install dependencies @@ -10,12 +10,17 @@ RUN apt-get update && apt-get install -y \ cmake \ libopenmpi-dev \ openssh-server \ + python3 \ + python3-pip \ && rm -rf /var/lib/apt/lists/* \ && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \ && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config -# Install DeepSpeed library -RUN pip install deepspeed +# Install DeepSpeed library and Torch with cu11.8 wheels +RUN pip3 install deepspeed +RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 + +# Create folder for deepspeed workspace RUN mkdir /deepspeed # Workspace for DeepSpeed examples @@ -28,4 +33,4 @@ RUN git clone https://github.com/microsoft/DeepSpeedExamples/ WORKDIR "/deepspeed/DeepSpeedExamples/training" # Set the default command to bash -CMD ["/bin/bash"] \ No newline at end of file +CMD ["/bin/bash"] From 5e11a56e85a935c681e5471d84852cfc4fe13359 Mon Sep 17 00:00:00 2001 From: simulark Date: Tue, 13 Jun 2023 18:48:32 +0300 Subject: [PATCH 3/3] add: cifar_ds.Dockerfile sample image, update DS base image --- examples/v2beta1/deepspeed/Dockerfile | 4 +- examples/v2beta1/deepspeed/README.md | 0 .../v2beta1/deepspeed/cifar_ds.Dockerfile | 12 ++++ .../v2beta1/deepspeed/deepspeed-config.yaml | 60 +++++++++---------- 4 files changed, 44 insertions(+), 32 deletions(-) create mode 100644 examples/v2beta1/deepspeed/README.md create mode 100644 examples/v2beta1/deepspeed/cifar_ds.Dockerfile diff --git a/examples/v2beta1/deepspeed/Dockerfile b/examples/v2beta1/deepspeed/Dockerfile index c79b0054..9f023a0a 100644 --- a/examples/v2beta1/deepspeed/Dockerfile +++ b/examples/v2beta1/deepspeed/Dockerfile @@ -29,8 +29,8 @@ WORKDIR "/deepspeed" # Clone the DeepSpeedExamples from repository RUN git clone https://github.com/microsoft/DeepSpeedExamples/ -# Set the working directory to DeepSpeedExamples/training for models -WORKDIR "/deepspeed/DeepSpeedExamples/training" +# Set the working directory to DeepSpeedExamples for models +WORKDIR "/deepspeed/DeepSpeedExamples/" # Set the default command to bash CMD ["/bin/bash"] diff --git a/examples/v2beta1/deepspeed/README.md b/examples/v2beta1/deepspeed/README.md new file mode 100644 index 00000000..e69de29b diff --git a/examples/v2beta1/deepspeed/cifar_ds.Dockerfile b/examples/v2beta1/deepspeed/cifar_ds.Dockerfile new file mode 100644 index 00000000..cf366acc --- /dev/null +++ b/examples/v2beta1/deepspeed/cifar_ds.Dockerfile @@ -0,0 +1,12 @@ +# Base image for MPIOperator with DeepSpeed and CUDA setup +FROM mpioperator/deepspeedbase + +# Select WORKDIR for cifar tutorial +WORKDIR /deepspeed/DeepSpeedExamples/training/cifar + +# Install dependencies +RUN pip3 install pillow \ + matplotlib + +# Run the script for running DeepSpeed applied model +CMD [ "sh", "run_ds.sh" ] diff --git a/examples/v2beta1/deepspeed/deepspeed-config.yaml b/examples/v2beta1/deepspeed/deepspeed-config.yaml index c4252da1..38dfb543 100644 --- a/examples/v2beta1/deepspeed/deepspeed-config.yaml +++ b/examples/v2beta1/deepspeed/deepspeed-config.yaml @@ -13,37 +13,37 @@ spec: spec: containers: # Container with the DeepSpeed training image built from the provided Dockerfile and the DeepSpeed support - # Change your image name and version in here - - image: : + # Sample container for DeepSpeed applied model, you can check this image to your application or training process + - image: cifards:v0.0.1 name: deepspeed-mpijob-container command: - - mpirun - - --allow-run-as-root - - -np - - "2" - - -bind-to - - none - - -map-by - - slot - - -x - - NCCL_DEBUG=INFO - - -x - - LD_LIBRARY_PATH - - -x - - PATH - - -mca - - pml - - ob1 - - -mca - - btl - - ^openib - - python - - cifar/cifar10_deepspeed.py - - --deepspeed_mpi - - --deepspeed - - --deepspeed_config - - ds_config.json - - $@ + - mpirun + - --allow-run-as-root + - -np + - "2" + - -bind-to + - none + - -map-by + - slot + - -x + - NCCL_DEBUG=INFO + - -x + - LD_LIBRARY_PATH + - -x + - PATH + - -mca + - pml + - ob1 + - -mca + - btl + - ^openib + - python + - cifar/cifar10_deepspeed.py + - --deepspeed_mpi + - --deepspeed + - --deepspeed_config + - ds_config.json + - $@ Worker: replicas: 2 template: @@ -90,4 +90,4 @@ spec: # Optional: varies to nodepool group cpu: 16 memory: 128Gi - nvidia.com/gpu: 1 \ No newline at end of file + nvidia.com/gpu: 1