From d6e76905420ca393d46df092c20d702cc3ef0e04 Mon Sep 17 00:00:00 2001
From: "@simulark" <dogukanuraztuna@gmail.com>
Date: Tue, 13 Jun 2023 17:43:33 +0300
Subject: [PATCH 1/3] unintended deleted branch repair commit

---
 examples/v2beta1/deepspeed/Dockerfile         | 31 +++++++
 .../v2beta1/deepspeed/deepspeed-config.yaml   | 93 +++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 examples/v2beta1/deepspeed/Dockerfile
 create mode 100644 examples/v2beta1/deepspeed/deepspeed-config.yaml
diff --git a/examples/v2beta1/deepspeed/Dockerfile b/examples/v2beta1/deepspeed/Dockerfile
new file mode 100644
index 00000000..b741c369
--- /dev/null
+++ b/examples/v2beta1/deepspeed/Dockerfile
@@ -0,0 +1,31 @@
+# Official PyTorch image with CUDA support
+FROM pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+
+# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need to disable UserKnownHostsFile to avoid write permissions.
+# Disable StrictModes avoids directory and files read permission checks and update system packages & install dependencies
+RUN apt-get update && apt-get install -y \
+    git \
+    wget \
+    build-essential \
+    cmake \
+    libopenmpi-dev \
+    openssh-server \
+    && rm -rf /var/lib/apt/lists/* \
+    && echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
+    && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
+
+# Install DeepSpeed library
+RUN pip install deepspeed
+RUN mkdir /deepspeed
+
+# Workspace for DeepSpeed examples
+WORKDIR "/deepspeed"
+
+# Clone the DeepSpeedExamples from repository
+RUN git clone https://github.com/microsoft/DeepSpeedExamples/
+
+# Set the working directory to DeepSpeedExamples/training for models
+WORKDIR "/deepspeed/DeepSpeedExamples/training"
+
+# Set the default command to bash
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/examples/v2beta1/deepspeed/deepspeed-config.yaml b/examples/v2beta1/deepspeed/deepspeed-config.yaml
new file mode 100644
index 00000000..c4252da1
--- /dev/null
+++ b/examples/v2beta1/deepspeed/deepspeed-config.yaml
@@ -0,0 +1,93 @@
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: deepspeed-mpijob
+spec:
+  slotsPerWorker: 1
+  runPolicy:
+    cleanPodPolicy: Running
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        spec:
+          containers:
+          # Container with the DeepSpeed training image built from the provided Dockerfile and the DeepSpeed support
+          # Change your image name and version in here
+          - image: <YOUR-DEEPSPEED-CONTAINER-NAME>:<VERSION>
+            name: deepspeed-mpijob-container
+            command:
+              - mpirun
+              - --allow-run-as-root
+              - -np
+              - "2"
+              - -bind-to
+              - none
+              - -map-by
+              - slot
+              - -x
+              - NCCL_DEBUG=INFO
+              - -x
+              - LD_LIBRARY_PATH
+              - -x
+              - PATH
+              - -mca
+              - pml
+              - ob1
+              - -mca
+              - btl
+              - ^openib
+              - python
+              - cifar/cifar10_deepspeed.py
+              - --deepspeed_mpi
+              - --deepspeed
+              - --deepspeed_config
+              - ds_config.json
+              - $@
+    Worker:
+      replicas: 2
+      template:
+        spec:
+          # OPTIONAL: Taint toleration for the specific nodepool
+          #
+          # Taints and tolerations are used to ensure that the DeepSpeed worker pods
+          # are scheduled on the desired nodes. By applying taints to nodes, you can
+          # repel pods that do not have the corresponding tolerations. This is useful
+          # in situations where you want to reserve nodes with specific resources
+          # (e.g. GPU nodes) for particular workloads, like the DeepSpeed training
+          # job.
+          #
+          # In this example, the tolerations are set to allow the DeepSpeed worker
+          # pods to be scheduled on nodes with the specified taints (i.e., the node
+          # pool with GPU resources). This ensures that the training job can
+          # utilize the available GPU resources on those nodes, improving the
+          # efficiency and performance of the training process.
+          #
+          # You can remove the taint tolerations if you do not have any taints on your cluster.
+          tolerations:
+          # Matches the nodepool taint; replace the value with your own nodepool name
+          - key: nodepool
+            operator: Equal
+            value: nodepool-256ram32cpu2gpu-0
+            effect: NoSchedule
+          # Matches the taint placed on GPU nodes
+          - key: nvidia.com/gpu
+            operator: Equal
+            value: present
+            effect: NoSchedule
+          containers:
+          # Worker container running the DeepSpeed training image built from the provided Dockerfile
+          # Replace the placeholder below with your image name and version
+          - image: <YOUR-DEEPSPEED-CONTAINER-NAME>:<VERSION>
+            name: deepspeed-mpijob-container
+            resources:
+              limits:
+                # Optional: size to your nodepool; the GPU limit must equal the GPU request below
+                cpu: 30
+                memory: 230Gi
+                nvidia.com/gpu: 1
+              requests:
+                # Optional: varies to nodepool group
+                cpu: 16
+                memory: 128Gi
+                nvidia.com/gpu: 1
\ No newline at end of file

From 33a57a7d0638bf798b4271ab0cf957068e9e9157 Mon Sep 17 00:00:00 2001
From: simulark <dogukanuraztuna@gmail.com>
Date: Tue, 13 Jun 2023 18:17:16 +0300
Subject: [PATCH 2/3] update Dockerfile for mpioperator

---
 examples/v2beta1/deepspeed/Dockerfile | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/examples/v2beta1/deepspeed/Dockerfile b/examples/v2beta1/deepspeed/Dockerfile
index b741c369..c79b0054 100644
--- a/examples/v2beta1/deepspeed/Dockerfile
+++ b/examples/v2beta1/deepspeed/Dockerfile
@@ -1,5 +1,5 @@
-# Official PyTorch image with CUDA support
-FROM pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+# Official MPI Operator Base image
+FROM mpioperator/base
 
 # mpi-operator mounts the .ssh folder from a Secret. For that to work, we need to disable UserKnownHostsFile to avoid write permissions.
 # Disable StrictModes avoids directory and files read permission checks and update system packages & install dependencies
@@ -10,12 +10,17 @@ RUN apt-get update && apt-get install -y \
     cmake \
     libopenmpi-dev \
     openssh-server \
+    python3 \
+    python3-pip \
     && rm -rf /var/lib/apt/lists/* \
     && echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
     && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
 
-# Install DeepSpeed library
-RUN pip install deepspeed
+# Install Torch from the cu11.8 wheel index first; DeepSpeed depends on torch and would otherwise pull the default PyPI build
+RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+RUN pip3 install deepspeed
+
+# Create folder for deepspeed workspace
 RUN mkdir /deepspeed
 
 # Workspace for DeepSpeed examples
@@ -28,4 +33,4 @@ RUN git clone https://github.com/microsoft/DeepSpeedExamples/
 WORKDIR "/deepspeed/DeepSpeedExamples/training"
 
 # Set the default command to bash
-CMD ["/bin/bash"]
\ No newline at end of file
+CMD ["/bin/bash"]

From 5e11a56e85a935c681e5471d84852cfc4fe13359 Mon Sep 17 00:00:00 2001
From: simulark <dogukanuraztuna@gmail.com>
Date: Tue, 13 Jun 2023 18:48:32 +0300
Subject: [PATCH 3/3] add: cifar_ds.Dockerfile sample image, update DS base
 image

---
 examples/v2beta1/deepspeed/Dockerfile         |  4 +-
 examples/v2beta1/deepspeed/README.md          |  0
 .../v2beta1/deepspeed/cifar_ds.Dockerfile     | 12 ++++
 .../v2beta1/deepspeed/deepspeed-config.yaml   | 60 +++++++++----------
 4 files changed, 44 insertions(+), 32 deletions(-)
 create mode 100644 examples/v2beta1/deepspeed/README.md
 create mode 100644 examples/v2beta1/deepspeed/cifar_ds.Dockerfile

diff --git a/examples/v2beta1/deepspeed/Dockerfile b/examples/v2beta1/deepspeed/Dockerfile
index c79b0054..9f023a0a 100644
--- a/examples/v2beta1/deepspeed/Dockerfile
+++ b/examples/v2beta1/deepspeed/Dockerfile
@@ -29,8 +29,8 @@ WORKDIR "/deepspeed"
 # Clone the DeepSpeedExamples from repository
 RUN git clone https://github.com/microsoft/DeepSpeedExamples/
 
-# Set the working directory to DeepSpeedExamples/training for models
-WORKDIR "/deepspeed/DeepSpeedExamples/training"
+# Set the working directory to DeepSpeedExamples for models
+WORKDIR "/deepspeed/DeepSpeedExamples/"
 
 # Set the default command to bash
 CMD ["/bin/bash"]
diff --git a/examples/v2beta1/deepspeed/README.md b/examples/v2beta1/deepspeed/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/v2beta1/deepspeed/cifar_ds.Dockerfile b/examples/v2beta1/deepspeed/cifar_ds.Dockerfile
new file mode 100644
index 00000000..cf366acc
--- /dev/null
+++ b/examples/v2beta1/deepspeed/cifar_ds.Dockerfile
@@ -0,0 +1,12 @@
+# Start from the MPIOperator base image that carries the DeepSpeed and CUDA setup
+FROM mpioperator/deepspeedbase
+
+# Work from the cifar tutorial directory
+WORKDIR /deepspeed/DeepSpeedExamples/training/cifar
+
+# Extra Python packages installed on top of the base image
+RUN pip3 install \
+    pillow matplotlib
+
+# By default, launch run_ds.sh (the DeepSpeed-applied model script) through sh
+CMD ["sh", "run_ds.sh"]
diff --git a/examples/v2beta1/deepspeed/deepspeed-config.yaml b/examples/v2beta1/deepspeed/deepspeed-config.yaml
index c4252da1..38dfb543 100644
--- a/examples/v2beta1/deepspeed/deepspeed-config.yaml
+++ b/examples/v2beta1/deepspeed/deepspeed-config.yaml
@@ -13,37 +13,37 @@ spec:
         spec:
           containers:
           # Container with the DeepSpeed training image built from the provided Dockerfile and the DeepSpeed support
-          # Change your image name and version in here
-          - image: <YOUR-DEEPSPEED-CONTAINER-NAME>:<VERSION>
+          # Sample container for the DeepSpeed-enabled model; change this image to suit your application or training process
+          - image: cifards:v0.0.1
             name: deepspeed-mpijob-container
             command:
-              - mpirun
-              - --allow-run-as-root
-              - -np
-              - "2"
-              - -bind-to
-              - none
-              - -map-by
-              - slot
-              - -x
-              - NCCL_DEBUG=INFO
-              - -x
-              - LD_LIBRARY_PATH
-              - -x
-              - PATH
-              - -mca
-              - pml
-              - ob1
-              - -mca
-              - btl
-              - ^openib
-              - python
-              - cifar/cifar10_deepspeed.py
-              - --deepspeed_mpi
-              - --deepspeed
-              - --deepspeed_config
-              - ds_config.json
-              - $@
+            - mpirun
+            - --allow-run-as-root
+            - -np
+            - "2"
+            - -bind-to
+            - none
+            - -map-by
+            - slot
+            - -x
+            - NCCL_DEBUG=INFO
+            - -x
+            - LD_LIBRARY_PATH
+            - -x
+            - PATH
+            - -mca
+            - pml
+            - ob1
+            - -mca
+            - btl
+            - ^openib
+            - python3  # the example Dockerfile installs python3/pip3 only
+            - cifar10_deepspeed.py  # relative to WORKDIR training/cifar; assumes the image is built from cifar_ds.Dockerfile
+            - --deepspeed_mpi
+            - --deepspeed
+            - --deepspeed_config
+            - ds_config.json
+            # Add further script arguments as list items; Kubernetes does not expand a shell "$@" here
     Worker:
       replicas: 2
       template:
@@ -90,4 +90,4 @@ spec:
                 # Optional: varies to nodepool group
                 cpu: 16
                 memory: 128Gi
-                nvidia.com/gpu: 1
\ No newline at end of file
+                nvidia.com/gpu: 1