kubeflow · ghost · Jun 13, 2023 · Jun 13, 2023 · Jun 13, 2023 · tenzen-y
diff --git a/examples/v2beta1/deepspeed/Dockerfile b/examples/v2beta1/deepspeed/Dockerfile
@@ -0,0 +1,36 @@
+# Start from the official MPI Operator base image (NOTE(review): untagged, so it resolves to :latest; consider pinning a tag)
+FROM mpioperator/base
+
+# Install the build toolchain, Open MPI headers, an SSH server and Python 3, then relax SSH for mpi-operator, which mounts the .ssh folder from a Secret:
+# UserKnownHostsFile=/dev/null avoids needing write permissions there, and StrictModes=no skips the directory/file permission checks.
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    cmake \
+    git \
+    libopenmpi-dev \
+    openssh-server \
+    python3 \
+    python3-pip \
+    wget \
+    && rm -rf /var/lib/apt/lists/* \
+    && echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
+    && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
+
+# Install the cu11.8 Torch wheels BEFORE DeepSpeed: DeepSpeed depends on torch, so installing it first would pull the default PyPI torch and the cu118 install would then be skipped as "already satisfied"
+RUN pip3 install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+RUN pip3 install --no-cache-dir deepspeed
+
+# Workspace for DeepSpeed examples.
+# WORKDIR creates the directory when it is missing, so no separate mkdir step is required.
+WORKDIR /deepspeed
+
+# Clone the DeepSpeedExamples repository.
+# A shallow clone (--depth 1) keeps the full git history out of the image layer;
+# the checked-out working tree is the same as with a full clone.
+RUN git clone --depth 1 https://github.com/microsoft/DeepSpeedExamples/
+
+# Set the working directory to DeepSpeedExamples for models
+WORKDIR /deepspeed/DeepSpeedExamples/
+
+# Default command is a bash shell; images built on top of this one (or a pod's `command`) can override it
+CMD ["/bin/bash"]
diff --git a/examples/v2beta1/deepspeed/README.md b/examples/v2beta1/deepspeed/README.md
diff --git a/examples/v2beta1/deepspeed/cifar_ds.Dockerfile b/examples/v2beta1/deepspeed/cifar_ds.Dockerfile
@@ -0,0 +1,12 @@
+# Base image for MPIOperator with DeepSpeed and CUDA setup (NOTE(review): presumably built from the DeepSpeed base Dockerfile in this directory; confirm)
+FROM mpioperator/deepspeedbase
+
+# Work from the CIFAR tutorial directory of DeepSpeedExamples
+WORKDIR /deepspeed/DeepSpeedExamples/training/cifar
+
+# Install the tutorial's extra Python dependencies; --no-cache-dir keeps pip's download cache out of the image layer
+RUN pip3 install --no-cache-dir \
+    matplotlib pillow
+
+# Default command: run the tutorial's run_ds.sh script from the working directory
+CMD [ "sh", "run_ds.sh" ]
diff --git a/examples/v2beta1/deepspeed/deepspeed-config.yaml b/examples/v2beta1/deepspeed/deepspeed-config.yaml
@@ -0,0 +1,93 @@
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: deepspeed-mpijob
+spec:
+  slotsPerWorker: 1  # with the 2 Worker replicas below this matches the launcher's "-np 2"
+  runPolicy:
+    cleanPodPolicy: Running  # NOTE(review): presumably cleans up only pods still running once the job finishes; confirm against the MPI Operator docs
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        spec:
+          containers:
+          # Container with the DeepSpeed training image built from the provided Dockerfile and the DeepSpeed support
+          # Sample image for the DeepSpeed CIFAR example; change it to the image of your own application or training process
+          - image: cifards:v0.0.1
+            name: deepspeed-mpijob-container
+            command:
+            - mpirun
+            - --allow-run-as-root
+            - -np
+            - "2"  # total MPI processes: 2 Worker replicas x slotsPerWorker 1
+            - -bind-to
+            - none
+            - -map-by
+            - slot
+            - -x
+            - NCCL_DEBUG=INFO
+            - -x
+            - LD_LIBRARY_PATH
+            - -x
+            - PATH
+            - -mca
+            - pml
+            - ob1
+            - -mca
+            - btl
+            - ^openib
+            - python3  # the example Dockerfile installs python3 only; it provides no "python" executable
+            - cifar10_deepspeed.py  # relative to the CIFAR image WORKDIR /deepspeed/DeepSpeedExamples/training/cifar
+            - --deepspeed_mpi
+            - --deepspeed
+            - --deepspeed_config
+            - ds_config.json
+            # No trailing "$@" here: a container command is not run through a shell, so it would reach the script as a literal argument
+    Worker:
+      replicas: 2
+      template:
+        spec:
+          # OPTIONAL: Taint tolerations for a specific node pool
+          #
+          # Taints and tolerations are used to ensure that the DeepSpeed worker pods
+          # are scheduled on the desired nodes. By applying taints to nodes, you can
+          # repel pods that do not have the corresponding tolerations. This is useful
+          # in situations where you want to reserve nodes with specific resources
+          # (e.g. GPU nodes) for particular workloads, like the DeepSpeed training
+          # job.
+          #
+          # In this example, the tolerations allow the DeepSpeed worker pods to be
+          # scheduled on nodes that carry the specified taints (i.e., the node pool
+          # with GPU resources), so that the training job can use the GPUs on
+          # those nodes.
+          #
+          #
+          # You can remove the tolerations if you do not have any taints on your cluster.
+          tolerations:
+          # Change the node pool name here
+          - effect: NoSchedule
+            key: nodepool
+            operator: Equal
+            value: nodepool-256ram32cpu2gpu-0
+          # Toleration for the taint placed on GPU nodes
+          - effect: NoSchedule
+            key: nvidia.com/gpu
+            operator: Equal
+            value: present
+          containers:
+          # Container with the DeepSpeed training image built from the provided Dockerfile and the DeepSpeed support
+          # Change your image name and version here
+          - image: <YOUR-DEEPSPEED-CONTAINER-NAME>:<VERSION>
+            name: deepspeed-mpijob-container
+            resources:
+              limits:
+                # Optional: cpu/memory vary with the node pool; the GPU limit must equal the GPU request
+                cpu: 30
+                memory: 230Gi
+                nvidia.com/gpu: 1
+              requests:
+                # Optional: cpu/memory vary with the node pool; one GPU per worker matches slotsPerWorker: 1
+                cpu: 16
+                memory: 128Gi
+                nvidia.com/gpu: 1