Skip to content

Commit

Permalink
added support for cuda 11.8 docker image.
Browse files Browse the repository at this point in the history
  • Loading branch information
guocuimi committed Dec 3, 2023
1 parent 30ad6fc commit 7f1679f
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 12 deletions.
22 changes: 17 additions & 5 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,17 @@ jobs:
username: ${{ secrets.DOCKER_HUB_USER }}
password: ${{ secrets.DOCKER_HUB_TOKEN }}

- name: Build and push scalellm
- name: Build and push gateway
uses: docker/build-push-action@v5
with:
context: ./gateway
push: true
no-cache: true
tags: |
vectorchai/scalellm-gateway:${{ steps.tagName.outputs.tag }}
vectorchai/scalellm-gateway:latest
- name: Build and push scalellm for cuda 12.1
uses: docker/build-push-action@v5
with:
context: .
Expand All @@ -40,13 +50,15 @@ jobs:
vectorchai/scalellm:${{ steps.tagName.outputs.tag }}
vectorchai/scalellm:latest
- name: Build and push gateway
- name: Build and push scalellm for cuda 11.8
uses: docker/build-push-action@v5
with:
context: ./gateway
context: .
push: true
no-cache: true
build-args: |
BASE_IMAGE=nvcr.io/nvidia/cuda:11.8.0-devel-ubuntu22.04
tags: |
vectorchai/scalellm-gateway:${{ steps.tagName.outputs.tag }}
vectorchai/scalellm-gateway:latest
vectorchai/scalellm_cu118:${{ steps.tagName.outputs.tag }}
vectorchai/scalellm_cu118:latest
18 changes: 14 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ project(
"ScaleLLM"
LANGUAGES CXX CUDA
)
find_package(CUDA 12.1 REQUIRED)
find_package(CUDA REQUIRED)

# setup CMake module path, defines path for include() and find_package()
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
Expand Down Expand Up @@ -97,12 +97,22 @@ if (DEFINED ENV{LIBTORCH_ROOT})
find_package(Torch REQUIRED HINTS "$ENV{LIBTORCH_ROOT}")
else()
include(FetchContent)
# download libtorch 2.1 with cuda 12.1 from pytorch.org
set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.1.0%2Bcu121.zip")
# Select the prebuilt libtorch 2.1 archive that matches the detected CUDA
# toolkit version (CUDA_VERSION is set by find_package(CUDA) above).
if(CUDA_VERSION VERSION_GREATER_EQUAL 12.1)
  # libtorch 2.1 built against CUDA 12.1, from pytorch.org
  set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.1.0%2Bcu121.zip")
elseif(CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
  # libtorch 2.1 built against CUDA 11.8, from pytorch.org
  set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.0%2Bcu118.zip")
else()
  # No prebuilt libtorch for older CUDA toolkits; fail fast with a clear error.
  message(FATAL_ERROR "Unsupported CUDA version: ${CUDA_VERSION}")
endif()
# Hoisted out of the branches above: both success paths printed this exact
# line, and the FATAL_ERROR branch aborts before reaching it.
message(STATUS "LIBTORCH_ROOT not found, downloading and using libtorch 2.1 for cuda ${CUDA_VERSION}")

FetchContent_Declare(libtorch URL ${LIBTORCH_URL})
FetchContent_MakeAvailable(libtorch)

message(STATUS "LIBTORCH_ROOT not found, downloading and using libtorch at ${libtorch_SOURCE_DIR}")
find_package(Torch REQUIRED PATHS ${libtorch_SOURCE_DIR} NO_DEFAULT_PATH)
endif()

Expand Down
7 changes: 4 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# ---- Build ----
FROM nvcr.io/nvidia/cuda:12.1.0-devel-ubuntu22.04 as build
# default base image: NVIDIA CUDA 12.1 on Ubuntu 22.04 (override with --build-arg BASE_IMAGE=...)
ARG BASE_IMAGE=nvcr.io/nvidia/cuda:12.1.0-devel-ubuntu22.04

ARG VERSION=main
# ---- Build ----
FROM $BASE_IMAGE as build

LABEL maintainer="mi@vectorch.com"

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ docker run -it --gpus=all --net=host --shm-size=1g \

> **Warning**<br />
> * The docker image with tag '[latest](https://hub.docker.com/r/vectorchai/scalellm/tags)' may point to a new version after each release. There is currently no automatic way to re-pull the latest image when a new release is published, so you'll need to manage the image version manually. All available images can be found [here](https://hub.docker.com/r/vectorchai/scalellm/tags?page=1&ordering=last_updated).
> * The docker image with tag '[latest](https://hub.docker.com/r/vectorchai/scalellm/tags)' is built with [CUDA 12.1](https://developer.nvidia.com/cuda-12-1-0-download-archive). If you want to use [CUDA 11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive), please use the image '[docker.io/vectorchai/scalellm_cu118:latest](https://hub.docker.com/r/vectorchai/scalellm_cu118)' instead.
> * NCCL might fall back to using the host memory if NVLink or PCI is not available. To allow NCCL to use the host memory, we added '--shm-size=1g' to the docker run command.
This command starts the Docker container with GPU support and various configuration options.
Expand Down

0 comments on commit 7f1679f

Please sign in to comment.