Skip to content

Commit

Permalink
added support for cuda 11.8 docker image.
Browse files Browse the repository at this point in the history
  • Loading branch information
guocuimi committed Dec 3, 2023
1 parent 30ad6fc commit 7f1679f
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 12 deletions.
22 changes: 17 additions & 5 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,17 @@ jobs:
username: ${{ secrets.DOCKER_HUB_USER }}
password: ${{ secrets.DOCKER_HUB_TOKEN }}

- name: Build and push scalellm
- name: Build and push gateway
uses: docker/build-push-action@v5
with:
context: ./gateway
push: true
no-cache: true
tags: |
vectorchai/scalellm-gateway:${{ steps.tagName.outputs.tag }}
vectorchai/scalellm-gateway:latest
- name: Build and push scalellm for cuda 12.1
uses: docker/build-push-action@v5
with:
context: .
Expand All @@ -40,13 +50,15 @@ jobs:
vectorchai/scalellm:${{ steps.tagName.outputs.tag }}
vectorchai/scalellm:latest
- name: Build and push gateway
- name: Build and push scalellm for cuda 11.8
uses: docker/build-push-action@v5
with:
context: ./gateway
context: .
push: true
no-cache: true
build-args: |
BASE_IMAGE=nvcr.io/nvidia/cuda:11.8.0-devel-ubuntu22.04
tags: |
vectorchai/scalellm-gateway:${{ steps.tagName.outputs.tag }}
vectorchai/scalellm-gateway:latest
vectorchai/scalellm_cu118:${{ steps.tagName.outputs.tag }}
vectorchai/scalellm_cu118:latest
18 changes: 14 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ project(
"ScaleLLM"
LANGUAGES CXX CUDA
)
find_package(CUDA 12.1 REQUIRED)
find_package(CUDA REQUIRED)

# setup CMake module path, defines path for include() and find_package()
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
Expand Down Expand Up @@ -97,12 +97,22 @@ if (DEFINED ENV{LIBTORCH_ROOT})
find_package(Torch REQUIRED HINTS "$ENV{LIBTORCH_ROOT}")
else()
include(FetchContent)
# download libtorch 2.1 with cuda 12.1 from pytorch.org
set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.1.0%2Bcu121.zip")
# Select the prebuilt libtorch 2.1 archive that matches the detected CUDA
# toolkit version (CUDA_VERSION is set by find_package(CUDA) above).
if(CUDA_VERSION VERSION_GREATER_EQUAL 12.1)
  # libtorch 2.1 built against CUDA 12.1, from pytorch.org
  set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.1.0%2Bcu121.zip")
elseif(CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
  # libtorch 2.1 built against CUDA 11.8, from pytorch.org
  set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.0%2Bcu118.zip")
else()
  # No prebuilt libtorch for older CUDA toolkits; fail fast with a clear error.
  message(FATAL_ERROR "Unsupported CUDA version: ${CUDA_VERSION}")
endif()
# Hoisted out of the branches above: both success paths printed this exact
# line, and the FATAL_ERROR branch aborts before reaching it.
message(STATUS "LIBTORCH_ROOT not found, downloading and using libtorch 2.1 for cuda ${CUDA_VERSION}")

FetchContent_Declare(libtorch URL ${LIBTORCH_URL})
FetchContent_MakeAvailable(libtorch)

message(STATUS "LIBTORCH_ROOT not found, downloading and using libtorch at ${libtorch_SOURCE_DIR}")
find_package(Torch REQUIRED PATHS ${libtorch_SOURCE_DIR} NO_DEFAULT_PATH)
endif()

Expand Down
7 changes: 4 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# ---- Build ----
FROM nvcr.io/nvidia/cuda:12.1.0-devel-ubuntu22.04 as build
# default base image: NVIDIA CUDA 12.1 on Ubuntu 22.04 (override with --build-arg BASE_IMAGE=...)
ARG BASE_IMAGE=nvcr.io/nvidia/cuda:12.1.0-devel-ubuntu22.04

ARG VERSION=main
# ---- Build ----
FROM $BASE_IMAGE as build

LABEL maintainer="mi@vectorch.com"

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ docker run -it --gpus=all --net=host --shm-size=1g \

> **Warning**<br />
> * The docker image with tag '[latest](https://hub.docker.com/r/vectorchai/scalellm/tags)' may point to a new version after each release. There is currently no automatic way to re-pull the latest image when a new release is published, so you'll need to manage the image version manually. All available images can be found [here](https://hub.docker.com/r/vectorchai/scalellm/tags?page=1&ordering=last_updated).
> * The docker image with tag '[latest](https://hub.docker.com/r/vectorchai/scalellm/tags)' is built with [CUDA 12.1](https://developer.nvidia.com/cuda-12-1-0-download-archive). If you want to use [CUDA 11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive), please use the image '[docker.io/vectorchai/scalellm_cu118:latest](https://hub.docker.com/r/vectorchai/scalellm_cu118)' instead.
> * NCCL might fall back to using the host memory if NVLink or PCI is not available. To allow NCCL to use the host memory, we added '--shm-size=1g' to the docker run command.
This command starts the Docker container with GPU support and various configuration options.
Expand Down

0 comments on commit 7f1679f

Please sign in to comment.