Commit 62936e3
CPU only build (vllm-project#9)
maktukmak authored and bigPYJ1151 committed Dec 29, 2023
1 parent e20ae23 commit 62936e3
Showing 11 changed files with 185 additions and 36 deletions.
3 changes: 3 additions & 0 deletions Makefile
@@ -28,6 +28,9 @@ sanitizer:
py_install:
	VLLM_BUILD_CPU_OPS=1 MAX_JOBS=$(JOBS) pip install --no-build-isolation -v -e .

py_install_cpu:
	VLLM_BUILD_CPU_ONLY=1 MAX_JOBS=$(JOBS) pip install --no-build-isolation -v -e .

package:
	VLLM_BUILD_CPU_OPS=1 MAX_JOBS=$(JOBS) python setup.py bdist_wheel
	echo "Wheel package is saved in ./dist/"
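With the new target in place, a CPU-only editable install is invoked as make py_install_cpu; assuming JOBS is the parallel-job variable these recipes expand (the $() expansion is reconstructed above), make py_install_cpu JOBS=8 would cap the build at 8 parallel jobs.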
77 changes: 77 additions & 0 deletions cpu.Dockerfile
@@ -0,0 +1,77 @@
FROM python:3.10 AS dev

RUN apt-get update -y \
&& apt-get install -y python3-pip

WORKDIR /workspace

# install build and runtime dependencies
COPY requirements-cpu.txt requirements-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-cpu.txt

# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt

# image to build pytorch extensions
FROM dev AS build

# install build dependencies
COPY requirements-build-cpu.txt requirements-build-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-build-cpu.txt

# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY requirements-cpu.txt requirements-cpu.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

# max jobs used by Ninja to build extensions
# (declare the build arg so $max_jobs resolves; supplied via --build-arg max_jobs=N)
ARG max_jobs
ENV MAX_JOBS=$max_jobs
RUN python3 setup.py build_ext --inplace

# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY tests tests
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "pytest", "tests"]

# runtime base image; no CUDA base is needed for the CPU-only build
FROM python:3.10 AS vllm-base

RUN apt-get update -y \
&& apt-get install -y python3-pip

WORKDIR /workspace
COPY requirements-cpu.txt requirements-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-cpu.txt

FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]

# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate fschat

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
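For reference, the runnable images above would typically be built with something like docker build -f cpu.Dockerfile --target vllm -t vllm-cpu . (or --target vllm-openai for the OpenAI-compatible server); the vllm-cpu tag is illustrative.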

4 changes: 4 additions & 0 deletions csrc/dispatch_utils.h
@@ -14,10 +14,14 @@
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

#ifdef VLLM_BUILD_CPU_ONLY
#define VLLM_DISPATCH_TO_CUDA_CASE(BASENAME, ...)
#else
#define VLLM_DISPATCH_TO_CUDA_CASE(BASENAME, ...) \
  case c10::DeviceType::CUDA: {                   \
    return BASENAME(__VA_ARGS__);                 \
  }
#endif

#ifdef VLLM_BUILD_CPU_OPS
#define VLLM_DISPATCH_TO_CPU_CASE(BASENAME, ...) \
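For orientation, here is a minimal sketch of how these per-device case macros compose into a dispatch switch such as the VLLM_DISPATCH_DEVICES call seen in csrc/pybind.cpp below. The wrapper definition itself is illustrative only (the real macro is outside the visible hunk); under VLLM_BUILD_CPU_ONLY the CUDA case expands to nothing, so the switch compiles without referencing any CUDA symbols:

#include <torch/extension.h>

// Illustrative stand-in for the real VLLM_DISPATCH_DEVICES macro.
// DEVICE is a c10::Device; BASENAME(args...) is the per-device kernel entry.
#define SKETCH_DISPATCH_DEVICES(DEVICE, BASENAME, ...)  \
  switch ((DEVICE).type()) {                            \
    VLLM_DISPATCH_TO_CUDA_CASE(BASENAME, __VA_ARGS__)   \
    VLLM_DISPATCH_TO_CPU_CASE(BASENAME, __VA_ARGS__)    \
    default:                                            \
      TORCH_CHECK(false, "Unsupported device type.");   \
  }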
6 changes: 6 additions & 0 deletions csrc/pybind.cpp
@@ -87,6 +87,12 @@ void gptq_shuffle_dispatch(
VLLM_DISPATCH_DEVICES(q_weight.device(), gptq_shuffle, q_weight, q_perm);
}

#ifdef VLLM_BUILD_CPU_ONLY
// CPU-only stub: keeps the symbol defined for the Python bindings even though
// no CUDA device attributes can be queried; returns a fixed placeholder value.
int get_device_attribute(
    int attribute,
    int device_id) { return 94387; }
#endif

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// vLLM custom ops
pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
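The registration that exposes get_device_attribute to Python sits further down in PYBIND11_MODULE, outside the visible hunk. Judging from vllm/utils.py (which does from vllm._C import cuda_utils), it presumably resembles this sketch:

// Sketch only; the actual registration lines are not part of this diff.
pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils");
cuda_utils.def("get_device_attribute", &get_device_attribute,
               "Device attribute query (stubbed under VLLM_BUILD_CPU_ONLY)");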
14 changes: 7 additions & 7 deletions Dockerfile → gpu.Dockerfile
@@ -6,9 +6,9 @@ RUN apt-get update -y \
WORKDIR /workspace

# install build and runtime dependencies
COPY requirements.txt requirements.txt
COPY requirements-gpu.txt requirements-gpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
pip install -r requirements-gpu.txt

# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
@@ -19,14 +19,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \
FROM dev AS build

# install build dependencies
COPY requirements-build.txt requirements-build.txt
COPY requirements-build-gpu.txt requirements-build-gpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-build.txt
pip install -r requirements-build-gpu.txt

# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY requirements.txt requirements.txt
COPY requirements-gpu.txt requirements-gpu.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

@@ -60,9 +60,9 @@ RUN apt-get update -y \
&& apt-get install -y python3-pip

WORKDIR /workspace
COPY requirements.txt requirements.txt
COPY requirements-gpu.txt requirements-gpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
pip install -r requirements-gpu.txt

FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
6 changes: 6 additions & 0 deletions requirements-build-cpu.txt
@@ -0,0 +1,6 @@
# Should be mirrored in pyproject.toml
ninja
packaging
setuptools>=49.4.0
torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.1.0%2Bcpu.cxx11.abi-cp310-cp310-linux_x86_64.whl#sha256=88f1ee550c6291af8d0417871fb7af84b86527d18bc02ac4249f07dcd84dda56 #2.1.0+cpu
wheel
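Note that this pins the cxx11-ABI CPU wheel of torch 2.1.0 for CPython 3.10, and requirements-cpu.txt below pins the identical wheel, so the torch used to compile the extension and the torch installed at runtime stay ABI-compatible.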
requirements-build.txt → requirements-build-gpu.txt: File renamed without changes.
14 changes: 14 additions & 0 deletions requirements-cpu.txt
@@ -0,0 +1,14 @@
ninja # For faster builds.
psutil
ray >= 2.5.1
pandas # Required for Ray data.
pyarrow # Required for Ray data.
sentencepiece # Required for LLaMA tokenizer.
numpy
einops # Required for phi-1_5
torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.1.0%2Bcpu.cxx11.abi-cp310-cp310-linux_x86_64.whl#sha256=88f1ee550c6291af8d0417871fb7af84b86527d18bc02ac4249f07dcd84dda56 #2.1.0+cpu
transformers >= 4.34.0 # Required for Mistral.
fastapi
uvicorn[standard]
pydantic == 1.10.13 # Required for OpenAI server.
aioprometheus[starlette]
requirements.txt → requirements-gpu.txt: File renamed without changes.
95 changes: 66 additions & 29 deletions setup.py
@@ -8,7 +8,13 @@
from packaging.version import parse, Version
import setuptools
import torch
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME

BUILD_CPU_ONLY = os.getenv('VLLM_BUILD_CPU_ONLY', "0") == "1"

if not BUILD_CPU_ONLY:
    from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
else:
    from torch.utils.cpp_extension import BuildExtension, CppExtension

ROOT_DIR = os.path.dirname(__file__)

@@ -21,11 +27,11 @@


def _is_hip() -> bool:
    return torch.version.hip is not None
    return torch.version.hip is not None and not BUILD_CPU_ONLY


def _is_cuda() -> bool:
    return torch.version.cuda is not None
    return torch.version.cuda is not None and not BUILD_CPU_ONLY


# Compiler flags.
@@ -86,7 +92,6 @@ def get_hipcc_rocm_version():
print("Could not find HIP version in the output")
return None


def get_nvcc_cuda_version(cuda_dir: str) -> Version:
    """Get the CUDA version from nvcc.
@@ -137,6 +142,19 @@ def get_torch_arch_list() -> Set[str]:
            stacklevel=2)
    return arch_list

if not BUILD_CPU_ONLY:
    # First, check the TORCH_CUDA_ARCH_LIST environment variable.
    compute_capabilities = get_torch_arch_list()
    if not compute_capabilities:
        # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
        # GPUs on the current machine.
        device_count = torch.cuda.device_count()
        for i in range(device_count):
            major, minor = torch.cuda.get_device_capability(i)
            if major < 7:
                raise RuntimeError(
                    "GPUs with compute capability below 7.0 are not supported.")
            compute_capabilities.add(f"{major}.{minor}")

# First, check the TORCH_CUDA_ARCH_LIST environment variable.
compute_capabilities = get_torch_arch_list()
@@ -211,9 +229,11 @@ def get_torch_arch_list() -> Set[str]:
            f"amdgpu_arch_found: {amd_arch}")

# Setup CPU Operations
BUILD_CPU_OPS = os.getenv('VLLM_BUILD_CPU_OPS', "0") == "1"
BUILD_CPU_OPS = (os.getenv('VLLM_BUILD_CPU_OPS', "0") == "1" or BUILD_CPU_ONLY)
CPU_OPS_SOURCES = []
if BUILD_CPU_OPS:
    if BUILD_CPU_ONLY:
        CXX_FLAGS += ["-DVLLM_BUILD_CPU_ONLY"]
    CXX_FLAGS += [
        "-DVLLM_BUILD_CPU_OPS", "-fopenmp", "-mavx512f", "-mavx512bf16",
        "-mavx512vl"
    ]
@@ -228,29 +248,42 @@ def get_torch_arch_list() -> Set[str]:

ext_modules = []

vllm_extension_sources = [
    "csrc/cache_kernels.cu",
    "csrc/attention/attention_kernels.cu",
    "csrc/pos_encoding_kernels.cu",
    "csrc/activation_kernels.cu",
    "csrc/layernorm_kernels.cu",
    "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
    "csrc/quantization/gptq/q_gemm.cu",
    "csrc/cuda_utils_kernels.cu",
    "csrc/pybind.cpp",
] + CPU_OPS_SOURCES
if not BUILD_CPU_ONLY:
    vllm_extension_sources = [
        "csrc/cache_kernels.cu",
        "csrc/attention/attention_kernels.cu",
        "csrc/pos_encoding_kernels.cu",
        "csrc/activation_kernels.cu",
        "csrc/layernorm_kernels.cu",
        "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
        "csrc/quantization/gptq/q_gemm.cu",
        "csrc/cuda_utils_kernels.cu",
        "csrc/pybind.cpp",
    ] + CPU_OPS_SOURCES

    if _is_cuda():
        vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")

    vllm_extension = CUDAExtension(
        name="vllm._C",
        sources=vllm_extension_sources,
        extra_compile_args={
            "cxx": CXX_FLAGS,
            "nvcc": NVCC_FLAGS,
        },
    )
else:
    vllm_extension_sources = [
        "csrc/pybind.cpp",
    ] + CPU_OPS_SOURCES
    vllm_extension = CppExtension(
        name="vllm._C",
        sources=vllm_extension_sources,
        extra_compile_args={
            "cxx": CXX_FLAGS,
        },
    )

if _is_cuda():
    vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")

vllm_extension = CUDAExtension(
    name="vllm._C",
    sources=vllm_extension_sources,
    extra_compile_args={
        "cxx": CXX_FLAGS,
        "nvcc": NVCC_FLAGS,
    },
)
ext_modules.append(vllm_extension)


@@ -280,7 +313,7 @@ def get_vllm_version() -> str:
        if hipcc_version != MAIN_CUDA_VERSION:
            rocm_version_str = hipcc_version.replace(".", "")[:3]
            version += f"+rocm{rocm_version_str}"
    else:
    elif _is_cuda():
        cuda_version = str(nvcc_cuda_version)
        if cuda_version != MAIN_CUDA_VERSION:
            cuda_version_str = cuda_version.replace(".", "")[:3]
@@ -303,9 +336,13 @@ def get_requirements() -> List[str]:
    if _is_hip():
        with open(get_path("requirements-rocm.txt")) as f:
            requirements = f.read().strip().split("\n")
    elif _is_cuda():
        with open(get_path("requirements-gpu.txt")) as f:
            requirements = f.read().strip().split("\n")
    else:
        with open(get_path("requirements.txt")) as f:
        with open(get_path("requirements-cpu.txt")) as f:
            requirements = f.read().strip().split("\n")

    return requirements


2 changes: 2 additions & 0 deletions vllm/utils.py
@@ -5,10 +5,12 @@

import psutil
import torch
import os

from vllm._C import cuda_utils



class Device(enum.Enum):
    GPU = enum.auto()
    CPU = enum.auto()
