Merge pull request vllm-project#9 from ROCm/main_upstream_candidate
Bringing in the latest commits from upstream after the FA Triton merge, FP8 KV merge, and layernorm optimizations merge.
mawong-amd authored Apr 10, 2024
2 parents eed9963 + d64f80a commit 3489ce7
Showing 202 changed files with 19,421 additions and 1,692 deletions.
9 changes: 6 additions & 3 deletions .buildkite/run-benchmarks.sh
@@ -23,8 +23,9 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend openai \
--dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
--backend vllm \
--dataset-name sharegpt \
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
--model meta-llama/Llama-2-7b-chat-hf \
--num-prompts 20 \
--endpoint /v1/completions \
@@ -48,7 +49,9 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines
echo '```' >> benchmark_results.md
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
echo '```' >> benchmark_results.md

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
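Note (not part of this diff): outside Buildkite, the updated serving benchmark can be reproduced against a locally started OpenAI-compatible server. A minimal sketch using only flags that appear in this script (the server startup command itself sits above the visible hunk and is assumed here):

# start the server in the background, wait until /v1/models responds, then benchmark
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done'
python3 benchmarks/benchmark_serving.py \
    --backend vllm \
    --dataset-name sharegpt \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
    --model meta-llama/Llama-2-7b-chat-hf \
    --num-prompts 20 \
    --endpoint /v1/completions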
14 changes: 14 additions & 0 deletions .buildkite/run-cpu-test.sh
@@ -0,0 +1,14 @@
# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t cpu-test -f Dockerfile.cpu .

# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py
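Note (not part of this diff): the same image can be reused for a quick local CPU sanity check. A minimal sketch, assuming VLLM_CPU_KVCACHE_SPACE is the CPU KV-cache budget in GiB and that mounting a local Hugging Face cache is wanted to avoid re-downloading model weights:

# build the CPU image, then run the bundled offline-inference example with an
# illustrative 4 GiB KV-cache budget and a host-mounted Hugging Face cache
docker build -t cpu-test -f Dockerfile.cpu .
docker run --rm --network host \
    --env VLLM_CPU_KVCACHE_SPACE=4 \
    -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
    cpu-test python3 examples/offline_inference.py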
30 changes: 21 additions & 9 deletions .buildkite/test-pipeline.yaml
@@ -12,29 +12,42 @@ steps:
command: pytest -v -s async_engine

- label: Basic Correctness Test
command: pytest -v -s --forked basic_correctness
command: pytest -v -s basic_correctness

- label: Core Test
command: pytest -v -s core

- label: Distributed Comm Ops Test
command: pytest -v -s --forked test_comm_ops.py
command: pytest -v -s test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only supports 1 or 2 for now.

- label: Distributed Tests
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only supports 1 or 2 for now.
commands:
- pytest -v -s --forked test_pynccl.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s --forked test_basic_distributed_correctness.py
- pytest -v -s test_pynccl.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py

- label: Engine Test
command: pytest -v -s engine tokenization test_sequence.py test_config.py

- label: Entrypoints Test
command: pytest -v -s entrypoints
commands:
# these tests have to be separated, because each one will allocate all possible GPU memory
- pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
- pytest -v -s entrypoints/test_server_oot_registration.py

- label: Examples Test
working_dir: "/vllm-workspace/examples"
commands:
# install aws cli for llava_example.py
- pip install awscli
- python3 offline_inference.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 llava_example.py

- label: Kernels Test %N
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -43,8 +56,7 @@ steps:
- label: Models Test
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models --ignore=models/test_llava.py --forked
soft_fail: true
- pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py

- label: Llava Test
commands:
@@ -81,7 +93,7 @@ steps:
- bash run-benchmarks.sh

- label: Documentation Build
working_dir: "/vllm-workspace/docs"
working_dir: "/vllm-workspace/test_docs/docs"
no_gpu: True
commands:
- pip install -r requirements-docs.txt
5 changes: 5 additions & 0 deletions .buildkite/test-template.j2
@@ -8,6 +8,9 @@ steps:
queue: amd
command: bash .buildkite/run-amd-test.sh

- label: "CPU Test"
command: bash .buildkite/run-cpu-test.sh

- label: ":docker: build image"
commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
@@ -53,6 +56,8 @@ steps:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
{% endif %}
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -49,7 +49,7 @@ jobs:
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11']
pytorch-version: ['2.1.2'] # Must be the most recent version that meets requirements.txt.
pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']

steps:
5 changes: 3 additions & 2 deletions .github/workflows/scripts/build.sh
@@ -9,12 +9,13 @@ LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

# Install requirements
$python_executable -m pip install wheel packaging
$python_executable -m pip install -r requirements.txt
$python_executable -m pip install -r requirements-cuda.txt

# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure punica is built for the release (for LoRA)
export VLLM_INSTALL_PUNICA_KERNELS=1

# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
27 changes: 22 additions & 5 deletions CMakeLists.txt
@@ -2,7 +2,10 @@ cmake_minimum_required(VERSION 3.21)

project(vllm_extensions LANGUAGES CXX)

option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")

message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)

@@ -16,7 +19,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")

# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")

#
# Supported/expected torch versions for CUDA/ROCm.
@@ -28,7 +31,7 @@ set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2")
set(TORCH_SUPPORTED_VERSION_CUDA "2.2.1")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")

@@ -70,10 +73,24 @@ find_package(Torch REQUIRED)
# config is used for standalone C++ binaries that link against torch).
# The `libtorch_python.so` library defines some of the glue code between
# torch/python via pybind and is required by VLLM extensions for this
# reason. So, add it by manually using `append_torchlib_if_found` from
# torch's cmake setup.
# reason. So, add it manually with `find_library` using torch's
# installed library path.
#
find_library(torch_python_LIBRARY torch_python PATHS
"${TORCH_INSTALL_PREFIX}/lib")

#
append_torchlib_if_found(torch_python)
# Forward the non-CUDA device extensions to external CMake scripts.
#
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
if (VLLM_TARGET_DEVICE STREQUAL "cpu")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
else()
message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
endif()
return()
endif()

#
# Set up GPU language and check the torch version and warn if it isn't
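Note (not part of this diff): the new VLLM_TARGET_DEVICE option is what routes non-CUDA, non-ROCm builds to cmake/cpu_extension.cmake. A minimal sketch of exercising it when configuring the extensions by hand, assuming Torch is made discoverable via CMAKE_PREFIX_PATH (the regular install path goes through setup.py rather than invoking CMake directly):

# point CMake at the installed torch package, then select the CPU backend
cmake -S . -B build \
    -DCMAKE_PREFIX_PATH="$(python3 -c 'import torch; print(torch.utils.cmake_prefix_path)')" \
    -DVLLM_TARGET_DEVICE=cpu
cmake --build build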
3 changes: 2 additions & 1 deletion CONTRIBUTING.md
@@ -21,7 +21,6 @@ Express your support on Twitter if vLLM aids you, or simply offer your appreciation
### Build from source

```bash
pip install -r requirements.txt
pip install -e . # This may take several minutes.
```

@@ -30,6 +29,8 @@ pip install -e . # This may take several minutes.
```bash
pip install -r requirements-dev.txt

# linting and formatting
bash format.sh
# Static type checking
mypy
# Unit tests
108 changes: 66 additions & 42 deletions Dockerfile
@@ -2,6 +2,7 @@
# to run the OpenAI compatible server.

#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
@@ -16,37 +17,46 @@ RUN ldconfig /usr/local/cuda-12.1/compat/
WORKDIR /workspace

# install build and runtime dependencies
COPY requirements.txt requirements.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
pip install -r requirements-cuda.txt

# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt

# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################


#################### EXTENSION BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM dev AS build

# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-build.txt

# copy input files
# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache

# files and directories related to build wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements.txt requirements.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
COPY vllm vllm

# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
@@ -56,7 +66,17 @@ ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

RUN python3 setup.py build_ext --inplace
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
python3 setup.py bdist_wheel --dist-dir=dist

# the `vllm_nccl` package must be installed from the source distribution.
# pip is too smart: it stores a wheel in the cache, and other CI jobs
# will directly use the wheel from the cache, which is not what we want.
# we need to remove it manually
RUN --mount=type=cache,target=/root/.cache/pip \
pip cache remove vllm_nccl*
#################### EXTENSION Build IMAGE ####################

#################### FLASH_ATTENTION Build IMAGE ####################
@@ -76,56 +96,60 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \

#################### FLASH_ATTENTION Build IMAGE ####################

#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# Install flash attention (from pre-built wheel)

RUN apt-get update -y \
&& apt-get install -y python3-pip git vim

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/

# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
pip install dist/*.whl --verbose

RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
--mount=type=cache,target=/root/.cache/pip \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
# ignore build dependencies installation because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################
#################### vLLM installation IMAGE ####################


#################### RUNTIME BASE IMAGE ####################
# We used base cuda image because pytorch installs its own cuda libraries.
# However pynccl depends on cuda libraries so we had to switch to the runtime image
# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

# libnccl required for ray
RUN apt-get update -y \
&& apt-get install -y python3-pip
ADD . /vllm-workspace/

WORKDIR /workspace
COPY requirements.txt requirements.txt
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt

# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
pip install -r requirements-dev.txt

#################### RUNTIME BASE IMAGE ####################
# docs require source code
# we hide them inside `test_docs/`, so that this source code
# will not be imported by other tests
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/

#################### TEST IMAGE ####################

#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer modelscope

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
ENV VLLM_USAGE_SOURCE production-docker-image

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
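Note (not part of this diff): with the Dockerfile now split into wheel-build, vllm-base, test, and vllm-openai stages, the API-server image can be built by targeting only the final stage. A sketch with illustrative values for the build args introduced above (the image tag is a placeholder):

# build only the OpenAI-compatible server image; max_jobs / nvcc_threads trade
# compile parallelism against memory use while the CUDA extensions build
docker build . --target vllm-openai \
    --build-arg max_jobs=8 \
    --build-arg nvcc_threads=2 \
    -t vllm-openai:dev

# the entrypoint is vllm.entrypoints.openai.api_server, so trailing args are server flags
docker run --gpus all -p 8000:8000 vllm-openai:dev --model meta-llama/Llama-2-7b-chat-hf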