diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 49ae838cf0690..fd60f5b6afeca 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -18,7 +18,13 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg # Run basic model test docker exec cpu-test bash -c " pip install pytest matplotlib einops transformers_stream_generator - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported + pytest -v -s tests/models -m \"not vlm\" \ + --ignore=tests/models/test_embedding.py \ + --ignore=tests/models/test_oot_registration.py \ + --ignore=tests/models/test_registry.py \ + --ignore=tests/models/test_jamba.py \ + --ignore=tests/models/test_mamba.py \ + --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported # online inference docker exec cpu-test bash -c " diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 62d3afb0212fd..c2818c38965ea 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -27,6 +27,7 @@ docker exec cpu-test bash -c " pytest -v -s tests/models/decoder_only/language \ --ignore=tests/models/test_fp8.py \ --ignore=tests/models/decoder_only/language/test_jamba.py \ + --ignore=tests/models/decoder_only/language/test_mamba.py \ --ignore=tests/models/decoder_only/language/test_granitemoe.py \ --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ccc5003e66beb..4c2fe41c739b1 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -121,7 +121,9 @@ steps: - vllm/core/ - tests/distributed - tests/spec_decode/e2e/test_integration_dist_tp4 + - tests/compile commands: + - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py @@ -231,14 +233,16 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_full_graph_smoke.py + - pytest -v -s compile/test_basic_correctness.py -- label: "PyTorch Fullgraph Test" # 18min - source_file_dependencies: - - vllm/ - - tests/compile - commands: - - pytest -v -s compile/test_full_graph.py +# TODO: re-write in comparison tests, and fix symbolic shape +# for quantization ops. +# - label: "PyTorch Fullgraph Test" # 18min +# source_file_dependencies: +# - vllm/ +# - tests/compile +# commands: +# - pytest -v -s compile/test_full_graph.py - label: Kernels Test %N # 1h each mirror_hardwares: [amd] @@ -343,10 +347,11 @@ steps: - pytest -v -s models/encoder_decoder/language - pytest -v -s models/encoder_decoder/vision_language +# This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test - #mirror_hardwares: [amd] optional: true commands: + - echo 'Testing custom models...' # PR authors can temporarily add commands below to test individual models # e.g. 
pytest -v -s models/encoder_decoder/vision_language/test_mllama.py # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* @@ -394,7 +399,7 @@ steps: - tests/distributed/ - vllm/compilation commands: - - pytest -v -s ./compile/test_full_graph_multi_gpu.py + - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' - TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e15f129719f8f..cd721971d01d6 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,19 +1,30 @@ # See https://help.github.com/articles/about-codeowners/ # for more info about CODEOWNERS file +# This lists cover the "core" components of vLLM that require careful review +/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +CMakeLists.txt @tlrmchlsmth @WoosukKwon + +# Test ownership /tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo /tests/test_inputs.py @DarkLight1337 @ywang96 -/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo +/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo /tests/models @DarkLight1337 @ywang96 /tests/multimodal @DarkLight1337 @ywang96 -/tests/prefix_caching @comaniac @KuntaiDu +/tests/prefix_caching @comaniac @KuntaiDu /tests/spec_decode @njhill @LiuXiaoxuanPKU -/tests/kernels @tlrmchlsmth @WoosukKwon +/tests/kernels @tlrmchlsmth @WoosukKwon /tests/quantization @mgoin @robertgshaw2-neuralmagic -/.buildkite/lm-eval-harness @mgoin @simon-mo +/.buildkite/lm-eval-harness @mgoin @simon-mo /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/multi_step @alexm-neuralmagic @SolitaryThinker @comaniac +/tests/multi_step @alexm-neuralmagic @comaniac /tests/weight_loading @mgoin @youkaichao /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 38e23651eefef..2a0e3239f58da 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-latest steps: - name: "Checkout" - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 with: fetch-depth: 0 diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml index 761cae8e33fbd..2e7c7f7f087af 100644 --- a/.github/workflows/add_label_automerge.yml +++ b/.github/workflows/add_label_automerge.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - 
name: Add label - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: script: | github.rest.issues.addLabels({ diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 4eec72b96622d..064af291009fa 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.11"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index d578d7c521402..22e3564779ad9 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 4cbe32bdf33bd..96549b3f99181 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,7 +21,7 @@ jobs: upload_url: ${{ steps.create_release.outputs.upload_url }} steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Extract branch info shell: bash @@ -30,7 +30,7 @@ jobs: - name: Create Release id: create_release - uses: "actions/github-script@v6" + uses: "actions/github-script@v7" env: RELEASE_TAG: ${{ env.release_tag }} with: @@ -54,7 +54,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup ccache uses: hendrikmuhs/ccache-action@v1.2 @@ -68,7 +68,7 @@ jobs: bash -x .github/workflows/scripts/env.sh - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index 99827756d2066..d1791c3bc865a 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Remind to run full CI on PR - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: script: | github.rest.issues.createComment({ diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 73ce56e9e6a2e..be73fb85ed1fa 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 5f24b5b90b513..eb728ae04dfc1 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -16,9 +16,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ 
matrix.python-version }} - name: Install dependencies diff --git a/CMakeLists.txt b/CMakeLists.txt index 4be524808a23a..3a424ad7b110f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,27 +144,32 @@ else() endif() -# -# For cuda we want to be able to control which architectures we compile for on -# a per-file basis in order to cut down on compile time. So here we extract -# the set of architectures we want to compile for and remove the from the -# CMAKE_CUDA_FLAGS so that they are not applied globally. -# if(VLLM_GPU_LANG STREQUAL "CUDA") + # + # For cuda we want to be able to control which architectures we compile for on + # a per-file basis in order to cut down on compile time. So here we extract + # the set of architectures we want to compile for and remove the from the + # CMAKE_CUDA_FLAGS so that they are not applied globally. + # clear_cuda_arches(CUDA_ARCH_FLAGS) extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}") message(STATUS "CUDA target architectures: ${CUDA_ARCHS}") + # Filter the target architectures by the supported supported archs + # since for some files we will build for all CUDA_ARCHS. + cuda_archs_loose_intersection(CUDA_ARCHS + "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}") + message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}") +else() + # + # For other GPU targets override the GPU architectures detected by cmake/torch + # and filter them by the supported versions for the current language. + # The final set of arches is stored in `VLLM_GPU_ARCHES`. + # + override_gpu_arches(VLLM_GPU_ARCHES + ${VLLM_GPU_LANG} + "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") endif() -# -# Override the GPU architectures detected by cmake/torch and filter them by -# the supported versions for the current language. -# The final set of arches is stored in `VLLM_GPU_ARCHES`. -# -override_gpu_arches(VLLM_GPU_ARCHES - ${VLLM_GPU_LANG} - "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") - # # Query torch for additional GPU compilation flags for the given # `VLLM_GPU_LANG`. diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 1803b38629002..b9134d4ae41cb 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -26,7 +26,8 @@ RUN pip install intel_extension_for_pytorch==2.4.0 WORKDIR /workspace -ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu +ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ pip install --upgrade pip && \ diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 292d1f37fbf3e..04999518b7138 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -176,9 +176,9 @@ def sample_sonnet_requests( # Sample the rest of lines per request. sampled_requests: List[Tuple[str, int, int]] = [] for _ in range(num_requests): - sampled_lines = "".join( - prefix_lines + - random.sample(poem_lines, num_input_lines - num_prefix_lines)) + num_lines_needed = num_input_lines - num_prefix_lines + sampled_lines = "".join(prefix_lines + + random.choices(poem_lines, k=num_lines_needed)) prompt = f"{base_prompt}{sampled_lines}" message = [ @@ -536,7 +536,7 @@ def process_one_metric( # E.g., "Time to First Token" metric_header: str, ): - # This function print and add statistics of the specified + # This function prints and adds statistics of the specified # metric. 
if metric_attribute_name not in selected_percentile_metrics: return diff --git a/csrc/quantization/machete/machete_pytorch.cu b/csrc/quantization/machete/machete_pytorch.cu index a27f1e7c83df9..ff037756f55ab 100644 --- a/csrc/quantization/machete/machete_pytorch.cu +++ b/csrc/quantization/machete/machete_pytorch.cu @@ -89,6 +89,10 @@ torch::Tensor prepack_B(torch::Tensor const& B, TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("machete_prepack_B", &prepack_B); m.impl("machete_gemm", &gemm); +} + +// use CatchAll since supported_schedules has no tensor arguments +TORCH_LIBRARY_IMPL(TORCH_EXTENSION_NAME, CatchAll, m) { m.impl("machete_supported_schedules", &supported_schedules); } diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 81287762d3c0a..cfd2dcb3bd5d3 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -1,32 +1,53 @@ .. _debugging: +=============== Debugging Tips =============== -Debugging hang/crash issues ---------------------------- +This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible. + +.. note:: + + Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. -When an vLLM instance hangs or crashes, it is very difficult to debug the issue. But wait a minute, it is also possible that vLLM is doing something that indeed takes a long time: +Hangs downloading a model +---------------------------------------- +If the model isn't already downloaded to disk, vLLM will download it from the internet which can take time and depend on your internet connection. +It's recommended to download the model first using the `huggingface-cli `_ and passing the local path to the model to vLLM. This way, you can isolate the issue. -- **Downloading a model**: Do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface-cli `_ and then use the local path to the model. This way, you can isolate the issue. -- **Loading the model from disk**: If the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory. -- **Tensor parallel inference**: If the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). 
You can convert the model checkpoint to a sharded checkpoint using `the provided script `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +Hangs loading a model from disk +---------------------------------------- +If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. +It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. -If you have already taken care of the above issues, but the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue: +Model is too large +---------------------------------------- +If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. -- Set the environment variable ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging. -- Set the environment variable ``export CUDA_LAUNCH_BLOCKING=1`` to know exactly which CUDA kernel is causing the trouble. -- Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. -- Set the environment variable ``export VLLM_TRACE_FUNCTION=1``. All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs. +Enable more logging +---------------------------------------- +If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue: -With more logging, hopefully you can find the root cause of the issue. +- ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging. +- ``export CUDA_LAUNCH_BLOCKING=1`` to identify which CUDA kernel is causing the problem. +- ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. +- ``export VLLM_TRACE_FUNCTION=1`` to record all function calls for inspection in the log files to tell which function crashes or hangs. -If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error. +Incorrect network setup +---------------------------------------- +The vLLM instance cannot get the correct IP address if you have a complicated network config. 
You can find a log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl`` and the IP address should be the correct one. +If it's not, override the IP address using the environment variable ``export VLLM_HOST_IP=``. -Here are some common issues that can cause hangs: +You might also need to set ``export NCCL_SOCKET_IFNAME=`` and ``export GLOO_SOCKET_IFNAME=`` to specify the network interface for the IP address. -- **Incorrect network setup**: The vLLM instance cannot get the correct IP address if you have complicated network config. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. You might also need to set ``export NCCL_SOCKET_IFNAME=your_network_interface`` and ``export GLOO_SOCKET_IFNAME=your_network_interface`` to specify the network interface for the IP address. -- **Incorrect hardware/driver**: GPU/CPU communication cannot be established. You can run the following sanity check script to see if the GPU/CPU communication is working correctly. +Error near ``self.graph.replay()`` +---------------------------------------- +If vLLM crashes and the error trace captures it somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a CUDA error inside CUDAGraph. +To identify the particular CUDA operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. + +Incorrect hardware/driver +---------------------------------------- +If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. .. code-block:: python @@ -84,33 +105,29 @@ Here are some common issues that can cause hangs: dist.destroy_process_group(gloo_group) dist.destroy_process_group() -.. tip:: +If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use: - Save the script as ``test.py``. - - If you are testing in a single-node, run it with ``NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py``, adjust ``--nproc-per-node`` to the number of GPUs you want to use. - - If you are testing with multi-nodes, run it with ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py``. Adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup. Make sure ``MASTER_ADDR``: - - - is the correct IP address of the master node - - is reachable from all nodes - - is set before running the script. +.. code-block:: shell - If the script runs successfully, you should see the message ``sanity check is successful!``. + NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py - Note that multi-node environment is more complicated than single-node. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. 
In that case, you can manually assign node rank and specify the IP via command line arguments: +If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run: - - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``. - - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``. +.. code-block:: shell + + NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py - Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup. The difference is that you need to execute different commands (with different ``--node-rank``) on different nodes. +If the script runs successfully, you should see the message ``sanity check is successful!``. -If the problem persists, feel free to `open an issue on GitHub `_, with a detailed description of the issue, your environment, and the logs. +.. note:: -Some known issues: + A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: -- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can cause hangs at a low probability (once in about 20 times, depending on the machine configuration). The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_ . + - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``. + - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``. -.. warning:: + Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup, being sure to execute different commands (with different ``--node-rank``) on different nodes. - After you find the root cause and solve the issue, remember to turn off all the debugging environment variables defined above, or simply start a new shell to avoid being affected by the debugging settings. If you don't do this, the system might be slow because many debugging functionalities are turned on. +Known Issues +---------------------------------------- +- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_. diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 2e6f6cdd163ce..99c695ac4ddb1 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -84,6 +84,8 @@ Latest code can contain bugs and may not be stable. Please use it with caution. Build from source ================== +.. _python-only-build: + Python-only build (without compilation) ---------------------------------------- @@ -114,6 +116,23 @@ The script will: Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM. 
+Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script `_ with the ``--quit-dev``(or ``-q`` for short) flag: + +.. code-block:: console + + $ python python_only_dev.py --quit-dev + +The script with ``--quit-dev`` flag will: + +* Remove the symbolic link from the current directory to the vLLM package. +* Restore the original vLLM package from the backup. + +If you update the vLLM wheel and want to rebuild from the source and make further edits, you will need to start `all above <#python-only-build>`_ over again. + +.. note:: + + There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. + It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the above section <#install-the-latest-code>`_ for instructions on how to install a specified wheel. Full build (with compilation) --------------------------------- diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst index a9ed4d7fa2cd7..ec99fc013057b 100644 --- a/docs/source/getting_started/neuron-installation.rst +++ b/docs/source/getting_started/neuron-installation.rst @@ -27,6 +27,10 @@ Installation steps: .. _build_from_source_neuron: +.. note:: + + The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. + Build from source ----------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 961373eb71c0b..d20e46b4a3656 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -86,6 +86,7 @@ Documentation serving/usage_stats serving/integrations serving/tensorizer + serving/compatibility_matrix serving/faq .. toctree:: diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index fa1003874033e..ae09259c0756c 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -99,7 +99,7 @@ This method should load the weights from the HuggingFace's checkpoint file and a 5. Register your model ---------------------- -Finally, register your :code:`*ForCausalLM` class to the :code:`_MODELS` in `vllm/model_executor/models/registry.py `_. +Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py `_. 6. Out-of-Tree Model Integration -------------------------------------------- diff --git a/docs/source/models/performance.rst b/docs/source/models/performance.rst index d8750ddc34e8e..23b5ab79a7378 100644 --- a/docs/source/models/performance.rst +++ b/docs/source/models/performance.rst @@ -22,6 +22,8 @@ If you frequently encounter preemptions from the vLLM engine, consider the follo You can also monitor the number of preemption requests through Prometheus metrics exposed by the vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False. +.. _chunked-prefill: + Chunked Prefill --------------- vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests. 
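For reference, a minimal sketch (not part of the patch itself, using a placeholder model) of how the chunked prefill feature described in ``performance.rst`` above is switched on through the offline ``LLM`` entry point; the ``enable_chunked_prefill``, ``max_num_batched_tokens``, and ``max_num_seqs`` parameters are the same ones exercised by the new Mamba tests added later in this diff:

.. code-block:: python

    from vllm import LLM, SamplingParams

    # Chunked prefill splits a large prefill into smaller chunks and lets the
    # scheduler batch those chunks together with ongoing decode requests.
    llm = LLM(
        model="facebook/opt-125m",       # placeholder model for illustration
        enable_chunked_prefill=True,
        max_num_batched_tokens=2048,     # token budget per scheduler step
        max_num_seqs=64,                 # max sequences batched per step
    )
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(temperature=0.0, max_tokens=32))
    print(outputs[0].outputs[0].text)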
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index ec64a82de84d4..bf86a72e20b57 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -152,6 +152,11 @@ Text Generation - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc. - ✅︎ - ✅︎ + * - :code:`MambaForCausalLM` + - Mamba + - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc. + - ✅︎ + - * - :code:`MiniCPMForCausalLM` - MiniCPM - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc. @@ -346,6 +351,12 @@ Text Generation - :code:`adept/fuyu-8b` etc. - - ✅︎ + * - :code:`ChatGLMModel` + - GLM-4V + - Image + - :code:`THUDM/glm-4v-9b` etc. + - + - ✅︎ * - :code:`InternVLChatModel` - InternVL2 - Image\ :sup:`E+` diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst new file mode 100644 index 0000000000000..cac0605ca132b --- /dev/null +++ b/docs/source/serving/compatibility_matrix.rst @@ -0,0 +1,427 @@ +.. _compatibility_matrix: + +Compatibility Matrix +==================== + +The tables below show mutually exclusive features and the support on some hardware. + +.. note:: + + Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. + +Feature x Feature +----------------- + + +.. raw:: html + + + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - Feature + - :ref:`CP ` + - :ref:`APC ` + - :ref:`LoRA ` + - :abbr:`prmpt adptr (Prompt Adapter)` + - :ref:`SD ` + - CUDA graph + - :abbr:`enc-dec (Encoder-Decoder Models)` + - :abbr:`logP (Logprobs)` + - :abbr:`prmpt logP (Prompt Logprobs)` + - :abbr:`async output (Async Output Processing)` + - multi-step + - :abbr:`MM (Multimodal)` + - best-of + - beam-search + - :abbr:`guided dec (Guided Decoding)` + * - :ref:`CP ` + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`APC ` + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`LoRA ` + - `✗ `__ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :abbr:`prmpt adptr (Prompt Adapter)` + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`SD ` + - ✗ + - ✅ + - ✗ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + * - :abbr:`enc-dec (Encoder-Decoder Models)` + - ✗ + - `✗ `__ + - ✗ + - ✗ + - `✗ `__ + - ✅ + - + - + - + - + - + - + - + - + - + * - :abbr:`logP (Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + * - :abbr:`prmpt logP (Prompt Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + * - :abbr:`async output (Async Output Processing)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - ✅ + - + - + - + - + - + - + * - multi-step + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - `✗ `__ + - ✅ + - + - + - + - + - + * - :abbr:`MM (Multimodal)` + - `✗ `__ + - `✗ `__ + - `✗ `__ + - ? + - ? + - ✅ + - ✗ + - ✅ + - ✅ + - ✅ + - ? + - + - + - + - + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + - `✗ `__ + - ✅ + - + - + - + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + - `✗ `__ + - ? 
+ - ✅ + - + - + * - :abbr:`guided dec (Guided Decoding)` + - ✅ + - ✅ + - ? + - ? + - ✅ + - ✅ + - ? + - ✅ + - ✅ + - ✅ + - ✗ + - ? + - ✅ + - ✅ + - + + +Feature x Hardware +^^^^^^^^^^^^^^^^^^ + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - Feature + - Volta + - Turing + - Ampere + - Ada + - Hopper + - CPU + - AMD + * - :ref:`CP ` + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :ref:`APC ` + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :ref:`LoRA ` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :abbr:`prmpt adptr (Prompt Adapter)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :ref:`SD ` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :abbr:`enc-dec (Encoder-Decoder Models)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✗ + * - :abbr:`logP (Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`prmpt logP (Prompt Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`async output (Async Output Processing)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✗ + * - multi-step + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :abbr:`MM (Multimodal)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`guided dec (Guided Decoding)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 5dd539c3d5ee4..8d6818e7dfd3e 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -300,6 +300,21 @@ def run_mllama(question: str, modality: str): return llm, prompt, stop_token_ids +# GLM-4v +def run_glm4v(question: str, modality: str): + assert modality == "image" + model_name = "THUDM/glm-4v-9b" + + llm = LLM(model=model_name, + max_model_len=2048, + max_num_seqs=2, + trust_remote_code=True, + enforce_eager=True) + prompt = question + stop_token_ids = [151329, 151336, 151338] + return llm, prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -316,6 +331,7 @@ def run_mllama(question: str, modality: str): "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, "mllama": run_mllama, + "glm4v": run_glm4v, } diff --git a/python_only_dev.py b/python_only_dev.py index d84122280a3c2..72d4e78ee14f6 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -1,10 +1,20 @@ # enable python only development # copy compiled files to the current directory directly +import argparse import os import shutil import subprocess import sys +import warnings + +parser = argparse.ArgumentParser( + description="Development mode for python-only code") +parser.add_argument('-q', + '--quit-dev', + action='store_true', + help='Set the flag to quit development mode') +args = parser.parse_args() # cannot directly `import vllm` , because it will try to # import from the current directory @@ -37,18 +47,46 @@ # "vllm/_version.py", # not available in nightly wheels yet ] -for file in files_to_copy: - src = os.path.join(package_path, file) - dst = file - print(f"Copying {src} to {dst}") - shutil.copyfile(src, dst) +# Try to create _version.py to avoid version related warning +# Refer to https://github.com/vllm-project/vllm/pull/8771 +try: + from setuptools_scm import get_version + get_version(write_to="vllm/_version.py") +except ImportError: + warnings.warn( + "To avoid warnings related to vllm._version, " + "you should 
install setuptools-scm by `pip install setuptools-scm`", + stacklevel=2) + +if not args.quit_dev: + for file in files_to_copy: + src = os.path.join(package_path, file) + dst = file + print(f"Copying {src} to {dst}") + shutil.copyfile(src, dst) + + pre_built_vllm_path = os.path.join(package_path, "vllm") + tmp_path = os.path.join(package_path, "vllm_pre_built") + current_vllm_path = os.path.join(cwd, "vllm") + + print(f"Renaming {pre_built_vllm_path} to {tmp_path} for backup") + os.rename(pre_built_vllm_path, tmp_path) -pre_built_vllm_path = os.path.join(package_path, "vllm") -tmp_path = os.path.join(package_path, "vllm_pre_built") -current_vllm_path = os.path.join(cwd, "vllm") + print(f"Linking {current_vllm_path} to {pre_built_vllm_path}") + os.symlink(current_vllm_path, pre_built_vllm_path) +else: + vllm_symlink_path = os.path.join(package_path, "vllm") + vllm_backup_path = os.path.join(package_path, "vllm_pre_built") + current_vllm_path = os.path.join(cwd, "vllm") -print(f"Renaming {pre_built_vllm_path} to {tmp_path}") -os.rename(pre_built_vllm_path, tmp_path) + print(f"Unlinking {current_vllm_path} to {vllm_symlink_path}") + assert os.path.islink( + vllm_symlink_path + ), f"not in dev mode: {vllm_symlink_path} is not a symbolic link" + assert current_vllm_path == os.readlink( + vllm_symlink_path + ), "current directory is not the source code of package" + os.unlink(vllm_symlink_path) -print(f"linking {current_vllm_path} to {pre_built_vllm_path}") -os.symlink(current_vllm_path, pre_built_vllm_path) + print(f"Recovering backup from {vllm_backup_path} to {vllm_symlink_path}") + os.rename(vllm_backup_path, vllm_symlink_path) diff --git a/requirements-openvino.txt b/requirements-openvino.txt index 800d59e2b9483..ac54cf0c3288f 100644 --- a/requirements-openvino.txt +++ b/requirements-openvino.txt @@ -1,8 +1,8 @@ # Common dependencies -r requirements-common.txt -# OpenVINO dependencies -torch >= 2.1.2 -openvino ~= 2024.4.0 -openvino-tokenizers[transformers] ~= 2024.4.0 -optimum-intel[openvino] >= 1.19.0 +torch == 2.4.0 # should be aligned with "common" vLLM torch version +openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention + +optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version +optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py new file mode 100644 index 0000000000000..b6ec7413978f4 --- /dev/null +++ b/tests/compile/test_basic_correctness.py @@ -0,0 +1,48 @@ +from typing import Dict, List, Optional + +import pytest + +from vllm.compilation.levels import CompilationLevel +from vllm.utils import cuda_device_count_stateless + +from ..utils import compare_all_settings + + +# we cannot afford testing the full Catesian product +# of all models and all levels +@pytest.mark.parametrize( + "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph", + [ + ("meta-llama/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate", + True), + ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", + ["--quantization", "compressed-tensors" + ], 1, 1, "FLASH_ATTN", "generate", True), + ("google/gemma-2-2b-it", [], 1, 2, "FLASHINFER", "generate", True), + # TODO: add multi-modality test for llava + ("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False) + ]) +def 
test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend, + method, fullgraph): + # this test is run under multiple suits, with different GPUs. + # make sure we only run the test with correct CUDA devices. + # don't use "<", as it will duplicate the tests. + if cuda_device_count_stateless() != pp_size * tp_size: + pytest.skip("Not correct CUDA devices for the test.") + import os + os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend + if not fullgraph: + os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" + all_args = [["--enforce-eager"] + model_args + ["--max_model_len", "1024"] + + ["-pp", str(pp_size)] + ["-tp", str(tp_size)]] * 3 + # don't test VLLM_TORCH_COMPILE_LEVEL == 3 case + # inductor will change the output, so we cannot compare them. + all_envs: List[Optional[Dict[str, str]]] = [{ + "VLLM_TORCH_COMPILE_LEVEL": + str(level) + } for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.DYNAMO_AS_IS, + CompilationLevel.DYNAMO_ONCE, + ]] + compare_all_settings(model, all_args, all_envs, method=method) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 5dd65ad7236f9..f28f9145bb442 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,13 +1,20 @@ import pytest -from vllm.compilation.backends import vllm_backend +from vllm.compilation.levels import CompilationLevel +from ..utils import fork_new_process_for_each_test from .utils import TEST_MODELS, check_full_graph_support @pytest.mark.parametrize("model_info", TEST_MODELS) -@pytest.mark.parametrize("backend", ["eager", vllm_backend]) -def test_full_graph(model_info, backend): +@pytest.mark.parametrize( + "optimization_level", + [CompilationLevel.DYNAMO_ONCE, CompilationLevel.INDUCTOR]) +@fork_new_process_for_each_test +def test_full_graph(model_info, optimization_level): model = model_info[0] model_kwargs = model_info[1] - check_full_graph_support(model, model_kwargs, backend, tp_size=1) + check_full_graph_support(model, + model_kwargs, + optimization_level, + tp_size=1) diff --git a/tests/compile/test_full_graph_multi_gpu.py b/tests/compile/test_full_graph_multi_gpu.py deleted file mode 100644 index e9883d5254e72..0000000000000 --- a/tests/compile/test_full_graph_multi_gpu.py +++ /dev/null @@ -1,22 +0,0 @@ -import pytest - -from vllm.compilation.backends import vllm_backend -from vllm.utils import cuda_device_count_stateless - -from ..utils import fork_new_process_for_each_test -from .utils import TEST_MODELS_SMOKE, check_full_graph_support - - -@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE) -@pytest.mark.parametrize("tp_size", [2]) -@pytest.mark.parametrize("backend", ["eager", vllm_backend]) -@fork_new_process_for_each_test -def test_full_graph_multi_gpu(model_info, tp_size, backend): - model = model_info[0] - model_kwargs = model_info[1] - - # Skip the test if there are not enough CUDA devices. 
- if cuda_device_count_stateless() < tp_size: - pytest.skip("Not enough CUDA devices for the test.") - - check_full_graph_support(model, model_kwargs, backend, tp_size=tp_size) diff --git a/tests/compile/test_full_graph_smoke.py b/tests/compile/test_full_graph_smoke.py deleted file mode 100644 index 0c5a95b4ead4c..0000000000000 --- a/tests/compile/test_full_graph_smoke.py +++ /dev/null @@ -1,13 +0,0 @@ -import pytest - -from vllm.compilation.backends import vllm_backend - -from .utils import TEST_MODELS_SMOKE, check_full_graph_support - - -@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE) -@pytest.mark.parametrize("backend", ["eager", vllm_backend]) -def test_full_graph(model_info, backend): - model = model_info[0] - model_kwargs = model_info[1] - check_full_graph_support(model, model_kwargs, backend, tp_size=1) diff --git a/tests/compile/utils.py b/tests/compile/utils.py index 2d06a0946d911..5386eb0e3795d 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -4,16 +4,9 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.plugins import set_torch_compile_backend +from vllm.compilation.levels import CompilationLevel from vllm.utils import is_hip -TEST_MODELS_SMOKE = [ - ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", { - "quantization": "compressed-tensors" - }), - ("meta-llama/Meta-Llama-3-8B", {}), -] - TEST_MODELS = [ ("facebook/opt-125m", {}), ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { @@ -68,20 +61,21 @@ })) -def check_full_graph_support(model, model_kwargs, backend, tp_size=1): +def check_full_graph_support(model, + model_kwargs, + optimization_level, + tp_size=1): # make sure these models can be captured in full graph mode - if "VLLM_TEST_DYNAMO_GRAPH_CAPTURE" not in os.environ: - os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1" - os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level) + os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" # Inductor doesn't support fp8/gptq_marlin_24 yet. 
quantization = model_kwargs.get("quantization") if (quantization == "fp8" or quantization == "gptq_marlin" - or quantization == "gptq_marlin_24") and backend != "eager": + or quantization == "gptq_marlin_24" + ) and optimization_level >= CompilationLevel.INDUCTOR: return - set_torch_compile_backend(backend) - prompts = [ "Hello, my name is", "The president of the United States is", diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 5e9a9f8ab7d4d..6cb74eb78cbf0 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -70,7 +70,6 @@ async def client(server): [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), ("_count", _NUM_REQUESTS)], "vllm:request_params_n": [("_count", _NUM_REQUESTS)], - "vllm:request_params_best_of": [("_count", _NUM_REQUESTS)], "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], "vllm:generation_tokens": @@ -151,9 +150,6 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): "vllm:request_params_n_sum", "vllm:request_params_n_bucket", "vllm:request_params_n_count", - "vllm:request_params_best_of_sum", - "vllm:request_params_best_of_bucket", - "vllm:request_params_best_of_count", "vllm:num_preemptions_total", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index c1fb45955a0e5..f471dcee938be 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -20,22 +20,22 @@ def test_env(name: str, device: str, monkeypatch): if device == "cpu": with patch("vllm.attention.selector.is_cpu", return_value=True): - backend = which_attn_to_use(8, 16, 8, None, torch.float16, - torch.float16, 16) + backend = which_attn_to_use(16, None, torch.float16, torch.float16, + 16, False) assert backend.name == "TORCH_SDPA" elif device == "hip": with patch("vllm.attention.selector.is_hip", return_value=True): - backend = which_attn_to_use(8, 16, 8, None, torch.float16, - torch.float16, 16) + backend = which_attn_to_use(16, None, torch.float16, torch.float16, + 16, False) assert backend.name == "ROCM_FLASH" elif device == "openvino": with patch("vllm.attention.selector.is_openvino", return_value=True): - backend = which_attn_to_use(8, 16, 8, None, torch.float16, - torch.float16, 16) + backend = which_attn_to_use(16, None, torch.float16, torch.float16, + 16, False) assert backend.name == "OPENVINO" else: - backend = which_attn_to_use(8, 16, 8, None, torch.float16, - torch.float16, 16) + backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16, + False) assert backend.name == name @@ -46,32 +46,37 @@ def test_flash_attn(monkeypatch): # Unsupported CUDA arch with patch("torch.cuda.get_device_capability", return_value=(7, 5)): - backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) + backend = which_attn_to_use(16, None, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported data type - backend = which_attn_to_use(8, 16, 8, None, torch.float8_e4m3fn, None, 16) + backend = which_attn_to_use(16, None, torch.float8_e4m3fn, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported kv cache data type - backend = which_attn_to_use(8, 16, 8, None, torch.float16, "fp8", 16) + backend = which_attn_to_use(16, None, torch.float16, "fp8", 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported block size - backend = 
which_attn_to_use(8, 16, 8, None, torch.float16, None, 8) + backend = which_attn_to_use(16, None, torch.float16, None, 8, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported sliding window - backend = which_attn_to_use(8, 16, 8, 1, torch.float16, None, 16) + backend = which_attn_to_use(16, 1, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # flash-attn is not installed with patch.dict('sys.modules', {'vllm_flash_attn': None}): - backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) + backend = which_attn_to_use(16, None, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported head size - backend = which_attn_to_use(8, 17, 8, None, torch.float16, None, 16) + backend = which_attn_to_use(17, None, torch.float16, None, 16, False) + assert backend.name != STR_FLASH_ATTN_VAL + + # Attention-free models should bypass env and use PlaceholderAttention + backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16, + True) assert backend.name != STR_FLASH_ATTN_VAL @@ -79,4 +84,4 @@ def test_invalid_env(monkeypatch): """Throw an exception if the backend name is invalid.""" override_backend_env_variable(monkeypatch, STR_INVALID_VAL) with pytest.raises(ValueError): - which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) + which_attn_to_use(16, None, torch.float16, None, 16, False) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index da98fac99cf22..405c0d0efad65 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -199,6 +199,11 @@ def baichuan_zero_lora_files(): return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init") +@pytest.fixture(scope="session") +def baichuan_regex_lora_files(): + return snapshot_download(repo_id="jeeejeee/baichuan-7b-lora-zero-regex") + + @pytest.fixture(scope="session") def minicpmv_lora_files(): return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon") diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 3514dcb7aedf4..9a529e27b4cd8 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -5,7 +5,9 @@ from vllm.lora.models import LoRAModel from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM -lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"] +lora_lst = [ + "baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b" +] @pytest.mark.parametrize("lora_name", lora_lst) @@ -13,6 +15,7 @@ def test_load_checkpoints( lora_name, baichuan_lora_files, baichuan_zero_lora_files, + baichuan_regex_lora_files, chatglm3_lora_files, ): supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules @@ -36,7 +39,7 @@ def test_load_checkpoints( embedding_modules=embedding_modules, embedding_padding_modules=embed_padding_modules) elif lora_name == "baichuan7B-zero": - #Test that the target_modules contain prefix + # Test that the target_modules contain prefix # such as "model.layers.0.self_atten.W_pack", and # the test should pass. LoRAModel.from_local_checkpoint( @@ -46,6 +49,16 @@ def test_load_checkpoints( device="cpu", embedding_modules=embedding_modules, embedding_padding_modules=embed_padding_modules) + elif lora_name == "baichuan7B-zero-regex": + # Test that the `target_modules` in the form of regular expressions, + # such as `model\\..*(W_pack|o_proj)`, and the test should pass. 
+ LoRAModel.from_local_checkpoint( + baichuan_regex_lora_files, + expected_lora_modules, + lora_model_id=1, + device="cpu", + embedding_modules=embedding_modules, + embedding_padding_modules=embed_padding_modules) else: # For the baichuan7B model, load chatglm3-6b's LoRA, # and the test should raise the following error. diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 23a7a85580a0a..f1003221ab518 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -326,7 +326,6 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool, "vllm:e2e_request_latency_seconds", "vllm:request_prompt_tokens", "vllm:request_generation_tokens", - "vllm:request_params_best_of", "vllm:request_params_n", ] for metric_name in request_histogram_metrics: diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py new file mode 100644 index 0000000000000..c27bf6a60a4f4 --- /dev/null +++ b/tests/models/decoder_only/language/test_mamba.py @@ -0,0 +1,295 @@ +"""Compare the outputs of HF and vLLM when using greedy sampling for Mamba. + +Run `pytest tests/models/test_mamba.py`. +""" +import pytest +from transformers import AutoModelForCausalLM, AutoTokenizer + +from vllm.sampling_params import SamplingParams +from vllm.worker.model_runner import _get_graph_batch_size + +from ...utils import check_outputs_equal + +MODELS = ["state-spaces/mamba-130m-hf"] + + +# Use lower-level interfaces to create this greedy generator, as mamba will +# choke on the model_kwarg 'attention_mask' if hf_model.generate_greedy is used. +def generate_greedy(model_name, example_prompts, max_tokens): + # Create a text generation pipeline + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(model_name) + + # Generate texts from the prompts + outputs = [] + for prompt in example_prompts: + # Tokenize the input prompt with truncation + inputs = tokenizer(prompt, return_tensors="pt", truncation=True) + input_ids = inputs["input_ids"].to(model.device) + + # Generate text using the model's generate method directly + generated_ids = model.generate(input_ids, max_new_tokens=max_tokens) + generated_text = tokenizer.decode(generated_ids[0], + skip_special_tokens=True) + + outputs.append((generated_ids[0].tolist(), generated_text)) + + return outputs + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [96]) +def test_models( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + hf_outputs = generate_greedy(model, example_prompts, max_tokens) + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + for i in range(len(example_prompts)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_outputs[i] + assert hf_output_str == vllm_output_str, ( + f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") + assert hf_output_ids == vllm_output_ids, ( + f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [96]) +def test_batching( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + # To pass the small model tests, we need full precision. 
+ for_loop_outputs = [] + with vllm_runner(model, dtype=dtype) as vllm_model: + for prompt in example_prompts: + for_loop_outputs.append( + vllm_model.generate_greedy([prompt], max_tokens)[0]) + + batched_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) + + check_outputs_equal( + outputs_0_lst=for_loop_outputs, + outputs_1_lst=batched_outputs, + name_0="for_loop_vllm", + name_1="batched_vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [10]) +def test_chunked_prefill_with_parallel_sampling(vllm_runner, example_prompts, + model: str, dtype: str, + max_tokens: int) -> None: + # Tests chunked prefill in conjunction with n>1. In this case, prefill is + # populated with decoding tokens and we test that it doesn't fail. + # This test might fail if cache is not allocated correctly for n > 1 + # decoding steps inside a chunked prefill forward pass (where we have both + # prefill and decode together ) + sampling_params = SamplingParams(n=3, + temperature=1, + seed=0, + max_tokens=max_tokens) + with vllm_runner( + model, + dtype=dtype, + enable_chunked_prefill=True, + max_num_batched_tokens=30, + max_num_seqs=10 # forces prefill chunks with decoding + ) as vllm_model: + vllm_model.generate(example_prompts, sampling_params) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) +def test_chunked_prefill(vllm_runner, example_prompts, model: str, dtype: str, + max_tokens: int, + chunked_prefill_token_size: int) -> None: + """ + Checks exact match decode between huggingface model and vllm runner with + chunked prefill. 
+ """ + max_num_seqs = chunked_prefill_token_size + max_num_batched_tokens = chunked_prefill_token_size + + non_chunked = generate_greedy(model, example_prompts, max_tokens) + + with vllm_runner(model, + dtype=dtype, + enable_chunked_prefill=True, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs) as vllm_model: + chunked = vllm_model.generate_greedy(example_prompts, + max_tokens=max_tokens) + + check_outputs_equal( + outputs_0_lst=chunked, + outputs_1_lst=non_chunked, + name_0="chunked", + name_1="non_chunked", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [15]) +def test_parallel_sampling( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + + with vllm_runner(model, dtype=dtype) as vllm_model: + for_loop_outputs = [] + for _ in range(10): + for_loop_outputs.append( + # using example_prompts index 1 instead of 0 since with 0 the + # logprobs get really close and the test doesn't pass + vllm_model.generate_greedy([example_prompts[1]], max_tokens) + [0]) + sampling_params = SamplingParams(n=10, + temperature=0.001, + seed=0, + max_tokens=max_tokens) + n_lt_1_outputs = vllm_model.generate([example_prompts[1]], + sampling_params) + token_ids, texts = n_lt_1_outputs[0] + n_lt_1_outputs = [(token_id, text) + for token_id, text in zip(token_ids, texts)] + + check_outputs_equal( + outputs_0_lst=n_lt_1_outputs, + outputs_1_lst=for_loop_outputs, + name_0="vllm_n_lt_1_outputs", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [20]) +def test_mamba_cache_cg_padding( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + # This test is for verifying that mamba cache is padded to CG captured + # batch size. If it's not, a torch RuntimeError will be raised because + # tensor dimensions aren't compatible + while len(example_prompts) == _get_graph_batch_size(len(example_prompts)): + example_prompts.append(example_prompts[0]) + + try: + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + except RuntimeError: + pytest.fail( + "Couldn't run batch size which is not equal to a Cuda Graph " + "captured batch size. 
" + "Could be related to mamba cache not padded correctly") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [20]) +def test_models_preemption_recompute( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + # Tests that outputs are identical with and w/o preemtions (recompute) + assert dtype == "float" + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_model.model.llm_engine.scheduler[ + 0].ENABLE_ARTIFICIAL_PREEMPT = True + preempt_vllm_outputs = vllm_model.generate_greedy( + example_prompts, max_tokens) + + vllm_model.model.llm_engine.scheduler[ + 0].ENABLE_ARTIFICIAL_PREEMPT = False + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=preempt_vllm_outputs, + outputs_1_lst=vllm_outputs, + name_0="vllm_preepmtions", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks( + vllm_runner, + model: str, + dtype: str, + example_prompts, +) -> None: + # This test is for verifying that the Mamba inner state management doesn't + # collapse in case where the number of incoming requests and + # finished_requests_ids is larger than the maximum Mamba block capacity. + # This could generally happen due to the fact that Mamba does support + # statelessness mechanism where it can cleanup new incoming requests in + # a single step. + try: + with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model: + vllm_model.generate_greedy([example_prompts[0]] * 100, 10) + except ValueError: + pytest.fail("Mamba inner state wasn't cleaned up properly between" + "steps finished requests registered unnecessarily ") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_state_cleanup( + vllm_runner, + model: str, + dtype: str, + example_prompts, +) -> None: + # This test is for verifying that the Mamba state is cleaned up between + # steps, If its not cleaned, an error would be expected. + try: + with vllm_runner(model, dtype=dtype) as vllm_model: + for _ in range(10): + vllm_model.generate_greedy([example_prompts[0]] * 100, 1) + except ValueError: + pytest.fail("Mamba inner state wasn't cleaned up between states, " + "could be related to finished_requests_ids") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_model_print( + vllm_runner, + model: str, + dtype: str, +) -> None: + with vllm_runner(model, dtype=dtype) as vllm_model: + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. 
+ model_runner.model) diff --git a/tests/models/decoder_only/vision_language/test_glm4.py b/tests/models/decoder_only/vision_language/test_glm4.py new file mode 100644 index 0000000000000..47922a57f680b --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_glm4.py @@ -0,0 +1,133 @@ +from typing import List, Optional, Tuple, Type + +import pytest + +from vllm.multimodal.utils import rescale_image_size +from vllm.transformers_utils.tokenizer import patch_padding_side + +from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from ....utils import large_gpu_test +from ...utils import check_logprobs_close + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "What's the content of the image?", + "cherry_blossom": + "What is the season?", +}) + +models = ["THUDM/glm-4v-9b"] +target_dtype = "bfloat16" + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + inputs: List[Tuple[List[str], PromptImageInput]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + mm_limit: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + max_model_len=2048, + max_num_seqs=2, + dtype=dtype, + limit_mm_per_prompt={"image": mm_limit}, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + stop_token_ids = [151329, 151336, 151338] + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images, + stop_token_ids=stop_token_ids) + for prompts, images in inputs + ] + + with hf_runner(model, dtype=dtype) as hf_model: + hf_processor = hf_model.processor + patch_padding_side(hf_processor) + + def processor(*args, text="", images=None, **kwargs): + if images is None: + return hf_processor(*args, **kwargs) + + return hf_processor.apply_chat_template( + [{ + "role": "user", + "image": images, + "content": text + }], + add_generation_prompt=True, + tokenize=True, + return_dict=True, + **kwargs, + ) + + hf_model.processor = processor + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.transformer.output_layer + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images, + ) for prompts, images in inputs + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + run_test( + hf_runner, + vllm_runner, + inputs_per_image, + model, + dtype=dtype, + 
max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index 5f703b03ab7fe..e579c8b38db91 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -1,3 +1,6 @@ +import random +from typing import List + import pytest import torch @@ -10,31 +13,45 @@ from .utils import create_batch, create_worker -def create_proposal(batch_size: int, propose_len: int, vocab_size: int, +def create_proposal(propose_lens: List[int], vocab_size: int, device: str) -> SpeculativeProposals: - proposal_probs = torch.rand((batch_size, propose_len, vocab_size), + batch_size = len(propose_lens) + max_propose_len = max(propose_lens) + proposal_probs = torch.rand((batch_size, max_propose_len, vocab_size), device=device) - proposal_token_ids = torch.argmax(proposal_probs, dim=-1) - proposal_lens = torch.tensor([propose_len] * batch_size, device=device) + + proposal_token_ids = torch.full((batch_size, max_propose_len), + fill_value=-1, + device=device) + for i in range(batch_size): + proposal_token_ids[i][:propose_lens[i]] = torch.argmax( + proposal_probs[i][:propose_lens[i]], dim=-1) + + propose_lens = torch.tensor(propose_lens, device=device) return SpeculativeProposals(proposal_token_ids, proposal_probs, - proposal_lens) + propose_lens) def assert_score_equal(score1: SpeculativeScores, score2: SpeculativeScores) -> None: assert torch.allclose(score1.probs, score2.probs) assert torch.allclose(score1.logprobs, score2.logprobs) - assert torch.equal(score1.token_ids, score2.token_ids) + assert torch.equal( + score1.token_ids, + score2.token_ids), f"{score1.token_ids}, {score2.token_ids}" @pytest.mark.parametrize('model_name', ['facebook/opt-125m']) @pytest.mark.parametrize('batch_size', [1, 2, 4, 8, 16]) -@pytest.mark.parametrize('propose_len', [1, 3, 5]) +@pytest.mark.parametrize('max_propose_len', [1, 3, 5]) +@pytest.mark.parametrize('mixed_propose_len', [True]) @pytest.mark.parametrize('device', ['cuda']) -def test_scoroer(model_name: str, batch_size: int, propose_len: int, - device: str) -> None: +def test_scorer(model_name: str, batch_size: int, max_propose_len: int, + mixed_propose_len: bool, device: str) -> None: """ - Compare the batch expansion scorer and mqa scorer return the same score + Compare the batch expansion scorer and mqa scorer return the same score. + We test for both queries with the same propose length and different + propose length. 
""" seed = 0 block_size = 32 @@ -46,13 +63,22 @@ def test_scoroer(model_name: str, batch_size: int, propose_len: int, should_modify_greedy_probs_inplace = True vocab_size = scorer_worker.vocab_size - proposals = create_proposal(batch_size, propose_len, vocab_size, device) + + if not mixed_propose_len: + propose_lens = [max_propose_len] * batch_size + else: + non_zero_cnt = random.randint(0, batch_size) + propose_lens = [max_propose_len + ] * non_zero_cnt + [0] * (batch_size - non_zero_cnt) + random.shuffle(propose_lens) + + proposals = create_proposal(propose_lens, vocab_size, device) seq_group_metadatalist, _, _ = create_batch(batch_size, - propose_len, + max_propose_len, block_size=block_size, num_gpu_blocks=num_gpu_blocks) requests = ExecuteModelRequest(seq_group_metadatalist, - num_lookahead_slots=propose_len) + num_lookahead_slots=max_propose_len) batch_expansion_scorer = BatchExpansionTop1Scorer(scorer_worker, device, vocab_size) diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py index d8df86b2aaa14..86d9af88e49ea 100644 --- a/tests/tpu/test_compilation.py +++ b/tests/tpu/test_compilation.py @@ -5,9 +5,11 @@ import depyf +from vllm.compilation.levels import CompilationLevel + # disable custom dispatcher, let Dynamo takes over # all the control -os.environ['VLLM_DYNAMO_USE_CUSTOM_DISPATCHER'] = "0" +os.environ['VLLM_TORCH_COMPILE_LEVEL'] = str(CompilationLevel.DYNAMO_AS_IS) temp_dir = tempfile.mkdtemp() with depyf.prepare_debug(temp_dir): diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index 69ab67abdd12b..923d0f1680802 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -1,5 +1,7 @@ import os +from vllm.compilation.levels import CompilationLevel + from ..utils import compare_two_settings # --enforce-eager on TPU causes graph compilation @@ -9,8 +11,9 @@ def test_custom_dispatcher(): - compare_two_settings("google/gemma-2b", - arg1=["--enforce-eager"], - arg2=["--enforce-eager"], - env1={"VLLM_DYNAMO_USE_CUSTOM_DISPATCHER": "0"}, - env2={}) + compare_two_settings( + "google/gemma-2b", + arg1=["--enforce-eager"], + arg2=["--enforce-eager"], + env1={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_ONCE)}, + env2={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_AS_IS)}) diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index 3cee3b890862a..64ed8e26f38ed 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -98,8 +98,6 @@ def test_traces(trace_service): SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p assert attributes.get( SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get( - SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( outputs[0].prompt_token_ids) @@ -155,8 +153,6 @@ def test_traces_with_detailed_steps(trace_service): SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p assert attributes.get( SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get( - SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( outputs[0].prompt_token_ids) diff --git a/vllm/attention/backends/blocksparse_attn.py 
b/vllm/attention/backends/blocksparse_attn.py index 57ac152d9edb6..c216d195c9e7e 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -186,11 +186,8 @@ class BlocksparseFlashAttentionMetadata(AttentionMetadata): # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. use_cuda_graph: bool - # Number of query tokens for each request in the batch. - # Currently, we require that all requests have the same number of query - # tokens during the decoding phase. When speculavie decoding is enabled, - # decode_query_len might be greater than 1. In all other cases, it is 1. - decode_query_len: Optional[int] = None + # Max number of query tokens for among request in the batch. + max_decode_query_len: Optional[int] = None _cached_prefill_metadata: Optional[ "BlocksparseFlashAttentionMetadata"] = None diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index bba80262e52d3..8457bde066eb7 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -111,11 +111,8 @@ class FlashAttentionMetadata(AttentionMetadata): # Maximum query length in the batch. max_query_len: Optional[int] - # Number of query tokens for each request in the batch. - # Currently, we require that all requests have the same number of query - # tokens during the decoding phase. When speculavie decoding is enabled, - # decode_query_len might be greater than 1. In all other cases, it is 1. - decode_query_len: Optional[int] + # Max number of query tokens among request in the batch. + max_decode_query_len: Optional[int] # Maximum sequence length among prefill batch. 0 if there are decoding # requests only. @@ -173,9 +170,9 @@ def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]: slot_mapping=self.slot_mapping[:self.num_prefill_tokens], seq_lens=self.seq_lens[:self.num_prefills], seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], - decode_query_len=0, max_query_len=self.max_query_len, max_prefill_seq_len=self.max_prefill_seq_len, + max_decode_query_len=0, max_decode_seq_len=0, query_start_loc=self.query_start_loc[:self.num_prefills + 1], seq_start_loc=self.seq_start_loc[:self.num_prefills + 1], @@ -202,12 +199,14 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: slot_mapping=self.slot_mapping[self.num_prefill_tokens:], seq_lens=None, seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], - decode_query_len=self.decode_query_len, + max_decode_query_len=self.max_decode_query_len, max_query_len=self.max_query_len, max_prefill_seq_len=0, max_decode_seq_len=self.max_decode_seq_len, - query_start_loc=None, - seq_start_loc=None, + query_start_loc=self.query_start_loc[self.num_prefills:] + if self.query_start_loc is not None else None, + seq_start_loc=self.seq_start_loc[self.num_prefills:] + if self.seq_start_loc is not None else None, context_lens_tensor=None, block_tables=self.block_tables[self.num_prefills:], use_cuda_graph=self.use_cuda_graph, @@ -413,9 +412,9 @@ def build(self, seq_lens: List[int], query_lens: List[int], max_query_len = max(query_lens) decode_query_lens = query_lens[self.num_prefills:] if len(decode_query_lens) > 0: - decode_query_len = max(decode_query_lens) + max_decode_query_len = max(decode_query_lens) else: - decode_query_len = 1 + max_decode_query_len = 1 max_prefill_seq_len = max(self.prefill_seq_lens, default=0) max_decode_seq_len = max(self.curr_seq_lens, default=0) num_decode_tokens = self.num_decode_tokens @@ -468,7 +467,7 @@ 
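# A quick worked example of the renamed field (illustrative values): with
# query_lens = [5, 7, 1, 3] and num_prefills = 2, decode_query_lens is [1, 3],
# so max_decode_query_len = 3. Decode queries longer than one token only occur
# with speculative decoding, which is why the decode path further below can
# switch to flash_attn_varlen_func whenever max_decode_query_len > 1.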
def build(self, seq_lens: List[int], query_lens: List[int], seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, max_query_len=max_query_len, - decode_query_len=decode_query_len, + max_decode_query_len=max_decode_query_len, max_prefill_seq_len=max_prefill_seq_len, max_decode_seq_len=max_decode_seq_len, query_start_loc=query_start_loc, @@ -714,20 +713,37 @@ def unified_flash_attention( if decode_meta := attn_metadata.decode_metadata: # Decoding run. - _, num_head, head_dim = decode_query.shape - decode_query = decode_query.reshape(-1, decode_meta.decode_query_len, - num_head, head_dim) - decode_output = flash_attn_with_kvcache( - q=decode_query, - k_cache=key_cache, - v_cache=value_cache, - block_table=decode_meta.block_tables, - cache_seqlens=decode_meta.seq_lens_tensor, - softmax_scale=softmax_scale, - causal=True, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - ).squeeze(1) + # Use flash_attn_varlen_func kernel for speculative decoding + # because different queries might have different lengths. + assert decode_meta.max_decode_query_len is not None + if decode_meta.max_decode_query_len > 1: + decode_output = flash_attn_varlen_func( + q=decode_query, + k=key_cache, + v=value_cache, + cu_seqlens_q=decode_meta.query_start_loc, + max_seqlen_q=decode_meta.max_decode_query_len, + cu_seqlens_k=decode_meta.seq_start_loc, + max_seqlen_k=decode_meta.max_decode_seq_len, + softmax_scale=softmax_scale, + causal=True, + alibi_slopes=alibi_slopes, + softcap=logits_soft_cap, + block_table=decode_meta.block_tables, + ) + else: + # Use flash_attn_with_kvcache for normal decoding. + decode_output = flash_attn_with_kvcache( + q=decode_query.unsqueeze(1), + k_cache=key_cache, + v_cache=value_cache, + block_table=decode_meta.block_tables, + cache_seqlens=decode_meta.seq_lens_tensor, + softmax_scale=softmax_scale, + causal=True, + alibi_slopes=alibi_slopes, + softcap=logits_soft_cap, + ).squeeze(1) if prefill_output is None: assert decode_output is not None @@ -739,7 +755,6 @@ def unified_flash_attention( # Chunked prefill does not work with speculative decoding. # Therefore, the query length for decode should be 1 in chunked prefill. assert decode_meta is not None - assert decode_meta.decode_query_len == 1 decode_output = decode_output.squeeze(1) output = torch.cat([prefill_output, decode_output], dim=0) return output.view(num_tokens, hidden_size) diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py new file mode 100644 index 0000000000000..3987986f1786b --- /dev/null +++ b/vllm/attention/backends/placeholder_attn.py @@ -0,0 +1,321 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, List, Optional, Tuple, Type + +import torch + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, + AttentionMetadataBuilder) +from vllm.attention.backends.utils import CommonAttentionState + +if TYPE_CHECKING: + from vllm.worker.model_runner import ModelInputForGPUBuilder + +# Placeholder attention backend for models like Mamba and embedding models that +# lack attention. 
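A rough usage sketch of how this backend gets selected (argument names follow the updated selector signature later in this patch; treat it as an illustration rather than a tested snippet):

    import torch
    from vllm.attention.selector import _Backend, which_attn_to_use

    # is_attention_free=True (e.g. Mamba) short-circuits every other check,
    # which is why the selector test above expects the env override to be
    # ignored for attention-free models.
    backend = which_attn_to_use(head_size=16, sliding_window=None,
                                dtype=torch.float16, kv_cache_dtype=None,
                                block_size=16, is_attention_free=True)
    assert backend == _Backend.NO_ATTENTION

get_attn_backend() then resolves NO_ATTENTION to PlaceholderAttentionBackend, whose swap/copy hooks are no-ops and whose get_kv_cache_shape() returns the dummy shape (1, 1, 1, 1, 1).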
+ + +class PlaceholderAttentionBackend(AttentionBackend): + """Placeholder backend for when no attention is needed.""" + + @staticmethod + def get_name() -> str: + return "placeholder-attn" + + @staticmethod + def get_impl_cls() -> Type["PlaceholderAttentionImpl"]: + return PlaceholderAttentionImpl + + @staticmethod + def get_builder_cls() -> Type["PlaceholderAttentionMetadataBuilder"]: + return PlaceholderAttentionMetadataBuilder + + @staticmethod + def get_metadata_cls() -> Type["PlaceholderAttentionMetadata"]: + return PlaceholderAttentionMetadata + + @staticmethod + def get_state_cls() -> Type["CommonAttentionState"]: + return CommonAttentionState + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (1, 1, 1, 1, 1) + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: torch.Tensor, + ) -> None: + return + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: torch.Tensor, + ) -> None: + return + + +@dataclass +class PlaceholderAttentionMetadata(AttentionMetadata): + """Attention metadata for prefill and decode batched together.""" + # (batch_size,). The sequence length per sequence. Sequence length means + # the computed tokens + new tokens None if it is a decoding. + seq_lens: Optional[List[int]] + # seq_lens stored as a tensor. + seq_lens_tensor: Optional[torch.Tensor] + + # Maximum query length in the batch. + max_query_len: Optional[int] + + # Max number of query tokens among request in the batch. + max_decode_query_len: Optional[int] + + # Maximum sequence length among prefill batch. 0 if there are decoding + # requests only. + max_prefill_seq_len: int + # Maximum sequence length among decode batch. 0 if there are prefill + # requests only. + max_decode_seq_len: int + # (batch_size + 1,). The cumulative subquery lengths of the sequences in + # the batch, used to index into subquery. E.g., if the subquery length + # is [4, 6], it is [0, 4, 10]. + query_start_loc: Optional[torch.Tensor] + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + seq_start_loc: Optional[torch.Tensor] + # (batch_size,) A tensor of context lengths (tokens that are computed + # so far). + context_lens_tensor: Optional[torch.Tensor] + + # (batch_size, max_blocks_per_seq). + # Block addresses per sequence. (Seq id -> list of physical block) + # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks + # in the kv cache. Each block can contain up to block_size tokens. + # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph + # captured. + block_tables: Optional[torch.Tensor] + + # Whether or not if cuda graph is enabled. + # Cuda-graph is currently enabled for decoding only. + # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. 
+ use_cuda_graph: bool + + _cached_prefill_metadata: Optional["PlaceholderAttentionMetadata"] = None + _cached_decode_metadata: Optional["PlaceholderAttentionMetadata"] = None + + @property + def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: + if self.num_prefills == 0: + return None + + if self._cached_prefill_metadata is not None: + return self._cached_prefill_metadata + + assert self.seq_lens is not None + assert self.seq_lens_tensor is not None + assert self.query_start_loc is not None + assert self.context_lens_tensor is not None + assert self.seq_start_loc is not None + + # Placeholders + slot_mapping = torch.empty(0) + block_tables = torch.empty(0) + + self._cached_prefill_metadata = PlaceholderAttentionMetadata( + num_prefills=self.num_prefills, + num_prefill_tokens=self.num_prefill_tokens, + num_decode_tokens=0, + slot_mapping=slot_mapping, + seq_lens=self.seq_lens[:self.num_prefills], + seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], + max_decode_query_len=0, + max_query_len=self.max_query_len, + max_prefill_seq_len=self.max_prefill_seq_len, + max_decode_seq_len=0, + query_start_loc=self.query_start_loc[:self.num_prefills + 1], + seq_start_loc=self.seq_start_loc[:self.num_prefills + 1], + context_lens_tensor=self.context_lens_tensor[:self.num_prefills], + block_tables=block_tables, + use_cuda_graph=False, + ) + return self._cached_prefill_metadata + + @property + def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: + if self.num_decode_tokens == 0: + return None + + if self._cached_decode_metadata is not None: + return self._cached_decode_metadata + assert self.seq_lens_tensor is not None + + # Placeholders + slot_mapping = torch.empty(0) + block_tables = torch.empty(0) + + self._cached_decode_metadata = PlaceholderAttentionMetadata( + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=self.num_decode_tokens, + slot_mapping=slot_mapping, + seq_lens=None, + seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], + max_decode_query_len=self.max_decode_query_len, + max_query_len=None, + max_prefill_seq_len=0, + max_decode_seq_len=self.max_decode_seq_len, + query_start_loc=None, + seq_start_loc=None, + context_lens_tensor=None, + block_tables=block_tables, + use_cuda_graph=self.use_cuda_graph, + ) + return self._cached_decode_metadata + + +class PlaceholderAttentionMetadataBuilder( + AttentionMetadataBuilder[PlaceholderAttentionMetadata]): + + def __init__(self, input_builder: "ModelInputForGPUBuilder"): + self.prefill_seq_lens: List[int] = [] + self.context_lens: List[int] = [] + self.curr_seq_lens: List[int] = [] + self.num_prefills = 0 + self.num_prefill_tokens = 0 + self.num_decode_tokens = 0 + + self.input_builder = input_builder + self.runner = input_builder.runner + + def _add_seq_group( + self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", + chunked_prefill_enabled: bool): + """Add a sequence group to the metadata. Specifically update/append + 1. context length. 
+ """ + is_prompt = inter_data.is_prompt + + for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, + curr_sliding_window_block) in zip( + inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], + inter_data.orig_seq_lens, inter_data.seq_lens, + inter_data.query_lens, inter_data.context_lens, + inter_data.curr_sliding_window_blocks): + self.context_lens.append(context_len) + + if is_prompt: + self.num_prefills += 1 + self.num_prefill_tokens += token_len + self.prefill_seq_lens.append(seq_len) + else: + assert query_len == 1, ( + "seq_len: {}, context_len: {}, query_len: {}".format( + seq_len, context_len, query_len)) + self.num_decode_tokens += query_len + self.curr_seq_lens.append(curr_seq_len) + + def build(self, seq_lens: List[int], query_lens: List[int], + cuda_graph_pad_size: int, batch_size: int): + """Build attention metadata with on-device tensors. + + Args: + seq_lens: The maybe padded sequence lengths of the input sequences. + query_lens: The query lengths of the input sequences. + cuda_graph_pad_size: The padding size for cuda graph. + -1 if cuda graph is not used. + batch_size: The maybe padded batch size. + """ + for inter_data in self.input_builder.inter_data_list: + self._add_seq_group(inter_data, + self.input_builder.chunked_prefill_enabled) + + device = self.runner.device + use_captured_graph = cuda_graph_pad_size != -1 + + logits_soft_cap = getattr(self.runner.model_config.hf_config, + "attn_logit_softcapping", None) + if logits_soft_cap is not None: + raise ValueError( + "Please use Flashinfer backend for models with logits_soft_cap" + " (i.e., Gemma-2). Otherwise, the output might be wrong." + " Set Flashinfer backend by " + "export VLLM_ATTENTION_BACKEND=FLASHINFER.") + + max_query_len = max(query_lens) + decode_query_lens = query_lens[self.num_prefills:] + if len(decode_query_lens) > 0: + max_decode_query_len = max(decode_query_lens) + else: + max_decode_query_len = 1 + max_prefill_seq_len = max(self.prefill_seq_lens, default=0) + max_decode_seq_len = max(self.curr_seq_lens, default=0) + num_decode_tokens = self.num_decode_tokens + + if use_captured_graph: + num_decode_tokens = batch_size + + assert max_query_len > 0, ("query_lens: {}".format(query_lens)) + + context_lens_tensor = torch.tensor(self.context_lens, + dtype=torch.int, + device=device) + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.int, + device=device) + query_lens_tensor = torch.tensor(query_lens, + dtype=torch.long, + device=device) + query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=device) + seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=device) + torch.cumsum(seq_lens_tensor, + dim=0, + dtype=seq_start_loc.dtype, + out=seq_start_loc[1:]) + torch.cumsum(query_lens_tensor, + dim=0, + dtype=query_start_loc.dtype, + out=query_start_loc[1:]) + + # Placeholders + slot_mapping = torch.empty(0) + block_tables = torch.empty(0) + + return PlaceholderAttentionMetadata( + num_prefills=self.num_prefills, + slot_mapping=slot_mapping, + num_prefill_tokens=self.num_prefill_tokens, + num_decode_tokens=num_decode_tokens, + seq_lens=seq_lens, + seq_lens_tensor=seq_lens_tensor, + max_query_len=max_query_len, + max_decode_query_len=max_decode_query_len, + max_prefill_seq_len=max_prefill_seq_len, + max_decode_seq_len=max_decode_seq_len, + query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, + context_lens_tensor=context_lens_tensor, + block_tables=block_tables, + use_cuda_graph=use_captured_graph, 
+ ) + + +class PlaceholderAttentionImpl(AttentionImpl): + + def __init__(self, *args, **kwargs) -> None: + return + + def forward(self, *args, **kwargs) -> torch.Tensor: + raise NotImplementedError diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 7456aab8b8d2a..682eac50126ad 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -121,11 +121,8 @@ class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): # so far). context_lens_tensor: Optional[torch.Tensor] - # Number of query tokens for each request in the batch. - # Currently, we require that all requests have the same number of query - # tokens during the decoding phase. When speculavie decoding is enabled, - # decode_query_len might be greater than 1. In all other cases, it is 1. - decode_query_len: Optional[int] = None + # Max number of query tokens among request in the batch. + max_decode_query_len: Optional[int] = None _cached_prefill_metadata: Optional["ROCmFlashAttentionMetadata"] = None _cached_decode_metadata: Optional["ROCmFlashAttentionMetadata"] = None @@ -420,6 +417,8 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 2b8c373178ab3..53e3a53badeae 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -313,7 +313,7 @@ def graph_capture_get_metadata_for_batch( seq_lens=None, seq_lens_tensor=self._graph_seq_lens[:batch_size], max_query_len=1, - decode_query_len=1, + max_decode_query_len=1, max_prefill_seq_len=0, max_decode_seq_len=self.runner.max_seq_len_to_capture, query_start_loc=None, diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index a3f9ff64f8b8b..25b86176f630e 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -118,11 +118,8 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata): # Maximum query length in the batch. None for decoding. max_query_len: Optional[int] = None - # Number of query tokens for each request in the batch. - # Currently, we require that all requests have the same number of query - # tokens during the decoding phase. When speculavie decoding is enabled, - # decode_query_len might be greater than 1. In all other cases, it is 1. - decode_query_len: Optional[int] = None + # Max number of query tokens among request in the batch. + max_decode_query_len: Optional[int] = None # (batch_size + 1,). The cumulative subquery lengths of the sequences in # the batch, used to index into subquery. E.g., if the subquery length @@ -562,25 +559,32 @@ def forward( self.kv_cache_dtype, k_scale, v_scale) - if attn_type != AttentionType.ENCODER: - # Decoder self-attention supports chunked prefill. 
- # Encoder/decoder cross-attention requires no chunked - # prefill (100% prefill or 100% decode tokens, no mix) - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - else: + if attn_type == AttentionType.ENCODER: # Encoder attention - chunked prefill is not applicable; # derive token-count from query shape & and treat them # as 100% prefill tokens assert attn_metadata.num_encoder_tokens is not None num_prefill_tokens = attn_metadata.num_encoder_tokens + num_encoder_tokens = attn_metadata.num_encoder_tokens num_decode_tokens = 0 - - if attn_type == AttentionType.DECODER: + elif attn_type == AttentionType.DECODER: + # Decoder self-attention supports chunked prefill. + num_prefill_tokens = attn_metadata.num_prefill_tokens + num_encoder_tokens = attn_metadata.num_prefill_tokens + num_decode_tokens = attn_metadata.num_decode_tokens # Only enforce this shape-constraint for decoder # self-attention assert key.shape[0] == num_prefill_tokens + num_decode_tokens assert value.shape[0] == num_prefill_tokens + num_decode_tokens + else: # attn_type == AttentionType.ENCODER_DECODER + # Encoder/decoder cross-attention requires no chunked + # prefill (100% prefill or 100% decode tokens, no mix) + num_prefill_tokens = attn_metadata.num_prefill_tokens + if attn_metadata.num_encoder_tokens is not None: + num_encoder_tokens = attn_metadata.num_encoder_tokens + else: + num_encoder_tokens = attn_metadata.num_prefill_tokens + num_decode_tokens = attn_metadata.num_decode_tokens output = torch.empty_like(query) # Query for decode. KV is not needed because it is already cached. @@ -588,8 +592,8 @@ def forward( # QKV for prefill. query = query[:num_prefill_tokens] if key is not None and value is not None: - key = key[:num_prefill_tokens] - value = value[:num_prefill_tokens] + key = key[:num_encoder_tokens] + value = value[:num_encoder_tokens] assert query.shape[0] == num_prefill_tokens assert decode_query.shape[0] == num_decode_tokens diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index ecf964fa49d9b..0112f49876996 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -42,10 +42,12 @@ def __init__( kv_cache_dtype = cache_config.cache_dtype block_size = cache_config.block_size sliding_window = cache_config.sliding_window + is_attention_free = cache_config.is_attention_free else: kv_cache_dtype = "auto" block_size = 16 sliding_window = None + is_attention_free = False if num_kv_heads is None: num_kv_heads = num_heads @@ -76,9 +78,9 @@ def __init__( # During model initialization, the default dtype is set as the model # weight and activation dtype. 
dtype = torch.get_default_dtype() - attn_backend = get_attn_backend(num_heads, head_size, num_kv_heads, - sliding_window, dtype, kv_cache_dtype, - block_size, blocksparse_params + attn_backend = get_attn_backend(head_size, sliding_window, dtype, + kv_cache_dtype, block_size, + is_attention_free, blocksparse_params is not None) impl_cls = attn_backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 30aa7cb311afb..7edb7676ea2cd 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -24,6 +24,7 @@ class _Backend(enum.Enum): FLASHINFER = enum.auto() PALLAS = enum.auto() IPEX = enum.auto() + NO_ATTENTION = enum.auto() def backend_name_to_enum(backend_name: str) -> _Backend: @@ -88,13 +89,12 @@ def get_global_forced_attn_backend() -> Optional[_Backend]: @lru_cache(maxsize=None) def get_attn_backend( - num_heads: int, head_size: int, - num_kv_heads: int, sliding_window: Optional[int], dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, + is_attention_free: bool, is_blocksparse: bool = False, ) -> Type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" @@ -105,9 +105,8 @@ def get_attn_backend( BlocksparseFlashAttentionBackend) return BlocksparseFlashAttentionBackend - backend = which_attn_to_use(num_heads, head_size, num_kv_heads, - sliding_window, dtype, kv_cache_dtype, - block_size) + backend = which_attn_to_use(head_size, sliding_window, dtype, + kv_cache_dtype, block_size, is_attention_free) if backend == _Backend.FLASH_ATTN: from vllm.attention.backends.flash_attn import ( # noqa: F401 FlashAttentionBackend) @@ -146,23 +145,31 @@ def get_attn_backend( logger.info("Using Pallas backend.") from vllm.attention.backends.pallas import PallasAttentionBackend return PallasAttentionBackend + elif backend == _Backend.NO_ATTENTION: + from vllm.attention.backends.placeholder_attn import ( + PlaceholderAttentionBackend) + return PlaceholderAttentionBackend else: raise ValueError("Invalid attention backend.") def which_attn_to_use( - num_heads: int, head_size: int, - num_kv_heads: int, sliding_window: Optional[int], dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, + is_attention_free: bool, ) -> _Backend: """Returns which flash attention backend to use.""" # Default case. selected_backend = _Backend.FLASH_ATTN + # If there are no attention layers (e.g. we are running Mamba), + # use the placeholder NO_ATTENTION + if is_attention_free: + return _Backend.NO_ATTENTION + # Check whether a particular choice of backend was # previously forced. 
# diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index de0b1d8a75757..4780358cea517 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,8 +1,17 @@ +import copy import operator +from typing import Callable, Dict, List, Optional, Tuple, Union import torch import torch.fx as fx +from vllm.logger import init_logger + +from .compile_context import get_compile_context +from .levels import CompilationLevel + +logger = init_logger(__name__) + def fix_functionalization(graph: fx.Graph): """ @@ -148,9 +157,113 @@ def fix_functionalization(graph: fx.Graph): # print(graph.python_code(root_module="self", verbose=True).src, file=f) -def vllm_backend(graph, example_inputs): +def wrap_inductor(graph, example_inputs, additional_inductor_config): from torch._inductor import config current_config = config.shallow_copy_dict() from torch._inductor.compile_fx import compile_fx + + if additional_inductor_config is not None: + current_config.update(additional_inductor_config) + if current_config['post_grad_custom_post_pass'] is not None: + logger.warning( + "post_grad_custom_post_pass is already set in the config. " + "Overwriting it with the fix_functionalization") current_config['post_grad_custom_post_pass'] = fix_functionalization return compile_fx(graph, example_inputs, config_patches=current_config) + + +def vllm_backend( + graph, + example_inputs, + additional_inductor_config: Optional[Dict] = None) -> Callable: + + context = get_compile_context() + context = copy.deepcopy(context) if context is not None else [] + sizes_to_specialize: List[int] = context + + # flags for all the seen shapes, whether we need to specialize + runtime_shapes_to_compile_flags: Dict[Tuple[int, ...], bool] = {} + + # if we need to specialize, the compiled graph for that shape + runtime_shapes_to_compiled_graph: Dict[Tuple[int, ...], Callable] = {} + + # this is the first compilation, we will compile a graph with + # dynamic shape, as the caller will mark first dimension as dynamic + logger.info("Compiling a graph for general shapes") + graph_for_symbolic_shape = wrap_inductor(graph, example_inputs, + additional_inductor_config) + + # TODO: Dynamo does not pass all dynamic shapes. + # Need to investigate why. It works now because all the dynamic + # shapes have the same value, and either of them can be used. + sym_shape_indices = [ + i for i, x in enumerate(example_inputs) if isinstance(x, torch.SymInt) + ] + + first_run = True + + # this is the function we return to Dynamo to run finally + def compiled_graph_wrapper(*args): + + runtime_shapes: Tuple[int, + ...] = tuple(args[i] for i in sym_shape_indices) + + nonlocal first_run + nonlocal runtime_shapes_to_compile_flags + nonlocal runtime_shapes_to_compiled_graph + + if first_run: + # the first compilation is for profiling, we directly run it + first_run = False + return graph_for_symbolic_shape(*args) + + if runtime_shapes not in runtime_shapes_to_compile_flags: + # we haven't seen this shape before + # query if we need to specialize for this shape + # we only specialize for the first dimension. 
+ # TODO: investigate if any model needs to specialize + # beyond the first dimension + runtime_shapes_to_compile_flags[runtime_shapes] = runtime_shapes[ + 0] in sizes_to_specialize + + if not runtime_shapes_to_compile_flags[runtime_shapes]: + # we don't need to specialize for this shape + return graph_for_symbolic_shape(*args) + + if runtime_shapes not in runtime_shapes_to_compiled_graph: + # we need to specialize for this shape, and we haven't compiled + # compile the graph for this shape + logger.info("Compiling a graph for shapes %s", runtime_shapes) + runtime_shapes_to_compiled_graph[runtime_shapes] = wrap_inductor( + graph, args, additional_inductor_config) + + return runtime_shapes_to_compiled_graph[runtime_shapes](*args) + + return compiled_graph_wrapper + + +def select_default_backend(level: int) -> Union[str, Callable]: + if level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]: + backend = "eager" + return backend + assert level in [ + CompilationLevel.INDUCTOR, CompilationLevel.INDUCTOR_MAX_AUTOTUNE + ], f"Invalid level {level}" + + from vllm.compilation.backends import vllm_backend + from vllm.plugins import get_inductor_additional_configs + additional_configs = get_inductor_additional_configs() + + if level == CompilationLevel.INDUCTOR_MAX_AUTOTUNE: + if "max_autotune" in additional_configs and not additional_configs[ + "max_autotune"]: + logger.warning( + "max_autotune is disabled, but is overridden by level %s", + CompilationLevel.INDUCTOR_MAX_AUTOTUNE) + additional_configs['max_autotune'] = True + + from functools import partial + backend = partial(vllm_backend, + additional_inductor_config=additional_configs) + + return backend diff --git a/vllm/compilation/compile_context.py b/vllm/compilation/compile_context.py new file mode 100644 index 0000000000000..29db3d4c637b9 --- /dev/null +++ b/vllm/compilation/compile_context.py @@ -0,0 +1,23 @@ +from contextlib import contextmanager +from typing import Any + +_compile_context: Any = None + + +def get_compile_context() -> Any: + """Get the current compile context.""" + return _compile_context + + +@contextmanager +def set_compile_context(context: Any): + """A context manager that stores the current compile context, + usually it is a list of sizes to specialize. + """ + global _compile_context + prev_context = _compile_context + _compile_context = context + try: + yield + finally: + _compile_context = prev_context diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py new file mode 100644 index 0000000000000..655c4c4430179 --- /dev/null +++ b/vllm/compilation/decorators.py @@ -0,0 +1,113 @@ +import inspect +from typing import Dict, List, Union + +import torch + +import vllm.envs as envs +from vllm.compilation.levels import CompilationLevel +from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.sequence import IntermediateTensors +from vllm.utils import supports_dynamo + + +def support_torch_compile(dynamic_arg_dims: Dict[str, Union[int, List[int]]]): + """ + A decorator to add support for compiling the forward method of a class. + + `dynamic_arg_dims` is a dictionary that maps argument names to the dynamic + dimensions of the argument. The dynamic dimensions can be either a single + integer or a list of integers. + + Depending on the value of arguments: + + - if it is a single integer, the corresponding dimension of the argument + will be marked as dynamic. + - if it is `None`, ignored. 
+ - if it is `IntermediateTensors`, all the tensors in the intermediate + tensors will be marked as dynamic. + - otherwise, it will raise an error. + + NOTE: if an argument is `None`, it should always be passed as `None` during + the lifetime of the model, otherwise, it cannot be captured as a single + computation graph. + """ + + def cls_decorator_helper(cls: type): + # helper to pass `dynamic_arg_dims`` to `_support_torch_compile`` + # to avoid too much indentation for `_support_torch_compile`` + sig = inspect.signature(cls.forward) + for k in dynamic_arg_dims: + if k not in sig.parameters: + raise ValueError( + f"Argument {k} not found in the forward method of {cls}") + return _support_torch_compile(cls, dynamic_arg_dims) + + return cls_decorator_helper + + +def _support_torch_compile(cls: type, + dynamic_arg_dims: Dict[str, Union[int, List[int]]]): + """ + A decorator to add support for compiling the forward method of a class. + """ + + # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner + # will handle the compilation, so we don't need to do anything here. + if envs.VLLM_TORCH_COMPILE_LEVEL in [ + CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS + ] or not supports_dynamo(): + return cls + + # take care of method resolution order + # make sure super().__init__ is called on the base class + # other than TorchCompileWrapperWithCustomDispatcher + cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, ) + + old_init = cls.__init__ + + def __init__(self, *args, **kwargs): + old_init(self, *args, **kwargs) + TorchCompileWrapperWithCustomDispatcher.__init__(self) + + cls.__init__ = __init__ + + def __call__(self, *args, **kwargs): + # torch.compiler.is_compiling() means we are inside the compilation + # e.g. TPU has the compilation logic in model runner, so we don't + # need to compile the model inside. + if torch.compiler.is_compiling(): + return self.forward(*args, **kwargs) + + # the first compilation needs to have dynamic shapes marked + if len(self.compiled_codes) < 1: + sig = inspect.signature(self.__class__.forward) + bound_args = sig.bind(self, *args, **kwargs) + bound_args.apply_defaults() + for k, dims in dynamic_arg_dims.items(): + arg = bound_args.arguments.get(k) + if arg is not None: + if isinstance(arg, torch.Tensor): + torch._dynamo.mark_dynamic(arg, dims) + elif isinstance(arg, IntermediateTensors): + for tensor in arg.tensors.values(): + torch._dynamo.mark_dynamic(tensor, dims) + else: + raise ValueError( + "Unsupported dynamic dimensions" + f" {dims} for argument {k} with type {type(arg)}.") + + # if we don't use custom dispatcher, we can directly call the + # compiled function and let torch.compile handle the dispatching, + # with the overhead of guard evaluation and recompilation. + if len(self.compiled_codes) < 1 or not self.use_custom_dispatcher: + return self.compiled_callable(*args, **kwargs) + + # usually, capturing the model once is enough, and then we can + # dispatch to the compiled code directly, without going through + # the Dynamo guard mechanism. 
+ with self.dispatch_to_code(0): + model_output = self.forward(*args, **kwargs) + return model_output + + cls.__call__ = __call__ + return cls diff --git a/vllm/compilation/levels.py b/vllm/compilation/levels.py new file mode 100644 index 0000000000000..162bf5ae64997 --- /dev/null +++ b/vllm/compilation/levels.py @@ -0,0 +1,9 @@ +# constants for the levels of the compilation process + + +class CompilationLevel: + NO_COMPILATION = 0 + DYNAMO_AS_IS = 1 + DYNAMO_ONCE = 2 + INDUCTOR = 3 + INDUCTOR_MAX_AUTOTUNE = 4 diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index e923bd36ccc08..1594b64a61b94 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -3,12 +3,14 @@ from abc import abstractmethod from contextlib import contextmanager from types import CodeType -from typing import Callable, List +from typing import Callable, List, Optional import torch import vllm.envs as envs +from .levels import CompilationLevel + class TorchCompileWrapperWithCustomDispatcher: """ @@ -23,7 +25,26 @@ class TorchCompileWrapperWithCustomDispatcher: `torch.compile` over the forward method. """ - def __init__(self, compiled_callable: Callable): + def __init__(self, compiled_callable: Optional[Callable] = None): + + if compiled_callable is None: + # default compilation settings + # compiling the forward method + + # choose the compile backend + + # if the user has set the backend, use it + from vllm.plugins import get_torch_compile_backend + backend = get_torch_compile_backend() + if backend is None: + from vllm.compilation.backends import select_default_backend + backend = select_default_backend(envs.VLLM_TORCH_COMPILE_LEVEL) + + compiled_callable = torch.compile( + self.forward, + fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + backend=backend) + self.compiled_callable = compiled_callable self.original_code_object = self.__class__.forward.__code__ self.compiled_codes: List[CodeType] = [] @@ -33,7 +54,7 @@ def __init__(self, compiled_callable: Callable): # subclasses can use this to switch between the custom dispatcher # and the default Dynamo guard mechanism. self.use_custom_dispatcher: bool = \ - envs.VLLM_DYNAMO_USE_CUSTOM_DISPATCHER + envs.VLLM_TORCH_COMPILE_LEVEL >= CompilationLevel.DYNAMO_ONCE def __call__(self, *args, **kwargs): """Implement the dispatch logic here, beyond the torch.compile level. 
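The compilation behaviour is now selected through a single environment variable instead of VLLM_DYNAMO_USE_CUSTOM_DISPATCHER. A minimal sketch of the intended usage, mirroring the TPU tests above (the model name and enforce_eager flag are simply the values those tests use):

    import os
    from vllm.compilation.levels import CompilationLevel

    # Must be set before the model is built; the wrapper reads
    # envs.VLLM_TORCH_COMPILE_LEVEL in __init__ to decide whether the custom
    # dispatcher (DYNAMO_ONCE and above) is used.
    os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.DYNAMO_ONCE)

    from vllm import LLM
    llm = LLM(model="google/gemma-2b", enforce_eager=True)
    print(llm.generate("Hello, my name is")[0].outputs[0].text)

Levels 0-4 map to NO_COMPILATION, DYNAMO_AS_IS, DYNAMO_ONCE, INDUCTOR and INDUCTOR_MAX_AUTOTUNE, as defined in the new vllm/compilation/levels.py above.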
diff --git a/vllm/config.py b/vllm/config.py index 91ba45798b4ba..b0761ae0ee869 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -196,6 +196,9 @@ def __init__(self, if not self.skip_tokenizer_init: self._verify_tokenizer_mode() + self.is_attention_free = self._init_attention_free() + self.has_inner_state = self._init_has_inner_state() + self.override_neuron_config = override_neuron_config if is_neuron( ) else None self._verify_embedding_mode() @@ -216,6 +219,14 @@ def _init_multimodal_config( return None + def _init_attention_free(self) -> bool: + architectures = getattr(self.hf_config, "architectures", []) + return ModelRegistry.is_attention_free_model(architectures) + + def _init_has_inner_state(self) -> bool: + architectures = getattr(self.hf_config, "architectures", []) + return ModelRegistry.model_has_inner_state(architectures) + def _verify_tokenizer_mode(self) -> None: tokenizer_mode = self.tokenizer_mode.lower() if tokenizer_mode not in ["auto", "slow", "mistral"]: @@ -348,6 +359,8 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if device_config.device_type not in ("cuda", "tpu"): logger.warning( "Async output processing is only supported for CUDA or TPU. " @@ -361,6 +374,8 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if device_config.device_type == "cuda" and self.enforce_eager: logger.warning( "To see benefits of async output processing, enable CUDA " @@ -374,6 +389,8 @@ def verify_async_output_proc(self, parallel_config, speculative_config, if self.embedding_mode: self.use_async_output_proc = False + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if speculative_config: logger.warning("Async output processing is not supported with" " speculative decoding currently.") @@ -438,6 +455,10 @@ def get_head_size(self) -> int: # FlashAttention supports only head_size 32, 64, 128, 256, # we need to pad head_size 192 to 256 return 256 + + if self.is_attention_free: + return 0 + if hasattr(self.hf_text_config, "head_dim"): return self.hf_text_config.head_dim # FIXME(woosuk): This may not be true for all models. 
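Both new flags are derived from the architecture list in the HF config, so they can be probed without constructing an engine. A short sketch (it assumes ModelRegistry keeps its usual top-level export and that the Mamba architecture is registered as attention-free, with inner state, elsewhere in this patch):

    from transformers import AutoConfig
    from vllm import ModelRegistry

    hf_config = AutoConfig.from_pretrained("state-spaces/mamba-130m-hf")
    architectures = getattr(hf_config, "architectures", [])

    # Mirrors ModelConfig._init_attention_free() / _init_has_inner_state().
    print(ModelRegistry.is_attention_free_model(architectures))  # expected: True
    print(ModelRegistry.model_has_inner_state(architectures))    # expected: True

As the surrounding hunks show, get_head_size() and get_total_num_kv_heads() both return 0 for attention-free models.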
@@ -469,6 +490,9 @@ def get_total_num_kv_heads(self) -> int: return getattr(self.hf_config.attn_config, "kv_n_heads", self.hf_config.num_attention_heads) + if self.is_attention_free: + return 0 + attributes = [ # For Falcon: "n_head_kv", @@ -511,31 +535,17 @@ def get_num_layers(self, parallel_config: "ParallelConfig") -> int: start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) return end - start - def contains_seqlen_agnostic_layers( - self, parallel_config: "ParallelConfig") -> bool: - """True for Mamba/SSM models (Jamba)""" - return self._get_num_seqlen_agnostic_layers(parallel_config) > 0 + def get_num_attention_layers(self, + parallel_config: "ParallelConfig") -> int: + if self.is_attention_free: + return 0 - def get_layers_block_type(self, - parallel_config: "ParallelConfig") -> List[str]: num_layers = self.get_num_layers(parallel_config) - # Transformers supports layers_block_type @property - return getattr(self.hf_config, "layers_block_type", - ["attention"] * num_layers) - def get_num_attention_layers(self, - parallel_config: "ParallelConfig") -> int: - return len([ - t for t in self.get_layers_block_type(parallel_config) - if t == "attention" - ]) - - def _get_num_seqlen_agnostic_layers( - self, parallel_config: "ParallelConfig") -> int: - return len([ - t for t in self.get_layers_block_type(parallel_config) - if t != "attention" - ]) + # Transformers supports layers_block_type @property + layers = getattr(self.hf_config, "layers_block_type", + ["attention"] * num_layers) + return len([t for t in layers if t == "attention"]) def get_multimodal_config(self) -> "MultiModalConfig": """ @@ -585,6 +595,7 @@ def __init__( gpu_memory_utilization: float, swap_space: float, cache_dtype: str, + is_attention_free: bool = False, num_gpu_blocks_override: Optional[int] = None, sliding_window: Optional[int] = None, enable_prefix_caching: bool = False, @@ -595,6 +606,7 @@ def __init__( self.swap_space_bytes = swap_space * GiB_bytes self.num_gpu_blocks_override = num_gpu_blocks_override self.cache_dtype = cache_dtype + self.is_attention_free = is_attention_free self.sliding_window = sliding_window self.enable_prefix_caching = enable_prefix_caching self.cpu_offload_gb = cpu_offload_gb @@ -1194,6 +1206,8 @@ def maybe_create_spec_config( "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if enable_chunked_prefill: raise ValueError( "Speculative decoding and chunked prefill are " @@ -1555,6 +1569,8 @@ def verify_with_model_config(self, model_config: ModelConfig): model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if scheduler_config.chunked_prefill_enabled: raise ValueError("LoRA is not supported with chunked prefill yet.") diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index c7ee6609306d7..cb047c832e6cb 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -24,9 +24,8 @@ class BlockSpaceManagerV2(BlockSpaceManager): autoregressively-generated tokens, and other advanced features such as prefix caching, forking/copy-on-write, and sliding-window memory allocation. - The current implementation is partial; in particular prefix caching and - sliding-window are not feature complete. 
This class implements the design - described in https://github.com/vllm-project/vllm/pull/3492. + This class implements the design described in + https://github.com/vllm-project/vllm/pull/3492. Lookahead slots The block manager has the notion of a "lookahead slot". These are slots @@ -190,7 +189,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: assert (request_id not in self.cross_block_tables), \ - "block table already exists" + "block table already exists" check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 6346711587301..9e1d1b02f6805 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -36,10 +36,10 @@ def get_block_space_manager_class(version: str): from vllm.core.block_manager_v2 import BlockSpaceManagerV2 return BlockSpaceManagerV2 - if version == "embedding": - from vllm.core.embedding_model_block_manager import ( - EmbeddingModelBlockSpaceManager) - return EmbeddingModelBlockSpaceManager + if version == "placeholder": + from vllm.core.placeholder_block_space_manager import ( + PlaceholderBlockSpaceManager) + return PlaceholderBlockSpaceManager raise ValueError(f"Unknown version {version=}") diff --git a/vllm/core/embedding_model_block_manager.py b/vllm/core/placeholder_block_space_manager.py similarity index 90% rename from vllm/core/embedding_model_block_manager.py rename to vllm/core/placeholder_block_space_manager.py index 476e043ecc52d..a337392bbed53 100644 --- a/vllm/core/embedding_model_block_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -5,9 +5,10 @@ from vllm.utils import Device -class EmbeddingModelBlockSpaceManager(BlockSpaceManager): - """An embedding version of BlockSpaceManager for use in environments - with embedding models where block management is not required. +class PlaceholderBlockSpaceManager(BlockSpaceManager): + """A version of BlockSpaceManager for use in environments + where block management is not required. + For example: embedding models or attention-free models like Mamba. 
This class provides the same interface as BlockSpaceManager, but its methods perform no actions or return simple values like True in specific @@ -40,7 +41,7 @@ def append_slots( seq: Sequence, num_lookahead_slots: int, ) -> List[Tuple[int, int]]: - return None # type: ignore + return [] def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: pass diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index e930f807280f0..1f0a121711db5 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -314,8 +314,9 @@ def __init__( version = "v1" if self.scheduler_config.use_v2_block_manager: version = "v2" - if self.scheduler_config.embedding_mode: - version = "embedding" + if (self.scheduler_config.embedding_mode + or self.cache_config.is_attention_free): + version = "placeholder" BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class( version) @@ -1205,7 +1206,7 @@ def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool: # async_output_proc is allowed only when we have a single sequence # in the sequence group no_single_seq = seq_group.sampling_params is None or ( - seq_group.sampling_params.best_of == 1) + seq_group.sampling_params.n == 1) return no_single_seq def schedule( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cae95d20ca23d..1b132cf76a10d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -183,6 +183,8 @@ class EngineArgs: def __post_init__(self): if self.tokenizer is None: self.tokenizer = self.model + + # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() @@ -910,6 +912,7 @@ def create_engine_config(self) -> EngineConfig: gpu_memory_utilization=self.gpu_memory_utilization, swap_space=self.swap_space, cache_dtype=self.kv_cache_dtype, + is_attention_free=model_config.is_attention_free, num_gpu_blocks_override=self.num_gpu_blocks_override, sliding_window=model_config.get_sliding_window(), enable_prefix_caching=self.enable_prefix_caching, @@ -943,13 +946,9 @@ def create_engine_config(self) -> EngineConfig: use_sliding_window = (model_config.get_sliding_window() is not None) use_spec_decode = self.speculative_model is not None - has_seqlen_agnostic_layers = ( - model_config.contains_seqlen_agnostic_layers( - parallel_config)) if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora - and not self.enable_prompt_adapter - and not has_seqlen_agnostic_layers): + and not self.enable_prompt_adapter): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models with " @@ -1001,6 +1000,8 @@ def create_engine_config(self) -> EngineConfig: disable_logprobs=self.disable_logprobs_during_spec_decoding, ) + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if self.num_scheduler_steps > 1: if speculative_config is not None: raise ValueError("Speculative decoding is not supported with " diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 510ffac6f6892..563e52a37d935 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -767,7 +767,7 @@ def add_request( Details: - Set arrival_time to the current time if it is None. - Set prompt_token_ids to the encoded prompt if it is None. - - Create `best_of` number of :class:`~vllm.Sequence` objects. + - Create `n` number of :class:`~vllm.Sequence` objects. - Create a :class:`~vllm.SequenceGroup` object from the list of :class:`~vllm.Sequence`. 
- Add the :class:`~vllm.SequenceGroup` object to the scheduler. @@ -1242,8 +1242,7 @@ def _advance_to_next_step( if seq_group_metadata.do_sample: assert len(sequence_group_outputs.samples) == 1, ( "Async output processor expects a single sample" - " (i.e sampling_params.n == 1 and no " - "sampling_params.best_of > 1)") + " (i.e sampling_params.n == 1)") sample = sequence_group_outputs.samples[0] assert len(seq_group.seqs) == 1 @@ -1612,7 +1611,6 @@ def _get_stats(self, # Metadata num_prompt_tokens_requests: List[int] = [] num_generation_tokens_requests: List[int] = [] - best_of_requests: List[int] = [] n_requests: List[int] = [] finished_reason_requests: List[str] = [] @@ -1683,8 +1681,6 @@ def _get_stats(self, for seq in seq_group.get_finished_seqs() ]) if seq_group.sampling_params is not None: - best_of_requests.append( - seq_group.sampling_params.best_of) n_requests.append(seq_group.sampling_params.n) finished_reason_requests.extend([ SequenceStatus.get_finished_reason(seq.status) @@ -1737,7 +1733,6 @@ def _get_stats(self, # Metadata num_prompt_tokens_requests=num_prompt_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests, - best_of_requests=best_of_requests, n_requests=n_requests, finished_reason_requests=finished_reason_requests, ) @@ -1824,8 +1819,6 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: seq_group.sampling_params.top_p) seq_span.set_attribute(SpanAttributes.LLM_REQUEST_MAX_TOKENS, seq_group.sampling_params.max_tokens) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_BEST_OF, - seq_group.sampling_params.best_of) seq_span.set_attribute(SpanAttributes.LLM_REQUEST_N, seq_group.sampling_params.n) seq_span.set_attribute(SpanAttributes.LLM_USAGE_NUM_SEQUENCES, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 74277cae7c8ef..42acd3ea4c94c 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -134,12 +134,6 @@ def __init__(self, labelnames: List[str], max_model_len: int): labelnames=labelnames, buckets=build_1_2_5_buckets(max_model_len), ) - self.histogram_best_of_request = self._histogram_cls( - name="vllm:request_params_best_of", - documentation="Histogram of the best_of request parameter.", - labelnames=labelnames, - buckets=[1, 2, 5, 10, 20], - ) self.histogram_n_request = self._histogram_cls( name="vllm:request_params_n", documentation="Histogram of the n request parameter.", @@ -473,8 +467,6 @@ def _log_prometheus(self, stats: Stats) -> None: self.metrics.histogram_num_generation_tokens_request, stats.num_generation_tokens_requests) self._log_histogram(self.metrics.histogram_n_request, stats.n_requests) - self._log_histogram(self.metrics.histogram_best_of_request, - stats.best_of_requests) def _log_prometheus_interval(self, prompt_throughput: float, generation_throughput: float) -> None: diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 1eccb23593408..bafd5fa1a8a82 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -49,7 +49,6 @@ class Stats: # Metadata num_prompt_tokens_requests: List[int] num_generation_tokens_requests: List[int] - best_of_requests: List[int] n_requests: List[int] finished_reason_requests: List[str] diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index eecca82cd2f7d..2bf0ce83c7607 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -130,6 +130,9 @@ def dead_error(self) -> BaseException: def from_engine_args(cls, engine_args: 
AsyncEngineArgs, usage_context: UsageContext, ipc_path: str): """Creates an MQLLMEngine from the engine arguments.""" + # Setup plugins for each process + from vllm.plugins import load_general_plugins + load_general_plugins() engine_config = engine_args.create_engine_config() @@ -282,7 +285,8 @@ def _handle_process_request(self, request: RPCProcessRequest): params=request.params, lora_request=request.lora_request, trace_headers=request.trace_headers, - prompt_adapter_request=request.prompt_adapter_request) + prompt_adapter_request=request.prompt_adapter_request, + priority=request.priority) if self.log_requests: logger.info("Added request %s.", request.request_id) diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 47de3656ca892..74ddb250ccd9e 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -62,6 +62,8 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, @staticmethod @functools.lru_cache() def _log_prompt_logprob_unsupported_warning_once(): + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid logger.warning( "Prompt logprob is not supported by multi step workers. " "(e.g., speculative decode uses multi step workers).") diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index 00d9297e41d99..cfa84077685a0 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -112,7 +112,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput, is_async: bool) -> None: sampling_params = seq_group.sampling_params - if sampling_params.best_of == 1: + if sampling_params.n == 1: # only have one output sample sample = outputs.samples[0] # only have one sequence diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 1e85167ea7619..4931195ae0e02 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -538,10 +538,12 @@ async def chat_completion_stream_generator( # any tokens that were generated but previously # matched by partial json parsing # only happens if we are NOT using guided decoding + auto_tools_called = False if tool_parser: - index = len( - tool_parser.prev_tool_call_arr) - 1 if len( - tool_parser.prev_tool_call_arr) > 0 else 0 + auto_tools_called = len( + tool_parser.prev_tool_call_arr) > 0 + index = len(tool_parser.prev_tool_call_arr + ) - 1 if auto_tools_called else 0 else: index = 0 @@ -576,9 +578,7 @@ async def chat_completion_stream_generator( delta=delta_message, logprobs=logprobs, finish_reason=output.finish_reason - if not (tool_parser - and len(tool_parser.prev_tool_call_arr)) - else "tool_calls", + if not auto_tools_called else "tool_calls", stop_reason=output.stop_reason) chunk = ChatCompletionStreamResponse( id=request_id, @@ -680,8 +680,10 @@ async def chat_completion_full_generator( else: logprobs = None - # by default, tools are not used. - tools_called = False + # In the OpenAI API the finish_reason is "tools_called" + # if the tool choice is auto and the model produced a tool + # call. 
The same is not true for named function calls + auto_tools_called = False # if auto tools are not enabled, and a named tool choice using # outlines is not being used @@ -703,7 +705,6 @@ async def chat_completion_full_generator( name=request.tool_choice.function.name, arguments=output.text)) ]) - tools_called = True # if the request doesn't use tool choice # OR specifies to not use a tool @@ -725,7 +726,10 @@ async def chat_completion_full_generator( tool_call_info = tool_parser.extract_tool_calls( output.text, request=request) - tools_called = tool_call_info.tools_called + # In the OpenAI API the finish_reason is "tools_called" + # if the tool choice is auto and the model produced a tool + # call. The same is not true for named function calls + auto_tools_called = tool_call_info.tools_called if tool_call_info.tools_called: message = ChatMessage(role=role, content=tool_call_info.content, @@ -748,7 +752,7 @@ async def chat_completion_full_generator( index=output.index, message=message, logprobs=logprobs, - finish_reason="tool_calls" if tools_called else + finish_reason="tool_calls" if auto_tools_called else output.finish_reason if output.finish_reason else "stop", stop_reason=output.stop_reason) choices.append(choice_data) diff --git a/vllm/envs.py b/vllm/envs.py index 97767bf5b5ad9..8b541e5b78c01 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -65,6 +65,7 @@ VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False + VLLM_TORCH_COMPILE_LEVEL: int = 0 def get_default_cache_root(): @@ -198,23 +199,12 @@ def get_default_config_root(): lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")), - # Internal flag to enable Dynamo graph capture - "VLLM_TEST_DYNAMO_GRAPH_CAPTURE": - lambda: int(os.environ.get("VLLM_TEST_DYNAMO_GRAPH_CAPTURE", "0")), - "VLLM_DYNAMO_USE_CUSTOM_DISPATCHER": - lambda: - (os.environ.get("VLLM_DYNAMO_USE_CUSTOM_DISPATCHER", "True").lower() in - ("true", "1")), - - # Internal flag to control whether we use custom op, - # or use the native pytorch implementation - "VLLM_TEST_COMPILE_NO_CUSTOM_OPS": - lambda: int(os.environ.get("VLLM_TEST_COMPILE_NO_CUSTOM_OPS", "0")), - # Internal flag to enable Dynamo fullgraph capture "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": lambda: bool( os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), + "VLLM_TORCH_COMPILE_LEVEL": + lambda: int(os.environ.get("VLLM_TORCH_COMPILE_LEVEL", "0")), # local rank of the process in the distributed setting, used to determine # the GPU device id diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 9ad240ef60820..e32993e0e452e 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -28,6 +28,8 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid assert self.lora_config is None, "cpu backend doesn't support LoRA" # @@ -324,6 +326,8 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: if config.dtype == torch.float16: logger.warning("float16 is not supported on CPU, casting to bfloat16.") config.dtype = torch.bfloat16 + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if not config.enforce_eager: logger.warning( "CUDA graph is not supported on CPU, fallback to the eager " @@ -334,6 +338,8 @@ 
def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: def _verify_and_get_scheduler_config( config: SchedulerConfig) -> SchedulerConfig: + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if config.chunked_prefill_enabled: logger.warning("Chunked prefill is not supported on CPU, disable it.") config.chunked_prefill_enabled = False @@ -342,6 +348,8 @@ def _verify_and_get_scheduler_config( def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig: + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if config.enable_prefix_caching: logger.warning("Prefix caching is not supported on CPU, disable it.") config.enable_prefix_caching = False diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 22adb1631d410..64387fd2fa47d 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -310,6 +310,8 @@ def _build_enc_dec_llm_inputs( encoder_prompt, encoder_prompt_ids, encoder_mm_data, _ = encoder_comps decoder_prompt, decoder_prompt_ids, decoder_mm_data, _ = decoder_comps + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid if decoder_mm_data is not None: raise ValueError( "Multi-modality decoder inputs of encoder-decoder models are " diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 91e9f55e82433..aaadca9a4d16d 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -23,6 +23,7 @@ from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.punica import PunicaWrapper from vllm.lora.utils import (from_layer, from_layer_logits_processor, + is_regex_target_modules, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models import SupportsLoRA, supports_multimodal from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -233,6 +234,8 @@ def from_local_checkpoint( # modules. unexpected_modules = [] target_modules = config["target_modules"] + if not isinstance(target_modules, list): + target_modules = [target_modules] for module in target_modules: # Compatible with more modules, # such as:layers.11.self_attn.k_proj @@ -243,8 +246,8 @@ def from_local_checkpoint( # expected_lora_modules. It is not reliable. See # https://github.com/vllm-project/vllm/pull/5909. But there's no # other better mechanism. - if unexpected_modules: - print(unexpected_modules, "modules") + if unexpected_modules and not is_regex_target_modules( + config["target_modules"], expected_lora_modules): raise ValueError( f"While loading {lora_dir}, expected" f" target modules in {expected_lora_modules}" @@ -334,7 +337,11 @@ def __init__( self.packed_modules_mapping = copy.deepcopy( self.model.packed_modules_mapping) # Used to indicate whether the model is a multimodal model - self.supports_mm: bool = supports_multimodal(self.model) + self.supports_mm: bool = ( + supports_multimodal(self.model) + # In case the model only supports LoRA for + # text modules (e.g. ChatGLM) + and hasattr(self.model, "get_mm_mapping")) self.packed_modules: Dict[str, List[str]] = {} self.modules: Dict[str, "BaseLayerWithLoRA"] = {} # Dict instead of a Set for compatibility with LRUCache. 
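[Editor's note] The `from_local_checkpoint` change above defers to `is_regex_target_modules` (added in the vllm/lora/utils.py diff that follows) when `target_modules` is a PEFT-style regular expression rather than a list of module names. The sketch below is a minimal standalone mirror of that suffix check, using plain `re`; the function name and the module list are hypothetical, for illustration only.

```python
import re
from typing import List


def regex_suffixes_are_expected(pattern: str, expected: List[str]) -> bool:
    """Pull the trailing (a|b|c) group out of a PEFT regex such as
    'model.*(q_proj|k_proj|v_proj)$' and check that every suffix is an
    expected LoRA target module."""
    match = re.search(r"\((.*?)\)\$?$", pattern)
    if not match:
        return False
    return set(match.group(1).split("|")).issubset(set(expected))


expected_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]  # illustrative
assert regex_suffixes_are_expected(r"model.*(q_proj|k_proj|v_proj)$",
                                   expected_modules)
# An unexpected suffix (here "lm_head") makes the check fail, which is what
# triggers the ValueError path in from_local_checkpoint above.
assert not regex_suffixes_are_expected(r"model.*(q_proj|lm_head)$",
                                       expected_modules)
```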
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index ee983328e2c5b..a780429f413d3 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -1,5 +1,6 @@ import os -from typing import List, Optional, Set, Tuple, Type +import re +from typing import List, Optional, Set, Tuple, Type, Union import huggingface_hub from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, @@ -113,6 +114,38 @@ def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]: raise ValueError(f"{name} is unsupported LoRA weight") +def is_regex_target_modules(load_modules: Union[str, List[str]], + expected_lora_modules: List[str]) -> bool: + """ + PEFT supports passing `target_modules` in the form of regular expressions, + such as `model.*(q_proj|k_proj|v_proj)$`. This function is mainly used to + determine whether the suffix in the regular expression is present in the + `expected_lora_modules`. + """ + + def is_valid_regex(pattern): + try: + re.compile(pattern) + return True + except re.error: + return False + + def is_subset(sub_list, full_list): + return set(sub_list).issubset(set(full_list)) + + # Similar to PEFT's processing logic, regex-related operations are only + # executed when the load_modules is a `str`. + if not isinstance(load_modules, str): + return False + + if is_valid_regex(load_modules): + match = re.search(r"\((.*?)\)\$?$", load_modules) + if match: + suffix = match.group(1).split("|") + return is_subset(suffix, expected_lora_modules) + return False + + def get_adapter_absolute_path(lora_path: str) -> str: """ Resolves the given lora_path to an absolute local path. diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 9102b5e19ebec..d0e90245ad010 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,6 +1,7 @@ import torch.nn as nn import vllm.envs as envs +from vllm.compilation.levels import CompilationLevel from vllm.platforms import current_platform from vllm.utils import is_cpu, is_hip, is_xpu @@ -55,7 +56,7 @@ def dispatch_forward(self): # NOTE(woosuk): Here we assume that vLLM was built for only one # specific backend. Currently, we do not support dynamic dispatching. 
- if envs.VLLM_TEST_COMPILE_NO_CUSTOM_OPS: + if envs.VLLM_TORCH_COMPILE_LEVEL >= CompilationLevel.INDUCTOR: return self.forward_native if is_hip(): diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json new file mode 100644 index 0000000000000..d720deb4bdd73 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json @@ -0,0 +1,173 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 7 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 128, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "192": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 128, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 8 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "6144": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2 + } +} \ No newline at end of file diff --git 
a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 0b959da79c3be..42a6a0e6b3229 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -508,7 +508,7 @@ def _random_sample( same as the length of selected_seq_groups. If the corresponding seq_group has do_sample=False, tuple contains ([], []) """ - # Find the maximum best_of value of the prompt phase requests. + # Find the maximum n value of the prompt phase requests. random_samples = random_samples.cpu() sample_idx = 0 results: SampleResultType = [] @@ -523,9 +523,9 @@ def _random_sample( num_parent_seqs = len(seq_ids) if is_prompt: # Prompt phase. - parent_ids = [0] * sampling_params.best_of + parent_ids = [0] * sampling_params.n next_token_ids = random_samples[ - sample_idx, :sampling_params.best_of].tolist() + sample_idx, :sampling_params.n].tolist() else: # Generation phase. parent_ids = list(range(num_parent_seqs)) @@ -570,7 +570,7 @@ def _beam_search_sample( is_prompt = seq_group.is_prompt seq_ids, sampling_params = seq_group.seq_ids, seq_group.sampling_params num_parent_seqs = len(seq_ids) - beam_width = sampling_params.best_of + beam_width = sampling_params.n seq_group_logprobs = logprobs[sample_idx:sample_idx + num_parent_seqs] if is_prompt: # Prompt phase. @@ -797,12 +797,11 @@ def _sample_with_torch( greedy_samples) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): - max_best_of_in_batch = 1 + max_n_in_batch = 1 for seq_group in seq_groups: if seq_group.is_prompt: sampling_params = seq_group.sampling_params - max_best_of_in_batch = max(max_best_of_in_batch, - sampling_params.best_of) + max_n_in_batch = max(max_n_in_batch, sampling_params.n) seq_groups_arg = (None if sampling_type == SamplingType.RANDOM else seq_groups) @@ -812,13 +811,13 @@ def _sample_with_torch( probs[long_sample_indices], sampling_tensors.top_ks[long_sample_indices], sampling_tensors.top_ps[long_sample_indices], - max_best_of_in_batch, + max_n_in_batch, seq_groups_arg, ) else: multinomial_samples[sampling_type] = _multinomial( probs[long_sample_indices], - max_best_of_in_batch, + max_n_in_batch, seq_groups=seq_groups_arg) if sampled_token_ids_tensor is not None: diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 5051d45dd1154..1e2857ee28cbf 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -6,7 +6,8 @@ import os import tempfile from collections import defaultdict -from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, Union +from typing import (Any, Callable, Dict, Generator, Iterable, List, Optional, + Tuple, Union) import filelock import gguf @@ -559,6 +560,38 @@ def row_parallel_weight_loader(param: torch.Tensor, return default_weight_loader(param, loaded_weight) +LoaderFunction = Callable[[torch.Tensor, torch.Tensor], torch.Tensor] + + +def sharded_weight_loader(shard_axis: int) -> LoaderFunction: + """Create a weight loader that shards the weights along the given axis""" + + def loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: + tp_rank = get_tensor_model_parallel_rank() + + shard_size = param.data.shape[shard_axis] + start_idx = tp_rank * shard_size + loaded_weight = loaded_weight.narrow(shard_axis, start_idx, shard_size) + + return default_weight_loader(param, loaded_weight) + + return loader + + +def composed_weight_loader( + loader: LoaderFunction, fn: Callable[[torch.Tensor], + 
torch.Tensor]) -> LoaderFunction: + """Create a weight loader that post-processes the weights after loading""" + + def composed_loader(param: torch.Tensor, + loaded_weight: torch.Tensor) -> None: + loader(param, loaded_weight) + param.data.copy_(fn(param)) + return + + return composed_loader + + def initialize_dummy_weights( model: torch.nn.Module, low: float = -1e-3, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 879795c0d5955..f26c9f950dd36 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,42 +1,229 @@ # coding=utf-8 # Adapted from -# https://github.com/THUDM/ChatGLM2-6B +# https://github.com/THUDM/GLM-4 """Inference-only ChatGLM model compatible with THUDM weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from argparse import Namespace +from array import array +from typing import Dict, Iterable, List, Mapping, Optional, Tuple, TypedDict import torch +from PIL import Image from torch import nn from torch.nn import LayerNorm from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, + MultiModalInputs) +from vllm.multimodal.base import MultiModalData +from vllm.multimodal.utils import cached_get_tokenizer +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, + SequenceData) from vllm.transformers_utils.configs import ChatGLMConfig -from .interfaces import SupportsLoRA, SupportsPP -from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) +from .interfaces import SupportsLoRA, SupportsMultiModal + +logger = init_logger(__name__) + + +def calculate_image_placeholder(vision_config): + return (vision_config["image_size"] // vision_config["patch_size"] // 2)**2 + + +def mm_input_mapper_for_glmv( + ctx: InputContext, + data: MultiModalData[object], +) -> Dict: + model_config = ctx.model_config + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + if tokenizer is None: + raise RuntimeError("No HuggingFace processor is available " + "to process the image object") + try: + raw_batch_data = 
tokenizer.apply_chat_template( + conversation=[{ + "role": "user", + "image": data + }], + add_generation_prompt=True, + tokenize=True, + return_tensors="pt", + return_dict=True).data + except Exception: + logger.error("Failed to process image (%s)", data) + raise + pixel_values = raw_batch_data['images'] + + return MultiModalInputs({'pixel_values': pixel_values}) + + +def merge_glm_vision_embeddings( + input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + vision_embeddings: torch.Tensor, + boi_token_id: int, + eoi_token_id: int, +) -> torch.Tensor: + + boi_positions = (input_ids == boi_token_id).nonzero(as_tuple=True)[0] + eoi_positions = (input_ids == eoi_token_id).nonzero(as_tuple=True)[0] + + mask = torch.zeros_like(input_ids, dtype=torch.bool) + + for boi_pos, eoi_pos in zip(boi_positions, eoi_positions): + assert boi_pos < eoi_pos + mask[boi_pos:eoi_pos + 1] = True + inputs_embeds[mask] = vision_embeddings.view(-1, + vision_embeddings.shape[-1]) + return inputs_embeds + + +class GLMImagePixelInputs(TypedDict): + pixel_values: torch.Tensor + """Shape: `(batch_size, num_channels, height, width)`""" + + +def get_max_glmv_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(ChatGLMConfig) + + vision_config = getattr(hf_config, 'vision_config', None) + if vision_config is None: + return 1 + elif isinstance(vision_config, dict): + return calculate_image_placeholder(vision_config) + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def dummy_data_for_glmv( + ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int] +) -> Tuple[SequenceData, Optional[MultiModalDataDict]]: + hf_config = ctx.get_hf_config(ChatGLMConfig) + vision_config = getattr(hf_config, 'vision_config', None) + + if vision_config is None: + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * seq_len) + seq_data = SequenceData(token_ids) + return seq_data, None + elif isinstance(vision_config, dict): + image_size = vision_config["image_size"] + image_placeholder_length = calculate_image_placeholder(vision_config) + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [hf_config.boi_token_id] + + [0] * image_placeholder_length + + [hf_config.eoi_token_id]) + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, + [0] * (seq_len - image_placeholder_length - 2)) + seq_data = SequenceData(token_ids) + + mm_data = { + "image": Image.new("RGB", (image_size, image_size), color=0) + } + + return seq_data, mm_data + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def find_all_positions(input_ids: List[int], target: int) -> List[int]: + return [index for index, value in enumerate(input_ids) if value == target] + + +def input_processor_for_glmv(ctx: InputContext, llm_inputs: LLMInputs): + hf_config = ctx.get_hf_config(ChatGLMConfig) + vision_config = getattr(hf_config, 'vision_config', None) + + if vision_config is None: + return llm_inputs + elif isinstance(vision_config, dict): + image_placeholder_length = calculate_image_placeholder(vision_config) + else: + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + input_ids = llm_inputs.get("prompt_token_ids") + position_ids = llm_inputs.get("position_ids") + tokenizer = cached_get_tokenizer( + ctx.model_config.model, + trust_remote_code=ctx.model_config.trust_remote_code) + + try: + raw_batch_data = tokenizer.apply_chat_template( + conversation=[{ + "role": "user", + "image": llm_inputs['multi_modal_data']["image"], + "content": llm_inputs['prompt'] 
+ }], + add_generation_prompt=True, + tokenize=True, + return_tensors="pt", + return_dict=True).data + except Exception: + logger.error("Failed to process content (%s)", llm_inputs['prompt']) + raise + input_ids = raw_batch_data['input_ids'][0].tolist() + + if position_ids is None: + position_ids = list(range(len(input_ids))) + boi_token_id = hf_config.boi_token_id + eoi_token_id = hf_config.eoi_token_id + boi_positions = find_all_positions(input_ids, boi_token_id) + eoi_positions = find_all_positions(input_ids, eoi_token_id) + + assert len(boi_positions) == len(eoi_positions) + + new_input_ids = [] + new_position_ids = [] + final_processed_position = 0 + final_processed_position = 0 + + for boi_position, eoi_position in zip(boi_positions, eoi_positions): + assert boi_position < eoi_position + new_input_ids.extend(input_ids[final_processed_position:boi_position + + 1]) + new_position_ids.extend( + list(range(final_processed_position, boi_position + 1))) + new_input_ids.extend([input_ids[boi_position + 1]] * + image_placeholder_length) + new_position_ids.extend([boi_position + 1] * image_placeholder_length) + final_processed_position = eoi_position + + new_input_ids.extend(input_ids[final_processed_position:]) + new_position_ids.extend( + list(range(final_processed_position, len(input_ids)))) + + assert len(new_input_ids) == len(new_position_ids) + + llm_inputs["prompt_token_ids"] = new_input_ids + llm_inputs["position_ids"] = new_position_ids + return llm_inputs class GLMAttention(nn.Module): def __init__( self, - config: ChatGLMConfig, + config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ): @@ -127,7 +314,7 @@ class GLMMLP(nn.Module): def __init__( self, - config: ChatGLMConfig, + config, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() @@ -170,7 +357,7 @@ class GLMBlock(nn.Module): def __init__( self, - config: ChatGLMConfig, + config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ): @@ -241,10 +428,9 @@ class GLMTransformer(nn.Module): def __init__( self, - config: ChatGLMConfig, + config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", ): super().__init__() self.post_layer_norm = config.post_layer_norm @@ -253,11 +439,10 @@ def __init__( self.num_layers = config.num_layers # Transformer layers. - self.start_layer, self.end_layer, self.layers = make_layers( - self.num_layers, - lambda prefix: GLMBlock(config, cache_config, quant_config), - prefix=f"{prefix}.layers", - ) + self.layers = nn.ModuleList([ + GLMBlock(config, cache_config, quant_config) + for i in range(self.num_layers) + ]) if self.post_layer_norm: layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm @@ -272,16 +457,16 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, ) -> torch.Tensor: - for i in range(self.start_layer, self.end_layer): + for i in range(self.num_layers): layer = self.layers[i] hidden_states = layer( hidden_states=hidden_states, position_ids=position_ids, - kv_cache=kv_caches[i - self.start_layer], + kv_cache=kv_caches[i], attn_metadata=attn_metadata, ) # Final layer norm. 
- if get_pp_group().is_last_rank and self.post_layer_norm: + if self.post_layer_norm: hidden_states = self.final_layernorm(hidden_states) return hidden_states @@ -291,14 +476,17 @@ class ChatGLMModel(nn.Module): def __init__( self, - config: ChatGLMConfig, + config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() + self.config = config + self.embedding = VocabParallelEmbedding(config.padded_vocab_size, - config.hidden_size) + config.hidden_size, + quant_config=quant_config) self.num_layers = config.num_layers self.multi_query_group_num = config.multi_query_group_num @@ -308,37 +496,73 @@ def __init__( self.output_layer = ParallelLMHead(config.padded_vocab_size, config.hidden_size, quant_config=quant_config) - self.make_empty_intermediate_tensors = ( - make_empty_intermediate_tensors_factory(["hidden_states"], - config.hidden_size)) + + vision_config_flag = getattr(config, 'vision_config', None) + if vision_config_flag is not None: + self.vision_config = Namespace(**config.vision_config) + self.vision = EVA2CLIPModel(self.config, quant_config) + else: + self.vision = None + + def _parse_and_validate_image_input( + self, **kwargs: object) -> GLMImagePixelInputs: + + pixel_values = kwargs.pop("pixel_values", None) + if pixel_values is not None and self.vision is not None: + if isinstance(pixel_values, torch.Tensor): + if pixel_values.ndim > 2: + pixel_values = torch.concat(list(pixel_values)) + elif isinstance(pixel_values, list): + return torch.concat(pixel_values) + else: + raise TypeError("""pixel_values must be a torch.Tensor + or a list of torch.Tensor + """) + return GLMImagePixelInputs(pixel_values=pixel_values) def forward( self, input_ids: torch.Tensor, - position_ids: torch.Tensor, + positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors], - ) -> Union[torch.Tensor, IntermediateTensors]: - if get_pp_group().is_first_rank: - inputs_embeds = self.embedding(input_ids) - else: - inputs_embeds = intermediate_tensors["hidden_states"] + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> torch.Tensor: + + inputs_embeds = self.embedding(input_ids) + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input["pixel_values"] is not None: + pixel_values = image_input["pixel_values"].to( + dtype=inputs_embeds.dtype) + image_embeds = self.vision(pixel_values) + + boi_token_id = self.config.boi_token_id + eoi_token_id = self.config.eoi_token_id + + inputs_embeds = merge_glm_vision_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + vision_embeddings=image_embeds, + boi_token_id=boi_token_id, + eoi_token_id=eoi_token_id) # Run encoder. 
hidden_states = self.encoder( hidden_states=inputs_embeds, - position_ids=position_ids, + position_ids=positions, kv_caches=kv_caches, attn_metadata=attn_metadata, ) - - if not get_pp_group().is_last_rank: - return IntermediateTensors({"hidden_states": hidden_states}) return hidden_states -class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): +@MULTIMODAL_REGISTRY.register_image_input_mapper(mm_input_mapper_for_glmv) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_glmv_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_glmv) +@INPUT_REGISTRY.register_input_processor(input_processor_for_glmv) +class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): packed_modules_mapping = { "query_key_value": ["query_key_value"], "dense_h_to_4h": ["dense_h_to_4h"] @@ -356,6 +580,7 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, config: ChatGLMConfig, + multimodal_config: MultiModalConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, @@ -364,6 +589,7 @@ def __init__( self.config = config self.lora_config = lora_config + self.multimodal_config = multimodal_config self.quant_config = quant_config self.max_position_embeddings = getattr(config, "max_sequence_length", @@ -375,19 +601,16 @@ def __init__( self.lm_head = self.transformer.output_layer self.logits_processor = LogitsProcessor(config.padded_vocab_size) self.sampler = Sampler() - self.make_empty_intermediate_tensors = ( - self.transformer.make_empty_intermediate_tensors) - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, **kwargs) return hidden_states def compute_logits( @@ -408,8 +631,24 @@ def sample( return next_tokens def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # Merge two ColumnParallelLinear into one MergedColumnParallelLinear + merged_weights_dict: Dict[str, Dict[str, Optional[torch.Tensor]]] = { + "transformer.vision.linear_proj.merged_proj.weight": { + "transformer.vision.linear_proj.gate_proj.weight": None, + "transformer.vision.linear_proj.dense_h_to_4h.weight": None, + } + } + params_dict = dict(self.named_parameters(remove_duplicate=False)) for name, loaded_weight in weights: + is_weight_to_be_merge = False + for _, merged_weight_dict in merged_weights_dict.items(): + if name in merged_weight_dict: + assert merged_weight_dict[name] is None + merged_weight_dict[name] = loaded_weight + is_weight_to_be_merge = True + if is_weight_to_be_merge: + continue if "rotary_pos_emb.inv_freq" in name: continue if "word_embeddings" in name: @@ -417,9 +656,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: continue - if is_pp_missing_parameter(name, self): - continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + + for combined_name, merged_weight_dict in merged_weights_dict.items(): + if combined_name in params_dict: + param = params_dict[combined_name] + combined_weight = torch.cat(list(merged_weight_dict.values()), + dim=0) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, combined_weight) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index c442b6d2e7c96..bcb03ef55ef94 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -21,6 +21,7 @@ from transformers import Gemma2Config from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger @@ -238,6 +239,13 @@ def forward( return hidden_states, residual +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + "positions": 0, + "inputs_embeds": 0, + "intermediate_tensors": 0, + }) class Gemma2Model(nn.Module): def __init__( diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py new file mode 100644 index 0000000000000..3213a8b29a104 --- /dev/null +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -0,0 +1,298 @@ +# coding=utf-8 +# Adapted from +# https://github.com/THUDM/GLM-4 +"""Inference-only GLM-4v model visual encoder compatible with THUDM weights.""" +from argparse import Namespace +from typing import Optional + +import torch +from torch import nn +from torch.nn import LayerNorm + +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + + +class PatchEmbedding(nn.Module): + + def __init__(self, config): + super().__init__() + self.proj = nn.Conv2d(config.in_channels, + config.hidden_size, + kernel_size=config.patch_size, + stride=config.patch_size) + self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.position_embedding = nn.Embedding(config.num_positions, + config.hidden_size) + + def forward(self, images: torch.Tensor) -> torch.Tensor: + """ + Parameters: + images : torch.Tensor + Input image tensor with shape (B, C, H, W) + + Returns: + torch.Tensor + Transformed tensor with shape (B, L, D) + """ + images = images.to(self.proj.weight.device) + x = self.proj(images) + x = x.flatten(2).transpose(1, 2) + cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) + x = torch.cat((cls_token, x), dim=1) + x += self.position_embedding.weight.unsqueeze(0) + return x + + +class Attention(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.hidden_size = config.hidden_size + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_rank = config.num_heads // self.tp_size + self.head_dim = config.hidden_size // config.num_heads + 
self.scale = self.head_dim**-0.5 + + self.query_key_value = QKVParallelLinear( + config.hidden_size, + self.head_dim, + config.num_heads, + quant_config=quant_config, + ) + self.dense = RowParallelLinear( + config.hidden_size, + config.hidden_size, + quant_config=quant_config, + ) + + self.output_dropout = torch.nn.Dropout(config.dropout_prob) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, L, _ = x.shape + qkv, _ = self.query_key_value(x) # B, L, 3 * H * D + q, k, v = qkv.chunk(3, dim=-1) + q = q.reshape(B, L, self.num_heads_per_rank, + self.head_dim).permute(0, 2, 1, 3) # B, H, L, D + k = k.reshape(B, L, self.num_heads_per_rank, + self.head_dim).permute(0, 2, 1, 3) # B, H, L, D + v = v.reshape(B, L, self.num_heads_per_rank, + self.head_dim).permute(0, 2, 1, 3) # B, H, L, D + + out = torch.nn.functional.scaled_dot_product_attention(q, + k, + v, + attn_mask=None, + dropout_p=0., + is_causal=False) + + output, _ = self.dense(out.transpose(1, 2).view(B, L, -1)) + output = self.output_dropout(output) + return output + + +class MLP(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + quant_config=quant_config, + ) + self.fc2 = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + quant_config=quant_config, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, _ = self.fc1(x) + x = self.activation_fn(x) + x, _ = self.fc2(x) + return x + + +class TransformerLayer(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.input_layernorm = LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.attention = Attention(config, quant_config=quant_config) + self.mlp = MLP(config, quant_config=quant_config) + self.post_attention_layernorm = LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states): + attention_input = hidden_states + attention_output = self.input_layernorm( + self.attention(attention_input)) + hidden_states = attention_input + attention_output + mlp_input = hidden_states + mlp_output = self.post_attention_layernorm(self.mlp(mlp_input)) + output = mlp_input + mlp_output + return output + + +class Transformer(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.layers = nn.ModuleList([ + TransformerLayer(config, quant_config=quant_config) + for _ in range(config.num_hidden_layers) + ]) + + def forward(self, hidden_states): + for layer_module in self.layers: + hidden_states = layer_module(hidden_states) + return hidden_states + + +class GLU(nn.Module): + + def __init__( + self, + config, + in_features, + quant_config: Optional[QuantizationConfig] = None, + ): + """ + The original implementation is the same as: + ```python + self.dense_h_to_4h = ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + bias=False, + quant_config=quant_config + ) + + self.gate_proj = ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + bias=False, + quant_config=quant_config + ) + ``` + ``` + gate_proj_output, _ = self.gate_proj(x) + dense_h_to_4h_output, _ = self.dense_h_to_4h(x) + x = torch.cat([gate_proj_output, dense_h_to_4h_output], dim=-1) + ``` + + We merge two ColumnParallelLinear into one 
MergedColumnParallelLinear: + ``` + self.merged_proj = MergedColumnParallelLinear( + config.hidden_size, + [config.ffn_hidden_size] * 2, + bias=False, + quant_config=quant_config + ) + ``` + ``` + x, _ = self.merged_proj(x) + ``` + """ + super().__init__() + self.linear_proj = ReplicatedLinear(in_features, + config.hidden_size, + bias=False, + quant_config=quant_config) + self.norm1 = nn.LayerNorm(config.hidden_size) + self.act1 = nn.GELU() + self.act2 = SiluAndMul() + + self.merged_proj = MergedColumnParallelLinear( + config.hidden_size, [config.ffn_hidden_size] * 2, + bias=False, + quant_config=quant_config) + + self.dense_4h_to_h = RowParallelLinear(config.ffn_hidden_size, + config.hidden_size, + bias=False, + quant_config=quant_config) + + def forward(self, x): + x, _ = self.linear_proj(x) + x = self.act1(self.norm1(x)) + x, _ = self.merged_proj(x) + x = self.act2(x) + x, _ = self.dense_4h_to_h(x) + return x + + +class EVA2CLIPModel(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + vision_config = Namespace(**config.vision_config) + self.patch_embedding = PatchEmbedding(vision_config) + self.transformer = Transformer(vision_config, + quant_config=quant_config) + self.linear_proj = GLU(config, + in_features=config.hidden_size, + quant_config=quant_config) + self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, + out_channels=config.hidden_size, + kernel_size=2, + stride=2) + self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.scaling_factor = vision_config.scaling_factor + + def forward(self, images: torch.Tensor) -> torch.Tensor: + """ + Parameters: + images : torch.Tensor + Input image tensor with shape (B, C, H, W) + + Returns: + torch.Tensor + Transformed tensor with shape (B, L, D) + """ + x = self.patch_embedding(images) + x = self.transformer(x) + x = x[:, 1:] + + b, s, h = x.shape + grid_size = int(s**0.5) + x = x.view(b, grid_size, grid_size, h).permute(0, 3, 1, 2) + x = self.conv(x) + + x = x.flatten(2).transpose(1, 2) + x = self.linear_proj(x) + boi = self.boi.expand(x.shape[0], -1, -1) + eoi = self.eoi.expand(x.shape[0], -1, -1) + x = torch.cat((boi, x, eoi), dim=1) + x = x / self.scaling_factor + return x diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 278dfc52078ef..dcead65115132 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -271,7 +271,7 @@ class HasInnerState(Protocol): """ A flag that indicates this model has inner state. Models that has inner state usually need access to the scheduler_config - for max_num_seqs ,etc... (Currently only used by Jamba) + for max_num_seqs, etc. True for e.g. both Mamba and Jamba. """ def __init__(self, @@ -307,3 +307,46 @@ def has_inner_state( return isinstance(model, _HasInnerStateType) return isinstance(model, HasInnerState) + + +@runtime_checkable +class IsAttentionFree(Protocol): + """The interface required for all models like Mamba that lack attention, + but do have state whose size is constant wrt the number of tokens.""" + + is_attention_free: ClassVar[Literal[True]] = True + """ + A flag that indicates this model has no attention. + Used for block manager and attention backend selection. + True for Mamba but not Jamba. + """ + + def __init__(self) -> None: + ... 
+ + +@runtime_checkable +class _IsAttentionFreeType(Protocol): + is_attention_free: ClassVar[Literal[True]] + + def __init__(self) -> None: + ... + + +@overload +def is_attention_free(model: object) -> TypeIs[IsAttentionFree]: + ... + + +@overload +def is_attention_free(model: Type[object]) -> TypeIs[Type[IsAttentionFree]]: + ... + + +def is_attention_free( + model: Union[Type[object], object] +) -> Union[TypeIs[Type[IsAttentionFree]], TypeIs[IsAttentionFree]]: + if isinstance(model, type): + return isinstance(model, _IsAttentionFreeType) + + return isinstance(model, IsAttentionFree) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 72ecbe95b83ef..90e8d01f0ad61 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,19 +1,17 @@ # coding=utf-8 """Inference-only Jamba model.""" from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Tuple import torch from torch import nn -from torch.nn.parameter import Parameter from transformers import JambaConfig from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.attention.layer import Attention from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig -from vllm.distributed import (get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) +from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -30,7 +28,9 @@ from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + composed_weight_loader, default_weight_loader, sharded_weight_loader) +from vllm.model_executor.models.mamba_cache import MambaCacheManager, MambaCacheParams from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import IntermediateTensors @@ -41,19 +41,6 @@ KVCache = Tuple[torch.Tensor, torch.Tensor] - -@dataclass -class MambaCacheParams: - conv_state: torch.Tensor = torch.Tensor() - ssm_state: torch.Tensor = torch.Tensor() - state_indices_tensor: torch.Tensor = torch.Tensor() - - def at_layer_idx(self, layer_idx): - return MambaCacheParams(self.conv_state[layer_idx], - self.ssm_state[layer_idx], - self.state_indices_tensor) - - # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer class JambaMambaMixer(nn.Module): """ @@ -104,16 +91,6 @@ def __init__(self, config: JambaConfig): bias=True, skip_bias_add=True) - def weight_loader(param: Parameter, loaded_weight: torch.Tensor): - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() - param.data.copy_( - loaded_weight.data.split(loaded_weight.shape[0] // tp_size, - dim=0)[tp_rank]) - - def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor): - weight_loader(param, -torch.exp(loaded_weight.float())) - tp_size = get_tensor_model_parallel_world_size() self.A = nn.Parameter( torch.empty( @@ -123,8 +100,10 @@ def A_weight_loader(param: Parameter, loaded_weight: 
torch.Tensor): )) self.D = nn.Parameter(torch.ones(self.intermediate_size // tp_size)) - set_weight_attrs(self.D, {"weight_loader": weight_loader}) - set_weight_attrs(self.A, {"weight_loader": A_weight_loader}) + set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)}) + a_weight_loader = composed_weight_loader( + sharded_weight_loader(0), lambda x: -torch.exp(x.float())) + set_weight_attrs(self.A, {"weight_loader": a_weight_loader}) self.out_proj = RowParallelLinear( self.intermediate_size, @@ -572,11 +551,8 @@ def __init__( if not lora_config else lora_config.lora_vocab_padding_size, ) # Used to track and store by the Mamba cache between steps. - self.mamba_cache: Tuple[torch.Tensor, torch.Tensor] = tuple() - # Maps between the request id and a dict that maps between the seq_id - # and its index inside the self.mamba_cache - self.cache_indices_mapping: Dict[str, Dict[int, int]] = {} - self.free_cache_indices = list(range(self._get_max_batch_size())) + self.mamba_cache: Optional[MambaCacheManager] = None + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) self.sampler = Sampler() @@ -588,117 +564,39 @@ def forward(self, attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, **kwargs): - if not self.mamba_cache: - self._prepare_mamba_cache() - - if "seqlen_agnostic_capture_inputs" not in kwargs: - # We get here only on Prefill/Eager mode runs - request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"] - finished_requests_ids = kwargs["finished_requests_ids"] - state_indices = self._release_finished_and_prepare_mamba_cache( - finished_requests_ids, request_ids_to_seq_ids) - state_indices_tensor = torch.as_tensor(state_indices, - dtype=torch.int32, - device="cuda") - mamba_cache = self.mamba_cache - else: - # CUDA graph capturing runs - (mamba_cache, - state_indices_tensor) = kwargs["seqlen_agnostic_capture_inputs"] + if self.mamba_cache is None: + max_batch_size = (_get_graph_batch_size( + self.scheduler_config.max_num_seqs) if self.scheduler_config + else max(_BATCH_SIZES_TO_CAPTURE) + 2) + + layers_type = self.config.layers_block_type + num_mamba_layers = sum( + [layer_type == "mamba" for layer_type in layers_type]) + + self.mamba_cache = MambaCacheManager( + self.lm_head.weight.dtype, num_mamba_layers, max_batch_size, + *self._get_mamba_cache_shape()) - mamba_cache_params = MambaCacheParams(mamba_cache[0], mamba_cache[1], - state_indices_tensor) + mamba_cache_tensors, state_indices_tensor= self.mamba_cache.current_run_tensors(input_ids, attn_metadata, **kwargs) + + mamba_cache_params = MambaCacheParams( + mamba_cache_tensors[0], + mamba_cache_tensors[1], + state_indices_tensor + ) hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata, mamba_cache_params) return hidden_states - def _copy_mamba_cache(self, from_index: int, to_index: int): - assert len(self.mamba_cache) > 0 - for cache_t in self.mamba_cache: - cache_t[:, to_index].copy_(cache_t[:, from_index], - non_blocking=True) - - def _assign_seq_id_to_cache_index(self, cur_rid: str, seq_id: int, - finished_requests_ids) -> int: - """ - Assign (req_id,seq_id) pair to a `destination_index` index, if - already occupied, move the occupying index to a free index. 
- """ - if cur_rid in finished_requests_ids: - # set as pad, do not allocate destination index - return PAD_SLOT_ID - elif cur_rid not in self.cache_indices_mapping: - destination_index = self.free_cache_indices.pop() - self.cache_indices_mapping[cur_rid] = {seq_id: destination_index} - return destination_index - elif seq_id not in (seq_ids2indices := - self.cache_indices_mapping[cur_rid]): - # parallel sampling , where n > 1, assume prefill have - # already happened, so we copy the - # existing cache into the siblings seq_ids caches - index_exists = next(iter(seq_ids2indices.values())) - # case of decoding n>1, copy prefill cache to decoding indices - destination_index = self.free_cache_indices.pop() - self._copy_mamba_cache(from_index=index_exists, - to_index=destination_index) - self.cache_indices_mapping[cur_rid][seq_id] = destination_index - return destination_index - else: - # already exists - return self.cache_indices_mapping[cur_rid][seq_id] - - def _prepare_current_run_mamba_cache( - self, request_ids_to_seq_ids: Dict[str, list[int]], - finished_requests_ids: List[str]) -> List[int]: - return [ - self._assign_seq_id_to_cache_index(req_id, seq_id, - finished_requests_ids) - for req_id, seq_ids in request_ids_to_seq_ids.items() - for seq_id in seq_ids - ] - - def _release_finished_and_prepare_mamba_cache( - self, finished_requests_ids, request_ids_to_seq_ids) -> List[int]: - self._release_mamba_cache(finished_requests_ids) - return self._prepare_current_run_mamba_cache(request_ids_to_seq_ids, - finished_requests_ids) - def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): - """ - Copy the relevant state_indices into the CUDA graph input buffer - """ - _, input_state_indices_buffer = input_buffers[ - "seqlen_agnostic_capture_inputs"] - state_indices = self._release_finished_and_prepare_mamba_cache( - kwargs["finished_requests_ids"], kwargs["request_ids_to_seq_ids"]) - cuda_graph_pad_len = input_state_indices_buffer.shape[0] - len( - state_indices) - state_indices.extend([PAD_SLOT_ID] * cuda_graph_pad_len) - - input_state_indices_buffer.copy_( - torch.as_tensor(state_indices, dtype=torch.int32, device="cuda")) - - def get_seqlen_agnostic_capture_inputs(self, batch_size): - """ - Provide the CUDA graph capture runs with a state_indices buffer. - will be used during the CUDA graph decode runs. 
- """ - state_indices_tensor = torch.as_tensor([PAD_SLOT_ID] * batch_size, - dtype=torch.int32, - device="cuda") - return (self.mamba_cache, state_indices_tensor) - - def _release_mamba_cache(self, finished_seq_groups_req_ids: List[str]): - for req_id in finished_seq_groups_req_ids: - if req_id in self.cache_indices_mapping: - for seq_id in self.cache_indices_mapping[req_id]: - self.free_cache_indices.append( - self.cache_indices_mapping[req_id][seq_id]) - self.cache_indices_mapping.pop(req_id) + return self.mamba_cache.copy_inputs_before_cuda_graphs( + input_buffers, **kwargs) + + def get_seqlen_agnostic_capture_inputs(self, batch_size: int): + return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self - ) -> Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]: + self) -> Tuple[Tuple[int, int], Tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size conv_state_shape = ( @@ -706,33 +604,11 @@ def _get_mamba_cache_shape( self.config.mamba_d_conv - 1, ) temporal_state_shape = ( - self.config.mamba_expand * self.config.hidden_size // world_size, + self.config.mamba_expand * hidden_size // world_size, self.config.mamba_d_state, ) return conv_state_shape, temporal_state_shape - def _get_max_batch_size(self): - return (_get_graph_batch_size(self.scheduler_config.max_num_seqs) - if self.scheduler_config else max(_BATCH_SIZES_TO_CAPTURE) + 2) - - def _prepare_mamba_cache(self): - dtype = self.lm_head.weight.dtype - layers_type = self.config.layers_block_type - mamba_layers = sum( - [layer_type == "mamba" for layer_type in layers_type]) - max_batch_size = self._get_max_batch_size() - conv_state_shape, temporal_state_shape = self._get_mamba_cache_shape() - assert conv_state_shape is not None and temporal_state_shape is not None - - self.mamba_cache = (torch.empty(size=(mamba_layers, max_batch_size) + - conv_state_shape, - dtype=dtype, - device="cuda"), - torch.empty(size=(mamba_layers, max_batch_size) + - temporal_state_shape, - dtype=dtype, - device="cuda")) - def compute_logits( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2a79a9edf2111..ad5cfcc44022f 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -28,6 +28,7 @@ from transformers import LlamaConfig from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) @@ -265,6 +266,13 @@ def forward( return hidden_states, residual +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + "positions": 0, + "inputs_embeds": 0, + "intermediate_tensors": 0, + }) class LlamaModel(nn.Module): def __init__( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index a3acb93dc3c11..864b9ff66a84e 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -365,6 +365,8 @@ def forward( input_ids = None inputs_embeds = None else: + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent image_input = self._parse_and_validate_image_input(**kwargs) if image_input is not None: @@ -375,10 +377,10 @@ def forward( inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, vision_embeddings, 
self.config.image_token_index) - - input_ids = None else: - inputs_embeds = None + inputs_embeds = self.language_model.model.get_input_embeddings( + input_ids) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py new file mode 100644 index 0000000000000..2c5d83b9664aa --- /dev/null +++ b/vllm/model_executor/models/mamba.py @@ -0,0 +1,493 @@ +# coding=utf-8 +"""PyTorch MAMBA model.""" +from dataclasses import dataclass +from typing import Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers import MambaConfig + +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( + causal_conv1d_fn, causal_conv1d_update) +from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( + selective_scan_fn, selective_state_update) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + composed_weight_loader, default_weight_loader, sharded_weight_loader) +from vllm.model_executor.models.interfaces import (HasInnerState, + IsAttentionFree) +from vllm.model_executor.models.mamba_cache import ( + MambaCacheManager, + MambaCacheParams, +) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.utils import set_weight_attrs +from vllm.sequence import IntermediateTensors +from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE, + _get_graph_batch_size) + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer +class MambaMixer(nn.Module): + """ + Compute ∆, A, B, C, and D the state space parameters and compute + the `contextualized_states`. A, D are input independent + (see Mamba paper [1] Section 3.5.2 "Interpretation of A" + for why A isn't selective) ∆, B, C are input-dependent + (this is a key difference between Mamba and the linear time + invariant S4, and is why Mamba is called + **selective** state spaces) + """ + + def __init__(self, config: MambaConfig, layer_idx): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + self.ssm_state_size = config.state_size + self.conv_kernel_size = config.conv_kernel + self.intermediate_size = config.intermediate_size + self.time_step_rank = int(config.time_step_rank) + + self.conv1d = ColumnParallelLinear( + input_size=self.conv_kernel_size, + output_size=self.intermediate_size, + bias=config.use_conv_bias, + ) + # unsqueeze to fit conv1d weights shape into the linear weights shape. 
+ # Can't do this in `weight_loader` since it already exists in + # `ColumnParallelLinear` and `set_weight_attrs` + # doesn't allow to override it + self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + + self.in_proj = MergedColumnParallelLinear(self.hidden_size, + [self.intermediate_size] * 2, + bias=config.use_bias) + # selective projection used to make dt, B and C input dependent + self.x_proj = RowParallelLinear( + self.intermediate_size, + self.time_step_rank + self.ssm_state_size * 2, + bias=False, + ) + # time step projection (discretization) - + # In the forward we need to apply dt_proj without the bias, + # as the bias is added in the selective scan kernel. + self.dt_proj = ColumnParallelLinear(self.time_step_rank, + self.intermediate_size, + bias=True, + skip_bias_add=True) + + tp_size = get_tensor_model_parallel_world_size() + self.A = nn.Parameter( + torch.empty( + self.intermediate_size // tp_size, + self.ssm_state_size, + dtype=torch.float32, + )) + self.D = nn.Parameter(torch.ones(self.intermediate_size // tp_size)) + + set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)}) + a_weight_loader = composed_weight_loader( + sharded_weight_loader(0), lambda x: -torch.exp(x.float())) + set_weight_attrs(self.A, {"weight_loader": a_weight_loader}) + + self.out_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + bias=config.use_bias, + input_is_parallel=True, + ) + self.activation = config.hidden_act + + def forward(self, hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + mamba_cache_params: MambaCacheParams): + + # 1. Gated MLP's linear projection + projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) + hidden_states, gate = projected_states.chunk(2, dim=-2) + + # 2. Convolution sequence transformation + conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), + self.conv1d.weight.size(2)) + + if attn_metadata.query_start_loc is not None \ + and attn_metadata.context_lens_tensor is not None: + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + hidden_states = causal_conv1d_fn( + hidden_states, + conv_weights, + self.conv1d.bias, + activation=self.activation, + conv_states=mamba_cache_params.conv_state, + has_initial_state=attn_metadata.context_lens_tensor > 0, + cache_indices=mamba_cache_params.state_indices_tensor, + query_start_loc=attn_metadata.query_start_loc) + else: + hidden_states = causal_conv1d_update( + hidden_states.transpose(0, 1), + mamba_cache_params.conv_state, + conv_weights, + self.conv1d.bias, + self.activation, + conv_state_indices=mamba_cache_params.state_indices_tensor) + hidden_states = hidden_states.transpose(0, 1) + + # 3. State Space Model sequence transformation + # 3.a. input varying initialization of time_step, B and C + ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0] + + time_step, B, C = torch.split( + ssm_parameters, + [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], + dim=-1, + ) + + # Note that Jamba normalizes B, C, and time_step here but Mamba doesn't. 
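+        # 3.b. discretize time_step: `dt_proj` projects the low-rank dt
+        # (time_step_rank) back up to intermediate_size; softplus is applied
+        # inside the scan kernels below (delta_softplus / dt_softplus).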
+ + discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1) + # 3.c perform the recurrence y ← SSM(A, B, C)(x) + time_proj_bias = (self.dt_proj.bias.float() if hasattr( + self.dt_proj, "bias") else None) + + if attn_metadata.query_start_loc is not None \ + and attn_metadata.context_lens_tensor is not None: + scan_outputs = selective_scan_fn( + hidden_states, + mamba_cache_params.ssm_state, + discrete_time_step, + self.A, + B.transpose(-2, -1), + C.transpose(-2, -1), + self.D.float(), + gate, + time_proj_bias, + delta_softplus=True, + cache_indices=mamba_cache_params.state_indices_tensor, + has_initial_state=attn_metadata.context_lens_tensor > 0, + query_start_loc=attn_metadata.query_start_loc) + else: + scan_outputs = selective_state_update( + mamba_cache_params.ssm_state, + hidden_states.transpose(0, 1), + discrete_time_step.transpose(0, 1), + self.A, + B, + C, + self.D, + gate.transpose(0, 1), + time_proj_bias, + dt_softplus=True, + state_batch_indices=mamba_cache_params.state_indices_tensor) + scan_outputs = scan_outputs.transpose(0, 1) + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_outputs.transpose(-2, + -1))[0] + return contextualized_states + + +class MambaMLP(nn.Module): + + def __init__( + self, + config: MambaConfig, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + hidden_size = config.hidden_size + intermediate_size = config.intermediate_size + hidden_act = config.hidden_act + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class MambaDecoderLayer(nn.Module): + + def __init__(self, + config: MambaConfig, + layer_idx: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__() + self.layer_idx = layer_idx + self.config = config + self.mixer = MambaMixer(config, layer_idx) + + self.feed_forward = MambaMLP(config, quant_config=quant_config) + self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.pre_ff_layernorm = RMSNorm(config.hidden_size, + eps=config.layer_norm_epsilon) + + def forward( + self, + hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + mamba_cache_params: MambaCacheParams, + ): + if residual is None: + residual = hidden_states + hidden_states = self.norm(hidden_states) + else: + hidden_states, residual = self.norm(hidden_states, residual) + + hidden_states = self.mixer( + hidden_states, + attn_metadata, + mamba_cache_params + ) + # Fully Connected + hidden_states, residual = self.pre_ff_layernorm( + hidden_states, residual) + hidden_states = self.feed_forward(hidden_states) + return hidden_states, residual + + +class MambaModel(nn.Module): + + def __init__( + self, + config: MambaConfig, + quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + + self.embeddings = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + + decoder_layers = [] + for i in range(config.num_hidden_layers): + decoder_layers.append( + MambaDecoderLayer(config, + layer_idx=i, + cache_config=cache_config, + quant_config=quant_config)) + self.layers = nn.ModuleList(decoder_layers) + self.norm_f = RMSNorm(config.hidden_size, + eps=config.layer_norm_epsilon) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + attn_metadata: AttentionMetadata, + mamba_cache_params: MambaCacheParams, + ) -> torch.Tensor: + + hidden_states = self.embeddings(input_ids) + residual = None + + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions=positions, + hidden_states=hidden_states, + attn_metadata=attn_metadata, + residual=residual, + mamba_cache_params=mamba_cache_params.at_layer_idx(i) + ) + hidden_states, _ = self.norm_f(hidden_states, residual) + + return hidden_states + + +class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "embed_tokens", + "lm_head", + ] + embedding_modules = { + "embeddings": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + + def __init__( + self, + config: MambaConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + 
scheduler_config: Optional[SchedulerConfig] = None,
+    ) -> None:
+        assert not cache_config.enable_prefix_caching, \
+            "Mamba does not support prefix caching"
+
+        super().__init__()
+        self.config = config
+        self.scheduler_config = scheduler_config
+        self.backbone = MambaModel(config,
+                                   cache_config=cache_config,
+                                   quant_config=quant_config,
+                                   lora_config=lora_config)
+        self.unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+
+        self.lm_head = self.backbone.embeddings
+
+        # Used to track and store the Mamba cache between steps.
+        self.mamba_cache: Optional[MambaCacheManager] = None
+
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size)
+        self.sampler = Sampler()
+
+    def forward(self,
+                input_ids: torch.Tensor,
+                positions: torch.Tensor,
+                kv_caches: List[KVCache],
+                attn_metadata: AttentionMetadata,
+                intermediate_tensors: Optional[IntermediateTensors] = None,
+                **kwargs):
+        if self.mamba_cache is None:
+            max_batch_size = (_get_graph_batch_size(
+                self.scheduler_config.max_num_seqs) if self.scheduler_config
+                              else max(_BATCH_SIZES_TO_CAPTURE) + 2)
+            self.mamba_cache = MambaCacheManager(
+                self.lm_head.weight.dtype, self.config.num_hidden_layers,
+                max_batch_size, *self._get_mamba_cache_shape())
+
+        mamba_cache_tensors, state_indices_tensor = (
+            self.mamba_cache.current_run_tensors(input_ids, attn_metadata,
+                                                 **kwargs))
+
+        mamba_cache_params = MambaCacheParams(mamba_cache_tensors[0],
+                                              mamba_cache_tensors[1],
+                                              state_indices_tensor)
+
+        # Mamba has no KV cache; `kv_caches` is accepted only for interface
+        # compatibility with the model runner and is not used here.
+        hidden_states = self.backbone(input_ids, positions, attn_metadata,
+                                      mamba_cache_params)
+
+        return hidden_states
+
+    def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
+        return self.mamba_cache.copy_inputs_before_cuda_graphs(
+            input_buffers, **kwargs)
+
+    def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
+        return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
+
+    def _get_mamba_cache_shape(
+            self) -> Tuple[Tuple[int, int], Tuple[int, int]]:
+        world_size = get_tensor_model_parallel_world_size()
+        conv_state_shape = (
+            self.config.intermediate_size // world_size,
+            self.config.conv_kernel - 1,
+        )
+        temporal_state_shape = (
+            self.config.intermediate_size // world_size,
+            self.config.state_size,
+        )
+        return conv_state_shape, temporal_state_shape
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: Optional[torch.Tensor],
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if "A_log" in name:
+                name = name.replace("A_log", "A")
+
+            if ".self_attn." in name:
+                name = name.replace(".self_attn", "")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
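+                # ("Extra" bias here means a bias tensor present in the
+                # checkpoint for a module that this model defines without
+                # bias; such tensors are skipped rather than treated as an
+                # error.)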
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py new file mode 100644 index 0000000000000..8cc8fc39e51ec --- /dev/null +++ b/vllm/model_executor/models/mamba_cache.py @@ -0,0 +1,164 @@ +from dataclasses import dataclass +from typing import Dict, List, Optional + +import torch + +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.attention.backends.utils import PAD_SLOT_ID + +@dataclass +class MambaCacheParams: + conv_state: torch.Tensor = torch.Tensor() + ssm_state: torch.Tensor = torch.Tensor() + state_indices_tensor: torch.Tensor = torch.Tensor() + + def at_layer_idx(self, layer_idx): + return MambaCacheParams(self.conv_state[layer_idx], + self.ssm_state[layer_idx], + self.state_indices_tensor) + + +class MambaCacheManager: + + def __init__(self, dtype, num_mamba_layers, max_batch_size, + conv_state_shape, temporal_state_shape): + + conv_state = torch.empty(size=(num_mamba_layers, max_batch_size) + + conv_state_shape, + dtype=dtype, + device="cuda") + temporal_state = torch.empty(size=(num_mamba_layers, max_batch_size) + + temporal_state_shape, + dtype=dtype, + device="cuda") + + self.mamba_cache = (conv_state, temporal_state) + + # Maps between the request id and a dict that maps between the seq_id + # and its index inside the self.mamba_cache + self.mamba_cache_indices_mapping: Dict[str, Dict[int, int]] = {} + self.free_cache_indices = list(range(max_batch_size)) + + def current_run_tensors(self, input_ids: torch.Tensor, + attn_metadata: AttentionMetadata, **kwargs): + """ + Return the tensors for the current run's conv and ssm state. 
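+        On eager/prefill runs, finished requests are first released and
+        cache slots are assigned for the current batch; during CUDA graph
+        capture, the pre-allocated buffers passed in via
+        `seqlen_agnostic_capture_inputs` are returned instead.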
+ """ + if "seqlen_agnostic_capture_inputs" not in kwargs: + # We get here only on Prefill/Eager mode runs + request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"] + finished_requests_ids = kwargs["finished_requests_ids"] + + self._release_finished_requests(finished_requests_ids) + state_indices = self._prepare_current_run_mamba_cache( + request_ids_to_seq_ids, finished_requests_ids) + + state_indices_tensor = torch.as_tensor(state_indices, + dtype=torch.int32, + device="cuda") + mamba_cache_tensors = self.mamba_cache + + else: + # CUDA graph capturing runs + (mamba_cache_tensors, + state_indices_tensor) = kwargs["seqlen_agnostic_capture_inputs"] + + return (mamba_cache_tensors, state_indices_tensor) + + def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): + """ + Copy the relevant state_indices into the CUDA graph input buffer + """ + assert all( + key in kwargs + for key in ["request_ids_to_seq_ids", "finished_requests_ids"]) + finished_requests_ids = kwargs["finished_requests_ids"] + request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"] + assert "seqlen_agnostic_capture_inputs" in input_buffers + _, input_state_indices_buffer = input_buffers[ + "seqlen_agnostic_capture_inputs"] + + self._release_finished_requests(finished_requests_ids) + state_indices = self._prepare_current_run_mamba_cache( + request_ids_to_seq_ids, + finished_requests_ids + ) + cuda_graph_pad_len = input_state_indices_buffer.shape[0] - len( + state_indices) + state_indices.extend([PAD_SLOT_ID] * cuda_graph_pad_len) + + input_state_indices_buffer.copy_( + torch.as_tensor(state_indices, dtype=torch.int32, device="cuda")) + + + def get_seqlen_agnostic_capture_inputs(self, batch_size: int): + """ + Provide the CUDA graph capture runs with a buffer in adjusted size. + The buffer is used to maintain the Mamba Cache during the CUDA graph + replay runs. + """ + state_indices_tensor = torch.as_tensor([PAD_SLOT_ID] * batch_size, + dtype=torch.int32, + device="cuda") + return (self.mamba_cache, state_indices_tensor) + + def _copy_mamba_cache(self, from_index: int, to_index: int): + assert len(self.mamba_cache) > 0 + for cache_t in self.mamba_cache: + cache_t[:, to_index].copy_(cache_t[:, from_index], + non_blocking=True) + + def _assign_seq_id_to_cache_index(self, cur_rid: str, seq_id: int, + finished_requests_ids) -> int: + """ + Assign (req_id,seq_id) pair to a `destination_index` index, if + already occupied, move the occupying index to a free index. 
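+        Returns PAD_SLOT_ID for requests that are already finished, so no
+        cache slot is allocated for them.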
+        """
+        if cur_rid in finished_requests_ids:
+            # set as pad, do not allocate destination index
+            return PAD_SLOT_ID
+        elif cur_rid not in self.mamba_cache_indices_mapping:
+            destination_index = self.free_cache_indices.pop()
+            self.mamba_cache_indices_mapping[cur_rid] = {
+                seq_id: destination_index
+            }
+            return destination_index
+        elif seq_id not in (seq_ids2indices :=
+                            self.mamba_cache_indices_mapping[cur_rid]):
+            # parallel sampling, where n > 1: prefill is assumed to have
+            # already happened, so we copy the existing cache
+            # into the sibling seq_ids' caches
+            index_exists = next(iter(seq_ids2indices.values()))
+            # case of decoding with n > 1: copy prefill cache to the
+            # decoding index
+            destination_index = self.free_cache_indices.pop()
+            self._copy_mamba_cache(from_index=index_exists,
+                                   to_index=destination_index)
+            self.mamba_cache_indices_mapping[cur_rid][
+                seq_id] = destination_index
+            return destination_index
+        else:
+            # already exists
+            return self.mamba_cache_indices_mapping[cur_rid][seq_id]
+
+    def _prepare_current_run_mamba_cache(
+            self, request_ids_to_seq_ids: Dict[str, list[int]],
+            finished_requests_ids: List[str]) -> List[int]:
+        return [
+            self._assign_seq_id_to_cache_index(req_id, seq_id,
+                                               finished_requests_ids)
+            for req_id, seq_ids in request_ids_to_seq_ids.items()
+            for seq_id in seq_ids
+        ]
+
+    def _get_all_occupied_indices(self):
+        return [
+            cache_idx
+            for seq_ids2indices in self.mamba_cache_indices_mapping.values()
+            for cache_idx in seq_ids2indices.values()
+        ]
+
+    def _release_finished_requests(self,
+                                   finished_seq_groups_req_ids: List[str]):
+        for req_id in finished_seq_groups_req_ids:
+            if req_id in self.mamba_cache_indices_mapping:
+                for seq_id in self.mamba_cache_indices_mapping[req_id]:
+                    self.free_cache_indices.append(
+                        self.mamba_cache_indices_mapping[req_id][seq_id])
+                self.mamba_cache_indices_mapping.pop(req_id)
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 6bba1594c270f..41c2877194bb2 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -474,17 +474,18 @@ def __init__(
         unpadded_vocab_size = config.vocab_size
         if lora_config:
             unpadded_vocab_size += lora_config.lora_extra_vocab_size
-        if not self.config.tie_word_embeddings:
-            self.lm_head = ParallelLMHead(
-                unpadded_vocab_size,
-                config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE
-                # We need bigger padding if using lora for kernel
-                # compatibility
-                if not lora_config else lora_config.lora_vocab_padding_size,
-                quant_config=quant_config,
-            )
+        self.lm_head = ParallelLMHead(
+            unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE
+            # We need bigger padding if using lora for kernel
+            # compatibility
+            if not lora_config else lora_config.lora_vocab_padding_size,
+            quant_config=quant_config,
+        )
+        if config.tie_word_embeddings:
+            self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
         self.scale_width = self.config.hidden_size / self.config.dim_model_base
         self.logits_processor = LogitsProcessor(unpadded_vocab_size,
@@ -517,11 +518,7 @@ def compute_logits(
         sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
         hidden_states = hidden_states / self.scale_width
-        if self.config.tie_word_embeddings:
-            lm_head = self.model.embed_tokens
-        else:
-            lm_head = self.lm_head
-        logits = self.logits_processor(lm_head, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
sampling_metadata) return logits diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index c37bc5ad7c38f..3b5fd95328d74 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -216,6 +216,28 @@ def _init_layers( class MiniCPM3ForCausalLM(MiniCPMForCausalLM): + packed_modules_mapping = { + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "kv_a_proj_with_mqa", + "q_a_proj", + "q_b_proj", + "kv_b_proj", + "o_proj", + "gate_up_proj", + "down_proj", + "embed_tokens", + "lm_head", + ] + + # `embedding_modules` and `embedding_padding_modules` + # are inherited from MiniCPMForCausalLM def _init_model(self): self.model = MiniCPM3Model(config=self.config, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f1d484521acb9..8caaab9974666 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -3,8 +3,10 @@ import subprocess import sys import tempfile -from functools import lru_cache, partial -from typing import Callable, Dict, List, Optional, Tuple, Type, Union +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from functools import lru_cache +from typing import Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union import cloudpickle import torch.nn as nn @@ -12,7 +14,8 @@ from vllm.logger import init_logger from vllm.utils import is_hip -from .interfaces import supports_multimodal, supports_pp +from .interfaces import (has_inner_state, is_attention_free, + supports_multimodal, supports_pp) from .interfaces_base import is_embedding_model, is_text_generation_model logger = init_logger(__name__) @@ -26,8 +29,7 @@ "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), - "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), - "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), + # ChatGLMModel supports multimodal "CohereForCausalLM": ("commandr", "CohereForCausalLM"), "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), @@ -50,6 +52,7 @@ "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), # For decapoda-research/llama-* "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), + "MambaForCausalLM": ("mamba", "MambaForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), @@ -68,6 +71,7 @@ "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"), "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"), + # QWenLMHeadModel supports multimodal "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), "RWForCausalLM": ("falcon", "FalconForCausalLM"), @@ -91,6 +95,8 @@ # [Decoder-only] "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 + "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), + "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "InternVLChatModel": ("internvl", "InternVLChatModel"), 
"LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), @@ -116,18 +122,13 @@ } # yapf: enable -_MODELS = { +_VLLM_MODELS = { **_TEXT_GENERATION_MODELS, **_EMBEDDING_MODELS, **_MULTIMODAL_MODELS, **_SPECULATIVE_DECODING_MODELS, } -# Architecture -> type or (module, class). -# out of tree models -_OOT_MODELS: Dict[str, Type[nn.Module]] = {} -_OOT_MODELS_LAZY: Dict[str, Tuple[str, str]] = {} - # Models not supported by ROCm. _ROCM_UNSUPPORTED_MODELS: List[str] = [] @@ -154,79 +155,129 @@ } -class ModelRegistry: +@dataclass(frozen=True) +class _ModelInfo: + is_text_generation_model: bool + is_embedding_model: bool + supports_multimodal: bool + supports_pp: bool + has_inner_state: bool + is_attention_free: bool @staticmethod - def _get_module_cls_name(model_arch: str) -> Tuple[str, str]: - if model_arch in _MODELS: - module_relname, cls_name = _MODELS[model_arch] - return f"vllm.model_executor.models.{module_relname}", cls_name + def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": + return _ModelInfo( + is_text_generation_model=is_text_generation_model(model), + is_embedding_model=is_embedding_model(model), + supports_multimodal=supports_multimodal(model), + supports_pp=supports_pp(model), + has_inner_state=has_inner_state(model), + is_attention_free=is_attention_free(model), + ) - if model_arch in _OOT_MODELS_LAZY: - return _OOT_MODELS_LAZY[model_arch] - raise KeyError(model_arch) +class _BaseRegisteredModel(ABC): - @staticmethod - @lru_cache(maxsize=128) - def _try_get_model_stateful(model_arch: str) -> Optional[Type[nn.Module]]: - try: - mod_name, cls_name = ModelRegistry._get_module_cls_name(model_arch) - except KeyError: - return None + @abstractmethod + def inspect_model_cls(self) -> _ModelInfo: + raise NotImplementedError - module = importlib.import_module(mod_name) - return getattr(module, cls_name, None) + @abstractmethod + def load_model_cls(self) -> Type[nn.Module]: + raise NotImplementedError - @staticmethod - def _try_get_model_stateless(model_arch: str) -> Optional[Type[nn.Module]]: - if model_arch in _OOT_MODELS: - return _OOT_MODELS[model_arch] - - if is_hip(): - if model_arch in _ROCM_UNSUPPORTED_MODELS: - raise ValueError( - f"Model architecture {model_arch} is not supported by " - "ROCm for now.") - if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: - logger.warning( - "Model architecture %s is partially supported by ROCm: %s", - model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]) - return None +@dataclass(frozen=True) +class _RegisteredModel(_BaseRegisteredModel): + """ + Represents a model that has already been imported in the main process. + """ + + interfaces: _ModelInfo + model_cls: Type[nn.Module] @staticmethod - def _try_load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]: - model = ModelRegistry._try_get_model_stateless(model_arch) - if model is not None: - return model + def from_model_cls(model_cls: Type[nn.Module]): + return _RegisteredModel( + interfaces=_ModelInfo.from_model_cls(model_cls), + model_cls=model_cls, + ) + + def inspect_model_cls(self) -> _ModelInfo: + return self.interfaces + + def load_model_cls(self) -> Type[nn.Module]: + return self.model_cls + + +@dataclass(frozen=True) +class _LazyRegisteredModel(_BaseRegisteredModel): + """ + Represents a model that has not been imported in the main process. 
+ """ + module_name: str + class_name: str + + # Performed in another process to avoid initializing CUDA + def inspect_model_cls(self) -> _ModelInfo: + return _run_in_subprocess( + lambda: _ModelInfo.from_model_cls(self.load_model_cls())) + + def load_model_cls(self) -> Type[nn.Module]: + mod = importlib.import_module(self.module_name) + return getattr(mod, self.class_name) + + +@lru_cache(maxsize=128) +def _try_load_model_cls( + model_arch: str, + model: _BaseRegisteredModel, +) -> Optional[Type[nn.Module]]: + if is_hip(): + if model_arch in _ROCM_UNSUPPORTED_MODELS: + raise ValueError(f"Model architecture '{model_arch}' is not " + "supported by ROCm for now.") + + if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: + msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch] + logger.warning( + "Model architecture '%s' is partially " + "supported by ROCm: %s", model_arch, msg) + + try: + return model.load_model_cls() + except Exception: + logger.exception("Error in loading model architecture '%s'", + model_arch) + return None - return ModelRegistry._try_get_model_stateful(model_arch) - @staticmethod - def resolve_model_cls( - architectures: Union[str, List[str]], ) -> Tuple[Type[nn.Module], str]: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - logger.warning("No model architectures are specified") +@lru_cache(maxsize=128) +def _try_inspect_model_cls( + model_arch: str, + model: _BaseRegisteredModel, +) -> Optional[_ModelInfo]: + try: + return model.inspect_model_cls() + except Exception: + logger.exception("Error in inspecting model architecture '%s'", + model_arch) + return None - for arch in architectures: - model_cls = ModelRegistry._try_load_model_cls(arch) - if model_cls is not None: - return (model_cls, arch) - raise ValueError( - f"Model architectures {architectures} are not supported for now. " - f"Supported architectures: {ModelRegistry.get_supported_archs()}") +@dataclass +class _ModelRegistry: + # Keyed by model_arch + models: Dict[str, _BaseRegisteredModel] = field(default_factory=dict) - @staticmethod - def get_supported_archs() -> List[str]: - return list(_MODELS.keys()) + list(_OOT_MODELS.keys()) + def get_supported_archs(self) -> List[str]: + return list(self.models.keys()) - @staticmethod - def register_model(model_arch: str, model_cls: Union[Type[nn.Module], - str]): + def register_model( + self, + model_arch: str, + model_cls: Union[Type[nn.Module], str], + ) -> None: """ Register an external model to be used in vLLM. @@ -238,7 +289,7 @@ def register_model(model_arch: str, model_cls: Union[Type[nn.Module], when importing the model and thus the related error :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`. 
""" - if model_arch in _MODELS: + if model_arch in self.models: logger.warning( "Model architecture %s is already registered, and will be " "overwritten by the new model class %s.", model_arch, @@ -250,120 +301,149 @@ def register_model(model_arch: str, model_cls: Union[Type[nn.Module], msg = "Expected a string in the format `:`" raise ValueError(msg) - module_name, cls_name = split_str - _OOT_MODELS_LAZY[model_arch] = module_name, cls_name + model = _LazyRegisteredModel(*split_str) else: - _OOT_MODELS[model_arch] = model_cls + model = _RegisteredModel.from_model_cls(model_cls) - @staticmethod - @lru_cache(maxsize=128) - def _check_stateless( - func: Callable[[Type[nn.Module]], bool], - model_arch: str, - *, - default: Optional[bool] = None, - ) -> bool: - """ - Run a boolean function against a model and return the result. + self.models[model_arch] = model - If the model is not found, returns the provided default value. + def _raise_for_unsupported(self, architectures: List[str]): + all_supported_archs = self.get_supported_archs() - If the model is not already imported, the function is run inside a - subprocess to avoid initializing CUDA for the main program. - """ - model = ModelRegistry._try_get_model_stateless(model_arch) - if model is not None: - return func(model) + raise ValueError( + f"Model architectures {architectures} are not supported for now. " + f"Supported architectures: {all_supported_archs}") - try: - mod_name, cls_name = ModelRegistry._get_module_cls_name(model_arch) - except KeyError: - if default is not None: - return default - - raise - - with tempfile.NamedTemporaryFile() as output_file: - # `cloudpickle` allows pickling lambda functions directly - input_bytes = cloudpickle.dumps( - (mod_name, cls_name, func, output_file.name)) - # cannot use `sys.executable __file__` here because the script - # contains relative imports - returned = subprocess.run( - [sys.executable, "-m", "vllm.model_executor.models.registry"], - input=input_bytes, - capture_output=True) - - # check if the subprocess is successful - try: - returned.check_returncode() - except Exception as e: - # wrap raised exception to provide more information - raise RuntimeError(f"Error happened when testing " - f"model support for{mod_name}.{cls_name}:\n" - f"{returned.stderr.decode()}") from e - with open(output_file.name, "rb") as f: - result = pickle.load(f) - return result + def _try_load_model_cls(self, + model_arch: str) -> Optional[Type[nn.Module]]: + if model_arch not in self.models: + return None - @staticmethod - def is_text_generation_model(architectures: Union[str, List[str]]) -> bool: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - logger.warning("No model architectures are specified") + return _try_load_model_cls(model_arch, self.models[model_arch]) - is_txt_gen = partial(ModelRegistry._check_stateless, - is_text_generation_model, - default=False) + def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]: + if model_arch not in self.models: + return None - return any(is_txt_gen(arch) for arch in architectures) + return _try_inspect_model_cls(model_arch, self.models[model_arch]) - @staticmethod - def is_embedding_model(architectures: Union[str, List[str]]) -> bool: + def _normalize_archs( + self, + architectures: Union[str, List[str]], + ) -> List[str]: if isinstance(architectures, str): architectures = [architectures] if not architectures: logger.warning("No model architectures are specified") - is_emb = 
partial(ModelRegistry._check_stateless, - is_embedding_model, - default=False) + return architectures - return any(is_emb(arch) for arch in architectures) + def inspect_model_cls( + self, + architectures: Union[str, List[str]], + ) -> _ModelInfo: + architectures = self._normalize_archs(architectures) - @staticmethod - def is_multimodal_model(architectures: Union[str, List[str]]) -> bool: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - logger.warning("No model architectures are specified") + for arch in architectures: + model_info = self._try_inspect_model_cls(arch) + if model_info is not None: + return model_info - is_mm = partial(ModelRegistry._check_stateless, - supports_multimodal, - default=False) + return self._raise_for_unsupported(architectures) - return any(is_mm(arch) for arch in architectures) + def resolve_model_cls( + self, + architectures: Union[str, List[str]], + ) -> Tuple[Type[nn.Module], str]: + architectures = self._normalize_archs(architectures) - @staticmethod - def is_pp_supported_model(architectures: Union[str, List[str]]) -> bool: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - logger.warning("No model architectures are specified") + for arch in architectures: + model_cls = self._try_load_model_cls(arch) + if model_cls is not None: + return (model_cls, arch) - is_pp = partial(ModelRegistry._check_stateless, - supports_pp, - default=False) + return self._raise_for_unsupported(architectures) - return any(is_pp(arch) for arch in architectures) + def is_text_generation_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + return self.inspect_model_cls(architectures).is_text_generation_model + def is_embedding_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + return self.inspect_model_cls(architectures).is_embedding_model + + def is_multimodal_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + return self.inspect_model_cls(architectures).supports_multimodal + + def is_pp_supported_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + return self.inspect_model_cls(architectures).supports_pp + + def model_has_inner_state(self, architectures: Union[str, + List[str]]) -> bool: + return self.inspect_model_cls(architectures).has_inner_state + + def is_attention_free_model(self, architectures: Union[str, + List[str]]) -> bool: + return self.inspect_model_cls(architectures).is_attention_free + + +ModelRegistry = _ModelRegistry({ + model_arch: _LazyRegisteredModel( + module_name=f"vllm.model_executor.models.{mod_relname}", + class_name=cls_name, + ) + for model_arch, (mod_relname, cls_name) in _VLLM_MODELS.items() +}) + +_T = TypeVar("_T") + + +def _run_in_subprocess(fn: Callable[[], _T]) -> _T: + with tempfile.NamedTemporaryFile() as output_file: + # `cloudpickle` allows pickling lambda functions directly + input_bytes = cloudpickle.dumps((fn, output_file.name)) + + # cannot use `sys.executable __file__` here because the script + # contains relative imports + returned = subprocess.run( + [sys.executable, "-m", "vllm.model_executor.models.registry"], + input=input_bytes, + capture_output=True) + + # check if the subprocess is successful + try: + returned.check_returncode() + except Exception as e: + # wrap raised exception to provide more information + raise RuntimeError(f"Error raised in subprocess:\n" + f"{returned.stderr.decode()}") from e + + with open(output_file.name, "rb") as f: + return pickle.load(f) + + 
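+# Entry point for the helper subprocess spawned by `_run_in_subprocess`:
+# it reads the pickled callable from stdin, executes it, and pickles the
+# result into the temporary file chosen by the parent process.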
+def _run() -> None: + # Setup plugins + from vllm.plugins import load_general_plugins + load_general_plugins() + + fn, output_file = pickle.loads(sys.stdin.buffer.read()) + + result = fn() -if __name__ == "__main__": - (mod_name, cls_name, func, - output_file) = pickle.loads(sys.stdin.buffer.read()) - mod = importlib.import_module(mod_name) - klass = getattr(mod, cls_name) - result = func(klass) with open(output_file, "wb") as f: f.write(pickle.dumps(result)) + + +if __name__ == "__main__": + _run() diff --git a/vllm/outputs.py b/vllm/outputs.py index 4f29226aa5128..07650241cb638 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -141,7 +141,7 @@ def from_seq_group(cls, seq_group: SequenceGroup, top_n_seqs = seqs else: # Get the top-n sequences. - n = sampling_params.n + n = sampling_params._real_n or sampling_params.n sorting_key = lambda seq: seq.get_cumulative_logprob() sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) top_n_seqs = sorted_seqs[:n] diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index a35777f91cac9..8ba973b28263f 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -1,7 +1,21 @@ +import os + import torch +import vllm.envs as envs +from vllm.compilation.levels import CompilationLevel +from vllm.plugins import set_torch_compile_backend + from .interface import Platform, PlatformEnum +if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ: + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.DYNAMO_ONCE) + +assert envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.INDUCTOR,\ + "TPU does not support Inductor." + +set_torch_compile_backend("openxla") + class TpuPlatform(Platform): _enum = PlatformEnum.TPU diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 7939688ef0da3..211fedbc6e2ec 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,5 +1,5 @@ import logging -from typing import Callable, Optional, Union +from typing import Callable, Dict, Optional, Union import vllm.envs as envs @@ -42,3 +42,15 @@ def set_torch_compile_backend(backend: Union[Callable, str]): def get_torch_compile_backend() -> Optional[Union[Callable, str]]: return _torch_compile_backend + + +_inductor_additional_configs: Dict = {} + + +def set_inductor_additional_configs(configs: Dict): + global _inductor_additional_configs + _inductor_additional_configs = configs + + +def get_inductor_additional_configs() -> Dict: + return _inductor_additional_configs diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 95345df43b57d..4f2ae75e65f3a 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -106,9 +106,8 @@ class SamplingParams( n: Number of output sequences to return for the given prompt. best_of: Number of output sequences that are generated from the prompt. From these `best_of` sequences, the top `n` sequences are returned. - `best_of` must be greater than or equal to `n`. This is treated as - the beam width when `use_beam_search` is True. By default, `best_of` - is set to `n`. + `best_of` must be greater than or equal to `n`. By default, + `best_of` is set to `n`. presence_penalty: Float that penalizes new tokens based on whether they appear in the generated text so far. 
Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat @@ -173,6 +172,7 @@ class SamplingParams( n: int = 1 best_of: Optional[int] = None + _real_n: Optional[int] = None presence_penalty: float = 0.0 frequency_penalty: float = 0.0 repetition_penalty: float = 1.0 @@ -282,7 +282,19 @@ def from_optional( ) def __post_init__(self) -> None: - self.best_of = self.best_of or self.n + # how we deal with `best_of``: + # if `best_of`` is not set, we default to `n`; + # if `best_of`` is set, we set `n`` to `best_of`, + # and set `_real_n`` to the original `n`. + # when we return the result, we will check + # if we need to return `n` or `_real_n` results + if self.best_of: + if self.best_of < self.n: + raise ValueError( + f"best_of must be greater than or equal to n, " + f"got n={self.n} and best_of={self.best_of}.") + self._real_n = self.n + self.n = self.best_of if 0 < self.temperature < _MAX_TEMP: logger.warning( "temperature %s is less than %s, which may cause numerical " @@ -329,12 +341,6 @@ def _verify_args(self) -> None: f"type {type(self.n)}") if self.n < 1: raise ValueError(f"n must be at least 1, got {self.n}.") - if not isinstance(self.best_of, int): - raise ValueError(f"best_of must be an int, but is of " - f"type {type(self.best_of)}") - if self.best_of < self.n: - raise ValueError(f"best_of must be greater than or equal to n, " - f"got n={self.n} and best_of={self.best_of}.") if not -2.0 <= self.presence_penalty <= 2.0: raise ValueError("presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}.") @@ -385,7 +391,7 @@ def _verify_args(self) -> None: raise ValueError( "stop strings are only supported when detokenize is True. " "Set detokenize=True to use stop.") - if self.best_of != self.n and self.output_kind == ( + if self.best_of != self._real_n and self.output_kind == ( RequestOutputKind.DELTA): raise ValueError("best_of must equal n to use output_kind=DELTA") @@ -393,10 +399,6 @@ def _verify_greedy_sampling(self) -> None: if self.n > 1: raise ValueError("n must be 1 when using greedy sampling, " f"got {self.n}.") - assert isinstance(self.best_of, int) - if self.best_of > 1: - raise ValueError("best_of must be 1 when using greedy sampling, " - f"got {self.best_of}.") def update_from_generation_config( self, @@ -453,7 +455,6 @@ def clone(self) -> "SamplingParams": def __repr__(self) -> str: return ( f"SamplingParams(n={self.n}, " - f"best_of={self.best_of}, " f"presence_penalty={self.presence_penalty}, " f"frequency_penalty={self.frequency_penalty}, " f"repetition_penalty={self.repetition_penalty}, " diff --git a/vllm/sequence.py b/vllm/sequence.py index 0c27ffca36cfd..3bb35ea955c8c 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -803,14 +803,14 @@ def get_max_num_running_seqs(self) -> int: """The maximum number of sequences running in parallel in the remaining lifetime of the request.""" if self.sampling_params: - best_of = self.sampling_params.best_of - assert isinstance(best_of, int) - if best_of > self.num_seqs(): + n = self.sampling_params.n + assert isinstance(n, int) + if n > self.num_seqs(): # At prompt stage, the sequence group is not yet filled up # and only have one sequence running. However, in the - # generation stage, we will have `best_of` sequences + # generation stage, we will have `n` sequences # running. - return best_of + return n # At sampling stages, return the number of actual sequences # that are not finished yet. 
return self.num_unfinished_seqs() @@ -1137,10 +1137,9 @@ def __eq__(self, other: object) -> bool: return self.embeddings == other.embeddings -class IntermediateTensors( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] +# cannot use msgspec.Struct here because Dynamo does not support it +@dataclass +class IntermediateTensors: """For all pipeline stages except the last, we need to return the hidden states and residuals to be sent to the next stage. This data structure contains the hidden states and residuals for a request. diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py index 59f2a4191a8b2..f35a8a0ab8be3 100644 --- a/vllm/spec_decode/mqa_scorer.py +++ b/vllm/spec_decode/mqa_scorer.py @@ -18,6 +18,7 @@ def score_proposals( target_seq_id_start = max( get_all_seq_ids(execute_model_req.seq_group_metadata_list)) + 1 all_proposal_tokens = proposals.proposal_token_ids.tolist() + all_proposal_lengths = proposals.proposal_lens.tolist() for i, seq_group_metadata in enumerate( execute_model_req.seq_group_metadata_list): seq_data_dict = seq_group_metadata.seq_data @@ -27,7 +28,8 @@ def score_proposals( seq_data: SequenceData = seq_data_dict[seq_id] prompt_token_ids = seq_data.get_prompt_token_ids() output_token_ids = seq_data.get_output_token_ids() - proposal_token_ids = all_proposal_tokens[i] + proposal_token_ids = all_proposal_tokens[ + i][:all_proposal_lengths[i]] new_output_token_ids = [*output_token_ids, *proposal_token_ids] target_seq_id = target_seq_id_start + i @@ -62,18 +64,42 @@ def score_proposals( target_sampler_output = target_sampler_output[0] - bs, k = proposals.proposal_token_ids.shape - all_tokens = target_sampler_output.sampled_token_ids.reshape(bs, k + 1) - - all_probs = target_sampler_output.sampled_token_probs.reshape( - bs, k + 1, self._vocab_size) - all_logprobs = target_sampler_output.logprobs.reshape( - bs, k + 1, self._vocab_size) + k = execute_model_req.num_lookahead_slots + bs = len(execute_model_req.seq_group_metadata_list) + target_token_ids = target_sampler_output.sampled_token_ids + target_probs = target_sampler_output.sampled_token_probs + target_logprobs = target_sampler_output.logprobs + # If all requests have the same number of query tokens, we can avoid + # the for loop to build output for better performance. 
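+        # Otherwise the target outputs are ragged across sequences, so each
+        # sequence's slice is copied into padded tensors below (-1 token ids,
+        # zero probs and -inf logprobs are used as filler).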
+ if min(all_proposal_lengths) == k: + bs, _ = proposals.proposal_token_ids.shape + all_tokens = target_token_ids.reshape(bs, k + 1) + all_probs = target_probs.reshape(bs, k + 1, self._vocab_size) + all_logprobs = target_logprobs.reshape(bs, k + 1, self._vocab_size) + else: + all_tokens = target_token_ids.new_full(size=(bs, k + 1), + fill_value=-1) + all_probs = target_probs.new_zeros(*all_tokens.shape, + self._vocab_size) + all_logprobs = target_logprobs.new_full(size=all_probs.shape, + fill_value=-float("inf")) + target_token_ids = target_token_ids.flatten() + start_loc = 0 + for i, proposed_len in enumerate(all_proposal_lengths): + output_len = proposed_len + 1 + end_loc = start_loc + output_len + all_tokens[ + i, :output_len] = target_token_ids[start_loc:end_loc] + all_probs[i, :output_len] = target_probs[start_loc:end_loc] + all_logprobs[ + i, :output_len] = target_logprobs[start_loc:end_loc] + start_loc = end_loc hidden_states = None if target_sampler_output.hidden_states is not None: hidden_states = target_sampler_output.hidden_states.reshape( bs, (k + 1), -1) + return SpeculativeScores(probs=all_probs, token_ids=all_tokens, logprobs=all_logprobs, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a67715290a515..50d2767a03752 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -87,6 +87,8 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": return spec_decode_worker +# Reminder: Please update docs/source/serving/compatibility_matrix.rst +# If the feature combo become valid class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. @@ -188,12 +190,6 @@ def create_worker( "[Speculative Decoding] Disabling MQA scorer as the " "MQA is only available with flash attn backend.") - if ngram_prompt_lookup_max > 0: - disable_mqa_scorer = True - logger.info( - "[Speculative Decoding] Disabling MQA scorer as the " - "NGramWorker does not support MQA scorer.") - if "model_config" in draft_worker_kwargs and \ draft_worker_kwargs["model_config"].max_model_len < \ scorer_worker.model_config.max_model_len: diff --git a/vllm/tracing.py b/vllm/tracing.py index 31849e2b635aa..50068d8cf9c25 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -96,7 +96,6 @@ class SpanAttributes(BaseSpanAttributes): # The following span attribute names are added here because they are missing # from the Semantic Conventions for LLM. 
LLM_REQUEST_ID = "gen_ai.request.id" - LLM_REQUEST_BEST_OF = "gen_ai.request.best_of" LLM_REQUEST_N = "gen_ai.request.n" LLM_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences" LLM_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue" diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 85c339df4a76c..94af2388d79db 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -59,6 +59,26 @@ def __len__(self): return tokenizer +def patch_padding_side(tokenizer: PreTrainedTokenizer) -> None: + """Patch _pad method to accept `padding_side` for older tokenizers.""" + orig_pad = tokenizer._pad + + def _pad( + self: PreTrainedTokenizer, + *args, + padding_side: Optional[str] = None, + **kwargs, + ): + if padding_side is not None and padding_side != self.padding_side: + msg = ("`padding_side` argument is not supported by " + f"{type(tokenizer).__name__} and will be ignored.") + warnings.warn(msg, stacklevel=2) + + return orig_pad(*args, **kwargs) + + tokenizer._pad = MethodType(_pad, tokenizer) + + def get_tokenizer( tokenizer_name: Union[str, Path], *args, @@ -143,24 +163,7 @@ def get_tokenizer( if type(tokenizer).__name__ in ("ChatGLMTokenizer", "ChatGLM4Tokenizer"): assert isinstance(tokenizer, PreTrainedTokenizer) - orig_pad = tokenizer._pad - - # Patch _pad method to accept `padding_side` - def _pad( - self: PreTrainedTokenizer, - *args, - padding_side: Optional[str] = None, - **kwargs, - ): - if (padding_side is not None - and padding_side != self.padding_side): - msg = ("`padding_side` argument is not supported by " - "ChatGLMTokenizer and will be ignored.") - warnings.warn(msg, stacklevel=2) - - return orig_pad(*args, **kwargs) - - tokenizer._pad = MethodType(_pad, tokenizer) + patch_padding_side(tokenizer) if not isinstance(tokenizer, PreTrainedTokenizerFast): logger.warning( diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 788133059f12d..aae10d3ee25fd 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -30,12 +30,12 @@ def find_tokenizer_file(files: List[str]): matched_files = [file for file in files if file_pattern.match(file)] if len(matched_files) > 1: raise OSError(f"Found {len(matched_files)} files matching the " - "pattern: {matched_files}. Make sure only one Mistral " - "tokenizer is present in {tokenizer_name}.") + f"pattern: {file_pattern}. Make sure only one Mistral " + f"tokenizer is present in {files}.") elif len(matched_files) == 0: raise OSError(f"Found {len(matched_files)} files matching the " - "pattern: {matched_files}. Make sure that a Mistral " - "tokenizer is present in {tokenizer_name}.") + f"pattern: {file_pattern}. Make sure that a Mistral " + f"tokenizer is present in {files}.") return matched_files[0] diff --git a/vllm/utils.py b/vllm/utils.py index 314fec0a65c7b..8debae52b288c 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -41,6 +41,9 @@ # Exception strings for non-implemented encoder/decoder scenarios +# Reminder: Please update docs/source/serving/compatibility_matrix.rst +# If the feature combo become valid + STR_NOT_IMPL_ENC_DEC_SWA = \ "Sliding window attention for encoder/decoder models " + \ "is not currently supported." 
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 252440c7b7e08..090f95e6e892c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -52,15 +52,12 @@ def __init__( self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] # Get attention backend. - self.attn_backend = get_attn_backend( - model_config.get_num_attention_heads(parallel_config), - self.head_size, - self.num_kv_heads, - model_config.get_sliding_window(), - model_config.dtype, - cache_config.cache_dtype, - self.block_size, - ) + self.attn_backend = get_attn_backend(self.head_size, + model_config.get_sliding_window(), + model_config.dtype, + cache_config.cache_dtype, + self.block_size, + model_config.is_attention_free) # Initialize the cache. self.gpu_cache = self._allocate_kv_cache( diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index f67b086796411..795511aea6754 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -418,13 +418,12 @@ def __init__( self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, + self.model_config.is_attention_free, ) # Multi-modal data support diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index d6e3670e304d5..b84562851f0f8 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -56,13 +56,12 @@ def __init__(self, cache_config: CacheConfig, model_config: ModelConfig, # Get attention backend. self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, cache_config.cache_dtype, self.block_size, + self.model_config.is_attention_free, ) # Initialize the cache. 
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 59b4b8c4ddf38..6a00444f5098b 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -196,7 +196,7 @@ def execute_model( seqlen_agnostic_kwargs = { "finished_requests_ids": model_input.finished_requests_ids, "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_seqlen_agnostic else {} + } if self.has_inner_state else {} multi_modal_kwargs = model_input.multi_modal_kwargs or {} with set_forward_context(model_input.attn_metadata): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0bd2958816718..9db3261b8ac36 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -17,7 +17,8 @@ import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState -from vllm.attention.backends.utils import CommonAttentionState +from vllm.compilation.compile_context import set_compile_context +from vllm.compilation.levels import CompilationLevel from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig) @@ -989,8 +990,7 @@ def __init__( self.graph_memory_pool: Optional[Tuple[ int, int]] = None # Set during graph capture. - self.has_seqlen_agnostic = model_config.contains_seqlen_agnostic_layers( - parallel_config) + self.has_inner_state = model_config.has_inner_state # When using CUDA graph, the input block tables must be padded to # max_seq_len_to_capture. However, creating the block table in @@ -1001,22 +1001,16 @@ def __init__( self.graph_block_tables = np.zeros( (self.max_batchsize_to_capture, self.get_max_block_per_batch()), dtype=np.int32) - num_attn_heads = self.model_config.get_num_attention_heads( - self.parallel_config) self.attn_backend = get_attn_backend( - num_attn_heads, self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, - ) if num_attn_heads else None - if self.attn_backend: - self.attn_state = self.attn_backend.get_state_cls()( - weakref.proxy(self)) - else: - self.attn_state = CommonAttentionState(weakref.proxy(self)) + self.model_config.is_attention_free, + ) + self.attn_state = self.attn_backend.get_state_cls()( + weakref.proxy(self)) # Multi-modal data support self.input_registry = input_registry @@ -1126,10 +1120,10 @@ def load_model(self) -> None: "provided. Defaulting to scaling factors of 1.0. 
" "This may lead to less accurate results!") - if envs.VLLM_TEST_DYNAMO_GRAPH_CAPTURE and supports_dynamo(): - from vllm.compilation.backends import vllm_backend + if envs.VLLM_TORCH_COMPILE_LEVEL == CompilationLevel.DYNAMO_AS_IS \ + and supports_dynamo(): from vllm.plugins import get_torch_compile_backend - backend = get_torch_compile_backend() or vllm_backend + backend = get_torch_compile_backend() or "eager" self.model = torch.compile( self.model, fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, @@ -1289,7 +1283,15 @@ def profile_run(self) -> None: batch_size=batch_size, dtype=self.model_config.dtype, device=self.device) - self.execute_model(model_input, kv_caches, intermediate_tensors) + + graph_batch_size = self.max_batchsize_to_capture + batch_size_capture_list = [ + bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size + ] + if self.model_config.enforce_eager: + batch_size_capture_list = [] + with set_compile_context(batch_size_capture_list): + self.execute_model(model_input, kv_caches, intermediate_tensors) torch.cuda.synchronize() return @@ -1488,7 +1490,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: "previous_hidden_states"] = previous_hidden_states[: batch_size] - if self.has_seqlen_agnostic: + if self.has_inner_state: # Only used by Mamba-based models CUDA graph atm (Jamba) capture_inputs.update({ "seqlen_agnostic_capture_inputs": @@ -1637,7 +1639,7 @@ def execute_model( seqlen_agnostic_kwargs = { "finished_requests_ids": model_input.finished_requests_ids, "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_seqlen_agnostic else {} + } if self.has_inner_state else {} if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_start = torch.cuda.Event(enable_timing=True) @@ -1842,10 +1844,14 @@ def forward( # Copy the input tensors to the input buffers. 
self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) self.input_buffers["positions"].copy_(positions, non_blocking=True) - self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping, - non_blocking=True) + + if self.backend_name != "placeholder-attn": + self.input_buffers["slot_mapping"].copy_( + attn_metadata.slot_mapping, non_blocking=True) + self.attn_state.prepare_graph_input_buffers( self.input_buffers, attn_metadata, self._is_encoder_decoder_model) + if "seqlen_agnostic_capture_inputs" in self.input_buffers: self.model.copy_inputs_before_cuda_graphs(self.input_buffers, **kwargs) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 12aa473525c13..0cd0047bebf2d 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -816,6 +816,9 @@ def _pythonize_sampler_output( for sgdx, (seq_group, sample_result) in enumerate(zip(seq_groups, samples_list)): + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid + # (Check for Guided Decoding) if seq_group.sampling_params.logits_processors: assert len(seq_group.sampling_params.logits_processors) == 0, ( "Logits Processors are not supported in multi-step decoding") diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index de3088695dfef..760b18427e22b 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -74,13 +74,12 @@ def __init__( self.block_size = cache_config.block_size self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, + self.model_config.is_attention_free, ) # Multi-modal data support diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 6b818186779b6..24425fece850f 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -70,13 +70,12 @@ def __init__( # Get attention backend. self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.head_size, - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.cache_config.cache_dtype, self.block_size, + self.model_config.is_attention_free, ) # Initialize the cache. 
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 12e4215038d74..c13e95f60af58 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -49,7 +49,7 @@ class ModelInputForTPU(ModelRunnerInputBase): t: torch.Tensor p: torch.Tensor num_samples: int - best_of: List[int] + n: List[int] seq_groups: List[List[int]] is_first_multi_step: bool = True is_last_step: bool = True @@ -65,7 +65,7 @@ def as_broadcastable_tensor_dict( "t": self.t, "p": self.p, "num_samples": self.num_samples, - "best_of": self.best_of, + "n": self.n, "seq_groups": self.seq_groups, "is_first_multi_step": self.is_first_multi_step, "is_last_step": self.is_last_step, @@ -113,13 +113,12 @@ def __init__( (self.scheduler_config.max_num_seqs, self.max_num_blocks_per_seq), dtype=np.int32) self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.cache_config.cache_dtype, self.block_size, + self.model_config.is_attention_free, False, ) self.cached_step_outputs: List[torch.Tensor] = [] @@ -435,7 +434,7 @@ def _prepare_sample( assert len(seq_group_metadata_list) > 0 t = [] p = [] - best_of = [] + n = [] for seq_group_metadata in seq_group_metadata_list: sampling_params = seq_group_metadata.sampling_params t.append(sampling_params.temperature) @@ -448,11 +447,11 @@ def _prepare_sample( raise NotImplementedError( "Top-k sampling is currently disabled for the TPU backend " "due to performance issues.") - if sampling_params.best_of > _MAX_NUM_SAMPLES: + if sampling_params.n > _MAX_NUM_SAMPLES: raise NotImplementedError( f"Best of > {_MAX_NUM_SAMPLES} is not supported by the TPU " "backend.") - best_of.append(sampling_params.best_of) + n.append(sampling_params.n) if sampling_params.logprobs is not None: raise NotImplementedError( "logprobs is not currently supported by the TPU backend.") @@ -465,7 +464,7 @@ def _prepare_sample( num_seqs = len(seq_group_metadata.seq_data) t += [t[-1]] * (num_seqs - 1) p += [p[-1]] * (num_seqs - 1) - best_of += [best_of[-1]] * (num_seqs - 1) + n += [n[-1]] * (num_seqs - 1) num_paddings = padded_batch_size - len(t) t += [1.0] * num_paddings @@ -473,7 +472,7 @@ def _prepare_sample( t = torch.tensor(t, dtype=torch.float32, device="cpu") p = torch.tensor(p, dtype=torch.float32, device="cpu") - return t, p, best_of + return t, p, n def prepare_model_input( self, @@ -493,8 +492,8 @@ def prepare_model_input( inputs = self._prepare_decode(seq_group_metadata_list) input_tokens, input_positions, attn_metadata, input_lens = inputs padded_batch_size = input_tokens.shape[0] - t, p, best_of = self._prepare_sample(seq_group_metadata_list, - padded_batch_size) + t, p, n = self._prepare_sample(seq_group_metadata_list, + padded_batch_size) num_samples = _MAX_NUM_SAMPLES if is_prompt else 1 seq_groups = [ @@ -502,8 +501,7 @@ def prepare_model_input( for metadata in seq_group_metadata_list ] return ModelInputForTPU(input_tokens, input_positions, attn_metadata, - input_lens, t, p, num_samples, best_of, - seq_groups) + input_lens, t, p, num_samples, n, seq_groups) def make_model_input_from_broadcasted_tensor_dict( self, tensor_dict: Dict[str, Any]) -> ModelInputForTPU: @@ -543,7 +541,8 @@ def execute_model( seq_group_metadata_list=ctx.seq_group_metadata_list, scheduler_outputs=ctx.scheduler_outputs, is_async=False, - is_last_step=False) + is_last_step=False, + 
is_first_step_output=i == 0) model_input.async_callback() if use_async_out_proc: return [sampler_outputs[-1]] @@ -609,7 +608,7 @@ def execute_model( assert len(seq_ids) == 1 seq_id = seq_ids[0] seq_outputs = [] - for j in range(model_input.best_of[i]): + for j in range(model_input.n[i]): next_token_id = next_token_ids[i][j] seq_outputs.append( SequenceOutput(seq_id, next_token_id, diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index a07395dfc61d8..f43635464ef00 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -13,6 +13,9 @@ def assert_enc_dec_mr_supported_scenario( a supported scenario. ''' + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid + if enc_dec_mr.cache_config.enable_prefix_caching: raise NotImplementedError( STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE']) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 3851843afc960..ab61e4377f900 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -236,11 +236,15 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: "not properly cleaned up before initializing the vLLM instance.") cache_block_size = self.get_cache_block_size_bytes() - num_gpu_blocks = int( - (total_gpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) // cache_block_size) - num_cpu_blocks = int(self.cache_config.swap_space_bytes // - cache_block_size) + if cache_block_size == 0: + num_gpu_blocks = 0 + num_cpu_blocks = 0 + else: + num_gpu_blocks = int( + (total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // + cache_block_size) num_gpu_blocks = max(num_gpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) if self.model_runner.lora_manager: @@ -257,6 +261,7 @@ def initialize_cache(self, num_gpu_blocks: int, """ raise_if_cache_size_invalid(num_gpu_blocks, self.cache_config.block_size, + self.cache_config.is_attention_free, self.model_config.max_model_len) self.cache_config.num_gpu_blocks = num_gpu_blocks @@ -472,14 +477,18 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): "`dtype` flag in CLI, for example: --dtype=half.") -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free, max_model_len) -> None: - if num_gpu_blocks <= 0: + if is_attention_free and num_gpu_blocks != 0: + raise ValueError("No memory should be allocated for the cache blocks " + f"for an attention-free model, but {num_gpu_blocks}" + "blocks are allocated.") + if not is_attention_free and num_gpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. 
" "Try increasing `gpu_memory_utilization` when " "initializing the engine.") max_seq_len = block_size * num_gpu_blocks - if max_model_len > max_seq_len: + if not is_attention_free and max_model_len > max_seq_len: raise ValueError( f"The model's max seq len ({max_model_len}) " "is larger than the maximum number of tokens that can be " diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 612428180226a..20dceee849ae5 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -372,13 +372,12 @@ def __init__( self.block_size = cache_config.block_size self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, + self.model_config.is_attention_free, ) # Multi-modal data support