From 666ad0aa16f0c656e48e58b4f31ffe956b484d3b Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Thu, 22 Aug 2024 13:10:55 -0700 Subject: [PATCH 01/24] [ci] Cleanup & refactor Dockerfile to pass different Python versions and sccache bucket via build args (#7705) Signed-off-by: kevin --- Dockerfile | 61 +++++++++++++++++++++++------------------------------- 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/Dockerfile b/Dockerfile index c13cb5c7e7a95..36fcc2f83e9fb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,28 +9,23 @@ ARG CUDA_VERSION=12.4.1 #################### BASE BUILD IMAGE #################### # prepare basic build environment FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base - ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.10 - ENV DEBIAN_FRONTEND=noninteractive +# Install Python and other dependencies RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get update -y \ - && apt-get install -y ccache software-properties-common \ + && apt-get install -y ccache software-properties-common git curl sudo \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ - && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \ - && python3 --version - -RUN apt-get update -y \ - && apt-get install -y git curl sudo - -# Install pip s.t. it will be compatible with our PYTHON_VERSION -RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} -RUN python3 -m pip --version + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -62,17 +57,12 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} #################### WHEEL BUILD IMAGE #################### FROM base AS build -ARG PYTHON_VERSION=3.10 - # install build dependencies COPY requirements-build.txt requirements-build.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-build.txt -# install compiler cache to speed up compilation leveraging local or remote caching -RUN apt-get update -y && apt-get install -y ccache - # files and directories related to build wheels COPY csrc csrc COPY setup.py setup.py @@ -95,6 +85,8 @@ ARG buildkite_commit ENV BUILDKITE_COMMIT=${buildkite_commit} ARG USE_SCCACHE +ARG SCCACHE_BUCKET_NAME=vllm-build-sccache +ARG SCCACHE_REGION_NAME=us-west-2 # if USE_SCCACHE is set, use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/pip \ if [ "$USE_SCCACHE" = "1" ]; then \ @@ -103,12 +95,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \ && tar -xzf sccache.tar.gz \ && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ - && if [ "$CUDA_VERSION" = "11.8.0" ]; then \ - export SCCACHE_BUCKET=vllm-build-sccache-2; \ - else \ - export SCCACHE_BUCKET=vllm-build-sccache; \ - fi \ - && 
export SCCACHE_REGION=us-west-2 \ + && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ + && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ + && export SCCACHE_IDLE_TIMEOUT=0 \ && export CMAKE_BUILD_TYPE=Release \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ @@ -160,23 +149,24 @@ FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.10 WORKDIR /vllm-workspace +ENV DEBIAN_FRONTEND=noninteractive + +RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ + echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment +# Install Python and other dependencies RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get update -y \ - && apt-get install -y ccache software-properties-common \ + && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update -y \ - && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ - && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \ - && python3 --version - -RUN apt-get update -y \ - && apt-get install -y python3-pip git vim curl libibverbs-dev - -# Install pip s.t. it will be compatible with our PYTHON_VERSION -RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} -RUN python3 -m pip --version + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -194,7 +184,8 @@ RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamb python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp310-cp310-linux_x86_64.whl + . 
/etc/environment && \ + python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl #################### vLLM installation IMAGE #################### From a15224642832acdddd757ddc95ed40a0ad1be33d Mon Sep 17 00:00:00 2001 From: Siyuan Liu Date: Thu, 22 Aug 2024 13:51:23 -0700 Subject: [PATCH 02/24] [Misc] fix typo in triton import warning (#7794) --- vllm/triton_utils/importing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index 3455036586a93..ce46082247639 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -8,4 +8,4 @@ if not HAS_TRITON: logger.info("Triton not installed; certain GPU-related functions" - " will be not be available.") + " will not be available.") From b903e1ba7fca15f0dc49ab49a9ec8107f625c048 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Thu, 22 Aug 2024 15:50:21 -0600 Subject: [PATCH 03/24] [Frontend] error suppression cleanup (#7786) Signed-off-by: Joe Runde --- tests/entrypoints/openai/rpc/test_zmq_client.py | 7 ++++--- vllm/entrypoints/openai/api_server.py | 5 ++--- vllm/entrypoints/openai/rpc/client.py | 13 ++++++++++++- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/tests/entrypoints/openai/rpc/test_zmq_client.py b/tests/entrypoints/openai/rpc/test_zmq_client.py index 631d15cd03ed7..cafd125c5a598 100644 --- a/tests/entrypoints/openai/rpc/test_zmq_client.py +++ b/tests/entrypoints/openai/rpc/test_zmq_client.py @@ -75,11 +75,12 @@ async def test_client_aborts_use_timeouts(monkeypatch, dummy_server, m.setattr(dummy_server, "abort", lambda x: None) m.setattr(client, "_data_timeout", 10) - # Ensure the client doesn't hang + # The client should suppress timeouts on `abort`s + # and return normally, assuming the server will eventually + # abort the request. client_task = asyncio.get_running_loop().create_task( client.abort("test request id")) - with pytest.raises(TimeoutError, match="Server didn't reply within"): - await asyncio.wait_for(client_task, timeout=0.05) + await asyncio.wait_for(client_task, timeout=0.05) @pytest.mark.asyncio diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 603ac19d8c04b..8e8371ef1559a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -6,7 +6,7 @@ import re import tempfile from argparse import Namespace -from contextlib import asynccontextmanager, suppress +from contextlib import asynccontextmanager from http import HTTPStatus from typing import AsyncIterator, Optional, Set @@ -83,8 +83,7 @@ async def lifespan(app: FastAPI): async def _force_log(): while True: await asyncio.sleep(10) - with suppress(Exception): - await async_engine_client.do_log_stats() + await async_engine_client.do_log_stats() if not engine_args.disable_log_stats: task = asyncio.create_task(_force_log()) diff --git a/vllm/entrypoints/openai/rpc/client.py b/vllm/entrypoints/openai/rpc/client.py index 55b92d41975ea..dc316ca1160c6 100644 --- a/vllm/entrypoints/openai/rpc/client.py +++ b/vllm/entrypoints/openai/rpc/client.py @@ -335,7 +335,18 @@ async def _is_tracing_enabled_rpc(self) -> bool: async def abort(self, request_id: str): """Send an ABORT_REQUEST signal to the RPC Server""" - with suppress(RPCClientClosedError): + + # Suppress timeouts as well. 
+ # In cases where the server is busy processing requests and a very + # large volume of abort requests arrive, it is likely that the server + # will not be able to ack all of them in time. We have seen this when + # we abort 20k requests at once while another 2k are processing- many + # of them time out, but we see the server successfully abort all of the + # requests. + # In this case we assume that the server has received or will receive + # these abort requests, and ignore the timeout. This prevents a massive + # wall of `TimeoutError` stack traces. + with suppress(RPCClientClosedError, TimeoutError): await self._send_one_way_rpc_request( request=RPCAbortRequest(request_id), error_message=f"RPCAbortRequest {request_id} failed") From c01a6cb23144b67b473e569a25b6f9725bc3f85b Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Thu, 22 Aug 2024 17:44:25 -0700 Subject: [PATCH 04/24] [Ray backend] Better error when pg topology is bad. (#7584) Co-authored-by: youkaichao --- .buildkite/test-pipeline.yaml | 1 + .../distributed/test_multi_node_assignment.py | 64 ++++++++ vllm/executor/ray_utils.py | 141 ++++++++++++++++-- 3 files changed, 197 insertions(+), 9 deletions(-) create mode 100644 tests/distributed/test_multi_node_assignment.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 460095429680a..d70a9ce240825 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -293,6 +293,7 @@ steps: commands: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py diff --git a/tests/distributed/test_multi_node_assignment.py b/tests/distributed/test_multi_node_assignment.py new file mode 100644 index 0000000000000..9f9c0ff07ee37 --- /dev/null +++ b/tests/distributed/test_multi_node_assignment.py @@ -0,0 +1,64 @@ +"""Make sure ray assigns GPU workers to the correct node. + +Run: +```sh +cd $VLLM_PATH/tests + +pytest distributed/test_multi_node_assignment.py +``` +""" + +import os + +import pytest +import ray +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +from vllm import initialize_ray_cluster +from vllm.config import ParallelConfig +from vllm.executor.ray_utils import _wait_until_pg_removed +from vllm.utils import get_ip + +VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" + + +@pytest.mark.skipif(not VLLM_MULTI_NODE, + reason="Need at least 2 nodes to run the test.") +def test_multi_node_assignment() -> None: + + # NOTE: important to keep this class definition here + # to let ray use cloudpickle to serialize it. 
+ class Actor: + + def get_ip(self): + return get_ip() + + for _ in range(10): + config = ParallelConfig(1, 2) + initialize_ray_cluster(config) + + current_ip = get_ip() + workers = [] + for bundle_id, bundle in enumerate( + config.placement_group.bundle_specs): + if not bundle.get("GPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=config.placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + + worker = ray.remote( + num_cpus=0, + num_gpus=1, + scheduling_strategy=scheduling_strategy, + )(Actor).remote() + worker_ip = ray.get(worker.get_ip.remote()) + assert worker_ip == current_ip + workers.append(worker) + + for worker in workers: + ray.kill(worker) + + _wait_until_pg_removed(config.placement_group) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index ffc94d07ed399..bfdd0f5cf97b3 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,4 +1,6 @@ -from typing import List, Optional, Tuple, Union +import time +from collections import defaultdict +from typing import Dict, List, Optional, Tuple, Union import msgspec @@ -11,9 +13,13 @@ from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) +PG_WAIT_TIMEOUT = 1800 try: import ray + from ray._private.state import available_resources_per_node + from ray.util import placement_group_table + from ray.util.placement_group import PlacementGroup class RayWorkerWrapper(WorkerWrapperBase): """Ray wrapper for vllm.worker.Worker, allowing Worker to be @@ -98,6 +104,106 @@ def assert_ray_available(): "`pip install ray`.") from ray_import_err +def _verify_bundles(placement_group: "PlacementGroup", + parallel_config: ParallelConfig, device_str: str): + """Verify a given placement group has bundles located in the right place. + + There are 2 rules. + - Warn if all tensor parallel workers cannot fit in a single node. + - Fail if driver node is not included in a placement group. + """ + assert ray.is_initialized(), ( + "Ray is not initialized although distributed-executor-backend is ray.") + pg_data = placement_group_table(placement_group) + # bundle_idx -> node_id + bundle_to_node_ids = pg_data["bundles_to_node_id"] + # bundle_idx -> bundle (e.g., {"GPU": 1}) + bundles = pg_data["bundles"] + # node_id -> List of bundle (e.g., {"GPU": 1}) + node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list) + + for bundle_idx, node_id in bundle_to_node_ids.items(): + node_id_to_bundle[node_id].append(bundles[bundle_idx]) + driver_node_id = ray.get_runtime_context().get_node_id() + + if driver_node_id not in node_id_to_bundle: + raise RuntimeError( + f"driver node id {driver_node_id} is not included in a placement " + f"group {placement_group.id}. Node id -> bundles " + f"{node_id_to_bundle}. " + "You don't have enough GPUs available in a current node. Check " + "`ray status` to see if you have available GPUs in a node " + f"{driver_node_id} before starting an vLLM engine.") + + for node_id, bundles in node_id_to_bundle.items(): + if len(bundles) < parallel_config.tensor_parallel_size: + logger.warning( + "tensor_parallel_size=%d " + "is bigger than a reserved number of %ss (%d " + "%ss) in a node %s. Tensor parallel workers can be " + "spread out to 2+ nodes which can degrade the performance " + "unless you have fast interconnect across nodes, like " + "Infiniband. 
To resolve this issue, make sure you have more " + "than %d GPUs available at each node.", + parallel_config.tensor_parallel_size, device_str, len(bundles), + device_str, node_id, parallel_config.tensor_parallel_size) + + +def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): + """Wait until a placement group is ready. + + It prints the informative log messages if the placement group is + not created within time. + + """ + # Wait until PG is ready - this will block until all + # requested resources are available, and will timeout + # if they cannot be provisioned. + placement_group_specs = current_placement_group.bundle_specs + + s = time.time() + pg_ready_ref = current_placement_group.ready() + wait_interval = 10 + while time.time() - s < PG_WAIT_TIMEOUT: + ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval) + if len(ready) > 0: + break + + # Exponential backoff for warning print. + wait_interval *= 2 + logger.info( + "Waiting for creating a placement group of specs for " + "%d seconds. specs=%s. Check " + "`ray status` to see if you have enough resources.", + int(time.time() - s), placement_group_specs) + + try: + ray.get(pg_ready_ref, timeout=0) + except ray.exceptions.GetTimeoutError: + raise ValueError( + "Cannot provide a placement group of " + f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See " + "`ray status` to make sure the cluster has enough resources." + ) from None + + +def _wait_until_pg_removed(current_placement_group: "PlacementGroup"): + ray.util.remove_placement_group(current_placement_group) + s = time.time() + wait_interval = 10 + while time.time() - s < PG_WAIT_TIMEOUT: + pg = ray.util.get_current_placement_group() + if pg is None: + break + + # Exponential backoff for warning print. + wait_interval *= 2 + logger.info( + "Waiting for removing a placement group of specs for " + "%d seconds.", int(time.time() - s)) + time.sleep(wait_interval) + + def initialize_ray_cluster( parallel_config: ParallelConfig, ray_address: Optional[str] = None, @@ -156,15 +262,32 @@ def initialize_ray_cluster( f"The number of required {device_str}s exceeds the total " f"number of available {device_str}s in the placement group.") # Create a new placement group - placement_group_specs = ([{ - device_str: 1 - }] * parallel_config.world_size) + placement_group_specs: List[Dict[str, float]] = ([{ + device_str: 1.0 + } for _ in range(parallel_config.world_size)]) + + # vLLM engine is also a worker to execute model with an accelerator, + # so it requires to have the device in a current node. Check if + # the current node has at least one device. + current_ip = get_ip() + current_node_id = ray.get_runtime_context().get_node_id() + current_node_resource = available_resources_per_node()[current_node_id] + if current_node_resource.get(device_str, 0) < 1: + raise ValueError( + f"Current node has no {device_str} available. " + f"{current_node_resource=}. vLLM engine cannot start without " + f"{device_str}. Make sure you have at least 1 {device_str} " + f"available in a node {current_node_id=} {current_ip=}.") + # This way, at least bundle is required to be created in a current + # node. + placement_group_specs[0][f"node:{current_ip}"] = 0.001 + + # By default, Ray packs resources as much as possible. current_placement_group = ray.util.placement_group( - placement_group_specs) - # Wait until PG is ready - this will block until all - # requested resources are available, and will timeout - # if they cannot be provisioned. 
- ray.get(current_placement_group.ready(), timeout=1800) + placement_group_specs, strategy="PACK") + _wait_until_pg_ready(current_placement_group) + assert current_placement_group is not None + _verify_bundles(current_placement_group, parallel_config, device_str) # Set the placement group in the parallel config parallel_config.placement_group = current_placement_group From fc5ebbd1d3453461ea6e00a78faf87c41d1aa625 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 23 Aug 2024 11:06:54 +0800 Subject: [PATCH 05/24] [Hardware][Intel GPU] refactor xpu_model_runner for tp (#7712) --- vllm/executor/ray_xpu_executor.py | 383 +----------------- vllm/worker/xpu_model_runner.py | 628 +++++++++++++++++------------- vllm/worker/xpu_worker.py | 9 +- 3 files changed, 370 insertions(+), 650 deletions(-) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index 938f83bc1338b..2b1cdc09b0a9f 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -1,386 +1,37 @@ import asyncio -import os -from collections import defaultdict -from itertools import islice, repeat -from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set, - Tuple, Union) +from typing import List, Optional import vllm.envs as envs -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, PromptAdapterConfig, - SchedulerConfig, SpeculativeConfig) -from vllm.executor.distributed_gpu_executor import ( # yapf: disable - DistributedGPUExecutor, DistributedGPUExecutorAsync) -from vllm.executor.ray_utils import RayWorkerWrapper, ray +from vllm.executor.ray_gpu_executor import RayGPUExecutor, RayGPUExecutorAsync +from vllm.executor.xpu_executor import XPUExecutor from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) - -if ray is not None: - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup +from vllm.utils import get_vllm_instance_id, make_async logger = init_logger(__name__) -# If the env var is set, it uses the Ray's compiled DAG API -# which optimizes the control plane overhead. -# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. -USE_RAY_COMPILED_DAG = envs.VLLM_USE_RAY_COMPILED_DAG - - -class RayXPUExecutor(DistributedGPUExecutor): - - uses_ray: bool = True - - def __init__( - self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - prompt_adapter_config: Optional[PromptAdapterConfig], - speculative_config: Optional[SpeculativeConfig], - ) -> None: - assert device_config.device_type == "xpu" - assert (not speculative_config - ), "Speculative decoding not yet supported for XPU backend" - - self.model_config = model_config - self.cache_config = cache_config - self.load_config = load_config - self.lora_config = lora_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.prompt_adapter_config = prompt_adapter_config - - placement_group = self.parallel_config.placement_group - - # Disable Ray usage stats collection. 
- ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") - if ray_usage != "1": - os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - - # Create the parallel GPU workers. - self._init_workers_ray(placement_group) - - self.forward_dag = None - if USE_RAY_COMPILED_DAG: - self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) - - # This is non-None when the execute model loop is running - # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. - self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None - # Updated by implementations that require additional args to be passed - # to the _run_workers execute_model call - self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {} - - def _init_executor(self) -> None: - pass - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks. - This invokes `determine_num_available_blocks` on each worker and takes - the min of the results, guaranteeing that the selected cache sizes are - compatible with all workers. - - Returns: - - Tuple[num_gpu_blocks, num_cpu_blocks] - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers("determine_num_available_blocks", ) - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - - return num_gpu_blocks, num_cpu_blocks - - def _get_worker_wrapper_args(self) -> Dict[str, Any]: - return dict( - worker_module_name="vllm.worker.xpu_worker", - worker_class_name="XPUWorker", - trust_remote_code=self.model_config.trust_remote_code, - ) - - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - if self.parallel_config.tensor_parallel_size == 1: - # For single GPU case, we use a ray worker with constrained memory. - num_gpus = self.cache_config.gpu_memory_utilization - else: - # Otherwise, the ray workers are allocated with a full GPU. - num_gpus = 1 - - # The driver dummy worker does not actually use any resources. - # It holds the resource for the driver worker. - self.driver_dummy_worker: Optional[RayWorkerWrapper] = None - # The remaining workers are the actual ray actors. - self.workers: List[RayWorkerWrapper] = [] - - # Create the workers. - driver_ip = get_ip() - worker_wrapper_kwargs = self._get_worker_wrapper_args() - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("GPU", 0): - continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - worker = ray.remote( - num_cpus=0, - num_gpus=num_gpus, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerWrapper).remote(**worker_wrapper_kwargs) - - worker_ip = ray.get(worker.get_node_ip.remote()) - if worker_ip == driver_ip and self.driver_dummy_worker is None: - # If the worker is on the same node as the driver, we use it - # as the resource holder for the driver process. - self.driver_dummy_worker = worker - self.driver_worker = RayWorkerWrapper(**worker_wrapper_kwargs) - else: - # Else, added to the list of workers. - self.workers.append(worker) - if self.driver_dummy_worker is None: - raise ValueError( - "Ray does not allocate any GPUs on the driver node. 
Consider " - "adjusting the Ray placement group or running the driver on a " - "GPU node.") +class RayXPUExecutor(RayGPUExecutor, XPUExecutor): + def _get_env_vars_to_be_updated(self): # Get the set of GPU IDs used on each node. worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", use_dummy_driver=True) - node_workers = defaultdict(list) - node_gpus = defaultdict(list) - - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): - node_workers[node_id].append(i) - node_gpus[node_id].extend(gpu_ids) - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - # TODO: add env var for xpu - - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - def collect_arg_helper_func(**kwargs): - # avoid writing `{"name": value}` manually - return kwargs - - init_worker_all_kwargs = [] - - # Initialize the actual workers inside worker wrapper. - for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids, ): - local_rank = node_workers[node_id].index(rank) - init_worker_all_kwargs.append( - collect_arg_helper_func( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - load_config=self.load_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - is_driver_worker=rank == 0, - )) - self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) + VLLM_INSTANCE_ID = get_vllm_instance_id() - self._run_workers("init_device") - self._run_workers( - "load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers, - ) + # Set environment variables for the driver and workers. + all_args_to_update_environment_variables = [({ + "VLLM_INSTANCE_ID": + VLLM_INSTANCE_ID, + "VLLM_TRACE_FUNCTION": + str(envs.VLLM_TRACE_FUNCTION), + }, ) for (_, _) in worker_node_and_gpu_ids] + return all_args_to_update_environment_variables - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the KV cache in all workers. - """ - # NOTE: We log here to avoid multiple logs when number of workers is - # greater than one. We could log in the engine, but not all executors - # have GPUs. - logger.info("# GPU blocks: %d, " - "# CPU blocks: %d", num_gpu_blocks, num_cpu_blocks) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - self._run_workers("initialize_cache", - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks) - - def _driver_execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - """Run execute_model in the driver worker. - - Passing None will cause the driver to stop the model execution - loop running in each of the remote workers. - """ - return self.driver_worker.execute_method("execute_model", - execute_model_req) - - def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "add_lora", - lora_request=lora_request, - ) - - def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." 
- return self._run_workers( - "remove_lora", - lora_id=lora_id, - ) - - def list_loras(self) -> Set[int]: - return self._run_workers("list_loras") - - def _run_workers( - self, - method: str, - *args, - async_run_remote_workers_only: bool = False, - all_args: Optional[List[Tuple[Any, ...]]] = None, - all_kwargs: Optional[List[Dict[str, Any]]] = None, - use_dummy_driver: bool = False, - max_concurrent_workers: Optional[int] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers. Can be used in the following - ways: - - - args/kwargs: All workers share the same args/kwargs - - args/kwargs and driver_args/driver_kwargs: Driver worker has - different args - - all_args/all_kwargs: args/kwargs for each worker are specified - individually - """ - - if max_concurrent_workers: - raise NotImplementedError( - "max_concurrent_workers is not supported yet.") - - count = len(self.workers) - all_worker_args = repeat(args, count) if all_args is None \ - else islice(all_args, 1, None) - all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ - else islice(all_kwargs, 1, None) - - # Start the ray workers first. - ray_worker_outputs = [ - worker.execute_method.remote(method, *worker_args, **worker_kwargs) - for (worker, worker_args, worker_kwargs - ) in zip(self.workers, all_worker_args, all_worker_kwargs) - ] - - if async_run_remote_workers_only: - # Just return futures - return ray_worker_outputs - - driver_worker_output = [] - driver_args = args if all_args is None else all_args[0] - driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] - # Start the driver worker after all the ray workers. - if not use_dummy_driver: - driver_worker_output = self.driver_worker.execute_method( - method, *driver_args, **driver_kwargs) - else: - assert self.driver_dummy_worker is not None - driver_worker_output = ray.get( - self.driver_dummy_worker.execute_method.remote( - method, *driver_args, **driver_kwargs)) - # Get the results of the ray workers. - if self.workers: - ray_worker_outputs = ray.get(ray_worker_outputs) - - return driver_worker_output + ray_worker_outputs - - def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: - """Wait for futures returned from _run_workers() with - async_run_remote_workers_only to complete.""" - ray.get(parallel_worker_tasks) - - def _compiled_ray_dag(self, enable_asyncio: bool): - import pkg_resources - from packaging import version - - required_version = version.parse("2.32") - current_version = version.parse( - pkg_resources.get_distribution("ray").version) - if current_version < required_version: - raise ValueError(f"Ray version {required_version} or greater is " - f"required, but found {current_version}") - - from ray.dag import InputNode, MultiOutputNode - assert self.parallel_config.use_ray - - # Right now, compiled DAG requires at least 1 arg. We send - # a dummy value for now. It will be fixed soon. - with InputNode() as input_data: - forward_dag = MultiOutputNode([ - worker.execute_model_compiled_dag_remote. 
- bind( # type: ignore[attr-defined] - input_data) for worker in self.workers - ]) - return forward_dag.experimental_compile(enable_asyncio=enable_asyncio) - - def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() - - def _check_if_any_actor_is_dead(self): - if not self.workers: - return - - dead_actors = [] - for actor in self.workers: - actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access - if actor_state["State"] == "DEAD": - dead_actors.append(actor) - if dead_actors: - raise RuntimeError("At least one Worker is dead. " - f"Dead Workers: {dead_actors}. ") - - -class RayXPUExecutorAsync(RayXPUExecutor, DistributedGPUExecutorAsync): +class RayXPUExecutorAsync(RayXPUExecutor, RayGPUExecutorAsync): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.driver_exec_method = make_async(self.driver_worker.execute_method) - - async def _driver_execute_model_async( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - return await self.driver_exec_method("execute_model", - execute_model_req) - - async def _start_worker_execution_loop(self): - coros = [ - worker.execute_method.remote("start_worker_execution_loop") - for worker in self.workers - ] - return await asyncio.gather(*coros) + self.pp_locks: Optional[List[asyncio.Lock]] = None diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 0bfc57a1c57de..0335bbcd091e8 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,14 +1,17 @@ +import dataclasses +import time +import weakref from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union +from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, + TypeVar) import torch import torch.nn as nn from vllm.attention import get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, + ModelConfig, ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig) -from vllm.distributed import broadcast_tensor_dict from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model @@ -20,7 +23,7 @@ from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata from vllm.worker.model_runner_base import ( - ModelRunnerBase, ModelRunnerInputBase, + ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, _add_sampling_metadata_broadcastable_dict, _init_attn_metadata_from_tensor_dict, @@ -37,6 +40,8 @@ _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) ] +TModelInputForXPU = TypeVar('TModelInputForXPU', bound="ModelInputForXPU") + @dataclass(frozen=True) class ModelInputForXPU(ModelRunnerInputBase): @@ -46,11 +51,40 @@ class ModelInputForXPU(ModelRunnerInputBase): input_tokens: Optional[torch.Tensor] = None input_positions: Optional[torch.Tensor] = None attn_metadata: Optional["AttentionMetadata"] = None - sampling_metadata: Optional["SamplingMetadata"] = None multi_modal_kwargs: Optional[BatchedTensorInputs] = None + virtual_engine: Optional[int] = None + seq_lens: Optional[List[int]] = None + query_lens: Optional[List[int]] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + 
"input_tokens": self.input_tokens, + "input_positions": self.input_positions, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls: Type[TModelInputForXPU], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> TModelInputForXPU: + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +@dataclass(frozen=True) +class ModelInputForXPUWithSamplingMetadata(ModelInputForXPU): + """ + Used by the ModelRunner. + """ + sampling_metadata: Optional["SamplingMetadata"] = None - def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, @@ -62,10 +96,10 @@ def as_broadcastable_tensor_dict( @classmethod def from_broadcasted_tensor_dict( - cls: Type["ModelInputForXPU"], + cls, tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForXPU": + ) -> "ModelInputForXPUWithSamplingMetadata": tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) if attn_backend is not None: tensor_dict = _init_attn_metadata_from_tensor_dict( @@ -73,7 +107,230 @@ def from_broadcasted_tensor_dict( return cls(**tensor_dict) -class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]): +class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): + + def __init__(self, + runner: "XPUModelRunner", + finished_requests_ids: Optional[List[str]] = None) -> None: + super().__init__() + self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] + self.runner = runner + self.model_input_cls = self.runner._model_input_cls + self.attn_backend = self.runner.attn_backend + self.sliding_window = self.runner.sliding_window + self.block_size = self.runner.block_size + self.device = self.runner.device + + def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): + self.seq_group_metadata_list.append(seq_group_metadata) + + def build(self) -> ModelInputForXPU: + is_prompt = self.seq_group_metadata_list[0].is_prompt + # Prepare input tensors. 
+ if is_prompt: + (input_tokens, input_positions, attn_metadata, seq_lens, + multi_modal_kwargs) = self._prepare_prompt( + self.seq_group_metadata_list) + else: + (input_tokens, input_positions, + attn_metadata) = self._prepare_decode( + self.seq_group_metadata_list) + seq_lens = [] + multi_modal_kwargs = None + + return self.model_input_cls( + input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + multi_modal_kwargs=multi_modal_kwargs, + seq_lens=seq_lens, + query_lens=seq_lens, + ) + + def _prepare_prompt( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], + BatchedTensorInputs]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + seq_lens: List[int] = [] + multi_modal_inputs_list: List[MultiModalInputs] = [] + + for seq_group_metadata in seq_group_metadata_list: + assert seq_group_metadata.is_prompt + seq_ids = list(seq_group_metadata.seq_data.keys()) + assert len(seq_ids) == 1 + seq_id = seq_ids[0] + + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_tokens = seq_data.get_token_ids() + computed_len = seq_data.get_num_computed_tokens() + seq_len = len(prompt_tokens) + + seq_lens.append(seq_len) # Prompt token num + input_tokens.extend(prompt_tokens) # Token ids + + # Token position ids + # NOTE(woosuk): Here we assume that the first token in the prompt + # is always the first token in the sequence. + input_positions.extend(list(range(computed_len, seq_len))) + + if seq_group_metadata.block_tables is None: + # During memory profiling, the block tables are not initialized + # yet. In this case, we just use a dummy slot mapping. + slot_mapping.extend([_PAD_SLOT_ID] * seq_len) + continue + + # Compute the slot mapping. + block_table = seq_group_metadata.block_tables[seq_id] + # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, + # where start_idx is max(0, seq_len - sliding_window). + # For example, if the prompt len is 10, sliding window is 8, and + # block size is 4, the first two tokens are masked and the slot + # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
+ start_idx = 0 + if self.sliding_window is not None: + start_idx = max(0, seq_len - self.sliding_window) + + for i in range(computed_len, seq_len): + if i < start_idx: + slot_mapping.append(_PAD_SLOT_ID) + continue + + block_number = block_table[i // + self.block_size] # type: ignore + block_offset = i % self.block_size # type: ignore + slot = block_number * self.block_size + block_offset + slot_mapping.append(slot) + + num_prompt_tokens = len(input_tokens) + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) # type: ignore + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) # type: ignore + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) # type: ignore + + max_seqlen = max(seq_lens) + tmp = [0] + tmp.extend(seq_lens) + seqlen = torch.tensor(tmp) + seqlen_q = torch.cumsum(seqlen, dim=0).to(device=self.device) + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=True, + slot_mapping=slot_mapping, + seq_lens=seq_lens, + seqlen_q=seqlen_q, + max_seqlen=max_seqlen, + seq_lens_tensor=torch.tensor([]), + max_decode_seq_len=0, + num_prefills=len(seq_lens), + num_prefill_tokens=num_prompt_tokens, + num_decode_tokens=0, + block_tables=torch.tensor([], device=self.device, dtype=torch.int), + ) + + multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + + return (input_tokens, input_positions, attn_metadata, seq_lens, + multi_modal_kwargs) + + def _prepare_decode( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[int] = [] + input_positions: List[int] = [] + slot_mapping: List[int] = [] + seq_lens: List[int] = [] + block_tables: List[List[int]] = [] + + for seq_group_metadata in seq_group_metadata_list: + assert not seq_group_metadata.is_prompt + assert seq_group_metadata.token_chunk_size == 1 + + seq_ids = list(seq_group_metadata.seq_data.keys()) + + for seq_id in seq_ids: + seq_data = seq_group_metadata.seq_data[seq_id] + generation_token = seq_data.get_last_token_id() + input_tokens.append(generation_token) + + seq_len = seq_data.get_len() + position = seq_len - 1 + input_positions.append(position) + + seq_len = seq_len if self.sliding_window is None else min( + seq_len, self.sliding_window) + seq_lens.append(seq_len) + + block_table = seq_group_metadata.block_tables[seq_id] + block_number = block_table[position // self.block_size] + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping.append(slot) + + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + block_tables.append(block_table) + + max_decode_seq_len = max(seq_lens) + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.int, + device=self.device) + + block_tables = make_tensor_with_pad( + block_tables, + pad=0, + dtype=torch.int, + device=self.device, + ) + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=False, + slot_mapping=slot_mapping, + seq_lens=seq_lens, + seqlen_q=torch.tensor([]), + max_seqlen=0, + 
seq_lens_tensor=seq_lens_tensor, + max_decode_seq_len=max_decode_seq_len, + num_prefill_tokens=0, + num_decode_tokens=len(input_tokens), + num_prefills=0, + block_tables=block_tables, + ) + return ( + input_tokens, + input_positions, + attn_metadata, + ) + + +class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): + _model_input_cls: Type[ModelInputForXPUWithSamplingMetadata] = ( + ModelInputForXPUWithSamplingMetadata) + _builder_cls: Type[ModelInputForXPUBuilder] = ModelInputForXPUBuilder def __init__( self, @@ -84,30 +341,32 @@ def __init__( cache_config: CacheConfig, load_config: LoadConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], kv_cache_dtype: Optional[str] = "auto", - prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, + prompt_adapter_config: Optional[PromptAdapterConfig] = None, + return_hidden_states: bool = False, + observability_config: Optional[ObservabilityConfig] = None, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - *args, - **kwargs, ): self.model_config = model_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config + self.device_config = device_config + self.cache_config = cache_config self.lora_config = lora_config self.load_config = load_config - self.cache_config = cache_config - self.prompt_adapter_config = prompt_adapter_config - self.multimodal_config = multimodal_config self.is_driver_worker = is_driver_worker + self.prompt_adapter_config = prompt_adapter_config + self.observability_config = observability_config + if self.observability_config is not None: + print(f"observability_config is {self.observability_config}") + self.return_hidden_states = return_hidden_states - self.sliding_window = model_config.get_sliding_window() - self.device_config = device_config self.device = self.device_config.device self.kv_cache_dtype = kv_cache_dtype + self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.attn_backend = get_attn_backend( @@ -203,166 +462,68 @@ def profile_run(self) -> None: # Run the model with the dummy inputs. num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers - model_input = self.prepare_model_input(seqs) + finished_requests_ids = [seq.request_id for seq in seqs] + model_input = self.prepare_model_input( + seqs, finished_requests_ids=finished_requests_ids) self.execute_model(model_input, kv_caches) torch.xpu.synchronize() return def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> ModelInputForXPU: - return (ModelInputForXPU.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - )) - - def prepare_model_input( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForXPU: - multi_modal_kwargs = None - if self.is_driver_worker: - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - # Prepare input tensors. 
- if is_prompt: - (input_tokens, input_positions, attn_metadata, seq_lens, - multi_modal_kwargs - ) = self._prepare_prompt(seq_group_metadata_list) - else: - (input_tokens, input_positions, - attn_metadata) = self._prepare_decode(seq_group_metadata_list) - seq_lens = [] - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # subquery_lens is not needed if chunked prefill is not - # supported. Since CPU worker doesn't support chunked prefill - # just use seq_lens instead. - seq_lens, - self.device, - pin_memory=False, - generators=self.get_generators(finished_requests_ids)) - # Broadcast the metadata. - metadata_dict = { - "input_tokens": input_tokens, - "input_positions": input_positions, - "selected_token_indices": - sampling_metadata.selected_token_indices, - "multi_modal_kwargs": multi_modal_kwargs, - } - metadata_dict.update(attn_metadata.asdict_zerocopy()) - broadcast_tensor_dict(metadata_dict, src=0) - else: - metadata_dict = broadcast_tensor_dict(src=0) - input_tokens = metadata_dict.pop("input_tokens") - input_positions = metadata_dict.pop("input_positions") - selected_token_indices = metadata_dict.pop( - "selected_token_indices") - multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs") - attn_metadata = self.attn_backend.make_metadata(**metadata_dict) - sampling_metadata = SamplingMetadata( - seq_groups=None, - selected_token_indices=selected_token_indices, - categorized_sample_indices=None, - num_prompts=0, - ) - - return ModelInputForXPU(input_tokens=input_tokens, - input_positions=input_positions, - attn_metadata=attn_metadata, - sampling_metadata=sampling_metadata, - multi_modal_kwargs=multi_modal_kwargs) + tensor_dict: Dict[str, + Any]) -> ModelInputForXPUWithSamplingMetadata: + return ( + ModelInputForXPUWithSamplingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + )) - def _prepare_decode( + def _prepare_model_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[int] = [] - input_positions: List[int] = [] - slot_mapping: List[int] = [] - seq_lens: List[int] = [] - block_tables: List[List[int]] = [] - + finished_requests_ids: Optional[List[str]] = None + ) -> ModelInputForXPUWithSamplingMetadata: + """Helper method to prepare the model input based on a given sequence + group. Prepares metadata needed for the base model forward pass but not + metadata for possible additional steps, e.g., sampling. 
+ + """ + builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) for seq_group_metadata in seq_group_metadata_list: - assert not seq_group_metadata.is_prompt - assert seq_group_metadata.token_chunk_size == 1 + builder.add_seq_group(seq_group_metadata) - seq_ids = list(seq_group_metadata.seq_data.keys()) + return builder.build() # type: ignore - for seq_id in seq_ids: - seq_data = seq_group_metadata.seq_data[seq_id] - generation_token = seq_data.get_last_token_id() - input_tokens.append(generation_token) - - seq_len = seq_data.get_len() - position = seq_len - 1 - input_positions.append(position) - - seq_len = seq_len if self.sliding_window is None else min( - seq_len, self.sliding_window) - seq_lens.append(seq_len) - - block_table = seq_group_metadata.block_tables[seq_id] - block_number = block_table[position // self.block_size] - block_offset = position % self.block_size - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) - - if self.sliding_window is not None: - sliding_window_blocks = (self.sliding_window // - self.block_size) - block_table = block_table[-sliding_window_blocks:] - block_tables.append(block_table) - - max_decode_seq_len = max(seq_lens) - - input_tokens = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) - input_positions = torch.tensor(input_positions, - dtype=torch.long, - device=self.device) - slot_mapping = torch.tensor(slot_mapping, - dtype=torch.long, - device=self.device) - seq_lens_tensor = torch.tensor(seq_lens, - dtype=torch.int, - device=self.device) - - block_tables = make_tensor_with_pad( - block_tables, - pad=0, - dtype=torch.int, - device=self.device, - ) - - attn_metadata = self.attn_backend.make_metadata( - is_prompt=False, - slot_mapping=slot_mapping, - seq_lens=seq_lens, - seqlen_q=None, - max_seqlen=None, - seq_lens_tensor=seq_lens_tensor, - max_decode_seq_len=max_decode_seq_len, - num_prefill_tokens=0, - num_decode_tokens=len(input_tokens), - num_prefills=0, - block_tables=block_tables, - ) - return ( - input_tokens, - input_positions, - attn_metadata, - ) + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None + ) -> ModelInputForXPUWithSamplingMetadata: + """Prepare the model input based on a given sequence group, including + metadata for the sampling step. 
+ + """ + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list, finished_requests_ids) + # Sampling metadata is only required for the final pp group + generators = self.get_generators(finished_requests_ids) + sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, + model_input.seq_lens, + model_input.query_lens, + self.device, + pin_memory=False, + generators=generators) + + return dataclasses.replace(model_input, + sampling_metadata=sampling_metadata, + virtual_engine=virtual_engine) @torch.inference_mode() def execute_model( self, - model_input: ModelInputForXPU, + model_input: ModelInputForXPUWithSamplingMetadata, kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, @@ -372,20 +533,21 @@ def execute_model( "XPUModelRunner does not support multi-step execution.") model_executable = self.model - execute_model_kwargs = { - "input_ids": - model_input.input_tokens, - "positions": - model_input.input_positions, - "kv_caches": - kv_caches, - "attn_metadata": - model_input.attn_metadata, + if (self.observability_config is not None + and self.observability_config.collect_model_forward_time): + model_forward_start_time = time.time() + + hidden_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, - device=self.device), - } - - hidden_states = model_executable(**execute_model_kwargs) + device=self.device)) + if (self.observability_config is not None + and self.observability_config.collect_model_forward_time): + model_forward_end_time = time.time() # Compute the logits. logits = self.model.compute_logits(hidden_states, @@ -396,109 +558,19 @@ def execute_model( return [] # Sample the next token. - output = self.model.sample( + output: SamplerOutput = self.model.sample( logits=logits, sampling_metadata=model_input.sampling_metadata, ) - return [output] - - def _prepare_prompt( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], - BatchedTensorInputs]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[int] = [] - input_positions: List[int] = [] - slot_mapping: List[int] = [] - seq_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] - - for seq_group_metadata in seq_group_metadata_list: - assert seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - - seq_data = seq_group_metadata.seq_data[seq_id] - prompt_tokens = seq_data.get_token_ids() - computed_len = seq_data.get_num_computed_tokens() - seq_len = len(prompt_tokens) + if (self.observability_config is not None + and self.observability_config.collect_model_forward_time + and output is not None): + model_forward_time = (model_forward_end_time - + model_forward_start_time) + # If there are multiple workers, we are still tracking the latency + # from the start time of the driver worker to the end time of the + # driver worker. The model forward time will then end up covering + # the communication time as well. 
+ output.model_forward_time = model_forward_time - seq_lens.append(seq_len) # Prompt token num - input_tokens.extend(prompt_tokens) # Token ids - - # Token position ids - # NOTE(woosuk): Here we assume that the first token in the prompt - # is always the first token in the sequence. - input_positions.extend(list(range(computed_len, seq_len))) - - mm_data = seq_group_metadata.multi_modal_data - if mm_data: - mm_kwargs = self.multi_modal_input_mapper(mm_data) - multi_modal_inputs_list.append(mm_kwargs) - - if seq_group_metadata.block_tables is None: - # During memory profiling, the block tables are not initialized - # yet. In this case, we just use a dummy slot mapping. - slot_mapping.extend([_PAD_SLOT_ID] * seq_len) - continue - - # Compute the slot mapping. - block_table = seq_group_metadata.block_tables[seq_id] - # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, - # where start_idx is max(0, seq_len - sliding_window). - # For example, if the prompt len is 10, sliding window is 8, and - # block size is 4, the first two tokens are masked and the slot - # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. - start_idx = 0 - if self.sliding_window is not None: - start_idx = max(0, seq_len - self.sliding_window) - - for i in range(computed_len, seq_len): - if i < start_idx: - slot_mapping.append(_PAD_SLOT_ID) - continue - - block_number = block_table[i // - self.block_size] # type: ignore - block_offset = i % self.block_size # type: ignore - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) - - num_prompt_tokens = len(input_tokens) - - input_tokens = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) # type: ignore - input_positions = torch.tensor(input_positions, - dtype=torch.long, - device=self.device) # type: ignore - slot_mapping = torch.tensor(slot_mapping, - dtype=torch.long, - device=self.device) # type: ignore - - max_seqlen = max(seq_lens) - tmp = [0] - tmp.extend(seq_lens) - seqlen = torch.tensor(tmp) - seqlen_q = torch.cumsum(seqlen, dim=0).to(device=self.device) - - attn_metadata = self.attn_backend.make_metadata( - is_prompt=True, - slot_mapping=slot_mapping, - seq_lens=seq_lens, - seqlen_q=seqlen_q, - max_seqlen=max_seqlen, - seq_lens_tensor=None, - max_decode_seq_len=None, - num_prefills=len(seq_lens), - num_prefill_tokens=num_prompt_tokens, - num_decode_tokens=0, - block_tables=torch.tensor([], device=self.device, dtype=torch.int), - ) - - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) - - return (input_tokens, input_positions, attn_metadata, seq_lens, - multi_modal_kwargs) + return [output] diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 7c8f5e0cf65ec..b00d1889f8d4b 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -9,8 +9,8 @@ import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig, + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig, SpeculativeConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) @@ -46,7 +46,6 @@ def __init__( rank: int, distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, - multimodal_config: Optional[MultiModalConfig] = None, speculative_config: Optional[SpeculativeConfig] = None, prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = 
False, @@ -73,8 +72,6 @@ def __init__( assert rank % parallel_config.tensor_parallel_size == 0, \ "Driver worker should be rank 0 of tensor parallel group." - self.multimodal_config = multimodal_config - self.model_runner = XPUModelRunner( # type: ignore model_config, parallel_config, @@ -85,7 +82,7 @@ def __init__( lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, - multimodal_config=multimodal_config, + observability_config=self.observability_config, ) # Uninitialized cache engine. Will be initialized by # initialize_cache. From faeddb565d6a528d1cbd169e90bc538178fc2828 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Fri, 23 Aug 2024 13:46:25 +0800 Subject: [PATCH 06/24] [misc] Add Torch profiler support for CPU-only devices (#7806) --- vllm/worker/cpu_worker.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index d9b1d18da156c..52d1806018f51 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -179,6 +179,32 @@ def __init__( self.cache_engine: List[CPUCacheEngine] self.cpu_cache: List[List[torch.Tensor]] + # Torch profiler. Enabled and configured through env vars: + # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + if envs.VLLM_TORCH_PROFILER_DIR: + torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + logger.info("Profiling enabled. Traces will be saved to: %s", + torch_profiler_trace_dir) + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + ], + with_stack=True, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + torch_profiler_trace_dir, use_gzip=True)) + else: + self.profiler = None + + def start_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.start() + + def stop_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.stop() + def init_device(self) -> None: if self.local_omp_cpuid != "all": torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) From e25fee57c2e69161bd261f5986dc5aeb198bbd42 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Fri, 23 Aug 2024 10:12:44 -0300 Subject: [PATCH 07/24] [BugFix] Fix server crash on empty prompt (#7746) Signed-off-by: Max de Bayser --- .../entrypoints/llm/test_prompt_validation.py | 9 ++++++++ .../openai/test_prompt_validation.py | 22 +++++++++++++++++++ vllm/engine/llm_engine.py | 8 +++++++ 3 files changed, 39 insertions(+) create mode 100644 tests/entrypoints/llm/test_prompt_validation.py create mode 100644 tests/entrypoints/openai/test_prompt_validation.py diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py new file mode 100644 index 0000000000000..565dfa01346cc --- /dev/null +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -0,0 +1,9 @@ +import pytest + +from vllm import LLM + + +def test_empty_prompt(): + llm = LLM(model="gpt2") + with pytest.raises(ValueError, match='Prompt cannot be empty'): + llm.generate([""]) diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py new file mode 100644 index 0000000000000..0a573a0066d32 --- /dev/null +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -0,0 +1,22 @@ +# imports for guided decoding tests +import re + +import openai +import pytest + +from ...utils import RemoteOpenAIServer + + 
+@pytest.mark.asyncio +async def test_empty_prompt(): + model_name = "gpt2" + server_args = ["--enforce-eager"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + + with pytest.raises(openai.BadRequestError, + match=re.compile('.+Prompt cannot be empty.+')): + await client.completions.create(model=model_name, + prompt="", + max_tokens=5, + temperature=0.0) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f72902c372181..8c98b64181d06 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -591,6 +591,7 @@ def _add_processed_request( prompt_adapter_request: Optional[PromptAdapterRequest], trace_headers: Optional[Mapping[str, str]] = None, ) -> None: + self._validate_model_inputs(processed_inputs) # Create the sequences. block_size = self.cache_config.block_size seq_id = next(self.seq_counter) @@ -1647,3 +1648,10 @@ def is_encoder_decoder_model(self): def is_embedding_model(self): return self.model_config.is_embedding_model + + def _validate_model_inputs(self, inputs: Union[LLMInputs, + EncoderDecoderLLMInputs]): + prompt_key = "encoder_prompt_token_ids" \ + if self.is_encoder_decoder_model() else "prompt_token_ids" + if not inputs.get(prompt_key): + raise ValueError("Prompt cannot be empty") From 35ee2ad6b9a850a25d94cab582de19de5bca6fbd Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 23 Aug 2024 09:38:50 -0700 Subject: [PATCH 08/24] [github][misc] promote asking llm first (#7809) --- .github/ISSUE_TEMPLATE/100-documentation.yml | 7 +++++++ .github/ISSUE_TEMPLATE/200-installation.yml | 7 +++++++ .github/ISSUE_TEMPLATE/300-usage.yml | 7 +++++++ .github/ISSUE_TEMPLATE/400-bug report.yml | 7 +++++++ .github/ISSUE_TEMPLATE/500-feature request.yml | 7 +++++++ .github/ISSUE_TEMPLATE/600-new model.yml | 7 +++++++ .github/ISSUE_TEMPLATE/700-performance discussion.yml | 7 +++++++ .github/ISSUE_TEMPLATE/750-RFC.yml | 7 +++++++ .github/ISSUE_TEMPLATE/800-misc discussion.yml | 7 +++++++ 9 files changed, 63 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/100-documentation.yml b/.github/ISSUE_TEMPLATE/100-documentation.yml index 501c0aa48b887..74d397b231acd 100644 --- a/.github/ISSUE_TEMPLATE/100-documentation.yml +++ b/.github/ISSUE_TEMPLATE/100-documentation.yml @@ -20,3 +20,10 @@ body: attributes: value: > Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. + required: true diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml index df41ade8c3c01..590e56c137813 100644 --- a/.github/ISSUE_TEMPLATE/200-installation.yml +++ b/.github/ISSUE_TEMPLATE/200-installation.yml @@ -38,3 +38,10 @@ body: attributes: value: > Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. 
+ required: true diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml index 54763af1058f6..004798a388a63 100644 --- a/.github/ISSUE_TEMPLATE/300-usage.yml +++ b/.github/ISSUE_TEMPLATE/300-usage.yml @@ -36,3 +36,10 @@ body: attributes: value: > Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. + required: true diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml index 3a091379eb844..d4113da8b5b81 100644 --- a/.github/ISSUE_TEMPLATE/400-bug report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug report.yml @@ -89,3 +89,10 @@ body: - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. + required: true diff --git a/.github/ISSUE_TEMPLATE/500-feature request.yml b/.github/ISSUE_TEMPLATE/500-feature request.yml index 47a90628c76ce..097d88f50930d 100644 --- a/.github/ISSUE_TEMPLATE/500-feature request.yml +++ b/.github/ISSUE_TEMPLATE/500-feature request.yml @@ -29,3 +29,10 @@ body: attributes: value: > Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. + required: true diff --git a/.github/ISSUE_TEMPLATE/600-new model.yml b/.github/ISSUE_TEMPLATE/600-new model.yml index bbddbfd67138a..794617a0cfdf6 100644 --- a/.github/ISSUE_TEMPLATE/600-new model.yml +++ b/.github/ISSUE_TEMPLATE/600-new model.yml @@ -31,3 +31,10 @@ body: attributes: value: > Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. + required: true diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance discussion.yml index 4f8843420a94e..273f50d59cf76 100644 --- a/.github/ISSUE_TEMPLATE/700-performance discussion.yml +++ b/.github/ISSUE_TEMPLATE/700-performance discussion.yml @@ -50,3 +50,10 @@ body: attributes: value: > Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. 
+ required: true diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml index 5382b124dcd79..e447c077473f0 100644 --- a/.github/ISSUE_TEMPLATE/750-RFC.yml +++ b/.github/ISSUE_TEMPLATE/750-RFC.yml @@ -47,3 +47,10 @@ body: attributes: value: > Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. + required: true diff --git a/.github/ISSUE_TEMPLATE/800-misc discussion.yml b/.github/ISSUE_TEMPLATE/800-misc discussion.yml index ddb10f72db293..79e6e9080d51c 100644 --- a/.github/ISSUE_TEMPLATE/800-misc discussion.yml +++ b/.github/ISSUE_TEMPLATE/800-misc discussion.yml @@ -19,3 +19,10 @@ body: attributes: value: > Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. + required: true From f1df5dbfd6782408228f39bdc0722fa465629f0f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 23 Aug 2024 14:30:52 -0400 Subject: [PATCH 09/24] [Misc] Update `marlin` to use vLLMParameters (#7803) --- tests/weight_loading/models.txt | 4 +- vllm/model_executor/layers/linear.py | 3 +- .../layers/quantization/marlin.py | 68 ++++++++++--------- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index bfbceb24aef96..98a66b6701ea9 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -15,4 +15,6 @@ compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main awq, casperhansen/mixtral-instruct-awq, main awq_marlin, casperhansen/mixtral-instruct-awq, main -fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main \ No newline at end of file +fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main +marlin, nm-testing/zephyr-beta-7b-marlin-g128, main +marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main \ No newline at end of file diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 4af954b74e8b5..e5b40a64abc41 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -22,7 +22,8 @@ WEIGHT_LOADER_V2_SUPPORTED = [ "CompressedTensorsLinearMethod", "AWQMarlinLinearMethod", - "AWQLinearMethod", "GPTQMarlinLinearMethod", "Fp8LinearMethod" + "AWQLinearMethod", "GPTQMarlinLinearMethod", "Fp8LinearMethod", + "MarlinLinearMethod" ] diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index cdc5129a93b15..8f1b5370b4538 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -9,7 +9,10 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.parameter import (BasevLLMParameter, + ChannelQuantScaleParameter, 
+ GroupQuantScaleParameter, + PackedvLLMParameter) logger = init_logger(__name__) @@ -132,6 +135,7 @@ def create_weights( **extra_weight_attrs, ): del output_size # Unused. + weight_loader = extra_weight_attrs["weight_loader"] if params_dtype != torch.float16: raise ValueError( @@ -170,64 +174,64 @@ def create_weights( "Each permutation group must reside on the same gpu") # Quantized 4Bit weights packed into Int32. - qweight = Parameter( - torch.empty( + qweight = PackedvLLMParameter( + data=torch.empty( input_size_per_partition // self.quant_config.tile_size, output_size_per_partition * self.quant_config.tile_size // self.quant_config.pack_factor, device="cuda", dtype=torch.int32, ), - requires_grad=False, - ) - set_weight_attrs( - qweight, - { - "input_dim": 0, - "output_dim": 1, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - "marlin_tile_size": self.quant_config.tile_size, - }, - ) + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + marlin_tile_size=self.quant_config.tile_size, + weight_loader=weight_loader) # Determine if channelwise or not input_groups = (1 if self.quant_config.group_size == -1 else input_size_per_partition // self.quant_config.group_size) - scales = Parameter( + weight_scale_args = { + "data": torch.empty( input_groups, output_size_per_partition, device="cuda", dtype=params_dtype, ), - requires_grad=False, - ) - set_weight_attrs( - scales, - { - "input_dim": None if input_groups == 1 else 0, - "output_dim": 1, - }, - ) + "weight_loader": + weight_loader + } + if input_groups == 1: + scales = ChannelQuantScaleParameter(output_dim=1, + **weight_scale_args) + else: + scales = GroupQuantScaleParameter(output_dim=1, + input_dim=0, + **weight_scale_args) # Allocate workspace (Used for internal locking mechanism) max_workspace_size = ( output_size_per_partition // self.quant_config.min_n_threads) * self.quant_config.max_parallel - workspace = Parameter(torch.zeros(max_workspace_size, - device="cuda", - dtype=torch.int), - requires_grad=False) + + workspace = BasevLLMParameter(data=torch.zeros(max_workspace_size, + device="cuda", + dtype=torch.int), + weight_loader=weight_loader) layer.register_parameter("B", qweight) - set_weight_attrs(qweight, extra_weight_attrs) layer.register_parameter("s", scales) - set_weight_attrs(scales, extra_weight_attrs) layer.register_parameter("workspace", workspace) - set_weight_attrs(workspace, extra_weight_attrs) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # required by torch.compile + layer.B = Parameter(layer.B.data, requires_grad=False) + layer.s = Parameter(layer.s.data, requires_grad=False) + layer.workspace = Parameter(layer.workspace.data, requires_grad=False) def apply( self, From 09c7792610ada9f88bbf87d32b472dd44bf23cc2 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 23 Aug 2024 11:35:33 -0700 Subject: [PATCH 10/24] Bump version to v0.5.5 (#7823) --- vllm/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/version.py b/vllm/version.py index 247036f1d6211..052eb76b5873c 100644 --- a/vllm/version.py +++ b/vllm/version.py @@ -9,4 +9,4 @@ stacklevel=2) __commit__ = "COMMIT_HASH_PLACEHOLDER" -__version__ = "0.5.4" +__version__ = "0.5.5" From 9db93de20ca282feb4dfaabbc56032c9312bde7b Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Fri, 23 Aug 2024 15:45:53 -0400 Subject: [PATCH 11/24] [Core] Add multi-step support to LLMEngine (#7789) --- 
.buildkite/test-pipeline.yaml | 3 +- benchmarks/benchmark_throughput.py | 17 ++- tests/lora/test_gemma.py | 2 +- ...tness.py => test_correctness_async_llm.py} | 0 tests/multi_step/test_correctness_llm.py | 49 +++++++ vllm/engine/async_llm_engine.py | 74 +--------- vllm/engine/llm_engine.py | 137 ++++++++++++++++-- 7 files changed, 195 insertions(+), 87 deletions(-) rename tests/multi_step/{test_correctness.py => test_correctness_async_llm.py} (100%) create mode 100644 tests/multi_step/test_correctness_llm.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d70a9ce240825..283776c06ed45 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -335,7 +335,8 @@ steps: - vllm/engine - tests/multi_step commands: - - pytest -v -s multi_step/test_correctness.py + - pytest -v -s multi_step/test_correctness_async_llm.py + - pytest -v -s multi_step/test_correctness_llm.py - label: Pipeline Parallelism Test # 23min working_dir: "/vllm-workspace/tests" diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index a52e67bbbe7e3..1ccab2c65e697 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -82,6 +82,8 @@ def run_vllm( max_num_batched_tokens: int, distributed_executor_backend: Optional[str], gpu_memory_utilization: float = 0.9, + num_scheduler_steps: int = 1, + use_v2_block_manager: bool = False, download_dir: Optional[str] = None, load_format: str = EngineArgs.load_format, ) -> float: @@ -106,6 +108,8 @@ def run_vllm( max_num_batched_tokens=max_num_batched_tokens, distributed_executor_backend=distributed_executor_backend, load_format=load_format, + num_scheduler_steps=num_scheduler_steps, + use_v2_block_manager=use_v2_block_manager, ) # Add the requests to the engine. 
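For reference, a minimal sketch (not from this patch) of how the two new run_vllm keyword arguments look when constructing an LLM directly; the model name, prompt, and sampling values are placeholders, and the sketch assumes the kwargs are forwarded to the engine exactly as the benchmark code above does.

from vllm import LLM, SamplingParams

# Hypothetical standalone usage mirroring the kwargs the benchmark now
# passes into LLM(...): multi-step decoding with 8 forward steps per
# scheduler call, paired with the v2 block manager as in the new test.
llm = LLM(
    model="facebook/opt-125m",      # small placeholder model
    num_scheduler_steps=8,
    use_v2_block_manager=True,
    enforce_eager=True,
)
outputs = llm.generate(["The future of AI is"],
                       SamplingParams(temperature=0.8, top_p=0.95))
for output in outputs:
    print(output.outputs[0].text)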
@@ -232,7 +236,8 @@ def main(args: argparse.Namespace): args.quantization_param_path, args.device, args.enable_prefix_caching, args.enable_chunked_prefill, args.max_num_batched_tokens, args.distributed_executor_backend, - args.gpu_memory_utilization, args.download_dir, args.load_format) + args.gpu_memory_utilization, args.num_scheduler_steps, + args.use_v2_block_manager, args.download_dir, args.load_format) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -353,10 +358,18 @@ def main(args: argparse.Namespace): choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], help='device type for vLLM execution, supporting CUDA, OpenVINO and ' 'CPU.') + parser.add_argument( + "--num-scheduler-steps", + type=int, + default=1, + help="Maximum number of forward steps per scheduler call.") + parser.add_argument("--use-v2-block-manager", + action='store_true', + help="Enable block manager v2.") parser.add_argument( "--enable-prefix-caching", action='store_true', - help="enable automatic prefix caching for vLLM backend.") + help="Enable automatic prefix caching for vLLM backend.") parser.add_argument("--enable-chunked-prefill", action='store_true', help="enable chunked prefill for vLLM backend.") diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 478bb86b78610..709246179bfe4 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -37,7 +37,7 @@ def test_gemma_lora(gemma_lora_files): expected_lora_output = [ "more important than knowledge.\nAuthor: Albert Einstein\n", "everyone else is already taken.\nAuthor: Oscar Wilde\n", - "so little time.\nAuthor: Frank Zappa\n", + "so little time\nAuthor: Frank Zappa\n", ] output1 = do_sample(llm, gemma_lora_files, lora_id=1) diff --git a/tests/multi_step/test_correctness.py b/tests/multi_step/test_correctness_async_llm.py similarity index 100% rename from tests/multi_step/test_correctness.py rename to tests/multi_step/test_correctness_async_llm.py diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py new file mode 100644 index 0000000000000..36f610ba74f05 --- /dev/null +++ b/tests/multi_step/test_correctness_llm.py @@ -0,0 +1,49 @@ +# Test the LLMEngine with multi-step-decoding + +import pytest + +from ..models.utils import check_outputs_equal + +MODELS = [ + "JackFram/llama-160m", +] +NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps +NUM_PROMPTS = [10] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("tp_size", [1]) +@pytest.mark.parametrize("max_tokens", [5]) +@pytest.mark.parametrize("enforce_eager", [True]) +@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) +@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) +def test_multi_step_llm(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str, tp_size: int, max_tokens: int, + enforce_eager: int, num_scheduler_steps: int, + num_prompts: int) -> None: + + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts + + with vllm_runner(model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + use_v2_block_manager=True, + num_scheduler_steps=num_scheduler_steps) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens) + + with hf_runner(model, 
dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 8812b853c0665..a2a80b1412132 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,11 +1,9 @@ import asyncio import time -from dataclasses import dataclass from functools import partial from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Type, Union) -import torch from typing_extensions import assert_never import vllm.envs as envs @@ -15,7 +13,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout from vllm.engine.llm_engine import (DecoderPromptComponents, LLMEngine, - PromptComponents) + PromptComponents, SchedulerOutputState) from vllm.engine.metrics_types import StatLoggerBase from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.ray_utils import initialize_ray_cluster, ray @@ -28,8 +26,7 @@ from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata) +from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import print_warning_once @@ -257,24 +254,11 @@ def has_new_requests(self): return not self._new_requests.empty() -@dataclass -class SchedulerOutputState: - """Caches the scheduler outputs for a virtual engine. Used for Multi-Step""" - last_output: Optional[SamplerOutput] = None - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None - scheduler_outputs: Optional[SchedulerOutputs] = None - - class _AsyncLLMEngine(LLMEngine): """Extension of LLMEngine to add async methods.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - pipeline_parallel_size = \ - self.parallel_config.pipeline_parallel_size - self.cached_scheduler_outputs = [ - SchedulerOutputState() for _ in range(pipeline_parallel_size) - ] async def step_async( self, virtual_engine: int @@ -367,60 +351,6 @@ async def step_async( return request_outputs - def _has_remaining_steps( - self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] - ) -> bool: - if (not self.scheduler_config.is_multi_step - or not seq_group_metadata_list): - return False - - # TODO(will) this is a sanity check for nowto make sure that all the - # seqs are on the same steps. Eventually we will want to do some sort of - # dynamic scheduling when doing multi-step decoding. 
- ref_remaining_steps = seq_group_metadata_list[0].state.remaining_steps - if any([ - seq_group.state.remaining_steps != ref_remaining_steps - for seq_group in seq_group_metadata_list[1:] - ]): - raise AssertionError(("All running sequence groups should " - "have the same remaining steps.")) - - return ref_remaining_steps > 0 - - def _cache_scheduler_outputs_for_multi_step( - self, virtual_engine: int, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - scheduler_outputs: SchedulerOutputs) -> None: - self.cached_scheduler_outputs[ - virtual_engine].seq_group_metadata_list = seq_group_metadata_list - self.cached_scheduler_outputs[virtual_engine].scheduler_outputs = \ - scheduler_outputs - self.cached_scheduler_outputs[virtual_engine].last_output = None - - def _get_last_sampled_token_ids( - self, virtual_engine: int) -> Optional[torch.Tensor]: - cached_last_output = self.cached_scheduler_outputs[ - virtual_engine].last_output - if (self.scheduler_config.is_multi_step - and self.parallel_config.pipeline_parallel_size > 1 - and cached_last_output is not None - and cached_last_output.sampled_token_ids_cpu is not None): - return cached_last_output.sampled_token_ids_cpu - return None - - def _update_cached_scheduler_output( - self, virtual_engine: int, - output: List[Optional[SamplerOutput]]) -> None: - if (self.parallel_config.pipeline_parallel_size > 1 and len(output) > 0 - and output[0] is not None): - last_output = output[-1] - assert last_output is not None - assert last_output.sampled_token_ids_cpu is not None - assert last_output.sampled_token_ids is None - assert last_output.sampled_token_probs is None - self.cached_scheduler_outputs[ - virtual_engine].last_output = last_output - async def stop_remote_worker_execution_loop_async(self) -> None: """Stop the remote worker execution loop.""" await self.model_executor.stop_remote_worker_execution_loop_async() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8c98b64181d06..79072e403dc1b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,10 +1,12 @@ import time from contextlib import contextmanager +from dataclasses import dataclass from typing import (TYPE_CHECKING, Any, ClassVar, Dict, Iterable, List, Mapping, Optional) from typing import Sequence as GenericSequence from typing import Set, Tuple, Type, Union +import torch from typing_extensions import TypeVar, assert_never import vllm.envs as envs @@ -77,6 +79,14 @@ def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: Optional[MultiModalDataDict]] +@dataclass +class SchedulerOutputState: + """Caches the scheduler outputs for a virtual engine. Used for Multi-Step""" + last_output: Optional[SamplerOutput] = None + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None + scheduler_outputs: Optional[SchedulerOutputs] = None + + class LLMEngine: """An LLM engine that receives requests and generates texts. @@ -194,7 +204,7 @@ def __init__( "quantization_param_path=%s, device_config=%s, " "decoding_config=%r, observability_config=%r, " "seed=%d, served_model_name=%s, use_v2_block_manager=%s, " - "enable_prefix_caching=%s)", + "num_scheduler_steps=%d, enable_prefix_caching=%s)", VLLM_VERSION, model_config.model, speculative_config, @@ -223,6 +233,7 @@ def __init__( model_config.seed, model_config.served_model_name, scheduler_config.use_v2_block_manager, + scheduler_config.num_scheduler_steps, cache_config.enable_prefix_caching, ) # TODO(woosuk): Print more configs in debug mode. 
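As an illustration only (toy code, not vLLM internals): the per-virtual-engine caching pattern that the SchedulerOutputState dataclass above is meant to back, reduced to a self-contained sketch with assumed placeholder values.

from dataclasses import dataclass
from typing import Any, List, Optional

@dataclass
class ToyState:
    # Same three slots as SchedulerOutputState: the last sampler output plus
    # the scheduler results to reuse while a multi-step batch is in flight.
    last_output: Optional[Any] = None
    seq_group_metadata_list: Optional[List[Any]] = None
    scheduler_outputs: Optional[Any] = None

pipeline_parallel_size = 2          # assumed value for the sketch
caches = [ToyState() for _ in range(pipeline_parallel_size)]

def finish_batch(virtual_engine: int) -> None:
    # Resetting a slot once all steps are done mirrors
    # `self.cached_scheduler_outputs[0] = SchedulerOutputState()`.
    caches[virtual_engine] = ToyState()

finish_batch(0)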
@@ -380,6 +391,11 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: ), )) + self.cached_scheduler_outputs = [ + SchedulerOutputState() + for _ in range(self.parallel_config.pipeline_parallel_size) + ] + def _initialize_kv_caches(self) -> None: """Initialize the KV cache in the worker(s). @@ -1304,16 +1320,40 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: "Pipeline parallelism is only supported through AsyncLLMEngine " "as performance will be severely degraded otherwise.") - if self.scheduler_config.num_scheduler_steps > 1: - raise NotImplementedError( - "Multiple scheduler steps (multi-step) are only supported " - "through AsyncLLMEngine. ") - seq_group_metadata_list, scheduler_outputs = self.scheduler[ - 0].schedule() + # These are cached outputs from previous iterations. None if on first + # iteration + cached_outputs = self.cached_scheduler_outputs[0] + seq_group_metadata_list = cached_outputs.seq_group_metadata_list + scheduler_outputs = cached_outputs.scheduler_outputs + + # Skip the scheduler if there are any remaining steps in the seq groups. + # This ensures that the scheduler is only called again when the current + # batch has completed. + if not self._has_remaining_steps(seq_group_metadata_list): + seq_group_metadata_list, scheduler_outputs = self.scheduler[ + 0].schedule() + + if (self.scheduler_config.is_multi_step + and scheduler_outputs.num_lookahead_slots > 0): + # cache the scheduler outputs for the next iteration if we have + # lookahead slots + self._cache_scheduler_outputs_for_multi_step( + 0, seq_group_metadata_list, scheduler_outputs) + + assert seq_group_metadata_list is not None + assert scheduler_outputs is not None if not scheduler_outputs.is_empty(): finished_requests_ids = self.scheduler[ 0].get_and_reset_finished_requests_ids() + + # Check if we have a cached last_output from the previous iteration. + # For supporting PP this is probably the best way to pass the + # sampled_token_ids, as a separate broadcast over all the PP stages + # will cause one virtual engine's microbatch to block the pipeline. + last_sampled_token_ids = \ + self._get_last_sampled_token_ids(0) + execute_model_req = ExecuteModelRequest( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in, @@ -1321,15 +1361,36 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: blocks_to_copy=scheduler_outputs.blocks_to_copy, num_lookahead_slots=scheduler_outputs.num_lookahead_slots, running_queue_size=scheduler_outputs.running_queue_size, - finished_requests_ids=finished_requests_ids) + finished_requests_ids=finished_requests_ids, + # We use ExecuteModelRequest to pass the last sampled_token_ids + # to each of the non-last PP stages for in-place prepare_input. + last_sampled_token_ids=last_sampled_token_ids) + output = self.model_executor.execute_model( execute_model_req=execute_model_req) + + # we need to do this here so that last step's sampled_token_ids can + # be passed to the next iteration for PP. + if self.scheduler_config.is_multi_step: + self._update_cached_scheduler_output(0, output) else: output = [] - request_outputs = self._process_model_outputs( - output, scheduler_outputs.scheduled_seq_groups, - scheduler_outputs.ignored_seq_groups, seq_group_metadata_list) + # Finish the current step for all the sequence groups. 
+ if self.scheduler_config.is_multi_step: + for seq_group in seq_group_metadata_list: + seq_group.finish_step() + + if not self._has_remaining_steps(seq_group_metadata_list): + # clear the cache if we have finished all the steps + if self.scheduler_config.is_multi_step: + self.cached_scheduler_outputs[0] = SchedulerOutputState() + request_outputs = self._process_model_outputs( + output, scheduler_outputs.scheduled_seq_groups, + scheduler_outputs.ignored_seq_groups, seq_group_metadata_list) + + else: + request_outputs = [] # Log stats. self.do_log_stats(scheduler_outputs, output) @@ -1347,6 +1408,60 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: return request_outputs + def _has_remaining_steps( + self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] + ) -> bool: + if (not self.scheduler_config.is_multi_step + or not seq_group_metadata_list): + return False + + # TODO(will) this is a sanity check for nowto make sure that all the + # seqs are on the same steps. Eventually we will want to do some sort of + # dynamic scheduling when doing multi-step decoding. + ref_remaining_steps = seq_group_metadata_list[0].state.remaining_steps + if any([ + seq_group.state.remaining_steps != ref_remaining_steps + for seq_group in seq_group_metadata_list[1:] + ]): + raise AssertionError(("All running sequence groups should " + "have the same remaining steps.")) + + return ref_remaining_steps > 0 + + def _cache_scheduler_outputs_for_multi_step( + self, virtual_engine: int, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + scheduler_outputs: SchedulerOutputs) -> None: + self.cached_scheduler_outputs[ + virtual_engine].seq_group_metadata_list = seq_group_metadata_list + self.cached_scheduler_outputs[virtual_engine].scheduler_outputs = \ + scheduler_outputs + self.cached_scheduler_outputs[virtual_engine].last_output = None + + def _update_cached_scheduler_output( + self, virtual_engine: int, + output: List[Optional[SamplerOutput]]) -> None: + if (self.parallel_config.pipeline_parallel_size > 1 and len(output) > 0 + and output[0] is not None): + last_output = output[-1] + assert last_output is not None + assert last_output.sampled_token_ids_cpu is not None + assert last_output.sampled_token_ids is None + assert last_output.sampled_token_probs is None + self.cached_scheduler_outputs[ + virtual_engine].last_output = last_output + + def _get_last_sampled_token_ids( + self, virtual_engine: int) -> Optional[torch.Tensor]: + cached_last_output = self.cached_scheduler_outputs[ + virtual_engine].last_output + if (self.scheduler_config.is_multi_step + and self.parallel_config.pipeline_parallel_size > 1 + and cached_last_output is not None + and cached_last_output.sampled_token_ids_cpu is not None): + return cached_last_output.sampled_token_ids_cpu + return None + def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: if logger_name in self.stat_loggers: raise KeyError(f"Logger with name {logger_name} already exists.") From 6885fde317433eec52e00c14329270d742f0630d Mon Sep 17 00:00:00 2001 From: Pooya Davoodi Date: Fri, 23 Aug 2024 13:58:26 -0700 Subject: [PATCH 12/24] [Bugfix] Fix run_batch logger (#7640) --- vllm/entrypoints/openai/run_batch.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index af8d95ea66cd3..764712fd5648b 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -6,7 +6,7 @@ from 
vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.logger import RequestLogger, logger # yapf: disable from vllm.entrypoints.openai.protocol import (BatchRequestInput, BatchRequestOutput, @@ -16,13 +16,10 @@ # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding -from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION -logger = init_logger(__name__) - def parse_args(): parser = FlexibleArgumentParser( @@ -184,7 +181,7 @@ async def main(args): if __name__ == "__main__": args = parse_args() - logger.info("vLLM API server version %s", VLLM_VERSION) + logger.info("vLLM batch processing API version %s", VLLM_VERSION) logger.info("args: %s", args) asyncio.run(main(args)) From 8da48e4d95421cbd96fbdecdffed89a3d1aab218 Mon Sep 17 00:00:00 2001 From: Pooya Davoodi Date: Fri, 23 Aug 2024 23:04:22 -0700 Subject: [PATCH 13/24] [Frontend] Publish Prometheus metrics in run_batch API (#7641) --- tests/entrypoints/openai/test_metrics.py | 49 ++++++++++++++++++++++++ vllm/entrypoints/openai/run_batch.py | 27 +++++++++++++ 2 files changed, 76 insertions(+) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index cbe601e623056..042c3730e09f5 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -1,3 +1,7 @@ +import subprocess +import sys +import tempfile +import time from http import HTTPStatus import openai @@ -177,3 +181,48 @@ async def test_metrics_exist(client: openai.AsyncOpenAI): for metric in EXPECTED_METRICS: assert metric in response.text + + +def test_metrics_exist_run_batch(): + input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}""" # noqa: E501 + + base_url = "0.0.0.0" + port = "8001" + server_url = f"http://{base_url}:{port}" + + with tempfile.NamedTemporaryFile( + "w") as input_file, tempfile.NamedTemporaryFile( + "r") as output_file: + input_file.write(input_batch) + input_file.flush() + proc = subprocess.Popen([ + sys.executable, + "-m", + "vllm.entrypoints.openai.run_batch", + "-i", + input_file.name, + "-o", + output_file.name, + "--model", + "intfloat/e5-mistral-7b-instruct", + "--enable-metrics", + "--url", + base_url, + "--port", + port, + ], ) + + def is_server_up(url): + try: + response = requests.get(url) + return response.status_code == 200 + except requests.ConnectionError: + return False + + while not is_server_up(server_url): + time.sleep(1) + + response = requests.get(server_url + "/metrics") + assert response.status_code == HTTPStatus.OK + + proc.wait() diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 764712fd5648b..32bbade256973 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -3,6 +3,7 @@ from typing import Awaitable, Callable, List import aiohttp +from prometheus_client import start_http_server from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -56,6 +57,24 @@ def parse_args(): 'ID numbers being printed in log.' 
'\n\nDefault: Unlimited') + parser.add_argument("--enable-metrics", + action="store_true", + help="Enable Prometheus metrics") + parser.add_argument( + "--url", + type=str, + default="0.0.0.0", + help="URL to the Prometheus metrics server " + "(only needed if enable-metrics is set).", + ) + parser.add_argument( + "--port", + type=int, + default=8000, + help="Port number for the Prometheus metrics server " + "(only needed if enable-metrics is set).", + ) + return parser.parse_args() @@ -184,4 +203,12 @@ async def main(args): logger.info("vLLM batch processing API version %s", VLLM_VERSION) logger.info("args: %s", args) + # Start the Prometheus metrics server. LLMEngine uses the Prometheus client + # to publish metrics at the /metrics endpoint. + if args.enable_metrics: + logger.info("Prometheus metrics enabled") + start_http_server(port=args.port, addr=args.url) + else: + logger.info("Prometheus metrics disabled") + asyncio.run(main(args)) From d81abefd2ee8e1f4b46b3660ebdaf7b8e19c573a Mon Sep 17 00:00:00 2001 From: Tyler Rockwood Date: Sat, 24 Aug 2024 01:07:24 -0500 Subject: [PATCH 14/24] [Frontend] add json_schema support from OpenAI protocol (#7654) --- tests/entrypoints/openai/test_chat.py | 33 +++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 14 ++++++-- .../lm_format_enforcer_decoding.py | 7 ++++ .../guided_decoding/outlines_decoding.py | 7 ++++ 4 files changed, 59 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index afcb0f44befc5..ce5bf3d5d7ba0 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -837,6 +837,39 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI): assert loaded == {"result": 2}, loaded +@pytest.mark.asyncio +async def test_response_format_json_schema(client: openai.AsyncOpenAI): + for _ in range(2): + resp = await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": + "user", + "content": ('what is 1+1? 
please respond with a JSON object, ' + 'the format is {"result": 2}') + }], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "foo_test", + "schema": { + "type": "object", + "properties": { + "result": { + "type": "integer" + }, + }, + }, + } + }) + + content = resp.choices[0].message.content + assert content is not None + + loaded = json.loads(content) + assert loaded == {"result": 2}, loaded + + @pytest.mark.asyncio async def test_extra_fields(client: openai.AsyncOpenAI): with pytest.raises(BadRequestError) as exc_info: diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index c46f5cf8ce663..0954b81595ef5 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -85,9 +85,19 @@ class UsageInfo(OpenAIBaseModel): completion_tokens: Optional[int] = 0 +class JsonSchemaResponseFormat(OpenAIBaseModel): + name: str + description: Optional[str] = None + # schema is the field in openai but that causes conflicts with pydantic so + # instead use json_schema with an alias + json_schema: Optional[Dict[str, Any]] = Field(default=None, alias='schema') + strict: Optional[bool] = None + + class ResponseFormat(OpenAIBaseModel): - # type must be "json_object" or "text" - type: Literal["text", "json_object"] + # type must be "json_schema", "json_object" or "text" + type: Literal["text", "json_object", "json_schema"] + json_schema: Optional[JsonSchemaResponseFormat] = None class StreamOptions(OpenAIBaseModel): diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py index b2188c9cbc2bb..8de811a6fbc41 100644 --- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py @@ -49,6 +49,13 @@ async def get_lm_format_enforcer_guided_decoding_logits_processor( and request.response_format.type == "json_object"): character_level_parser = JsonSchemaParser( None) # None means any json object + elif (request.response_format is not None + and request.response_format.type == "json_schema" + and request.response_format.json_schema is not None + and request.response_format.json_schema.json_schema is not None): + schema = _normalize_json_schema_object( + request.response_format.json_schema.json_schema) + character_level_parser = JsonSchemaParser(schema) else: return None diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index bc62224dabecf..bfc658ef7d26b 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -127,6 +127,13 @@ def _get_guide_and_mode( and request.response_format is not None and request.response_format.type == "json_object"): return JSON_GRAMMAR, GuidedDecodingMode.GRAMMAR + elif (not isinstance(request, GuidedDecodingRequest) + and request.response_format is not None + and request.response_format.type == "json_schema" + and request.response_format.json_schema is not None + and request.response_format.json_schema.json_schema is not None): + json = json_dumps(request.response_format.json_schema.json_schema) + return json, GuidedDecodingMode.JSON else: return None, None From 7d9ffa2ae102cbfae65035c511f8d3c8e5fab986 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 24 Aug 2024 00:51:38 -0700 Subject: [PATCH 15/24] [misc][core] lazy import outlines (#7831) --- .buildkite/test-pipeline.yaml 
| 3 +- tests/entrypoints/llm/test_lazy_outlines.py | 48 +++++++++++++++++++ .../guided_decoding/__init__.py | 9 ++-- .../lm_format_enforcer_decoding.py | 11 +++-- 4 files changed, 64 insertions(+), 7 deletions(-) create mode 100644 tests/entrypoints/llm/test_lazy_outlines.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 283776c06ed45..e406938647479 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -87,7 +87,8 @@ steps: commands: - pip install -e ./plugins/vllm_add_dummy_model - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api] - - pytest -v -s entrypoints/llm + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py + - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/openai - label: Distributed Tests (4 GPUs) # 10min diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py new file mode 100644 index 0000000000000..39480531f5866 --- /dev/null +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -0,0 +1,48 @@ +import sys + +from vllm import LLM, SamplingParams + + +def test_lazy_outlines(sample_regex): + """If users don't use guided decoding, outlines should not be imported. + """ + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM(model="facebook/opt-125m", + enforce_eager=True, + gpu_memory_utilization=0.3) + outputs = llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + # make sure outlines is not imported + assert 'outlines' not in sys.modules + + llm = LLM(model="facebook/opt-125m", + enforce_eager=True, + guided_decoding_backend="lm-format-enforcer", + gpu_memory_utilization=0.3) + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + outputs = llm.generate( + prompts=[ + f"Give an example IPv4 address with this regex: {sample_regex}" + ] * 2, + sampling_params=sampling_params, + use_tqdm=True, + guided_options_request=dict(guided_regex=sample_regex)) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + # make sure outlines is not imported + assert 'outlines' not in sys.modules diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 4a2476dd6314d..f9fcdead980a2 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -5,9 +5,6 @@ CompletionRequest) from vllm.model_executor.guided_decoding.guided_fields import ( GuidedDecodingRequest) -from vllm.model_executor.guided_decoding.outlines_decoding import ( - get_local_outlines_guided_decoding_logits_processor, - get_outlines_guided_decoding_logits_processor) from vllm.sampling_params import LogitsProcessor @@ -18,6 +15,9 @@ async def get_guided_decoding_logits_processor( request = _adapt_request_for_tool_use(request) if guided_decoding_backend == 'outlines': + # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193 + from vllm.model_executor.guided_decoding.outlines_decoding import ( # noqa + 
get_outlines_guided_decoding_logits_processor) return await get_outlines_guided_decoding_logits_processor( request, tokenizer) if guided_decoding_backend == 'lm-format-enforcer': @@ -37,6 +37,9 @@ def get_local_guided_decoding_logits_processor( # request = _adapt_request_for_tool_use(request) if guided_decoding_backend == 'outlines': + # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193 + from vllm.model_executor.guided_decoding.outlines_decoding import ( # noqa + get_local_outlines_guided_decoding_logits_processor) return get_local_outlines_guided_decoding_logits_processor( guided_options, tokenizer) if guided_decoding_backend == 'lm-format-enforcer': diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py index 8de811a6fbc41..51f947981cac8 100644 --- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py @@ -14,9 +14,6 @@ CompletionRequest) from vllm.model_executor.guided_decoding.guided_fields import ( GuidedDecodingRequest) -from vllm.model_executor.guided_decoding.outlines_decoding import ( - get_local_outlines_guided_decoding_logits_processor, - get_outlines_guided_decoding_logits_processor) from vllm.sampling_params import LogitsProcessor @@ -43,6 +40,10 @@ async def get_lm_format_enforcer_guided_decoding_logits_processor( character_level_parser = RegexParser(request.guided_regex) elif request.guided_grammar: # CFG grammar not supported by LMFE, revert to outlines + + # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193 + from vllm.model_executor.guided_decoding.outlines_decoding import ( + get_outlines_guided_decoding_logits_processor) return await get_outlines_guided_decoding_logits_processor( request, tokenizer) elif (request.response_format is not None @@ -87,6 +88,10 @@ def get_local_lm_format_enforcer_guided_decoding_logits_processor( character_level_parser = RegexParser(guided_options.guided_regex) elif guided_options.guided_grammar: # CFG grammar not supported by LMFE, revert to outlines + + # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193 + from vllm.model_executor.guided_decoding.outlines_decoding import ( + get_local_outlines_guided_decoding_logits_processor) return get_local_outlines_guided_decoding_logits_processor( guided_options, tokenizer) elif guided_options.guided_json_object: From ea9fa160e3b47e0b8aa273f3eb2be410bd1ccab5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 24 Aug 2024 01:03:27 -0700 Subject: [PATCH 16/24] [ci][test] exclude model download time in server start time (#7834) --- tests/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/utils.py b/tests/utils.py index 3e0124fa11352..a37b7ee341f78 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -11,6 +11,7 @@ import openai import requests +from huggingface_hub import snapshot_download from transformers import AutoTokenizer from typing_extensions import ParamSpec @@ -64,6 +65,10 @@ def __init__(self, env_dict: Optional[Dict[str, str]] = None, auto_port: bool = True, max_wait_seconds: Optional[float] = None) -> None: + if not model.startswith("/"): + # download the model if it's not a local path + # to exclude the model download time from the server start time + model = snapshot_download(model) if auto_port: if "-p" in cli_args or "--port" in cli_args: raise ValueError("You have manually specified the 
port" From aab0fcdb63e322f717704e9d77199f63e036d59b Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 24 Aug 2024 10:31:28 -0700 Subject: [PATCH 17/24] [ci][test] fix RemoteOpenAIServer (#7838) --- tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index a37b7ee341f78..955431bbd3014 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -68,7 +68,7 @@ def __init__(self, if not model.startswith("/"): # download the model if it's not a local path # to exclude the model download time from the server start time - model = snapshot_download(model) + snapshot_download(model) if auto_port: if "-p" in cli_args or "--port" in cli_args: raise ValueError("You have manually specified the port" From 80162c44b1d1e59a2c10f65b6adb9b0407439b1f Mon Sep 17 00:00:00 2001 From: zifeitong Date: Sat, 24 Aug 2024 18:16:24 -0700 Subject: [PATCH 18/24] [Bugfix] Fix Phi-3v crash when input images are of certain sizes (#7840) --- tests/models/test_phi3v.py | 27 ++++++++++++++++++++++----- vllm/model_executor/models/phi3v.py | 2 -- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 197e63b1b1e52..40829785d3214 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -3,13 +3,14 @@ from typing import List, Optional, Tuple, Type import pytest +from PIL import Image from transformers import AutoTokenizer from vllm.multimodal.utils import rescale_image_size from vllm.sequence import SampleLogprobs from vllm.utils import is_cpu, is_hip -from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -58,7 +59,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def run_test( hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, + images: List[Image.Image], model: str, *, size_factors: List[float], @@ -77,8 +78,6 @@ def run_test( Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. """ - images = [asset.pil_image for asset in image_assets] - inputs_per_image = [( [prompt for _ in size_factors], [ @@ -159,7 +158,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, run_test( hf_runner, vllm_runner, - image_assets, + [asset.pil_image for asset in image_assets], model, size_factors=size_factors, dtype=dtype, @@ -167,3 +166,21 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, num_logprobs=num_logprobs, tensor_parallel_size=1, ) + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", [target_dtype]) +def test_regression_7840(hf_runner, vllm_runner, image_assets, model, + dtype) -> None: + # Regression test for #7840. 
+ run_test( + hf_runner, + vllm_runner, + [image_assets[0].pil_image.resize((465, 226))], + model, + size_factors=[1.0], + dtype=dtype, + max_tokens=128, + num_logprobs=10, + tensor_parallel_size=1, + ) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4854377215608..2e52531989232 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -400,8 +400,6 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs): image_data = multi_modal_data["image"] if isinstance(image_data, Image.Image): w, h = image_data.size - w, h = _calc_hd_transform_size(width=w, height=h) - image_feature_size = get_phi3v_image_feature_size(hf_config, input_width=w, input_height=h) From 8aaf3d5347ad536de25869caa67b90e43f1ccd5b Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 25 Aug 2024 19:51:20 +0800 Subject: [PATCH 19/24] [Model][VLM] Support multi-images inputs for Phi-3-vision models (#7783) --- tests/models/test_phi3v.py | 111 ++++++++++++++++++++++++++++ vllm/model_executor/models/phi3v.py | 86 +++++++++++++-------- 2 files changed, 168 insertions(+), 29 deletions(-) diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 40829785d3214..259cbe515066d 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -21,6 +21,7 @@ "cherry_blossom": "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n", }) +HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501 models = ["microsoft/Phi-3.5-vision-instruct"] @@ -184,3 +185,113 @@ def test_regression_7840(hf_runner, vllm_runner, image_assets, model, num_logprobs=10, tensor_parallel_size=1, ) + + +def run_multi_image_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + images: List[Image.Image], + model: str, + *, + size_factors: List[float], + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding MultiModalConfig as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + + inputs_per_case = [ + ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], + [[rescale_image_size(image, factor) for image in images] + for factor in size_factors]) + ] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). 
+ + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + max_model_len=4096, + max_num_seqs=1, + limit_mm_per_prompt={"image": len(images)}, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs_per_case = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_case + ] + + hf_model_kwargs = {"_attn_implementation": "eager"} + with hf_runner(model, dtype=dtype, + model_kwargs=hf_model_kwargs) as hf_model: + eos_token_id = hf_model.processor.tokenizer.eos_token_id + hf_outputs_per_case = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images, + eos_token_id=eos_token_id) + for prompts, images in inputs_per_case + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, + vllm_outputs_per_case): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, + size_factors, dtype: str, max_tokens: int, + num_logprobs: int) -> None: + run_multi_image_test( + hf_runner, + vllm_runner, + [asset.pil_image for asset in image_assets], + model, + size_factors=size_factors, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 2e52531989232..4872929ec36cc 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -13,6 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
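# Illustrative usage sketch (not part of the patch): a minimal offline example
# of the multi-image input this change enables, assuming the Phi-3.5-vision
# checkpoint and two local image files; parameter values are indicative only.
#
#     from PIL import Image
#     from vllm import LLM, SamplingParams
#
#     llm = LLM(model="microsoft/Phi-3.5-vision-instruct",
#               trust_remote_code=True,
#               max_model_len=4096,
#               limit_mm_per_prompt={"image": 2})
#     prompt = ("<|user|>\n<|image_1|>\n<|image_2|>\n"
#               "Describe these images.<|end|>\n<|assistant|>\n")
#     images = [Image.open("a.jpg"), Image.open("b.jpg")]
#     out = llm.generate({"prompt": prompt,
#                         "multi_modal_data": {"image": images}},
#                        SamplingParams(max_tokens=128, temperature=0.0))
#     print(out[0].outputs[0].text)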
+import itertools import re from functools import lru_cache from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, @@ -37,11 +38,11 @@ from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.multimodal.utils import cached_get_tokenizer, repeat_and_pad_token from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.utils import is_list_of -from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, - input_processor_for_clip) +from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsMultiModal from .utils import merge_multimodal_embeddings @@ -400,9 +401,20 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs): image_data = multi_modal_data["image"] if isinstance(image_data, Image.Image): w, h = image_data.size - image_feature_size = get_phi3v_image_feature_size(hf_config, - input_width=w, - input_height=h) + image_feature_size = [ + get_phi3v_image_feature_size(hf_config, + input_width=w, + input_height=h) + ] + image_data = [image_data] + elif is_list_of(image_data, Image.Image): + image_feature_size = [] + for image in image_data: + w, h = image.size + image_feature_size.append( + get_phi3v_image_feature_size(hf_config, + input_width=w, + input_height=h)) elif isinstance(image_data, torch.Tensor): image_feature_size = image_data.shape[0] else: @@ -410,45 +422,61 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs): prompt = llm_inputs.get("prompt") if prompt is None: + image_idx = [] new_prompt = None else: + image_idx = sorted(map(int, re.findall(r"<\|image_(\d+)\|>+", prompt))) if prompt.count("<|image|>") > 0: logger.warning("Please follow the prompt format that is " "documented on HuggingFace which does not involve " "repeating <|image|> tokens.") - elif len(re.findall(r"(<\|image_\d+\|>)+", prompt)) > 1: - logger.warning("Multiple image input is not supported yet, " - "so any extra image tokens will be treated " - "as plain text.") - + elif (num_image_tags := len(image_idx)) > 1: + assert num_image_tags == len( + image_data), "The count of image_placeholder not match image's" new_prompt = prompt - prompt_token_ids = llm_inputs["prompt_token_ids"] - image_1_token_ids = _get_image_placeholder_token_ids(model_config, idx=1) + prompt_token_ids = llm_inputs["prompt_token_ids"].copy() + + # masked place_holder with image token id + for idx in image_idx: + image_token_ids = _get_image_placeholder_token_ids(model_config, + idx=idx) + for i in range(len(prompt_token_ids) - len(image_token_ids) + 1): + if prompt_token_ids[i:i + len(image_token_ids)] == image_token_ids: + prompt_token_ids[i:i + len(image_token_ids)] = [ + _IMAGE_TOKEN_ID + ] * len(image_token_ids) + break + + # merge consecutive tag ids + merged_token_ids: List[int] = [] + for is_placeholder, token_ids in itertools.groupby( + prompt_token_ids, lambda x: x == _IMAGE_TOKEN_ID): + if is_placeholder: + merged_token_ids.append(_IMAGE_TOKEN_ID) + else: + merged_token_ids.extend(list(token_ids)) + # TODO: Move this to utils or integrate with clip. 
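    # Illustrative sketch (not part of the patch): a standalone rerun of the
    # merge-then-expand step above, with made-up token values and plain
    # repetition standing in for repeat_and_pad_token (which can also insert
    # padding tokens around each run):
    #
    #     import itertools
    #     IMG = -1                                   # stand-in image token id
    #     prompt_token_ids = [5, IMG, IMG, 7, IMG, 9]
    #     image_feature_size = [3, 2]                # one entry per image
    #
    #     merged = []
    #     for is_img, group in itertools.groupby(prompt_token_ids,
    #                                            lambda t: t == IMG):
    #         if is_img:
    #             merged.append(IMG)
    #         else:
    #             merged.extend(group)
    #
    #     expanded, idx = [], 0
    #     for tok in merged:
    #         if tok == IMG:
    #             expanded.extend([IMG] * image_feature_size[idx])
    #             idx += 1
    #         else:
    #             expanded.append(tok)
    #
    #     # merged   == [5, -1, 7, -1, 9]
    #     # expanded == [5, -1, -1, -1, 7, -1, -1, 9]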
new_token_ids: List[int] = [] - for i in range(len(prompt_token_ids) - len(image_1_token_ids) + 1): - if prompt_token_ids[i:i + len(image_1_token_ids)] == image_1_token_ids: - new_token_ids.append(_IMAGE_TOKEN_ID) - - # No need to further scan the list since we only replace once - new_token_ids.extend(prompt_token_ids[i + len(image_1_token_ids):]) - break + placeholder_idx = 0 + while merged_token_ids: + token_id = merged_token_ids.pop(0) + if token_id == _IMAGE_TOKEN_ID: + new_token_ids.extend( + repeat_and_pad_token( + _IMAGE_TOKEN_ID, + repeat_count=image_feature_size[placeholder_idx], + )) + placeholder_idx += 1 else: - new_token_ids.append(prompt_token_ids[i]) + new_token_ids.append(token_id) # NOTE: Create a defensive copy of the original inputs llm_inputs = LLMInputs(prompt_token_ids=new_token_ids, prompt=new_prompt, multi_modal_data=multi_modal_data) - - return input_processor_for_clip( - model_config, - CLIP_VIT_LARGE_PATCH14_336_CONFIG, - llm_inputs, - image_token_id=_IMAGE_TOKEN_ID, - image_feature_size_override=image_feature_size, - ) + return llm_inputs @MULTIMODAL_REGISTRY.register_image_input_mapper() From 2059b8d9caf12072710a7d610dd80954ad7c047e Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 25 Aug 2024 23:53:09 +0800 Subject: [PATCH 20/24] [Misc] Remove snapshot_download usage in InternVL2 test (#7835) --- tests/models/test_internvl.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py index d032f3be84b58..243bc857c88de 100644 --- a/tests/models/test_internvl.py +++ b/tests/models/test_internvl.py @@ -3,7 +3,6 @@ import pytest import torch -from huggingface_hub import snapshot_download from PIL.Image import Image from transformers import AutoConfig @@ -25,17 +24,12 @@ "<|im_start|>User\n\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 }) -# we use snapshot_download to prevent conflicts between -# dynamic_module and trust_remote_code for hf_runner -DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"] models = [ - snapshot_download("OpenGVLab/InternVL2-1B", - allow_patterns=DOWNLOAD_PATTERN), - snapshot_download("OpenGVLab/InternVL2-2B", - allow_patterns=DOWNLOAD_PATTERN), + "OpenGVLab/InternVL2-1B", + "OpenGVLab/InternVL2-2B", # Broken due to outdated implementation of Phi-3 # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3 - # snapshot_download("OpenGVLab/InternVL2-4B"), + # "OpenGVLab/InternVL2-4B", ] From 70c094ade6eb77396a309512f24ddbfafaf15b38 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 25 Aug 2024 14:30:09 -0700 Subject: [PATCH 21/24] [misc][cuda] improve pynvml warning (#7852) --- vllm/platforms/cuda.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 84301afabe9d8..bda82d3712f09 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -21,7 +21,9 @@ if pynvml.__file__.endswith("__init__.py"): logger.warning( "You are using a deprecated `pynvml` package. Please install" - " `nvidia-ml-py` instead. See https://pypi.org/project/pynvml " + " `nvidia-ml-py` instead, and make sure to uninstall `pynvml`." + " When both of them are installed, `pynvml` will take precedence" + " and cause errors. 
See https://pypi.org/project/pynvml " "for more information.") # NVML utils From 1856aff4d66833b258ce64132413ab8a18cc18a6 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 25 Aug 2024 15:45:14 -0700 Subject: [PATCH 22/24] [Spec Decoding] Streamline batch expansion tensor manipulation (#7851) --- tests/spec_decode/test_utils.py | 31 +++--- vllm/spec_decode/batch_expansion.py | 143 ++++++++++++++----------- vllm/spec_decode/spec_decode_worker.py | 25 ++--- vllm/spec_decode/top1_proposer.py | 2 +- vllm/spec_decode/util.py | 42 +++----- 5 files changed, 118 insertions(+), 125 deletions(-) diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py index 18dbdd5bc952f..06780d4b8cd01 100644 --- a/tests/spec_decode/test_utils.py +++ b/tests/spec_decode/test_utils.py @@ -55,10 +55,9 @@ def fake_sequence_group_metadata(): def test_filter_zero_length_proposals(fake_sequence_group_metadata): proposal_lens = [0, 1, 0] - filtered_groups, indices = split_batch_by_proposal_len( - fake_sequence_group_metadata, - proposal_lens, - select_proposal_len_zero=True) + _, (filtered_groups, + indices) = split_batch_by_proposal_len(fake_sequence_group_metadata, + proposal_lens) expected_groups = [ fake_sequence_group_metadata[0], fake_sequence_group_metadata[2] @@ -71,10 +70,9 @@ def test_filter_zero_length_proposals(fake_sequence_group_metadata): def test_filter_non_zero_length_proposals(fake_sequence_group_metadata): proposal_lens = [0, 1, 2] - filtered_groups, indices = split_batch_by_proposal_len( - fake_sequence_group_metadata, - proposal_lens, - select_proposal_len_zero=False) + (filtered_groups, + indices), _ = split_batch_by_proposal_len(fake_sequence_group_metadata, + proposal_lens) expected_groups = [ fake_sequence_group_metadata[1], fake_sequence_group_metadata[2] @@ -86,8 +84,7 @@ def test_filter_non_zero_length_proposals(fake_sequence_group_metadata): def test_empty_inputs(): - filtered_groups, indices = split_batch_by_proposal_len( - [], [], select_proposal_len_zero=True) + _, (filtered_groups, indices) = split_batch_by_proposal_len([], []) assert filtered_groups == [] assert indices == [] @@ -95,10 +92,9 @@ def test_empty_inputs(): def test_all_zero_with_non_zero_filter(fake_sequence_group_metadata): proposal_lens = [0, 0, 0] - filtered_groups, indices = split_batch_by_proposal_len( - fake_sequence_group_metadata, - proposal_lens, - select_proposal_len_zero=False) + (filtered_groups, + indices), _ = split_batch_by_proposal_len(fake_sequence_group_metadata, + proposal_lens) assert filtered_groups == [] assert indices == [] @@ -106,10 +102,9 @@ def test_all_zero_with_non_zero_filter(fake_sequence_group_metadata): def test_all_non_zero_with_zero_filter(fake_sequence_group_metadata): proposal_lens = [1, 1, 1] - filtered_groups, indices = split_batch_by_proposal_len( - fake_sequence_group_metadata, - proposal_lens, - select_proposal_len_zero=True) + _, (filtered_groups, + indices) = split_batch_by_proposal_len(fake_sequence_group_metadata, + proposal_lens) assert filtered_groups == [] assert indices == [] diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index ad6f3f313841d..8a691d65aaa06 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -10,8 +10,7 @@ get_all_seq_ids) from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) -from vllm.spec_decode.util import (nvtx_range, sampler_output_to_torch, - split_batch_by_proposal_len) +from vllm.spec_decode.util import 
nvtx_range, split_batch_by_proposal_len from vllm.worker.worker_base import WorkerBase SeqId = int @@ -88,17 +87,25 @@ def score_proposals( assert len(target_sampler_output) == 1, "expected single-step output" target_sampler_output = target_sampler_output[0] - (all_tokens, all_probs, spec_logprobs, - all_hidden_states) = self._contract_batch( - contracted_bs=len(execute_model_req.seq_group_metadata_list), - target_sampler_output=target_sampler_output, - proposals=proposals, - num_scoring_tokens=num_scoring_tokens, - non_spec_indices=non_spec_indices, - spec_indices=spec_indices, - k=execute_model_req.num_lookahead_slots, - ) - + if not non_spec_indices: + # All sequence groups in batch have spec decoding enabled + contracted = self._contract_batch_all_spec( + target_sampler_output=target_sampler_output, + proposals=proposals, + ) + else: + # Batch has a mix of spec decode enabled and disabled seq groups + contracted = self._contract_batch( + contracted_bs=len(execute_model_req.seq_group_metadata_list), + target_sampler_output=target_sampler_output, + proposals=proposals, + num_scoring_tokens=num_scoring_tokens, + non_spec_indices=non_spec_indices, + spec_indices=spec_indices, + k=execute_model_req.num_lookahead_slots, + ) + + all_tokens, all_probs, spec_logprobs, all_hidden_states = contracted return SpeculativeScores( probs=all_probs, token_ids=all_tokens, @@ -121,14 +128,9 @@ def _expand_batch( # proposal len. This adds some complexity (splitting the batch into spec # and non spec sequences) and should be removed in the future. It can be # done by supporting per-sequence proposal lens. - spec_seqs, spec_indices = split_batch_by_proposal_len( - seq_group_metadata_list, - proposal_lens_list, - select_proposal_len_zero=False) - non_spec_seqs, non_spec_indices = split_batch_by_proposal_len( - seq_group_metadata_list, - proposal_lens_list, - select_proposal_len_zero=True) + (spec_seqs, spec_indices), (non_spec_seqs, non_spec_indices) = \ + split_batch_by_proposal_len( + seq_group_metadata_list, proposal_lens_list) target_seq_group_metadata_list = self._create_scoring_model_input( seq_group_metadata_list=spec_seqs, @@ -171,7 +173,7 @@ def _contract_batch( # The number of tokens in the expanded batch used for speculation is # equal to the total expanded batch size minus the number of samples for # non-speculative sequences. 
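        # Illustrative sketch (not part of the patch): the refactored
        # split_batch_by_proposal_len now returns both partitions at once,
        # so callers such as _expand_batch above unpack
        #
        #     (spec_seqs, spec_indices), (non_spec_seqs, non_spec_indices) = \
        #         split_batch_by_proposal_len(seq_group_metadata_list,
        #                                     proposal_lens_list)
        #
        # e.g. proposal_lens_list == [0, 3, 3] yields spec_indices == [1, 2]
        # and non_spec_indices == [0], replacing the two separate calls with
        # select_proposal_len_zero that were needed before.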
- non_spec_expanded_bs, _ = non_spec_target_token_ids.shape + non_spec_expanded_bs = len(non_spec_target_token_ids) spec_expanded_bs = expanded_batch_size - non_spec_expanded_bs target_token_ids = target_token_ids.reshape(spec_expanded_bs, k + 1) @@ -181,7 +183,7 @@ def _contract_batch( if target_hidden_states is not None: target_hidden_states = target_hidden_states.reshape( - spec_expanded_bs, k + 1, target_hidden_states.shape[-1]) + *target_token_ids.shape, target_hidden_states.shape[-1]) all_tokens = target_token_ids.new_full(size=(contracted_bs, k + 1), fill_value=-1) @@ -196,24 +198,58 @@ def _contract_batch( all_hidden_states = None if non_spec_indices: - all_tokens[non_spec_indices, :1] = non_spec_target_token_ids - all_probs[non_spec_indices, :1, :] = non_spec_target_probs - all_logprobs[non_spec_indices, :1, :] = non_spec_target_logprobs - + all_tokens[non_spec_indices, :1] = \ + non_spec_target_token_ids.unsqueeze(1) + all_probs[non_spec_indices, :1, :] = \ + non_spec_target_probs.unsqueeze(1) + all_logprobs[non_spec_indices, :1, :] = \ + non_spec_target_logprobs.unsqueeze(1) if all_hidden_states is not None: - all_hidden_states[ - non_spec_indices, :1, :] = non_spec_target_hidden_states + assert non_spec_target_hidden_states is not None + all_hidden_states[non_spec_indices, :1, :] = \ + non_spec_target_hidden_states.unsqueeze(1) if spec_indices: all_tokens[spec_indices] = target_token_ids all_probs[spec_indices] = target_probs all_logprobs[spec_indices] = target_logprobs - if all_hidden_states is not None: all_hidden_states[spec_indices] = target_hidden_states return all_tokens, all_probs, all_logprobs, all_hidden_states + def _contract_batch_all_spec( + self, + target_sampler_output: SamplerOutput, + proposals: SpeculativeProposals, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, + Optional[torch.Tensor]]: + """Contract the expanded batch back into its original size. + This maps the scores of speculative tokens back to their original + sequences. + + It assumes all sequences in the batch were previously expanded. + """ + + # Map distinct sequences used to score each token + # of shape [batch_size * k + 1] back to [batch_size, k + 1]. + contracted_bs, k = proposals.proposal_token_ids.shape + + # Reshape tensors to original batch size + target_token_ids = target_sampler_output.sampled_token_ids.reshape( + contracted_bs, k + 1) + target_probs = target_sampler_output.sampled_token_probs.reshape( + *target_token_ids.shape, self._vocab_size) + target_logprobs = target_sampler_output.logprobs.reshape( + target_probs.shape) + target_hidden_states = target_sampler_output.hidden_states + if target_hidden_states is not None: + target_hidden_states = target_hidden_states.reshape( + *target_token_ids.shape, target_hidden_states.shape[-1]) + + return (target_token_ids, target_probs, target_logprobs, + target_hidden_states) + def _create_scoring_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -345,8 +381,9 @@ def _create_single_target_seq_group_metadata( token_chunk_size=1, ) + @staticmethod def _split_scoring_output( - self, sampler_output: SamplerOutput, num_scoring_tokens: int + sampler_output: SamplerOutput, num_scoring_tokens: int ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: @@ -361,10 +398,9 @@ def _split_scoring_output( # # First samples are from speculative scoring, latter samples are non- # speculative samples. 
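        # Illustrative sketch (not part of the patch): because the expanded
        # batch is laid out as [spec scoring tokens..., non-spec tokens...],
        # a single tensor split recovers both halves, and the spec half then
        # reshapes to [spec_batch_size, k + 1, ...], e.g.
        #
        #     import torch
        #     k, vocab = 2, 11
        #     num_scoring_tokens, total = 6, 8      # 2 spec seqs, 2 non-spec
        #     probs = torch.rand(total, vocab)
        #     spec, non_spec = probs.split(
        #         (num_scoring_tokens, total - num_scoring_tokens))
        #     spec = spec.reshape(-1, k + 1, vocab)  # -> [2, 3, 11]
        #     # non_spec stays [2, 11] and is unsqueezed to [2, 1, 11] later
        #     # when written back into the contracted batch.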
- split_sizes = [ - num_scoring_tokens, - sampler_output.sampled_token_ids.numel() - num_scoring_tokens - ] + split_sizes = (num_scoring_tokens, + sampler_output.sampled_token_ids.numel() - + num_scoring_tokens) (spec_probs, non_spec_probs ) = sampler_output.sampled_token_probs.split(split_sizes) (spec_sampled_tokens, non_spec_sampled_tokens @@ -382,32 +418,13 @@ def _split_scoring_output( else: spec_hidden_states, non_spec_hidden_states = None, None - # Convert scores to tensors. - sampler_output.sampled_token_probs = spec_probs - sampler_output.sampled_token_ids = spec_sampled_tokens - sampler_output.logprobs = spec_logprobs - sampler_output.hidden_states = spec_hidden_states - (target_token_ids, target_probs, target_logprobs, - target_hidden_states) = sampler_output_to_torch([sampler_output], - True) - - # Convert non-speculative output tokens to tensors. - sampler_output.sampled_token_probs = non_spec_probs - sampler_output.sampled_token_ids = non_spec_sampled_tokens - sampler_output.logprobs = non_spec_logprobs - sampler_output.hidden_states = non_spec_hidden_states - (non_spec_target_token_ids, non_spec_target_probs, - non_spec_target_logprobs, - non_spec_target_hidden_states) = sampler_output_to_torch( - [sampler_output], True) - - return (target_token_ids, target_probs, target_logprobs, - target_hidden_states, non_spec_target_token_ids, - non_spec_target_probs, non_spec_target_logprobs, - non_spec_target_hidden_states) + return (spec_sampled_tokens, spec_probs, spec_logprobs, + spec_hidden_states, non_spec_sampled_tokens, non_spec_probs, + non_spec_logprobs, non_spec_hidden_states) + @staticmethod def _create_target_seq_id_iterator( - self, seq_ids: List[SeqId]) -> Iterator[TargetSeqId]: + seq_ids: List[SeqId]) -> Iterator[TargetSeqId]: """Create an iterator for creating target sequence ids. Target sequence ids are distinct from sequence ids because we create a distinct target sequence id for each proposal token to be scored. @@ -417,8 +434,8 @@ def _create_target_seq_id_iterator( """ return count(start=max(seq_ids) + 1) + @staticmethod def _get_token_ids_to_score( - self, full_spec_token_ids: List[TokenId] # shape: [k] ) -> List[List[TokenId]]: """Given an int tensor of proposal token ids, return a list of @@ -439,8 +456,6 @@ def _get_token_ids_to_score( empty_token_ids: List[TokenId] = [] token_ids_to_score = [empty_token_ids] - token_ids_to_score.extend([ - full_spec_token_ids[:i + 1] - for i in range(len(full_spec_token_ids)) - ]) + token_ids_to_score.extend(full_spec_token_ids[:i + 1] + for i in range(len(full_spec_token_ids))) return token_ids_to_score diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 2762b8388029f..9b1f21fcb4920 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -365,12 +365,13 @@ def execute_model( # used during the prefill phase. # 2. Auto-disable enabled: The running queue size exceeds # the specified threshold. - # 3. No request: There are no requests in the batch. + # 3. No request: There are no requests in the batch, or + # none of the requests in the batch have spec decoding enabled. # In any of these cases, the proposer and scorer workers # are called normally. 
- no_spec = num_lookahead_slots == 0 or len( - execute_model_req.seq_group_metadata_list - ) == 0 or disable_all_speculation + no_spec = num_lookahead_slots == 0 or disable_all_speculation or all( + sgm.num_speculative_tokens == 0 + for sgm in execute_model_req.seq_group_metadata_list) # Broadcast how many lookahead slots are scheduled for this step, and # whether all speculation is disabled, to all non-driver workers. @@ -415,10 +416,8 @@ def _should_disable_all_speculation( self, execute_model_req: ExecuteModelRequest) -> bool: # When the batch size is too large, disable speculative decoding # to stop trading off throughput for latency. - disable_all_speculation = (execute_model_req.running_queue_size >= - self.disable_by_batch_size) - - return disable_all_speculation + return (execute_model_req.running_queue_size >= + self.disable_by_batch_size) def _maybe_disable_speculative_tokens( self, disable_all_speculation: bool, @@ -621,14 +620,8 @@ def _verify_tokens( # proposal len. This adds some complexity (splitting the batch into spec # and non spec sequences) and should be removed in the future. It can be # done by supporting per-sequence proposal lens. - _, spec_indices = split_batch_by_proposal_len( - seq_group_metadata_list, - proposal_lens_list, - select_proposal_len_zero=False) - _, non_spec_indices = split_batch_by_proposal_len( - seq_group_metadata_list, - proposal_lens_list, - select_proposal_len_zero=True) + (_, spec_indices), (_, non_spec_indices) = split_batch_by_proposal_len( + seq_group_metadata_list, proposal_lens_list) original_indices = spec_indices + non_spec_indices # Get probabilities of target model, excluding bonus token. diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 28f7f7eb069ab..aa993e539b6d3 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -138,7 +138,7 @@ def _split_by_proposal_len( # Currently only proposal lens of 0 or the global batch proposal len # are supported. - # If max_proposal_len is defined, then we shall no exceed this + # If max_proposal_len is defined, then we shall not exceed this # quota for nonzero_proposal new_k = 0 if (self.max_proposal_len is None diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 9315cd0f753fe..d18ee47e23a5c 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -1,6 +1,6 @@ import time from contextlib import contextmanager -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Sequence, Tuple import torch @@ -98,33 +98,26 @@ def create_sequence_group_output( def split_batch_by_proposal_len( seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_lens: List[int], select_proposal_len_zero: bool -) -> Tuple[List[SequenceGroupMetadata], List[int]]: + proposal_lens: List[int], +) -> Tuple[Tuple[List[SequenceGroupMetadata], List[int]], Tuple[ + List[SequenceGroupMetadata], List[int]]]: """Utility function that splits a batch based on whether the proposal len is zero or not. We should remove this once vLLM supports per-sequence proposal lens in a batch. 
""" - if select_proposal_len_zero: - predicate = lambda proposal_len: proposal_len == 0 - else: - predicate = lambda proposal_len: proposal_len != 0 - - indices = [ - i for i, (_, proposal_len - ) in enumerate(zip(seq_group_metadata_list, proposal_lens)) - if predicate(proposal_len) - ] - seq_groups = [ - seq_group for seq_group, proposal_len in zip( - seq_group_metadata_list, proposal_lens) if predicate(proposal_len) - ] - - return seq_groups, indices + nonzero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], []) + zero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], []) + for i, (seq_group, proposal_len) in enumerate( + zip(seq_group_metadata_list, proposal_lens)): + seq_groups, indices = nonzero_lists if proposal_len else zero_lists + seq_groups.append(seq_group) + indices.append(i) + return nonzero_lists, zero_lists def sampler_output_to_torch( - sampler_output_list: List[SamplerOutput], sampler_transposed: bool + sampler_output_list: Sequence[SamplerOutput], sampler_transposed: bool ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """Utility function which converts a list of SamplerOutput to tensors. @@ -148,18 +141,12 @@ def sampler_output_to_torch( dim=0, ) - if sampler_transposed: - sampled_token_probs = sampled_token_probs.transpose(0, 1) - # shape: [batch_size, num_sampler_output, vocab_size] sampled_token_logprobs = torch.stack( [sampler_output.logprobs for sampler_output in sampler_output_list], dim=0, ) - if sampler_transposed: - sampled_token_logprobs = sampled_token_logprobs.transpose(0, 1) - # shape: [batch_size, num_sampler_output] sampled_token_ids = torch.stack( [ @@ -168,7 +155,10 @@ def sampler_output_to_torch( ], dim=0, ) + if sampler_transposed: + sampled_token_probs = sampled_token_probs.transpose(0, 1) + sampled_token_logprobs = sampled_token_logprobs.transpose(0, 1) sampled_token_ids = sampled_token_ids.transpose(0, 1) if sampler_output_list[0].hidden_states is not None: From 0b769992ec1d780b3229c46152c6e647da113aa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=84=8D=F0=9D=95=A0=F0=9D=95=9D=F0=9D=95=9D=F0=9D=95=A0?= =?UTF-8?q?=F0=9D=95=A8=20=F0=9D=95=84=F0=9D=95=92=F0=9D=95=9F?= Date: Mon, 26 Aug 2024 06:16:38 +0300 Subject: [PATCH 23/24] [Bugfix]: Use float32 for base64 embedding (#7855) Signed-off-by: Hollow Man --- examples/openai_embedding_client.py | 1 - tests/entrypoints/openai/test_embedding.py | 11 ++++++++++- vllm/entrypoints/openai/serving_embedding.py | 4 +++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/examples/openai_embedding_client.py b/examples/openai_embedding_client.py index b4f4c7ad6beb2..4bd7ca01d750d 100644 --- a/examples/openai_embedding_client.py +++ b/examples/openai_embedding_client.py @@ -19,7 +19,6 @@ "The best thing about vLLM is that it supports many different models" ], model=model, - encoding_format="float", ) for data in responses.data: diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index c9747339bbf15..6bf170b94c0d7 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -128,9 +128,18 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, for data in responses_base64.data: decoded_responses_base64_data.append( np.frombuffer(base64.b64decode(data.embedding), - dtype="float").tolist()) + dtype="float32").tolist()) assert responses_float.data[0].embedding == decoded_responses_base64_data[ 0] assert responses_float.data[1].embedding == 
decoded_responses_base64_data[ 1] + + # Default response is float32 decoded from base64 by OpenAI Client + responses_default = await embedding_client.embeddings.create( + input=input_texts, model=model_name) + + assert responses_float.data[0].embedding == responses_default.data[ + 0].embedding + assert responses_float.data[1].embedding == responses_default.data[ + 1].embedding diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index b0f70ff43e228..12ec6be03cd62 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -31,7 +31,9 @@ def _get_embedding( if encoding_format == "float": return output.embedding elif encoding_format == "base64": - embedding_bytes = np.array(output.embedding).tobytes() + # Force to use float32 for base64 encoding + # to match the OpenAI python client behavior + embedding_bytes = np.array(output.embedding, dtype="float32").tobytes() return base64.b64encode(embedding_bytes).decode("utf-8") assert_never(encoding_format) From 029c71de11bc3bcf84a1b3cf9d91e79ab6949799 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 26 Aug 2024 13:31:10 +0800 Subject: [PATCH 24/24] [CI/Build] Avoid downloading all HF files in `RemoteOpenAIServer` (#7836) --- tests/utils.py | 40 ++++++++++++++++++++++++++-------------- vllm/engine/arg_utils.py | 2 +- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 955431bbd3014..b73a05b5fe67f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -11,13 +11,14 @@ import openai import requests -from huggingface_hub import snapshot_download from transformers import AutoTokenizer from typing_extensions import ParamSpec from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) +from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.openai.cli_args import make_arg_parser +from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.platforms import current_platform from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip @@ -60,39 +61,50 @@ class RemoteOpenAIServer: def __init__(self, model: str, - cli_args: List[str], + vllm_serve_args: List[str], *, env_dict: Optional[Dict[str, str]] = None, auto_port: bool = True, max_wait_seconds: Optional[float] = None) -> None: - if not model.startswith("/"): - # download the model if it's not a local path - # to exclude the model download time from the server start time - snapshot_download(model) if auto_port: - if "-p" in cli_args or "--port" in cli_args: - raise ValueError("You have manually specified the port" + if "-p" in vllm_serve_args or "--port" in vllm_serve_args: + raise ValueError("You have manually specified the port " "when `auto_port=True`.") - cli_args = cli_args + ["--port", str(get_open_port())] + # Don't mutate the input args + vllm_serve_args = vllm_serve_args + [ + "--port", str(get_open_port()) + ] parser = FlexibleArgumentParser( description="vLLM's remote OpenAI server.") parser = make_arg_parser(parser) - args = parser.parse_args(cli_args) + args = parser.parse_args(["--model", model, *vllm_serve_args]) self.host = str(args.host or 'localhost') self.port = int(args.port) + # download the model before starting the server to avoid timeout + is_local = os.path.isdir(model) + if not is_local: + engine_args = AsyncEngineArgs.from_cli_args(args) + engine_config = engine_args.create_engine_config() + dummy_loader = 
DefaultModelLoader(engine_config.load_config) + dummy_loader._prepare_weights(engine_config.model_config.model, + engine_config.model_config.revision, + fall_back_to_pt=True) + env = os.environ.copy() # the current process might initialize cuda, # to be safe, we should use spawn method env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' if env_dict is not None: env.update(env_dict) - self.proc = subprocess.Popen(["vllm", "serve"] + [model] + cli_args, - env=env, - stdout=sys.stdout, - stderr=sys.stderr) + self.proc = subprocess.Popen( + ["vllm", "serve", model, *vllm_serve_args], + env=env, + stdout=sys.stdout, + stderr=sys.stderr, + ) max_wait_seconds = max_wait_seconds or 240 self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4cbd728714bc0..987c1be3d5ad9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -742,7 +742,7 @@ def from_cli_args(cls, args: argparse.Namespace): engine_args = cls(**{attr: getattr(args, attr) for attr in attrs}) return engine_args - def create_engine_config(self, ) -> EngineConfig: + def create_engine_config(self) -> EngineConfig: # gguf file needs a specific model loader and doesn't use hf_repo if self.model.endswith(".gguf"): self.quantization = self.load_format = "gguf"
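# Illustrative sketch (not part of the patch series): with the float32 change
# in the embedding patch above, manually decoding a base64 embedding response
# matches the OpenAI client's default behaviour. A minimal round trip:

import base64

import numpy as np


def decode_base64_embedding(embedding_b64: str) -> list:
    """Decode a base64-encoded embedding into a list of floats."""
    return np.frombuffer(base64.b64decode(embedding_b64),
                         dtype="float32").tolist()


# Round-trip check with a made-up vector.
vec = np.array([0.25, -1.5, 3.0], dtype="float32")
encoded = base64.b64encode(vec.tobytes()).decode("utf-8")
assert decode_base64_embedding(encoded) == vec.tolist()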