[CI/Build] Update CPU tests to include all "standard" tests #5481

Merged: 29 commits, merged on Nov 8, 2024

Changes from all commits

Commits (29)
597cb35  Enable LLaVA test in CPU (DarkLight1337, Jun 13, 2024)
845b465  Fix failing test on CPU due to unsupported dtype (DarkLight1337, Jun 13, 2024)
5f92d96  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Jun 15, 2024)
789b493  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Jun 19, 2024)
e50b808  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Jun 20, 2024)
8ba6e77  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Jun 21, 2024)
783cb76  Install torchvision (DarkLight1337, Jun 21, 2024)
e177bf8  Use CPU pypi index for torchvision (DarkLight1337, Jun 21, 2024)
7273b45  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Oct 31, 2024)
d926082  format (DarkLight1337, Oct 31, 2024)
fe0ef62  Use bfloat16 (DarkLight1337, Oct 31, 2024)
f12d39f  Update (DarkLight1337, Oct 31, 2024)
656a499  Update test dependencies (DarkLight1337, Oct 31, 2024)
649525f  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Nov 2, 2024)
8e33605  Remove unnecessary `is_cpu()` checks (DarkLight1337, Nov 7, 2024)
08e242e  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Nov 7, 2024)
1e63f85  Update (DarkLight1337, Nov 7, 2024)
c09b140  Remove unnecessary args (DarkLight1337, Nov 7, 2024)
6e6b838  Update (DarkLight1337, Nov 7, 2024)
7bc3ad1  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Nov 7, 2024)
e41db03  Fix missing library (DarkLight1337, Nov 7, 2024)
8e3cf44  Fix loading image embeds on CPU (DarkLight1337, Nov 7, 2024)
cd1cd15  Fix errors not being propagated to CI (DarkLight1337, Nov 7, 2024)
b401cb9  Fix missing libraries (DarkLight1337, Nov 7, 2024)
431a5c8  Embedding models are not supported for CPU backend (DarkLight1337, Nov 8, 2024)
0df552f  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Nov 8, 2024)
8c817e4  Chunked prefill not supported for CPU (DarkLight1337, Nov 8, 2024)
4c39939  Fix installation (DarkLight1337, Nov 8, 2024)
9ef98fa  Add `cpu_model` mark (DarkLight1337, Nov 8, 2024)
21 changes: 13 additions & 8 deletions .buildkite/run-cpu-test-ppc64le.sh
@@ -19,17 +19,22 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg

# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" \
--ignore=tests/models/test_embedding.py \
--ignore=tests/models/test_oot_registration.py \
--ignore=tests/models/test_registry.py \
--ignore=tests/models/test_jamba.py \
--ignore=tests/models/test_mamba.py \
--ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
# Embedding models are not supported for CPU yet
# pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language/test_models.py
# Chunked prefill not supported for CPU yet
# pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

# online inference
docker exec cpu-test bash -c "
set -e
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
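The `set -e` added to each `docker exec ... bash -c` block (see the commit "Fix errors not being propagated to CI") is what makes test failures actually fail the CI step: without it, the exit status of `bash -c` is that of the last command in the string, so an earlier failing `pytest` run can be silently swallowed. A minimal sketch of the difference, using a throwaway `false` command in place of the real test suite:

```python
import subprocess

# Without `set -e`, bash -c reports the status of the *last* command,
# so the earlier failure (simulated here with `false`) is swallowed.
lenient = subprocess.run(["bash", "-c", "false; echo still running"])
print(lenient.returncode)  # 0 -> the CI step would be reported as green

# With `set -e`, the shell aborts on the first failing command and the
# non-zero status propagates to the caller (here, the CI runner).
strict = subprocess.run(["bash", "-c", "set -e; false; echo still running"])
print(strict.returncode)  # 1 -> the failure is propagated as intended
```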
25 changes: 17 additions & 8 deletions .buildkite/run-cpu-test.sh
@@ -20,32 +20,41 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2

# offline inference
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
docker exec cpu-test-avx2 bash -c "
set -e
python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
# Embedding models are not supported for CPU yet
# pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language \
--ignore=tests/models/test_fp8.py \
--ignore=tests/models/decoder_only/language/test_jamba.py \
--ignore=tests/models/decoder_only/language/test_mamba.py \
--ignore=tests/models/decoder_only/language/test_granitemoe.py \
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
pytest -v -s tests/models/decoder_only/language/test_models.py
# Chunked prefill not supported for CPU yet
# pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

# Run compressed-tensor test
docker exec cpu-test bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

# Run AWQ test
docker exec cpu-test bash -c "
set -e
pytest -s -v \
tests/quantization/test_ipex_quant.py"

# online inference
docker exec cpu-test bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=48-92
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
3 changes: 1 addition & 2 deletions .buildkite/test-pipeline.yaml
@@ -269,7 +269,6 @@ steps:
source_file_dependencies:
- benchmarks/
commands:
- pip install aiohttp
- bash run-benchmarks.sh

- label: Quantization Test # 33min
@@ -331,7 +330,7 @@
commands:
- pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py

- label: Decoder-only Multi-Modal Models Test (Standard)
- label: Decoder-only Multi-Modal Models Test (Standard) # 26min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -93,7 +93,8 @@ skip_gitignore = true
[tool.pytest.ini_options]
markers = [
"skip_global_cleanup",
"core_model: run this model test in each PR instead of just daily",
"core_model: enable this model test in each PR instead of only nightly",
"cpu_model: enable this model test in CPU tests",
"distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
"skip_v1: do not run this test with v1",
]
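For context, the new `cpu_model` marker registered here works like the existing `core_model` marker: model tests (or individual parametrizations, via `pytest.param(..., marks=...)` as in the ultravox test further down) opt in, and the CPU CI scripts above select only those tests with `pytest -m cpu_model`. A rough sketch of the pattern, with hypothetical test names and an illustrative kwargs value:

```python
import pytest

CHUNKED_PREFILL_KWARGS = {"enable_chunked_prefill": True}  # illustrative value only


# Mark a whole test as CPU-compatible so `pytest -m cpu_model` selects it.
@pytest.mark.cpu_model
def test_runs_on_cpu():
    assert True


# Or mark only specific parametrizations: here the default kwargs run on CPU,
# while the chunked-prefill variant (unsupported on the CPU backend) does not.
@pytest.mark.parametrize("vllm_kwargs", [
    pytest.param({}, marks=pytest.mark.cpu_model),
    pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_maybe_runs_on_cpu(vllm_kwargs):
    assert isinstance(vllm_kwargs, dict)
```

The CPU scripts then invoke, for example, `pytest -v -s tests/models/decoder_only/vision_language -m cpu_model`, so anything without the marker is skipped on the CPU runners.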
5 changes: 0 additions & 5 deletions requirements-test.in
@@ -12,9 +12,7 @@ decord # required for video tests
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio tests
opencv-python # required for video tests
peft
requests
ray[adag]==2.35
sentence-transformers # required for embedding tests
soundfile # required for audio tests
@@ -29,9 +27,6 @@ lm-eval[api]==0.4.4 # required for model evaluation test
# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test

# Benchmarking
aiohttp

# quantization
bitsandbytes>=0.44.0
buildkite-test-collector==0.1.9
17 changes: 13 additions & 4 deletions tests/models/decoder_only/audio_language/test_ultravox.py
@@ -5,11 +5,11 @@
import pytest_asyncio
from transformers import AutoModel, AutoTokenizer, BatchEncoding

from tests.utils import RemoteOpenAIServer
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

from ....conftest import HfRunner, VllmRunner
from ....utils import RemoteOpenAIServer
from ...utils import check_logprobs_close

MODEL_NAME = "fixie-ai/ultravox-v0_3"
@@ -39,7 +39,10 @@ def audio(request):
return AudioAsset(request.param)


@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS))
@pytest.fixture(params=[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def server(request, audio_assets):
args = [
"--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager",
@@ -185,7 +188,10 @@ def run_multi_audio_test(
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
@pytest.mark.parametrize("vllm_kwargs", [
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
num_logprobs: int, vllm_kwargs: dict) -> None:

@@ -207,7 +213,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
@pytest.mark.parametrize("vllm_kwargs", [
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
max_tokens: int, num_logprobs: int,
vllm_kwargs: dict) -> None:
1 change: 0 additions & 1 deletion tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -14,7 +14,6 @@
"h2oai/h2ovl-mississippi-800m", # Replace with your actual model names
"h2oai/h2ovl-mississippi-2b",
]
target_dtype = "bfloat16"


def run_preprocessing_test(
11 changes: 4 additions & 7 deletions tests/models/decoder_only/vision_language/test_models.py
@@ -94,7 +94,7 @@
),
limit_mm_per_prompt={"image": 4},
)],
marks=[pytest.mark.core_model],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"paligemma": VLMTestInfo(
models=["google/paligemma-3b-mix-224"],
@@ -111,7 +111,8 @@
"pixel_values"
),
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="half" if current_platform.is_rocm() else ("half", "float"),
dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm()
else ("half", "float")),
marks=[pytest.mark.core_model],
),
"qwen2_vl": VLMTestInfo(
@@ -128,7 +129,7 @@
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
marks=[pytest.mark.core_model],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
#### Extended model tests
@@ -172,7 +173,6 @@
use_tokenizer_eos=True,
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10,
dtype="bfloat16" if current_platform.is_cpu() else "half",
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
"glm4": VLMTestInfo(
@@ -245,7 +245,6 @@
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
test_type=VLMTestType.CUSTOM_INPUTS,
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
dtype="half",
num_video_frames=16,
max_model_len=16384,
postprocess_inputs=model_utils.get_key_type_post_processor(
@@ -404,7 +403,6 @@
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=4096,
dtype="bfloat16" if current_platform.is_cpu() else "half",
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
custom_test_opts=[
@@ -419,7 +417,6 @@
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=16384,
max_num_seqs=2,
dtype="half",
postprocess_inputs=model_utils.get_key_type_post_processor(
"pixel_values"
),
2 changes: 0 additions & 2 deletions tests/models/decoder_only/vision_language/test_phi3v.py
@@ -44,8 +44,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,


target_dtype = "half"
if current_platform.is_cpu():
target_dtype = "bfloat16"

# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
3 changes: 1 addition & 2 deletions tests/models/utils.py
@@ -5,7 +5,6 @@

from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext
from vllm.platforms import current_platform
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs

TokensText = Tuple[List[int], str]
@@ -270,7 +269,7 @@ def build_model_context(model_name: str,
if tokenizer_name is None:
tokenizer_name = model_name
if dtype is None:
dtype = "bfloat16" if current_platform.is_cpu() else "half"
dtype = "half"

model_config = ModelConfig(
model_name,
2 changes: 1 addition & 1 deletion vllm/assets/image.py
@@ -27,4 +27,4 @@ def image_embeds(self) -> torch.Tensor:
"""
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
s3_prefix=VLM_IMAGES_DIR)
return torch.load(image_path)
return torch.load(image_path, map_location="cpu")
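The `map_location="cpu"` argument added here matters because the published `.pt` image-embedding assets may have been serialized from CUDA tensors; by default `torch.load` restores tensors to the device they were saved on, which fails on a CPU-only test host. A small illustration of the failure mode (the save step assumes a CUDA machine):

```python
import torch

# On a CUDA machine: save a tensor that lives on the GPU.
embeds = torch.randn(1, 8, device="cuda")
torch.save(embeds, "image_embeds.pt")

# On a CPU-only machine: the default load tries to put the tensor back on
# CUDA and raises a RuntimeError ("Attempting to deserialize object on a
# CUDA device ..."); map_location forces it onto the CPU instead.
# torch.load("image_embeds.pt")  # would raise on a CPU-only host
cpu_embeds = torch.load("image_embeds.pt", map_location="cpu")
print(cpu_embeds.device)  # cpu
```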
4 changes: 2 additions & 2 deletions vllm/model_executor/models/ultravox.py
@@ -134,9 +134,9 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
if sr != feature_extractor.sampling_rate:
try:
import librosa
except ImportError:
except ImportError as exc:
raise ImportError(
"Please install vllm[audio] for audio support.") from None
"Please install vllm[audio] for audio support.") from exc
audio = librosa.resample(audio,
orig_sr=sr,
target_sr=feature_extractor.sampling_rate)
8 changes: 4 additions & 4 deletions vllm/multimodal/utils.py
@@ -206,9 +206,9 @@ def try_import_audio_packages() -> Tuple[Any, Any]:
try:
import librosa
import soundfile
except ImportError:
except ImportError as exc:
raise ImportError(
"Please install vllm[audio] for audio support.") from None
"Please install vllm[audio] for audio support.") from exc
return librosa, soundfile


@@ -344,9 +344,9 @@ def try_import_video_packages() -> Any:
try:
import cv2
import decord
except ImportError:
except ImportError as exc:
raise ImportError(
"Please install vllm[video] for video support.") from None
"Please install vllm[video] for video support.") from exc
return cv2, decord


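The switch from `raise ... from None` to `raise ... from exc` in these import helpers (and the matching change in ultravox.py above) keeps the original `ImportError` chained as `__cause__`, so the traceback in CI shows which optional package was actually missing rather than only the generic "Please install vllm[audio]" hint. A quick sketch of the difference:

```python
def load_audio_backend(suppress_cause: bool) -> None:
    try:
        import librosa  # noqa: F401
    except ImportError as exc:
        if suppress_cause:
            # `from None` hides the underlying error entirely.
            raise ImportError(
                "Please install vllm[audio] for audio support.") from None
        # `from exc` chains it, so the log also names the missing module.
        raise ImportError(
            "Please install vllm[audio] for audio support.") from exc


# With librosa absent, load_audio_backend(False) raises, and the uncaught
# traceback shows both errors:
#   ModuleNotFoundError: No module named 'librosa'
#   The above exception was the direct cause of the following exception:
#   ImportError: Please install vllm[audio] for audio support.
```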
6 changes: 5 additions & 1 deletion vllm/worker/cpu_worker.py
@@ -151,7 +151,11 @@ def __init__(
self.local_omp_cpuid = omp_cpuids.split("|")[rank]

ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner
if self.model_config.is_encoder_decoder:
if self.model_config.task == "embedding":
raise NotImplementedError(
"Embedding models are not supported for CPU backend")
# ModelRunnerClass = CPUEmbeddingModelRunner
elif self.model_config.is_encoder_decoder:
ModelRunnerClass = CPUEncoderDecoderModelRunner
self.model_runner: CPUModelRunner = ModelRunnerClass(
vllm_config=vllm_config,