[CI/Build] Update CPU tests to include all "standard" tests (vllm-project#5481)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Loc Huynh <jc1da.3011@gmail.com>
DarkLight1337 authored and JC1DA committed Nov 11, 2024
1 parent 81aaaf9 commit 32e4940
Showing 14 changed files with 63 additions and 48 deletions.
21 changes: 13 additions & 8 deletions .buildkite/run-cpu-test-ppc64le.sh
@@ -19,17 +19,22 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg

# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" \
--ignore=tests/models/test_embedding.py \
--ignore=tests/models/test_oot_registration.py \
--ignore=tests/models/test_registry.py \
--ignore=tests/models/test_jamba.py \
--ignore=tests/models/test_mamba.py \
--ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
# Embedding models are not supported for CPU yet
# pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language/test_models.py
# Chunked prefill not supported for CPU yet
# pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

# online inference
docker exec cpu-test bash -c "
set -e
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
25 changes: 17 additions & 8 deletions .buildkite/run-cpu-test.sh
@@ -20,32 +20,41 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2

# offline inference
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
docker exec cpu-test-avx2 bash -c "
set -e
python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
# Embedding models are not supported for CPU yet
# pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language \
--ignore=tests/models/test_fp8.py \
--ignore=tests/models/decoder_only/language/test_jamba.py \
--ignore=tests/models/decoder_only/language/test_mamba.py \
--ignore=tests/models/decoder_only/language/test_granitemoe.py \
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
pytest -v -s tests/models/decoder_only/language/test_models.py
# Chunked prefill not supported for CPU yet
# pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

# Run compressed-tensor test
docker exec cpu-test bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

# Run AWQ test
docker exec cpu-test bash -c "
set -e
pytest -s -v \
tests/quantization/test_ipex_quant.py"

# online inference
docker exec cpu-test bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=48-92
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
3 changes: 1 addition & 2 deletions .buildkite/test-pipeline.yaml
@@ -269,7 +269,6 @@ steps:
source_file_dependencies:
- benchmarks/
commands:
- pip install aiohttp
- bash run-benchmarks.sh

- label: Quantization Test # 33min
@@ -331,7 +330,7 @@ steps:
commands:
- pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py

- label: Decoder-only Multi-Modal Models Test (Standard)
- label: Decoder-only Multi-Modal Models Test (Standard) # 26min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -93,7 +93,8 @@ skip_gitignore = true
[tool.pytest.ini_options]
markers = [
"skip_global_cleanup",
"core_model: run this model test in each PR instead of just daily",
"core_model: enable this model test in each PR instead of only nightly",
"cpu_model: enable this model test in CPU tests",
"distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
"skip_v1: do not run this test with v1",
]
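
The new `cpu_model` marker complements `core_model`: a test (or an individual `pytest.param` case) tagged with it is what the CPU CI scripts above select via `pytest -m cpu_model`. A minimal sketch of how the marker is applied; the test name and body are hypothetical:

```python
import pytest

@pytest.mark.core_model  # enabled in each PR instead of only nightly
@pytest.mark.cpu_model   # selected by the CPU pipelines via `pytest -m cpu_model`
def test_tiny_model_smoke():
    # Hypothetical placeholder; real tests load a model and compare outputs.
    assert True
```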
5 changes: 0 additions & 5 deletions requirements-test.in
@@ -12,9 +12,7 @@ decord # required for video tests
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio tests
opencv-python # required for video tests
peft
requests
ray[adag]==2.35
sentence-transformers # required for embedding tests
soundfile # required for audio tests
@@ -29,9 +27,6 @@ lm-eval[api]==0.4.4 # required for model evaluation test
# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test

# Benchmarking
aiohttp

# quantization
bitsandbytes>=0.44.0
buildkite-test-collector==0.1.9
17 changes: 13 additions & 4 deletions tests/models/decoder_only/audio_language/test_ultravox.py
@@ -5,11 +5,11 @@
import pytest_asyncio
from transformers import AutoModel, AutoTokenizer, BatchEncoding

from tests.utils import RemoteOpenAIServer
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

from ....conftest import HfRunner, VllmRunner
from ....utils import RemoteOpenAIServer
from ...utils import check_logprobs_close

MODEL_NAME = "fixie-ai/ultravox-v0_3"
@@ -39,7 +39,10 @@ def audio(request):
return AudioAsset(request.param)


@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS))
@pytest.fixture(params=[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def server(request, audio_assets):
args = [
"--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager",
@@ -185,7 +188,10 @@ def run_multi_audio_test(
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
@pytest.mark.parametrize("vllm_kwargs", [
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
num_logprobs: int, vllm_kwargs: dict) -> None:

@@ -207,7 +213,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
@pytest.mark.parametrize("vllm_kwargs", [
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
max_tokens: int, num_logprobs: int,
vllm_kwargs: dict) -> None:
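
Because only the plain configuration carries the `cpu_model` mark, the chunked-prefill variants above are deselected on the CPU runners, which collect tests with `-m cpu_model` (chunked prefill is not supported on CPU yet, per the CI scripts). A self-contained sketch of that selection behavior, with an illustrative kwargs value:

```python
import pytest

CHUNKED_PREFILL_KWARGS = {"enable_chunked_prefill": True}  # illustrative value only

# `pytest -m cpu_model` collects only the first case; the chunked-prefill case
# carries no cpu_model mark and is therefore skipped on the CPU runners.
@pytest.mark.parametrize("vllm_kwargs", [
    pytest.param({}, marks=pytest.mark.cpu_model),
    pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_marker_selection(vllm_kwargs: dict) -> None:
    assert isinstance(vllm_kwargs, dict)
```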
1 change: 0 additions & 1 deletion tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -14,7 +14,6 @@
"h2oai/h2ovl-mississippi-800m", # Replace with your actual model names
"h2oai/h2ovl-mississippi-2b",
]
target_dtype = "bfloat16"


def run_preprocessing_test(
11 changes: 4 additions & 7 deletions tests/models/decoder_only/vision_language/test_models.py
@@ -94,7 +94,7 @@
),
limit_mm_per_prompt={"image": 4},
)],
marks=[pytest.mark.core_model],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"paligemma": VLMTestInfo(
models=["google/paligemma-3b-mix-224"],
@@ -111,7 +111,8 @@
"pixel_values"
),
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="half" if current_platform.is_rocm() else ("half", "float"),
dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm()
else ("half", "float")),
marks=[pytest.mark.core_model],
),
"qwen2_vl": VLMTestInfo(
@@ -128,7 +129,7 @@
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
marks=[pytest.mark.core_model],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
#### Extended model tests
@@ -172,7 +173,6 @@
use_tokenizer_eos=True,
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10,
dtype="bfloat16" if current_platform.is_cpu() else "half",
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
"glm4": VLMTestInfo(
@@ -245,7 +245,6 @@
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
test_type=VLMTestType.CUSTOM_INPUTS,
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
dtype="half",
num_video_frames=16,
max_model_len=16384,
postprocess_inputs=model_utils.get_key_type_post_processor(
@@ -404,7 +403,6 @@
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=4096,
dtype="bfloat16" if current_platform.is_cpu() else "half",
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
custom_test_opts=[
@@ -419,7 +417,6 @@
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=16384,
max_num_seqs=2,
dtype="half",
postprocess_inputs=model_utils.get_key_type_post_processor(
"pixel_values"
),
2 changes: 0 additions & 2 deletions tests/models/decoder_only/vision_language/test_phi3v.py
@@ -44,8 +44,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,


target_dtype = "half"
if current_platform.is_cpu():
target_dtype = "bfloat16"

# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
3 changes: 1 addition & 2 deletions tests/models/utils.py
@@ -5,7 +5,6 @@

from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext
from vllm.platforms import current_platform
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs

TokensText = Tuple[List[int], str]
@@ -270,7 +269,7 @@ def build_model_context(model_name: str,
if tokenizer_name is None:
tokenizer_name = model_name
if dtype is None:
dtype = "bfloat16" if current_platform.is_cpu() else "half"
dtype = "half"

model_config = ModelConfig(
model_name,
2 changes: 1 addition & 1 deletion vllm/assets/image.py
@@ -27,4 +27,4 @@ def image_embeds(self) -> torch.Tensor:
"""
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
s3_prefix=VLM_IMAGES_DIR)
return torch.load(image_path)
return torch.load(image_path, map_location="cpu")
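
A short sketch of why `map_location` matters here, assuming the `.pt` asset was serialized from a CUDA tensor: without it, `torch.load` attempts to restore the tensor onto the original GPU device and fails on CPU-only runners. The file name below is hypothetical:

```python
import torch

# map_location="cpu" forces deserialization onto the CPU, so the asset loads on
# machines without CUDA (e.g. the CPU CI runners).
embeds = torch.load("cherry_blossom.pt", map_location="cpu")  # hypothetical asset
assert embeds.device.type == "cpu"
```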
4 changes: 2 additions & 2 deletions vllm/model_executor/models/ultravox.py
@@ -134,9 +134,9 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
if sr != feature_extractor.sampling_rate:
try:
import librosa
except ImportError:
except ImportError as exc:
raise ImportError(
"Please install vllm[audio] for audio support.") from None
"Please install vllm[audio] for audio support.") from exc
audio = librosa.resample(audio,
orig_sr=sr,
target_sr=feature_extractor.sampling_rate)
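
The switch from `from None` to `from exc` here (and in the matching hunks in `vllm/multimodal/utils.py` below) chains the re-raised error to the original `ImportError`, so the underlying import failure stays visible in the traceback instead of being suppressed. A minimal illustration of the pattern:

```python
def require_librosa():
    try:
        import librosa
    except ImportError as exc:
        # `from exc` records the original ImportError as __cause__ in the traceback;
        # `from None` would hide which import actually failed.
        raise ImportError(
            "Please install vllm[audio] for audio support.") from exc
    return librosa
```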
8 changes: 4 additions & 4 deletions vllm/multimodal/utils.py
@@ -206,9 +206,9 @@ def try_import_audio_packages() -> Tuple[Any, Any]:
try:
import librosa
import soundfile
except ImportError:
except ImportError as exc:
raise ImportError(
"Please install vllm[audio] for audio support.") from None
"Please install vllm[audio] for audio support.") from exc
return librosa, soundfile


@@ -344,9 +344,9 @@ def try_import_video_packages() -> Any:
try:
import cv2
import decord
except ImportError:
except ImportError as exc:
raise ImportError(
"Please install vllm[video] for video support.") from None
"Please install vllm[video] for video support.") from exc
return cv2, decord


6 changes: 5 additions & 1 deletion vllm/worker/cpu_worker.py
@@ -151,7 +151,11 @@ def __init__(
self.local_omp_cpuid = omp_cpuids.split("|")[rank]

ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner
if self.model_config.is_encoder_decoder:
if self.model_config.task == "embedding":
raise NotImplementedError(
"Embedding models are not supported for CPU backend")
# ModelRunnerClass = CPUEmbeddingModelRunner
elif self.model_config.is_encoder_decoder:
ModelRunnerClass = CPUEncoderDecoderModelRunner
self.model_runner: CPUModelRunner = ModelRunnerClass(
vllm_config=vllm_config,
