[CI/Build] Update CPU tests to include all "standard" tests #5481

Merged: 29 commits, merged on Nov 8, 2024

Changes from all commits

Commits (29)
597cb35  Enable LLaVA test in CPU (DarkLight1337, Jun 13, 2024)
845b465  Fix failing test on CPU due to unsupported dtype (DarkLight1337, Jun 13, 2024)
5f92d96  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Jun 15, 2024)
789b493  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Jun 19, 2024)
e50b808  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Jun 20, 2024)
8ba6e77  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Jun 21, 2024)
783cb76  Install torchvision (DarkLight1337, Jun 21, 2024)
e177bf8  Use CPU pypi index for torchvision (DarkLight1337, Jun 21, 2024)
7273b45  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Oct 31, 2024)
d926082  format (DarkLight1337, Oct 31, 2024)
fe0ef62  Use bfloat16 (DarkLight1337, Oct 31, 2024)
f12d39f  Update (DarkLight1337, Oct 31, 2024)
656a499  Update test dependencies (DarkLight1337, Oct 31, 2024)
649525f  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Nov 2, 2024)
8e33605  Remove unnecessary `is_cpu()` checks (DarkLight1337, Nov 7, 2024)
08e242e  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Nov 7, 2024)
1e63f85  Update (DarkLight1337, Nov 7, 2024)
c09b140  Remove unnecessary args (DarkLight1337, Nov 7, 2024)
6e6b838  Update (DarkLight1337, Nov 7, 2024)
7bc3ad1  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Nov 7, 2024)
e41db03  Fix missing library (DarkLight1337, Nov 7, 2024)
8e3cf44  Fix loading image embeds on CPU (DarkLight1337, Nov 7, 2024)
cd1cd15  Fix errors not being propagated to CI (DarkLight1337, Nov 7, 2024)
b401cb9  Fix missing libraries (DarkLight1337, Nov 7, 2024)
431a5c8  Embedding models are not supported for CPU backend (DarkLight1337, Nov 8, 2024)
0df552f  Merge branch 'upstream' into test-llava-cpu (DarkLight1337, Nov 8, 2024)
8c817e4  Chunked prefill not supported for CPU (DarkLight1337, Nov 8, 2024)
4c39939  Fix installation (DarkLight1337, Nov 8, 2024)
9ef98fa  Add `cpu_model` mark (DarkLight1337, Nov 8, 2024)
21 changes: 13 additions & 8 deletions .buildkite/run-cpu-test-ppc64le.sh
@@ -19,17 +19,22 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg

# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" \
--ignore=tests/models/test_embedding.py \
--ignore=tests/models/test_oot_registration.py \
--ignore=tests/models/test_registry.py \
--ignore=tests/models/test_jamba.py \
--ignore=tests/models/test_mamba.py \
--ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
# Embedding models are not supported for CPU yet
# pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language/test_models.py
# Chunked prefill not supported for CPU yet
# pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

# online inference
docker exec cpu-test bash -c "
set -e
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
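The `set -e` added to each `docker exec ... bash -c` block (see the commit "Fix errors not being propagated to CI") is what makes test failures actually fail the CI step: without it, the exit status of `bash -c` is that of the last command in the string, so an earlier failing `pytest` run can be silently swallowed. A minimal sketch of the difference, using a throwaway `false` command in place of the real test suite:

```python
import subprocess

# Without `set -e`, bash -c reports the status of the *last* command,
# so the earlier failure (simulated here with `false`) is swallowed.
lenient = subprocess.run(["bash", "-c", "false; echo still running"])
print(lenient.returncode)  # 0 -> the CI step would be reported as green

# With `set -e`, the shell aborts on the first failing command and the
# non-zero status propagates to the caller (here, the CI runner).
strict = subprocess.run(["bash", "-c", "set -e; false; echo still running"])
print(strict.returncode)  # 1 -> the failure is propagated as intended
```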
25 changes: 17 additions & 8 deletions .buildkite/run-cpu-test.sh
@@ -20,32 +20,41 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2

# offline inference
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
docker exec cpu-test-avx2 bash -c "
set -e
python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
# Embedding models are not supported for CPU yet
# pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language \
--ignore=tests/models/test_fp8.py \
--ignore=tests/models/decoder_only/language/test_jamba.py \
--ignore=tests/models/decoder_only/language/test_mamba.py \
--ignore=tests/models/decoder_only/language/test_granitemoe.py \
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
pytest -v -s tests/models/decoder_only/language/test_models.py
# Chunked prefill not supported for CPU yet
# pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

# Run compressed-tensor test
docker exec cpu-test bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

# Run AWQ test
docker exec cpu-test bash -c "
set -e
pytest -s -v \
tests/quantization/test_ipex_quant.py"

# online inference
docker exec cpu-test bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=48-92
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
3 changes: 1 addition & 2 deletions .buildkite/test-pipeline.yaml
@@ -269,7 +269,6 @@ steps:
source_file_dependencies:
- benchmarks/
commands:
- pip install aiohttp
- bash run-benchmarks.sh

- label: Quantization Test # 33min
@@ -331,7 +330,7 @@
commands:
- pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py

- label: Decoder-only Multi-Modal Models Test (Standard)
- label: Decoder-only Multi-Modal Models Test (Standard) # 26min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -93,7 +93,8 @@ skip_gitignore = true
[tool.pytest.ini_options]
markers = [
"skip_global_cleanup",
"core_model: run this model test in each PR instead of just daily",
"core_model: enable this model test in each PR instead of only nightly",
"cpu_model: enable this model test in CPU tests",
"distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
"skip_v1: do not run this test with v1",
]
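For context, the new `cpu_model` marker registered here works like the existing `core_model` marker: model tests (or individual parametrizations, via `pytest.param(..., marks=...)` as in the ultravox test further down) opt in, and the CPU CI scripts above select only those tests with `pytest -m cpu_model`. A rough sketch of the pattern, with hypothetical test names and an illustrative kwargs value:

```python
import pytest

CHUNKED_PREFILL_KWARGS = {"enable_chunked_prefill": True}  # illustrative value only


# Mark a whole test as CPU-compatible so `pytest -m cpu_model` selects it.
@pytest.mark.cpu_model
def test_runs_on_cpu():
    assert True


# Or mark only specific parametrizations: here the default kwargs run on CPU,
# while the chunked-prefill variant (unsupported on the CPU backend) does not.
@pytest.mark.parametrize("vllm_kwargs", [
    pytest.param({}, marks=pytest.mark.cpu_model),
    pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_maybe_runs_on_cpu(vllm_kwargs):
    assert isinstance(vllm_kwargs, dict)
```

The CPU scripts then invoke, for example, `pytest -v -s tests/models/decoder_only/vision_language -m cpu_model`, so anything without the marker is skipped on the CPU runners.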
5 changes: 0 additions & 5 deletions requirements-test.in
@@ -12,9 +12,7 @@ decord # required for video tests
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio tests
opencv-python # required for video tests
peft
requests
ray[adag]==2.35
sentence-transformers # required for embedding tests
soundfile # required for audio tests
@@ -29,9 +27,6 @@ lm-eval[api]==0.4.4 # required for model evaluation test
# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test

# Benchmarking
aiohttp

# quantization
bitsandbytes>=0.44.0
buildkite-test-collector==0.1.9
17 changes: 13 additions & 4 deletions tests/models/decoder_only/audio_language/test_ultravox.py
@@ -5,11 +5,11 @@
import pytest_asyncio
from transformers import AutoModel, AutoTokenizer, BatchEncoding

from tests.utils import RemoteOpenAIServer
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

from ....conftest import HfRunner, VllmRunner
from ....utils import RemoteOpenAIServer
from ...utils import check_logprobs_close

MODEL_NAME = "fixie-ai/ultravox-v0_3"
@@ -39,7 +39,10 @@ def audio(request):
return AudioAsset(request.param)


@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS))
@pytest.fixture(params=[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def server(request, audio_assets):
args = [
"--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager",
@@ -185,7 +188,10 @@ def run_multi_audio_test(
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
@pytest.mark.parametrize("vllm_kwargs", [
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
num_logprobs: int, vllm_kwargs: dict) -> None:

@@ -207,7 +213,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
@pytest.mark.parametrize("vllm_kwargs", [
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
max_tokens: int, num_logprobs: int,
vllm_kwargs: dict) -> None:
1 change: 0 additions & 1 deletion tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -14,7 +14,6 @@
"h2oai/h2ovl-mississippi-800m", # Replace with your actual model names
"h2oai/h2ovl-mississippi-2b",
]
target_dtype = "bfloat16"


def run_preprocessing_test(
11 changes: 4 additions & 7 deletions tests/models/decoder_only/vision_language/test_models.py
@@ -94,7 +94,7 @@
),
limit_mm_per_prompt={"image": 4},
)],
marks=[pytest.mark.core_model],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"paligemma": VLMTestInfo(
models=["google/paligemma-3b-mix-224"],
@@ -111,7 +111,8 @@
"pixel_values"
),
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="half" if current_platform.is_rocm() else ("half", "float"),
dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm()
else ("half", "float")),
marks=[pytest.mark.core_model],
),
"qwen2_vl": VLMTestInfo(
@@ -128,7 +129,7 @@
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
marks=[pytest.mark.core_model],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
#### Extended model tests
@@ -172,7 +173,6 @@
use_tokenizer_eos=True,
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10,
dtype="bfloat16" if current_platform.is_cpu() else "half",
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
"glm4": VLMTestInfo(
@@ -245,7 +245,6 @@
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
test_type=VLMTestType.CUSTOM_INPUTS,
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
dtype="half",
num_video_frames=16,
max_model_len=16384,
postprocess_inputs=model_utils.get_key_type_post_processor(
@@ -404,7 +403,6 @@
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=4096,
dtype="bfloat16" if current_platform.is_cpu() else "half",
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
custom_test_opts=[
@@ -419,7 +417,6 @@
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=16384,
max_num_seqs=2,
dtype="half",
postprocess_inputs=model_utils.get_key_type_post_processor(
"pixel_values"
),
2 changes: 0 additions & 2 deletions tests/models/decoder_only/vision_language/test_phi3v.py
@@ -44,8 +44,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,


target_dtype = "half"
if current_platform.is_cpu():
target_dtype = "bfloat16"

# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
3 changes: 1 addition & 2 deletions tests/models/utils.py
@@ -5,7 +5,6 @@

from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext
from vllm.platforms import current_platform
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs

TokensText = Tuple[List[int], str]
@@ -270,7 +269,7 @@ def build_model_context(model_name: str,
if tokenizer_name is None:
tokenizer_name = model_name
if dtype is None:
dtype = "bfloat16" if current_platform.is_cpu() else "half"
dtype = "half"

model_config = ModelConfig(
model_name,
2 changes: 1 addition & 1 deletion vllm/assets/image.py
@@ -27,4 +27,4 @@ def image_embeds(self) -> torch.Tensor:
"""
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
s3_prefix=VLM_IMAGES_DIR)
return torch.load(image_path)
return torch.load(image_path, map_location="cpu")
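The `map_location="cpu"` argument added here matters because the published `.pt` image-embedding assets may have been serialized from CUDA tensors; by default `torch.load` restores tensors to the device they were saved on, which fails on a CPU-only test host. A small illustration of the failure mode (the save step assumes a CUDA machine):

```python
import torch

# On a CUDA machine: save a tensor that lives on the GPU.
embeds = torch.randn(1, 8, device="cuda")
torch.save(embeds, "image_embeds.pt")

# On a CPU-only machine: the default load tries to put the tensor back on
# CUDA and raises a RuntimeError ("Attempting to deserialize object on a
# CUDA device ..."); map_location forces it onto the CPU instead.
# torch.load("image_embeds.pt")  # would raise on a CPU-only host
cpu_embeds = torch.load("image_embeds.pt", map_location="cpu")
print(cpu_embeds.device)  # cpu
```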
4 changes: 2 additions & 2 deletions vllm/model_executor/models/ultravox.py
@@ -134,9 +134,9 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
if sr != feature_extractor.sampling_rate:
try:
import librosa
except ImportError:
except ImportError as exc:
raise ImportError(
"Please install vllm[audio] for audio support.") from None
"Please install vllm[audio] for audio support.") from exc
audio = librosa.resample(audio,
orig_sr=sr,
target_sr=feature_extractor.sampling_rate)
8 changes: 4 additions & 4 deletions vllm/multimodal/utils.py
@@ -206,9 +206,9 @@ def try_import_audio_packages() -> Tuple[Any, Any]:
try:
import librosa
import soundfile
except ImportError:
except ImportError as exc:
raise ImportError(
"Please install vllm[audio] for audio support.") from None
"Please install vllm[audio] for audio support.") from exc
return librosa, soundfile


@@ -344,9 +344,9 @@ def try_import_video_packages() -> Any:
try:
import cv2
import decord
except ImportError:
except ImportError as exc:
raise ImportError(
"Please install vllm[video] for video support.") from None
"Please install vllm[video] for video support.") from exc
return cv2, decord


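The switch from `raise ... from None` to `raise ... from exc` in these import helpers (and the matching change in ultravox.py above) keeps the original `ImportError` chained as `__cause__`, so the traceback in CI shows which optional package was actually missing rather than only the generic "Please install vllm[audio]" hint. A quick sketch of the difference:

```python
def load_audio_backend(suppress_cause: bool) -> None:
    try:
        import librosa  # noqa: F401
    except ImportError as exc:
        if suppress_cause:
            # `from None` hides the underlying error entirely.
            raise ImportError(
                "Please install vllm[audio] for audio support.") from None
        # `from exc` chains it, so the log also names the missing module.
        raise ImportError(
            "Please install vllm[audio] for audio support.") from exc


# With librosa absent, load_audio_backend(False) raises, and the uncaught
# traceback shows both errors:
#   ModuleNotFoundError: No module named 'librosa'
#   The above exception was the direct cause of the following exception:
#   ImportError: Please install vllm[audio] for audio support.
```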
6 changes: 5 additions & 1 deletion vllm/worker/cpu_worker.py
@@ -151,7 +151,11 @@ def __init__(
self.local_omp_cpuid = omp_cpuids.split("|")[rank]

ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner
if self.model_config.is_encoder_decoder:
if self.model_config.task == "embedding":
raise NotImplementedError(
"Embedding models are not supported for CPU backend")
# ModelRunnerClass = CPUEmbeddingModelRunner
elif self.model_config.is_encoder_decoder:
ModelRunnerClass = CPUEncoderDecoderModelRunner
self.model_runner: CPUModelRunner = ModelRunnerClass(
vllm_config=vllm_config,