diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b4226a3ca5749..d9dcacf5d991e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -83,7 +83,6 @@ steps: - label: Entrypoints Test # 20min working_dir: "/vllm-workspace/tests" - soft_fail: true fast_check: true mirror_hardwares: [amd] source_file_dependencies: @@ -96,7 +95,8 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests @@ -178,7 +178,6 @@ steps: - pytest -v -s prefix_caching - label: Samplers Test # 18min - soft_fail: true source_file_dependencies: - vllm/model_executor/layers - vllm/sampling_metadata.py @@ -206,7 +205,6 @@ steps: - label: LoRA Test %N # 30min each mirror_hardwares: [amd] - soft_fail: true source_file_dependencies: - vllm/lora - tests/lora @@ -311,7 +309,6 @@ steps: - pytest -v -s models/decoder_only/language - label: Decoder-only Multi-Modal Models Test # 56min - soft_fail: true #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -463,7 +460,7 @@ steps: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus - pytest -v -s -x lora/test_mixtral.py - label: LM Eval Large Models # optional diff --git a/tests/conftest.py b/tests/conftest.py index 354862e3579ac..db71d8bc3af1e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -699,7 +699,6 @@ def generate_w_logprobs( if videos is not None: for i, video in enumerate(videos): inputs[i]["multi_modal_data"] = {"video": video} - print(f"[INPUTS!!!!]: {inputs}, {sampling_params}") req_outputs = self.model.generate(inputs, sampling_params=sampling_params) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 280a8abdd13a7..9fd1368cc2b59 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -8,8 +8,6 @@ import os import pytest -from packaging import version -from transformers import __version__ as transformers_version from vllm.logger import init_logger @@ -49,11 +47,6 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, pytest.skip("Skipping multi-node pipeline parallel test for " "multiprocessing distributed backend") - # Skip tests that require transformers>=4.45.0 - if "Qwen2-VL" in MODEL_NAME and version.parse( - transformers_version) < version.parse("4.45.0.dev0"): - pytest.skip("This test requires transformers>=4.45.0") - pp_args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py index bff0fc99ed022..bbabb936e92ba 100644 --- a/tests/engine/test_custom_executor.py +++ b/tests/engine/test_custom_executor.py @@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model): @pytest.mark.parametrize("model", ["facebook/opt-125m"]) -def test_custom_executor(model, tmpdir): +def test_custom_executor(model, tmp_path): cwd = os.path.abspath(".") - os.chdir(tmpdir) + os.chdir(tmp_path) try: assert not os.path.exists(".marker") @@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir): @pytest.mark.parametrize("model", ["facebook/opt-125m"]) -def test_custom_executor_async(model, tmpdir): +def test_custom_executor_async(model, tmp_path): cwd = os.path.abspath(".") - os.chdir(tmpdir) + os.chdir(tmp_path) try: assert not os.path.exists(".marker") diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index db31745cc102e..ec550fe82c70f 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -15,6 +15,11 @@ BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] +@dataclass +class MockHFConfig: + model_type: str = "any" + + @dataclass class MockModelConfig: tokenizer = MODEL_NAME @@ -24,6 +29,7 @@ class MockModelConfig: tokenizer_revision = None embedding_mode = False multimodal_config = MultiModalConfig() + hf_config = MockHFConfig() @dataclass diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 2dcad23c2b547..daa39b2a3dba1 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -41,7 +41,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): lora_request) -def test_get_lora_tokenizer(sql_lora_files, tmpdir): +def test_get_lora_tokenizer(sql_lora_files, tmp_path): lora_request = None tokenizer = get_lora_tokenizer(lora_request) assert not tokenizer @@ -50,6 +50,6 @@ def test_get_lora_tokenizer(sql_lora_files, tmpdir): tokenizer = get_lora_tokenizer(lora_request) assert tokenizer.get_added_vocab() - lora_request = LoRARequest("1", 1, str(tmpdir)) + lora_request = LoRARequest("1", 1, str(tmp_path)) tokenizer = get_lora_tokenizer(lora_request) assert not tokenizer diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py index e5c5ce4a8f745..0b71f0d49c70a 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -3,7 +3,6 @@ Run `pytest tests/models/test_granite.py`. """ import pytest -import transformers from ...utils import check_logprobs_close @@ -12,9 +11,6 @@ ] -# GraniteForCausalLM will be in transformers >= 4.45 -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="granite model test requires transformers >= 4.45") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) diff --git a/tests/models/decoder_only/vision_language/test_llava_next_video.py b/tests/models/decoder_only/vision_language/test_llava_next_video.py index d477bcc713611..7b7b23c783e2a 100644 --- a/tests/models/decoder_only/vision_language/test_llava_next_video.py +++ b/tests/models/decoder_only/vision_language/test_llava_next_video.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple, Type, overload import pytest -import transformers from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer from vllm.multimodal.utils import (rescale_video_size, resize_video, @@ -158,8 +157,6 @@ def run_test( ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "size_factors", @@ -203,8 +200,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors, ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "sizes", diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index d1bffddde59ab..978631feacb8c 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple, Type, overload import pytest -import transformers from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, BatchEncoding) @@ -166,8 +165,6 @@ def process(hf_inputs: BatchEncoding): ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "size_factors", @@ -211,8 +208,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors, ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "sizes", @@ -259,7 +254,9 @@ def run_image_test( # max_model_len should be greater than image_feature_size with vllm_runner(model, dtype=dtype, - max_model_len=32768, + max_num_seqs=1, + max_model_len=16384, + gpu_memory_utilization=0.98, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=True, @@ -305,8 +302,8 @@ def process(hf_inputs: BatchEncoding): ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") +# FIXME: Swap to a smaller model for this architecture +@pytest.mark.skip(reason="Model OOMing on CI") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 4b9a1ca44c0d0..b058e2755c245 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -1,15 +1,9 @@ import pytest -import transformers from vllm.model_executor.models import _MODELS, ModelRegistry @pytest.mark.parametrize("model_cls", _MODELS) def test_registry_imports(model_cls): - if (model_cls in ("LlavaOnevisionForConditionalGeneration", - "Qwen2VLForConditionalGeneration") - and transformers.__version__ < "4.45"): - pytest.skip("Waiting for next transformers release") - # Ensure all model classes can be imported successfully ModelRegistry.resolve_model_cls([model_cls]) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 308b708feab71..3342a336a4efa 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,5 +1,6 @@ import itertools import random +from dataclasses import dataclass from typing import Dict, List, Optional, Tuple from unittest.mock import Mock, patch @@ -596,8 +597,19 @@ def test_sampler_top_k_top_p(seed: int, device: str): generation_config = GenerationConfig(top_k=top_k, top_p=top_p, do_sample=True) - warpers = generation_model._get_logits_warper(generation_config, device) - assert len(warpers) == 2 # top_p and top_k + + @dataclass + class MockConfig: + is_encoder_decoder: bool = False + + generation_model.config = MockConfig() # needed by the following method + generation_model._prepare_special_tokens(generation_config, device=device) + processors = generation_model._get_logits_processor(generation_config, + None, + None, + None, [], + device=device) + assert len(processors) == 2 # top_p and top_k seq_group_metadata_list: List[SequenceGroupMetadata] = [] seq_lens: List[int] = [] @@ -639,7 +651,7 @@ def mock_sample(probs, *args, **kwargs): assert sample_probs is not None - hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone()) + hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone()) hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5) assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 254671ef4486a..8b51fc804ad92 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -152,13 +152,13 @@ async def create_chat_completion( **(request.chat_template_kwargs or {}), ) except Exception as e: - logger.error("Error in applying chat template from request: %s", e) + logger.exception("Error in applying chat template from request") return self.create_error_response(str(e)) try: mm_data = await mm_data_future except Exception as e: - logger.error("Error in loading multi-modal data: %s", e) + logger.exception("Error in loading multi-modal data") return self.create_error_response(str(e)) # validation for OpenAI tools diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 2a2d74382e37a..e3b244d06660d 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -1,6 +1,7 @@ import os import warnings from pathlib import Path +from types import MethodType from typing import Optional, Union import huggingface_hub @@ -152,6 +153,29 @@ def get_tokenizer( else: raise e + # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324 + if type(tokenizer).__name__ in ("ChatGLMTokenizer", + "ChatGLM4Tokenizer"): + assert isinstance(tokenizer, PreTrainedTokenizer) + orig_pad = tokenizer._pad + + # Patch _pad method to accept `padding_side` + def _pad( + self: PreTrainedTokenizer, + *args, + padding_side: Optional[str] = None, + **kwargs, + ): + if (padding_side is not None + and padding_side != self.padding_side): + msg = ("`padding_side` argument is not supported by " + "ChatGLMTokenizer and will be ignored.") + warnings.warn(msg, stacklevel=2) + + return orig_pad(*args, **kwargs) + + tokenizer._pad = MethodType(_pad, tokenizer) + if not isinstance(tokenizer, PreTrainedTokenizerFast): logger.warning( "Using a slow tokenizer. This might cause a significant " @@ -167,7 +191,7 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args, return None try: tokenizer = get_tokenizer(lora_request.lora_path, *args, **kwargs) - except OSError as e: + except Exception as e: # No tokenizer was found in the LoRA folder, # use base model tokenizer logger.warning(