From 4f5339734f2b846f067a2e3ad26bf47cada37589 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 25 Sep 2024 17:10:48 -0700 Subject: [PATCH 01/22] [BugFix] Fix test breakages from transformers 4.45 upgrade --- tests/samplers/test_sampler.py | 10 +++++++--- vllm/config.py | 2 +- vllm/transformers_utils/tokenizer.py | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 308b708feab71..97512381203e7 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -596,8 +596,12 @@ def test_sampler_top_k_top_p(seed: int, device: str): generation_config = GenerationConfig(top_k=top_k, top_p=top_p, do_sample=True) - warpers = generation_model._get_logits_warper(generation_config, device) - assert len(warpers) == 2 # top_p and top_k + processors = generation_model._get_logits_processor(generation_config, + None, + None, + None, [], + device=device) + assert len(processors) == 2 # top_p and top_k seq_group_metadata_list: List[SequenceGroupMetadata] = [] seq_lens: List[int] = [] @@ -639,7 +643,7 @@ def mock_sample(probs, *args, **kwargs): assert sample_probs is not None - hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone()) + hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone()) hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5) assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) diff --git a/vllm/config.py b/vllm/config.py index 108badf150c86..80f676d7e1cbf 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1740,7 +1740,7 @@ def _get_and_verify_max_len( "with rope_scaling. Please raise an issue so we can " "investigate.") - if rope_type == "mrope": + if rope_type in ("mrope", "default"): scaling_factor = 1 else: assert "factor" in rope_scaling diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 2a2d74382e37a..0f8e6beec7e3a 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -167,7 +167,7 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args, return None try: tokenizer = get_tokenizer(lora_request.lora_path, *args, **kwargs) - except OSError as e: + except (OSError, ValueError) as e: # No tokenizer was found in the LoRA folder, # use base model tokenizer logger.warning( From e2ae1bb2bd1a70975e053b0ce4a5c6ce0247958a Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 25 Sep 2024 17:22:23 -0700 Subject: [PATCH 02/22] Also fix llava OOM from @ywang96 Co-authored-by: Roger Wang --- .../models/decoder_only/vision_language/test_llava_onevision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index d1bffddde59ab..93038ebc033e7 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -259,6 +259,7 @@ def run_image_test( # max_model_len should be greater than image_feature_size with vllm_runner(model, dtype=dtype, + max_num_seqs=1, max_model_len=32768, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, From 66c0c191453c9f3cf6cff3fe3a5b1be8b9881810 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 25 Sep 2024 19:10:58 -0700 Subject: [PATCH 03/22] Fix next failures --- tests/entrypoints/openai/test_serving_chat.py | 6 ++++++ tests/samplers/test_sampler.py | 8 ++++++++ vllm/entrypoints/openai/serving_chat.py | 4 ++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index db31745cc102e..ec550fe82c70f 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -15,6 +15,11 @@ BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] +@dataclass +class MockHFConfig: + model_type: str = "any" + + @dataclass class MockModelConfig: tokenizer = MODEL_NAME @@ -24,6 +29,7 @@ class MockModelConfig: tokenizer_revision = None embedding_mode = False multimodal_config = MultiModalConfig() + hf_config = MockHFConfig() @dataclass diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 97512381203e7..3342a336a4efa 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,5 +1,6 @@ import itertools import random +from dataclasses import dataclass from typing import Dict, List, Optional, Tuple from unittest.mock import Mock, patch @@ -596,6 +597,13 @@ def test_sampler_top_k_top_p(seed: int, device: str): generation_config = GenerationConfig(top_k=top_k, top_p=top_p, do_sample=True) + + @dataclass + class MockConfig: + is_encoder_decoder: bool = False + + generation_model.config = MockConfig() # needed by the following method + generation_model._prepare_special_tokens(generation_config, device=device) processors = generation_model._get_logits_processor(generation_config, None, None, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 94076ea3a51db..f60a60ed1a843 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -152,13 +152,13 @@ async def create_chat_completion( **(request.chat_template_kwargs or {}), ) except Exception as e: - logger.error("Error in applying chat template from request: %s", e) + logger.exception("Error in applying chat template from request") return self.create_error_response(str(e)) try: mm_data = await mm_data_future except Exception as e: - logger.error("Error in loading multi-modal data: %s", e) + logger.exception("Error in loading multi-modal data") return self.create_error_response(str(e)) # validation for OpenAI tools From a5b289c14b465233c77a91a98a126e285e952439 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 25 Sep 2024 21:17:14 -0700 Subject: [PATCH 04/22] Catch any Exception when attempting to load lora-specific tokenizer --- vllm/transformers_utils/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 0f8e6beec7e3a..fc3d19a28f2b1 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -167,7 +167,7 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args, return None try: tokenizer = get_tokenizer(lora_request.lora_path, *args, **kwargs) - except (OSError, ValueError) as e: + except Exception as e: # No tokenizer was found in the LoRA folder, # use base model tokenizer logger.warning( From ce1d477041a6808a2082e8d311ea1e5f1d4e29ac Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 25 Sep 2024 22:13:58 -0700 Subject: [PATCH 05/22] Change "default" rope scaling type back to "mrope" in HF config --- vllm/config.py | 2 +- vllm/transformers_utils/config.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 80f676d7e1cbf..108badf150c86 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1740,7 +1740,7 @@ def _get_and_verify_max_len( "with rope_scaling. Please raise an issue so we can " "investigate.") - if rope_type in ("mrope", "default"): + if rope_type == "mrope": scaling_factor = 1 else: assert "factor" in rope_scaling diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 3871c0cb8b819..35788155fbcea 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -174,6 +174,14 @@ def get_config( else: raise e + # Replace unrecognized "default" rope scaling type with "mrope" + # See https://github.com/huggingface/transformers/issues/33401 + hf_rope_scaling = getattr(config, "rope_scaling", None) + if hf_rope_scaling is not None: + for type_key in ("type", "rope_type"): + if hf_rope_scaling.get(type_key) == "default": + hf_rope_scaling[type_key] = "mrope" + elif config_format == ConfigFormat.MISTRAL: config = load_params_config(model, revision) else: From 4eaa8e13ff36dca9dcae3718b34fb68946447e66 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 26 Sep 2024 01:22:35 -0700 Subject: [PATCH 06/22] raise gpu mem --- .../models/decoder_only/vision_language/test_llava_onevision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index 93038ebc033e7..6774937a03869 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -261,6 +261,7 @@ def run_image_test( dtype=dtype, max_num_seqs=1, max_model_len=32768, + gpu_memory_utilization=0.95, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=True, From 562f81641a33c626ec51190d7e2da04d1f8fac7d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 26 Sep 2024 14:51:50 +0000 Subject: [PATCH 07/22] Remove unnecessary overwrite --- vllm/transformers_utils/config.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 9ae2c25335d4f..0f20e8d0c8213 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -172,14 +172,6 @@ def get_config( else: raise e - # Replace unrecognized "default" rope scaling type with "mrope" - # See https://github.com/huggingface/transformers/issues/33401 - hf_rope_scaling = getattr(config, "rope_scaling", None) - if hf_rope_scaling is not None: - for type_key in ("type", "rope_type"): - if hf_rope_scaling.get(type_key) == "default": - hf_rope_scaling[type_key] = "mrope" - elif config_format == ConfigFormat.MISTRAL: config = load_params_config(model, revision) else: From 51b9abc9d69667fdd3339b7cdaa8e1573f72ac6d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 26 Sep 2024 14:54:34 +0000 Subject: [PATCH 08/22] Remove unnecessary version guards --- tests/distributed/test_pipeline_parallel.py | 7 ------- tests/models/decoder_only/language/test_granite.py | 4 ---- .../decoder_only/vision_language/test_llava_next_video.py | 5 ----- .../decoder_only/vision_language/test_llava_onevision.py | 7 ------- tests/models/test_registry.py | 6 ------ 5 files changed, 29 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 280a8abdd13a7..9fd1368cc2b59 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -8,8 +8,6 @@ import os import pytest -from packaging import version -from transformers import __version__ as transformers_version from vllm.logger import init_logger @@ -49,11 +47,6 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, pytest.skip("Skipping multi-node pipeline parallel test for " "multiprocessing distributed backend") - # Skip tests that require transformers>=4.45.0 - if "Qwen2-VL" in MODEL_NAME and version.parse( - transformers_version) < version.parse("4.45.0.dev0"): - pytest.skip("This test requires transformers>=4.45.0") - pp_args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py index e5c5ce4a8f745..0b71f0d49c70a 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -3,7 +3,6 @@ Run `pytest tests/models/test_granite.py`. """ import pytest -import transformers from ...utils import check_logprobs_close @@ -12,9 +11,6 @@ ] -# GraniteForCausalLM will be in transformers >= 4.45 -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="granite model test requires transformers >= 4.45") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) diff --git a/tests/models/decoder_only/vision_language/test_llava_next_video.py b/tests/models/decoder_only/vision_language/test_llava_next_video.py index d477bcc713611..7b7b23c783e2a 100644 --- a/tests/models/decoder_only/vision_language/test_llava_next_video.py +++ b/tests/models/decoder_only/vision_language/test_llava_next_video.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple, Type, overload import pytest -import transformers from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer from vllm.multimodal.utils import (rescale_video_size, resize_video, @@ -158,8 +157,6 @@ def run_test( ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "size_factors", @@ -203,8 +200,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors, ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "sizes", diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index 6774937a03869..00adc8f284c16 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple, Type, overload import pytest -import transformers from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, BatchEncoding) @@ -166,8 +165,6 @@ def process(hf_inputs: BatchEncoding): ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "size_factors", @@ -211,8 +208,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors, ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "sizes", @@ -307,8 +302,6 @@ def process(hf_inputs: BatchEncoding): ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 4b9a1ca44c0d0..b058e2755c245 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -1,15 +1,9 @@ import pytest -import transformers from vllm.model_executor.models import _MODELS, ModelRegistry @pytest.mark.parametrize("model_cls", _MODELS) def test_registry_imports(model_cls): - if (model_cls in ("LlavaOnevisionForConditionalGeneration", - "Qwen2VLForConditionalGeneration") - and transformers.__version__ < "4.45"): - pytest.skip("Waiting for next transformers release") - # Ensure all model classes can be imported successfully ModelRegistry.resolve_model_cls([model_cls]) From 8e7f2b632d6546e96252ff6c37f41e9210b73223 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 26 Sep 2024 16:54:10 +0000 Subject: [PATCH 09/22] Update A100 distributed test with new file location (missed in #7820) --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ea8b3d46f1b3f..dd62d9c186629 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -459,7 +459,7 @@ steps: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus - pytest -v -s -x lora/test_mixtral.py - label: LM Eval Large Models # optional From 57b73286658050ef8b9106ba3775214acfac08fc Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 26 Sep 2024 17:01:23 +0000 Subject: [PATCH 10/22] Replace legacy `tmpdir` with modern `tmp_path` fixture --- tests/engine/test_custom_executor.py | 8 ++++---- tests/lora/test_tokenizer_group.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py index bff0fc99ed022..bbabb936e92ba 100644 --- a/tests/engine/test_custom_executor.py +++ b/tests/engine/test_custom_executor.py @@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model): @pytest.mark.parametrize("model", ["facebook/opt-125m"]) -def test_custom_executor(model, tmpdir): +def test_custom_executor(model, tmp_path): cwd = os.path.abspath(".") - os.chdir(tmpdir) + os.chdir(tmp_path) try: assert not os.path.exists(".marker") @@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir): @pytest.mark.parametrize("model", ["facebook/opt-125m"]) -def test_custom_executor_async(model, tmpdir): +def test_custom_executor_async(model, tmp_path): cwd = os.path.abspath(".") - os.chdir(tmpdir) + os.chdir(tmp_path) try: assert not os.path.exists(".marker") diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 2dcad23c2b547..daa39b2a3dba1 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -41,7 +41,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): lora_request) -def test_get_lora_tokenizer(sql_lora_files, tmpdir): +def test_get_lora_tokenizer(sql_lora_files, tmp_path): lora_request = None tokenizer = get_lora_tokenizer(lora_request) assert not tokenizer @@ -50,6 +50,6 @@ def test_get_lora_tokenizer(sql_lora_files, tmpdir): tokenizer = get_lora_tokenizer(lora_request) assert tokenizer.get_added_vocab() - lora_request = LoRARequest("1", 1, str(tmpdir)) + lora_request = LoRARequest("1", 1, str(tmp_path)) tokenizer = get_lora_tokenizer(lora_request) assert not tokenizer From 0ebd4fb690a804853c743f0454069c43cf568af0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 26 Sep 2024 17:33:33 +0000 Subject: [PATCH 11/22] Reduce max_model_len in LLaVA-OneVision test to avoid OOM --- .../decoder_only/vision_language/test_llava_onevision.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index 00adc8f284c16..e1a2fc93d2483 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -255,8 +255,7 @@ def run_image_test( with vllm_runner(model, dtype=dtype, max_num_seqs=1, - max_model_len=32768, - gpu_memory_utilization=0.95, + max_model_len=10240, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=True, From 4a924c88ef432698fa06419faeb37baf9e93b5bb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 26 Sep 2024 17:41:44 +0000 Subject: [PATCH 12/22] Patch `ChatGLMTokenizer._pad` --- vllm/transformers_utils/tokenizer.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index fc3d19a28f2b1..4001535476c13 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -1,6 +1,7 @@ import os import warnings from pathlib import Path +from types import MethodType from typing import Optional, Union import huggingface_hub @@ -152,6 +153,23 @@ def get_tokenizer( else: raise e + if type(tokenizer).__name__ == "ChatGLMTokenizer": + assert isinstance(tokenizer, PreTrainedTokenizer) + orig_pad = tokenizer._pad + + # Patch _pad method to accept `padding_side` + def _pad(self: PreTrainedTokenizer, *args, **kwargs): + padding_side: Optional[str] = kwargs.pop("padding_side") + if (padding_side is not None + and padding_side != self.padding_side): + msg = ("`padding_side` argument is not supported by " + "ChatGLMTokenizer and will be ignored.") + warnings.warn(msg, stacklevel=2) + + return orig_pad(*args, **kwargs) + + tokenizer._pad = MethodType(_pad, tokenizer) + if not isinstance(tokenizer, PreTrainedTokenizerFast): logger.warning( "Using a slow tokenizer. This might cause a significant " From 0c30e87e03b06dc17ff46d8e34de561e431b8d04 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 26 Sep 2024 17:47:00 +0000 Subject: [PATCH 13/22] Run OOT test in a clean process to solve OOM in AMD --- .buildkite/test-pipeline.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index dd62d9c186629..d9dcacf5d991e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -95,7 +95,8 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests From 9f2fac8864fd15ce92eb6c94deb84fe68050bb7c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 26 Sep 2024 17:51:56 +0000 Subject: [PATCH 14/22] Fix insufficient `max_model_len` --- tests/models/decoder_only/vision_language/test_llava_next.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/decoder_only/vision_language/test_llava_next.py b/tests/models/decoder_only/vision_language/test_llava_next.py index f833fe0c8bbb4..d08bf7de0f620 100644 --- a/tests/models/decoder_only/vision_language/test_llava_next.py +++ b/tests/models/decoder_only/vision_language/test_llava_next.py @@ -135,7 +135,7 @@ def _run_test( # max_model_len should be greater than image_feature_size with vllm_runner(model, dtype=dtype, - max_model_len=10240, + max_model_len=16384, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=True, From 2b6948c6c21a045983df073406eb3a384fcb8e74 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 26 Sep 2024 17:54:26 +0000 Subject: [PATCH 15/22] Fix wrong test being updated --- tests/models/decoder_only/vision_language/test_llava_next.py | 2 +- .../models/decoder_only/vision_language/test_llava_onevision.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_llava_next.py b/tests/models/decoder_only/vision_language/test_llava_next.py index d08bf7de0f620..f833fe0c8bbb4 100644 --- a/tests/models/decoder_only/vision_language/test_llava_next.py +++ b/tests/models/decoder_only/vision_language/test_llava_next.py @@ -135,7 +135,7 @@ def _run_test( # max_model_len should be greater than image_feature_size with vllm_runner(model, dtype=dtype, - max_model_len=16384, + max_model_len=10240, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=True, diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index e1a2fc93d2483..2d85f0b8c1d18 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -255,7 +255,7 @@ def run_image_test( with vllm_runner(model, dtype=dtype, max_num_seqs=1, - max_model_len=10240, + max_model_len=16384, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=True, From 45e2b547027581d68d8dc67d65f49af653033299 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 26 Sep 2024 17:59:22 +0000 Subject: [PATCH 16/22] Cleanup --- vllm/transformers_utils/tokenizer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 4001535476c13..ec90bdbc630e6 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -158,8 +158,12 @@ def get_tokenizer( orig_pad = tokenizer._pad # Patch _pad method to accept `padding_side` - def _pad(self: PreTrainedTokenizer, *args, **kwargs): - padding_side: Optional[str] = kwargs.pop("padding_side") + def _pad( + self: PreTrainedTokenizer, + *args, + padding_side: Optional[str] = None, + **kwargs, + ): if (padding_side is not None and padding_side != self.padding_side): msg = ("`padding_side` argument is not supported by " From f0584fa16ac377351f13533030d42f10e1991aa5 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 26 Sep 2024 14:42:28 -0700 Subject: [PATCH 17/22] raise mem --- .../models/decoder_only/vision_language/test_llava_onevision.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index 2d85f0b8c1d18..d7ea790b70cee 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -256,6 +256,7 @@ def run_image_test( dtype=dtype, max_num_seqs=1, max_model_len=16384, + gpu_memory_utilization = 0.98, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=True, From 27b96c1f3e5eb0a0886f80a65f6c04111f22e087 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 26 Sep 2024 14:48:35 -0700 Subject: [PATCH 18/22] format --- .../models/decoder_only/vision_language/test_llava_onevision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index d7ea790b70cee..897001ec2c79c 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -256,7 +256,7 @@ def run_image_test( dtype=dtype, max_num_seqs=1, max_model_len=16384, - gpu_memory_utilization = 0.98, + gpu_memory_utilization=0.98, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=True, From 315ff90bc988f08908f5329ee251298a6054e346 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 26 Sep 2024 16:35:06 -0700 Subject: [PATCH 19/22] remove comment --- tests/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 354862e3579ac..db71d8bc3af1e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -699,7 +699,6 @@ def generate_w_logprobs( if videos is not None: for i, video in enumerate(videos): inputs[i]["multi_modal_data"] = {"video": video} - print(f"[INPUTS!!!!]: {inputs}, {sampling_params}") req_outputs = self.model.generate(inputs, sampling_params=sampling_params) From 8fdad1c5375435948b898c2fa5e992e44d846095 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 26 Sep 2024 16:36:52 -0700 Subject: [PATCH 20/22] skip test --- .../models/decoder_only/vision_language/test_llava_onevision.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index 897001ec2c79c..978631feacb8c 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -302,6 +302,8 @@ def process(hf_inputs: BatchEncoding): ) +# FIXME: Swap to a smaller model for this architecture +@pytest.mark.skip(reason="Model OOMing on CI") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) From 6decd70e91d71376fb170c022e33c9e19233b859 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 26 Sep 2024 16:38:44 -0700 Subject: [PATCH 21/22] revert soft fail --- .buildkite/test-pipeline.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f831297b9d949..d9dcacf5d991e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -83,7 +83,6 @@ steps: - label: Entrypoints Test # 20min working_dir: "/vllm-workspace/tests" - soft_fail: true fast_check: true mirror_hardwares: [amd] source_file_dependencies: @@ -179,7 +178,6 @@ steps: - pytest -v -s prefix_caching - label: Samplers Test # 18min - soft_fail: true source_file_dependencies: - vllm/model_executor/layers - vllm/sampling_metadata.py @@ -207,7 +205,6 @@ steps: - label: LoRA Test %N # 30min each mirror_hardwares: [amd] - soft_fail: true source_file_dependencies: - vllm/lora - tests/lora @@ -312,7 +309,6 @@ steps: - pytest -v -s models/decoder_only/language - label: Decoder-only Multi-Modal Models Test # 56min - soft_fail: true #mirror_hardwares: [amd] source_file_dependencies: - vllm/ From 59bc78d9fd2863471b16b62b4ba6507ec5a92d49 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 26 Sep 2024 23:42:34 +0000 Subject: [PATCH 22/22] Update tokenizer patch --- vllm/transformers_utils/tokenizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index ec90bdbc630e6..e3b244d06660d 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -153,7 +153,9 @@ def get_tokenizer( else: raise e - if type(tokenizer).__name__ == "ChatGLMTokenizer": + # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324 + if type(tokenizer).__name__ in ("ChatGLMTokenizer", + "ChatGLM4Tokenizer"): assert isinstance(tokenizer, PreTrainedTokenizer) orig_pad = tokenizer._pad