From 22ea4374098c45054b36124ee53058b75e105177 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Thu, 12 Sep 2024 14:06:51 -0700
Subject: [PATCH] [Hotfix][Core][VLM] Disable chunked prefill by default and
 prefix caching for multimodal models (#8425)

---
 vllm/engine/arg_utils.py               | 12 +++++++++++-
 vllm/model_executor/models/__init__.py |  4 ++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 6f58c39162087..b5eba9ca3727a 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -843,6 +843,13 @@ def create_engine_config(self) -> EngineConfig:
         device_config = DeviceConfig(device=self.device)
         model_config = self.create_model_config()
 
+        if model_config.is_multimodal_model:
+            if self.enable_prefix_caching:
+                logger.warning(
+                    "--enable-prefix-caching is currently not "
+                    "supported for multimodal models and has been disabled.")
+            self.enable_prefix_caching = False
+
         cache_config = CacheConfig(
             block_size=self.block_size if self.device != "neuron" else
             self.max_model_len,  # neuron needs block_size = max_model_len
@@ -874,7 +881,10 @@ def create_engine_config(self) -> EngineConfig:
             # If not explicitly set, enable chunked prefill by default for
             # long context (> 32K) models. This is to avoid OOM errors in the
             # initial memory profiling phase.
-            if use_long_context:
+
+            # Chunked prefill is currently disabled for multimodal models by
+            # default.
+            if use_long_context and not model_config.is_multimodal_model:
                 is_gpu = device_config.device_type == "cuda"
                 use_sliding_window = (model_config.get_sliding_window()
                                       is not None)
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 2c01eb380c375..250f75b639a5b 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -90,12 +90,12 @@
     "PaliGemmaForConditionalGeneration": ("paligemma",
                                           "PaliGemmaForConditionalGeneration"),
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
-    "UltravoxModel": ("ultravox", "UltravoxModel"),
-    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
     "PixtralForConditionalGeneration": ("pixtral",
                                         "PixtralForConditionalGeneration"),
+    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
     "Qwen2VLForConditionalGeneration": ("qwen2_vl",
                                         "Qwen2VLForConditionalGeneration"),
+    "UltravoxModel": ("ultravox", "UltravoxModel"),
 }
 _CONDITIONAL_GENERATION_MODELS = {
     "BartModel": ("bart", "BartForConditionalGeneration"),
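
Usage note (not part of the patch): a minimal sketch of the behaviour this hotfix introduces, assuming a vLLM build that includes this change and an environment where the model's Hugging Face config can be loaded and a device detected. The model name below is only an example; any model whose config has is_multimodal_model set behaves the same way.

# Hedged sketch, not part of the patch: shows what EngineArgs.create_engine_config()
# now does when prefix caching is requested for a multimodal model.
from vllm.engine.arg_utils import EngineArgs

args = EngineArgs(
    model="llava-hf/llava-1.5-7b-hf",  # example multimodal model (assumption)
    enable_prefix_caching=True,        # explicitly requested by the user
)

config = args.create_engine_config()

# With this patch, create_engine_config() logs the warning
# "--enable-prefix-caching is currently not supported for multimodal models
#  and has been disabled." and forces the flag off before CacheConfig is built.
assert args.enable_prefix_caching is False
assert config.cache_config.enable_prefix_caching is False

# Chunked prefill is likewise no longer enabled by default for multimodal
# models, even when the model's context length exceeds 32K tokens; passing
# --enable-chunked-prefill explicitly is still honored.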