From 2fbbcb79a19788b74b8185add3d640ef868e4625 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 19:31:40 +0800
Subject: [PATCH 01/16] add phi3v resize for dynamic shape and fix torchvision
 requirement

---
 requirements-cpu.txt                |  2 ++
 requirements-cuda.txt               |  1 +
 requirements-test.txt               |  1 -
 vllm/model_executor/models/phi3v.py | 44 ++++++++++++++++++++---------
 4 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 8b7d86e686217..773c5cd4f4ea5 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -1,6 +1,8 @@
 # Common dependencies
 -r requirements-common.txt
+--index-url https://download.pytorch.org/whl/cpu
 
 # Dependencies for x86_64 CPUs
 torch == 2.3.1+cpu
+torchvision == 0.18.1+cpu # required for the image processor of phi3v
 triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 3536179835967..54fac006d6f5f 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -5,5 +5,6 @@
 ray >= 2.9
 nvidia-ml-py # for pynvml package
 torch == 2.3.0
+torchvision == 0.18.0 # required for the image processor of phi3v
 xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
 vllm-flash-attn == 2.5.9 # Requires PyTorch 2.3.0
diff --git a/requirements-test.txt b/requirements-test.txt
index fef0ede7be0ff..8b68e0e939669 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -14,7 +14,6 @@ peft
 requests
 ray
 sentence-transformers # required for embedding
-torchvision # required for the image processor of phi3v
 
 # Benchmarking
 aiohttp
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index fa20a7c5903d6..7162ded074b1e 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -13,28 +13,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
+from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict
 
 import torch
 import torch.nn as nn
-from transformers import CLIPVisionConfig, PretrainedConfig
+from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, VisionLanguageConfig
+from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
+from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import get_dummy_image_data
+from vllm.multimodal.image import ImagePixelData, get_dummy_image_data
 from vllm.sequence import SamplerOutput
 
+logger = init_logger(__name__)
+
 _KEYS_TO_MODIFY_MAPPING = {
     "model.vision_embed_tokens": "vision_embed_tokens",
 }
@@ -71,10 +73,9 @@ def get_img_features(self,
         LAYER_IDX = self.layer_idx
         TYPE_FEATURE = self.type_feature
 
-        # NOTE: we skip the step to select the vision feature layer since
-        # this is already done inside the img_processor
-        img_feature = self.img_processor(img_embeds,
-                                         vision_feature_layer=LAYER_IDX)
+        img_processor_output = self.img_processor(img_embeds,
+                                                  output_hidden_states=True)
+        img_feature = img_processor_output.hidden_states[LAYER_IDX]
 
         if TYPE_FEATURE == "patch":
             patch_feature = img_feature[:, 1:]
@@ -268,7 +269,27 @@ class Phi3VImagePixelInputs(TypedDict):
     """Shape: (batch_size, 2)"""
 
 
-@MULTIMODAL_REGISTRY.register_image_pixel_input()
+def _image_processor(
+    data: ImagePixelData,
+    model_config: ModelConfig,
+    vlm_config: VisionLanguageConfig,
+) -> Dict[str, torch.Tensor]:
+    image = data.image
+
+    # Temporary patch before dynamic number of image tokens is supported
+    _, _, h, w = vlm_config.image_input_shape
+    if (w, h) != (image.width, image.height):
+        logger.warning(
+            "Dynamic image shape is currently not supported. "
+            "Resizing input image to (%d, %d).", w, h)
+
+        data.image = image.resize((w, h))
+
+    return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
+    ._default_input_processor(data, model_config, vlm_config)
+
+
+@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor)
 @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
 class Phi3VForCausalLM(VisionLanguageModelBase):
 
@@ -354,9 +375,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            # post_layernorm is not needed in CLIPVisionModel
-            if "vision_model.post_layernorm" in name:
-                continue
             for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
                 if key_to_modify in name:
                     name = name.replace(key_to_modify, new_key)

From 1579519d271098ab02f42da7eeac1a63afb4280b Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 19:42:25 +0800
Subject: [PATCH 02/16] remove index-url

---
 requirements-cpu.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 773c5cd4f4ea5..5046c26a4acbd 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -1,6 +1,5 @@
 # Common dependencies
 -r requirements-common.txt
---index-url https://download.pytorch.org/whl/cpu
 
 # Dependencies for x86_64 CPUs
 torch == 2.3.1+cpu

From 39f3fef453763e8bff90f8756bc00ac74ae3e7cf Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 19:45:05 +0800
Subject: [PATCH 03/16] revert change by mistake

---
 vllm/model_executor/models/phi3v.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 7162ded074b1e..fa944b11b0058 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -375,6 +375,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
+            # post_layernorm is not needed in CLIPVisionModel
+            if "vision_model.post_layernorm" in name:
+                continue
             for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
                 if key_to_modify in name:
                     name = name.replace(key_to_modify, new_key)

From f048b3e98886281f621b1c4255c460ccb1f45e2d Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 19:46:55 +0800
Subject: [PATCH 04/16] revert change by mistake

---
 vllm/model_executor/models/phi3v.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index fa944b11b0058..52e72db4c630e 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -73,9 +73,10 @@ def get_img_features(self,
         LAYER_IDX = self.layer_idx
         TYPE_FEATURE = self.type_feature
 
-        img_processor_output = self.img_processor(img_embeds,
-                                                  output_hidden_states=True)
-        img_feature = img_processor_output.hidden_states[LAYER_IDX]
+        # NOTE: we skip the step to select the vision feature layer since
+        # this is already done inside the img_processor
+        img_feature = self.img_processor(img_embeds,
+                                         vision_feature_layer=LAYER_IDX)
 
         if TYPE_FEATURE == "patch":
             patch_feature = img_feature[:, 1:]
From 2131fa2c6ce4367582519aa9a54be7f7f8f28184 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 19:47:54 +0800
Subject: [PATCH 05/16] revert change by mistake

---
 vllm/model_executor/models/phi3v.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 52e72db4c630e..5186f8d6197ff 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -17,7 +17,7 @@
 
 import torch
 import torch.nn as nn
-from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig
+from transformers import CLIPVisionConfig, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
@@ -28,6 +28,7 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata

From e11e39571d35853488e702f118c48e2aa5c6e2e9 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 21:02:58 +0800
Subject: [PATCH 06/16] fix phi3v

---
 vllm/model_executor/models/phi3v.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 5186f8d6197ff..204c021c2d559 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -278,6 +278,13 @@ def _image_processor(
 ) -> Dict[str, torch.Tensor]:
     image = data.image
 
+    if isinstance(image, torch.Tensor):
+        pixel_values = image.to(model_config.dtype)
+        batch_size, _, _, h, w = pixel_values.shape
+        image_sizes = torch.tensor([(w, h) for _ in range(batch_size)])
+
+        return {"pixel_values": pixel_values, "image_sizes": image_sizes}
+
     # Temporary patch before dynamic number of image tokens is supported
     _, _, h, w = vlm_config.image_input_shape
     if (w, h) != (image.width, image.height):

From ea71ee874d802be413df57c724e9204086f8a09e Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 22:13:30 +0800
Subject: [PATCH 07/16] fix phi3v

---
 vllm/model_executor/models/phi3v.py | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 204c021c2d559..b89c25e09e073 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -17,6 +17,7 @@
 
 import torch
 import torch.nn as nn
+from PIL import Image
 from transformers import CLIPVisionConfig, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
@@ -278,24 +279,18 @@ def _image_processor(
 ) -> Dict[str, torch.Tensor]:
     image = data.image
 
-    if isinstance(image, torch.Tensor):
-        pixel_values = image.to(model_config.dtype)
-        batch_size, _, _, h, w = pixel_values.shape
-        image_sizes = torch.tensor([(w, h) for _ in range(batch_size)])
+    if isinstance(image, Image.Image):
+        # Temporary patch before dynamic number of image tokens is supported
+        _, _, h, w = vlm_config.image_input_shape
+        if (w//336, h) != (image.width, image.height):
+            logger.warning(
+                "Dynamic image shape is currently not supported. "
+                "Resizing input image to (%d, %d).", w, h)
 
-        return {"pixel_values": pixel_values, "image_sizes": image_sizes}
-
-    # Temporary patch before dynamic number of image tokens is supported
-    _, _, h, w = vlm_config.image_input_shape
-    if (w, h) != (image.width, image.height):
-        logger.warning(
-            "Dynamic image shape is currently not supported. "
-            "Resizing input image to (%d, %d).", w, h)
-
-        data.image = image.resize((w, h))
+            data.image = image.resize((w, h))
 
     return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
-    ._default_input_processor(data, model_config, vlm_config)
+        ._default_input_processor(data, model_config, vlm_config)
 
 
 @MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor)

From 56666d7808c26640feb31a6d44cafc4463e69a1c Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 22:15:24 +0800
Subject: [PATCH 08/16] fix a typo

---
 vllm/model_executor/models/phi3v.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index b89c25e09e073..c935ba6878164 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -282,7 +282,7 @@ def _image_processor(
     if isinstance(image, Image.Image):
         # Temporary patch before dynamic number of image tokens is supported
         _, _, h, w = vlm_config.image_input_shape
-        if (w//336, h) != (image.width, image.height):
+        if (w, h) != (image.width, image.height):
             logger.warning(
                 "Dynamic image shape is currently not supported. "
                 "Resizing input image to (%d, %d).", w, h)

From 1fe6ede3dc210d186fc8ccc9e18afa8fbc3b022b Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 22:37:22 +0800
Subject: [PATCH 09/16] only resize image when hd_transform not matched

---
 vllm/model_executor/models/phi3v.py | 38 ++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index c935ba6878164..3b77180f47f6b 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict
 
+import numpy as np
 import torch
 import torch.nn as nn
 from PIL import Image
@@ -272,6 +273,41 @@ class Phi3VImagePixelInputs(TypedDict):
     """Shape: (batch_size, 2)"""
 
 
+# FIXME(Isotr0py): Remove these function after dynamic number of image tokens is supported
+# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+def calc_padded_size(width, height, padding_unit=336):
+    target_height = int(np.ceil(height / padding_unit) * padding_unit)
+    top_padding = int((target_height - height) / 2)
+    bottom_padding = target_height - height - top_padding
+    padded_width = width
+    padded_height = height + top_padding + bottom_padding
+    return padded_width, padded_height
+
+
+# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+def calc_hd_transform_size(width, height, hd_num=16):
+    transposed = False
+    if width < height:
+        width, height = height, width
+        transposed = True
+
+    ratio = width / height
+    scale = 1
+    while scale * np.ceil(scale / ratio) <= hd_num:
+        scale += 1
+    scale -= 1
+
+    new_width = int(scale * 336)
+    new_height = int(new_width / ratio)
+
+    padded_width, padded_height = calc_padded_size(new_width, new_height)
+
+    if transposed:
+        padded_width, padded_height = padded_height, padded_width
+
+    return padded_width, padded_height
+
+
 def _image_processor(
     data: ImagePixelData,
     model_config: ModelConfig,
@@ -282,7 +318,7 @@ def _image_processor(
     if isinstance(image, Image.Image):
         # Temporary patch before dynamic number of image tokens is supported
         _, _, h, w = vlm_config.image_input_shape
-        if (w, h) != (image.width, image.height):
+        if (w, h) != calc_hd_transform_size(image.width, image.height):
             logger.warning(
                 "Dynamic image shape is currently not supported. "
                 "Resizing input image to (%d, %d).", w, h)
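
[Editor's note: not part of the patch series] The HD transform copied in above snaps an image to a 336-pixel grid under a budget of hd_num=16 tiles, so the resize in _image_processor now fires only when an input's HD-transform size differs from the configured static shape. A standalone sanity check follows; the two helpers are verbatim from PATCH 09, while the driver loop and sample resolutions are illustrative:

# Standalone check of the HD-transform sizing behind PATCH 09's resize guard.
# calc_padded_size/calc_hd_transform_size are copied verbatim from the patch
# (originally from the Phi-3-vision image processor on the HF Hub); the sample
# sizes below are illustrative.
import numpy as np

def calc_padded_size(width, height, padding_unit=336):
    target_height = int(np.ceil(height / padding_unit) * padding_unit)
    top_padding = int((target_height - height) / 2)
    bottom_padding = target_height - height - top_padding
    padded_width = width
    padded_height = height + top_padding + bottom_padding
    return padded_width, padded_height

def calc_hd_transform_size(width, height, hd_num=16):
    transposed = False
    if width < height:
        width, height = height, width
        transposed = True

    ratio = width / height
    scale = 1
    while scale * np.ceil(scale / ratio) <= hd_num:
        scale += 1
    scale -= 1

    new_width = int(scale * 336)
    new_height = int(new_width / ratio)

    padded_width, padded_height = calc_padded_size(new_width, new_height)

    if transposed:
        padded_width, padded_height = padded_height, padded_width

    return padded_width, padded_height

for w, h in [(1344, 1008), (512, 512), (672, 1008)]:
    print((w, h), "->", calc_hd_transform_size(w, h))
# (1344, 1008) -> (1344, 1008): a 4:3 image gives scale 4, since
#     4 * ceil(4 / (4/3)) = 12 <= 16 but 5 * ceil(5 / (4/3)) = 20 > 16,
#     so the default 1008x1344 static shape is a fixed point and skips resizing.
# (512, 512)   -> (1344, 1344): squares give scale 4 (4 * 4 = 16), then 4 * 336.
# (672, 1008)  -> (1008, 1344): portrait input is transposed, scaled, padded back.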
" "Resizing input image to (%d, %d).", w, h) From e4ad67c3face6d2e7948ca94ee0f99867100d76c Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 23 Jun 2024 22:40:24 +0800 Subject: [PATCH 10/16] format code --- vllm/model_executor/models/phi3v.py | 44 ++++++++++++++--------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 3b77180f47f6b..dac832a686c2c 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -273,7 +273,7 @@ class Phi3VImagePixelInputs(TypedDict): """Shape: (batch_size, 2)""" -# FIXME(Isotr0py): Remove these function after dynamic number of image tokens is supported +# FIXME(Isotr0py): Remove these after dynamic num_img_tokens is supported # copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py def calc_padded_size(width, height, padding_unit=336): target_height = int(np.ceil(height / padding_unit) * padding_unit) @@ -285,27 +285,27 @@ def calc_padded_size(width, height, padding_unit=336): # copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py -def calc_hd_transform_size(width, height, hd_num=16): - transposed = False - if width < height: - width, height = height, width - transposed = True - - ratio = width / height - scale = 1 - while scale * np.ceil(scale / ratio) <= hd_num: - scale += 1 - scale -= 1 - - new_width = int(scale * 336) - new_height = int(new_width / ratio) - - padded_width, padded_height = calc_padded_size(new_width, new_height) - - if transposed: - padded_width, padded_height = padded_height, padded_width - - return padded_width, padded_height +def calc_hd_transform_size(width, height, hd_num=16): + transposed = False + if width < height: + width, height = height, width + transposed = True + + ratio = width / height + scale = 1 + while scale * np.ceil(scale / ratio) <= hd_num: + scale += 1 + scale -= 1 + + new_width = int(scale * 336) + new_height = int(new_width / ratio) + + padded_width, padded_height = calc_padded_size(new_width, new_height) + + if transposed: + padded_width, padded_height = padded_height, padded_width + + return padded_width, padded_height def _image_processor( From c987ab501fc246904f5b4476aefbba2be58302db Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 23 Jun 2024 23:47:37 +0800 Subject: [PATCH 11/16] add multi-resolution test --- tests/conftest.py | 13 +++++++++++++ tests/models/test_phi3v.py | 23 +++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 67885b93285c5..fa79adbc34d9e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -101,6 +101,19 @@ def vllm_images(request) -> List[MultiModalData]: ] +@pytest.fixture() +def vllm_multi_res_images(request) -> List[MultiModalData]: + vision_language_config = request.getfixturevalue("model_and_config")[1] + assert vision_language_config.image_input_type == ( + VisionLanguageConfig.ImageInputType.PIXEL_VALUES) + + dynamic_size = [(512, 512), (1024, 1024)] + return [ + ImagePixelData(Image.open(filename).resize(size)) + for filename, size in zip(IMAGE_FILES, dynamic_size) + ] + + @pytest.fixture() def vllm_image_tensors(request) -> List[torch.Tensor]: return [torch.load(filename) for filename in PIXEL_VALUES_FILES] diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 23454759827d5..c95f27a39f4f1 100644 --- 
From 73e3950519d9f12ccf450b24c759f6e98e2ecef5 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 23:48:06 +0800
Subject: [PATCH 12/16] format code

---
 tests/models/test_phi3v.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index c95f27a39f4f1..1132cf05e2af7 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -129,8 +129,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
-def test_multi_res_inputs(vllm_runner, vllm_multi_res_images,
-                          model_and_config, dtype: str, max_tokens: int) -> None:
+def test_multi_res_inputs(vllm_runner, vllm_multi_res_images, model_and_config,
+                          dtype: str, max_tokens: int) -> None:
     model_id, vlm_config = model_and_config
 
     vllm_image_prompts = [
@@ -145,5 +145,5 @@ def test_multi_res_inputs(vllm_runner, vllm_multi_res_images,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
         vllm_model.generate_greedy(vllm_image_prompts,
-                                    max_tokens,
-                                    images=vllm_multi_res_images)
+                                   max_tokens,
+                                   images=vllm_multi_res_images)

From 358513519c736863964830d7b97b2a66d7027bf1 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Mon, 24 Jun 2024 00:22:48 +0800
Subject: [PATCH 13/16] refactor phi3v test

---
 tests/conftest.py          | 13 -------------
 tests/models/test_phi3v.py | 29 +++++------------------------
 2 files changed, 5 insertions(+), 37 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index fa79adbc34d9e..67885b93285c5 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -101,19 +101,6 @@ def vllm_images(request) -> List[MultiModalData]:
     ]
 
 
-@pytest.fixture()
-def vllm_multi_res_images(request) -> List[MultiModalData]:
-    vision_language_config = request.getfixturevalue("model_and_config")[1]
-    assert vision_language_config.image_input_type == (
-        VisionLanguageConfig.ImageInputType.PIXEL_VALUES)
-
-    dynamic_size = [(512, 512), (1024, 1024)]
-    return [
-        ImagePixelData(Image.open(filename).resize(size))
-        for filename, size in zip(IMAGE_FILES, dynamic_size)
-    ]
-
-
 @pytest.fixture()
 def vllm_image_tensors(request) -> List[torch.Tensor]:
     return [torch.load(filename) for filename in PIXEL_VALUES_FILES]
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 1132cf05e2af7..43d304d94a3c6 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -22,6 +22,10 @@ def iter_phi3v_configs(model_name: str):
 
     image_hw_to_feature_size = {
         (1008, 1344): 1921,
+        (336, 336): 2509,
+        (672, 672): 2509,
+        (1344, 336): 2485,
+        (336, 1344): 2557,
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
@@ -108,7 +112,7 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     ]
 
     with vllm_runner(model_id,
-                     max_model_len=2048,
+                     max_model_len=4096,
                      dtype=dtype,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
@@ -124,26 +128,3 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
         assert hf_output_ids == vllm_output_ids, (
             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-
-
-@pytest.mark.parametrize("model_and_config", model_and_vl_config)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-def test_multi_res_inputs(vllm_runner, vllm_multi_res_images, model_and_config,
-                          dtype: str, max_tokens: int) -> None:
-    model_id, vlm_config = model_and_config
-
-    vllm_image_prompts = [
-        p.replace("<|image_1|>",
-                  "<|image|>" * vlm_config.image_feature_size + "<s>")
-        for p in HF_IMAGE_PROMPTS
-    ]
-
-    with vllm_runner(model_id,
-                     max_model_len=2048,
-                     dtype=dtype,
-                     enforce_eager=True,
-                     **vlm_config.as_cli_args_dict()) as vllm_model:
-        vllm_model.generate_greedy(vllm_image_prompts,
-                                   max_tokens,
-                                   images=vllm_multi_res_images)
From bc3a942a4b85f159aa132dbb4a60c83563382989 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Mon, 24 Jun 2024 00:35:45 +0800
Subject: [PATCH 14/16] add comment to requirements

---
 requirements-cpu.txt  | 2 +-
 requirements-cuda.txt | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 5046c26a4acbd..21acee91d7b57 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -3,5 +3,5 @@
 
 # Dependencies for x86_64 CPUs
 torch == 2.3.1+cpu
-torchvision == 0.18.1+cpu # required for the image processor of phi3v
+torchvision == 0.18.1+cpu # required for the image processor of phi3v, this must be updated alongside torch
 triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 54fac006d6f5f..10596ed85d600 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -5,6 +5,7 @@
 ray >= 2.9
 nvidia-ml-py # for pynvml package
 torch == 2.3.0
-torchvision == 0.18.0 # required for the image processor of phi3v
+# These must be updated alongside torch
+torchvision == 0.18.0 # Required for phi3v processor, also see https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
 vllm-flash-attn == 2.5.9 # Requires PyTorch 2.3.0

From 512c341763725352a012f807dad38fe31e46e734 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Mon, 24 Jun 2024 01:29:57 +0800
Subject: [PATCH 15/16] fix broken test

---
 tests/models/test_phi3v.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 43d304d94a3c6..54d28b697da95 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -22,10 +22,7 @@ def iter_phi3v_configs(model_name: str):
 
     image_hw_to_feature_size = {
         (1008, 1344): 1921,
-        (336, 336): 2509,
-        (672, 672): 2509,
-        (1344, 336): 2485,
-        (336, 1344): 2557,
+        (2016, 2688): 1933,
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
@@ -112,7 +109,7 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     ]
 
     with vllm_runner(model_id,
-                     max_model_len=4096,
+                     max_model_len=2048,
                      dtype=dtype,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:

From a7cf5d0fc98170191fe40de764ffd878e5352890 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Mon, 24 Jun 2024 10:18:53 +0800
Subject: [PATCH 16/16] add xfail to phi3v test

---
 tests/models/test_phi3v.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 54d28b697da95..a29d50df4c4e5 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -76,6 +76,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 # TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
 # Since we use _attn_implementation="eager" for hf_runner, here is
 # numeric difference for longer context and test can't pass
+@pytest.mark.xfail(
+    reason="Inconsistent image processor being used due to lack "
+    "of support for dynamic image token replacement")
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
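
[Editor's note: not part of the patch series] The comments added in PATCH 14 encode a real constraint: each torchvision release is built against a specific torch release (0.18.x pairs with 2.3.x, per the pytorch/vision compatibility table the patch links). A small runtime guard one could use when debugging an environment; the assertion wording is ours, not vLLM's:

# Guard against a mismatched torch/torchvision install (the concern behind
# PATCH 14's "must be updated alongside torch" comments).
import torch
import torchvision

print("torch", torch.__version__, "| torchvision", torchvision.__version__)
assert torch.__version__.startswith("2.3."), \
    "requirements pin torch 2.3.x; update torchvision alongside any change"
assert torchvision.__version__.startswith("0.18."), \
    "torchvision 0.18.x is the build matching torch 2.3.x"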