From 2fbbcb79a19788b74b8185add3d640ef868e4625 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 19:31:40 +0800
Subject: [PATCH 01/16] add phi3v resize for dynamic shape and fix torchvision
 requirement

---
 requirements-cpu.txt                |  2 ++
 requirements-cuda.txt               |  1 +
 requirements-test.txt               |  1 -
 vllm/model_executor/models/phi3v.py | 44 ++++++++++++++++++++---------
 4 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 8b7d86e686217..773c5cd4f4ea5 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -1,6 +1,8 @@
 # Common dependencies
 -r requirements-common.txt
+--index-url https://download.pytorch.org/whl/cpu
 
 # Dependencies for x86_64 CPUs
 torch == 2.3.1+cpu
+torchvision == 0.18.1+cpu # required for the image processor of phi3v
 triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 3536179835967..54fac006d6f5f 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -5,5 +5,6 @@
 ray >= 2.9
 nvidia-ml-py # for pynvml package
 torch == 2.3.0
+torchvision == 0.18.0 # required for the image processor of phi3v
 xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
 vllm-flash-attn == 2.5.9 # Requires PyTorch 2.3.0
diff --git a/requirements-test.txt b/requirements-test.txt
index fef0ede7be0ff..8b68e0e939669 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -14,7 +14,6 @@ peft
 requests
 ray
 sentence-transformers # required for embedding
-torchvision # required for the image processor of phi3v
 
 # Benchmarking
 aiohttp
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index fa20a7c5903d6..7162ded074b1e 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -13,28 +13,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
+from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict
 
 import torch
 import torch.nn as nn
-from transformers import CLIPVisionConfig, PretrainedConfig
+from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, VisionLanguageConfig
+from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
+from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import get_dummy_image_data
+from vllm.multimodal.image import ImagePixelData, get_dummy_image_data
 from vllm.sequence import SamplerOutput
 
+logger = init_logger(__name__)
+
 _KEYS_TO_MODIFY_MAPPING = {
     "model.vision_embed_tokens": "vision_embed_tokens",
 }
@@ -71,10 +73,9 @@ def get_img_features(self,
         LAYER_IDX = self.layer_idx
         TYPE_FEATURE = self.type_feature
 
-        # NOTE: we skip the step to select the vision feature layer since
-        # this is already done inside the img_processor
-        img_feature = self.img_processor(img_embeds,
-                                         vision_feature_layer=LAYER_IDX)
+        img_processor_output = self.img_processor(img_embeds,
+                                                  output_hidden_states=True)
+        img_feature = img_processor_output.hidden_states[LAYER_IDX]
 
         if TYPE_FEATURE == "patch":
             patch_feature = img_feature[:, 1:]
@@ -268,7 +269,27 @@ class Phi3VImagePixelInputs(TypedDict):
     """Shape: (batch_size, 2)"""
 
 
-@MULTIMODAL_REGISTRY.register_image_pixel_input()
+def _image_processor(
+    data: ImagePixelData,
+    model_config: ModelConfig,
+    vlm_config: VisionLanguageConfig,
+) -> Dict[str, torch.Tensor]:
+    image = data.image
+
+    # Temporary patch before dynamic number of image tokens is supported
+    _, _, h, w = vlm_config.image_input_shape
+    if (w, h) != (image.width, image.height):
+        logger.warning(
+            "Dynamic image shape is currently not supported. "
+            "Resizing input image to (%d, %d).", w, h)
+
+        data.image = image.resize((w, h))
+
+    return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
+    ._default_input_processor(data, model_config, vlm_config)
+
+
+@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor)
 @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
 class Phi3VForCausalLM(VisionLanguageModelBase):
 
@@ -354,9 +375,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            # post_layernorm is not needed in CLIPVisionModel
-            if "vision_model.post_layernorm" in name:
-                continue
             for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
                 if key_to_modify in name:
                     name = name.replace(key_to_modify, new_key)

From 1579519d271098ab02f42da7eeac1a63afb4280b Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 19:42:25 +0800
Subject: [PATCH 02/16] remove index-url

---
 requirements-cpu.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 773c5cd4f4ea5..5046c26a4acbd 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -1,6 +1,5 @@
 # Common dependencies
 -r requirements-common.txt
---index-url https://download.pytorch.org/whl/cpu
 
 # Dependencies for x86_64 CPUs
 torch == 2.3.1+cpu

From 39f3fef453763e8bff90f8756bc00ac74ae3e7cf Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 19:45:05 +0800
Subject: [PATCH 03/16] revert change by mistake

---
 vllm/model_executor/models/phi3v.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 7162ded074b1e..fa944b11b0058 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -375,6 +375,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
+            # post_layernorm is not needed in CLIPVisionModel
+            if "vision_model.post_layernorm" in name:
+                continue
             for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
                 if key_to_modify in name:
                     name = name.replace(key_to_modify, new_key)

From f048b3e98886281f621b1c4255c460ccb1f45e2d Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 19:46:55 +0800
Subject: [PATCH 04/16] revert change by mistake

---
 vllm/model_executor/models/phi3v.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index fa944b11b0058..52e72db4c630e 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -73,9 +73,10 @@ def get_img_features(self,
         LAYER_IDX = self.layer_idx
         TYPE_FEATURE = self.type_feature
 
-        img_processor_output = self.img_processor(img_embeds,
-                                                  output_hidden_states=True)
-        img_feature = img_processor_output.hidden_states[LAYER_IDX]
+        # NOTE: we skip the step to select the vision feature layer since
+        # this is already done inside the img_processor
+        img_feature = self.img_processor(img_embeds,
+                                         vision_feature_layer=LAYER_IDX)
 
         if TYPE_FEATURE == "patch":
             patch_feature = img_feature[:, 1:]
From 2131fa2c6ce4367582519aa9a54be7f7f8f28184 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 19:47:54 +0800
Subject: [PATCH 05/16] revert change by mistake

---
 vllm/model_executor/models/phi3v.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 52e72db4c630e..5186f8d6197ff 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -17,7 +17,7 @@
 
 import torch
 import torch.nn as nn
-from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig
+from transformers import CLIPVisionConfig, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
@@ -28,6 +28,7 @@
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata

From e11e39571d35853488e702f118c48e2aa5c6e2e9 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 21:02:58 +0800
Subject: [PATCH 06/16] fix phi3v

---
 vllm/model_executor/models/phi3v.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 5186f8d6197ff..204c021c2d559 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -278,6 +278,13 @@ def _image_processor(
 ) -> Dict[str, torch.Tensor]:
     image = data.image
 
+    if isinstance(image, torch.Tensor):
+        pixel_values = image.to(model_config.dtype)
+        batch_size, _, _, h, w = pixel_values.shape
+        image_sizes = torch.tensor([(w, h) for _ in range(batch_size)])
+
+        return {"pixel_values": pixel_values, "image_sizes": image_sizes}
+
     # Temporary patch before dynamic number of image tokens is supported
     _, _, h, w = vlm_config.image_input_shape
     if (w, h) != (image.width, image.height):

From ea71ee874d802be413df57c724e9204086f8a09e Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 22:13:30 +0800
Subject: [PATCH 07/16] fix phi3v

---
 vllm/model_executor/models/phi3v.py | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 204c021c2d559..b89c25e09e073 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -17,6 +17,7 @@
 
 import torch
 import torch.nn as nn
+from PIL import Image
 from transformers import CLIPVisionConfig, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
@@ -278,24 +279,18 @@ def _image_processor(
 ) -> Dict[str, torch.Tensor]:
     image = data.image
 
-    if isinstance(image, torch.Tensor):
-        pixel_values = image.to(model_config.dtype)
-        batch_size, _, _, h, w = pixel_values.shape
-        image_sizes = torch.tensor([(w, h) for _ in range(batch_size)])
+    if isinstance(image, Image.Image):
+        # Temporary patch before dynamic number of image tokens is supported
+        _, _, h, w = vlm_config.image_input_shape
+        if (w//336, h) != (image.width, image.height):
+            logger.warning(
+                "Dynamic image shape is currently not supported. "
+                "Resizing input image to (%d, %d).", w, h)
 
-        return {"pixel_values": pixel_values, "image_sizes": image_sizes}
-
-    # Temporary patch before dynamic number of image tokens is supported
-    _, _, h, w = vlm_config.image_input_shape
-    if (w, h) != (image.width, image.height):
-        logger.warning(
-            "Dynamic image shape is currently not supported. "
-            "Resizing input image to (%d, %d).", w, h)
-
-        data.image = image.resize((w, h))
+            data.image = image.resize((w, h))
 
     return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
-    ._default_input_processor(data, model_config, vlm_config)
+        ._default_input_processor(data, model_config, vlm_config)
 
 
 @MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor)

From 56666d7808c26640feb31a6d44cafc4463e69a1c Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 22:15:24 +0800
Subject: [PATCH 08/16] fix a typo

---
 vllm/model_executor/models/phi3v.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index b89c25e09e073..c935ba6878164 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -282,7 +282,7 @@ def _image_processor(
     if isinstance(image, Image.Image):
         # Temporary patch before dynamic number of image tokens is supported
         _, _, h, w = vlm_config.image_input_shape
-        if (w//336, h) != (image.width, image.height):
+        if (w, h) != (image.width, image.height):
             logger.warning(
                 "Dynamic image shape is currently not supported. "
                 "Resizing input image to (%d, %d).", w, h)

From 1fe6ede3dc210d186fc8ccc9e18afa8fbc3b022b Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 22:37:22 +0800
Subject: [PATCH 09/16] only resize image when hd_transform not matched

---
 vllm/model_executor/models/phi3v.py | 38 ++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index c935ba6878164..3b77180f47f6b 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict
 
+import numpy as np
 import torch
 import torch.nn as nn
 from PIL import Image
@@ -272,6 +273,41 @@ class Phi3VImagePixelInputs(TypedDict):
     """Shape: (batch_size, 2)"""
 
 
+# FIXME(Isotr0py): Remove these function after dynamic number of image tokens is supported
+# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+def calc_padded_size(width, height, padding_unit=336):
+    target_height = int(np.ceil(height / padding_unit) * padding_unit)
+    top_padding = int((target_height - height) / 2)
+    bottom_padding = target_height - height - top_padding
+    padded_width = width
+    padded_height = height + top_padding + bottom_padding
+    return padded_width, padded_height
+
+
+# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+def calc_hd_transform_size(width, height, hd_num=16):
+    transposed = False
+    if width < height:
+        width, height = height, width
+        transposed = True
+
+    ratio = width / height
+    scale = 1
+    while scale * np.ceil(scale / ratio) <= hd_num:
+        scale += 1
+    scale -= 1
+
+    new_width = int(scale * 336)
+    new_height = int(new_width / ratio)
+
+    padded_width, padded_height = calc_padded_size(new_width, new_height)
+
+    if transposed:
+        padded_width, padded_height = padded_height, padded_width
+
+    return padded_width, padded_height
+
+
 def _image_processor(
     data: ImagePixelData,
     model_config: ModelConfig,
@@ -282,7 +318,7 @@ def _image_processor(
     if isinstance(image, Image.Image):
         # Temporary patch before dynamic number of image tokens is supported
         _, _, h, w = vlm_config.image_input_shape
-        if (w, h) != (image.width, image.height):
+        if (w, h) != calc_hd_transform_size(image.width, image.height):
             logger.warning(
                 "Dynamic image shape is currently not supported. "
                 "Resizing input image to (%d, %d).", w, h)
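
[Editor's note: not part of the patch series] The HD transform copied in above snaps an image to a 336-pixel grid under a budget of hd_num=16 tiles, so the resize in _image_processor now fires only when an input's HD-transform size differs from the configured static shape. A standalone sanity check follows; the two helpers are verbatim from PATCH 09, while the driver loop and sample resolutions are illustrative:

# Standalone check of the HD-transform sizing behind PATCH 09's resize guard.
# calc_padded_size/calc_hd_transform_size are copied verbatim from the patch
# (originally from the Phi-3-vision image processor on the HF Hub); the sample
# sizes below are illustrative.
import numpy as np

def calc_padded_size(width, height, padding_unit=336):
    target_height = int(np.ceil(height / padding_unit) * padding_unit)
    top_padding = int((target_height - height) / 2)
    bottom_padding = target_height - height - top_padding
    padded_width = width
    padded_height = height + top_padding + bottom_padding
    return padded_width, padded_height

def calc_hd_transform_size(width, height, hd_num=16):
    transposed = False
    if width < height:
        width, height = height, width
        transposed = True

    ratio = width / height
    scale = 1
    while scale * np.ceil(scale / ratio) <= hd_num:
        scale += 1
    scale -= 1

    new_width = int(scale * 336)
    new_height = int(new_width / ratio)

    padded_width, padded_height = calc_padded_size(new_width, new_height)

    if transposed:
        padded_width, padded_height = padded_height, padded_width

    return padded_width, padded_height

for w, h in [(1344, 1008), (512, 512), (672, 1008)]:
    print((w, h), "->", calc_hd_transform_size(w, h))
# (1344, 1008) -> (1344, 1008): a 4:3 image gives scale 4, since
#     4 * ceil(4 / (4/3)) = 12 <= 16 but 5 * ceil(5 / (4/3)) = 20 > 16,
#     so the default 1008x1344 static shape is a fixed point and skips resizing.
# (512, 512)   -> (1344, 1344): squares give scale 4 (4 * 4 = 16), then 4 * 336.
# (672, 1008)  -> (1008, 1344): portrait input is transposed, scaled, padded back.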
" "Resizing input image to (%d, %d).", w, h) From e4ad67c3face6d2e7948ca94ee0f99867100d76c Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 23 Jun 2024 22:40:24 +0800 Subject: [PATCH 10/16] format code --- vllm/model_executor/models/phi3v.py | 44 ++++++++++++++--------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 3b77180f47f6b..dac832a686c2c 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -273,7 +273,7 @@ class Phi3VImagePixelInputs(TypedDict): """Shape: (batch_size, 2)""" -# FIXME(Isotr0py): Remove these function after dynamic number of image tokens is supported +# FIXME(Isotr0py): Remove these after dynamic num_img_tokens is supported # copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py def calc_padded_size(width, height, padding_unit=336): target_height = int(np.ceil(height / padding_unit) * padding_unit) @@ -285,27 +285,27 @@ def calc_padded_size(width, height, padding_unit=336): # copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py -def calc_hd_transform_size(width, height, hd_num=16): - transposed = False - if width < height: - width, height = height, width - transposed = True - - ratio = width / height - scale = 1 - while scale * np.ceil(scale / ratio) <= hd_num: - scale += 1 - scale -= 1 - - new_width = int(scale * 336) - new_height = int(new_width / ratio) - - padded_width, padded_height = calc_padded_size(new_width, new_height) - - if transposed: - padded_width, padded_height = padded_height, padded_width - - return padded_width, padded_height +def calc_hd_transform_size(width, height, hd_num=16): + transposed = False + if width < height: + width, height = height, width + transposed = True + + ratio = width / height + scale = 1 + while scale * np.ceil(scale / ratio) <= hd_num: + scale += 1 + scale -= 1 + + new_width = int(scale * 336) + new_height = int(new_width / ratio) + + padded_width, padded_height = calc_padded_size(new_width, new_height) + + if transposed: + padded_width, padded_height = padded_height, padded_width + + return padded_width, padded_height def _image_processor( From c987ab501fc246904f5b4476aefbba2be58302db Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 23 Jun 2024 23:47:37 +0800 Subject: [PATCH 11/16] add multi-resolution test --- tests/conftest.py | 13 +++++++++++++ tests/models/test_phi3v.py | 23 +++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 67885b93285c5..fa79adbc34d9e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -101,6 +101,19 @@ def vllm_images(request) -> List[MultiModalData]: ] +@pytest.fixture() +def vllm_multi_res_images(request) -> List[MultiModalData]: + vision_language_config = request.getfixturevalue("model_and_config")[1] + assert vision_language_config.image_input_type == ( + VisionLanguageConfig.ImageInputType.PIXEL_VALUES) + + dynamic_size = [(512, 512), (1024, 1024)] + return [ + ImagePixelData(Image.open(filename).resize(size)) + for filename, size in zip(IMAGE_FILES, dynamic_size) + ] + + @pytest.fixture() def vllm_image_tensors(request) -> List[torch.Tensor]: return [torch.load(filename) for filename in PIXEL_VALUES_FILES] diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 23454759827d5..c95f27a39f4f1 100644 --- 
From 73e3950519d9f12ccf450b24c759f6e98e2ecef5 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 23 Jun 2024 23:48:06 +0800
Subject: [PATCH 12/16] format code

---
 tests/models/test_phi3v.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index c95f27a39f4f1..1132cf05e2af7 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -129,8 +129,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
-def test_multi_res_inputs(vllm_runner, vllm_multi_res_images,
-                          model_and_config, dtype: str, max_tokens: int) -> None:
+def test_multi_res_inputs(vllm_runner, vllm_multi_res_images, model_and_config,
+                          dtype: str, max_tokens: int) -> None:
     model_id, vlm_config = model_and_config
 
     vllm_image_prompts = [
@@ -145,5 +145,5 @@ def test_multi_res_inputs(vllm_runner, vllm_multi_res_images,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
         vllm_model.generate_greedy(vllm_image_prompts,
-                                    max_tokens,
-                                    images=vllm_multi_res_images)
+                                   max_tokens,
+                                   images=vllm_multi_res_images)

From 358513519c736863964830d7b97b2a66d7027bf1 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Mon, 24 Jun 2024 00:22:48 +0800
Subject: [PATCH 13/16] refactor phi3v test

---
 tests/conftest.py          | 13 -------------
 tests/models/test_phi3v.py | 29 +++++------------------------
 2 files changed, 5 insertions(+), 37 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index fa79adbc34d9e..67885b93285c5 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -101,19 +101,6 @@ def vllm_images(request) -> List[MultiModalData]:
     ]
 
 
-@pytest.fixture()
-def vllm_multi_res_images(request) -> List[MultiModalData]:
-    vision_language_config = request.getfixturevalue("model_and_config")[1]
-    assert vision_language_config.image_input_type == (
-        VisionLanguageConfig.ImageInputType.PIXEL_VALUES)
-
-    dynamic_size = [(512, 512), (1024, 1024)]
-    return [
-        ImagePixelData(Image.open(filename).resize(size))
-        for filename, size in zip(IMAGE_FILES, dynamic_size)
-    ]
-
-
 @pytest.fixture()
 def vllm_image_tensors(request) -> List[torch.Tensor]:
     return [torch.load(filename) for filename in PIXEL_VALUES_FILES]
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 1132cf05e2af7..43d304d94a3c6 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -22,6 +22,10 @@ def iter_phi3v_configs(model_name: str):
 
     image_hw_to_feature_size = {
         (1008, 1344): 1921,
+        (336, 336): 2509,
+        (672, 672): 2509,
+        (1344, 336): 2485,
+        (336, 1344): 2557,
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
@@ -108,7 +112,7 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     ]
 
     with vllm_runner(model_id,
-                     max_model_len=2048,
+                     max_model_len=4096,
                      dtype=dtype,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
@@ -124,26 +128,3 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
         assert hf_output_ids == vllm_output_ids, (
             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-
-
-@pytest.mark.parametrize("model_and_config", model_and_vl_config)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-def test_multi_res_inputs(vllm_runner, vllm_multi_res_images, model_and_config,
-                          dtype: str, max_tokens: int) -> None:
-    model_id, vlm_config = model_and_config
-
-    vllm_image_prompts = [
-        p.replace("<|image_1|>",
-                  "<|image|>" * vlm_config.image_feature_size + "<s>")
-        for p in HF_IMAGE_PROMPTS
-    ]
-
-    with vllm_runner(model_id,
-                     max_model_len=2048,
-                     dtype=dtype,
-                     enforce_eager=True,
-                     **vlm_config.as_cli_args_dict()) as vllm_model:
-        vllm_model.generate_greedy(vllm_image_prompts,
-                                   max_tokens,
-                                   images=vllm_multi_res_images)
From bc3a942a4b85f159aa132dbb4a60c83563382989 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Mon, 24 Jun 2024 00:35:45 +0800
Subject: [PATCH 14/16] add comment to requirements

---
 requirements-cpu.txt  | 2 +-
 requirements-cuda.txt | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 5046c26a4acbd..21acee91d7b57 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -3,5 +3,5 @@
 
 # Dependencies for x86_64 CPUs
 torch == 2.3.1+cpu
-torchvision == 0.18.1+cpu # required for the image processor of phi3v
+torchvision == 0.18.1+cpu # required for the image processor of phi3v, this must be updated alongside torch
 triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 54fac006d6f5f..10596ed85d600 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -5,6 +5,7 @@
 ray >= 2.9
 nvidia-ml-py # for pynvml package
 torch == 2.3.0
-torchvision == 0.18.0 # required for the image processor of phi3v
+# These must be updated alongside torch
+torchvision == 0.18.0 # Required for phi3v processor, also see https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
 vllm-flash-attn == 2.5.9 # Requires PyTorch 2.3.0

From 512c341763725352a012f807dad38fe31e46e734 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Mon, 24 Jun 2024 01:29:57 +0800
Subject: [PATCH 15/16] fix broken test

---
 tests/models/test_phi3v.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 43d304d94a3c6..54d28b697da95 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -22,10 +22,7 @@ def iter_phi3v_configs(model_name: str):
 
     image_hw_to_feature_size = {
         (1008, 1344): 1921,
-        (336, 336): 2509,
-        (672, 672): 2509,
-        (1344, 336): 2485,
-        (336, 1344): 2557,
+        (2016, 2688): 1933,
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
@@ -112,7 +109,7 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     ]
 
     with vllm_runner(model_id,
-                     max_model_len=4096,
+                     max_model_len=2048,
                      dtype=dtype,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:

From a7cf5d0fc98170191fe40de764ffd878e5352890 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Mon, 24 Jun 2024 10:18:53 +0800
Subject: [PATCH 16/16] add xfail to phi3v test

---
 tests/models/test_phi3v.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 54d28b697da95..a29d50df4c4e5 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -76,6 +76,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 # TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
 # Since we use _attn_implementation="eager" for hf_runner, here is
 # numeric difference for longer context and test can't pass
+@pytest.mark.xfail(
+    reason="Inconsistent image processor being used due to lack "
+    "of support for dynamic image token replacement")
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
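
[Editor's note: not part of the patch series] The comments added in PATCH 14 encode a real constraint: each torchvision release is built against a specific torch release (0.18.x pairs with 2.3.x, per the pytorch/vision compatibility table the patch links). A small runtime guard one could use when debugging an environment; the assertion wording is ours, not vLLM's:

# Guard against a mismatched torch/torchvision install (the concern behind
# PATCH 14's "must be updated alongside torch" comments).
import torch
import torchvision

print("torch", torch.__version__, "| torchvision", torchvision.__version__)
assert torch.__version__.startswith("2.3."), \
    "requirements pin torch 2.3.x; update torchvision alongside any change"
assert torchvision.__version__.startswith("0.18."), \
    "torchvision 0.18.x is the build matching torch 2.3.x"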