From b3708d79bba987358456d9366cc7ada84107e1d1 Mon Sep 17 00:00:00 2001
From: Somasundaram <somasundaram.sindhu@gmail.com>
Date: Fri, 5 Jul 2024 19:38:21 -0700
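Subject: [PATCH] [unittest] add assertions to unit test cases

Besides the new assertions, this patch also:
- makes get_image_text_prompt emit a single image placeholder token
  instead of repeating it image_feature_size times
- removes the image_* vision-language options from VllmRbProperties and
  from the EngineArgs built in get_engine_args_from_config
- passes the image to vLLM as {"image": ...} instead of ImagePixelData
- renames test_parse_multimodal.py to test_multimodal_utils.py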
---
 .../setup/djl_python/multimodal/utils.py       |  5 +----
 .../properties_manager/vllm_rb_properties.py   |  9 ---------
 .../rolling_batch/rolling_batch_vllm_utils.py  | 12 +++---------
 ..._multimodal.py => test_multimodal_utils.py} | 18 ++++++++++++++++--
 4 files changed, 20 insertions(+), 24 deletions(-)
 rename engines/python/setup/djl_python/tests/multimodal/{test_parse_multimodal.py => test_multimodal_utils.py} (74%)
diff --git a/engines/python/setup/djl_python/multimodal/utils.py b/engines/python/setup/djl_python/multimodal/utils.py
index 82d1097189..38afb6cd32 100644
--- a/engines/python/setup/djl_python/multimodal/utils.py
+++ b/engines/python/setup/djl_python/multimodal/utils.py
@@ -25,11 +25,8 @@ def get_image_text_prompt(prompt_text: str) -> str:
     # TODO: image token str must be decoded from image_token_id in serving.properties. Change it after refactor PR.
     image_token_str = '<image>'
 
-    # TODO: image_feature_size should be referred from serving.properties. Change it after refactor PR.
-    image_feature_size = 1176
-
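-    # TODO: Remove image_token_str*1176 after vllm next release, as the image placeholder is not needed.
+    # TODO: Drop the image_token_str prefix after vllm next release, as the image placeholder will not be needed.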
-    return f"{image_token_str*image_feature_size}\n{prompt_text}"
+    return f"{image_token_str}\n{prompt_text}"
 
 
 def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
diff --git a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
index 38d84d9351..4f0804ab30 100644
--- a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
+++ b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
@@ -48,15 +48,6 @@ class VllmRbProperties(Properties):
     device: Optional[str] = None
     preloaded_model: Optional[Any] = None
 
-    # Vision language configurations
-    # TODO: remove this after vLLM next release
-    image_token_id: Optional[int] = None
-    image_input_type: Optional[str] = None
-    image_input_shape: Optional[str] = None
-    image_feature_size: Optional[int] = None
-    image_processor: Optional[str] = None
-    image_processor_revision: Optional[str] = None
-
     @field_validator('engine')
     def validate_engine(cls, engine):
         if engine != "Python":
diff --git a/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py b/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py
index 6bac5a458e..d4dd44c5d0 100644
--- a/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py
+++ b/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py
@@ -16,7 +16,6 @@
 from vllm import EngineArgs
 from vllm.outputs import CompletionOutput, RequestOutput as vLLMRequestOutput
 from vllm.lora.request import LoRARequest
-from vllm.multimodal.image import ImagePixelData
 from vllm.inputs import PromptInputs
 
 from djl_python.request_io import Token, Sequence
@@ -227,13 +226,7 @@ def get_engine_args_from_config(config: VllmRbProperties) -> EngineArgs:
             max_lora_rank=config.max_lora_rank,
             lora_extra_vocab_size=config.lora_extra_vocab_size,
             max_cpu_loras=config.max_cpu_loras,
-            revision=config.revision,
-            image_input_type=config.image_input_type,
-            image_token_id=config.image_token_id,
-            image_input_shape=config.image_input_shape,
-            image_feature_size=config.image_feature_size,
-            image_processor=config.image_processor,
-            image_processor_revision=config.image_processor_revision)
+            revision=config.revision)
 
 
 def get_multi_modal_data(request: Request) -> dict:
@@ -241,7 +234,8 @@ def get_multi_modal_data(request: Request) -> dict:
     images = parameters.pop("images", None)
     multi_modal_data = None
     if images:
-        multi_modal_data = ImagePixelData(images[0])
+        # vLLM only supports one image per request.
+        multi_modal_data = {"image": images[0]}
     return multi_modal_data
 
 
diff --git a/engines/python/setup/djl_python/tests/multimodal/test_parse_multimodal.py b/engines/python/setup/djl_python/tests/multimodal/test_multimodal_utils.py
similarity index 74%
rename from engines/python/setup/djl_python/tests/multimodal/test_parse_multimodal.py
rename to engines/python/setup/djl_python/tests/multimodal/test_multimodal_utils.py
index cbeac1c1f4..6c74768c40 100644
--- a/engines/python/setup/djl_python/tests/multimodal/test_parse_multimodal.py
+++ b/engines/python/setup/djl_python/tests/multimodal/test_multimodal_utils.py
@@ -17,7 +17,7 @@
 )
 
 
-class TestLmiDist(unittest.TestCase):
+class TestMultiModalUtils(unittest.TestCase):
 
     def test_open_ai_format_parse(self):
         image_url = "https://resources.djl.ai/images/dog_bike_car.jpg"
@@ -45,7 +45,21 @@ def test_open_ai_format_parse(self):
                                                         is_rolling_batch=True,
                                                         tokenizer=tokenizer)
         print(inputs)
+        image_token = "<image>"  # single placeholder: get_image_text_prompt no longer repeats it
+        self.assertEqual(
+            f"<|im_start|>user\n{image_token}\nWhat’s in this image?<|im_end|>\n",
+            inputs)
         images = params.pop("images", None)
         for image in images:
             print(image)
-        print(params)
+        self.assertEqual(
+            {
+                'frequency_penalty': 0.0,
+                'presence_penalty': 0.0,
+                'stream': False,
+                'temperature': 1.0,
+                'top_p': 1.0,
+                'do_sample': True,
+                'details': True,
+                'output_formatter': 'json_chat'
+            }, params)