diff --git a/engines/python/setup/djl_python/multimodal/utils.py b/engines/python/setup/djl_python/multimodal/utils.py index 82d1097189..38afb6cd32 100644 --- a/engines/python/setup/djl_python/multimodal/utils.py +++ b/engines/python/setup/djl_python/multimodal/utils.py @@ -25,11 +25,8 @@ def get_image_text_prompt(prompt_text: str) -> str: # TODO: image token str must be decoded from image_token_id in serving.properties. Change it after refactor PR. image_token_str = '' - # TODO: image_feature_size should be referred from serving.properties. Change it after refactor PR. - image_feature_size = 1176 - # TODO: Remove image_token_str*1176 after vllm next release, as the image placeholder is not needed. - return f"{image_token_str*image_feature_size}\n{prompt_text}" + return f"{image_token_str}\n{prompt_text}" def load_image_from_base64(image: Union[bytes, str]) -> Image.Image: diff --git a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py index 38d84d9351..4f0804ab30 100644 --- a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py +++ b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py @@ -48,15 +48,6 @@ class VllmRbProperties(Properties): device: Optional[str] = None preloaded_model: Optional[Any] = None - # Vision language configurations - # TODO: remove this after vLLM next release - image_token_id: Optional[int] = None - image_input_type: Optional[str] = None - image_input_shape: Optional[str] = None - image_feature_size: Optional[int] = None - image_processor: Optional[str] = None - image_processor_revision: Optional[str] = None - @field_validator('engine') def validate_engine(cls, engine): if engine != "Python": diff --git a/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py b/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py index 6bac5a458e..d4dd44c5d0 100644 --- a/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py +++ b/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py @@ -16,7 +16,6 @@ from vllm import EngineArgs from vllm.outputs import CompletionOutput, RequestOutput as vLLMRequestOutput from vllm.lora.request import LoRARequest -from vllm.multimodal.image import ImagePixelData from vllm.inputs import PromptInputs from djl_python.request_io import Token, Sequence @@ -227,13 +226,7 @@ def get_engine_args_from_config(config: VllmRbProperties) -> EngineArgs: max_lora_rank=config.max_lora_rank, lora_extra_vocab_size=config.lora_extra_vocab_size, max_cpu_loras=config.max_cpu_loras, - revision=config.revision, - image_input_type=config.image_input_type, - image_token_id=config.image_token_id, - image_input_shape=config.image_input_shape, - image_feature_size=config.image_feature_size, - image_processor=config.image_processor, - image_processor_revision=config.image_processor_revision) + revision=config.revision) def get_multi_modal_data(request: Request) -> dict: @@ -241,7 +234,8 @@ def get_multi_modal_data(request: Request) -> dict: images = parameters.pop("images", None) multi_modal_data = None if images: - multi_modal_data = ImagePixelData(images[0]) + # vLLM only supports one image per request. + multi_modal_data = {"image": images[0]} return multi_modal_data diff --git a/engines/python/setup/djl_python/tests/multimodal/test_parse_multimodal.py b/engines/python/setup/djl_python/tests/multimodal/test_multimodal_utils.py similarity index 74% rename from engines/python/setup/djl_python/tests/multimodal/test_parse_multimodal.py rename to engines/python/setup/djl_python/tests/multimodal/test_multimodal_utils.py index cbeac1c1f4..6c74768c40 100644 --- a/engines/python/setup/djl_python/tests/multimodal/test_parse_multimodal.py +++ b/engines/python/setup/djl_python/tests/multimodal/test_multimodal_utils.py @@ -17,7 +17,7 @@ ) -class TestLmiDist(unittest.TestCase): +class TestMultiModalUtils(unittest.TestCase): def test_open_ai_format_parse(self): image_url = "https://resources.djl.ai/images/dog_bike_car.jpg" @@ -45,7 +45,21 @@ def test_open_ai_format_parse(self): is_rolling_batch=True, tokenizer=tokenizer) print(inputs) + image_token = "" + self.assertEqual( + f"<|im_start|>user\n{image_token*1176}\nWhat’s in this image?<|im_end|>\n", + inputs) images = params.pop("images", None) for image in images: print(image) - print(params) + self.assertEqual( + { + 'frequency_penalty': 0.0, + 'presence_penalty': 0.0, + 'stream': False, + 'temperature': 1.0, + 'top_p': 1.0, + 'do_sample': True, + 'details': True, + 'output_formatter': 'json_chat' + }, params)