From d95701b230b67b6fce4de69f7c171bc4ee85dbf9 Mon Sep 17 00:00:00 2001
From: yangshen
Date: Thu, 15 Aug 2024 15:10:40 +0000
Subject: [PATCH] Initial support for llava-next-video

* Inference with llava-hf/llava-next-video* (with bugs)
* Add VideoPlugin.
* Add example for llava-next-video
---
 examples/offline_inference_vision_language.py |  67 ++-
 vllm/assets/video.py                          |  81 ++++
 vllm/model_executor/models/__init__.py        |   2 +
 .../model_executor/models/llava_next_video.py | 404 ++++++++++++++++++
 vllm/multimodal/registry.py                   |   3 +-
 vllm/multimodal/video.py                      |  44 ++
 6 files changed, 590 insertions(+), 11 deletions(-)
 create mode 100644 vllm/assets/video.py
 create mode 100644 vllm/model_executor/models/llava_next_video.py
 create mode 100644 vllm/multimodal/video.py

diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index 9a0e9d4bc5362..ee8553803c7ad 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -9,19 +9,15 @@
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
 from vllm.utils import FlexibleArgumentParser
 
-# Input image and question
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-question = "What is the content of this image?"
-
-
 # LLaVA-1.5
 def run_llava(question):
 
     prompt = f"USER: <image>\n{question}\nASSISTANT:"
 
-    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf")
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
 
@@ -34,6 +30,13 @@ def run_llava_next(question):
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
+
+# LLaVA-NeXT-Video
+# Currently only supports video input
+def run_llava_next_video(question):
+    prompt = f"[INST]
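
Note: for context, the example this patch adds is expected to be driven roughly as follows. This is a hedged sketch, not code from the patch itself: the VideoAsset constructor arguments, the .np_ndarrays property, and the "video" key in multi_modal_data are assumptions inferred from the files the patch creates (vllm/assets/video.py, vllm/multimodal/video.py), not confirmed API.

# Sketch: drive the new LLaVA-NeXT-Video example end to end.
# Assumed (not shown in this excerpt): VideoAsset's signature, its
# .np_ndarrays property, and the "video" multi-modal key registered
# by the new VideoPlugin.
from vllm import LLM, SamplingParams
from vllm.assets.video import VideoAsset

llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf")
prompt = "[INST] <video>\nWhy is this video funny? [/INST]"

# Decode a bundled sample clip into a stack of RGB frames
# (asset name and frame count here are placeholders).
video = VideoAsset(name="sample_demo_1.mp4", num_frames=16).np_ndarrays

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"video": video}},
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)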