From a8d4dc6fe86fa4a403a6833d49c5b0528d3811c8 Mon Sep 17 00:00:00 2001
From: yangshen
Date: Thu, 15 Aug 2024 15:10:40 +0000
Subject: [PATCH 01/28] Initial support for llava-next-video

* Inference with llava-hf/llava-next-video
* Add VideoPlugin
* Add example for llava-next-video
* Set the default video input type to np.ndarray
* Support a single video per prompt, with variable length

---
 examples/offline_inference_vision_language.py |  78 ++-
 vllm/assets/video.py                          |  82 ++++
 vllm/model_executor/models/__init__.py        |   6 +-
 .../model_executor/models/llava_next_video.py | 447 ++++++++++++++++++
 vllm/multimodal/registry.py                   |   3 +-
 vllm/multimodal/video.py                      |  70 +++
 vllm/transformers_utils/image_processor.py    |  27 ++
 7 files changed, 698 insertions(+), 15 deletions(-)
 create mode 100644 vllm/assets/video.py
 create mode 100644 vllm/model_executor/models/llava_next_video.py
 create mode 100644 vllm/multimodal/video.py

diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index aa1580343aee7..94fa4a9ae440b 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -9,19 +9,16 @@
 
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
 from vllm.utils import FlexibleArgumentParser
 
-# Input image and question
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-question = "What is the content of this image?"
-
 
 # LLaVA-1.5
 def run_llava(question):
     prompt = f"USER: <image>\n{question}\nASSISTANT:"
 
     llm = LLM(model="llava-hf/llava-1.5-7b-hf")
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
 
@@ -30,7 +27,18 @@ def run_llava(question):
 def run_llava_next(question):
     prompt = f"[INST] <image>\n{question} [/INST]"
 
-    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
+    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
+
+
+# LLaVA-NeXT-Video
+# Currently only supports video input
+def run_llava_next_video(question):
+    # prompt = f"[INST]