From a8d4dc6fe86fa4a403a6833d49c5b0528d3811c8 Mon Sep 17 00:00:00 2001
From: yangshen
Date: Thu, 15 Aug 2024 15:10:40 +0000
Subject: [PATCH 01/28] Initial support for llava-next-video

* Inference with llava-hf/llava-next-video
* Add VideoPlugin
* Add example for llava-next-video
* Set the default video input type to np.ndarray
* Support a single video per prompt, with variable length

---
 examples/offline_inference_vision_language.py |  78 ++-
 vllm/assets/video.py                          |  82 ++++
 vllm/model_executor/models/__init__.py        |   6 +-
 .../model_executor/models/llava_next_video.py | 447 ++++++++++++++++++
 vllm/multimodal/registry.py                   |   3 +-
 vllm/multimodal/video.py                      |  70 +++
 vllm/transformers_utils/image_processor.py    |  27 ++
 7 files changed, 698 insertions(+), 15 deletions(-)
 create mode 100644 vllm/assets/video.py
 create mode 100644 vllm/model_executor/models/llava_next_video.py
 create mode 100644 vllm/multimodal/video.py

diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index aa1580343aee7..94fa4a9ae440b 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -9,19 +9,16 @@
 
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
 from vllm.utils import FlexibleArgumentParser
 
-# Input image and question
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-question = "What is the content of this image?"
-
 
 # LLaVA-1.5
 def run_llava(question):
     prompt = f"USER: <image>\n{question}\nASSISTANT:"
 
     llm = LLM(model="llava-hf/llava-1.5-7b-hf")
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
 
@@ -30,7 +27,18 @@ def run_llava(question):
 def run_llava_next(question):
     prompt = f"[INST] <image>\n{question} [/INST]"
 
-    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
+    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
+
+
+# LLaVA-NeXT-Video
+# Currently only supports video input
+def run_llava_next_video(question):
+    # prompt = f"[INST]