From d95701b230b67b6fce4de69f7c171bc4ee85dbf9 Mon Sep 17 00:00:00 2001
From: yangshen
Date: Thu, 15 Aug 2024 15:10:40 +0000
Subject: [PATCH] Initial support for llava-next-video

* Inference with llava-hf/llava-next-video* (with bugs)
* Add VideoPlugin.
* Add example for llava-next-video
---
 examples/offline_inference_vision_language.py |  67 ++-
 vllm/assets/video.py                          |  81 ++++
 vllm/model_executor/models/__init__.py        |   2 +
 .../model_executor/models/llava_next_video.py | 404 ++++++++++++++++++
 vllm/multimodal/registry.py                   |   3 +-
 vllm/multimodal/video.py                      |  44 ++
 6 files changed, 590 insertions(+), 11 deletions(-)
 create mode 100644 vllm/assets/video.py
 create mode 100644 vllm/model_executor/models/llava_next_video.py
 create mode 100644 vllm/multimodal/video.py

diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index 9a0e9d4bc5362..ee8553803c7ad 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -9,19 +9,15 @@
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
 from vllm.utils import FlexibleArgumentParser
 
-# Input image and question
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-question = "What is the content of this image?"
-
-
 # LLaVA-1.5
 def run_llava(question):
 
     prompt = f"USER: <image>\n{question}\nASSISTANT:"
 
-    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf")
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
 
@@ -34,6 +30,13 @@ def run_llava_next(question):
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
+
+# LLaVA-NeXT-Video
+# Currently only supports video input
+def run_llava_next_video(question):
+    prompt = f"[INST]
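
Note: for context, the example this patch adds is expected to be driven roughly as follows. This is a hedged sketch, not code from the patch itself: the VideoAsset constructor arguments, the .np_ndarrays property, and the "video" key in multi_modal_data are assumptions inferred from the files the patch creates (vllm/assets/video.py, vllm/multimodal/video.py), not confirmed API.

# Sketch: drive the new LLaVA-NeXT-Video example end to end.
# Assumed (not shown in this excerpt): VideoAsset's signature, its
# .np_ndarrays property, and the "video" multi-modal key registered
# by the new VideoPlugin.
from vllm import LLM, SamplingParams
from vllm.assets.video import VideoAsset

llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf")
prompt = "[INST] <video>\nWhy is this video funny? [/INST]"

# Decode a bundled sample clip into a stack of RGB frames
# (asset name and frame count here are placeholders).
video = VideoAsset(name="sample_demo_1.mp4", num_frames=16).np_ndarrays

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"video": video}},
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)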