For the first version of the llava-next-video project, the model used was LLaVA-NeXT-Video-7B-DPO. With the number of frames set to 32, the final inputs_embeds tensor passed to the llama2 backbone has shape torch.Size([1, 4670, 4096]), in torch.float16 with batch=1. Inference uses roughly 20GB of GPU memory.
For llava-onevision, the model used was llava-onevision-qwen2-7b-ov. With the number of frames set to 24, the final inputs_embeds tensor passed to the qwen2 backbone has shape torch.Size([1, 7356, 896]), also in torch.float16 with batch=1. During inference, however, GPU memory usage reaches 72GB!
Why is this happening? The inputs_embeds of llava-next-video is clearly larger in raw data size than that of llava-onevision-qwen2-7b-ov, yet it needs far less GPU memory. As a result, when testing on an 80GB A100, the maximum number of frames for llava-onevision-qwen2-7b-ov can only be set to 24, not the 32 reported in the paper.
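For scale, a quick back-of-the-envelope check (my own sketch, using only the shapes quoted above) shows that both inputs_embeds tensors are tiny compared with the reported GPU usage, so the embeddings themselves cannot account for the 72GB:
# fp16 = 2 bytes per element; sizes follow directly from the shapes above
next_video_mb = 1 * 4670 * 4096 * 2 / 1024**2   # ~36.5 MB
onevision_mb = 1 * 7356 * 896 * 2 / 1024**2     # ~12.6 MB
print(f"llava-next-video inputs_embeds: {next_video_mb:.1f} MB")
print(f"llava-onevision inputs_embeds: {onevision_mb:.1f} MB")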
Inference code:
import argparse
import torch
import sys
# print(f"before,sys.path============={sys.path}")
sys.path.append("/media/star/8T/PycharmProjects/github/gpt/LLaVA-NeXT")
# print(f"after,sys.path============={sys.path}")
import time
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
from llava.conversation import conv_templates, SeparatorStyle
import cv2
import numpy as np
from PIL import Image
import requests
import copy
import warnings
warnings.filterwarnings("ignore")
# Load the OneVision model
pretrained = "/media/star/8T/model/gpt/llava/llava-onevision/llava-onevision-qwen2-0.5b-ov"
model_name = "llava_qwen"
device = "cuda"
device_map = "auto"
tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map)
model.eval()
# Function to extract frames from video
def extract_frames(video_path, num_frames=8):
    """Uniformly sample num_frames frames from a video and return them as RGB PIL images."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    for i in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(frame))
    cap.release()
    return frames
# Load and process video
video_path = "/media/star/8T/tmp/gpt4v/video/zouxiu2_5/clip_135_140.mp4"
video_frames = extract_frames(video_path, num_frames=24)
image_tensors = process_images(video_frames, image_processor, model.config)
image_tensors = [_image.to(dtype=torch.float16, device=device) for _image in image_tensors]
print(f"image_tensors.shape={[image_tensor.shape for image_tensor in image_tensors]}")
# Prepare conversation input
conv_template = "qwen_1_5"
question = f"{DEFAULT_IMAGE_TOKEN}\nIs the model changing clothes in the video? answer the question using a single word or phrase."
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()
input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
image_sizes = [frame.size for frame in video_frames]
print(f"image_sizes={image_sizes[:2]}")
# Generate response
cont = model.generate(
    input_ids,
    images=image_tensors,
    image_sizes=image_sizes,
    do_sample=False,
    temperature=0.3,  # ignored when do_sample=False (greedy decoding)
    max_new_tokens=768,
)
text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
print(text_outputs[0])
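To narrow down where the memory actually goes during generation, one option is to print PyTorch's peak allocator statistics right after the generate() call above (a minimal sketch using standard torch.cuda utilities; the placement is my own choice, not from the repo):
# Report peak GPU memory observed so far (allocated vs. reserved by the caching allocator)
torch.cuda.synchronize()
print(f"peak allocated: {torch.cuda.max_memory_allocated(device) / 1024**3:.1f} GB")
print(f"peak reserved: {torch.cuda.max_memory_reserved(device) / 1024**3:.1f} GB")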