llava_sensor_event_narration_and_qa.py
"""
It's notable that this pipeline verified the limitations of
current open-source LLMs. They can't understand sensor
data like GPT-4 can. Therefore, we should come back to this
script after finetuning our model.
"""
import os
import torch
from videollava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from videollava.conversation import Conversation, SeparatorStyle
from videollava.model.builder import load_pretrained_model
from videollava.utils import disable_torch_init
from videollava.mm_utils import (
tokenizer_image_token,
get_model_name_from_path,
KeywordsStoppingCriteria
)
import pandas as pd
disable_torch_init()
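# System instruction framing the model as an expert on IMU metadata that accompanies egocentric video.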
INSTRUCTION = (
"Please consider yourself to be an expert "
"on gyroscope and accelerometer sensor information "
"given as a metadata in a vision dataset. "
"You are given an egocentric video in a home environment setting. "
"The user also provides a brief summary of the event "
"followed by 'Summary:'. "
"They also give you gyroscopic and accelerometer sensor data followed by "
"'Gyroscope:' and 'Accelerometer:' respectively. "
"They are written in a Python list of lists format and contain "
"x, y, and z axis data respectively. "
"Narrate the video with details that are "
"context-aware based on the sensor data."
)
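# Load Video-LLaVA-7B and its processors onto the GPU (no 4-/8-bit quantization).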
model_path = 'LanguageBind/Video-LLaVA-7B'
cache_dir = 'cache_dir'
device = 'cuda'
load_4bit, load_8bit = False, False
model_name = get_model_name_from_path(model_path)
tokenizer, model, processor, _ = load_pretrained_model(
model_path, None, model_name, load_8bit, load_4bit,
device=device, cache_dir=cache_dir
)
video_processor = processor['video']
def narrate_sensor_event(video_filename, video_dir, sensor_dir, summary_dir, subsample=25):
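    """
    Narrate one egocentric video with Video-LLaVA, conditioning on its event
    summary and subsampled IMU (gyroscope/accelerometer) readings.

    Companion files are located by swapping the video's .MP4 extension for
    .csv (IMU data) and .txt (summary) in sensor_dir and summary_dir.
    """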
video = os.path.join(video_dir, video_filename)
imu_filename = video_filename.replace(".MP4", ".csv")
summary_filename = video_filename.replace(".MP4", ".txt")
imu_file = os.path.join(sensor_dir, imu_filename)
summary_file = os.path.join(summary_dir, summary_filename)
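    # Load the IMU CSV, round to three decimals, and subsample every `subsample`-th row.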
imu_df = pd.read_csv(imu_file).round(3)
accl_str = str(imu_df[['AcclX', 'AcclY', 'AcclZ']].values.tolist()[::subsample])
gyro_str = str(imu_df[['GyroX', 'GyroY', 'GyroZ']].values.tolist()[::subsample])
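    # Skip events whose subsampled IMU sequence would make the prompt too long.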
    n_subsampled = len(imu_df.values.tolist()[::subsample])
    print(n_subsampled)
    if n_subsampled > 15:
        return "Too long"
with open(summary_file, "r") as f:
summary = f.read()
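    # Compose the user turn: instruction, event summary, and serialized sensor readings.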
inp = (
f"{INSTRUCTION}\n"
f"Summary: {summary}\n"
f"Gyroscope: {gyro_str}\n"
f"Accelerometer: {accl_str}\n"
)
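    # Two-role (USER, ASSISTANT) conversation with the instruction as the system prompt.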
conv = Conversation(
system=INSTRUCTION,
roles=("USER", "ASSISTANT"),
version="v1",
messages=(),
offset=0,
sep_style=SeparatorStyle.TWO,
sep=" ",
sep2="</s>",
).copy()
roles = conv.roles
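    # Preprocess the video into frame tensors and move them to the model device in float16.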
video_tensor = video_processor(video, return_tensors='pt')['pixel_values']
if isinstance(video_tensor, list):
tensor = [
video.to(
model.device, dtype=torch.float16
)
for video in video_tensor
]
else:
tensor = video_tensor.to(model.device, dtype=torch.float16)
print(f"{roles[1]}: {inp}")
inp = ' '.join(
[DEFAULT_IMAGE_TOKEN] * model.get_video_tower().config.num_frames
) + '\n' + inp
conv.append_message(conv.roles[0], inp)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
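    # Tokenize the prompt, mapping each image placeholder to IMAGE_TOKEN_INDEX.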
input_ids = tokenizer_image_token(
prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
).unsqueeze(0).to(model.device)
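    # Stop generating once the conversation separator ("</s>" for SeparatorStyle.TWO) appears.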
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(
keywords, tokenizer, input_ids
)
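    # Greedy decoding (do_sample=False) keeps the narration deterministic.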
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=tensor,
do_sample=False,
temperature=0,
max_new_tokens=1024,
use_cache=True,
stopping_criteria=[stopping_criteria]
)
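    # Decode only the tokens generated after the prompt.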
outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
return outputs
if __name__ == '__main__':
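    # Dataset layout: videos, IMU CSVs, and event summaries share base filenames across these directories.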
VIDEO_DIR = "/hdd/LLM/finetune_data/videos"
SENSOR_DIR = "/hdd/LLM/finetune_data/imus"
SUMMARY_DIR = "/hdd/LLM/finetune_data/event_description"
video_filenames = os.listdir(VIDEO_DIR)
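    # Smoke test: narrate the first three videos and log the results.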
with open("results.txt", "w") as f:
for filename in video_filenames[:3]:
narration = narrate_sensor_event(
video_filename=filename,
video_dir=VIDEO_DIR,
sensor_dir=SENSOR_DIR,
summary_dir=SUMMARY_DIR
)
f.write(filename+"\n"+narration+"\n")