Skip to content

Commit

Permalink
Update Vinoground to make evaluation consistent with paper (#354)
Browse files Browse the repository at this point in the history
* add vinoground

* make evaluation consistent with paper

---------

Co-authored-by: jzhang2427 <jzhang2427@wisc.edu>
  • Loading branch information
HanSolo9682 and jzhang2427 authored Oct 26, 2024
1 parent d693e05 commit f255e5b
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 36 deletions.
69 changes: 42 additions & 27 deletions lmms_eval/tasks/vinoground/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import os
from pathlib import Path

Expand All @@ -16,52 +17,64 @@
cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]


# Module-level lookup tables keyed by each item's "idx"; both start empty and
# are rebound by prep_data().
textscore_dict, videoscore_dict = {}, {}


def prep_data():
    """Load the Vinoground text-score and video-score metadata from the cache.

    Reads ``vinoground_videos/vinoground_textscore.json`` and
    ``vinoground_videos_concated/vinoground_videoscore.json`` located under
    ``os.path.join(base_cache_dir, cache_name)``. Each file is parsed as a
    JSON list whose items are indexed by their ``"idx"`` value. The
    module-level ``textscore_dict`` and ``videoscore_dict`` are rebound to the
    freshly built mappings on every call.

    Returns:
        tuple: ``(textscore_dict, videoscore_dict)``, each mapping an item's
        ``"idx"`` to the full item.

    Raises:
        OSError: If either JSON file cannot be opened.
        KeyError: If an item has no ``"idx"`` key.
    """
    global textscore_dict, videoscore_dict
    cache_dir = os.path.join(base_cache_dir, cache_name)

    text_path = os.path.join(cache_dir, "vinoground_videos", "vinoground_textscore.json")
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(text_path, encoding="utf-8") as f:
        textscore_dict = {item["idx"]: item for item in json.load(f)}

    video_path = os.path.join(cache_dir, "vinoground_videos_concated", "vinoground_videoscore.json")
    with open(video_path, encoding="utf-8") as f:
        videoscore_dict = {item["idx"]: item for item in json.load(f)}

    return textscore_dict, videoscore_dict


# Return the video file path for one Vinoground document as a one-element list.
# Raises Exception when the resolved path does not exist on disk.
# NOTE(review): this text was captured from a rendered commit diff with
# indentation stripped. The if/else that assembles video_path from doc["index"]
# looks like the removed version and the scoredict[idx]["video_name"] lookup
# like its replacement -- confirm against the repository before reusing.
def vinoground_doc_to_visual(doc):
# Load the score tables only while textscore_dict is still empty.
if len(textscore_dict) == 0:
prep_data()
cache_dir = os.path.join(base_cache_dir, cache_name)
# doc["index"] is split on "_": the first two fields form idx, the third is
# the question type ("text" selects textscore_dict, anything else videoscore_dict).
idx, question_type = "_".join(doc["index"].split("_")[:2]), doc["index"].split("_")[2]
scoredict = textscore_dict if question_type == "text" else videoscore_dict

if doc["index"].split("_")[2] == "text":
video_path = os.path.join(cache_dir, "vinoground_videos", "_".join(doc["index"].split("_")[:2]) + ".mp4")
else:
video_path = os.path.join(cache_dir, "vinoground_videos_concated", doc["index"].split("_")[0] + ".mp4")
# This assignment overwrites whatever video_path was set to above.
video_path = os.path.join(cache_dir, scoredict[idx]["video_name"])
if not os.path.exists(video_path):
raise Exception(f"video path:{video_path} does not exist, please check")
return [video_path]


# Build the text prompt shown to the model for one Vinoground document.
# lmms_eval_specific_kwargs is accepted but not read anywhere in this span.
# NOTE(review): this text was captured from a rendered commit diff with
# indentation stripped. The hand-built prompt ending in `return full_prompt`
# looks like the removed version, and the scoredict[idx]["question"] lookup
# after it like the replacement -- confirm against the repository before reusing.
def vinoground_doc_to_text(doc, lmms_eval_specific_kwargs=None):
# Third "_"-separated field of doc["index"] distinguishes "text" questions from the rest.
if doc["index"].split("_")[2] == "text":
pre_prompt = "Which caption best describes this video?"
option_a = "A. " + doc["pos_cap"]
option_b = "B. " + doc["neg_cap"]
post_prompt = "Answer with the option's letter from the given choices directly. Please only output 1 English character."
full_prompt = pre_prompt + "\n" + option_a + "\n" + option_b + "\n" + post_prompt
else:
# Second field of doc["index"] picks which caption key (f"{pos_neg}_cap") is asked about.
pos_neg = doc["index"].split("_")[1]
caption_in_question = doc[f"{pos_neg}_cap"]
pre_prompt = "Which video segment matches this caption? Note: The video contains two segments separated by a 2-second black frame."
caption = f"Caption: {caption_in_question}"
options = "A. The first fragment (before black frame)\nB. The second fragment (after black frame)"
post_prompt = "Answer with the option's letter from the given choices directly. Please only output 1 English character."
full_prompt = pre_prompt + "\n" + caption + "\n" + options + "\n" + post_prompt
return full_prompt
# Load the score tables only while textscore_dict is still empty.
if len(textscore_dict) == 0:
prep_data()
idx, question_type = "_".join(doc["index"].split("_")[:2]), doc["index"].split("_")[2]
scoredict = textscore_dict if question_type == "text" else videoscore_dict

# The stored question text is returned with a fixed one-character instruction appended.
return scoredict[idx]["question"] + "\nPlease only output one English character."


# Package one model prediction with its document metadata under the
# "vinoground_score" key.
# NOTE(review): this text was captured from a rendered commit diff with
# indentation stripped. The back-to-back `categories`, `question_type` and
# `data_dict` assignments look like removed/added line pairs, with the later
# one of each pair winning if run as written -- confirm against the repository.
def vinoground_process_results(doc, results):
# Only the first entry of results is used as the prediction.
pred = results[0]

major = doc["major"]
minors = doc["minor"]
categories = [major]
categories = ["all", major]
# doc["minor"] may be None; otherwise it is split on ";" into extra categories.
if minors is not None:
categories.extend(minors.split(";"))
question_type = doc["index"].split("_")[2]
data_dict = {"index": doc["index"], "categories": categories, "question_type": question_type, "pred": pred}
# First two "_"-separated fields of doc["index"] form idx; the third is the question type.
idx, question_type = "_".join(doc["index"].split("_")[:2]), doc["index"].split("_")[2]
data_dict = {"index": idx, "categories": categories, "question_type": question_type, "pred": pred}

return {"vinoground_score": data_dict}


def vinoground_aggregate_results(results):
matrix = np.zeros((500, 7), dtype=np.int8)
textscore_dict, videoscore_dict = prep_data()

category_all = {}
category_text = {}
Expand All @@ -70,15 +83,17 @@ def vinoground_aggregate_results(results):
index_to_categories = {}

for result in results:
index, categories, question_type, pred = result["index"], result["categories"], result["question_type"], result["pred"]
matrix_col = 0 if "pos" in index else 1
idx, categories, question_type, pred = result["index"], result["categories"], result["question_type"], result["pred"]
matrix_col = 0 if "pos" in idx else 1
if question_type == "video":
matrix_col += 3
gt = "A" if "pos" in index else "B"
idx = int(index.split("_")[0])
if question_type == "text":
gt = textscore_dict[idx]["GT"]
else:
gt = videoscore_dict[idx]["GT"]
idx = int(idx.split("_")[0])
matrix[idx, matrix_col] = pred[0].lower() == gt.lower()

categories.append("all")
if idx not in index_to_categories.keys():
index_to_categories[idx] = categories

Expand Down
14 changes: 5 additions & 9 deletions lmms_eval/tasks/vinoground/vinoground.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,16 @@ doc_to_text: !function utils.vinoground_doc_to_text
doc_to_target: "answer"
generation_kwargs:
max_new_tokens: 16
temperature: 0
top_p: 1.0
num_beams: 1
do_sample: false
# temperature: 0
# top_p: 1.0
# num_beams: 1
# do_sample: false

process_results: !function utils.vinoground_process_results

metric_list:
- metric: vinoground_score
aggregation: !function utils.vinoground_aggregate_results
higher_is_better: true
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer with the option's letter from the given choices directly. Please only output one English character."
metadata:
- version: 0.0
- version: 1.0

0 comments on commit f255e5b

Please sign in to comment.