VizWiz Scenario (stanford-crfm#1983)
teetone authored and yotam committed Nov 9, 2023
1 parent 84044e7 commit d893e1f
Showing 3 changed files with 138 additions and 3 deletions.

src/helm/benchmark/presentation/run_specs_vhelm.conf (1 addition & 0 deletions)

@@ -4,4 +4,5 @@ entries: [
################################################# Main experiments #################################################

{description: "vqa:model=vlm", priority: 1, groups: ["vqa_base"]}
{description: "viz_wiz:model=vlm", priority: 1, groups: ["vqa_base"]}
]
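
The leading viz_wiz token of the new description selects the run spec function registered under that name in vlm_run_specs.py (added later in this diff). A minimal sketch of building the same RunSpec programmatically, assuming a HELM checkout at this commit; the printed values follow from the function defined below:

from helm.benchmark.vlm_run_specs import get_viz_wiz_spec

run_spec = get_viz_wiz_spec()
print(run_spec.name)                              # viz_wiz
print(run_spec.scenario_spec.class_name)          # helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario
print(run_spec.adapter_spec.max_train_instances)  # 0, i.e. zero-shot, per the adapter change below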

src/helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py (108 additions & 0 deletions)

@@ -0,0 +1,108 @@
from typing import Dict, List, Set
import json
import os

from helm.benchmark.scenarios.scenario import (
    CORRECT_TAG,
    TRAIN_SPLIT,
    VALID_SPLIT,
    Instance,
    Input,
    Output,
    Reference,
    Scenario,
)
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.general import ensure_directory_exists, ensure_file_downloaded


class VizWizScenario(Scenario):
    """
    VizWiz is a real-world visual question answering dataset consisting of questions
    asked by people who are blind. It originates from a natural visual question answering
    setting where blind people each took an image and recorded a spoken question about it,
    together with 10 crowdsourced answers per visual question.

    Version as of January 1, 2020:
    - 20,523 training image/question pairs
    - 205,230 training answer/answer confidence pairs
    - 4,319 validation image/question pairs
    - 43,190 validation answer/answer confidence pairs
    where answer confidences are one of {"yes", "maybe", "no"}.

    Answers are publicly shared for the train and validation splits and hidden for the test split.

    Paper: https://arxiv.org/abs/1802.08218
    Website: https://vizwiz.org/tasks-and-datasets/vqa
    """

    # Annotations are not available for the test set
    ANNOTATIONS_URL: str = "https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip"
    SPLIT_TO_ANNOTATIONS_FILE: Dict[str, str] = {
        TRAIN_SPLIT: "train.json",
        VALID_SPLIT: "val.json",
    }

    SPLIT_TO_IMAGES: Dict[str, str] = {
        TRAIN_SPLIT: "https://vizwiz.cs.colorado.edu/VizWiz_final/images/train.zip",
        VALID_SPLIT: "https://vizwiz.cs.colorado.edu/VizWiz_final/images/val.zip",
    }

    name = "viz_wiz"
    description = (
        "Real-world VQA dataset consisting of questions asked by "
        "people who are blind ([paper](https://arxiv.org/abs/1802.08218))."
    )
    tags = ["vision-language", "visual question answering"]

    def get_instances(self, output_path: str) -> List[Instance]:
        # Download the questions and annotations
        annotations_path: str = os.path.join(output_path, "annotations")
        ensure_directory_exists(annotations_path)
        ensure_file_downloaded(
            source_url=self.ANNOTATIONS_URL,
            target_path=annotations_path,
            unpack=True,
            unpack_type="unzip",
        )

        instances: List[Instance] = []
        for split in [TRAIN_SPLIT, VALID_SPLIT]:
            # Download the images for the split
            images_path: str = os.path.join(output_path, split)
            ensure_file_downloaded(
                source_url=self.SPLIT_TO_IMAGES[split],
                target_path=images_path,
                unpack=True,
                unpack_type="unzip",
            )

            annotations_split_path: str = os.path.join(annotations_path, self.SPLIT_TO_ANNOTATIONS_FILE[split])
            with open(annotations_split_path) as f:
                for image_annotation in json.load(f):
                    image_path: str = os.path.join(images_path, image_annotation["image"])
                    assert os.path.exists(image_path), f"Image {image_path} does not exist"

                    content: List[MediaObject] = [
                        MediaObject(location=image_path, content_type="image/jpeg"),
                        MediaObject(text=image_annotation["question"], content_type="text/plain"),
                    ]
                    deduped_answers: Set[str] = {
                        answer_json["answer"]
                        for answer_json in image_annotation["answers"]
                        if answer_json["answer_confidence"] == "yes"
                    }

                    instances.append(
                        Instance(
                            Input(multimedia_content=MultimediaObject(content)),
                            references=[
                                Reference(Output(text=answer), tags=[CORRECT_TAG]) for answer in deduped_answers
                            ],
                            split=split,
                        )
                    )

        return instances
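
For reference, get_instances above assumes each record in train.json / val.json has "image", "question", and "answers" fields, where every answer carries an "answer_confidence". The record below is invented for illustration (the field names match what the code reads; the values are not from the dataset), and the set comprehension is the same confidence filter and deduplication used in the scenario:

# Illustrative annotation record; values are made up, field names follow get_instances above.
image_annotation = {
    "image": "VizWiz_train_00000001.jpg",
    "question": "What is the expiration date on this carton?",
    "answers": [
        {"answer": "june 3", "answer_confidence": "yes"},
        {"answer": "june 3", "answer_confidence": "yes"},
        {"answer": "unanswerable", "answer_confidence": "maybe"},
        {"answer": "6/3", "answer_confidence": "no"},
    ],
}

# Only answers given with "yes" confidence survive, and duplicates collapse into one reference.
deduped_answers = {
    answer_json["answer"]
    for answer_json in image_annotation["answers"]
    if answer_json["answer_confidence"] == "yes"
}
print(deduped_answers)  # {'june 3'}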

src/helm/benchmark/vlm_run_specs.py (29 additions & 3 deletions)

@@ -18,7 +18,6 @@ def get_vlm_generation_adapter_spec(
    input_suffix: str = "",
    output_prefix: str = "",
    output_suffix: str = "",
-    max_train_instances: int = 0,
    max_tokens: int = 100,
    stop_sequences: Optional[List[str]] = None,
) -> AdapterSpec:
@@ -31,7 +30,8 @@
        output_prefix=output_prefix,
        output_suffix=output_suffix,
        instance_prefix="\n",
-        max_train_instances=max_train_instances,
+        # We focus on zero-shot evaluation for now as most open VLMs only support a single image input
+        max_train_instances=0,
        num_outputs=1,
        max_tokens=max_tokens,
        stop_sequences=stop_sequences if stop_sequences is not None else [],
@@ -43,6 +43,33 @@
# VHELM run specs


+@run_spec_function("viz_wiz")
+def get_viz_wiz_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={}
+    )
+
+    # TODO: finalize the adapter spec parameters once we add more models
+    adapter_spec: AdapterSpec = get_vlm_generation_adapter_spec(
+        input_prefix="User: ",
+        input_suffix="<end_of_utterance>",
+        output_prefix="\nAssistant: ",
+        output_suffix="<end_of_utterance>",
+        stop_sequences=["<end_of_utterance>"],
+    )
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "viz_wiz"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
@run_spec_function("vqa")
def get_vqa_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
@@ -55,7 +82,6 @@ def get_vqa_spec() -> RunSpec:
        input_suffix="<end_of_utterance>",
        output_prefix="\nAssistant: ",
        output_suffix="<end_of_utterance>",
-        max_train_instances=3,
        stop_sequences=["<end_of_utterance>"],
