VizWiz Scenario (stanford-crfm#1983)
teetone authored and yotam committed Nov 9, 2023
1 parent 84044e7 commit d893e1f
Showing 3 changed files with 138 additions and 3 deletions.

src/helm/benchmark/presentation/run_specs_vhelm.conf (1 addition & 0 deletions)

@@ -4,4 +4,5 @@ entries: [
################################################# Main experiments #################################################

{description: "vqa:model=vlm", priority: 1, groups: ["vqa_base"]}
{description: "viz_wiz:model=vlm", priority: 1, groups: ["vqa_base"]}
]
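
The leading viz_wiz token of the new description selects the run spec function registered under that name in vlm_run_specs.py (added later in this diff). A minimal sketch of building the same RunSpec programmatically, assuming a HELM checkout at this commit; the printed values follow from the function defined below:

from helm.benchmark.vlm_run_specs import get_viz_wiz_spec

run_spec = get_viz_wiz_spec()
print(run_spec.name)                              # viz_wiz
print(run_spec.scenario_spec.class_name)          # helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario
print(run_spec.adapter_spec.max_train_instances)  # 0, i.e. zero-shot, per the adapter change below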

src/helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py (108 additions & 0 deletions)

@@ -0,0 +1,108 @@
from typing import Dict, List, Set
import json
import os

from helm.benchmark.scenarios.scenario import (
    CORRECT_TAG,
    TRAIN_SPLIT,
    VALID_SPLIT,
    Instance,
    Input,
    Output,
    Reference,
    Scenario,
)
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.general import ensure_directory_exists, ensure_file_downloaded


class VizWizScenario(Scenario):
    """
    VizWiz is a real-world visual question answering dataset consisting of questions
    asked by people who are blind. It originates from a natural visual question answering
    setting where blind people each took an image and recorded a spoken question about it,
    together with 10 crowdsourced answers per visual question.

    Version as of January 1, 2020:
    - 20,523 training image/question pairs
    - 205,230 training answer/answer confidence pairs
    - 4,319 validation image/question pairs
    - 43,190 validation answer/answer confidence pairs
    where answer confidences are one of {"yes", "maybe", "no"}.

    Answers are publicly shared for the train and validation splits and hidden for the test split.

    Paper: https://arxiv.org/abs/1802.08218
    Website: https://vizwiz.org/tasks-and-datasets/vqa
    """

    # Annotations are not available for the test set
    ANNOTATIONS_URL: str = "https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip"
    SPLIT_TO_ANNOTATIONS_FILE: Dict[str, str] = {
        TRAIN_SPLIT: "train.json",
        VALID_SPLIT: "val.json",
    }

    SPLIT_TO_IMAGES: Dict[str, str] = {
        TRAIN_SPLIT: "https://vizwiz.cs.colorado.edu/VizWiz_final/images/train.zip",
        VALID_SPLIT: "https://vizwiz.cs.colorado.edu/VizWiz_final/images/val.zip",
    }

    name = "viz_wiz"
    description = (
        "Real-world VQA dataset consisting of questions asked by "
        "people who are blind ([paper](https://arxiv.org/abs/1802.08218))."
    )
    tags = ["vision-language", "visual question answering"]

    def get_instances(self, output_path: str) -> List[Instance]:
        # Download the questions and annotations
        annotations_path: str = os.path.join(output_path, "annotations")
        ensure_directory_exists(annotations_path)
        ensure_file_downloaded(
            source_url=self.ANNOTATIONS_URL,
            target_path=annotations_path,
            unpack=True,
            unpack_type="unzip",
        )

        instances: List[Instance] = []
        for split in [TRAIN_SPLIT, VALID_SPLIT]:
            # Download the images for the split
            images_path: str = os.path.join(output_path, split)
            ensure_file_downloaded(
                source_url=self.SPLIT_TO_IMAGES[split],
                target_path=images_path,
                unpack=True,
                unpack_type="unzip",
            )

            annotations_split_path: str = os.path.join(annotations_path, self.SPLIT_TO_ANNOTATIONS_FILE[split])
            with open(annotations_split_path) as f:
                for image_annotation in json.load(f):
                    image_path: str = os.path.join(images_path, image_annotation["image"])
                    assert os.path.exists(image_path), f"Image {image_path} does not exist"

                    content: List[MediaObject] = [
                        MediaObject(location=image_path, content_type="image/jpeg"),
                        MediaObject(text=image_annotation["question"], content_type="text/plain"),
                    ]
                    deduped_answers: Set[str] = {
                        answer_json["answer"]
                        for answer_json in image_annotation["answers"]
                        if answer_json["answer_confidence"] == "yes"
                    }

                    instances.append(
                        Instance(
                            Input(multimedia_content=MultimediaObject(content)),
                            references=[
                                Reference(Output(text=answer), tags=[CORRECT_TAG]) for answer in deduped_answers
                            ],
                            split=split,
                        )
                    )

        return instances
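
For reference, get_instances above assumes each record in train.json / val.json has "image", "question", and "answers" fields, where every answer carries an "answer_confidence". The record below is invented for illustration (the field names match what the code reads; the values are not from the dataset), and the set comprehension is the same confidence filter and deduplication used in the scenario:

# Illustrative annotation record; values are made up, field names follow get_instances above.
image_annotation = {
    "image": "VizWiz_train_00000001.jpg",
    "question": "What is the expiration date on this carton?",
    "answers": [
        {"answer": "june 3", "answer_confidence": "yes"},
        {"answer": "june 3", "answer_confidence": "yes"},
        {"answer": "unanswerable", "answer_confidence": "maybe"},
        {"answer": "6/3", "answer_confidence": "no"},
    ],
}

# Only answers given with "yes" confidence survive, and duplicates collapse into one reference.
deduped_answers = {
    answer_json["answer"]
    for answer_json in image_annotation["answers"]
    if answer_json["answer_confidence"] == "yes"
}
print(deduped_answers)  # {'june 3'}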

src/helm/benchmark/vlm_run_specs.py (29 additions & 3 deletions)

@@ -18,7 +18,6 @@ def get_vlm_generation_adapter_spec(
    input_suffix: str = "",
    output_prefix: str = "",
    output_suffix: str = "",
-    max_train_instances: int = 0,
    max_tokens: int = 100,
    stop_sequences: Optional[List[str]] = None,
) -> AdapterSpec:
@@ -31,7 +30,8 @@
        output_prefix=output_prefix,
        output_suffix=output_suffix,
        instance_prefix="\n",
-        max_train_instances=max_train_instances,
+        # We focus on zero-shot evaluation for now as most open VLMs only support a single image input
+        max_train_instances=0,
        num_outputs=1,
        max_tokens=max_tokens,
        stop_sequences=stop_sequences if stop_sequences is not None else [],
@@ -43,6 +43,33 @@
# VHELM run specs


+@run_spec_function("viz_wiz")
+def get_viz_wiz_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={}
+    )
+
+    # TODO: finalize the adapter spec parameters once we add more models
+    adapter_spec: AdapterSpec = get_vlm_generation_adapter_spec(
+        input_prefix="User: ",
+        input_suffix="<end_of_utterance>",
+        output_prefix="\nAssistant: ",
+        output_suffix="<end_of_utterance>",
+        stop_sequences=["<end_of_utterance>"],
+    )
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "viz_wiz"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
@run_spec_function("vqa")
def get_vqa_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
@@ -55,7 +82,6 @@ def get_vqa_spec() -> RunSpec:
        input_suffix="<end_of_utterance>",
        output_prefix="\nAssistant: ",
        output_suffix="<end_of_utterance>",
-        max_train_instances=3,
        stop_sequences=["<end_of_utterance>"],
