From 832f9b2c858adc3751dc73734db9c8e4cfe2be7a Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Mon, 13 Feb 2023 21:15:59 -0800 Subject: [PATCH 01/17] Initial commit: LM Opinions scenario --- src/helm/benchmark/__init__.py | 2 + src/helm/benchmark/adaptation/adapter_spec.py | 3 + .../adapters/in_context_learning_adapter.py | 11 +- .../presentation/run_specs_lm_opinions.conf | 6 + src/helm/benchmark/run_specs.py | 45 ++++- .../scenarios/lm_opinions_scenario.py | 158 ++++++++++++++++++ 6 files changed, 219 insertions(+), 6 deletions(-) create mode 100644 src/helm/benchmark/presentation/run_specs_lm_opinions.conf create mode 100644 src/helm/benchmark/scenarios/lm_opinions_scenario.py diff --git a/src/helm/benchmark/__init__.py b/src/helm/benchmark/__init__.py index 4e058c9d2c..95be40524b 100644 --- a/src/helm/benchmark/__init__.py +++ b/src/helm/benchmark/__init__.py @@ -42,6 +42,8 @@ from .scenarios import entity_matching_scenario # noqa from .scenarios import entity_data_imputation_scenario # noqa from .scenarios import big_bench_scenario # noqa +from .scenarios import lm_opinions_scenario # noqa + # Biomedical from .scenarios import covid_dialog_scenario # noqa diff --git a/src/helm/benchmark/adaptation/adapter_spec.py b/src/helm/benchmark/adaptation/adapter_spec.py index a431089136..41cc11e9e4 100644 --- a/src/helm/benchmark/adaptation/adapter_spec.py +++ b/src/helm/benchmark/adaptation/adapter_spec.py @@ -67,6 +67,9 @@ class AdapterSpec: # Number of trials, where in each trial we choose an independent, random # set of training instances. Used to compute error bars. num_train_trials: int = 1 + + # Sample train examples or use deterministic + sample_train: bool = True # Decoding parameters (inherited by `Request`) diff --git a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py index 72e3437ee0..f71439884e 100644 --- a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py @@ -65,7 +65,8 @@ def _adapt_trial_index( parallelism: int, ) -> List[RequestState]: self.train_trial_index: int = train_trial_index - self.train_instances: List[Instance] = self.sample_examples(all_train_instances, seed=train_trial_index) + self.train_instances: List[Instance] = self.sample_examples(all_train_instances, seed=train_trial_index, + sample_train=self.adapter_spec.sample_train) hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.") # Generate request_states @@ -93,7 +94,8 @@ def _adapt_trial_index( return [request_state for result in results for request_state in result] - def sample_examples(self, all_train_instances: List[Instance], seed: int) -> List[Instance]: + def sample_examples(self, all_train_instances: List[Instance], seed: int, + sample_train: bool) -> List[Instance]: """ Sample a random set of train instances to use as examples by following the steps below: 1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the @@ -123,6 +125,11 @@ class labels. 
unlabeled_instances: List[Instance] = [] label_to_instances: Dict[str, List[Instance]] = defaultdict(list) + + if not sample_train: + # Sample sequentially from the train set + examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)] + return examples for instance in all_train_instances: if instance.first_correct_reference: diff --git a/src/helm/benchmark/presentation/run_specs_lm_opinions.conf b/src/helm/benchmark/presentation/run_specs_lm_opinions.conf new file mode 100644 index 0000000000..24ed52c662 --- /dev/null +++ b/src/helm/benchmark/presentation/run_specs_lm_opinions.conf @@ -0,0 +1,6 @@ +entries: [ +{description: "lm_opinions:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/text-davinci-001", priority: 1} +{description: "lm_opinions:survey_type=Pew_American_Trends_Panel_W92,context=steer-qa,num_train_trials=22,num_logprobs=100,model=openai/text-davinci-001", priority: 1} +{description: "lm_opinions:survey_type=Pew_American_Trends_Panel_W92,context=steer-bio,num_logprobs=100,model=openai/text-davinci-001", priority: 1} +{description: "lm_opinions:survey_type=Pew_American_Trends_Panel_W92,context=steer-portray,num_logprobs=100,model=openai/text-davinci-001", priority: 1} +] diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py index da9c59cf72..acbc9e90e9 100644 --- a/src/helm/benchmark/run_specs.py +++ b/src/helm/benchmark/run_specs.py @@ -47,8 +47,8 @@ def format_instructions(instructions: str) -> str: def get_multiple_choice_joint_adapter_spec( - instructions: str, input_noun: Optional[str], output_noun: str, max_train_instances: int = 5, **kwargs -) -> AdapterSpec: + instructions: str, input_noun: Optional[str], output_noun: str, num_outputs: int = 5, + max_train_instances: int = 5, sample_train=True, **kwargs) -> AdapterSpec: """ [instructions] @@ -64,6 +64,7 @@ def get_multiple_choice_joint_adapter_spec( [reference_k] [output_noun]: """ + return AdapterSpec( method=ADAPT_MULTIPLE_CHOICE_JOINT, instructions=format_instructions(instructions), @@ -76,6 +77,7 @@ def get_multiple_choice_joint_adapter_spec( max_tokens=5, temperature=0.0, stop_sequences=["\n"], + sample_train=sample_train, **kwargs, ) @@ -109,15 +111,19 @@ def get_multiple_choice_adapter_spec( input_noun: Optional[str], output_noun: str, max_train_instances: int = 5, + num_outputs: int = 5, empty_input: bool = False, + sample_train: bool = True, **kwargs, ): + """ Toggle between joint and separate adapters. 
""" if method == ADAPT_MULTIPLE_CHOICE_JOINT: return get_multiple_choice_joint_adapter_spec( - instructions, input_noun, output_noun, max_train_instances, **kwargs + instructions, input_noun, output_noun, max_train_instances=max_train_instances, + sample_train=sample_train, **kwargs ) elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}: return get_multiple_choice_separate_adapter_spec(method, empty_input) @@ -1490,7 +1496,6 @@ def get_entity_data_imputation_spec(dataset: str) -> RunSpec: groups=["entity_data_imputation"], ) - @htrack("Extracting adaptation parameters from the BIG-bench task definition and building the RunSpec") def get_big_bench_spec(task: str, subtask: str) -> RunSpec: def get_adaptation_method(big_bench_metrics: List[str]) -> str: @@ -1792,6 +1797,37 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec groups=["wmt_14"], ) +def get_lm_opinions_spec( + survey_type: str, + num_logprobs: str, + context: str = "None", + num_train_trials: str = "1", + method: str = ADAPT_MULTIPLE_CHOICE_JOINT, + ) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.lm_opinions_scenario.LMOpinionsScenario", + args={"survey_type": survey_type, "context": context}, + ) + + adapter_spec = get_multiple_choice_adapter_spec( + method=method, + instructions="", + input_noun="Question", + output_noun="Answer", + max_train_instances=1, + num_outputs=int(num_logprobs), + num_train_trials=1 if context != "steer-qa" else int(num_train_trials), + sample_train=False + ) + + return RunSpec( + name=f"lm_opinions:survey={survey_type},num_logprobs={num_logprobs},context={context},num_train_trials={num_train_trials}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=["lm-opinions"], + ) + ############################################################ @@ -1850,6 +1886,7 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec "med_paragraph_simplification": get_med_paragraph_simplification_spec, "med_qa": get_med_qa_spec, "pubmed_qa": get_pubmed_qa_spec, + "lm_opinions": get_lm_opinions_spec, } diff --git a/src/helm/benchmark/scenarios/lm_opinions_scenario.py b/src/helm/benchmark/scenarios/lm_opinions_scenario.py new file mode 100644 index 0000000000..219c247fea --- /dev/null +++ b/src/helm/benchmark/scenarios/lm_opinions_scenario.py @@ -0,0 +1,158 @@ +import json +import os +import string +from typing import Dict, List +import pandas as pd +from typing import List, Dict, Optional + +from helm.common.general import ensure_file_downloaded, ensure_directory_exists +from .scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + TEST_SPLIT, + CORRECT_TAG, + PassageQuestionInput, + Input, + Output, +) + + +class LMOpinionsScenario(Scenario): + """ + The LMOpinions dataset is from the paper: + [TODO ARXIV] + + + LM Opinions is a QA dataset containing 1484 multiple-choice questions. Since the questions are inherently + subjective, there isn't a single ground truth response. Instead, the object of interest is how + the distribution of model responses compares to those obtained from human survey participants as + disccused in [TODO GITHUB]. + + As discussed in Santurkar et al., we consider prompting an LM: + 1. Without any context (zero-shot) to evaluate the "default" opinions reflected + by it. + 2. With context containing information pertaining to the group we want to steer the model towards. 
+ This context is either formatted as a question-answer pair (QA) or a textual description (BIO/PORTRAY). + + + We prompt models using the following format + + + + Question: + A. + B. + C. + D. + Answer: + + For example, we have: + + Question: In politics today, do you consider yourself a + A. Republican + B. Democrat + C. Independent + D. Something else E. Refused + Answer: B + + Question: How much, if at all, do you think the ease with which people can legally obtain guns contributes + to gun violence in the country today? + A. A great deal + B. A fair amount + C. Not too much + D. Not at all + E. Refused + Answer: + + + """ + + name = "lm-opinions" + description = "Subjective questions dataset based on Pew American Trends Panel opinion polls." + tags = ["multiple_choice"] + + def __init__(self, survey_type: str, context: str): + super().__init__() + assert context in ['default', "steer-qa", "steer-bio", "steer-portray"] + + self.survey_type: str = survey_type + self.context: str = context + + def read_survey_questions(self, csv_path): + df = pd.read_csv(csv_path, sep="\t") + df["options"] = df.apply(lambda x: eval(x["options"]), axis=1) + return df + + + def get_instances(self) -> List[Instance]: + # Download the raw data + data_path: str = os.path.join(self.output_path, "data") + # ensure_file_downloaded( + # source_url="https://people.eecs.berkeley.edu/~hendrycks/data.tar", + # target_path=data_path, + # unpack=True, + # unpack_type="untar", + # ) + + # Read all the instances + instances: List[Instance] = [] + splits: Dict[str, str] = { + "dev": TRAIN_SPLIT, + "test": TEST_SPLIT, + } + + + all_splits = ["dev", "test"] if self.context == "steer-qa" else ["test"] + csv_dict = {"dev": os.path.join(data_path, f"{self.context}.csv"), + "test": os.path.join(data_path, f"{self.survey_type}.csv")} + + bios_df = None + if self.context in ["steer-bio", "steer-portray"]: + bios_path = os.path.join(data_path, f"{self.context}.csv") + bios_df = pd.read_csv(bios_path, sep="\t") + + for split in all_splits: + + csv_path: str = csv_dict[split] + assert os.path.exists(csv_path) + + question_df = self.read_survey_questions(csv_path) + + for qidx, (question, answers) in enumerate(zip(question_df["question"], + question_df["options"])): + + prefixes = list(string.ascii_uppercase) + assert len(prefixes) >= len(answers) + answers_dict = dict(zip(prefixes, answers)) + + # LM Opinions test questions have no correct answer. However, since the HELM codebase requires a + # correct answer to be associated with each instance, we set it to be the first reference. + # Note that this is never used in the analysis. + # In the case where context = steer-qa, we add demographic information in the form of a + # question answer pair as shown in the example above. 
+ + correct_answer = answers[0] if split == "test" else question_df["correct"][qidx] + + def answer_to_reference(answer: str) -> Reference: + return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else []) + + if bios_df is None: + # context = "default"/"steer-qa" + instance = Instance(Input(text=question), + references=list(map(answer_to_reference, answers)), + split=splits[split], + ) + else: + # context = "steer-bio"/"steer-portray" + for bio in bios_df['question'].values: + + context = PassageQuestionInput(passage=bio, question=question+'\n') + instance = Instance(context, + references=list(map(answer_to_reference, answers)), + split=splits[split], + ) + instances.append(instance) + + return instances \ No newline at end of file From ad40b31d03df96c33bce990790c7cd08a9db7bd0 Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Mon, 20 Feb 2023 12:20:31 -0800 Subject: [PATCH 02/17] Propagage num-outputs and tokens --- src/helm/benchmark/run_specs.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py index acbc9e90e9..016e129c1c 100644 --- a/src/helm/benchmark/run_specs.py +++ b/src/helm/benchmark/run_specs.py @@ -48,7 +48,7 @@ def format_instructions(instructions: str) -> str: def get_multiple_choice_joint_adapter_spec( instructions: str, input_noun: Optional[str], output_noun: str, num_outputs: int = 5, - max_train_instances: int = 5, sample_train=True, **kwargs) -> AdapterSpec: + max_train_instances: int = 5, max_tokens: int=5, sample_train=True, **kwargs) -> AdapterSpec: """ [instructions] @@ -73,8 +73,8 @@ def get_multiple_choice_joint_adapter_spec( output_prefix=f"{output_noun}: ", output_suffix="\n", max_train_instances=max_train_instances, - num_outputs=1, - max_tokens=5, + num_outputs=num_outputs, + max_tokens=max_tokens, temperature=0.0, stop_sequences=["\n"], sample_train=sample_train, @@ -112,6 +112,7 @@ def get_multiple_choice_adapter_spec( output_noun: str, max_train_instances: int = 5, num_outputs: int = 5, + max_tokens: int = 5, empty_input: bool = False, sample_train: bool = True, **kwargs, @@ -123,6 +124,7 @@ def get_multiple_choice_adapter_spec( if method == ADAPT_MULTIPLE_CHOICE_JOINT: return get_multiple_choice_joint_adapter_spec( instructions, input_noun, output_noun, max_train_instances=max_train_instances, + num_outputs=num_outputs, max_tokens=max_tokens, sample_train=sample_train, **kwargs ) elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}: @@ -1815,6 +1817,7 @@ def get_lm_opinions_spec( input_noun="Question", output_noun="Answer", max_train_instances=1, + max_tokens=1, num_outputs=int(num_logprobs), num_train_trials=1 if context != "steer-qa" else int(num_train_trials), sample_train=False From 8957d659cfcf78dd623b22ffb2690224e2b7cd5a Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Fri, 10 Mar 2023 15:50:14 -0800 Subject: [PATCH 03/17] OpinionsQA initial commit --- src/helm/benchmark/__init__.py | 2 +- .../presentation/run_specs_lm_opinions.conf | 6 - .../run_specs_lm_opinions_openai_default.conf | 92 ++++++++++ src/helm/benchmark/run_specs.py | 14 +- .../scenarios/opinions_qa_scenario.py | 172 ++++++++++++++++++ 5 files changed, 272 insertions(+), 14 deletions(-) delete mode 100644 src/helm/benchmark/presentation/run_specs_lm_opinions.conf create mode 100644 src/helm/benchmark/presentation/run_specs_lm_opinions_openai_default.conf create mode 100644 
src/helm/benchmark/scenarios/opinions_qa_scenario.py diff --git a/src/helm/benchmark/__init__.py b/src/helm/benchmark/__init__.py index 95be40524b..633eaf8c67 100644 --- a/src/helm/benchmark/__init__.py +++ b/src/helm/benchmark/__init__.py @@ -42,7 +42,7 @@ from .scenarios import entity_matching_scenario # noqa from .scenarios import entity_data_imputation_scenario # noqa from .scenarios import big_bench_scenario # noqa -from .scenarios import lm_opinions_scenario # noqa +from .scenarios import opinions_qa_scenario # noqa # Biomedical diff --git a/src/helm/benchmark/presentation/run_specs_lm_opinions.conf b/src/helm/benchmark/presentation/run_specs_lm_opinions.conf deleted file mode 100644 index 24ed52c662..0000000000 --- a/src/helm/benchmark/presentation/run_specs_lm_opinions.conf +++ /dev/null @@ -1,6 +0,0 @@ -entries: [ -{description: "lm_opinions:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/text-davinci-001", priority: 1} -{description: "lm_opinions:survey_type=Pew_American_Trends_Panel_W92,context=steer-qa,num_train_trials=22,num_logprobs=100,model=openai/text-davinci-001", priority: 1} -{description: "lm_opinions:survey_type=Pew_American_Trends_Panel_W92,context=steer-bio,num_logprobs=100,model=openai/text-davinci-001", priority: 1} -{description: "lm_opinions:survey_type=Pew_American_Trends_Panel_W92,context=steer-portray,num_logprobs=100,model=openai/text-davinci-001", priority: 1} -] diff --git a/src/helm/benchmark/presentation/run_specs_lm_opinions_openai_default.conf b/src/helm/benchmark/presentation/run_specs_lm_opinions_openai_default.conf new file mode 100644 index 0000000000..9ee230bffc --- /dev/null +++ b/src/helm/benchmark/presentation/run_specs_lm_opinions_openai_default.conf @@ -0,0 +1,92 @@ +entries: [ +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", 
priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} 
+{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +] diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py index 016e129c1c..e607eceabf 100644 --- a/src/helm/benchmark/run_specs.py +++ b/src/helm/benchmark/run_specs.py @@ -1799,7 +1799,7 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec groups=["wmt_14"], ) -def get_lm_opinions_spec( +def get_opinions_qa_spec( survey_type: str, num_logprobs: str, context: str = "None", @@ -1807,7 +1807,7 @@ def get_lm_opinions_spec( method: str = ADAPT_MULTIPLE_CHOICE_JOINT, ) -> RunSpec: scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.lm_opinions_scenario.LMOpinionsScenario", + class_name="helm.benchmark.scenarios.opinions_qa_scenario.OpinionsQAScenario", args={"survey_type": survey_type, "context": context}, ) @@ -1816,19 +1816,19 @@ def get_lm_opinions_spec( instructions="", input_noun="Question", output_noun="Answer", - max_train_instances=1, + max_train_instances=1 if 'steer' in context else 0, max_tokens=1, num_outputs=int(num_logprobs), - num_train_trials=1 if context != "steer-qa" else int(num_train_trials), + 
num_train_trials=1 if context != "steer-qa" else int(num_train_trials), sample_train=False ) return RunSpec( - name=f"lm_opinions:survey={survey_type},num_logprobs={num_logprobs},context={context},num_train_trials={num_train_trials}", + name=f"opinions_qa:survey={survey_type},num_logprobs={num_logprobs},context={context},num_train_trials={num_train_trials}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, metric_specs=get_exact_match_metric_specs(), - groups=["lm-opinions"], + groups=["opinions-qa"], ) @@ -1889,7 +1889,7 @@ def get_lm_opinions_spec( "med_paragraph_simplification": get_med_paragraph_simplification_spec, "med_qa": get_med_qa_spec, "pubmed_qa": get_pubmed_qa_spec, - "lm_opinions": get_lm_opinions_spec, + "opinions_qa": get_opinions_qa_spec, } diff --git a/src/helm/benchmark/scenarios/opinions_qa_scenario.py b/src/helm/benchmark/scenarios/opinions_qa_scenario.py new file mode 100644 index 0000000000..7087baa409 --- /dev/null +++ b/src/helm/benchmark/scenarios/opinions_qa_scenario.py @@ -0,0 +1,172 @@ +import json +import os +import string +from typing import Dict, List +import pandas as pd +from typing import List, Dict, Optional + +from helm.common.general import shell, ensure_directory_exists +from .scenario import ( + Scenario, + Instance, + Reference, + TRAIN_SPLIT, + TEST_SPLIT, + CORRECT_TAG, + PassageQuestionInput, + Input, + Output, +) + + +class OpinionsQAScenario(Scenario): + """ + The OpinionsQAScenario dataset is from Santurkar et al. (2023). + + OpinionsQA is a QA dataset containing 1484 multiple-choice questions. Since the questions are inherently + subjective, there isn't a single ground truth response. Instead, the object of interest is how + the distribution of model responses compares to those obtained from human survey participants as + disccused in [TODO GITHUB]. + + As discussed in Santurkar et al., we consider prompting an LM: + 1. Without any context (zero-shot) to evaluate the "default" opinions reflected + by it. + 2. With context containing information pertaining to the group we want to steer the model towards. + This context is either formatted as a question-answer pair (QA) or a textual description (BIO/PORTRAY). + + + We prompt models using the following format + + + + Question: + A. + B. + C. + D. + Answer: + + For example, we have: + + Question: In politics today, do you consider yourself a + A. Republican + B. Democrat + C. Independent + D. Something else E. Refused + Answer: B + + Question: How much, if at all, do you think the ease with which people can legally obtain guns contributes + to gun violence in the country today? + A. A great deal + B. A fair amount + C. Not too much + D. Not at all + E. Refused + Answer: + + + """ + + name = "opinions-qa" + description = "Subjective questions dataset based on Pew American Trends Panel opinion polls." 
+ tags = ["multiple_choice"] + + """ Information needed to download the dataset """ + CODALAB_URI_TEMPLATE: str = "https://worksheets.codalab.org/rest/bundles/{bundle}/contents/blob/model_input/{filename}" + CODALAB_BUNDLE: str = "0xd938ca73ca0a49bf97db4fbf8e706faa" + FILE_NAME: str = "Pew_American_Trends_Panel_W{wave}.csv" + PEW_SURVEY_WAVES: list = [26, 27, 29, 32, 34, 36, 41, 42, 43, 45, 49, 50, 54, 82, 92] + + def __init__(self, survey_type: str, context: str): + super().__init__() + assert context in ['default', "steer-qa", "steer-bio", "steer-portray"] + + self.survey_type: str = survey_type + self.context: str = context + + def download_data(self): + + DOWNLOAD_FILENAMES = [self.FILE_NAME.format(wave=wave) for wave in self.PEW_SURVEY_WAVES] + DOWNLOAD_FILENAMES += [f"{steer}.csv" for steer in ["steer-qa", "steer-bio", "steer-portray"]] + + for filename in DOWNLOAD_FILENAMES: + data_path: str = os.path.join(self.output_path, filename) + source_url: str = self.CODALAB_URI_TEMPLATE.format(bundle=self.CODALAB_BUNDLE, + filename=filename) + if not os.path.exists(data_path): + shell(['wget', source_url, "--no-check-certificate", "-O", data_path]) + + def read_survey_questions(self, csv_path): + df = pd.read_csv(csv_path, sep="\t") + df["options"] = df.apply(lambda x: eval(x["options"]), axis=1) + return df + + + def get_instances(self) -> List[Instance]: + + self.output_path: str = os.path.join(self.output_path, "data") + if not os.path.exists(self.output_path): + os.makedirs(self.output_path) + + self.download_data() + + # Read all the instances + instances: List[Instance] = [] + splits: Dict[str, str] = { + "dev": TRAIN_SPLIT, + "test": TEST_SPLIT, + } + + + all_splits = ["dev", "test"] if self.context == "steer-qa" else ["test"] + csv_dict = {"dev": os.path.join(self.output_path, f"{self.context}.csv"), + "test": os.path.join(self.output_path, f"{self.survey_type}.csv")} + + bios_df = None + if self.context in ["steer-bio", "steer-portray"]: + bios_path = os.path.join(data_path, f"{self.context}.csv") + bios_df = pd.read_csv(bios_path, sep="\t") + + for split in all_splits: + + csv_path: str = csv_dict[split] + assert os.path.exists(csv_path) + + question_df = self.read_survey_questions(csv_path) + + for qidx, (question, answers) in enumerate(zip(question_df["question"], + question_df["options"])): + + prefixes = list(string.ascii_uppercase) + assert len(prefixes) >= len(answers) + answers_dict = dict(zip(prefixes, answers)) + + # Opinions QA test questions have no correct answer. However, since the HELM codebase requires a + # correct answer to be associated with each instance, we set it to be the first reference. + # Note that this is never used in the analysis. + # In the case where context = steer-qa, we add demographic information in the form of a + # question answer pair as shown in the example above. 
+ + correct_answer = answers[0] if split == "test" else question_df["correct"][qidx] + + def answer_to_reference(answer: str) -> Reference: + return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else []) + + if bios_df is None: + # context = "default"/"steer-qa" + instance = Instance(Input(text=question), + references=list(map(answer_to_reference, answers)), + split=splits[split], + ) + else: + # context = "steer-bio"/"steer-portray" + for bio in bios_df['question'].values: + + context = PassageQuestionInput(passage=bio, question=question+'\n') + instance = Instance(context, + references=list(map(answer_to_reference, answers)), + split=splits[split], + ) + instances.append(instance) + + return instances \ No newline at end of file From 4d8201284bb157d020af16d3e937cd3ebd8f8451 Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Sun, 19 Mar 2023 17:13:42 -0700 Subject: [PATCH 04/17] Opinions QA --- .../run_specs_lm_opinions_ai21_steer-bio.conf | 5 + .../run_specs_opinions_qa_ai21_default.conf | 50 ++++++ ..._specs_opinions_qa_ai21_steer-portray.conf | 5 + .../run_specs_opinions_qa_ai21_steer-qa.conf | 5 + ...run_specs_opinions_qa_openai_default.conf} | 6 + ...un_specs_opinions_qa_openai_steer-bio.conf | 8 + ...pecs_opinions_qa_openai_steer-portray.conf | 8 + ...run_specs_opinions_qa_openai_steer-qa.conf | 8 + .../scenarios/lm_opinions_scenario.py | 158 ------------------ .../scenarios/opinions_qa_scenario.py | 9 +- 10 files changed, 101 insertions(+), 161 deletions(-) create mode 100644 src/helm/benchmark/presentation/run_specs_lm_opinions_ai21_steer-bio.conf create mode 100644 src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_default.conf create mode 100644 src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-portray.conf create mode 100644 src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-qa.conf rename src/helm/benchmark/presentation/{run_specs_lm_opinions_openai_default.conf => run_specs_opinions_qa_openai_default.conf} (93%) create mode 100644 src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-bio.conf create mode 100644 src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-portray.conf create mode 100644 src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-qa.conf delete mode 100644 src/helm/benchmark/scenarios/lm_opinions_scenario.py diff --git a/src/helm/benchmark/presentation/run_specs_lm_opinions_ai21_steer-bio.conf b/src/helm/benchmark/presentation/run_specs_lm_opinions_ai21_steer-bio.conf new file mode 100644 index 0000000000..23c79ea25e --- /dev/null +++ b/src/helm/benchmark/presentation/run_specs_lm_opinions_ai21_steer-bio.conf @@ -0,0 +1,5 @@ +entries: [ +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_default.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_default.conf new file mode 100644 index 0000000000..c4cb950a7d --- /dev/null +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_default.conf 
@@ -0,0 +1,50 @@ +entries: [ +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-portray.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-portray.conf new file mode 100644 index 0000000000..fff482d113 --- /dev/null +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-portray.conf @@ -0,0 +1,5 @@ +entries: [ +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-qa.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-qa.conf new file mode 100644 index 0000000000..f124c0be3b --- /dev/null +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-qa.conf @@ -0,0 +1,5 @@ +entries: [ +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=22", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=10,model=ai21/j1-grande,num_train_trials=22", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=22", priority: 1} +] diff --git a/src/helm/benchmark/presentation/run_specs_lm_opinions_openai_default.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_default.conf similarity index 93% rename from src/helm/benchmark/presentation/run_specs_lm_opinions_openai_default.conf rename to src/helm/benchmark/presentation/run_specs_opinions_qa_openai_default.conf index 9ee230bffc..56decc9db1 100644 --- a/src/helm/benchmark/presentation/run_specs_lm_opinions_openai_default.conf +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_default.conf @@ -1,4 +1,10 @@ entries: [ +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} {description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} {description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} {description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-bio.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-bio.conf new file mode 100644 index 0000000000..a0de7ec55b --- /dev/null +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-bio.conf @@ -0,0 +1,8 @@ +entries: [ +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-portray.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-portray.conf new file mode 100644 index 0000000000..2c78a0bf85 --- /dev/null +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-portray.conf @@ -0,0 +1,8 @@ +entries: [ +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} +{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-qa.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-qa.conf new file mode 100644 index 0000000000..7ae0fb3fd0 --- /dev/null +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-qa.conf @@ -0,0 +1,8 @@ +entries: [ +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=openai/ada,num_train_trials=22", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=openai/davinci,num_train_trials=22", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=openai/text-ada-001,num_train_trials=22", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=22", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=22", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=22", priority: 1} +] diff --git a/src/helm/benchmark/scenarios/lm_opinions_scenario.py b/src/helm/benchmark/scenarios/lm_opinions_scenario.py deleted file mode 100644 index 219c247fea..0000000000 --- a/src/helm/benchmark/scenarios/lm_opinions_scenario.py +++ /dev/null @@ -1,158 +0,0 @@ -import json -import os -import string -from typing import Dict, List -import pandas as pd -from typing import List, Dict, Optional - -from helm.common.general import ensure_file_downloaded, ensure_directory_exists -from .scenario import ( - Scenario, - Instance, - Reference, - TRAIN_SPLIT, - TEST_SPLIT, - CORRECT_TAG, - PassageQuestionInput, - Input, - Output, -) - - -class LMOpinionsScenario(Scenario): - """ - The LMOpinions dataset is from the paper: - [TODO ARXIV] - - - LM Opinions is a QA dataset containing 1484 multiple-choice questions. Since the questions are inherently - subjective, there isn't a single ground truth response. Instead, the object of interest is how - the distribution of model responses compares to those obtained from human survey participants as - disccused in [TODO GITHUB]. - - As discussed in Santurkar et al., we consider prompting an LM: - 1. Without any context (zero-shot) to evaluate the "default" opinions reflected - by it. - 2. With context containing information pertaining to the group we want to steer the model towards. - This context is either formatted as a question-answer pair (QA) or a textual description (BIO/PORTRAY). - - - We prompt models using the following format - - - - Question: - A. - B. - C. - D. 
- Answer: - - For example, we have: - - Question: In politics today, do you consider yourself a - A. Republican - B. Democrat - C. Independent - D. Something else E. Refused - Answer: B - - Question: How much, if at all, do you think the ease with which people can legally obtain guns contributes - to gun violence in the country today? - A. A great deal - B. A fair amount - C. Not too much - D. Not at all - E. Refused - Answer: - - - """ - - name = "lm-opinions" - description = "Subjective questions dataset based on Pew American Trends Panel opinion polls." - tags = ["multiple_choice"] - - def __init__(self, survey_type: str, context: str): - super().__init__() - assert context in ['default', "steer-qa", "steer-bio", "steer-portray"] - - self.survey_type: str = survey_type - self.context: str = context - - def read_survey_questions(self, csv_path): - df = pd.read_csv(csv_path, sep="\t") - df["options"] = df.apply(lambda x: eval(x["options"]), axis=1) - return df - - - def get_instances(self) -> List[Instance]: - # Download the raw data - data_path: str = os.path.join(self.output_path, "data") - # ensure_file_downloaded( - # source_url="https://people.eecs.berkeley.edu/~hendrycks/data.tar", - # target_path=data_path, - # unpack=True, - # unpack_type="untar", - # ) - - # Read all the instances - instances: List[Instance] = [] - splits: Dict[str, str] = { - "dev": TRAIN_SPLIT, - "test": TEST_SPLIT, - } - - - all_splits = ["dev", "test"] if self.context == "steer-qa" else ["test"] - csv_dict = {"dev": os.path.join(data_path, f"{self.context}.csv"), - "test": os.path.join(data_path, f"{self.survey_type}.csv")} - - bios_df = None - if self.context in ["steer-bio", "steer-portray"]: - bios_path = os.path.join(data_path, f"{self.context}.csv") - bios_df = pd.read_csv(bios_path, sep="\t") - - for split in all_splits: - - csv_path: str = csv_dict[split] - assert os.path.exists(csv_path) - - question_df = self.read_survey_questions(csv_path) - - for qidx, (question, answers) in enumerate(zip(question_df["question"], - question_df["options"])): - - prefixes = list(string.ascii_uppercase) - assert len(prefixes) >= len(answers) - answers_dict = dict(zip(prefixes, answers)) - - # LM Opinions test questions have no correct answer. However, since the HELM codebase requires a - # correct answer to be associated with each instance, we set it to be the first reference. - # Note that this is never used in the analysis. - # In the case where context = steer-qa, we add demographic information in the form of a - # question answer pair as shown in the example above. 
- - correct_answer = answers[0] if split == "test" else question_df["correct"][qidx] - - def answer_to_reference(answer: str) -> Reference: - return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else []) - - if bios_df is None: - # context = "default"/"steer-qa" - instance = Instance(Input(text=question), - references=list(map(answer_to_reference, answers)), - split=splits[split], - ) - else: - # context = "steer-bio"/"steer-portray" - for bio in bios_df['question'].values: - - context = PassageQuestionInput(passage=bio, question=question+'\n') - instance = Instance(context, - references=list(map(answer_to_reference, answers)), - split=splits[split], - ) - instances.append(instance) - - return instances \ No newline at end of file diff --git a/src/helm/benchmark/scenarios/opinions_qa_scenario.py b/src/helm/benchmark/scenarios/opinions_qa_scenario.py index 7087baa409..02fc902b67 100644 --- a/src/helm/benchmark/scenarios/opinions_qa_scenario.py +++ b/src/helm/benchmark/scenarios/opinions_qa_scenario.py @@ -73,7 +73,7 @@ class OpinionsQAScenario(Scenario): """ Information needed to download the dataset """ CODALAB_URI_TEMPLATE: str = "https://worksheets.codalab.org/rest/bundles/{bundle}/contents/blob/model_input/{filename}" - CODALAB_BUNDLE: str = "0xd938ca73ca0a49bf97db4fbf8e706faa" + CODALAB_BUNDLE: str = "0xa6f81cc62d7d4ccb93031a72d2043669" FILE_NAME: str = "Pew_American_Trends_Panel_W{wave}.csv" PEW_SURVEY_WAVES: list = [26, 27, 29, 32, 34, 36, 41, 42, 43, 45, 49, 50, 54, 82, 92] @@ -88,9 +88,11 @@ def download_data(self): DOWNLOAD_FILENAMES = [self.FILE_NAME.format(wave=wave) for wave in self.PEW_SURVEY_WAVES] DOWNLOAD_FILENAMES += [f"{steer}.csv" for steer in ["steer-qa", "steer-bio", "steer-portray"]] + DOWNLOAD_FILENAMES += ['Pew_American_Trends_Panel_disagreement_500.csv'] for filename in DOWNLOAD_FILENAMES: data_path: str = os.path.join(self.output_path, filename) + source_url: str = self.CODALAB_URI_TEMPLATE.format(bundle=self.CODALAB_BUNDLE, filename=filename) if not os.path.exists(data_path): @@ -124,7 +126,7 @@ def get_instances(self) -> List[Instance]: bios_df = None if self.context in ["steer-bio", "steer-portray"]: - bios_path = os.path.join(data_path, f"{self.context}.csv") + bios_path = os.path.join(self.output_path, f"{self.context}.csv") bios_df = pd.read_csv(bios_path, sep="\t") for split in all_splits: @@ -158,6 +160,7 @@ def answer_to_reference(answer: str) -> Reference: references=list(map(answer_to_reference, answers)), split=splits[split], ) + instances.append(instance) else: # context = "steer-bio"/"steer-portray" for bio in bios_df['question'].values: @@ -167,6 +170,6 @@ def answer_to_reference(answer: str) -> Reference: references=list(map(answer_to_reference, answers)), split=splits[split], ) - instances.append(instance) + instances.append(instance) return instances \ No newline at end of file From af959e4ab4da8fa55a86baabfdd4f70bca9bbee5 Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Sun, 19 Mar 2023 17:16:26 -0700 Subject: [PATCH 05/17] Minor --- src/helm/benchmark/scenarios/opinions_qa_scenario.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/helm/benchmark/scenarios/opinions_qa_scenario.py b/src/helm/benchmark/scenarios/opinions_qa_scenario.py index 02fc902b67..fbf380b62f 100644 --- a/src/helm/benchmark/scenarios/opinions_qa_scenario.py +++ b/src/helm/benchmark/scenarios/opinions_qa_scenario.py @@ -21,12 +21,12 @@ class OpinionsQAScenario(Scenario): """ - The OpinionsQAScenario dataset is 
from Santurkar et al. (2023). + The OpinionsQAScenario dataset is from the paper "Whose Opinions Do Language Models Reflect?" + [Santurkar et al., 2023]. OpinionsQA is a QA dataset containing 1484 multiple-choice questions. Since the questions are inherently subjective, there isn't a single ground truth response. Instead, the object of interest is how - the distribution of model responses compares to those obtained from human survey participants as - disccused in [TODO GITHUB]. + the distribution of model responses compares to those obtained from human survey participants. As discussed in Santurkar et al., we consider prompting an LM: 1. Without any context (zero-shot) to evaluate the "default" opinions reflected From 1bd98e0d8e17dc311d5d9f851276ebfe2395e233 Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Sun, 19 Mar 2023 17:37:58 -0700 Subject: [PATCH 06/17] Minor reformatting --- src/helm/benchmark/adaptation/adapter_spec.py | 2 +- .../adapters/in_context_learning_adapter.py | 10 +- src/helm/benchmark/run_specs.py | 42 +++++--- .../scenarios/opinions_qa_scenario.py | 100 +++++++++--------- 4 files changed, 85 insertions(+), 69 deletions(-) diff --git a/src/helm/benchmark/adaptation/adapter_spec.py b/src/helm/benchmark/adaptation/adapter_spec.py index 41cc11e9e4..d54c7477e2 100644 --- a/src/helm/benchmark/adaptation/adapter_spec.py +++ b/src/helm/benchmark/adaptation/adapter_spec.py @@ -67,7 +67,7 @@ class AdapterSpec: # Number of trials, where in each trial we choose an independent, random # set of training instances. Used to compute error bars. num_train_trials: int = 1 - + # Sample train examples or use deterministic sample_train: bool = True diff --git a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py index f71439884e..3c53f3e923 100644 --- a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py @@ -65,8 +65,9 @@ def _adapt_trial_index( parallelism: int, ) -> List[RequestState]: self.train_trial_index: int = train_trial_index - self.train_instances: List[Instance] = self.sample_examples(all_train_instances, seed=train_trial_index, - sample_train=self.adapter_spec.sample_train) + self.train_instances: List[Instance] = self.sample_examples( + all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train + ) hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.") # Generate request_states @@ -94,8 +95,7 @@ def _adapt_trial_index( return [request_state for result in results for request_state in result] - def sample_examples(self, all_train_instances: List[Instance], seed: int, - sample_train: bool) -> List[Instance]: + def sample_examples(self, all_train_instances: List[Instance], seed: int, sample_train: bool) -> List[Instance]: """ Sample a random set of train instances to use as examples by following the steps below: 1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the @@ -125,7 +125,7 @@ class labels. 
unlabeled_instances: List[Instance] = [] label_to_instances: Dict[str, List[Instance]] = defaultdict(list) - + if not sample_train: # Sample sequentially from the train set examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)] diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py index e607eceabf..986e73a1f9 100644 --- a/src/helm/benchmark/run_specs.py +++ b/src/helm/benchmark/run_specs.py @@ -47,8 +47,15 @@ def format_instructions(instructions: str) -> str: def get_multiple_choice_joint_adapter_spec( - instructions: str, input_noun: Optional[str], output_noun: str, num_outputs: int = 5, - max_train_instances: int = 5, max_tokens: int=5, sample_train=True, **kwargs) -> AdapterSpec: + instructions: str, + input_noun: Optional[str], + output_noun: str, + num_outputs: int = 5, + max_train_instances: int = 5, + max_tokens: int = 5, + sample_train=True, + **kwargs, +) -> AdapterSpec: """ [instructions] @@ -64,7 +71,7 @@ def get_multiple_choice_joint_adapter_spec( [reference_k] [output_noun]: """ - + return AdapterSpec( method=ADAPT_MULTIPLE_CHOICE_JOINT, instructions=format_instructions(instructions), @@ -123,9 +130,14 @@ def get_multiple_choice_adapter_spec( """ if method == ADAPT_MULTIPLE_CHOICE_JOINT: return get_multiple_choice_joint_adapter_spec( - instructions, input_noun, output_noun, max_train_instances=max_train_instances, - num_outputs=num_outputs, max_tokens=max_tokens, - sample_train=sample_train, **kwargs + instructions, + input_noun, + output_noun, + max_train_instances=max_train_instances, + num_outputs=num_outputs, + max_tokens=max_tokens, + sample_train=sample_train, + **kwargs, ) elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}: return get_multiple_choice_separate_adapter_spec(method, empty_input) @@ -1498,6 +1510,7 @@ def get_entity_data_imputation_spec(dataset: str) -> RunSpec: groups=["entity_data_imputation"], ) + @htrack("Extracting adaptation parameters from the BIG-bench task definition and building the RunSpec") def get_big_bench_spec(task: str, subtask: str) -> RunSpec: def get_adaptation_method(big_bench_metrics: List[str]) -> str: @@ -1799,13 +1812,14 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec groups=["wmt_14"], ) + def get_opinions_qa_spec( - survey_type: str, - num_logprobs: str, - context: str = "None", - num_train_trials: str = "1", - method: str = ADAPT_MULTIPLE_CHOICE_JOINT, - ) -> RunSpec: + survey_type: str, + num_logprobs: str, + context: str = "None", + num_train_trials: str = "1", + method: str = ADAPT_MULTIPLE_CHOICE_JOINT, +) -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.opinions_qa_scenario.OpinionsQAScenario", args={"survey_type": survey_type, "context": context}, @@ -1816,11 +1830,11 @@ def get_opinions_qa_spec( instructions="", input_noun="Question", output_noun="Answer", - max_train_instances=1 if 'steer' in context else 0, + max_train_instances=1 if "steer" in context else 0, max_tokens=1, num_outputs=int(num_logprobs), num_train_trials=1 if context != "steer-qa" else int(num_train_trials), - sample_train=False + sample_train=False, ) return RunSpec( diff --git a/src/helm/benchmark/scenarios/opinions_qa_scenario.py b/src/helm/benchmark/scenarios/opinions_qa_scenario.py index fbf380b62f..4ecc0902cf 100644 --- a/src/helm/benchmark/scenarios/opinions_qa_scenario.py +++ b/src/helm/benchmark/scenarios/opinions_qa_scenario.py @@ -24,22 +24,22 @@ class 
OpinionsQAScenario(Scenario): The OpinionsQAScenario dataset is from the paper "Whose Opinions Do Language Models Reflect?" [Santurkar et al., 2023]. - OpinionsQA is a QA dataset containing 1484 multiple-choice questions. Since the questions are inherently - subjective, there isn't a single ground truth response. Instead, the object of interest is how + OpinionsQA is a QA dataset containing 1484 multiple-choice questions. Since the questions are inherently + subjective, there isn't a single ground truth response. Instead, the object of interest is how the distribution of model responses compares to those obtained from human survey participants. - + As discussed in Santurkar et al., we consider prompting an LM: - 1. Without any context (zero-shot) to evaluate the "default" opinions reflected + 1. Without any context (zero-shot) to evaluate the "default" opinions reflected by it. 2. With context containing information pertaining to the group we want to steer the model towards. This context is either formatted as a question-answer pair (QA) or a textual description (BIO/PORTRAY). - - + + We prompt models using the following format - + - Question: + Question: A. B. C. @@ -47,15 +47,15 @@ class OpinionsQAScenario(Scenario): Answer: For example, we have: - - Question: In politics today, do you consider yourself a + + Question: In politics today, do you consider yourself a A. Republican B. Democrat C. Independent - D. Something else E. Refused + D. Something else E. Refused Answer: B - Question: How much, if at all, do you think the ease with which people can legally obtain guns contributes + Question: How much, if at all, do you think the ease with which people can legally obtain guns contributes to gun violence in the country today? A. A great deal B. A fair amount @@ -63,92 +63,92 @@ class OpinionsQAScenario(Scenario): D. Not at all E. Refused Answer: - - + + """ name = "opinions-qa" description = "Subjective questions dataset based on Pew American Trends Panel opinion polls." 
tags = ["multiple_choice"] - + """ Information needed to download the dataset """ - CODALAB_URI_TEMPLATE: str = "https://worksheets.codalab.org/rest/bundles/{bundle}/contents/blob/model_input/{filename}" + CODALAB_URI_TEMPLATE: str = ( + "https://worksheets.codalab.org/rest/bundles/{bundle}/contents/blob/model_input/{filename}" + ) CODALAB_BUNDLE: str = "0xa6f81cc62d7d4ccb93031a72d2043669" FILE_NAME: str = "Pew_American_Trends_Panel_W{wave}.csv" PEW_SURVEY_WAVES: list = [26, 27, 29, 32, 34, 36, 41, 42, 43, 45, 49, 50, 54, 82, 92] def __init__(self, survey_type: str, context: str): super().__init__() - assert context in ['default', "steer-qa", "steer-bio", "steer-portray"] - + assert context in ["default", "steer-qa", "steer-bio", "steer-portray"] + self.survey_type: str = survey_type self.context: str = context - + def download_data(self): - + DOWNLOAD_FILENAMES = [self.FILE_NAME.format(wave=wave) for wave in self.PEW_SURVEY_WAVES] DOWNLOAD_FILENAMES += [f"{steer}.csv" for steer in ["steer-qa", "steer-bio", "steer-portray"]] - DOWNLOAD_FILENAMES += ['Pew_American_Trends_Panel_disagreement_500.csv'] - + DOWNLOAD_FILENAMES += ["Pew_American_Trends_Panel_disagreement_500.csv"] + for filename in DOWNLOAD_FILENAMES: data_path: str = os.path.join(self.output_path, filename) - - source_url: str = self.CODALAB_URI_TEMPLATE.format(bundle=self.CODALAB_BUNDLE, - filename=filename) + + source_url: str = self.CODALAB_URI_TEMPLATE.format(bundle=self.CODALAB_BUNDLE, filename=filename) if not os.path.exists(data_path): - shell(['wget', source_url, "--no-check-certificate", "-O", data_path]) - + shell(["wget", source_url, "--no-check-certificate", "-O", data_path]) + def read_survey_questions(self, csv_path): df = pd.read_csv(csv_path, sep="\t") df["options"] = df.apply(lambda x: eval(x["options"]), axis=1) return df - - def get_instances(self) -> List[Instance]: - + def get_instances(self) -> List[Instance]: + self.output_path: str = os.path.join(self.output_path, "data") - if not os.path.exists(self.output_path): + if not os.path.exists(self.output_path): os.makedirs(self.output_path) - + self.download_data() - + # Read all the instances instances: List[Instance] = [] splits: Dict[str, str] = { "dev": TRAIN_SPLIT, "test": TEST_SPLIT, } - all_splits = ["dev", "test"] if self.context == "steer-qa" else ["test"] - csv_dict = {"dev": os.path.join(self.output_path, f"{self.context}.csv"), - "test": os.path.join(self.output_path, f"{self.survey_type}.csv")} - + csv_dict = { + "dev": os.path.join(self.output_path, f"{self.context}.csv"), + "test": os.path.join(self.output_path, f"{self.survey_type}.csv"), + } + bios_df = None if self.context in ["steer-bio", "steer-portray"]: bios_path = os.path.join(self.output_path, f"{self.context}.csv") - bios_df = pd.read_csv(bios_path, sep="\t") - + bios_df = pd.read_csv(bios_path, sep="\t") + for split in all_splits: csv_path: str = csv_dict[split] assert os.path.exists(csv_path) - + question_df = self.read_survey_questions(csv_path) - for qidx, (question, answers) in enumerate(zip(question_df["question"], - question_df["options"])): + for qidx, (question, answers) in enumerate(zip(question_df["question"], question_df["options"])): prefixes = list(string.ascii_uppercase) assert len(prefixes) >= len(answers) answers_dict = dict(zip(prefixes, answers)) - + # Opinions QA test questions have no correct answer. However, since the HELM codebase requires a # correct answer to be associated with each instance, we set it to be the first reference. 
# Note that this is never used in the analysis. # In the case where context = steer-qa, we add demographic information in the form of a # question answer pair as shown in the example above. - + correct_answer = answers[0] if split == "test" else question_df["correct"][qidx] def answer_to_reference(answer: str) -> Reference: @@ -156,20 +156,22 @@ def answer_to_reference(answer: str) -> Reference: if bios_df is None: # context = "default"/"steer-qa" - instance = Instance(Input(text=question), + instance = Instance( + Input(text=question), references=list(map(answer_to_reference, answers)), split=splits[split], ) instances.append(instance) else: # context = "steer-bio"/"steer-portray" - for bio in bios_df['question'].values: + for bio in bios_df["question"].values: - context = PassageQuestionInput(passage=bio, question=question+'\n') - instance = Instance(context, + context = PassageQuestionInput(passage=bio, question=question + "\n") + instance = Instance( + context, references=list(map(answer_to_reference, answers)), split=splits[split], ) instances.append(instance) - return instances \ No newline at end of file + return instances From d88e993c2f42a9fd8780d58a6e6aa3c31983218c Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Sun, 19 Mar 2023 17:51:04 -0700 Subject: [PATCH 07/17] Minor formatting --- src/helm/benchmark/run_specs.py | 3 ++- src/helm/benchmark/scenarios/opinions_qa_scenario.py | 11 ++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py index 986e73a1f9..29a686947c 100644 --- a/src/helm/benchmark/run_specs.py +++ b/src/helm/benchmark/run_specs.py @@ -1838,7 +1838,8 @@ def get_opinions_qa_spec( ) return RunSpec( - name=f"opinions_qa:survey={survey_type},num_logprobs={num_logprobs},context={context},num_train_trials={num_train_trials}", + name=f"opinions_qa:survey={survey_type},num_logprobs={num_logprobs}" + + f",context={context},num_train_trials={num_train_trials}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, metric_specs=get_exact_match_metric_specs(), diff --git a/src/helm/benchmark/scenarios/opinions_qa_scenario.py b/src/helm/benchmark/scenarios/opinions_qa_scenario.py index 4ecc0902cf..dbaf271366 100644 --- a/src/helm/benchmark/scenarios/opinions_qa_scenario.py +++ b/src/helm/benchmark/scenarios/opinions_qa_scenario.py @@ -1,11 +1,8 @@ -import json import os -import string -from typing import Dict, List import pandas as pd -from typing import List, Dict, Optional +from typing import List, Dict -from helm.common.general import shell, ensure_directory_exists +from helm.common.general import shell from .scenario import ( Scenario, Instance, @@ -139,10 +136,6 @@ def get_instances(self) -> List[Instance]: for qidx, (question, answers) in enumerate(zip(question_df["question"], question_df["options"])): - prefixes = list(string.ascii_uppercase) - assert len(prefixes) >= len(answers) - answers_dict = dict(zip(prefixes, answers)) - # Opinions QA test questions have no correct answer. However, since the HELM codebase requires a # correct answer to be associated with each instance, we set it to be the first reference. # Note that this is never used in the analysis. 
From d856c4978e11c3f0d58d66b11e4bca4c542ce13f Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Sun, 19 Mar 2023 19:25:57 -0700 Subject: [PATCH 08/17] Minor --- .../adaptation/adapters/in_context_learning_adapter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py index 3c53f3e923..0ddda04e4f 100644 --- a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py @@ -125,7 +125,7 @@ class labels. unlabeled_instances: List[Instance] = [] label_to_instances: Dict[str, List[Instance]] = defaultdict(list) - + examples: List[Instance] = [] if not sample_train: # Sample sequentially from the train set examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)] @@ -152,7 +152,6 @@ class labels. sorted_labels.extend(labels) labels_iterable = cycle(sorted_labels) - examples: List[Instance] = [] while num_instances_to_sample > 0: next_label: Optional[str] = next(labels_iterable, None) if not next_label: From f29fd78c3706a79285f2622cd76ab66f2c3d7757 Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Sat, 25 Mar 2023 12:45:50 -0700 Subject: [PATCH 09/17] Percy's comments --- src/helm/benchmark/adaptation/adapter_spec.py | 2 +- .../run_specs_lm_opinions_ai21_steer-bio.conf | 5 - .../run_specs_opinions_qa_ai21_default.conf | 64 +++------- ..._specs_opinions_qa_ai21_steer-portray.conf | 5 - .../run_specs_opinions_qa_ai21_steer-qa.conf | 5 - .../run_specs_opinions_qa_ai21_steer.conf | 5 + .../run_specs_opinions_qa_openai_default.conf | 112 +++--------------- ...un_specs_opinions_qa_openai_steer-bio.conf | 8 -- ...pecs_opinions_qa_openai_steer-portray.conf | 8 -- ...run_specs_opinions_qa_openai_steer-qa.conf | 8 -- .../run_specs_opinions_qa_openai_steer.conf | 5 + src/helm/benchmark/run_expander.py | 2 + src/helm/benchmark/run_specs.py | 4 +- .../scenarios/opinions_qa_scenario.py | 33 +++--- src/helm/common/general.py | 5 +- 15 files changed, 67 insertions(+), 204 deletions(-) delete mode 100644 src/helm/benchmark/presentation/run_specs_lm_opinions_ai21_steer-bio.conf delete mode 100644 src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-portray.conf delete mode 100644 src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-qa.conf create mode 100644 src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer.conf delete mode 100644 src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-bio.conf delete mode 100644 src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-portray.conf delete mode 100644 src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-qa.conf create mode 100644 src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer.conf diff --git a/src/helm/benchmark/adaptation/adapter_spec.py b/src/helm/benchmark/adaptation/adapter_spec.py index d54c7477e2..3540db26bb 100644 --- a/src/helm/benchmark/adaptation/adapter_spec.py +++ b/src/helm/benchmark/adaptation/adapter_spec.py @@ -68,7 +68,7 @@ class AdapterSpec: # set of training instances. Used to compute error bars. 
num_train_trials: int = 1 - # Sample train examples or use deterministic + # Randomly sample train examples or use them in order sample_train: bool = True # Decoding parameters (inherited by `Request`) diff --git a/src/helm/benchmark/presentation/run_specs_lm_opinions_ai21_steer-bio.conf b/src/helm/benchmark/presentation/run_specs_lm_opinions_ai21_steer-bio.conf deleted file mode 100644 index 23c79ea25e..0000000000 --- a/src/helm/benchmark/presentation/run_specs_lm_opinions_ai21_steer-bio.conf +++ /dev/null @@ -1,5 +0,0 @@ -entries: [ -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_default.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_default.conf index c4cb950a7d..47e63a8477 100644 --- a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_default.conf +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_default.conf @@ -1,50 +1,18 @@ entries: [ -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} 
-{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} ] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-portray.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-portray.conf deleted file mode 100644 index fff482d113..0000000000 --- a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-portray.conf +++ /dev/null @@ -1,5 +0,0 @@ -entries: [ -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=10,model=ai21/j1-grande,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=1", priority: 1} -] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-qa.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-qa.conf deleted file mode 100644 index f124c0be3b..0000000000 --- a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer-qa.conf +++ /dev/null @@ -1,5 +0,0 @@ -entries: [ -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=10,model=ai21/j1-jumbo,num_train_trials=22", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=10,model=ai21/j1-grande,num_train_trials=22", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=10,model=ai21/j1-grande-v2-beta,num_train_trials=22", priority: 1} -] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer.conf new file mode 100644 index 0000000000..de1562b5f7 --- /dev/null +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer.conf @@ -0,0 +1,5 @@ +entries: [ +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=22", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_default.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_default.conf index 56decc9db1..0ddd5d1b08 100644 --- a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_default.conf +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_default.conf @@ -1,98 +1,18 @@ entries: [ -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", 
priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", 
priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} ] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-bio.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-bio.conf deleted file mode 100644 index a0de7ec55b..0000000000 --- a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-bio.conf +++ /dev/null @@ -1,8 +0,0 @@ -entries: [ -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=openai/davinci,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-portray.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-portray.conf deleted file mode 100644 index 2c78a0bf85..0000000000 --- a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-portray.conf +++ /dev/null @@ -1,8 +0,0 @@ -entries: [ -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=openai/ada,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=openai/davinci,num_train_trials=1", 
priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=openai/text-ada-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=1", priority: 1} -] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-qa.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-qa.conf deleted file mode 100644 index 7ae0fb3fd0..0000000000 --- a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer-qa.conf +++ /dev/null @@ -1,8 +0,0 @@ -entries: [ -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=openai/ada,num_train_trials=22", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=openai/davinci,num_train_trials=22", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=openai/text-ada-001,num_train_trials=22", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=openai/text-davinci-001,num_train_trials=22", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=openai/text-davinci-002,num_train_trials=22", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=openai/text-davinci-003,num_train_trials=22", priority: 1} -] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer.conf new file mode 100644 index 0000000000..b3f0ed6354 --- /dev/null +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer.conf @@ -0,0 +1,5 @@ +entries: [ +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=opinions_qa/openai,num_train_trials=22", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +] diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py index 21cebbf515..50483db0ad 100644 --- a/src/helm/benchmark/run_expander.py +++ b/src/helm/benchmark/run_expander.py @@ -319,6 +319,8 @@ class ModelRunExpander(ReplaceValueRunExpander): "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"], "biomedical": ["openai/text-davinci-003"], # TODO: add https://huggingface.co/stanford-crfm/BioMedLM "interactive_qa": 
["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"], + "opinions_qa/openai": ["openai/ada", "openai/davinci", "openai/text-ada-001", "openai/text-davinci-001", "openai/text-davinci-002", "openai/text-davinci-003"], + "opinions_qa/ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "j1-grande-v2-beta"], } # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text") diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py index 29a686947c..8d34cf2188 100644 --- a/src/helm/benchmark/run_specs.py +++ b/src/helm/benchmark/run_specs.py @@ -1842,8 +1842,8 @@ def get_opinions_qa_spec( + f",context={context},num_train_trials={num_train_trials}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs(), - groups=["opinions-qa"], + metric_specs=[], + groups=["opinions_qa"], ) diff --git a/src/helm/benchmark/scenarios/opinions_qa_scenario.py b/src/helm/benchmark/scenarios/opinions_qa_scenario.py index dbaf271366..4f9d005a99 100644 --- a/src/helm/benchmark/scenarios/opinions_qa_scenario.py +++ b/src/helm/benchmark/scenarios/opinions_qa_scenario.py @@ -2,7 +2,8 @@ import pandas as pd from typing import List, Dict -from helm.common.general import shell +from helm.common.general import shell, ensure_file_downloaded + from .scenario import ( Scenario, Instance, @@ -49,7 +50,8 @@ class OpinionsQAScenario(Scenario): A. Republican B. Democrat C. Independent - D. Something else E. Refused + D. Something else + E. Refused Answer: B Question: How much, if at all, do you think the ease with which people can legally obtain guns contributes @@ -64,7 +66,7 @@ class OpinionsQAScenario(Scenario): """ - name = "opinions-qa" + name = "opinions_qa" description = "Subjective questions dataset based on Pew American Trends Panel opinion polls." tags = ["multiple_choice"] @@ -85,6 +87,10 @@ def __init__(self, survey_type: str, context: str): def download_data(self): + self.output_path: str = os.path.join(self.output_path, "data") + if not os.path.exists(self.output_path): + os.makedirs(self.output_path) + DOWNLOAD_FILENAMES = [self.FILE_NAME.format(wave=wave) for wave in self.PEW_SURVEY_WAVES] DOWNLOAD_FILENAMES += [f"{steer}.csv" for steer in ["steer-qa", "steer-bio", "steer-portray"]] DOWNLOAD_FILENAMES += ["Pew_American_Trends_Panel_disagreement_500.csv"] @@ -93,8 +99,7 @@ def download_data(self): data_path: str = os.path.join(self.output_path, filename) source_url: str = self.CODALAB_URI_TEMPLATE.format(bundle=self.CODALAB_BUNDLE, filename=filename) - if not os.path.exists(data_path): - shell(["wget", source_url, "--no-check-certificate", "-O", data_path]) + ensure_file_downloaded(source_url=source_url, target_path=data_path, downloader_executable="gdown") def read_survey_questions(self, csv_path): df = pd.read_csv(csv_path, sep="\t") @@ -102,11 +107,6 @@ def read_survey_questions(self, csv_path): return df def get_instances(self) -> List[Instance]: - - self.output_path: str = os.path.join(self.output_path, "data") - if not os.path.exists(self.output_path): - os.makedirs(self.output_path) - self.download_data() # Read all the instances @@ -136,16 +136,17 @@ def get_instances(self) -> List[Instance]: for qidx, (question, answers) in enumerate(zip(question_df["question"], question_df["options"])): - # Opinions QA test questions have no correct answer. However, since the HELM codebase requires a - # correct answer to be associated with each instance, we set it to be the first reference. 
- # Note that this is never used in the analysis. + # Opinions QA test questions have no correct answer and thus we set it to be None by default + # for all test instances. # In the case where context = steer-qa, we add demographic information in the form of a - # question answer pair as shown in the example above. + # in-context question answer pair as shown in the example above. - correct_answer = answers[0] if split == "test" else question_df["correct"][qidx] + correct_answer = None if split == "test" else question_df["correct"][qidx] def answer_to_reference(answer: str) -> Reference: - return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else []) + return Reference(Output(text=answer), + tags=[CORRECT_TAG] if (answer == correct_answer and split != 'test') \ + else []) if bios_df is None: # context = "default"/"steer-qa" diff --git a/src/helm/common/general.py b/src/helm/common/general.py index 099565c5ba..e1801afbab 100644 --- a/src/helm/common/general.py +++ b/src/helm/common/general.py @@ -49,7 +49,7 @@ def shell(args: List[str]): @htrack(None) -def ensure_file_downloaded(source_url: str, target_path: str, unpack: bool = False, unpack_type: Optional[str] = None): +def ensure_file_downloaded(source_url: str, target_path: str, unpack: bool = False, unpack_type: Optional[str] = None, downloader_executable: Optional[str] = "wget"): """Download `source_url` to `target_path` if it doesn't exist.""" if os.path.exists(target_path): # Assume it's all good @@ -59,7 +59,8 @@ def ensure_file_downloaded(source_url: str, target_path: str, unpack: bool = Fal # Download # gdown is used to download large files/zip folders from Google Drive. # It bypasses security warnings which wget cannot handle. - downloader_executable: str = "gdown" if source_url.startswith("https://drive.google.com") else "wget" + if source_url.startswith("https://drive.google.com"): + downloader_executable = "gdown" tmp_path: str = f"{target_path}.tmp" shell([downloader_executable, source_url, "-O", tmp_path]) From 1f688b581da999b138dc76368b296fdde21fc5ad Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Sat, 25 Mar 2023 12:55:00 -0700 Subject: [PATCH 10/17] Address Percy's comments --- src/helm/benchmark/run_expander.py | 9 ++++++++- src/helm/benchmark/scenarios/opinions_qa_scenario.py | 11 ++++++----- src/helm/common/general.py | 10 ++++++++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py index 50483db0ad..45294b92fe 100644 --- a/src/helm/benchmark/run_expander.py +++ b/src/helm/benchmark/run_expander.py @@ -319,7 +319,14 @@ class ModelRunExpander(ReplaceValueRunExpander): "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"], "biomedical": ["openai/text-davinci-003"], # TODO: add https://huggingface.co/stanford-crfm/BioMedLM "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"], - "opinions_qa/openai": ["openai/ada", "openai/davinci", "openai/text-ada-001", "openai/text-davinci-001", "openai/text-davinci-002", "openai/text-davinci-003"], + "opinions_qa/openai": [ + "openai/ada", + "openai/davinci", + "openai/text-ada-001", + "openai/text-davinci-001", + "openai/text-davinci-002", + "openai/text-davinci-003", + ], "opinions_qa/ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "j1-grande-v2-beta"], } diff --git a/src/helm/benchmark/scenarios/opinions_qa_scenario.py 
b/src/helm/benchmark/scenarios/opinions_qa_scenario.py index 4f9d005a99..6c40ebcdc5 100644 --- a/src/helm/benchmark/scenarios/opinions_qa_scenario.py +++ b/src/helm/benchmark/scenarios/opinions_qa_scenario.py @@ -2,7 +2,7 @@ import pandas as pd from typing import List, Dict -from helm.common.general import shell, ensure_file_downloaded +from helm.common.general import ensure_file_downloaded from .scenario import ( Scenario, @@ -137,16 +137,17 @@ def get_instances(self) -> List[Instance]: for qidx, (question, answers) in enumerate(zip(question_df["question"], question_df["options"])): # Opinions QA test questions have no correct answer and thus we set it to be None by default - # for all test instances. + # for all test instances. # In the case where context = steer-qa, we add demographic information in the form of a # in-context question answer pair as shown in the example above. correct_answer = None if split == "test" else question_df["correct"][qidx] def answer_to_reference(answer: str) -> Reference: - return Reference(Output(text=answer), - tags=[CORRECT_TAG] if (answer == correct_answer and split != 'test') \ - else []) + return Reference( + Output(text=answer), + tags=[CORRECT_TAG] if (answer == correct_answer and split != "test") else [], + ) if bios_df is None: # context = "default"/"steer-qa" diff --git a/src/helm/common/general.py b/src/helm/common/general.py index e1801afbab..ce3cf56955 100644 --- a/src/helm/common/general.py +++ b/src/helm/common/general.py @@ -49,7 +49,13 @@ def shell(args: List[str]): @htrack(None) -def ensure_file_downloaded(source_url: str, target_path: str, unpack: bool = False, unpack_type: Optional[str] = None, downloader_executable: Optional[str] = "wget"): +def ensure_file_downloaded( + source_url: str, + target_path: str, + unpack: bool = False, + downloader_executable: str = "wget", + unpack_type: Optional[str] = None +): """Download `source_url` to `target_path` if it doesn't exist.""" if os.path.exists(target_path): # Assume it's all good @@ -60,7 +66,7 @@ def ensure_file_downloaded(source_url: str, target_path: str, unpack: bool = Fal # gdown is used to download large files/zip folders from Google Drive. # It bypasses security warnings which wget cannot handle. 
if source_url.startswith("https://drive.google.com"): - downloader_executable = "gdown" + downloader_executable = "gdown" tmp_path: str = f"{target_path}.tmp" shell([downloader_executable, source_url, "-O", tmp_path]) From 7ad8ad2dd8b76f658bdaf3ceae74e8425af57a57 Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Sat, 25 Mar 2023 12:59:34 -0700 Subject: [PATCH 11/17] Address Percy's comments --- src/helm/common/general.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/common/general.py b/src/helm/common/general.py index ce3cf56955..f6a2d4c60d 100644 --- a/src/helm/common/general.py +++ b/src/helm/common/general.py @@ -54,7 +54,7 @@ def ensure_file_downloaded( target_path: str, unpack: bool = False, downloader_executable: str = "wget", - unpack_type: Optional[str] = None + unpack_type: Optional[str] = None, ): """Download `source_url` to `target_path` if it doesn't exist.""" if os.path.exists(target_path): From 348474f5edd3b5c52b103831b1bd03d8720c593c Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Sat, 25 Mar 2023 13:35:27 -0700 Subject: [PATCH 12/17] Minor --- .../adaptation/adapters/test_multiple_choice_joint_adapter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py b/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py index ce2851d6d8..036c819aab 100644 --- a/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py @@ -6,7 +6,9 @@ class TestMultipleChoiceJointAdapter(TestAdapter): def test_sample_examples(self): - adapter_spec = AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=4) + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=4, sample_train=True + ) adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service) all_train_instances = [ Instance(Input(text="say no"), references=[Reference(Output(text="no"), tags=[CORRECT_TAG])]), From 2b4cd2e4fc6193dbe75147030793036f5c0b0ab2 Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Sat, 25 Mar 2023 13:58:24 -0700 Subject: [PATCH 13/17] Minor --- .../adaptation/adapters/in_context_learning_adapter.py | 4 +++- .../adaptation/adapters/test_multiple_choice_joint_adapter.py | 4 +--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py index 0ddda04e4f..4b356849cd 100644 --- a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py @@ -95,7 +95,9 @@ def _adapt_trial_index( return [request_state for result in results for request_state in result] - def sample_examples(self, all_train_instances: List[Instance], seed: int, sample_train: bool) -> List[Instance]: + def sample_examples( + self, all_train_instances: List[Instance], seed: int, sample_train: Optional[bool] = True + ) -> List[Instance]: """ Sample a random set of train instances to use as examples by following the steps below: 1. 
Sort the class labels (i.e., correct References) by the number of Instances that belong to the diff --git a/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py b/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py index 036c819aab..ce2851d6d8 100644 --- a/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py @@ -6,9 +6,7 @@ class TestMultipleChoiceJointAdapter(TestAdapter): def test_sample_examples(self): - adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=4, sample_train=True - ) + adapter_spec = AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=4) adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service) all_train_instances = [ Instance(Input(text="say no"), references=[Reference(Output(text="no"), tags=[CORRECT_TAG])]), From d50e512316ea09137a82409a77c09aa1eb75b5bd Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Sat, 25 Mar 2023 14:32:31 -0700 Subject: [PATCH 14/17] Minor --- .../adaptation/adapters/in_context_learning_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py index 4b356849cd..eb38a71edd 100644 --- a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py @@ -96,7 +96,7 @@ def _adapt_trial_index( return [request_state for result in results for request_state in result] def sample_examples( - self, all_train_instances: List[Instance], seed: int, sample_train: Optional[bool] = True + self, all_train_instances: List[Instance], seed: int, sample_train: bool = True ) -> List[Instance]: """ Sample a random set of train instances to use as examples by following the steps below: From e91e50e77c8237be489196d129a9bb973f3ca8cf Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Sat, 25 Mar 2023 15:03:22 -0700 Subject: [PATCH 15/17] Minor --- src/helm/benchmark/run_expander.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py index 45294b92fe..60871dbb69 100644 --- a/src/helm/benchmark/run_expander.py +++ b/src/helm/benchmark/run_expander.py @@ -327,7 +327,7 @@ class ModelRunExpander(ReplaceValueRunExpander): "openai/text-davinci-002", "openai/text-davinci-003", ], - "opinions_qa/ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "j1-grande-v2-beta"], + "opinions_qa/ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"], } # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text") From 284ec585a2862ac019e5b16e0e576481c924303e Mon Sep 17 00:00:00 2001 From: Shibani Santurkar Date: Mon, 27 Mar 2023 19:25:44 -0700 Subject: [PATCH 16/17] PR comments v2 --- src/helm/benchmark/adaptation/adapter_spec.py | 2 +- .../adapters/in_context_learning_adapter.py | 6 +-- .../run_specs_opinions_qa_ai21_default.conf | 34 +++++++------- .../run_specs_opinions_qa_ai21_steer.conf | 8 ++-- .../run_specs_opinions_qa_openai_default.conf | 34 +++++++------- .../run_specs_opinions_qa_openai_steer.conf | 8 ++-- src/helm/benchmark/run_expander.py | 4 +- src/helm/benchmark/run_specs.py | 2 +- .../scenarios/opinions_qa_scenario.py | 46 ++++++++++++++----- 9 
files changed, 87 insertions(+), 57 deletions(-) diff --git a/src/helm/benchmark/adaptation/adapter_spec.py b/src/helm/benchmark/adaptation/adapter_spec.py index 3540db26bb..adab7860e3 100644 --- a/src/helm/benchmark/adaptation/adapter_spec.py +++ b/src/helm/benchmark/adaptation/adapter_spec.py @@ -68,7 +68,7 @@ class AdapterSpec: # set of training instances. Used to compute error bars. num_train_trials: int = 1 - # Randomly sample train examples or use them in order + # If true, randomly sample N training examples; if false, select the first N training examples sample_train: bool = True # Decoding parameters (inherited by `Request`) diff --git a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py index eb38a71edd..98cd6470fe 100644 --- a/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/in_context_learning_adapter.py @@ -125,14 +125,14 @@ class labels. random.seed(seed) num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances) - unlabeled_instances: List[Instance] = [] - label_to_instances: Dict[str, List[Instance]] = defaultdict(list) examples: List[Instance] = [] if not sample_train: - # Sample sequentially from the train set + # Select sequentially from the train set examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)] return examples + unlabeled_instances: List[Instance] = [] + label_to_instances: Dict[str, List[Instance]] = defaultdict(list) for instance in all_train_instances: if instance.first_correct_reference: label_to_instances[instance.first_correct_reference.output.text].append(instance) diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_default.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_default.conf index 47e63a8477..b53d4cde37 100644 --- a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_default.conf +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_default.conf @@ -1,18 +1,20 @@ +# RunSpecs for obtaining default opinion distributions (i.e., zero-shot and without additional context) for AI21 Labs models for the paper "Whose Opinions Do Language Models Reflect?" by Santurkar et al. (2023). 
+ entries: [ -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} ] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer.conf index de1562b5f7..1895069f50 100644 --- a/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer.conf +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_ai21_steer.conf @@ -1,5 +1,7 @@ +# RunSpecs for obtaining steered opinion distributions (i.e., with additional context containing demographic group information) for AI21 Labs models for the paper "Whose Opinions Do Language Models Reflect?" by Santurkar et al. (2023). 
+ entries: [ -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=22", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=10,model=opinions_qa/ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=22", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1} ] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_default.conf b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_default.conf index 0ddd5d1b08..7a19ba76f2 100644 --- a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_default.conf +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_default.conf @@ -1,18 +1,20 @@ +# RunSpecs for obtaining default opinion distributions (i.e., zero-shot and without additional context) for OpenAI models for the paper "Whose Opinions Do Language Models Reflect?" by Santurkar et al. (2023). + entries: [ -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: 
"opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} ] diff --git a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer.conf 
b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer.conf index b3f0ed6354..761dd40f8c 100644 --- a/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer.conf +++ b/src/helm/benchmark/presentation/run_specs_opinions_qa_openai_steer.conf @@ -1,5 +1,7 @@ +# RunSpecs for obtaining steered opinion distributions (i.e., with additional context containing demographic group information) models for the paper "Whose Opinions Do Language Models Reflect?" by Santurkar et al. (2023). + entries: [ -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=opinions_qa/openai,num_train_trials=22", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} -{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=opinions_qa/openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=opinions_qa_openai,num_train_trials=22", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} +{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1} ] diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py index 60871dbb69..2491e0dd0c 100644 --- a/src/helm/benchmark/run_expander.py +++ b/src/helm/benchmark/run_expander.py @@ -319,7 +319,7 @@ class ModelRunExpander(ReplaceValueRunExpander): "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"], "biomedical": ["openai/text-davinci-003"], # TODO: add https://huggingface.co/stanford-crfm/BioMedLM "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"], - "opinions_qa/openai": [ + "opinions_qa_openai": [ "openai/ada", "openai/davinci", "openai/text-ada-001", @@ -327,7 +327,7 @@ class ModelRunExpander(ReplaceValueRunExpander): "openai/text-davinci-002", "openai/text-davinci-003", ], - "opinions_qa/ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"], + "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"], } # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text") diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py index 8d34cf2188..e7e6e95669 100644 --- a/src/helm/benchmark/run_specs.py +++ b/src/helm/benchmark/run_specs.py @@ -53,7 +53,7 @@ def get_multiple_choice_joint_adapter_spec( num_outputs: int = 5, max_train_instances: int = 5, max_tokens: int = 5, - sample_train=True, + sample_train: bool = True, **kwargs, ) -> AdapterSpec: """ diff --git a/src/helm/benchmark/scenarios/opinions_qa_scenario.py b/src/helm/benchmark/scenarios/opinions_qa_scenario.py index 6c40ebcdc5..0f8c6d3398 100644 --- a/src/helm/benchmark/scenarios/opinions_qa_scenario.py +++ b/src/helm/benchmark/scenarios/opinions_qa_scenario.py @@ -29,8 +29,9 @@ class OpinionsQAScenario(Scenario): As discussed in Santurkar et al., we consider prompting an LM: 1. 
Without any context (zero-shot) to evaluate the "default" opinions reflected by it.
-    2. With context containing information pertaining to the group we want to steer the model towards.
-       This context is either formatted as a question-answer pair (QA) or a textual description (BIO/PORTRAY).
+    2. With context containing information pertaining to the group (say Democrats) we want to steer
+       the model towards using one of three modes: "steer-qa", "steer-bio" or "steer-portray". See the
+       example below for an illustration of the difference between the three steering modes.

     We prompt models using the following format

@@ -44,15 +45,9 @@ class OpinionsQAScenario(Scenario):
         D.

     Answer:

-    For example, we have:
+    For instance:

-    Question: In politics today, do you consider yourself a
-       A. Republican
-       B. Democrat
-       C. Independent
-       D. Something else
-       E. Refused
-    Answer: B
+
     Question: How much, if at all, do you think the ease with which people can legally obtain guns contributes

@@ -64,6 +59,33 @@ class OpinionsQAScenario(Scenario):

     Answer:

+    In the example above, the <context> varies based on the choice of the "context" variable as follows:
+
+    1. If context="default", <context> is not used and the model is prompted with the question directly.
+
+    2. If context="steer-qa", the group information (to steer the model towards) is passed in a QA format, e.g.:
+
+        Question: In politics today, do you consider yourself a
+        A. Republican
+        B. Democrat
+        C. Independent
+        D. Something else
+        E. Refused
+        Answer: B
+
+    3. If context="steer-bio", the group information is passed to the model as a free-text response
+       to a prompt asking for demographic information, e.g.:
+
+       Below you will be asked to provide a short description of your political affiliation and
+       then answer some questions.
+
+       Description: In politics today, I consider myself a Democrat.
+
+    4. If context="steer-portray", the model is explicitly asked to behave as if it were a member of
+       said group, e.g.:
+
+       Answer the following question as if in politics today, you considered yourself a Democrat.
+
     """

     name = "opinions_qa"
@@ -150,7 +172,7 @@ def answer_to_reference(answer: str) -> Reference:
                 )

             if bios_df is None:
-                # context = "default"/"steer-qa"
+                # context = "default" or "steer-qa"
                 instance = Instance(
                     Input(text=question),
                     references=list(map(answer_to_reference, answers)),
@@ -158,7 +180,7 @@ def answer_to_reference(answer: str) -> Reference:
                 )
                 instances.append(instance)
             else:
-                # context = "steer-bio"/"steer-portray"
+                # context = "steer-bio" or "steer-portray"
                 for bio in bios_df["question"].values:
                     context = PassageQuestionInput(passage=bio, question=question + "\n")

From 68330ae3dc89ee34f052cf86bc72c4ddddeeaaa0 Mon Sep 17 00:00:00 2001
From: Shibani Santurkar
Date: Mon, 27 Mar 2023 21:44:22 -0700
Subject: [PATCH 17/17] Minor wording

---
 src/helm/benchmark/adaptation/adapter_spec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/adaptation/adapter_spec.py b/src/helm/benchmark/adaptation/adapter_spec.py
index adab7860e3..fcbb87b22d 100644
--- a/src/helm/benchmark/adaptation/adapter_spec.py
+++ b/src/helm/benchmark/adaptation/adapter_spec.py
@@ -68,7 +68,7 @@ class AdapterSpec:
 # set of training instances. Used to compute error bars.
num_train_trials: int = 1 - # If true, randomly sample N training examples; if false, select the first N training examples + # If true, randomly sample N training examples; if false, select N consecutive training examples sample_train: bool = True # Decoding parameters (inherited by `Request`)
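The sample_train flag toggled above selects between two ways of picking in-context examples. A minimal standalone sketch of the idea (illustrative only, not the HELM adapter; the function name pick_in_context_examples is hypothetical): when sample_train is false, trial number seed takes the next consecutive block of max_train_instances examples, mirroring the slice train[n * seed : n * (seed + 1)] used in sample_examples above; when true, examples come from a seeded random draw (the real adapter additionally balances the draw across class labels).

import random
from typing import List


def pick_in_context_examples(
    train: List[str], max_train_instances: int, seed: int, sample_train: bool
) -> List[str]:
    # Illustrative stand-in for the adapter's example selection, not the HELM implementation.
    if not sample_train:
        # Deterministic: trial `seed` gets the next consecutive block of examples.
        start = max_train_instances * seed
        return train[start : start + max_train_instances]
    # Otherwise draw a seeded random sample (HELM additionally balances across class labels).
    rng = random.Random(seed)
    return rng.sample(train, min(max_train_instances, len(train)))


pool = [f"example-{i}" for i in range(10)]
print(pick_in_context_examples(pool, 3, seed=1, sample_train=False))  # ['example-3', 'example-4', 'example-5']
print(pick_in_context_examples(pool, 3, seed=1, sample_train=True))   # 3 randomly chosen examples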
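The prompt formats documented in the OpinionsQAScenario docstring differ only in whether a context block is prepended. A rough sketch of the "default" versus "steer-qa" assembly (illustrative only; build_prompt is not a HELM function, and the survey question and option strings below are placeholders, except for the steer-qa demographic example taken from the docstring):

from typing import List

STEER_QA_CONTEXT = (
    "Question: In politics today, do you consider yourself a\n"
    "A. Republican\n"
    "B. Democrat\n"
    "C. Independent\n"
    "D. Something else\n"
    "E. Refused\n"
    "Answer: B\n\n"
)


def build_prompt(question: str, options: List[str], context: str = "default") -> str:
    lines = [f"Question: {question}"]
    lines += [f"{letter}. {option}" for letter, option in zip("ABCDE", options)]
    lines.append("Answer:")
    body = "\n".join(lines)
    # "steer-qa" prepends an answered demographic question; "default" asks the question alone.
    return STEER_QA_CONTEXT + body if context == "steer-qa" else body


print(build_prompt("Placeholder survey question?", ["Option 1", "Option 2", "Option 3"], context="steer-qa"))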
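The scenario's download step relies on the downloader_executable parameter added to ensure_file_downloaded above. A hedged usage sketch with the signature from this patch (the URL and local path here are placeholders; the scenario builds the real ones from CODALAB_URI_TEMPLATE and FILE_NAME):

from helm.common.general import ensure_file_downloaded

# Placeholder URL and target path, not the real CodaLab bundle address.
ensure_file_downloaded(
    source_url="https://example.org/Pew_American_Trends_Panel_W92.csv",
    target_path="data/Pew_American_Trends_Panel_W92.csv",
    downloader_executable="gdown",
)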