OpinionsQA #1424

Merged · 17 commits · Mar 30, 2023
2 changes: 2 additions & 0 deletions src/helm/benchmark/__init__.py
@@ -42,6 +42,8 @@
from .scenarios import entity_matching_scenario # noqa
from .scenarios import entity_data_imputation_scenario # noqa
from .scenarios import big_bench_scenario # noqa
from .scenarios import opinions_qa_scenario # noqa


# Biomedical
from .scenarios import covid_dialog_scenario # noqa
3 changes: 3 additions & 0 deletions src/helm/benchmark/adaptation/adapter_spec.py
@@ -68,6 +68,9 @@ class AdapterSpec:
# set of training instances. Used to compute error bars.
num_train_trials: int = 1

# If true, randomly sample N training examples; if false, select the first N training examples
sample_train: bool = True

# Decoding parameters (inherited by `Request`)

# Model to make the request to (need to fill in)
@@ -65,7 +65,9 @@ def _adapt_trial_index(
parallelism: int,
) -> List[RequestState]:
self.train_trial_index: int = train_trial_index
self.train_instances: List[Instance] = self.sample_examples(all_train_instances, seed=train_trial_index)
self.train_instances: List[Instance] = self.sample_examples(
all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train
)
hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.")

# Generate request_states
@@ -93,7 +95,9 @@ def _adapt_trial_index(

return [request_state for result in results for request_state in result]

def sample_examples(self, all_train_instances: List[Instance], seed: int) -> List[Instance]:
def sample_examples(
self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
) -> List[Instance]:
"""
Sample a random set of train instances to use as examples by following the steps below:
1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the
@@ -121,9 +125,14 @@ class labels.
random.seed(seed)
num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances)

examples: List[Instance] = []
if not sample_train:
# Select sequentially from the train set
examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)]
return examples

unlabeled_instances: List[Instance] = []
label_to_instances: Dict[str, List[Instance]] = defaultdict(list)

for instance in all_train_instances:
if instance.first_correct_reference:
label_to_instances[instance.first_correct_reference.output.text].append(instance)
@@ -145,7 +154,6 @@ class labels.
sorted_labels.extend(labels)

labels_iterable = cycle(sorted_labels)
examples: List[Instance] = []
while num_instances_to_sample > 0:
next_label: Optional[str] = next(labels_iterable, None)
if not next_label:
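Note: the following is a minimal, illustrative sketch (not part of the diff) of what the new sample_train=False path in sample_examples does. Names are invented for the example; the real method works on Instance objects and takes the window size from adapter_spec.max_train_instances.

from typing import List

def select_sequential(all_train: List[str], num_to_sample: int, seed: int) -> List[str]:
    # Trial `seed` gets the seed-th consecutive window of size `num_to_sample`,
    # so successive trials see disjoint, in-order slices of the train set.
    return all_train[num_to_sample * seed : num_to_sample * (seed + 1)]

train = [f"q{i}" for i in range(6)]
assert select_sequential(train, 2, seed=0) == ["q0", "q1"]
assert select_sequential(train, 2, seed=1) == ["q2", "q3"]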
@@ -0,0 +1,20 @@
# RunSpecs for obtaining default opinion distributions (i.e., zero-shot and without additional context) for AI21 Labs models for the paper "Whose Opinions Do Language Models Reflect?" by Santurkar et al. (2023).

entries: [
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
]
@@ -0,0 +1,7 @@
# RunSpecs for obtaining steered opinion distributions (i.e., with additional context containing demographic group information) for AI21 Labs models for the paper "Whose Opinions Do Language Models Reflect?" by Santurkar et al. (2023).

entries: [
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=22", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=10,model=opinions_qa_ai21,num_train_trials=1", priority: 1}
]
@@ -0,0 +1,20 @@
# RunSpecs for obtaining default opinion distributions (i.e., zero-shot and without additional context) for OpenAI models for the paper "Whose Opinions Do Language Models Reflect?" by Santurkar et al. (2023).

entries: [
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W27,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W29,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W32,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W34,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W36,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W41,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W42,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W43,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W45,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W49,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W50,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W54,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W82,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_W92,context=default,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
]
@@ -0,0 +1,7 @@
# RunSpecs for obtaining steered opinion distributions (i.e., with additional context containing demographic group information) for OpenAI models for the paper "Whose Opinions Do Language Models Reflect?" by Santurkar et al. (2023).

entries: [
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-qa,num_logprobs=100,model=opinions_qa_openai,num_train_trials=22", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-bio,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
{description: "opinions_qa:survey_type=Pew_American_Trends_Panel_disagreement_500,context=steer-portray,num_logprobs=100,model=opinions_qa_openai,num_train_trials=1", priority: 1}
]
9 changes: 9 additions & 0 deletions src/helm/benchmark/run_expander.py
@@ -319,6 +319,15 @@ class ModelRunExpander(ReplaceValueRunExpander):
"summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
"biomedical": ["openai/text-davinci-003"], # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
"interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
"opinions_qa_openai": [
"openai/ada",
"openai/davinci",
"openai/text-ada-001",
"openai/text-davinci-001",
"openai/text-davinci-002",
"openai/text-davinci-003",
],
"opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
}

# For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
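Note: opinions_qa_openai and opinions_qa_ai21 above are model groups. A run entry with model=opinions_qa_openai is expanded into one run per model in the group. The sketch below is a simplified, string-level illustration of that fan-out (HELM's actual ModelRunExpander operates on RunSpec objects, not description strings).

OPINIONS_QA_OPENAI = [
    "openai/ada",
    "openai/davinci",
    "openai/text-ada-001",
    "openai/text-davinci-001",
    "openai/text-davinci-002",
    "openai/text-davinci-003",
]

def expand_model_group(description: str, models: list) -> list:
    # One concrete run description per model in the group (illustrative only).
    return [description.replace("model=opinions_qa_openai", f"model={m}") for m in models]

entry = (
    "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,"
    "num_logprobs=100,model=opinions_qa_openai,num_train_trials=1"
)
assert len(expand_model_group(entry, OPINIONS_QA_OPENAI)) == 6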
63 changes: 59 additions & 4 deletions src/helm/benchmark/run_specs.py
@@ -47,7 +47,14 @@ def format_instructions(instructions: str) -> str:


def get_multiple_choice_joint_adapter_spec(
instructions: str, input_noun: Optional[str], output_noun: str, max_train_instances: int = 5, **kwargs
instructions: str,
input_noun: Optional[str],
output_noun: str,
num_outputs: int = 5,
max_train_instances: int = 5,
max_tokens: int = 5,
sample_train: bool = True,
**kwargs,
) -> AdapterSpec:
"""
[instructions]
@@ -64,6 +71,7 @@ def get_multiple_choice_joint_adapter_spec(
[reference_k]
[output_noun]:
"""

return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
instructions=format_instructions(instructions),
@@ -72,10 +80,11 @@
output_prefix=f"{output_noun}: ",
output_suffix="\n",
max_train_instances=max_train_instances,
num_outputs=1,
max_tokens=5,
num_outputs=num_outputs,
max_tokens=max_tokens,
temperature=0.0,
stop_sequences=["\n"],
sample_train=sample_train,
**kwargs,
)
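Note: with input_noun="Question" and output_noun="Answer", the joint multiple-choice adapter produces prompts roughly like the sketch below. The survey question and answer options are invented for illustration; the exact spacing follows the input/output prefixes and suffixes configured above.

# Roughly what a zero-shot joint multiple-choice prompt looks like (illustrative only).
example_prompt = (
    "Question: How much, if at all, do you worry about the automation of jobs?\n"
    "A. A great deal\n"
    "B. A fair amount\n"
    "C. Not too much\n"
    "D. Not at all\n"
    "E. Refused\n"
    "Answer:"
)
print(example_prompt)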

@@ -109,15 +118,26 @@ def get_multiple_choice_adapter_spec(
input_noun: Optional[str],
output_noun: str,
max_train_instances: int = 5,
num_outputs: int = 5,
max_tokens: int = 5,
empty_input: bool = False,
sample_train: bool = True,
**kwargs,
):

"""
Toggle between joint and separate adapters.
"""
if method == ADAPT_MULTIPLE_CHOICE_JOINT:
return get_multiple_choice_joint_adapter_spec(
instructions, input_noun, output_noun, max_train_instances, **kwargs
instructions,
input_noun,
output_noun,
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=max_tokens,
sample_train=sample_train,
**kwargs,
)
elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
return get_multiple_choice_separate_adapter_spec(method, empty_input)
@@ -1793,6 +1813,40 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec
)


def get_opinions_qa_spec(
survey_type: str,
num_logprobs: str,
context: str = "None",
num_train_trials: str = "1",
Reviewer comment (Contributor): I think you can get rid of this, because we have a RunExpander that allows you to set num_train_trials.

method: str = ADAPT_MULTIPLE_CHOICE_JOINT,
) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.opinions_qa_scenario.OpinionsQAScenario",
args={"survey_type": survey_type, "context": context},
)

adapter_spec = get_multiple_choice_adapter_spec(
method=method,
instructions="",
input_noun="Question",
output_noun="Answer",
max_train_instances=1 if "steer" in context else 0,
max_tokens=1,
num_outputs=int(num_logprobs),
num_train_trials=1 if context != "steer-qa" else int(num_train_trials),
sample_train=False,
)

return RunSpec(
name=f"opinions_qa:survey={survey_type},num_logprobs={num_logprobs}"
+ f",context={context},num_train_trials={num_train_trials}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=[],
groups=["opinions_qa"],
)


############################################################

CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {
Expand Down Expand Up @@ -1850,6 +1904,7 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec
"med_paragraph_simplification": get_med_paragraph_simplification_spec,
"med_qa": get_med_qa_spec,
"pubmed_qa": get_pubmed_qa_spec,
"opinions_qa": get_opinions_qa_spec,
}
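Note: the new "opinions_qa" key maps the scenario name used in the run_entries files above to get_opinions_qa_spec. The sketch below is a simplified illustration of that dispatch, assuming it runs inside run_specs.py; in the real code, run-expander keys such as model= and num_train_trials= are handled separately before the run spec function is called.

description = "opinions_qa:survey_type=Pew_American_Trends_Panel_W26,context=default,num_logprobs=100"
name, args_str = description.split(":", 1)
args = dict(kv.split("=", 1) for kv in args_str.split(","))
# Equivalent to get_opinions_qa_spec(survey_type="Pew_American_Trends_Panel_W26",
# context="default", num_logprobs="100")
run_spec = CANONICAL_RUN_SPEC_FUNCS[name](**args)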

