Add FinanceBench scenario (#2798)
yifanmai authored Jul 19, 2024
1 parent 3c00f11 commit a3d3396
Showing 6 changed files with 276 additions and 0 deletions.
79 changes: 79 additions & 0 deletions src/helm/benchmark/annotation/financebench_annotator.py
@@ -0,0 +1,79 @@
import json
from json.decoder import JSONDecodeError
from typing import Any

from helm.benchmark.scenarios.scenario import CORRECT_TAG
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.annotator import Annotator
from helm.clients.auto_client import AutoClient
from helm.common.request import Request


class FinanceBenchAnnotator(Annotator):
    """Annotator for FinanceBench that uses GPT-4o to judge whether the model response is correct."""

    name = "financebench"
    _PROMPT_TEMPLATE = """Classify the model's response as one of three categories: "correct_answer", "incorrect_answer", or "failure_to_answer". Additionally, provide a short, one-sentence explanation for your classification.
Categories:
correct_answer: Allow minor deviations, such as giving the answer in billions when the unit was given in the question as millions.
incorrect_answer: This includes calculations that are off by small margins to several orders of magnitude, and from making up legal information to giving the wrong direction for an effect (e.g. reporting negative growth when it is actually positive). If a model gives the right answer but with logic or calculations that explicitly contradict the evidence in the gold standard answer, label it as incorrect_answer.
failure_to_answer: If the model explicitly states that it cannot answer because it does not have access to the right information then it is a failure to answer.
Question: {{QUESTION}}
Gold answer: {{GOLD_ANSWER}}
Model's response: {{MODEL_RESPONSE}}
Respond with only a raw JSON object in the following format, without using Markdown formatting:
{"explanation": "<one sentence explanation>", "label": "<category>"}
"""  # noqa: E501

    def __init__(self, auto_client: AutoClient, file_storage_path: str):
        super().__init__()
        self._auto_client = auto_client

    def annotate(self, request_state: RequestState) -> Any:
        assert request_state.result
        assert len(request_state.result.completions) == 1
        assert len(request_state.instance.references[0].tags) == 1
        assert request_state.instance.references[0].tags[0] == CORRECT_TAG
        question = request_state.instance.input.text.split("\nQuestion: ")[-1].strip()
        gold_answer = request_state.instance.references[0].output.text.strip()
        model_response = request_state.result.completions[0].text.strip()
        if not model_response.strip():
            return {"reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "label": "failure_to_answer"}
        annotator_prompt = (
            FinanceBenchAnnotator._PROMPT_TEMPLATE.replace("{{QUESTION}}", question)
            .replace("{{GOLD_ANSWER}}", gold_answer)
            .replace("{{MODEL_RESPONSE}}", model_response)
        )
        annotator_request = Request(
            model="openai/gpt-4o-2024-05-13",
            model_deployment="openai/gpt-4o-2024-05-13",
            prompt=annotator_prompt,
            temperature=0.0,
            max_tokens=64,
        )
        annotator_response = self._auto_client.make_request(annotator_request)
        if not annotator_response.success:
            raise Exception(f"Annotation request failed: {annotator_response.error}")
        assert len(annotator_response.completions) == 1
        annotator_response_text = annotator_response.completions[0].text
        # OpenAI models often surround JSON objects with ```json and ``` Markdown formatting.
        # This strips everything outside the outermost {} brackets.
        json_start_index = annotator_response_text.find("{")
        json_end_index = annotator_response_text.rfind("}")
        if json_start_index < 0 or json_end_index < 0:
            raise Exception(f"Malformed annotator response: {annotator_response_text}")
        annotator_response_json = annotator_response_text[json_start_index : json_end_index + 1]
        try:
            annotator_response_parsed = json.loads(annotator_response_json)
        except JSONDecodeError:
            raise Exception(f"Malformed annotator response: {annotator_response_text}")
        return annotator_response_parsed
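
A small sketch of the post-processing step above: the raw judge completion is hypothetical, but the brace-stripping and the expected dict shape follow the code in annotate.

import json

# Hypothetical raw completion from the judge model, wrapped in Markdown fences
# the way GPT-4o often formats JSON output.
raw_completion = '```json\n{"explanation": "The response matches the gold figure.", "label": "correct_answer"}\n```'

# Strip everything outside the outermost {} pair, as the annotator does.
start, end = raw_completion.find("{"), raw_completion.rfind("}")
annotation = json.loads(raw_completion[start : end + 1])
assert annotation == {
    "explanation": "The response matches the gold figure.",
    "label": "correct_answer",
}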
49 changes: 49 additions & 0 deletions src/helm/benchmark/metrics/annotation_metrics.py
@@ -0,0 +1,49 @@
from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


class AnnotationLabelMetric(Metric):
    """Binary metric for labels produced by annotators.
    Expects the annotation with the given annotator name and key to be a string label.
    For each label in the list of possible labels, produces a corresponding stat
    with a value of 1 or 0 indicating whether that label matches the actual label
    in the annotation."""

    def __init__(self, annotator_name: str, key: str, labels: List[str]):
        super().__init__()
        self.annotator_name = annotator_name
        self.key = key
        self.labels = labels

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        assert request_state.annotations
        annotation_label = request_state.annotations[self.annotator_name][self.key]
        if annotation_label not in self.labels:
            raise ValueError(
                f"Unrecognized annotation label '{annotation_label}' "
                f"(known labels: {self.labels}) "
                f"in annotation {request_state.annotations[self.annotator_name]} "
                f"for instance id {request_state.instance.id}"
            )
        stats: List[Stat] = []
        for label in self.labels:
            stats.append(
                Stat(MetricName(f"annotation_{self.annotator_name}_{self.key}_{label}")).add(
                    1 if label == annotation_label else 0
                )
            )
        return stats
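
A quick sketch of what this metric produces; the annotation dict in the comment is hypothetical, and the stat names follow the f-string above.

from helm.benchmark.metrics.annotation_metrics import AnnotationLabelMetric

# Configured as in the FinanceBench run spec below.
metric = AnnotationLabelMetric(
    annotator_name="financebench",
    key="label",
    labels=["correct_answer", "incorrect_answer", "failure_to_answer"],
)
# For a request state annotated with {"financebench": {"label": "correct_answer"}},
# evaluate_generation would return three binary stats:
#   annotation_financebench_label_correct_answer    -> 1
#   annotation_financebench_label_incorrect_answer  -> 0
#   annotation_financebench_label_failure_to_answer -> 0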
47 changes: 47 additions & 0 deletions src/helm/benchmark/run_specs/finance_run_specs.py
@@ -2,9 +2,11 @@
Website: https://crfm.stanford.edu/helm/finance/"""

from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
from helm.benchmark.adaptation.common_adapter_specs import (
    get_generation_adapter_spec,
)
from helm.benchmark.annotation.annotator import AnnotatorSpec
from helm.benchmark.metrics.common_metric_specs import (
    get_basic_metric_specs,
)
@@ -31,3 +33,48 @@ def get_fin_qa_spec() -> RunSpec:
        metric_specs=metric_specs,
        groups=["fin_qa"],
    )


@run_spec_function("financebench")
def get_financebench_spec() -> RunSpec:
    instructions = (
        "Answer only the last question using the given evidence. "
        "Respond with only a single paragraph, sentence or sentence fragment.\n"
    )
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.financebench_scenario.FinanceBenchScenario", args={}
    )
    adapter_spec = AdapterSpec(
        method=ADAPT_GENERATION,
        instructions=instructions,
        input_prefix="\n",
        input_suffix="\n",
        output_prefix="\nAnswer: ",
        output_suffix="\n",
        instance_prefix="\n###\n",
        num_outputs=1,
        max_tokens=300,
        temperature=0.0,
        stop_sequences=["###"],
    )
    annotator_specs = [
        AnnotatorSpec(class_name="helm.benchmark.annotation.financebench_annotator.FinanceBenchAnnotator")
    ]
    metric_specs = get_basic_metric_specs([]) + [
        MetricSpec(
            class_name="helm.benchmark.metrics.annotation_metrics.AnnotationLabelMetric",
            args={
                "annotator_name": "financebench",
                "key": "label",
                "labels": ["correct_answer", "incorrect_answer", "failure_to_answer"],
            },
        )
    ]
    return RunSpec(
        name="financebench",
        scenario_spec=scenario_spec,
        annotators=annotator_specs,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=["financebench"],
    )
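
A minimal sketch of exercising the new run spec, assuming the decorated function can be imported and called directly; the values checked come from the spec above.

from helm.benchmark.run_specs.finance_run_specs import get_financebench_spec

run_spec = get_financebench_spec()
assert run_spec.name == "financebench"
assert run_spec.adapter_spec.max_tokens == 300
assert run_spec.groups == ["financebench"]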
53 changes: 53 additions & 0 deletions src/helm/benchmark/scenarios/financebench_scenario.py
@@ -0,0 +1,53 @@
import dataclasses
import json
import os
import random
from typing import List

from helm.benchmark.scenarios.scenario import (
    CORRECT_TAG,
    TRAIN_SPLIT,
    Scenario,
    Instance,
    Reference,
    TEST_SPLIT,
    Input,
    Output,
)
from helm.common.general import ensure_directory_exists, ensure_file_downloaded


class FinanceBenchScenario(Scenario):
    """FinanceBench: open book question answering over financial documents of publicly traded companies."""

    name = "financebench"
    description = "FinanceBench"
    tags = ["finance"]

    def get_instances(self, output_path: str) -> List[Instance]:
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)
        target_path = os.path.join(cache_dir, "financebench_open_source.jsonl")
        url: str = (
            "https://raw.githubusercontent.com/patronus-ai/financebench/d7beebe5e739e0b806ab4443c1b3e23f51804acf/data/financebench_open_source.jsonl"  # noqa: E501
        )
        ensure_file_downloaded(source_url=url, target_path=target_path)

        instances: List[Instance] = []
        with open(target_path) as f:
            for line in f:
                row = json.loads(line)
                instance_id = row["financebench_id"]
                question = row["question"]
                answer = row["answer"]
                evidence = row["evidence"][0]["evidence_text_full_page"]
                input_text = f"Evidence: {evidence}\nQuestion: {question}"
                input = Input(text=input_text)
                references = [Reference(output=Output(text=answer), tags=[CORRECT_TAG])]
                instance = Instance(id=instance_id, input=input, references=references, split=TEST_SPLIT)
                instances.append(instance)
        random.seed(0)
        train_indexes = random.sample(list(range(len(instances))), k=10)
        for train_index in train_indexes:
            instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
        return instances
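
For reference, a sketch of the minimal JSONL row shape that get_instances relies on; the question and answer text come from the test below, while the id and the truncated evidence are illustrative only.

import json

# Only the fields read by get_instances are shown; real rows contain additional keys.
row = json.loads(
    '{"financebench_id": "example_id", '
    '"question": "What is the FY2018 capital expenditure amount (in USD millions) for 3M?", '
    '"answer": "$1577.00", '
    '"evidence": [{"evidence_text_full_page": "Table of Contents ..."}]}'
)
input_text = f"Evidence: {row['evidence'][0]['evidence_text_full_page']}\nQuestion: {row['question']}"
print(input_text)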
26 changes: 26 additions & 0 deletions src/helm/benchmark/scenarios/test_financebench_scenario.py
@@ -0,0 +1,26 @@
import pytest
from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.financebench_scenario import FinanceBenchScenario
from helm.benchmark.scenarios.scenario import CORRECT_TAG, TEST_SPLIT, TRAIN_SPLIT


@pytest.mark.scenarios
def test_financebench_scenario_get_instances():
    scenario = FinanceBenchScenario()
    with TemporaryDirectory() as tmpdir:
        instances = scenario.get_instances(tmpdir)
        assert len(instances) == 150
        assert len([instance for instance in instances if instance.split == TRAIN_SPLIT]) == 10
        assert (
            "Evidence: Table of Contents \n3M Company and Subsidiaries\nConsolidated Statement of Cash Flow s\n"  # noqa: E501
            in instances[0].input.text
        )
        assert (
            "Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the question by relying on the details shown in the cash flow statement."  # noqa: E501
            in instances[0].input.text
        )
        assert len(instances[0].references) == 1
        assert instances[0].references[0].output.text == "$1577.00"
        assert instances[0].references[0].tags == [CORRECT_TAG]
        assert instances[0].split == TEST_SPLIT
22 changes: 22 additions & 0 deletions src/helm/benchmark/static/schema_finance.yaml
@@ -73,6 +73,10 @@ metrics:
    display_name: Execution Accuracy
    description: Accuracy of the final result of the generated program
    lower_is_better: false
  - name: annotation_financebench_label_correct_answer
    display_name: Correct Answer
    description: Whether the final result was correct, as judged by GPT-4o
    lower_is_better: false

############################################################
perturbations: []
@@ -114,6 +118,7 @@ run_groups:
    category: All scenarios
    subgroups:
      - fin_qa
      - financebench

  - name: fin_qa
    display_name: FinQA
@@ -132,6 +137,23 @@
      when: 1999 to 2019
      language: English

  - name: financebench
    display_name: FinanceBench
    description: FinanceBench is a benchmark for open book financial question answering. It comprises 10,231 questions about publicly traded companies, with corresponding answers and evidence strings.
    metric_groups:
      - accuracy
      - efficiency
      - general_information
    environment:
      main_name: annotation_financebench_label_correct_answer
      main_split: test
    taxonomy:
      task: question answering with numeric reasoning
      what: financial reports
      who: financial experts
      when: 2015 to 2023
      language: English

  - name: financial_scenarios_ablations
    display_name: Financial Scenarios Ablations
    description: Scenarios for the financial domain with ablations
