adding wildbench
liamjxu committed Nov 12, 2024
1 parent f9c4498 commit 46f7a7c
Showing 8 changed files with 386 additions and 0 deletions.
@@ -0,0 +1,75 @@
# Instruction

You are an expert evaluator. Your task is to evaluate the quality of the responses generated by two AI models.
We will provide you with the user query and a pair of AI-generated responses (Response A and Response B).
You should first read the user query and the conversation history carefully to analyze the task, and then evaluate the quality of the responses based on the checklist and rules provided below.

# Conversation between User and AI

## History
<|begin_of_history|>

{$history}

<|end_of_history|>

## Current User Query
<|begin_of_query|>

{$user_query}

<|end_of_query|>

## Response A
<|begin_of_response_A|>

{$candidate_A}

<|end_of_response_A|>

## Response B
<|begin_of_response_B|>

{$candidate_B}

<|end_of_response_B|>

# Evaluation

## Checklist

<|begin_of_checklist|>

{$checklist}

<|end_of_checklist|>

Please use this checklist to guide your evaluation, but do not limit your assessment to the checklist.

## Rules

You should compare the above two responses based on your analysis of the user queries and the conversation history.
You should first write down your analysis and the checklist that you used for the evaluation, and then provide your assessment according to the checklist.
There are five choices to give your final assessment: ["A++", "A+", "A=B", "B+", "B++"], which correspond to the following meanings:

- `A++`: Response A is much better than Response B.
- `A+`: Response A is only slightly better than Response B.
- `A=B`: Response A and B are of the same quality. Please use this choice sparingly.
- `B+`: Response B is only slightly better than Response A.
- `B++`: Response B is much better than Response A.


## Output Format
First, please output your analysis for each model response, then summarize your assessment into three aspects: "reason A=B", "reason A>B", and "reason B>A", and finally make your choice for the final assessment.

Please provide your evaluation results in the following json format by filling in the placeholders in []:
```
{
    "analysis of A": "[analysis of Response A]",
    "analysis of B": "[analysis of Response B]",
    "reason of A=B": "[where Response A and B perform equally well]",
    "reason of A>B": "[where Response A is better than Response B]",
    "reason of B>A": "[where Response B is better than Response A]",
    "choice": "[A++ or A+ or A=B or B+ or B++]"
}
```
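
Note: the placeholders in this pairwise template ({$history}, {$user_query}, {$candidate_A}, {$candidate_B}, {$checklist}) are substituted before the prompt is sent to a judge model. This commit only wires up the single-response score template (see wildbench_annotator.py below), so the following is an illustrative sketch; the short inline template string and all field values are hypothetical stand-ins, not code from the commit.

```
# Minimal sketch of filling the pairwise template, mirroring how
# wildbench_annotator.py fills the score template below.
# The inline template and all values are hypothetical stand-ins.
pairwise_template = (
    "## History\n{$history}\n\n"
    "## Current User Query\n{$user_query}\n\n"
    "## Response A\n{$candidate_A}\n\n"
    "## Response B\n{$candidate_B}\n\n"
    "## Checklist\n{$checklist}\n"
)

annotator_prompt = (
    pairwise_template.replace("{$history}", "User: What does HTTP 429 mean?")
    .replace("{$user_query}", "How should my client handle 429 responses?")
    .replace("{$candidate_A}", "Retry immediately in a tight loop.")
    .replace("{$candidate_B}", "Back off and honor the Retry-After header before retrying.")
    .replace("{$checklist}", "- Does the response mention backoff or the Retry-After header?")
)
print(annotator_prompt)
```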
66 changes: 66 additions & 0 deletions src/helm/benchmark/annotation/wildbench/eval_template.score.v2.md
@@ -0,0 +1,66 @@
# Instruction

You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.
We will provide you with the user query and an AI-generated response.
You should first read the user query and the conversation history carefully to analyze the task, and then evaluate the quality of the response based on the checklist and rules provided below.

# Conversation between User and AI

## History
<|begin_of_history|>

{$history}

<|end_of_history|>

## Current User Query
<|begin_of_query|>

{$user_query}

<|end_of_query|>

## AI Response
<|begin_of_response|>

{$model_output}

<|end_of_response|>


# Evaluation

## Checklist

<|begin_of_checklist|>

{$checklist}

<|end_of_checklist|>

Please use this checklist to guide your evaluation, but do not limit your assessment to the checklist.

## Rules

You should evaluate the above response based on your analysis of the user query and the conversation history.
You should first write down your analysis and the checklist that you used for the evaluation, and then provide your assessment according to the checklist.
The scores are in the range of 1~10, where 1 means the response is very poor and 10 means the response is perfect.
Here are more detailed criteria for the scores:

- Score 1~2: The response is very poor and does not make sense at all.
- Score 3~4: The response is poor and does not help the user solve the problem in a meaningful way.
- Score 5~6: The response is fair but has some issues (e.g., factual errors, hallucinations, missing key information).
- Score 7~8: The response is good enough but could be improved in some ways.
- Score 9~10: The response is perfect and provides helpful information that solves the user's problem.

## Output Format
First, please output your analysis for the model response, then summarize your assessment into two aspects: "strengths" and "weaknesses". Finally, write down your rating for the assessment.

Please provide your evaluation results in the following json format by filling in the placeholders in []:
```
{
    "strengths": "[analysis for the strengths of the response]",
    "weaknesses": "[analysis for the weaknesses of the response]",
    "score": "[1~10]"
}
```
62 changes: 62 additions & 0 deletions src/helm/benchmark/annotation/wildbench_annotator.py
@@ -0,0 +1,62 @@
import re
from typing import Any

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.annotator import Annotator
from helm.clients.auto_client import AutoClient
from helm.common.request import Request


class WildBenchAnnotator(Annotator):
    """The WildBench autograder."""

    name = "wildbench"

    def __init__(self, auto_client: AutoClient):
        self._auto_client = auto_client
        with open("src/helm/benchmark/annotation/wildbench/eval_template.score.v2.md") as f:
            self._score_template = f.read()
        # Extracts the "strengths", "weaknesses", and "score" fields from the judge's JSON-formatted reply.
        self._pattern = re.compile(
            r'"strengths"\s*:\s*"(.*?)"\s*,\s*"weaknesses"\s*:\s*"(.*?)"\s*,\s*"score"\s*:\s*"(.*?)"', re.DOTALL
        )

    def annotate(self, request_state: RequestState) -> Any:
        assert request_state.result
        assert len(request_state.result.completions) == 1
        assert request_state.instance.extra_data
        model_output_text = request_state.result.completions[0].text
        if not model_output_text.strip():
            # Empty or blocked model output: skip the judge call and return a zero score.
            return {
                "prompt_text": "",
                "strengths": "",
                "weaknesses": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE",
                "score": 0.0,
            }
        prompt_template = self._score_template

        # Fill the score template with the conversation history, current query, model output, and checklist.
        annotator_prompt = (
            prompt_template.replace("{$history}", request_state.instance.extra_data["history"])
            .replace("{$user_query}", request_state.instance.extra_data["user_query"])
            .replace("{$model_output}", model_output_text)
            .replace("{$checklist}", request_state.instance.extra_data["checklist"])
        )
        annotator_request = Request(
            model="openai/gpt-4o-2024-05-13",
            model_deployment="openai/gpt-4o-2024-05-13",
            prompt=annotator_prompt,
            temperature=0.0,
            max_tokens=1000,
        )
        annotator_response = self._auto_client.make_request(annotator_request)
        if not annotator_response.success:
            raise Exception(f"Annotation request failed: {annotator_response.error}")
        assert len(annotator_response.completions) == 1
        annotator_response_text = annotator_response.completions[0].text
        annotator_response_parts = self._pattern.search(annotator_response_text)
        if not annotator_response_parts:
            raise ValueError(f"Malformed annotator response: {annotator_response_text}")

        strengths = annotator_response_parts[1].strip()
        weaknesses = annotator_response_parts[2].strip()
        score_text = annotator_response_parts[3].strip()
        try:
            score = float(score_text)
        except ValueError:
            raise ValueError(f"Malformed score '{score_text}' in annotator response: {annotator_response_text}")

        return {"prompt_text": annotator_prompt, "strengths": strengths, "weaknesses": weaknesses, "score": score}
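
A quick illustration of the parsing step above: the regex in WildBenchAnnotator pulls the three fields out of the judge's JSON-formatted reply without a full JSON parse, which tolerates judges that wrap the JSON in extra text. Only the regex is taken from the code above; the sample reply is hypothetical.

```
import re

pattern = re.compile(
    r'"strengths"\s*:\s*"(.*?)"\s*,\s*"weaknesses"\s*:\s*"(.*?)"\s*,\s*"score"\s*:\s*"(.*?)"', re.DOTALL
)

# Hypothetical judge reply, following the score template's output format.
sample_reply = """
Here is my evaluation:
{
    "strengths": "Clear, step-by-step explanation of the error code.",
    "weaknesses": "Does not mention the Retry-After header the user asked about.",
    "score": "7"
}
"""

match = pattern.search(sample_reply)
assert match is not None
strengths, weaknesses, score = match[1].strip(), match[2].strip(), float(match[3].strip())
print(score)  # 7.0
```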
25 changes: 25 additions & 0 deletions src/helm/benchmark/metrics/wildbench_metrics.py
@@ -0,0 +1,25 @@
from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


class WildBenchScoreMetric(Metric):
    """Score metrics for WildBench."""

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        assert request_state.annotations
        score = request_state.annotations["wildbench"]["score"]
        return [
            Stat(MetricName("wildbench_score")).add(score),
        ]
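
Each instance contributes the annotator's score to a single "wildbench_score" statistic, which HELM then aggregates across instances. A rough sketch of that aggregation, assuming Stat's usual add/mean interface; the scores below are hypothetical.

```
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.statistic import Stat

stat = Stat(MetricName("wildbench_score"))
for score in [7.0, 9.0, 5.0]:  # hypothetical per-instance annotator scores
    stat.add(score)
print(stat.mean)  # 7.0
```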
24 changes: 24 additions & 0 deletions src/helm/benchmark/run_specs/lite_run_specs.py
@@ -26,6 +26,7 @@
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.runner import get_benchmark_output_path
from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path
from helm.benchmark.annotation.annotator import AnnotatorSpec


@run_spec_function("narrative_qa")
@@ -437,3 +438,26 @@ def get_ifeval_spec() -> RunSpec:
        metric_specs=metric_specs,
        groups=["ifeval"],
    )


@run_spec_function("wildbench")
def get_wildbench_spec(subset: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.wildbench_scenario.WildBenchScenario", args={"subset": subset}
    )

    adapter_spec = AdapterSpec(
        method=ADAPT_GENERATION, input_prefix="", output_prefix="", max_tokens=1000, num_outputs=1, temperature=0.0
    )
    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.wildbench_annotator.WildBenchAnnotator")]
    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.wildbench_metrics.WildBenchScoreMetric")]

    return RunSpec(
        name="wildbench",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        annotators=annotator_specs,
        metric_specs=metric_specs,
        groups=["wildbench"],
    )
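
As a sanity check, the run spec can be constructed directly and inspected. This sketch assumes get_wildbench_spec is importable from lite_run_specs and that the @run_spec_function decorator returns the wrapped function unchanged.

```
from helm.benchmark.run_specs.lite_run_specs import get_wildbench_spec

run_spec = get_wildbench_spec(subset="v2")
print(run_spec.name)                      # wildbench
print(run_spec.scenario_spec.class_name)  # helm.benchmark.scenarios.wildbench_scenario.WildBenchScenario
print([annotator_spec.class_name for annotator_spec in run_spec.annotators])
```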
23 changes: 23 additions & 0 deletions src/helm/benchmark/scenarios/test_wildbench_scenario.py
@@ -0,0 +1,23 @@
import pytest
from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.wildbench_scenario import WildBenchScenario
from helm.benchmark.scenarios.scenario import Input, TEST_SPLIT


@pytest.mark.scenarios
def test_wildbench_scenario_get_instances():
    wildbench_scenario = WildBenchScenario(subset="v2")
    with TemporaryDirectory() as tmpdir:
        instances = wildbench_scenario.get_instances(tmpdir)
        assert len(instances) == 1024
        # assert instances[0].input.text.startswith("add 10 more balanced governments[aoc2]\n{\n\tGovernment")
        # assert len(instances[0].input.text) == 17619
        # assert instances[0].split == TEST_SPLIT
        # assert instances[0].extra_data

        # assert instances[0].extra_data["gpt-4-turbo-2024-04-09"].startswith("Here are 10 addition")
        # assert len(instances[0].extra_data["gpt-4-turbo-2024-04-09"]) == 10574
        # assert instances[0].extra_data["claude-3-haiku-20240307"].startswith("Here are 10 more bal")
        # assert len(instances[0].extra_data["claude-3-haiku-20240307"]) == 7873
        # assert instances[0].extra_data["Llama-2-70b-chat-hf"] == ""
88 changes: 88 additions & 0 deletions src/helm/benchmark/scenarios/wildbench_scenario.py
@@ -0,0 +1,88 @@
import datasets
import os
from typing import List
from helm.benchmark.scenarios.scenario import (
    Scenario,
    Instance,
    TEST_SPLIT,
    Input,
)
from helm.common.general import ensure_directory_exists


SUBSETS = ["v2"]
REFERENCE_MODELS = ["gpt-4-turbo-2024-04-09", "claude-3-haiku-20240307", "Llama-2-70b-chat-hf"]


class WildBenchScenario(Scenario):
    """WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild

    WildBench is a benchmark for evaluating large language models (LLMs) on challenging tasks
    that are more representative of real-world applications. The examples are collected from
    real users by the AI2 WildChat project."""

    name = "wildbench"
    description = "Benchmarking LLMs with Challenging Tasks from Real Users in the Wild"
    tags = ["instruction following"]

    def __init__(self, subset: str):
        super().__init__()
        assert subset in SUBSETS, "Unknown subset: {}".format(subset)
        self.subset = subset

    def get_instances(self, output_path: str) -> List[Instance]:
        # Get WildBench from HuggingFace
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)
        dataset = datasets.load_dataset(
            "allenai/WildBench",
            self.subset,
            trust_remote_code=True,
            cache_dir=cache_dir,
            split="test",
        )
        assert isinstance(dataset, datasets.Dataset)
        baseline_outputs = {
            f"{model}": datasets.load_dataset(
                "allenai/WildBench-V2-Model-Outputs",
                model,
                trust_remote_code=True,
                cache_dir=cache_dir,
                split="train",
            )
            for model in REFERENCE_MODELS
        }
        assert all(isinstance(baseline_output, datasets.Dataset) for baseline_output in baseline_outputs.values())

        # Read all instances
        instances: List[Instance] = []
        for idx, row in enumerate(dataset):
            input_text = []
            history_text = []
            # All turns except the last form the conversation history; they are also part of the model input.
            for round in row["conversation_input"][:-1]:
                noun = "User: " if round["role"] == "user" else "Assistant: "
                history_text.append(noun + round["content"])
                input_text.append(noun + round["content"])

            # The final turn is the current user query.
            round = row["conversation_input"][-1]
            noun = "User: "
            input_text.append(noun + round["content"])
            user_query = round["content"]

            input = Input(text="\n".join(input_text))
            instance = Instance(
                input=input,
                references=[],
                split=TEST_SPLIT,
                extra_data={
                    "baseline_outputs": {
                        model: baseline_outputs[model][idx]["output"][0] for model in REFERENCE_MODELS
                    },
                    "history": "\n".join(history_text),
                    "user_query": user_query,
                    "checklist": "\n".join(row["checklist"]),
                },
            )
            instances.append(instance)

        return instances
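
To make the conversation handling above concrete, here is a toy walkthrough (not part of the commit) of how one WildBench row is split into the model-facing input, the {$history} block, and the {$user_query} used by the annotator. The row contents are invented.

```
row = {
    "conversation_input": [
        {"role": "user", "content": "What does HTTP 429 mean?"},
        {"role": "assistant", "content": "It means the client sent too many requests."},
        {"role": "user", "content": "How should my client handle it?"},
    ]
}

history_text = []
input_text = []
for turn in row["conversation_input"][:-1]:
    noun = "User: " if turn["role"] == "user" else "Assistant: "
    history_text.append(noun + turn["content"])
    input_text.append(noun + turn["content"])

last_turn = row["conversation_input"][-1]
input_text.append("User: " + last_turn["content"])

print("\n".join(input_text))    # full prompt shown to the evaluated model
print("\n".join(history_text))  # fills {$history} in the score template
print(last_turn["content"])     # fills {$user_query}
```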