addressing comments

stanford-crfm · Dec 10, 2024 · 7e4db67
1 parent dbdb4c0
commit 7e4db67
Show file tree

Hide file tree

Showing 4 changed files with 51 additions and 52 deletions.
diff --git a/src/helm/benchmark/annotation/bigcodebench_annotator.py b/src/helm/benchmark/annotation/bigcodebench_annotator.py
@@ -1,17 +1,21 @@
-from typing import Any, List
+
+import ast
+import traceback
+import time
+import json
 
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
 from helm.common.request import Request
+from helm.common.hierarchical_logger import hlog
+
+from typing import Any, List
 from gradio_client import Client, handle_file
 from tempfile import TemporaryDirectory
+from tenacity import retry, stop_after_attempt, wait_fixed
 
-from helm.common.hierarchical_logger import hlog
 
-import ast
-import traceback
-import time
-import json
+OUTPUT_FILENAME = "tmp_result.jsonl"
 
 
 def syntax_check(code, verbose=False):
@@ -51,19 +55,32 @@ def __init__(self):
         self.split = "instruct"
         self.subset = "full"
         self.pass_k = "1"  # Original: "1,5,10"
-        self.is_macro = True
+        self.use_global_metric = True
 
     def annotate(self, request_state: RequestState) -> Any:
         pass
 
+    @retry(stop=stop_after_attempt(3), wait=wait_fixed(4))
+    def predict_with_retry(self, filename):
+        """Send the samples file to the remote execution API; return (results, pass@1). Retried 3x, 4s apart."""
+        client = Client(self.remote_execute_api)
+        results, pass_at_k = client.predict(
+            split=self.split,
+            subset=self.subset,
+            samples=handle_file(filename),
+            pass_k=self.pass_k,
+            api_name="/predict",
+        )
+        # pass_at_k["pass@1"] is a single score; unpacking it into two names raised TypeError on every call.
+        return results, pass_at_k["pass@1"]
+
     def annotate_all(self, request_states: List[RequestState]) -> Any:
         assert all(request_state.result for request_state in request_states)
         assert all(len(request_state.result.completions) == 1 for request_state in request_states)
         assert all(request_state.instance.extra_data for request_state in request_states)
 
         with TemporaryDirectory() as tmpdir:
-            # with open(f"{tmpdir}/result.jsonl", "w") as file:
-            with open(f"tmp_result.jsonl", "w") as file:
+            with open(OUTPUT_FILENAME, "w") as file:
                 res = []
                 for i in range(1140):
                     init_line = f'{{"task_id": "BigCodeBench/{i}", "solution": ""}}\n'
@@ -73,37 +90,19 @@ def annotate_all(self, request_states: List[RequestState]) -> Any:
                     model_output_text = request_state.result.completions[0].text
                     solution = code_extract(model_output_text)
                     escaped_solution = json.dumps(solution)[1:-1]
-                    idx = int(request_state.instance.extra_data["task_id"].split("/")[-1])
-                    res[idx] = (
-                        f'{{"task_id": "{request_state.instance.extra_data["task_id"]}", "solution": "{escaped_solution}"}}\n'
-                    )
+                    idx = int(request_state.instance.id.split("/")[-1])
+                    # json.dumps escapes the value itself; pass the raw solution to avoid double escaping.
+                    record = {"task_id": request_state.instance.id, "solution": solution}
+                    res[idx] = json.dumps(record) + "\n"
                 for line in res:
                     file.write(line)
 
-            pass_at_one: float
-            max_retries = 3
-            retry_count = 0
-            success = False  # Flag to indicate if the operation was successful
-            while retry_count < max_retries:
-                try:
-                    client = Client(self.remote_execute_api)
-                    results, pass_at_k = client.predict(
-                        split=self.split,
-                        subset=self.subset,
-                        # samples=handle_file(f"{tmpdir}/result.jsonl"),
-                        samples=handle_file(f"tmp_result.jsonl"),
-                        pass_k=self.pass_k,
-                        api_name="/predict",
-                    )
-                    success = True  # Operation succeeded
-                    pass_at_one = pass_at_k["pass@1"]
-                    break
-                except Exception as e:
-                    retry_count += 1
-                    hlog(f"Attempt {retry_count} failed. Error Message: {e}. Retrying in 4s...")
-                    time.sleep(4)
-            if not success:
-                hlog("Failed to complete the operation after 3 attempts.")
-                pass_at_one = 0.0
-
-        return {"pass_at_one": pass_at_one}
+        try:
+            results, _ = self.predict_with_retry(OUTPUT_FILENAME)
+        except Exception as e:
+            hlog(f"Failed to complete the operation after 3 attempts. Error Message: {e}")
+            # No remote results: mark every instance as failed instead of indexing an empty list.
+            return [{"pass_at_one": False} for _ in request_states]
+
+        eval_results = results["eval"]
+        return [{"pass_at_one": eval_results[state.instance.id][0]["status"] == "pass"} for state in request_states]
diff --git a/src/helm/benchmark/annotation_executor.py b/src/helm/benchmark/annotation_executor.py
@@ -92,14 +92,14 @@ def execute(self, scenario_state: ScenarioState) -> ScenarioState:
             hlog("No annotators to run.")
             return scenario_state
 
-        if all(getattr(self.factory.get_annotator(spec), "is_macro", False) for spec in scenario_state.annotator_specs):
+        if all(getattr(self.factory.get_annotator(spec), "use_global_metric", False) for spec in scenario_state.annotator_specs):
             # Do it!
             request_states = self.process_all(
                 scenario_state.annotator_specs, scenario_state.request_states  # processing all request together
             )
 
         else:
-            hlog("!!!!Annotators are not all is_macro!.")
+            hlog("!!!!Annotators are not all use_global_metric!.")
 
             # Do it!
             def do_it(request_state: RequestState) -> RequestState:
@@ -141,4 +141,4 @@ def process_all(self, annotator_specs: List[AnnotatorSpec], states: List[Request
                 annotations[annotator.name] = new_annotations
         except Exception as e:
             raise AnnotationExecutorError(f"{str(e)} Request: {states.request}") from e
-        return [replace(state, annotations=annotations) for state in states]
+        return [replace(state, annotations={name: per_state[idx] for name, per_state in annotations.items()}) for idx, state in enumerate(states)]
diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py
@@ -465,18 +465,18 @@ def get_wildbench_spec(subset: str) -> RunSpec:
 
 
 @run_spec_function("bigcodebench")
-def get_bigcodebench_spec(subset: str) -> RunSpec:
+def get_bigcodebench_spec(version: str) -> RunSpec:
 
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bigcodebench_scenario.BigCodeBenchScenario", args={"subset": subset}
+        class_name="helm.benchmark.scenarios.bigcodebench_scenario.BigCodeBenchScenario", args={"version": version}
     )
 
     # Adapted from https://github.dev/bigcode-project/bigcodebench/blob/main/bigcodebench/evaluate.py
     adapter_spec = AdapterSpec(
         method=ADAPT_GENERATION,
         input_prefix="",
         output_prefix="",
-        max_tokens=1000,
+        max_tokens=1280,
         num_outputs=1,
         temperature=0.0,
         global_prefix="Please provide a self-contained Python script that solves the following problem in a markdown code block:",

diff --git a/src/helm/benchmark/scenarios/bigcodebench_scenario.py b/src/helm/benchmark/scenarios/bigcodebench_scenario.py
@@ -10,7 +10,7 @@
 from helm.common.general import ensure_directory_exists
 
 
-SUBSETS = ["v0.1.2"]
+VERSIONS = ["v0.1.2"]
 
 
 class BigCodeBenchScenario(Scenario):
@@ -25,20 +25,19 @@ class BigCodeBenchScenario(Scenario):
     description = "Benchmarking Code Generation with Diverse Function Calls and Complex Instructions"
     tags = ["coding"]
 
-    def __init__(self, subset: str):
+    def __init__(self, version: str):
         super().__init__()
-        assert subset in SUBSETS, "Unknown subset: {}".format(subset)
-        self.subset = subset
+        assert version in VERSIONS, "Unknown version: {}".format(version)
+        self.version = version
 
     def get_instances(self, output_path: str) -> List[Instance]:
         # Get BigCodeBench from HuggingFace
         cache_dir = os.path.join(output_path, "data")
         ensure_directory_exists(cache_dir)
         dataset = datasets.load_dataset(
             "bigcode/bigcodebench",
-            trust_remote_code=True,
             cache_dir=cache_dir,
-            split="v0.1.2",
+            split=self.version,
         )
         assert isinstance(dataset, datasets.Dataset)
 
@@ -51,6 +50,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
                 input=input,
                 references=[],
                 split=TEST_SPLIT,
+                id=row['task_id'],
                 extra_data={"task_id": row["task_id"]},
             )
             instances.append(instance)