Remove global OpenAI client initialization #1539

Open · wants to merge 1 commit into base: main
4 changes: 2 additions & 2 deletions evals/completion_fns/retrieval.py
@@ -14,7 +14,6 @@
from evals.record import record_sampling
from evals.registry import Registry

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def load_embeddings(embeddings_and_text_path: str):
@@ -76,6 +75,7 @@ def __init__(
registry_path: The path to a registry file to add to default registry.
_kwargs: Additional arguments to pass to the completion function instantiation.
"""
self.client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
registry = Registry() if not registry else registry
if registry_path:
registry.add_registry_paths(registry_path)
@@ -96,7 +96,7 @@ def __call__(self, prompt: Union[str, list[dict]], **kwargs: Any) -> RetrievalCo
"""
# Embed the prompt
embedded_prompt = (
client.embeddings.create(
self.client.embeddings.create(
model=self.embedding_model, input=CompletionPrompt(prompt).to_formatted_prompt()
)
.data[0]
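The change here moves OpenAI client construction from module import time into the object that uses it. A minimal sketch of that pattern (illustrative class and method names, not the exact contents of retrieval.py):

```python
import os

from openai import OpenAI


class EmbeddingCompletionFn:
    """Illustrative sketch: the client is built per instance, not at import time."""

    def __init__(self, embedding_model: str = "text-embedding-ada-002"):
        # Importing the module no longer requires OPENAI_API_KEY to be set; it is
        # only needed once the completion function is actually instantiated.
        self.client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
        self.embedding_model = embedding_model

    def embed(self, text: str) -> list[float]:
        return (
            self.client.embeddings.create(model=self.embedding_model, input=text)
            .data[0]
            .embedding
        )
```
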
1 change: 0 additions & 1 deletion evals/elsuite/hr_ml_agent_bench/utils.py
@@ -13,7 +13,6 @@
from evals.solvers.solver import Solver
from evals.task_state import TaskState

client = OpenAI()
logger = logging.getLogger(__name__)


5 changes: 1 addition & 4 deletions evals/elsuite/make_me_pay/utils.py
@@ -4,9 +4,6 @@

from openai import OpenAI

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def is_system_msg(m: dict) -> bool:
assert isinstance(m, dict), "Message must be a dict."
assert "role" in m, "Message must have a role."
@@ -72,4 +69,4 @@ def model_output_empty_tags(message: str) -> bool:


def openai_chatcompletion_create(*args, **kwargs):
return client.chat.completions.create(*args, **kwargs)
return OpenAI(api_key=os.environ.get("OPENAI_API_KEY")).chat.completions.create(*args, **kwargs)
Collaborator: Instead of creating the client fresh on every call of this function, can we pass it in from the caller? Ideally it would be instantiated in an __init__ in a class, like the changes you made in the other files.
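A minimal sketch of that suggestion, assuming a hypothetical wrapper class (the name and wiring are illustrative, not part of this PR):

```python
import os
from typing import Optional

from openai import OpenAI


class MakeMePayCompletions:
    """Hypothetical helper: the client is created once and reused for every call."""

    def __init__(self, client: Optional[OpenAI] = None):
        # Accept an injected client (e.g. a stub in tests); otherwise build one here.
        self.client = client or OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    def chatcompletion_create(self, *args, **kwargs):
        return self.client.chat.completions.create(*args, **kwargs)
```

Callers would then hold one instance for the lifetime of the eval rather than constructing a client on every request.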

4 changes: 2 additions & 2 deletions evals/elsuite/self_prompting/eval.py
@@ -11,7 +11,7 @@
from evals.api import CompletionFn
from evals.elsuite.self_prompting.task_description import sample_in_token, task_description_template
from evals.eval import SolverEval
from evals.registry import registry
from evals.registry import Registry
from evals.solvers.solver import Solver
from evals.task_state import TaskState
from evals.utils.log_utils import extract_final_results, extract_spec
@@ -54,7 +54,7 @@ def __init__(

self.tasker_completion_fns = {}
for tasker_model in self.tasker_models:
self.tasker_completion_fns[tasker_model] = registry.make_completion_fn(tasker_model)
self.tasker_completion_fns[tasker_model] = Registry().make_completion_fn(tasker_model)

def eval_sample(self, solver: Solver, sample: Any, rng: random.Random):
if sample["stage"] == "prompting":
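For context, the change above replaces the imported module-level registry singleton with a locally constructed Registry. A sketch of that pattern (the helper function below is illustrative, not part of the PR):

```python
from evals.registry import Registry


def build_tasker_completion_fns(tasker_models: list[str]) -> dict:
    # Construct the registry once and reuse it for every model, instead of
    # relying on a module-level singleton (or rebuilding it inside the loop).
    registry = Registry()
    return {model: registry.make_completion_fn(model) for model in tasker_models}
```

As written, the diff calls Registry() on each loop iteration; one could hoist the construction out, as sketched here, to keep the behaviour while avoiding repeated instantiation.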
@@ -4,15 +4,15 @@
from eval_list import eval_list

import evals.data
from evals.registry import registry
from evals.registry import Registry

np.random.seed(42)
min_samples_per_dataset = 50
n_test_samples = 10

seen = set()
datarows = []
for eval in registry.get_evals("*"):
for eval in Registry().get_evals("*"):
if eval.key not in eval_list or eval.key in seen:
continue
seen.add(eval.key)
214 changes: 109 additions & 105 deletions evals/elsuite/skill_acquisition/test_skill_acquisition.py
@@ -11,108 +11,112 @@
)
from evals.registry import Registry

registry = Registry()

dummy_eval_spec = {
"eval_registry_path": Path("evals/registry"),
"completion_fns": [registry.make_completion_fn("gpt-4")],
"samples_jsonl": "skill_acquisition/miskito/miskito_test.jsonl",
"target_language": "miskito",
"n_samples": 5,
"knowledge_base_directory": "skill_acquisition/miskito/knowledge_base/",
"max_replies": 50,
}


def test_answer_detected():
assert answer_detected("[ANSWER foo]") is True
assert answer_detected("[ANSWER: foo]") is True
assert answer_detected("ANSWER foo") is False
assert answer_detected("[ANSWER foo") is False
assert answer_detected("ANSWER foo]") is False
assert answer_detected("[ANSWER foo][ANSWER bar]") is True


def test_view_instruction_detected():
SkillAcquisition(**dummy_eval_spec)
assert view_instruction_detected("[VIEW file1]") is True
assert view_instruction_detected("[VIEW: file1]") is True
assert view_instruction_detected("[VIEW file1 section1]") is True
assert view_instruction_detected("[VIEW: file1 section1]") is True
assert view_instruction_detected("VIEW file1") is False
assert view_instruction_detected("[VIEW file1") is False
assert view_instruction_detected("VIEW file1]") is False
assert view_instruction_detected("[VIEW file1][VIEW file2]") is True
assert view_instruction_detected("[VIEW: file1][VIEW: file2]") is True


def test_process_answer():
SkillAcquisition(**dummy_eval_spec)
assert process_answer("[ANSWER foo]") == "foo"
assert process_answer("[ANSWER: foo]") == "foo"
assert process_answer("[ANSWER foo bar baz]") == "foo bar baz"
assert process_answer("[ANSWER: foo bar baz]") == "foo bar baz"
assert process_answer("[ANSWER foo][ANSWER bar]") == "bar"
assert process_answer("[ANSWER foo][ANSWER bar") == "foo"


def test_process_view_instruction():
SkillAcquisition(**dummy_eval_spec)
assert process_view_instruction("[VIEW file1]") == ("file1", None)
assert process_view_instruction("[VIEW: file1]") == ("file1", None)
assert process_view_instruction("[VIEW file1 section1]") == (
"file1",
"section1",
)
assert process_view_instruction("[VIEW: file1 section1]") == (
"file1",
"section1",
)
assert process_view_instruction("[VIEW file1][VIEW file2]") == (
"file2",
None,
)
assert process_view_instruction("[VIEW: file1][VIEW: file2]") == (
"file2",
None,
)
assert process_view_instruction("[VIEW file1 section1][VIEW file2 section2]") == (
"file2",
"section2",
)


def test_process_view_instruction_spaces_and_quotes():
assert process_view_instruction("[VIEW file1 sectionpart1 sectionpart2]") == (
"file1",
"sectionpart1 sectionpart2",
)
assert process_view_instruction("[VIEW file1 sectionpart1 'sectionpart2']") == (
"file1",
"sectionpart1 'sectionpart2'",
)


def test_view_content():
skill_acquisition_eval = SkillAcquisition(**dummy_eval_spec)

# Create a file to view first.
filepath = skill_acquisition_eval.knowledge_base_directory / "test_file.jsonl"
with open(filepath, "w") as f:
f.write(json.dumps({"title": "foo", "content": "Test file contents."}) + "\n")

content, sections_visible_to_model, sections_viewed = skill_acquisition_eval._view_content(
"test_file.jsonl"
)
assert content == "Table of contents for test_file.jsonl: {'foo'}."
assert sections_visible_to_model == {"test_file.jsonl": {"foo"}}
assert sections_viewed == {"test_file.jsonl": {"Table of Contents"}}

content, sections_visible_to_model, sections_viewed = skill_acquisition_eval._view_content(
"test_file.jsonl", "foo"
)
assert content == "Test file contents."
assert sections_visible_to_model == {"test_file.jsonl": {"foo"}}
assert sections_viewed == {"test_file.jsonl": {"Table of Contents", "foo"}}

os.remove(filepath)



class TestSkillAcquisition():
def setup_class(self):
os.environ["OPENAI_API_KEY"] = "test"
self.registry = Registry()
self.dummy_eval_spec = {
"eval_registry_path": Path("evals/registry"),
"completion_fns": [self.registry.make_completion_fn("gpt-4")],
"samples_jsonl": "skill_acquisition/miskito/miskito_test.jsonl",
"target_language": "miskito",
"n_samples": 5,
"knowledge_base_directory": "skill_acquisition/miskito/knowledge_base/",
"max_replies": 50,
}

def test_answer_detected(self):
assert answer_detected("[ANSWER foo]") is True
assert answer_detected("[ANSWER: foo]") is True
assert answer_detected("ANSWER foo") is False
assert answer_detected("[ANSWER foo") is False
assert answer_detected("ANSWER foo]") is False
assert answer_detected("[ANSWER foo][ANSWER bar]") is True


def test_view_instruction_detected(self):
SkillAcquisition(**self.dummy_eval_spec)
assert view_instruction_detected("[VIEW file1]") is True
assert view_instruction_detected("[VIEW: file1]") is True
assert view_instruction_detected("[VIEW file1 section1]") is True
assert view_instruction_detected("[VIEW: file1 section1]") is True
assert view_instruction_detected("VIEW file1") is False
assert view_instruction_detected("[VIEW file1") is False
assert view_instruction_detected("VIEW file1]") is False
assert view_instruction_detected("[VIEW file1][VIEW file2]") is True
assert view_instruction_detected("[VIEW: file1][VIEW: file2]") is True


def test_process_answer(self):
SkillAcquisition(**self.dummy_eval_spec)
assert process_answer("[ANSWER foo]") == "foo"
assert process_answer("[ANSWER: foo]") == "foo"
assert process_answer("[ANSWER foo bar baz]") == "foo bar baz"
assert process_answer("[ANSWER: foo bar baz]") == "foo bar baz"
assert process_answer("[ANSWER foo][ANSWER bar]") == "bar"
assert process_answer("[ANSWER foo][ANSWER bar") == "foo"


def test_process_view_instruction(self):
SkillAcquisition(**self.dummy_eval_spec)
assert process_view_instruction("[VIEW file1]") == ("file1", None)
assert process_view_instruction("[VIEW: file1]") == ("file1", None)
assert process_view_instruction("[VIEW file1 section1]") == (
"file1",
"section1",
)
assert process_view_instruction("[VIEW: file1 section1]") == (
"file1",
"section1",
)
assert process_view_instruction("[VIEW file1][VIEW file2]") == (
"file2",
None,
)
assert process_view_instruction("[VIEW: file1][VIEW: file2]") == (
"file2",
None,
)
assert process_view_instruction("[VIEW file1 section1][VIEW file2 section2]") == (
"file2",
"section2",
)


def test_process_view_instruction_spaces_and_quotes(self):
assert process_view_instruction("[VIEW file1 sectionpart1 sectionpart2]") == (
"file1",
"sectionpart1 sectionpart2",
)
assert process_view_instruction("[VIEW file1 sectionpart1 'sectionpart2']") == (
"file1",
"sectionpart1 'sectionpart2'",
)


def test_view_content(self):
skill_acquisition_eval = SkillAcquisition(**self.dummy_eval_spec)

# Create a file to view first.
filepath = skill_acquisition_eval.knowledge_base_directory / "test_file.jsonl"
with open(filepath, "w") as f:
f.write(json.dumps({"title": "foo", "content": "Test file contents."}) + "\n")

content, sections_visible_to_model, sections_viewed = skill_acquisition_eval._view_content(
"test_file.jsonl"
)
assert content == "Table of contents for test_file.jsonl: {'foo'}."
assert sections_visible_to_model == {"test_file.jsonl": {"foo"}}
assert sections_viewed == {"test_file.jsonl": {"Table of Contents"}}

content, sections_visible_to_model, sections_viewed = skill_acquisition_eval._view_content(
"test_file.jsonl", "foo"
)
assert content == "Test file contents."
assert sections_visible_to_model == {"test_file.jsonl": {"foo"}}
assert sections_viewed == {"test_file.jsonl": {"Table of Contents", "foo"}}

os.remove(filepath)
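The rewritten tests use pytest's xunit-style class setup so that the dummy OPENAI_API_KEY is exported before the Registry and completion functions are constructed. A minimal sketch of that mechanism (names and values are illustrative):

```python
import os


class TestWithSharedSetup:
    def setup_class(self):
        # pytest invokes this once, before any test in the class runs, so shared
        # fixtures and required environment variables are prepared up front.
        os.environ["OPENAI_API_KEY"] = "test"
        self.n_samples = 5

    def test_setup_ran_first(self):
        assert os.environ["OPENAI_API_KEY"] == "test"
        assert self.n_samples == 5
```
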
26 changes: 0 additions & 26 deletions evals/record.py
@@ -263,14 +263,6 @@ def record_final_report(self, final_report: Any):
logging.info(f"Final report: {final_report}. Not writing anywhere.")


def _green(str):
[Collaborator comment on this line: I assume we don't want these deletions in this file?]

return f"\033[1;32m{str}\033[0m"


def _red(str):
return f"\033[1;31m{str}\033[0m"


class DummyRecorder(RecorderBase):
"""
A "recorder" which only logs certain events to the console.
@@ -282,33 +274,15 @@ def __init__(self, run_spec: RunSpec, log: bool = True):
self.log = log

def record_event(self, type, data, sample_id=None):
from evals.registry import registry

if self.run_spec is None:
return

base_eval_spec = registry.get_base_eval(self.run_spec.base_eval)
if base_eval_spec and len(base_eval_spec.metrics) >= 1:
primary_metric = base_eval_spec.metrics[0]
else:
primary_metric = "accuracy"

with self._event_lock:
event = self._create_event(type, data)
self._events.append(event)

msg = f"Not recording event: {event}"

if type == "match":
accuracy_good = (
primary_metric == "accuracy" or primary_metric.startswith("pass@")
) and (data.get("correct", False) or data.get("accuracy", 0) > 0.5)
f1_score_good = primary_metric == "f1_score" and data.get("f1_score", 0) > 0.5
if accuracy_good or f1_score_good:
msg = _green(msg)
else:
msg = _red(msg)

if self.log:
logging.info(msg)
