-```
-
-Example output:
-```
-Trial 0 | Total Correct: 10 | Total Problems: 17
-```
diff --git a/samples/tools/testbed/scenarios/MATH/count_correct_math.py b/samples/tools/testbed/scenarios/MATH/count_correct_math.py
deleted file mode 100644
index 69766dfb0c5..00000000000
--- a/samples/tools/testbed/scenarios/MATH/count_correct_math.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import argparse
-import json
-import os
-
-
-def main(args):
- stars = "*" * 100
-
- # initialize the correct count for each trial
- correct_count = [0 for i in range(args.num_trials)]
-
- for i in range(args.num_trials):
- for problem_name in os.listdir(args.path):
- problem_path = os.path.join(args.path, problem_name, str(i))
- if os.path.isdir(problem_path):
- checker_file_path = os.path.join(problem_path, "checker_messages.json")
-
- with open(checker_file_path, "r") as file:
- checker_messages = json.load(file)
-
- check_result = checker_messages["checker_proxy"][-1]["content"].lower()
-
- if (
- "the answer is correct" in check_result
- or "the answer is approximated but should be correct" in check_result
- ):
- correct_count[i] += 1
- # print(f"{problem_name} | Correct")
- # else:
- # print(f"{problem_name} | Wrong")
-
- print(f"{stars}\nTrial {i} | Total Correct: {correct_count[i]} | Total Problems: {len(os.listdir(args.path))}")
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="""Print Math Problems results.""".strip(),
- )
- parser.add_argument(
- "--path",
- "-p",
- type=str,
- default="./results/problems/",
- help="Path to the problems directory",
- )
- # num trials
- parser.add_argument(
- "--num_trials",
- "-n",
- type=int,
- default=1,
- help="Number of trials to check",
- )
-
- args = parser.parse_args()
- main(args)
diff --git a/samples/tools/testbed/scenarios/MATH/problems_to_json.py b/samples/tools/testbed/scenarios/MATH/problems_to_json.py
deleted file mode 100644
index 4dd9dba0d12..00000000000
--- a/samples/tools/testbed/scenarios/MATH/problems_to_json.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import json
-
-problems = [
- "Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.",
- "Find the value of $a_2+a_4+a_6+a_8+\\dots+a_{98}$ if $a_1, a_2, a_3, \\ldots$ is an arithmetic progression with common difference $1$ and \\[a_1+a_2+a_3+\\dots+a_{98}=137.\\]",
- "Tina the tourist goes on a trip. She starts at the origin and drives north (in the positive $y$ direction) for $10$ units. Then she turns east (the positive $x$ direction) and as she's turning her camera flies out the window and lands exactly at $(0,10)$. She then drives $9$ units east, turns and drives $8$ units north. She continues this pattern of turning and driving one unit less than after the previous turn, until stopping after driving $1$ unit east. She reaches for her camera only to find it missing! She activates the GPS homing device on her camera and drives back to it in a straight line. What is the equation of this line? Express your answer as $ax+by=c$, where $a$, $b$, and $c$ are integers, $a>0$, and $a$ is as small as possible.",
- "For what negative value of $k$ is there exactly one solution to the system of equations \\begin{align*}\ny &= 2x^2 + kx + 6 \\\\\ny &= -x + 4?\n\\end{align*}",
- "If $\\frac{3x^2-4x+1}{x-1}=m$, and $x$ can be any real number except $1$, what real values can $m$ NOT have?",
- "Find all numbers $a$ for which the graph of $y=x^2+a$ and the graph of $y=ax$ intersect. Express your answer in interval notation.",
- "If $\\displaystyle{f(x)=x^{(x+1)}(x+2)^{(x+3)}}$, then find the value of $f(0)+f(-1)+f(-2)+f(-3)$.",
- "An envelope contains eight bills: 2 ones, 2 fives, 2 tens, and 2 twenties. Two bills are drawn at random without replacement. What is the probability that their sum is $\\$20$ or more?",
- "Find the coefficient of $x^2$ in the expansion of the product $$(1-x)(1+2x)(1-3x)\\dotsm(1+14x)(1-15x).$$",
- "All 50 states as well as the District of Columbia and Puerto Rico, have distinct two-letter postal abbreviations. If a two-letter sequence of letters (such as CO or EE) is chosen at random, what is the probability that it is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico? Express your answer as a common fraction.",
- "Let $x$ and $y$ be real numbers. Find the set of possible values of\n\\[\\frac{(x + y)(1 - xy)}{(1 + x^2)(1 + y^2)}.\\]",
- "On a number line, the coordinates of $P$ and $Q$ are 8 and 48, respectively. The midpoint of $\\overline{PQ}$ is $B$, the midpoint of $\\overline{BQ}$ is $C$, and the midpoint of $\\overline{PC}$ is $D$. What is the coordinate of $D$?",
- "Find $24^{-1} \\pmod{11^2}$. That is, find the residue $b$ for which $24b \\equiv 1\\pmod{11^2}$.\n\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive.",
- "There are two cameras that take pictures of a traffic intersection. Camera A starts taking pictures at $6$ AM and takes a picture every $11$ minutes. Camera B starts taking pictures at $7$ AM and takes pictures every $7$ minutes. Camera A and Camera B take a picture at the same time at four different times before noon. When Camera A and Camera B take their last picture together, how many minutes before noon is it?",
- "Let $z$ be a complex number such that $z^{13} = 1.$ Let $w_1,$ $w_2,$ $\\dots,$ $w_k$ be all the possible values of\n\\[z + z^3 + z^4 + z^9 + z^{10} + z^{12}.\\]Find $w_1^2 + w_2^2 + \\dots + w_k^2.$",
- "There are 190 people on the beach. 110 are wearing sunglasses, 70 are wearing bathing suits, and 95 are wearing a hat. Everyone is wearing at least one of these items. 30 are wearing both bathing suits and sunglasses. 25 are wearing both bathing suits and a hat. 40 are wearing both sunglasses and a hat. How many people are wearing all three items?",
- "Completely simplify and rationalize the denominator: $$\\frac{\\sqrt{160}}{\\sqrt{252}}\\times\\frac{\\sqrt{245}}{\\sqrt{108}}$$",
-]
-answers = [
- # 6 algebra
- "(-\\infty, -14)\\cup(-3,\\infty)",
- "93",
- "4x-5y=-50",
- "-5",
- "2",
- "(-\\infty,0]\\cup[4,\\infty)",
- # 11 problems, 2 from each category (1 algebra problem was deleted)
- "\\frac{10}{9}",
- "\\frac{1}{2}",
- "-588",
- " \\frac{1}{13}",
- "\\left[ -\\frac{1}{2}, \\frac{1}{2} \\right]",
- "23",
- "116",
- "41",
- "43",
- "10",
- "\\frac{5\\sqrt{42}}{27}",
-]
-
-
-def problem_to_json():
- with open("problems.jsonl", "w") as f:
- for i, problem in enumerate(problems):
- # a = {
- # 'id': problem{i}',
- # 'template': 'scenario.py',
- # 'substitutions': {
- # '__PROMPT__': problem,
- # '__ANSWER__': answers[i],
- # },
- # }
- a = {
- "id": f"problem{i}",
- "template": "./",
- "substitutions": {"prompt.txt": {"__PROMPT__": problem}, "answer.txt": {"__ANSWER__": answers[i]}},
- }
- # Convert the dictionary to a JSON string and write it to the file
- json_string = json.dumps(a)
- f.write(json_string + "\n") # Add a newline character after each JSON object
-
-
-problem_to_json()
-
-problems = []
-with open("problems.jsonl", "r") as file:
- for line in file:
- # Parse each line as a JSON object
- problem = json.loads(line)
- problems.append(problem)
- print(problem["substitutions"])
- print()
-
-# Now 'problems' is a list of dictionaries, each representing a problem
diff --git a/samples/tools/testbed/utils/collate_autogpt.py b/samples/tools/testbed/utils/collate_autogpt.py
deleted file mode 100644
index 3dc8bcdba59..00000000000
--- a/samples/tools/testbed/utils/collate_autogpt.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import argparse
-import os
-import re
-import subprocess
-import sys
-
-
-def collate(results_dir="results"):
- """
- Collate the results of running the AutoGPT tests.
-
- Args:
- results_dir (str, optional): The folder where results are saved. Defaults to "results".
- """
-
- all_results = list()
- max_instances = 0
-
- for test_name in os.listdir(results_dir):
- test_path = os.path.join(results_dir, test_name)
-
- # Collect the results vector
- results = [test_name]
-
- instance = 0
- instance_dir = os.path.join(test_path, str(instance))
- while os.path.isdir(instance_dir):
- console_log = os.path.join(instance_dir, "console_log.txt")
- if os.path.isfile(console_log):
- with open(console_log, "rt") as fh:
- content = fh.read()
- if "ALL TESTS PASSED!" in content:
- # Ideally we would have a more distinctive pattern.
- results.append(str(len(re.findall(r"\n(.*?) \(to (.*?)\)\:\n", content))))
- else:
- # Sometimes the task actually succeeds, but the check.py isn't properly called
- result = subprocess.run(
- [sys.executable, "../check.py"],
- cwd=os.path.join(instance_dir, "coding"),
- capture_output=True,
- text=True,
- )
- if "error" in result.stderr or result.returncode != 0:
- results.append("-1")
- else:
- # The task actually succeeds.
- if "ALL TESTS PASSED!" in result.stdout:
- results.append(str(len(re.findall(r"\n(.*?) \(to (.*?)\)\:\n", content))))
- else:
- results.append("-1")
- else:
- # Missing results will appear as blanks
- results.append("")
-
- instance += 1
- instance_dir = os.path.join(test_path, str(instance))
-
- max_instances = max(max_instances, instance)
-
- # Buffer the results
- all_results.append(results)
-
- # Create a header
- header = "TestName"
- for i in range(0, max_instances):
- header += ",Trial" + str(i)
- print(header)
-
- # Print a fully-populated table of results
- for r in all_results:
- while len(r) < max_instances + 1:
- r.append("")
- print(",".join(r))
-
-
-if __name__ == "__main__":
- script_path = os.path.realpath(__file__)
- script_name = os.path.basename(script_path)
- script_dir = os.path.dirname(script_path)
-
- # Path to the default results directory
- # (relative to this script, up one directory, then into the results folder)
- default_results_dir = os.path.realpath(os.path.join(script_dir, os.path.pardir, "results"))
-
- parser = argparse.ArgumentParser(
- description=f"""
-{script_name} will collate the results of the AutoGPT scenarios and output them to a CSV. The CSV format is as follows:
-
-TestName, Trial0, Trial1, ..., TrialN
-Test_1, x_10, x_11, ..., X_1N
-Test_2, x_20, x_21, ..., X_2N
-...
-Test_M, x_M0, x_M1, ..., X_MN
-
-
-Where x_ij is the number of AssistantAgent conversation turns needed to pass all the tests for problem i, in Trial/repetition j. If the agent was not able to pass the tests by the end of the conversation, the value will be -1. If data for the trial is missing, the value will be an empty string "".
-""".strip(),
- formatter_class=argparse.RawTextHelpFormatter,
- )
-
- parser.add_argument(
- "scenario",
- nargs="?",
- help="Path to the scenario results. (default: " + default_results_dir + ")",
- default=default_results_dir,
- )
- args = parser.parse_args()
- collate(args.scenario)
diff --git a/samples/tools/testbed/utils/collate_gaia_csv.py b/samples/tools/testbed/utils/collate_gaia_csv.py
deleted file mode 100644
index 88f1ec819ed..00000000000
--- a/samples/tools/testbed/utils/collate_gaia_csv.py
+++ /dev/null
@@ -1,128 +0,0 @@
-import os
-import json
-import re
-import sys
-import argparse
-
-
-def normalize_answer(a):
- # Lower case
- # Trim (left and right)
- # Replace multiple spaces with one space
- # Remove trailing punctuation
- return re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", a.strip().lower()))
-
-
-def collate(results_dir):
- """
- Collate the results of running GAIA
-
- Args:
- results_dir (path): The folder where results are saved.
- """
-
- all_results = list()
- max_instances = 0
-
- for test_id in os.listdir(results_dir):
- test_path = os.path.join(results_dir, test_id)
-
- # Collect the results vector
- results = [test_id]
-
- instance = 0
- instance_dir = os.path.join(test_path, str(instance))
- while os.path.isdir(instance_dir):
- expected_answer_file = os.path.join(instance_dir, "expected_answer.txt")
- if not os.path.isfile(expected_answer_file):
- # Expected answer is missing
- results.append("")
-
- instance += 1
- instance_dir = os.path.join(test_path, str(instance))
- continue
-
- expected_answer = "!!!NULL ANSWER!!!"
- with open(expected_answer_file, "rt") as fh:
- expected_answer = fh.read().strip()
-
- console_log_file = os.path.join(instance_dir, "console_log.txt")
- if not os.path.isfile(console_log_file):
- # Console log file missing
- results.append("")
-
- instance += 1
- instance_dir = os.path.join(test_path, str(instance))
- continue
-
- with open(console_log_file, "rt") as fh:
- console_log = fh.read()
-
- final_answer = ""
- m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)
- if m:
- final_answer = m.group(1).strip()
-
- # print(f"Expected Answer: {expected_answer}\nAutogen Answer: {final_answer}\n")
-
- if normalize_answer(expected_answer) == normalize_answer(final_answer):
- results.append("1")
- else:
- results.append("-1")
-
- instance += 1
- instance_dir = os.path.join(test_path, str(instance))
-
- max_instances = max(max_instances, instance)
-
- # Buffer the results
- all_results.append(results)
-
- # Create a header
- header = "TestId"
- for i in range(0, max_instances):
- header += ",Trial" + str(i)
- print(header)
-
- # Print a fully-populated table of results
- for r in all_results:
- while len(r) < max_instances + 1:
- r.append("")
- print(",".join(r))
-
-
-###############################################################################
-if __name__ == "__main__":
- script_path = os.path.realpath(__file__)
- script_name = os.path.basename(script_path)
- script_dir = os.path.dirname(script_path)
-
- # Path to the default results directory
- # (relative to this script, up one directory, then into the results folder)
- default_results_dir = os.path.realpath(
- os.path.join(script_dir, os.path.pardir, "results", "gaia_validation_level_1__two_agents_gpt4")
- )
-
- parser = argparse.ArgumentParser(
- description=f"""
-{script_name} will collate the results of the GAIA scenarios and output them to a CSV. The CSV format is as follows:
-
-TestId, Trial0, Trial1, ..., TrialN
-uuid_1, x_10, x_11, ..., X_1N
-uuid_2, x_20, x_21, ..., X_2N
-...
-uuid_M, x_M0, x_M1, ..., X_MN
-
-Where uuid_i is the identifier of the ith test question, and x_ij is 1 or -1 depending on whether the test passed or failed, respectively. If data for the trial is missing (e.g., due to a runtime error), the value will be an empty string "".
-""".strip(),
- formatter_class=argparse.RawTextHelpFormatter,
- )
-
- parser.add_argument(
- "scenario",
- nargs="?",
- help="Path to the scenario results. (default: " + default_results_dir + ")",
- default=default_results_dir,
- )
- args = parser.parse_args()
- collate(args.scenario)
diff --git a/samples/tools/testbed/utils/collate_gaia_jsonl.py b/samples/tools/testbed/utils/collate_gaia_jsonl.py
deleted file mode 100644
index 6a4ac07cad3..00000000000
--- a/samples/tools/testbed/utils/collate_gaia_jsonl.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import os
-import json
-import re
-import sys
-import argparse
-
-
-def normalize_answer(a):
- # Trim (left and right)
- # Replace multiple spaces with one space
- # Remove trailing punctuation
- # Trim again
- return re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", a.strip())).strip()
-
-
-def collate(results_dir, instance=0):
- """
- Collate the results of running GAIA. Print the results in the format accepted by the leaderboard.
-
- Args:
- results_dir (path): The folder where results are saved.
- """
-
- for test_id in os.listdir(results_dir):
- test_path = os.path.join(results_dir, test_id)
-
- instance_dir = os.path.join(test_path, str(instance))
- console_log_file = os.path.join(instance_dir, "console_log.txt")
-
- final_answer = ""
- if os.path.isfile(console_log_file):
- with open(console_log_file, "rt") as fh:
- console_log = fh.read()
-
- final_answer = ""
- m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)
- if m:
- final_answer = normalize_answer(m.group(1))
-
- # Clean up the GAIA logs so they don't have the Docker setup preamble
- m = re.search(r"^.*?\r?\n(user_proxy \(to assistant\).*$)", console_log, re.DOTALL)
- if m:
- console_log = m.group(1)
-
- print(json.dumps({"task_id": test_id, "model_answer": final_answer, "reasoning_trace": console_log}))
-
-
-###############################################################################
-if __name__ == "__main__":
- script_path = os.path.realpath(__file__)
- script_name = os.path.basename(script_path)
- script_dir = os.path.dirname(script_path)
-
- # Path to the default results directory
- # (relative to this script, up on directory, then into the results folder)
- default_results_dir = os.path.realpath(
- os.path.join(script_dir, os.path.pardir, "results", "gaia_validation_level_1__two_agents_gpt4")
- )
-
- parser = argparse.ArgumentParser(
- description=f"""
-{script_name} will collate the results of the GAIA scenarios into the jsonl format that can be submitted to the GAIA leaderboard.
-
-NOTE: You will likely need to concatenate results for level 1, level 2 and level 3 to form a complete submission.
-""".strip(),
- formatter_class=argparse.RawTextHelpFormatter,
- )
-
- parser.add_argument(
- "scenario",
- nargs="?",
- help="Path to the scenario results. (default: " + default_results_dir + ")",
- default=default_results_dir,
- )
- args = parser.parse_args()
- collate(args.scenario)
diff --git a/samples/tools/testbed/utils/collate_human_eval.py b/samples/tools/testbed/utils/collate_human_eval.py
deleted file mode 100644
index e46c83f84fc..00000000000
--- a/samples/tools/testbed/utils/collate_human_eval.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import os
-import json
-import re
-import sys
-import argparse
-
-
-def collate(results_dir):
- """
- Collate the results of running HumanEval.
-
- Args:
- results_dir (path): The folder where results are saved.
- """
-
- all_results = list()
- max_instances = 0
-
- for test_id in os.listdir(results_dir):
- test_path = os.path.join(results_dir, test_id)
-
- # Collect the results vector
- results = [test_id]
-
- instance = 0
- instance_dir = os.path.join(test_path, str(instance))
- while os.path.isdir(instance_dir):
- console_log = os.path.join(instance_dir, "console_log.txt")
- if os.path.isfile(console_log):
- with open(console_log, "rt") as fh:
- content = fh.read()
- if "ALL TESTS PASSED !#!#" in content:
- # Ideally we would have a more distinctive pattern.
- results.append(str(len(re.findall(r"\n(.*?) \(to (.*?)\)\:\n", content))))
- else:
- results.append("-1")
-
- else:
- # Missing results will appear as blanks
- results.append("")
-
- instance += 1
- instance_dir = os.path.join(test_path, str(instance))
-
- max_instances = max(max_instances, instance)
-
- # Buffer the results
- all_results.append(results)
-
- # Create a header
- header = "TestId"
- for i in range(0, max_instances):
- header += ",Trial" + str(i)
- print(header)
-
- # Print a fully-populated table of results
- for r in all_results:
- while len(r) < max_instances + 1:
- r.append("")
- print(",".join(r))
-
-
-###############################################################################
-if __name__ == "__main__":
- script_path = os.path.realpath(__file__)
- script_name = os.path.basename(script_path)
- script_dir = os.path.dirname(script_path)
-
- # Path to the default results directory
- # (relative to this script, up one directory, then into the results folder)
- default_results_dir = os.path.realpath(
- os.path.join(script_dir, os.path.pardir, "results", "human_eval_two_agents_gpt4")
- )
-
- parser = argparse.ArgumentParser(
- description=f"""
-{script_name} will collate the results of the HumanEval scenarios and output them to a CSV. The CSV format is as follows:
-
-TestId, Trial0, Trial1, ..., TrialN
-HumanEval_1, x_10, x_11, ..., X_1N
-HumanEval_2, x_20, x_21, ..., X_2N
-...
-HumanEval_M, x_M0, x_M1, ..., X_MN
-
-
-Where x_ij is the number of AssistantAgent conversation turns needed to pass all the tests for problem i, in Trial/repetition j. If the agent was not able to pass the tests by the end of the conversation, the value will be -1. If data for the trial is missing, the value will be an empty string "".
-""".strip(),
- formatter_class=argparse.RawTextHelpFormatter,
- )
-
- parser.add_argument(
- "scenario",
- nargs="?",
- help="Path to the scenario results. (default: " + default_results_dir + ")",
- default=default_results_dir,
- )
- args = parser.parse_args()
- collate(args.scenario)
diff --git a/samples/tools/testbed/utils/expand_gaia.py b/samples/tools/testbed/utils/expand_gaia.py
deleted file mode 100644
index ed751b08132..00000000000
--- a/samples/tools/testbed/utils/expand_gaia.py
+++ /dev/null
@@ -1,110 +0,0 @@
-#
-# Run this file to convert a local clone of the GAIA dataset into the corresponding testbed scenarios:
-# (default output: ../scenarios/GAIA/gaia_[validation|test]_level_[1|2|3]__two_agents_gpt4.jsonl)
-#
-
-import json
-import os
-import sys
-import shutil
-
-SCRIPT_PATH = os.path.realpath(__file__)
-SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
-SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
-SCENARIOS_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir, "scenarios", "GAIA"))
-
-
-def create_jsonl(name, tasks, template, model):
- """Creates a JSONL scenario file with a given name, list of HumanEval tasks, template path, and model."""
-
- with open(os.path.join(SCENARIOS_DIR, name + ".jsonl"), "wt") as fh:
- for task in tasks:
- print(f"Converting: [{name}] {task['task_id']}")
-
- # Figure out what files we need to copy
- template_cp_list = [template]
- if len(task["file_name"].strip()) > 0:
- template_cp_list.append(
- [
- os.path.join("GAIA_Files", task["file_name"].strip()),
- os.path.join("coding", task["file_name"].strip()),
- ]
- )
-
- record = {
- "id": task["task_id"],
- "template": template_cp_list,
- "substitutions": {
- "scenario.py": {
- "__MODEL__": model,
- "__FILE_NAME__": task["file_name"],
- "__PROMPT__": task["Question"],
- },
- "expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
- },
- }
-
- fh.write(json.dumps(record).strip() + "\n")
-
-
-###############################################################################
-if __name__ == "__main__":
- if len(sys.argv) != 2:
- sys.exit(
- f"SYNTAX: python {SCRIPT_NAME} [path to GIA repository]\n\nNote: to clone the GAIA repository, do 'git clone https://huggingface.co/datasets/gaia-benchmark/GAIA'"
- )
-
- # Copy the relevant GAIA files
- gaia_path = os.path.realpath(sys.argv[1])
-
- gaia_validation_files = os.path.join(gaia_path, "2023", "validation")
- gaia_test_files = os.path.join(gaia_path, "2023", "test")
-
- if not os.path.isdir(gaia_validation_files) or not os.path.isdir(gaia_test_files):
- sys.exit(f"Error: '{gaia_path}' does not appear to be a copy of the GAIA repository.")
-
- gaia_merged_files = os.path.realpath(os.path.join(SCENARIOS_DIR, "GAIA_Files"))
-
- shutil.copytree(
- gaia_validation_files, gaia_merged_files, ignore=shutil.ignore_patterns("metadata.jsonl"), dirs_exist_ok=True
- )
- shutil.copytree(
- gaia_test_files, gaia_merged_files, ignore=shutil.ignore_patterns("metadata.jsonl"), dirs_exist_ok=True
- )
-
- # Load the GAIA data
- gaia_validation_tasks = [[], [], []]
- with open(os.path.join(gaia_validation_files, "metadata.jsonl")) as fh:
- for line in fh:
- data = json.loads(line)
- gaia_validation_tasks[data["Level"] - 1].append(data)
-
- gaia_test_tasks = [[], [], []]
- with open(os.path.join(gaia_test_files, "metadata.jsonl")) as fh:
- for line in fh:
- data = json.loads(line)
- gaia_test_tasks[data["Level"] - 1].append(data)
-
- models = {
- "gpt4": "gpt-4",
- }
-
- templates = {
- "two_agents": "Templates/BasicTwoAgents",
- }
-
- # Add coding directories if needed (these are usually empty and left out of the repo)
- for template in templates.values():
- code_dir_path = os.path.join(SCENARIOS_DIR, template, "coding")
- if not os.path.isdir(code_dir_path):
- os.mkdir(code_dir_path)
-
- # Create the various combinations of [models] x [templates]
- for m in models.items():
- for t in templates.items():
- create_jsonl(f"gaia_validation_level_1__{t[0]}_{m[0]}", gaia_validation_tasks[0], t[1], m[1])
- create_jsonl(f"gaia_validation_level_2__{t[0]}_{m[0]}", gaia_validation_tasks[1], t[1], m[1])
- create_jsonl(f"gaia_validation_level_3__{t[0]}_{m[0]}", gaia_validation_tasks[2], t[1], m[1])
- create_jsonl(f"gaia_test_level_1__{t[0]}_{m[0]}", gaia_test_tasks[0], t[1], m[1])
- create_jsonl(f"gaia_test_level_2__{t[0]}_{m[0]}", gaia_test_tasks[1], t[1], m[1])
- create_jsonl(f"gaia_test_level_3__{t[0]}_{m[0]}", gaia_test_tasks[2], t[1], m[1])
diff --git a/samples/tools/testbed/utils/metrics_gaia.py b/samples/tools/testbed/utils/metrics_gaia.py
deleted file mode 100644
index 6119f4f38f4..00000000000
--- a/samples/tools/testbed/utils/metrics_gaia.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import os
-import sys
-import argparse
-import csv
-
-
-def metrics(results_fh):
- """
- Compute metrics from collated GAIA results.
-
- Args:
- results_fh (File Stream): A file stream containing the collated results in CSV.
- """
-
- reader = csv.reader(results_fh)
- first_row = next(reader) # Read the first line
-
- num_trials = len(first_row) - 1 # Don't count the first column (TestId)
-
- # Set up the counters
- counters = []
- for i in range(0, num_trials):
- counters.append({"successes": 0, "failures": 0, "missing": 0})
-
- # Load the results. We'll need to iterate over them a few times.
- results = list()
- for row in reader:
- name = row[0]
- trials = [(None if v.strip() == "" else int(v)) for v in row[1:]]
- for i in range(0, len(trials)):
- v = trials[i]
- if v is None:
- counters[i]["missing"] += 1
- elif v > 0:
- counters[i]["successes"] += 1
- else:
- counters[i]["failures"] += 1
-
- results.append([name, trials])
-
- def _safe_div(num, denom):
- if denom == 0:
- return ""
- else:
- return num / denom
-
- # Print the per-trial counts and score (one row per trial)
- for i in range(0, len(counters)):
- counter = counters[i]
- n = counter["successes"] + counter["failures"] + counter["missing"]
- score = _safe_div(counter["successes"], n)
- print(f"{i},{n},{counter['successes']},{counter['failures']},{counter['missing']},{score}")
-
-
-###############################################################################
-if __name__ == "__main__":
- script_path = os.path.realpath(__file__)
- script_name = os.path.basename(script_path)
- script_dir = os.path.dirname(script_path)
-
- parser = argparse.ArgumentParser(
- description=f"""
-{script_name} will compute metrics on the collated results of the GAIA scenarios. Use collate_gaia.py to prepare input to this script.
-
-The output will be formatted as a CSV with the following schema:
-
-Trial, n, successes, failures, missing, score
-0, N_0, s_0, f_0, m_0, p_0
-1, N_1, s_1, f_1, m_1, p_1
-...
-M, N_M, s_M, f_M, m_M, p_M
-
-Where:
-
- N_i is the number of questions in trial i
- s_i is the number of successes in trial i
- f_i is the number of failures in trial i
- m_i is the number of missing values in trial i
- p_i is the proportion of successes in trial i (i.e., s_i / N_i)
-
-""".strip(),
- formatter_class=argparse.RawTextHelpFormatter,
- )
-
- parser.add_argument(
- "scenario",
- nargs="?",
- help="Path to collated results. If '-' or omitted, read from stdin. (default: '-')",
- default="-",
- )
- args = parser.parse_args()
-
- if args.scenario == "" or args.scenario == "-":
- metrics(sys.stdin)
- else:
- with open(args.scenario, "rt") as fh:
- metrics(fh)
diff --git a/samples/tools/testbed/utils/metrics_human_eval.py b/samples/tools/testbed/utils/metrics_human_eval.py
deleted file mode 100644
index 25d9aa90fda..00000000000
--- a/samples/tools/testbed/utils/metrics_human_eval.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import os
-import sys
-import argparse
-import csv
-
-
-def metrics(results_fh):
- """
- Compute metrics from collated HumanEval results.
-
- Args:
- results_fh (File Stream): A file stream containing the collated results in CSV.
- """
-
- reader = csv.reader(results_fh)
- first_row = next(reader) # Read the first line
-
- num_trials = len(first_row) - 1 # Don't count the first column (TestId)
- max_turns = 0
- num_rows = 0
-
- # Load the results. We'll need to iterate over them a few times.
- results = list()
- for row in reader:
- num_rows += 1
-
- name = row[0]
- trials = [(None if v.strip() == "" else int(v)) for v in row[1:]]
- for v in trials:
- if v is not None:
- max_turns = max(max_turns, v)
- results.append([name, trials])
-
- # Print the header
- header = ["Trial"]
- for i in range(1, max_turns + 1):
- header.append("cumulative_passes_by_turn_" + str(i))
- header.append("fails")
- header.append("missing")
- print(",".join(header))
-
- # Compute the metrics
- def _metrics_for_trial(t):
- counts = [None]
- fails = 0
- missing = 0
-
- # Compute cumulative passes for each conversation turn
- for i in range(1, max_turns + 1):
- counts.append(0)
- assert len(counts) == i + 1
-
- for r in results:
- v = r[1][t]
- if v is not None:
- v = int(v)
- if 0 <= v and v <= i:
- counts[i] += 1
-
- # Count missing and failed
- for r in results:
- v = r[1][t]
- if v is None:
- missing += 1
- elif int(v) < 0:
- fails += 1
-
- # Prepare the row in the format specified by the header
- return str(t) + "," + ",".join([str(v) for v in counts[1:]]) + "," + str(fails) + "," + str(missing)
-
- # Print each row
- for t in range(0, num_trials):
- print(_metrics_for_trial(t))
-
-
-###############################################################################
-if __name__ == "__main__":
- script_path = os.path.realpath(__file__)
- script_name = os.path.basename(script_path)
- script_dir = os.path.dirname(script_path)
-
- parser = argparse.ArgumentParser(
- description=f"""
-{script_name} will compute metrics on the collated results of the HumanEval scenarios. Use collate_human_eval.py to prepare input to this script.
-
-The output will be formatted as a CSV with the following schema:
-
-Trial, cumulative_passes_by_turn_1, ..., cumulative_passes_by_turn_N, fails, missing
-0, x_01, ..., x_0N, y_0, z_0
-1, x_11, ..., x_1N, y_1, z_1
-...
-M, x_M1, ..., x_MN, y_M, z_M
-
-Where:
-
- x_ij is the number of HumanEval problems in Trial i that achieved a passing result by conversation turn j.
- y_i is the number of HumanEval problems in Trial i that never achieved a passing result (they failed).
- z_i is the number of HumanEval problems in Trial i that have missing data.
-
-""".strip(),
- formatter_class=argparse.RawTextHelpFormatter,
- )
-
- parser.add_argument(
- "scenario",
- nargs="?",
- help="Path to collated results. If '-' or omitted, read from stdin. (default: '-')",
- default="-",
- )
- args = parser.parse_args()
-
- if args.scenario == "" or args.scenario == "-":
- metrics(sys.stdin)
- else:
- with open(args.scenario, "rt") as fh:
- metrics(fh)
diff --git a/samples/tools/testbed/utils/prepare_autogpt.py b/samples/tools/testbed/utils/prepare_autogpt.py
deleted file mode 100644
index 9da27973545..00000000000
--- a/samples/tools/testbed/utils/prepare_autogpt.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import base64
-import glob
-import json
-import os
-import shutil
-
-current_file_dir = os.path.dirname(os.path.abspath(__file__))
-challenge_path = os.path.join(os.path.dirname(current_file_dir), "scenarios/AutoGPT/challenges")
-data_paths = glob.glob(str(challenge_path) + "/*/data.json")
-
-for data_path in data_paths:
- print("Converting data path: ", data_path)
- workspace = os.path.dirname(data_path)
-
- with open(data_path, "r") as f:
- data = json.load(f)
-
- should_contain = data["ground"].get("should_contain", [])
- should_not_contain = data["ground"].get("should_not_contain", [])
- case_sensitive = data["ground"].get("case_sensitive", False)
-
- # The 'should_contain' field may contain escape characters, which causes problems when round-tripping through str() and eval(), so the entries are base64-encoded to avoid these issues.
- should_contain_base64 = []
- for word in should_contain:
- encoded_word = base64.b64encode(word.encode("utf-8")).decode("utf-8")
- should_contain_base64.append(encoded_word)
-
- should_not_contain_base64 = []
- for word in should_not_contain:
- encoded_word = base64.b64encode(word.encode("utf-8")).decode("utf-8")
- should_not_contain_base64.append(encoded_word)
-
- # copy all the files needed to 'coding' directory
- # 1. 'artifacts_in' directory: all the files needed for QA
- save_path = os.path.join(os.path.dirname(current_file_dir), "scenarios/AutoGPT")
- artifacts_in = False
- if os.path.exists(os.path.join(workspace, "artifacts_in")):
- artifacts_in = True
- target_folder = os.path.join(save_path, "Templates/TwoAgents/coding/file", data["name"])
- if os.path.exists(target_folder):
- shutil.rmtree(target_folder)
- shutil.copytree(os.path.join(workspace, "artifacts_in"), target_folder)
- # print(f"All the artifacts are copied from {os.path.join(workspace, 'artifacts_in')} to {target_folder}")
-
- # 2. 'custom_python' directory: all the files needed for testing python code
- if os.path.exists(os.path.join(workspace, "custom_python")):
- target_folder = os.path.join(save_path, "Templates/TwoAgents/custom_python")
- if not os.path.exists(target_folder):
- os.makedirs(target_folder)
- for filename in os.listdir(os.path.join(workspace, "custom_python")):
- shutil.copy(os.path.join(workspace, "custom_python", filename), os.path.join(target_folder, filename))
- # print(f"File copied from {os.path.join(workspace, 'custom_python', filename)} to {target_folder}")
-
- record = {
- "id": data["name"],
- "template": "Templates/TwoAgents",
- "substitutions": {
- "scenario.py": {
- "__MODEL__": "gpt-35-turbo-16k",
- "__TASK__": data["task"],
- "__TARGET_FOLDER__": f"file/{data['name']}" if artifacts_in else "",
- },
- "check.py": {
- "__FILE_PATTERN__": data["ground"]["files"][0],
- "__EVAL_TYPE__": data["ground"]["eval"]["type"],
- "__CASE_SENSITIVE__": str(case_sensitive),
- },
- "should_contain.txt": {
- "__CONTAIN__": str(should_contain_base64),
- },
- "should_not_contain.txt": {
- "__NO_CONTAIN__": str(should_not_contain_base64),
- },
- },
- }
- with open(os.path.join(save_path, "autogpt_twoagent_gpt35.jsonl"), "a") as f:
- f.write(json.dumps(record).strip() + "\n")
-
- record = {
- "id": data["name"],
- "template": "Templates/TwoAgents",
- "substitutions": {
- "scenario.py": {
- "__MODEL__": "gpt-4-1106-preview",
- "__TASK__": data["task"],
- "__TARGET_FOLDER__": f"file/{data['name']}" if artifacts_in else "",
- },
- "check.py": {
- "__FILE_PATTERN__": data["ground"]["files"][0],
- "__EVAL_TYPE__": data["ground"]["eval"]["type"],
- "__CASE_SENSITIVE__": str(case_sensitive),
- },
- "should_contain.txt": {
- "__CONTAIN__": str(should_contain_base64),
- },
- "should_not_contain.txt": {
- "__NO_CONTAIN__": str(should_not_contain_base64),
- },
- },
- }
- with open(os.path.join(save_path, "autogpt_twoagent_gpt4.jsonl"), "a") as f:
- f.write(json.dumps(record).strip() + "\n")
diff --git a/website/blog/2024-01-25-AutoGenBench/img/teaser.jpg b/website/blog/2024-01-25-AutoGenBench/img/teaser.jpg
new file mode 100755
index 00000000000..00571529bad
Binary files /dev/null and b/website/blog/2024-01-25-AutoGenBench/img/teaser.jpg differ
diff --git a/website/blog/2024-01-25-AutoGenBench/index.mdx b/website/blog/2024-01-25-AutoGenBench/index.mdx
new file mode 100644
index 00000000000..a1f34efeb28
--- /dev/null
+++ b/website/blog/2024-01-25-AutoGenBench/index.mdx
@@ -0,0 +1,131 @@
+---
+title: "AutoGenBench -- A Tool for Measuring and Evaluating AutoGen Agents"
+authors:
+ - afourney
+ - qingyunwu
+tags: [AutoGen]
+---
+
+![AutoGenBench](img/teaser.jpg)
+AutoGenBench is a standalone tool for evaluating AutoGen agents and workflows on common benchmarks.
+
+
+## TLDR
+Today we are releasing AutoGenBench – a tool for evaluating AutoGen agents and workflows on established LLM and agentic benchmarks.
+
+AutoGenBench is a standalone command line tool, installable from PyPI, which handles downloading, configuring, running, and reporting results for the supported benchmarks. AutoGenBench works best when run alongside Docker, since it uses Docker to isolate tests from one another.
+
+* See the [AutoGenBench README](https://github.com/microsoft/autogen/blob/main/samples/tools/testbed/README.md) for information on installation and running benchmarks.
+* See the [AutoGenBench CONTRIBUTING guide](https://github.com/microsoft/autogen/blob/main/samples/tools/testbed/CONTRIBUTING.md) for information on developing or contributing benchmark datasets.
+
+
+### Quick Start
+Get started quickly by running the following commands in a bash terminal.
+
+*Note:* You may need to adjust the path to the `OAI_CONFIG_LIST`, as appropriate.
+```sh
+export OAI_CONFIG_LIST=$(cat ./OAI_CONFIG_LIST)
+pip install autogenbench
+autogenbench clone HumanEval
+cd HumanEval
+cat README.md
+autogenbench run --subsample 0.1 --repeat 3 Tasks/human_eval_two_agents.jsonl
+autogenbench tabulate Results/human_eval_two_agents
+```
+
+## Introduction
+Measurement and evaluation are core components of every major AI or ML research project. The same is true for AutoGen. To this end, today we are releasing AutoGenBench, a standalone command line tool that we have been using to guide the development of AutoGen. Conveniently, AutoGenBench handles downloading, configuring, running, and reporting the results of agents on various public benchmark datasets. In addition to reporting top-line numbers, each AutoGenBench run produces a comprehensive set of logs and telemetry that can be used for debugging, profiling, computing custom metrics, and as input to [AgentEval](https://microsoft.github.io/autogen/blog/2023/11/20/AgentEval). In the remainder of this blog post, we outline the core design principles of AutoGenBench (key to understanding its operation); present a guide to installing and running AutoGenBench; outline a roadmap for evaluation; and conclude with an open call for contributions.
+
+## Design Principles
+AutoGenBench is built around three core design principles. Knowing these principles will help you understand the tool, its operation, and its output. These principles are:
+- **Repetition:** LLMs are stochastic, and in many cases, so too is the code they write to solve problems. For example, a Python script might call an external search engine, and the results may vary run-to-run. This can lead to variance in agent performance. Repetition is key to measuring and understanding this variance. To this end, AutoGenBench is built from the ground up with an understanding that tasks may be run multiple times, and that variance is a metric we often want to measure.
+
+- **Isolation:** Agents interact with their worlds in both subtle and overt ways. For example, an agent may install a Python library or write a file to disk. This can lead to ordering effects that can impact future measurements. Consider, for example, comparing two agents on a common benchmark. One agent may appear more efficient than the other simply because it ran second and benefited from the hard work the first agent did in installing and debugging the necessary Python libraries. To address this, AutoGenBench isolates each task in its own Docker container. This ensures that all runs start with the same initial conditions. (Docker is also a *much safer way to run agent-produced code*, in general.)
+
+- **Instrumentation:** While top-line metrics are great for comparing agents or models, we often want much more information about how the agents are performing, where they are getting stuck, and how they can be improved. We may also later think of new research questions that require computing a different set of metrics. To this end, AutoGenBench is designed to log everything, and to compute metrics from those logs. This ensures that one can always go back to the logs to answer questions about what happened, run profiling software, or feed the logs into tools like [AgentEval](https://microsoft.github.io/autogen/blog/2023/11/20/AgentEval). A minimal example of mining these logs directly is sketched below.
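+
+As a toy illustration of this log-driven approach, a simple custom metric can be recovered straight from the saved console logs. The sketch below is illustrative only; it assumes the per-task `console_log.txt` layout and the `ALL TESTS PASSED` marker emitted by the HumanEval scenarios, and your layout and markers may differ.
+
+```sh
+# Count how many task logs under a results folder contain the HumanEval success marker.
+grep -rl "ALL TESTS PASSED" Results/human_eval_two_agents --include="console_log.txt" | wc -l
+```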
+
+## Installing and Running AutoGenBench
+As noted above, isolation is a key design principle, so AutoGenBench must be run in an environment where Docker is available (Docker Desktop or Docker Engine). **It will not run in GitHub Codespaces**, unless you opt for native execution (which is strongly discouraged). To install Docker Desktop, see [https://www.docker.com/products/docker-desktop/](https://www.docker.com/products/docker-desktop/).
+Once Docker is installed, AutoGenBench can be installed as a standalone tool from PyPI using `pip`:
+
+```sh
+pip install autogenbench
+```
+After installation, you must configure your API keys. As with other AutoGen applications, AutoGenBench will look for the OpenAI keys in the `OAI_CONFIG_LIST` file in the current working directory, or in the `OAI_CONFIG_LIST` environment variable. This behavior can be overridden using a command-line parameter.
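+
+An `OAI_CONFIG_LIST` is a JSON list of model configurations; a minimal, illustrative example (with a placeholder key that you would replace with your own) might look like the following:
+
+```sh
+cat > OAI_CONFIG_LIST <<'EOF'
+[
+    {
+        "model": "gpt-4",
+        "api_key": "sk-..."
+    }
+]
+EOF
+```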
+
+If you will be running multiple benchmarks, it is often most convenient to leverage the environment variable option. You can load your keys into the environment variable by executing:
+
+```sh
+export OAI_CONFIG_LIST=$(cat ./OAI_CONFIG_LIST)
+```
+## A Typical Session
+Once AutoGenBench is installed and the necessary keys are configured, a typical session will look as follows:
+
+```sh
+autogenbench clone HumanEval
+cd HumanEval
+cat README.md
+autogenbench run --subsample 0.1 --repeat 3 Tasks/human_eval_two_agents.jsonl
+autogenbench tabulate Results/human_eval_two_agents
+```
+
+Where:
+- `autogenbench clone HumanEval` downloads and expands the HumanEval benchmark scenario.
+- `cd HumanEval; cat README.md` navigates to the benchmark directory and prints the README (which you should always read!).
+- `autogenbench run --subsample 0.1 --repeat 3 Tasks/human_eval_two_agents.jsonl`
+ runs a 10% subsample of the tasks defined in `Tasks/human_eval_two_agents.jsonl`. Each task is run 3 times.
+- `autogenbench tabulate Results/human_eval_two_agents` tabulates the results of the run.
+
+After running the above `tabulate` command, you should see output similar to the following:
+
+```
+ Trial 0 Trial 1 Trial 2
+Task Id Success Success Success
+------------- --------- --------- ---------
+HumanEval_107 False True True
+HumanEval_22 True True True
+HumanEval_43 True True True
+HumanEval_88 True True True
+HumanEval_14 True True True
+HumanEval_157 True True True
+HumanEval_141 True True True
+HumanEval_57 True True True
+HumanEval_154 True True True
+HumanEval_153 True True True
+HumanEval_93 False True False
+HumanEval_137 True True True
+HumanEval_143 True True True
+HumanEval_13 True True True
+HumanEval_49 True True True
+HumanEval_95 True True True
+------------- --------- --------- ---------
+Successes 14 16 15
+Failures 2 0 1
+Missing 0 0 0
+Total 16 16 16
+
+CAUTION: 'autogenbench tabulate' is in early preview.
+Please do not cite these values in academic work without first inspecting and verifying the results in the logs yourself.
+```
+
+From this output, we can see the results of the three separate repetitions of each task, along with the final summary statistics for each trial. In this case, the results were generated via GPT-4 (as defined in the `OAI_CONFIG_LIST` that was provided), and used the `TwoAgents` template. **It is important to remember that AutoGenBench evaluates *specific* end-to-end configurations of agents (as opposed to evaluating a model or cognitive framework more generally).**
+
+Finally, complete execution traces and logs can be found in the `Results` folder. See the [AutoGenBench README](https://github.com/microsoft/autogen/blob/main/samples/tools/testbed/README.md) for more details about command-line options and output formats. Each of these commands also offers extensive in-line help via:
+
+- `autogenbench --help`
+- `autogenbench clone --help`
+- `autogenbench run --help`
+- `autogenbench tabulate --help`
+
+
+## Roadmap
+Though we are announcing AutoGenBench today, it is very much an evolving project in its own right. Over the next few weeks and months, we hope to:
+- Onboard many additional benchmarks beyond those shipping today
+- Greatly improve logging and telemetry
+- Introduce new core metrics including total costs, task completion time, conversation turns, etc.
+- Provide tighter integration with AgentEval and AutoGen Studio
+
+For up-to-date tracking of our work items on this project, please see [AutoGenBench Work Items](https://github.com/microsoft/autogen/issues/973).
+
+## Call for Participation
+Finally, we want to end this blog post with an open call for contributions. AutoGenBench is still nascent and has much room for improvement. New benchmarks are constantly being published and will need to be added. Everyone may have their own distinct set of metrics that they care most about optimizing, and these metrics should be onboarded as well. To this end, we welcome any and all contributions to this corner of the AutoGen project. If contributing is something that interests you, please see the [contributor’s guide](https://github.com/microsoft/autogen/blob/main/samples/tools/testbed/CONTRIBUTING.md) and join our [Discord](https://discord.gg/pAbnFJrkgZ) discussion in the [#autogenbench](https://discord.com/channels/1153072414184452236/1199851779328847902) channel!