From 53a246a23d58605f7e1f39dcfe8a2edc003fd568 Mon Sep 17 00:00:00 2001 From: optimass Date: Wed, 9 Oct 2024 14:32:47 +0000 Subject: [PATCH 1/9] adapting parallel_backend to use wait_func --- src/agentlab/experiments/launch_exp.py | 33 ++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index b2ed28ec..00b7b70a 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -16,6 +16,15 @@ def import_object(path: str): return obj +def wait_and_run(exp_args: ExpArgs, wait_func): + try: + wait_func() + except Exception as e: + logging.error(f"Error with wait_func: {e}") + return + exp_args.run() + + def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_backend="joblib"): """Run a list of ExpArgs in parallel. @@ -39,15 +48,24 @@ def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_back parallel_backend = "sequential" logging.info(f"Saving experiments to {exp_dir}") + wait_funcs = [] for exp_args in exp_args_list: - exp_args.agent_args.prepare() + server_info, wait_func = exp_args.agent_args.prepare() + wait_funcs.append(wait_func) exp_args.prepare(exp_root=exp_dir) + + # logging.info(f"Saving experiments to {exp_dir}") + # for exp_args in exp_args_list: + # exp_args.agent_args.prepare() + # exp_args.prepare(exp_root=exp_dir) try: if parallel_backend == "joblib": from joblib import Parallel, delayed Parallel(n_jobs=n_jobs, prefer="processes")( - delayed(exp_args.run)() for exp_args in exp_args_list + # delayed(exp_args.run)() for exp_args in exp_args_list + delayed(wait_and_run)(exp_arg, wait_func) + for exp_arg, wait_func in zip(exp_args_list, wait_funcs) ) elif parallel_backend == "dask": @@ -56,8 +74,9 @@ def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_back with make_dask_client(n_worker=n_jobs): execute_task_graph(exp_args_list) elif parallel_backend == "sequential": - for exp_args in exp_args_list: - exp_args.run() + for exp_args, wait_func in zip(exp_args_list, wait_funcs): + wait_and_run(exp_args, wait_func) + # exp_args.run() else: raise ValueError(f"Unknown parallel_backend: {parallel_backend}") finally: @@ -117,7 +136,11 @@ def _yield_incomplete_experiments(exp_root, relaunch_mode="incomplete_only"): summary_info = exp_result.summary_info except FileNotFoundError: - yield exp_result.exp_args + # yield exp_result.exp_args + try: + yield exp_result.exp_args + except Exception as e: + logging.error(f"Error with exp_result.exp_args: {e}") continue if relaunch_mode == "incomplete_only": From 63f81e6a7d0877607fce353778ae72ecca8ea8cd Mon Sep 17 00:00:00 2001 From: optimass Date: Fri, 11 Oct 2024 20:55:13 +0000 Subject: [PATCH 2/9] fix --- src/agentlab/analyze/inspect_results.py | 195 +++++++++++++++++++++++- src/agentlab/llm/langchain_utils.py | 6 +- 2 files changed, 194 insertions(+), 7 deletions(-) diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index d69b1656..2015c07d 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -2,6 +2,7 @@ import io import random import re +from typing import List, Union import warnings from collections import defaultdict from datetime import datetime @@ -114,6 +115,9 @@ def load_result_df( result_df=None, index_white_list=("agent_args.*",), index_black_list=("*model_url*", "*extra*"), + avg_across_finetining_samples=True, + 
separate_val_test=False, + frac_valid=0.25, ): """Load the result dataframe. @@ -139,7 +143,12 @@ def load_result_df( if result_df is not None: result_list = list(result_df["exp_result"]) else: - result_list = list(yield_all_exp_results(exp_dir, progress_fn=progress_fn)) + if isinstance(exp_dir, list): + result_list = [] + for dir in exp_dir: + result_list.extend(list(yield_all_exp_results(dir, progress_fn=progress_fn))) + else: + result_list = list(yield_all_exp_results(exp_dir, progress_fn=progress_fn)) if len(result_list) == 0: return None @@ -148,8 +157,61 @@ def load_result_df( result_list = progress_fn(result_list, desc="Loading results") df = pd.DataFrame([exp_result.get_exp_record() for exp_result in result_list]) + + if separate_val_test: + # Initialize the new columns + df["valid"] = False + df["test"] = False + df["cum_reward_valid"] = np.nan + df["cum_reward_test"] = np.nan + + # Ensure 'env_args.task_name' and 'env_args.task_seed' are columns after resetting index + df["task_name"] = df["env_args.task_name"] # Copy task_name to a new column for convenience + grouped = df.groupby("task_name") + + for task_name, group in grouped: + # Get the unique seeds for this task + unique_seeds = group["env_args.task_seed"].unique() + np.random.shuffle(unique_seeds) # Shuffle to randomize the split + + # Calculate split index + split_idx = int(len(unique_seeds) * frac_valid) + + # Split the seeds into validation and test sets + valid_seeds = unique_seeds[:split_idx] + test_seeds = unique_seeds[split_idx:] + + # Mark the rows that are in the validation set + df.loc[ + (df["task_name"] == task_name) & (df["env_args.task_seed"].isin(valid_seeds)), + "valid", + ] = True + + # Mark the rows that are in the test set + df.loc[ + (df["task_name"] == task_name) & (df["env_args.task_seed"].isin(test_seeds)), "test" + ] = True + + # Set the `cum_reward_valid` and `cum_reward_test` based on the valid and test columns + df.loc[df["valid"], "cum_reward_valid"] = df["cum_reward"] + df.loc[df["test"], "cum_reward_test"] = df["cum_reward"] + if set_index: set_index_from_variables(df, index_white_list, index_black_list) + + if avg_across_finetining_samples: + # Reset the index to make it a DataFrame + df_reset = df.reset_index() + + # Modify the 'agent_args.chat_model_args.model_path' column by removing 'sample_{int}/' + # TODO: will this work for samples_0-1 ? 
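+        # NOTE: the pattern below strips segments like ".../sample_3/..."; a
+        # "samples_0-1/" style segment would not match and would need its own rule.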
+ df_reset["agent_args.chat_model_args.model_path"] = df_reset[ + "agent_args.chat_model_args.model_path" + ].str.replace(r"sample_\d+/", "", regex=True) + + # Recreate the MultiIndex + df = df_reset.set_index(df.index.names) + return df @@ -237,7 +299,7 @@ def get_std_err(df, metric): return mean, std_err -def summarize(sub_df, use_bootstrap=False): +def summarize(sub_df, use_bootstrap=False, separate_val_test=True): if not "cum_reward" in sub_df: record = dict( avg_reward=np.nan, @@ -248,6 +310,7 @@ def summarize(sub_df, use_bootstrap=False): n_err=0, ) else: + err = sub_df["err_msg"].notnull() n_completed = (err | sub_df["truncated"] | sub_df["terminated"]).sum() @@ -256,8 +319,22 @@ def summarize(sub_df, use_bootstrap=False): if use_bootstrap: _mean_reward, std_reward = get_bootstrap(sub_df, "cum_reward") + if separate_val_test: + _mean_reward_valid, std_reward_valid = get_bootstrap( + sub_df[sub_df["valid"]], "cum_reward" + ) + _mean_reward_test, std_reward_test = get_bootstrap( + sub_df[sub_df["test"]], "cum_reward" + ) else: _mean_reward, std_reward = get_std_err(sub_df, "cum_reward") + if separate_val_test: + _mean_reward_valid, std_reward_valid = get_std_err( + sub_df[sub_df["valid"]], "cum_reward" + ) + _mean_reward_test, std_reward_test = get_std_err( + sub_df[sub_df["test"]], "cum_reward" + ) # sanity check, if there is an error the reward should be zero assert sub_df[sub_df["err_msg"].notnull()]["cum_reward"].sum() == 0 @@ -270,6 +347,11 @@ def summarize(sub_df, use_bootstrap=False): n_completed=f"{n_completed}/{len(sub_df)}", n_err=err.sum(skipna=True), ) + if separate_val_test: + record["avg_reward_valid"] = sub_df["cum_reward_valid"].mean(skipna=True).round(3) + record["std_err_valid"] = std_reward_valid + record["avg_reward_test"] = sub_df["cum_reward_test"].mean(skipna=True).round(3) + record["std_err_test"] = std_reward_test return pd.Series(record) @@ -391,6 +473,7 @@ def global_report( result_df: pd.DataFrame, reduce_fn=summarize, rename_index=lambda name: name.replace("agent_args.flags.", ""), + separate_valid_test=False, ): """Produce a report that summarize all tasks and all episodes for each agent. @@ -422,7 +505,15 @@ def global_report( index_names = [rename_index(name) for name in report.index.names] report = report.rename_axis(index=index_names) - # if has key avg_reward + if separate_valid_test: + if "avg_reward_valid" in report: + report = report.sort_values("avg_reward_valid", ascending=False) + ## put avg_reward and std_err at the end + report_columns = list(report.columns) + report_columns.append(report_columns.pop(report_columns.index("avg_reward"))) + report_columns.append(report_columns.pop(report_columns.index("std_err"))) + report = report[report_columns] + if "avg_reward" in report.columns: report = report.sort_values("avg_reward", ascending=False) @@ -485,7 +576,9 @@ def flag_report(report: pd.DataFrame, metric: str = "avg_reward", round_digits: def get_most_recent_folder( - root_dir: Path = None, date_format: str = "%Y-%m-%d_%H-%M-%S", contains=None + root_dir: Path = None, + date_format: str = "%Y-%m-%d_%H-%M-%S", + contains=None, ): """Return the most recent directory based on the date in the folder name. 
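
For reference, a minimal standalone sketch of the per-task seed split that
load_result_df performs when separate_val_test=True (illustrative only; toy data,
column names taken from the hunks above, frac_valid=0.25):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "env_args.task_name": ["t1"] * 8 + ["t2"] * 8,
        "env_args.task_seed": list(range(8)) * 2,
        "cum_reward": np.random.rand(16),
    })
    df["valid"] = False
    for task_name, group in df.groupby("env_args.task_name"):
        seeds = group["env_args.task_seed"].unique()
        np.random.shuffle(seeds)                       # randomize before splitting
        valid_seeds = seeds[: int(len(seeds) * 0.25)]  # 25% of seeds go to validation
        mask = (df["env_args.task_name"] == task_name) & df["env_args.task_seed"].isin(valid_seeds)
        df.loc[mask, "valid"] = True
    df["test"] = ~df["valid"]
    print(df.groupby("env_args.task_name")["valid"].sum())  # 2 validation seeds per task
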
@@ -519,12 +612,52 @@ def get_most_recent_folder( return most_recent_folder +def get_nth_most_recent_folder( + result_dirs: Union[List[Path], Path] = None, + n=1, + date_format: str = "%Y-%m-%d_%H-%M-%S", + contains=None, +): + """Return the N-th most recent directory based on the date in the folder name.""" + + if result_dirs is None: + result_dir = RESULTS_DIR + + if isinstance(result_dirs, Path): + result_dirs = [result_dirs] + + folders_with_dates = [] + + for folder in result_dirs: + for item in folder.iterdir(): + if item.is_dir() and not item.name.startswith("_"): + if contains is not None and contains not in item.name: + continue + try: + folder_date = datetime.strptime("_".join(item.name.split("_")[:2]), date_format) + folders_with_dates.append((folder_date, item)) + except (ValueError, IndexError): + continue + + # Sort folders by date in descending order (most recent first) + folders_with_dates.sort(reverse=True, key=lambda x: x[0]) + + # Return the N-th most recent folder if it exists + if n is None: + return [folder for date, folder in folders_with_dates] + if len(folders_with_dates) >= n: + return folders_with_dates[n - 1][1] + else: + return None # or raise an exception if you prefer + + def display_report( report: pd.DataFrame, apply_shrink_columns: bool = True, copy_to_clipboard: bool = True, rename_bool_flags: bool = True, print_only: str = None, + add_summary_stats: bool = False, ): """Display the report in a nicer-ish format. @@ -539,6 +672,7 @@ def display_report( copy_to_clipboard: Copy the report to the clipboard rename_bool_flags: Rename the boolean flags to be more compact and readable print_only: Print only the given column + add_summary_stats: Add a row with the sum and average for numeric columns """ report = report.copy() @@ -559,6 +693,59 @@ def display_report( columns = [print_only] + columns report = report[columns] + if add_summary_stats: + + # eplace 'NaN' string with actual NaN + report.replace("nan", pd.NA, inplace=True) + report.dropna(inplace=True) + + # Convert numeric columns back to numeric types where possible + report = report.apply(pd.to_numeric, errors="ignore") + + # Function to split fractions into numerator and denominator + def split_fraction(fraction_str): + try: + if isinstance(fraction_str, str) and "/" in fraction_str: + numerator, denominator = map(float, fraction_str.split("/")) + return numerator, denominator + except ValueError: + pass + return None, None + + # Extract numerators and denominators + report["n\ncompleted_num"], report["n\ncompleted_den"] = zip( + *report["n\ncompleted"].apply(split_fraction) + ) + + # Round all numeric columns to two decimal places (excluding fraction columns) + numeric_cols = report.select_dtypes(include=["number"]).columns + report[numeric_cols] = report[numeric_cols].round(2) + + # Calculate the sum and average for numeric columns (ignoring fraction columns for now) + total = report[numeric_cols].sum().round(2) + average = report[numeric_cols].mean().round(2) + + # Calculate the sum and average for the fraction column + total_numerator = int(report["n\ncompleted_num"].sum()) + total_denominator = int(report["n\ncompleted_den"].sum()) + + average_numerator = int(round(report["n\ncompleted_num"].mean())) + average_denominator = int(round(report["n\ncompleted_den"].mean())) + + # Append the 'Total' and 'Average' rows + report.loc["Total"] = total + report.loc["Average"] = average + + # Manually add the fractions back to the Total and Average rows in the original format + report.at["Total", 
"n\ncompleted"] = f"{total_numerator}/{total_denominator}" + report.at["Average", "n\ncompleted"] = f"{average_numerator}/{average_denominator}" + + # Drop the temporary converted columns + report.drop(columns=["n\ncompleted_num", "n\ncompleted_den"], inplace=True) + + # Convert all values to strings + report = report.astype(str) + styled_report = set_wrap_style(report) display(styled_report) diff --git a/src/agentlab/llm/langchain_utils.py b/src/agentlab/llm/langchain_utils.py index 8b20b1df..76a2051b 100644 --- a/src/agentlab/llm/langchain_utils.py +++ b/src/agentlab/llm/langchain_utils.py @@ -133,7 +133,7 @@ class HuggingFaceAPIChatModel(HFBaseChatModel): def __init__( self, model_name: str, - temperature: Optional[int] = 1e-1, + temperature: Optional[float] = 1e-1, max_new_tokens: Optional[int] = 512, n_retry_server: Optional[int] = 4, ): @@ -150,7 +150,7 @@ class HuggingFaceLocalChatModel(HFBaseChatModel): def __init__( self, model_name: str, - temperature: Optional[int] = 1e-1, + temperature: Optional[float] = 1e-1, max_new_tokens: Optional[int] = 512, n_retry_server: Optional[int] = 4, ): @@ -174,7 +174,7 @@ def __init__( model_name: str, model_url: str, token: Optional[str] = None, - temperature: Optional[int] = 1e-1, + temperature: Optional[float] = 1e-1, max_new_tokens: Optional[int] = 512, n_retry_server: Optional[int] = 4, ): From b05c0b1aa4e53ef1a88240a6a805ab0d826a2a74 Mon Sep 17 00:00:00 2001 From: optimass Date: Mon, 14 Oct 2024 17:23:05 +0000 Subject: [PATCH 3/9] adding get_action_post_hoc --- .../agents/generic_agent/generic_agent.py | 129 +++++++++++++++++- 1 file changed, 128 insertions(+), 1 deletion(-) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index e2696923..2a861a01 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -1,3 +1,5 @@ +import logging +import re from dataclasses import asdict, dataclass from functools import partial from warnings import warn @@ -9,7 +11,7 @@ from agentlab.agents.agent_args import AgentArgs from agentlab.agents.utils import openai_monitored_agent from agentlab.llm.chat_api import BaseModelArgs -from agentlab.llm.llm_utils import RetryError, retry_raise +from agentlab.llm.llm_utils import RetryError, retry_raise, ParseError from agentlab.llm.tracking import cost_tracker_decorator from .generic_agent_prompt import GenericPromptFlags, MainPrompt @@ -252,3 +254,128 @@ def get_action_post_hoc(agent: GenericAgent, obs: dict, ans_dict: dict): output += f"\n\n{action}\n" return system_prompt, instruction_prompt, output + + +def get_action_post_hoc(agent: GenericAgent, step_info): + """ + Get the action post-hoc for the agent. + + This function is used to get the action after the agent has already been run. + Its goal is to recreate the prompt and the output of the agent a posteriori. + The purpose is to build datasets for training the agents. + + Args: + agent (GenericAgent): The agent for which the action is being determined. + obs (dict): The observation dictionary to append to the agent's history. + ans_dict (dict): The answer dictionary containing the plan, step, memory, think, and action. + + Returns: + Tuple[str, str]: The complete prompt used for the agent and the reconstructed output based on the answer dictionary. 
+ """ + system_prompt = dp.SystemPrompt().prompt + + agent.obs_history.append(step_info.obs) + + main_prompt = MainPrompt( + action_set=agent.action_set, + obs_history=agent.obs_history, + actions=agent.actions, + memories=agent.memories, + thoughts=agent.thoughts, + previous_plan=agent.plan, + step=agent.plan_step, + flags=agent.flags, + ) + + max_prompt_tokens, max_trunc_itr = agent._get_maxes() + + fit_function = partial( + dp.fit_tokens, + max_prompt_tokens=max_prompt_tokens, + model_name=agent.chat_model_args.model_name, + max_iterations=max_trunc_itr, + ) + + instruction_prompt = fit_function(shrinkable=main_prompt) + + if isinstance(instruction_prompt, list): + # NOTE: this is when we have images + instruction_prompt = instruction_prompt[0]["text"] + + def parser(text): + try: + ans_dict = main_prompt._parse_answer(text) + except ParseError as e: + # these parse errors will be caught by the retry function and + # the chat_llm will have a chance to recover + return None, False, str(e) + return ans_dict, True, "" + + og_agent_output = step_info.agent_info["chat_messages"][-1].content + if og_agent_output.startswith("assistant\n"): + og_agent_output = og_agent_output[10:] + + ans_dict = parser(og_agent_output)[0] + + # self.plan = ans_dict.get("plan", self.plan) + # self.plan_step = ans_dict.get("step", self.plan_step) + # self.actions.append(ans_dict["action"]) + # self.memories.append(ans_dict.get("memory", None)) + # self.thoughts.append(ans_dict.get("think", None)) + + agent_output = "" + + # TODO: validate this + thought = ans_dict.get("think", None) + agent.thoughts.append(thought) + if thought is not None: + agent_output += f"\n\n{thought}\n\n" + + agent.plan = ans_dict.get("plan", agent.plan) + if agent.plan != "No plan yet": + agent_output += f"\n\n{agent.plan}\n\n" + + agent.plan_step = ans_dict.get("step", agent.plan_step) + if agent.plan_step != -1: + agent_output += f"\n{agent.plan_step}\n" + + memory = ans_dict.get("memory", None) + agent.memories.append(memory) + if memory is not None: + agent_output += f"\n\n{memory}\n\n" + + action = step_info.action + agent.actions.append(action) + if action is not None: + agent_output += f"\n\n{action}\n" + + def find_bid(string): + # Try to find 'a' followed by digits within single or double quotes + match = re.search(r"[\"'](a\d+)[\"']", string) + + # If not found, search digits within single or double quotes + if not match: + match = re.search(r"[\"'](\d+)[\"']", string) + + # Return the matched pattern or None if no match found + if match: + return match.group(1) # Return the match inside the quotes + else: + return None + + # TODO: finish this + bid = find_bid(action) + if bid is not None: + if bid not in instruction_prompt: + logging.info("Bid is not in the instruction prompt.") + return "missing_bid" + + # NOTE: keep in mind the original agent output can be more verbose + if agent_output not in og_agent_output: + logging.info("Agent output does exactly not match the last chat message.") + if not set(agent_output.split()).issubset(set(og_agent_output.split())): + logging.info("Agent output does not match the last chat message.") + return "action_output_mismatch" + + # TODO: make sure the bid is in the prompt + return (system_prompt, instruction_prompt, agent_output) From 0c07cd59cf689796c7212c1ed9c797b0768a5355 Mon Sep 17 00:00:00 2001 From: optimass Date: Wed, 6 Nov 2024 15:34:12 +0000 Subject: [PATCH 4/9] fix --- src/agentlab/experiments/launch_exp.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index 00b7b70a..6055b5d2 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -50,8 +50,11 @@ def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_back logging.info(f"Saving experiments to {exp_dir}") wait_funcs = [] for exp_args in exp_args_list: - server_info, wait_func = exp_args.agent_args.prepare() - wait_funcs.append(wait_func) + # server_info, wait_func = exp_args.agent_args.prepare() + output = exp_args.agent_args.prepare() + if isinstance(output, tuple): + _, wait_func = output + wait_funcs.append(wait_func) exp_args.prepare(exp_root=exp_dir) # logging.info(f"Saving experiments to {exp_dir}") From 7090a57cfd9f378634b4985cf283690208e5e894 Mon Sep 17 00:00:00 2001 From: MurtyShikhar Date: Thu, 7 Nov 2024 09:02:25 -0800 Subject: [PATCH 5/9] change to the chat api, llm utils for multi-action sampling --- src/agentlab/analyze/agent_xray.py | 22 +++++++++---- src/agentlab/experiments/task_collections.py | 10 ++++++ src/agentlab/llm/chat_api.py | 12 ++++--- src/agentlab/llm/llm_utils.py | 34 +++++++++++++++++++- 4 files changed, 66 insertions(+), 12 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 228901b3..9fd5443b 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -220,7 +220,7 @@ def run_gradio(results_dir: Path): content. You have to sort back with the Idx column to align the click with the order.""" ) - agent_table = gr.DataFrame(height=500, show_label=False, interactive=False) + agent_table = gr.DataFrame(max_height=500, show_label=False, interactive=False) with gr.Tab("Select Task and Seed", id="Select Task"): with gr.Row(): with gr.Column(scale=4): @@ -236,7 +236,9 @@ def run_gradio(results_dir: Path): ) refresh_results_button = gr.Button("↺", scale=0, size="sm") - task_table = gr.DataFrame(height=500, show_label=False, interactive=False) + task_table = gr.DataFrame( + max_height=500, show_label=False, interactive=False + ) with gr.Column(scale=2): with gr.Accordion("Seed Selector (click for help)", open=False): @@ -249,7 +251,9 @@ def run_gradio(results_dir: Path): the order.""" ) - seed_table = gr.DataFrame(height=500, show_label=False, interactive=False) + seed_table = gr.DataFrame( + max_height=500, show_label=False, interactive=False + ) with gr.Tab("Constants and Variables"): with gr.Row(): @@ -261,7 +265,9 @@ def run_gradio(results_dir: Path): **all** agents. They are displayed as a table with the name and value of the constant.""" ) - constants = gr.DataFrame(height=500, show_label=False, interactive=False) + constants = gr.DataFrame( + max_height=500, show_label=False, interactive=False + ) with gr.Column(scale=2): with gr.Accordion("Variables", open=False): gr.Markdown( @@ -270,9 +276,11 @@ def run_gradio(results_dir: Path): They are displayed as a table with the name, value and count of unique values. 
A maximum of 3 different values are displayed.""" ) - variables = gr.DataFrame(height=500, show_label=False, interactive=False) + variables = gr.DataFrame( + max_height=500, show_label=False, interactive=False + ) with gr.Tab("Global Stats"): - global_stats = gr.DataFrame(height=500, show_label=False, interactive=False) + global_stats = gr.DataFrame(max_height=500, show_label=False, interactive=False) with gr.Row(): episode_info = gr.Markdown(label="Episode Info", elem_classes="my-markdown") @@ -345,7 +353,7 @@ def run_gradio(results_dir: Path): logs = gr.Code(language=None, **code_args) with gr.Tab("Stats") as tab_stats: - stats = gr.DataFrame(height=500, show_label=False, interactive=False) + stats = gr.DataFrame(max_height=500, show_label=False, interactive=False) with gr.Tab("Agent Info HTML") as tab_agent_info_html: with gr.Row(): diff --git a/src/agentlab/experiments/task_collections.py b/src/agentlab/experiments/task_collections.py index 66bf00b7..f6ff2aa6 100644 --- a/src/agentlab/experiments/task_collections.py +++ b/src/agentlab/experiments/task_collections.py @@ -122,6 +122,7 @@ def get_benchmark_env_args( "workarena.l2": 50, "workarena.l3": 50, "webarena": 15, + "webarena_debug": 15, "miniwob": 10, "miniwob_tiny_test": 5, "weblinx": None, @@ -178,6 +179,15 @@ def get_benchmark_env_args( from browsergym.webarena import ALL_WEBARENA_TASK_IDS env_args_list = _make_env_args(ALL_WEBARENA_TASK_IDS, max_steps, n_repeat, rng) + elif benchmark_name == "webarena_debug": + from browsergym.webarena import ALL_WEBARENA_TASK_IDS + + env_args_list = _make_env_args( + [t for i, t in enumerate(ALL_WEBARENA_TASK_IDS) if i % 16 == 0], + max_steps, + n_repeat, + rng, + ) elif benchmark_name.startswith("miniwob"): miniwob_benchmarks_map = { "miniwob": MINIWOB_ALL, diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py index a4df0a97..49e049e7 100644 --- a/src/agentlab/llm/chat_api.py +++ b/src/agentlab/llm/chat_api.py @@ -245,7 +245,7 @@ def __init__( **client_args, ) - def __call__(self, messages: list[dict]) -> dict: + def __call__(self, messages: list[dict], num_samples=1, temperature=None) -> dict: # Initialize retry tracking attributes self.retries = 0 self.success = False @@ -255,12 +255,14 @@ def __call__(self, messages: list[dict]) -> dict: e = None for itr in range(self.max_retry): self.retries += 1 + temp_to_use = temperature if temperature is not None else self.temperature try: completion = self.client.chat.completions.create( model=self.model_name, messages=messages, - temperature=self.temperature, + temperature=temp_to_use, max_tokens=self.max_tokens, + n=num_samples, ) self.success = True break @@ -273,7 +275,6 @@ def __call__(self, messages: list[dict]) -> dict: f"Failed to get a response from the API after {self.max_retry} retries\n" f"Last error: {error_type}" ) - input_tokens = completion.usage.prompt_tokens output_tokens = completion.usage.completion_tokens cost = input_tokens * self.input_cost + output_tokens * self.output_cost @@ -283,7 +284,10 @@ def __call__(self, messages: list[dict]) -> dict: ): tracking.TRACKER.instance(input_tokens, output_tokens, cost) - return make_assistant_message(completion.choices[0].message.content) + if num_samples > 1: + return [make_assistant_message(c.message.content) for c in completion.choices] + else: + return make_assistant_message(completion.choices[0].message.content) def get_stats(self): return { diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py index c3d75009..3b3e7ed3 100644 --- 
a/src/agentlab/llm/llm_utils.py +++ b/src/agentlab/llm/llm_utils.py @@ -40,6 +40,38 @@ class RetryError(ValueError): pass +def retry_multiple( + chat: "ChatModel", + messages: list[dict], + n_retry: int, + parser: callable, + log: bool = True, + num_samples: int = 10, +): + """Same as retry except we will generate multiple samples for each retry. And issue a parse error if none of the samples are valid.""" + tries = 0 + while tries < n_retry: + answer_list = chat(messages, num_samples=num_samples, temperature=1.0) + # try to parse each answer + parsed_answers = [] + errors = [] + for answer in answer_list: + try: + parsed_answers.append(parser(answer["content"])) + except ParseError as parsing_error: + errors.append(str(parsing_error)) + # if we have a valid answer, return it + if parsed_answers: + return parsed_answers, tries + else: + tries += 1 + if log: + msg = f"Query failed. Retrying {tries}/{n_retry}.\n[LLM]:\n{answer_list}\n[User]:\n{errors}" + logging.info(msg) + messages.append(dict(role="user", content=str(errors))) + raise ParseError(f"Could not parse a valid value after {n_retry} retries.") + + def retry( chat: "ChatModel", messages: list[dict], @@ -69,6 +101,7 @@ def retry( after RateLimtError. will try to parse the wait time from the error message. rate_limit_max_wait_time (int): the maximum wait time in seconds + num_samples (int): number of samples to generate for each retry. Returns: dict: the parsed value, with a string at key "action". @@ -81,7 +114,6 @@ def retry( while tries < n_retry: answer = chat(messages) messages.append(answer) # TODO: could we change this to not use inplace modifications ? - try: return parser(answer["content"]) except ParseError as parsing_error: From 8fbbe8ed33ca89a54d2cbb043d20ea2e81b18b42 Mon Sep 17 00:00:00 2001 From: MurtyShikhar Date: Mon, 11 Nov 2024 15:46:47 -0800 Subject: [PATCH 6/9] small changes to make things compatible with openended-webarena --- src/agentlab/experiments/launch_exp.py | 3 +++ src/agentlab/experiments/reproducibility_util.py | 2 ++ src/agentlab/experiments/study_generators.py | 14 ++++++++++++-- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index fd7d2b6b..a9e3788f 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -20,6 +20,7 @@ def run_experiments( exp_args_list: list[ExpArgs], study_dir, parallel_backend="joblib", + save_transition_history=False, ): """Run a list of ExpArgs in parallel. @@ -35,6 +36,8 @@ def run_experiments( Directory where the experiments will be saved. parallel_backend: str Parallel backend to use. Either "joblib", "dask" or "sequential". + save_transition_history: bool + If True, save the transition history from the agent. 
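+
+    Example (sketch; exp_args_list and study_dir are assumed to be built elsewhere,
+    e.g. by a Study):
+
+        run_experiments(
+            n_jobs=4,
+            exp_args_list=exp_args_list,
+            study_dir=study_dir,
+            parallel_backend="joblib",
+            save_transition_history=True,
+        )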
""" if len(exp_args_list) == 0: diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index 3ef7d8ef..6aeb1fe2 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -35,6 +35,8 @@ def _get_benchmark_version(benchmark_name): return metadata.distribution("weblinx_browsergym").version except metadata.PackageNotFoundError: return "0.0.1rc1" + elif benchmark_name.startswith("nnetnav"): + return "1.0" else: raise ValueError(f"Unknown benchmark {benchmark_name}") diff --git a/src/agentlab/experiments/study_generators.py b/src/agentlab/experiments/study_generators.py index 3a2567d5..c077f91d 100644 --- a/src/agentlab/experiments/study_generators.py +++ b/src/agentlab/experiments/study_generators.py @@ -44,7 +44,12 @@ class Study: dir: Path = None suffix: str = "" # used for adding a personnal comment to the study name - def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False): + def run( + self, + n_jobs=1, + parallel_backend="joblib", + strict_reproducibility=False, + ): """Run all experiments in the study in parallel when possible. Args: @@ -64,7 +69,12 @@ def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False) self.make_dir() self.write_reproducibility_info(strict_reproducibility=strict_reproducibility) - run_experiments(n_jobs, self.exp_args_list, self.dir, parallel_backend=parallel_backend) + run_experiments( + n_jobs, + self.exp_args_list, + self.dir, + parallel_backend=parallel_backend, + ) report_df = self.get_report(ignore_cache=True) logging.info(f"Study {self.name} finished.") logging.info("\n" + str(report_df)) From ee0d5e27fdf63e4ad3d6241cfe1230d4243df249 Mon Sep 17 00:00:00 2001 From: optimass Date: Fri, 15 Nov 2024 19:43:38 +0000 Subject: [PATCH 7/9] quick fix for edge case --- src/agentlab/analyze/agent_xray.py | 4 +++- src/agentlab/analyze/inspect_results.py | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index fa6635e6..28731d3c 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -4,6 +4,7 @@ from logging import warning from pathlib import Path +from finetuning.data import data_collection_library import gradio as gr import matplotlib.patches as patches import matplotlib.pyplot as plt @@ -1096,7 +1097,8 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr def main(): - run_gradio(RESULTS_DIR) + # run_gradio(RESULTS_DIR) + run_gradio(data_collection_library.WORKARENA_V1_TRACES_PATHS[0]) if __name__ == "__main__": diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 2015c07d..3e9bd388 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -348,9 +348,13 @@ def summarize(sub_df, use_bootstrap=False, separate_val_test=True): n_err=err.sum(skipna=True), ) if separate_val_test: - record["avg_reward_valid"] = sub_df["cum_reward_valid"].mean(skipna=True).round(3) + record["avg_reward_valid"] = sub_df["cum_reward_valid"].mean(skipna=True) + if not np.isnan(record["avg_reward_valid"]): + record["avg_reward_valid"] = record["avg_reward_valid"].round(3) record["std_err_valid"] = std_reward_valid - record["avg_reward_test"] = sub_df["cum_reward_test"].mean(skipna=True).round(3) + record["avg_reward_test"] = sub_df["cum_reward_test"].mean(skipna=True) + if not 
np.isnan(record["avg_reward_test"]): + record["avg_reward_test"] = record["avg_reward_test"].round(3) record["std_err_test"] = std_reward_test return pd.Series(record) From b42aaf5f59ceed71a1c16bc0823f37bd8f79b1c6 Mon Sep 17 00:00:00 2001 From: MurtyShikhar Date: Wed, 20 Nov 2024 10:16:24 -0800 Subject: [PATCH 8/9] add Azure version of GPT-4o-mini --- src/agentlab/llm/llm_configs.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index 30889be3..b4f37fce 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -70,6 +70,13 @@ max_input_tokens=40_000, max_new_tokens=4_000, ), + "azure/gpt-4o-mini-2024-07-18": AzureModelArgs( + model_name="gpt-4o-mini", + deployment_name="gpt-4o-mini-2024-07-18", + max_total_tokens=16_384, + max_input_tokens=15_000, + max_new_tokens=1_000, + ), "azure/gpt-4o-2024-08-06": AzureModelArgs( model_name="gpt-4o", deployment_name="gpt-4o-2024-08-06", From e7d99a704edc4cdd22792cce295f4aad5031f7bf Mon Sep 17 00:00:00 2001 From: MurtyShikhar Date: Sat, 23 Nov 2024 16:16:15 -0800 Subject: [PATCH 9/9] add support for vLLM --- .../agents/generic_agent/generic_agent.py | 5 -- src/agentlab/llm/chat_api.py | 49 +++++++++++++++++++ src/agentlab/llm/llm_configs.py | 8 +++ 3 files changed, 57 insertions(+), 5 deletions(-) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index a22492dd..f947cef3 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -1,5 +1,3 @@ -import logging -import re from dataclasses import asdict, dataclass from functools import partial from warnings import warn @@ -10,9 +8,6 @@ from agentlab.agents.agent_args import AgentArgs from agentlab.llm.chat_api import BaseModelArgs, make_system_message, make_user_message from agentlab.llm.llm_utils import ParseError, retry -from agentlab.agents.utils import openai_monitored_agent -from agentlab.llm.chat_api import BaseModelArgs -from agentlab.llm.llm_utils import RetryError, retry_raise, ParseError from agentlab.llm.tracking import cost_tracker_decorator from .generic_agent_prompt import GenericPromptFlags, MainPrompt diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py index 49e049e7..87e5920c 100644 --- a/src/agentlab/llm/chat_api.py +++ b/src/agentlab/llm/chat_api.py @@ -13,6 +13,10 @@ import agentlab.llm.tracking as tracking from agentlab.llm.base_api import AbstractChatModel, BaseModelArgs from agentlab.llm.huggingface_utils import HFBaseChatModel +from agentlab.llm.langchain_utils import ( + HuggingFaceAPIChatModel, + HuggingFaceURLChatModel, +) def make_system_message(content: str) -> dict: @@ -131,6 +135,14 @@ def make_model(self): max_new_tokens=self.max_new_tokens, n_retry_server=self.n_retry_server, ) + elif self.backend == "vllm": + return VLLMChatModel( + model_name=self.model_name, + temperature=self.temperature, + max_tokens=self.max_new_tokens, + max_retry=4, + min_retry_wait_time=60, + ) else: raise ValueError(f"Backend {self.backend} is not supported") @@ -296,6 +308,30 @@ def get_stats(self): } +class VLLMChatModel(ChatModel): + def __init__( + self, + model_name, + api_key=None, + temperature=0.5, + max_tokens=100, + max_retry=4, + min_retry_wait_time=60, + ): + super().__init__( + model_name=model_name, + api_key=api_key, + temperature=temperature, + max_tokens=max_tokens, + max_retry=max_retry, + 
min_retry_wait_time=min_retry_wait_time, + api_key_env_var="VLLM_API_KEY", + client_class=OpenAI, + client_args={"base_url": "http://0.0.0.0:8000/v1"}, + pricing_func=None, + ) + + class OpenAIChatModel(ChatModel): def __init__( self, @@ -400,3 +436,16 @@ def __init__( self.llm = partial( client.text_generation, temperature=temperature, max_new_tokens=max_new_tokens ) + + +@dataclass +class HuggingFaceModelArgs(BaseModelArgs): + """Serializable object for instantiating a generic chat model with a HuggingFace model.""" + + def make_model(self): + return HuggingFaceAPIChatModel( + model_name=self.model_name, + temperature=self.temperature, + max_new_tokens=self.max_new_tokens, + n_retry_server=4, + ) diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index b4f37fce..ce0369b7 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -101,6 +101,14 @@ backend="huggingface", **default_oss_llms_args, ), + "vllm/meta-llama/Meta-Llama-3-8B-Instruct": SelfHostedModelArgs( + model_name="meta-llama/Meta-Llama-3-8B-Instruct", + max_total_tokens=16_384, + max_input_tokens=16_384 - 512, + max_new_tokens=512, + backend="vllm", + **default_oss_llms_args, + ), "mistralai/Mixtral-8x22B-Instruct-v0.1": SelfHostedModelArgs( model_name="mistralai/Mixtral-8x22B-Instruct-v0.1", max_total_tokens=32_000,