From 53a246a23d58605f7e1f39dcfe8a2edc003fd568 Mon Sep 17 00:00:00 2001 From: optimass Date: Wed, 9 Oct 2024 14:32:47 +0000 Subject: [PATCH 1/9] adapting parallel_backend to use wait_func --- src/agentlab/experiments/launch_exp.py | 33 ++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index b2ed28ec..00b7b70a 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -16,6 +16,15 @@ def import_object(path: str): return obj +def wait_and_run(exp_args: ExpArgs, wait_func): + try: + wait_func() + except Exception as e: + logging.error(f"Error with wait_func: {e}") + return + exp_args.run() + + def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_backend="joblib"): """Run a list of ExpArgs in parallel. @@ -39,15 +48,24 @@ def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_back parallel_backend = "sequential" logging.info(f"Saving experiments to {exp_dir}") + wait_funcs = [] for exp_args in exp_args_list: - exp_args.agent_args.prepare() + server_info, wait_func = exp_args.agent_args.prepare() + wait_funcs.append(wait_func) exp_args.prepare(exp_root=exp_dir) + + # logging.info(f"Saving experiments to {exp_dir}") + # for exp_args in exp_args_list: + # exp_args.agent_args.prepare() + # exp_args.prepare(exp_root=exp_dir) try: if parallel_backend == "joblib": from joblib import Parallel, delayed Parallel(n_jobs=n_jobs, prefer="processes")( - delayed(exp_args.run)() for exp_args in exp_args_list + # delayed(exp_args.run)() for exp_args in exp_args_list + delayed(wait_and_run)(exp_arg, wait_func) + for exp_arg, wait_func in zip(exp_args_list, wait_funcs) ) elif parallel_backend == "dask": @@ -56,8 +74,9 @@ def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_back with make_dask_client(n_worker=n_jobs): execute_task_graph(exp_args_list) elif parallel_backend == "sequential": - for exp_args in exp_args_list: - exp_args.run() + for exp_args, wait_func in zip(exp_args_list, wait_funcs): + wait_and_run(exp_args, wait_func) + # exp_args.run() else: raise ValueError(f"Unknown parallel_backend: {parallel_backend}") finally: @@ -117,7 +136,11 @@ def _yield_incomplete_experiments(exp_root, relaunch_mode="incomplete_only"): summary_info = exp_result.summary_info except FileNotFoundError: - yield exp_result.exp_args + # yield exp_result.exp_args + try: + yield exp_result.exp_args + except Exception as e: + logging.error(f"Error with exp_result.exp_args: {e}") continue if relaunch_mode == "incomplete_only": From 63f81e6a7d0877607fce353778ae72ecca8ea8cd Mon Sep 17 00:00:00 2001 From: optimass Date: Fri, 11 Oct 2024 20:55:13 +0000 Subject: [PATCH 2/9] fix --- src/agentlab/analyze/inspect_results.py | 195 +++++++++++++++++++++++- src/agentlab/llm/langchain_utils.py | 6 +- 2 files changed, 194 insertions(+), 7 deletions(-) diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index d69b1656..2015c07d 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -2,6 +2,7 @@ import io import random import re +from typing import List, Union import warnings from collections import defaultdict from datetime import datetime @@ -114,6 +115,9 @@ def load_result_df( result_df=None, index_white_list=("agent_args.*",), index_black_list=("*model_url*", "*extra*"), + avg_across_finetining_samples=True, + 
separate_val_test=False, + frac_valid=0.25, ): """Load the result dataframe. @@ -139,7 +143,12 @@ def load_result_df( if result_df is not None: result_list = list(result_df["exp_result"]) else: - result_list = list(yield_all_exp_results(exp_dir, progress_fn=progress_fn)) + if isinstance(exp_dir, list): + result_list = [] + for dir in exp_dir: + result_list.extend(list(yield_all_exp_results(dir, progress_fn=progress_fn))) + else: + result_list = list(yield_all_exp_results(exp_dir, progress_fn=progress_fn)) if len(result_list) == 0: return None @@ -148,8 +157,61 @@ def load_result_df( result_list = progress_fn(result_list, desc="Loading results") df = pd.DataFrame([exp_result.get_exp_record() for exp_result in result_list]) + + if separate_val_test: + # Initialize the new columns + df["valid"] = False + df["test"] = False + df["cum_reward_valid"] = np.nan + df["cum_reward_test"] = np.nan + + # Ensure 'env_args.task_name' and 'env_args.task_seed' are columns after resetting index + df["task_name"] = df["env_args.task_name"] # Copy task_name to a new column for convenience + grouped = df.groupby("task_name") + + for task_name, group in grouped: + # Get the unique seeds for this task + unique_seeds = group["env_args.task_seed"].unique() + np.random.shuffle(unique_seeds) # Shuffle to randomize the split + + # Calculate split index + split_idx = int(len(unique_seeds) * frac_valid) + + # Split the seeds into validation and test sets + valid_seeds = unique_seeds[:split_idx] + test_seeds = unique_seeds[split_idx:] + + # Mark the rows that are in the validation set + df.loc[ + (df["task_name"] == task_name) & (df["env_args.task_seed"].isin(valid_seeds)), + "valid", + ] = True + + # Mark the rows that are in the test set + df.loc[ + (df["task_name"] == task_name) & (df["env_args.task_seed"].isin(test_seeds)), "test" + ] = True + + # Set the `cum_reward_valid` and `cum_reward_test` based on the valid and test columns + df.loc[df["valid"], "cum_reward_valid"] = df["cum_reward"] + df.loc[df["test"], "cum_reward_test"] = df["cum_reward"] + if set_index: set_index_from_variables(df, index_white_list, index_black_list) + + if avg_across_finetining_samples: + # Reset the index to make it a DataFrame + df_reset = df.reset_index() + + # Modify the 'agent_args.chat_model_args.model_path' column by removing 'sample_{int}/' + # TODO: will this work for samples_0-1 ? 
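+        # NOTE: the pattern below strips segments like ".../sample_3/..."; a
+        # "samples_0-1/" style segment would not match and would need its own rule.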
+ df_reset["agent_args.chat_model_args.model_path"] = df_reset[ + "agent_args.chat_model_args.model_path" + ].str.replace(r"sample_\d+/", "", regex=True) + + # Recreate the MultiIndex + df = df_reset.set_index(df.index.names) + return df @@ -237,7 +299,7 @@ def get_std_err(df, metric): return mean, std_err -def summarize(sub_df, use_bootstrap=False): +def summarize(sub_df, use_bootstrap=False, separate_val_test=True): if not "cum_reward" in sub_df: record = dict( avg_reward=np.nan, @@ -248,6 +310,7 @@ def summarize(sub_df, use_bootstrap=False): n_err=0, ) else: + err = sub_df["err_msg"].notnull() n_completed = (err | sub_df["truncated"] | sub_df["terminated"]).sum() @@ -256,8 +319,22 @@ def summarize(sub_df, use_bootstrap=False): if use_bootstrap: _mean_reward, std_reward = get_bootstrap(sub_df, "cum_reward") + if separate_val_test: + _mean_reward_valid, std_reward_valid = get_bootstrap( + sub_df[sub_df["valid"]], "cum_reward" + ) + _mean_reward_test, std_reward_test = get_bootstrap( + sub_df[sub_df["test"]], "cum_reward" + ) else: _mean_reward, std_reward = get_std_err(sub_df, "cum_reward") + if separate_val_test: + _mean_reward_valid, std_reward_valid = get_std_err( + sub_df[sub_df["valid"]], "cum_reward" + ) + _mean_reward_test, std_reward_test = get_std_err( + sub_df[sub_df["test"]], "cum_reward" + ) # sanity check, if there is an error the reward should be zero assert sub_df[sub_df["err_msg"].notnull()]["cum_reward"].sum() == 0 @@ -270,6 +347,11 @@ def summarize(sub_df, use_bootstrap=False): n_completed=f"{n_completed}/{len(sub_df)}", n_err=err.sum(skipna=True), ) + if separate_val_test: + record["avg_reward_valid"] = sub_df["cum_reward_valid"].mean(skipna=True).round(3) + record["std_err_valid"] = std_reward_valid + record["avg_reward_test"] = sub_df["cum_reward_test"].mean(skipna=True).round(3) + record["std_err_test"] = std_reward_test return pd.Series(record) @@ -391,6 +473,7 @@ def global_report( result_df: pd.DataFrame, reduce_fn=summarize, rename_index=lambda name: name.replace("agent_args.flags.", ""), + separate_valid_test=False, ): """Produce a report that summarize all tasks and all episodes for each agent. @@ -422,7 +505,15 @@ def global_report( index_names = [rename_index(name) for name in report.index.names] report = report.rename_axis(index=index_names) - # if has key avg_reward + if separate_valid_test: + if "avg_reward_valid" in report: + report = report.sort_values("avg_reward_valid", ascending=False) + ## put avg_reward and std_err at the end + report_columns = list(report.columns) + report_columns.append(report_columns.pop(report_columns.index("avg_reward"))) + report_columns.append(report_columns.pop(report_columns.index("std_err"))) + report = report[report_columns] + if "avg_reward" in report.columns: report = report.sort_values("avg_reward", ascending=False) @@ -485,7 +576,9 @@ def flag_report(report: pd.DataFrame, metric: str = "avg_reward", round_digits: def get_most_recent_folder( - root_dir: Path = None, date_format: str = "%Y-%m-%d_%H-%M-%S", contains=None + root_dir: Path = None, + date_format: str = "%Y-%m-%d_%H-%M-%S", + contains=None, ): """Return the most recent directory based on the date in the folder name. 
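
For reference, a minimal standalone sketch of the per-task seed split that
load_result_df performs when separate_val_test=True (illustrative only; toy data,
column names taken from the hunks above, frac_valid=0.25):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "env_args.task_name": ["t1"] * 8 + ["t2"] * 8,
        "env_args.task_seed": list(range(8)) * 2,
        "cum_reward": np.random.rand(16),
    })
    df["valid"] = False
    for task_name, group in df.groupby("env_args.task_name"):
        seeds = group["env_args.task_seed"].unique()
        np.random.shuffle(seeds)                       # randomize before splitting
        valid_seeds = seeds[: int(len(seeds) * 0.25)]  # 25% of seeds go to validation
        mask = (df["env_args.task_name"] == task_name) & df["env_args.task_seed"].isin(valid_seeds)
        df.loc[mask, "valid"] = True
    df["test"] = ~df["valid"]
    print(df.groupby("env_args.task_name")["valid"].sum())  # 2 validation seeds per task
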
@@ -519,12 +612,52 @@ def get_most_recent_folder( return most_recent_folder +def get_nth_most_recent_folder( + result_dirs: Union[List[Path], Path] = None, + n=1, + date_format: str = "%Y-%m-%d_%H-%M-%S", + contains=None, +): + """Return the N-th most recent directory based on the date in the folder name.""" + + if result_dirs is None: + result_dir = RESULTS_DIR + + if isinstance(result_dirs, Path): + result_dirs = [result_dirs] + + folders_with_dates = [] + + for folder in result_dirs: + for item in folder.iterdir(): + if item.is_dir() and not item.name.startswith("_"): + if contains is not None and contains not in item.name: + continue + try: + folder_date = datetime.strptime("_".join(item.name.split("_")[:2]), date_format) + folders_with_dates.append((folder_date, item)) + except (ValueError, IndexError): + continue + + # Sort folders by date in descending order (most recent first) + folders_with_dates.sort(reverse=True, key=lambda x: x[0]) + + # Return the N-th most recent folder if it exists + if n is None: + return [folder for date, folder in folders_with_dates] + if len(folders_with_dates) >= n: + return folders_with_dates[n - 1][1] + else: + return None # or raise an exception if you prefer + + def display_report( report: pd.DataFrame, apply_shrink_columns: bool = True, copy_to_clipboard: bool = True, rename_bool_flags: bool = True, print_only: str = None, + add_summary_stats: bool = False, ): """Display the report in a nicer-ish format. @@ -539,6 +672,7 @@ def display_report( copy_to_clipboard: Copy the report to the clipboard rename_bool_flags: Rename the boolean flags to be more compact and readable print_only: Print only the given column + add_summary_stats: Add a row with the sum and average for numeric columns """ report = report.copy() @@ -559,6 +693,59 @@ def display_report( columns = [print_only] + columns report = report[columns] + if add_summary_stats: + + # eplace 'NaN' string with actual NaN + report.replace("nan", pd.NA, inplace=True) + report.dropna(inplace=True) + + # Convert numeric columns back to numeric types where possible + report = report.apply(pd.to_numeric, errors="ignore") + + # Function to split fractions into numerator and denominator + def split_fraction(fraction_str): + try: + if isinstance(fraction_str, str) and "/" in fraction_str: + numerator, denominator = map(float, fraction_str.split("/")) + return numerator, denominator + except ValueError: + pass + return None, None + + # Extract numerators and denominators + report["n\ncompleted_num"], report["n\ncompleted_den"] = zip( + *report["n\ncompleted"].apply(split_fraction) + ) + + # Round all numeric columns to two decimal places (excluding fraction columns) + numeric_cols = report.select_dtypes(include=["number"]).columns + report[numeric_cols] = report[numeric_cols].round(2) + + # Calculate the sum and average for numeric columns (ignoring fraction columns for now) + total = report[numeric_cols].sum().round(2) + average = report[numeric_cols].mean().round(2) + + # Calculate the sum and average for the fraction column + total_numerator = int(report["n\ncompleted_num"].sum()) + total_denominator = int(report["n\ncompleted_den"].sum()) + + average_numerator = int(round(report["n\ncompleted_num"].mean())) + average_denominator = int(round(report["n\ncompleted_den"].mean())) + + # Append the 'Total' and 'Average' rows + report.loc["Total"] = total + report.loc["Average"] = average + + # Manually add the fractions back to the Total and Average rows in the original format + report.at["Total", 
"n\ncompleted"] = f"{total_numerator}/{total_denominator}" + report.at["Average", "n\ncompleted"] = f"{average_numerator}/{average_denominator}" + + # Drop the temporary converted columns + report.drop(columns=["n\ncompleted_num", "n\ncompleted_den"], inplace=True) + + # Convert all values to strings + report = report.astype(str) + styled_report = set_wrap_style(report) display(styled_report) diff --git a/src/agentlab/llm/langchain_utils.py b/src/agentlab/llm/langchain_utils.py index 8b20b1df..76a2051b 100644 --- a/src/agentlab/llm/langchain_utils.py +++ b/src/agentlab/llm/langchain_utils.py @@ -133,7 +133,7 @@ class HuggingFaceAPIChatModel(HFBaseChatModel): def __init__( self, model_name: str, - temperature: Optional[int] = 1e-1, + temperature: Optional[float] = 1e-1, max_new_tokens: Optional[int] = 512, n_retry_server: Optional[int] = 4, ): @@ -150,7 +150,7 @@ class HuggingFaceLocalChatModel(HFBaseChatModel): def __init__( self, model_name: str, - temperature: Optional[int] = 1e-1, + temperature: Optional[float] = 1e-1, max_new_tokens: Optional[int] = 512, n_retry_server: Optional[int] = 4, ): @@ -174,7 +174,7 @@ def __init__( model_name: str, model_url: str, token: Optional[str] = None, - temperature: Optional[int] = 1e-1, + temperature: Optional[float] = 1e-1, max_new_tokens: Optional[int] = 512, n_retry_server: Optional[int] = 4, ): From b05c0b1aa4e53ef1a88240a6a805ab0d826a2a74 Mon Sep 17 00:00:00 2001 From: optimass Date: Mon, 14 Oct 2024 17:23:05 +0000 Subject: [PATCH 3/9] adding get_action_post_hoc --- .../agents/generic_agent/generic_agent.py | 129 +++++++++++++++++- 1 file changed, 128 insertions(+), 1 deletion(-) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index e2696923..2a861a01 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -1,3 +1,5 @@ +import logging +import re from dataclasses import asdict, dataclass from functools import partial from warnings import warn @@ -9,7 +11,7 @@ from agentlab.agents.agent_args import AgentArgs from agentlab.agents.utils import openai_monitored_agent from agentlab.llm.chat_api import BaseModelArgs -from agentlab.llm.llm_utils import RetryError, retry_raise +from agentlab.llm.llm_utils import RetryError, retry_raise, ParseError from agentlab.llm.tracking import cost_tracker_decorator from .generic_agent_prompt import GenericPromptFlags, MainPrompt @@ -252,3 +254,128 @@ def get_action_post_hoc(agent: GenericAgent, obs: dict, ans_dict: dict): output += f"\n\n{action}\n" return system_prompt, instruction_prompt, output + + +def get_action_post_hoc(agent: GenericAgent, step_info): + """ + Get the action post-hoc for the agent. + + This function is used to get the action after the agent has already been run. + Its goal is to recreate the prompt and the output of the agent a posteriori. + The purpose is to build datasets for training the agents. + + Args: + agent (GenericAgent): The agent for which the action is being determined. + obs (dict): The observation dictionary to append to the agent's history. + ans_dict (dict): The answer dictionary containing the plan, step, memory, think, and action. + + Returns: + Tuple[str, str]: The complete prompt used for the agent and the reconstructed output based on the answer dictionary. 
+ """ + system_prompt = dp.SystemPrompt().prompt + + agent.obs_history.append(step_info.obs) + + main_prompt = MainPrompt( + action_set=agent.action_set, + obs_history=agent.obs_history, + actions=agent.actions, + memories=agent.memories, + thoughts=agent.thoughts, + previous_plan=agent.plan, + step=agent.plan_step, + flags=agent.flags, + ) + + max_prompt_tokens, max_trunc_itr = agent._get_maxes() + + fit_function = partial( + dp.fit_tokens, + max_prompt_tokens=max_prompt_tokens, + model_name=agent.chat_model_args.model_name, + max_iterations=max_trunc_itr, + ) + + instruction_prompt = fit_function(shrinkable=main_prompt) + + if isinstance(instruction_prompt, list): + # NOTE: this is when we have images + instruction_prompt = instruction_prompt[0]["text"] + + def parser(text): + try: + ans_dict = main_prompt._parse_answer(text) + except ParseError as e: + # these parse errors will be caught by the retry function and + # the chat_llm will have a chance to recover + return None, False, str(e) + return ans_dict, True, "" + + og_agent_output = step_info.agent_info["chat_messages"][-1].content + if og_agent_output.startswith("assistant\n"): + og_agent_output = og_agent_output[10:] + + ans_dict = parser(og_agent_output)[0] + + # self.plan = ans_dict.get("plan", self.plan) + # self.plan_step = ans_dict.get("step", self.plan_step) + # self.actions.append(ans_dict["action"]) + # self.memories.append(ans_dict.get("memory", None)) + # self.thoughts.append(ans_dict.get("think", None)) + + agent_output = "" + + # TODO: validate this + thought = ans_dict.get("think", None) + agent.thoughts.append(thought) + if thought is not None: + agent_output += f"\n\n{thought}\n\n" + + agent.plan = ans_dict.get("plan", agent.plan) + if agent.plan != "No plan yet": + agent_output += f"\n\n{agent.plan}\n\n" + + agent.plan_step = ans_dict.get("step", agent.plan_step) + if agent.plan_step != -1: + agent_output += f"\n{agent.plan_step}\n" + + memory = ans_dict.get("memory", None) + agent.memories.append(memory) + if memory is not None: + agent_output += f"\n\n{memory}\n\n" + + action = step_info.action + agent.actions.append(action) + if action is not None: + agent_output += f"\n\n{action}\n" + + def find_bid(string): + # Try to find 'a' followed by digits within single or double quotes + match = re.search(r"[\"'](a\d+)[\"']", string) + + # If not found, search digits within single or double quotes + if not match: + match = re.search(r"[\"'](\d+)[\"']", string) + + # Return the matched pattern or None if no match found + if match: + return match.group(1) # Return the match inside the quotes + else: + return None + + # TODO: finish this + bid = find_bid(action) + if bid is not None: + if bid not in instruction_prompt: + logging.info("Bid is not in the instruction prompt.") + return "missing_bid" + + # NOTE: keep in mind the original agent output can be more verbose + if agent_output not in og_agent_output: + logging.info("Agent output does exactly not match the last chat message.") + if not set(agent_output.split()).issubset(set(og_agent_output.split())): + logging.info("Agent output does not match the last chat message.") + return "action_output_mismatch" + + # TODO: make sure the bid is in the prompt + return (system_prompt, instruction_prompt, agent_output) From 0c07cd59cf689796c7212c1ed9c797b0768a5355 Mon Sep 17 00:00:00 2001 From: optimass Date: Wed, 6 Nov 2024 15:34:12 +0000 Subject: [PATCH 4/9] fix --- src/agentlab/experiments/launch_exp.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index 00b7b70a..6055b5d2 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -50,8 +50,11 @@ def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_back logging.info(f"Saving experiments to {exp_dir}") wait_funcs = [] for exp_args in exp_args_list: - server_info, wait_func = exp_args.agent_args.prepare() - wait_funcs.append(wait_func) + # server_info, wait_func = exp_args.agent_args.prepare() + output = exp_args.agent_args.prepare() + if isinstance(output, tuple): + _, wait_func = output + wait_funcs.append(wait_func) exp_args.prepare(exp_root=exp_dir) # logging.info(f"Saving experiments to {exp_dir}") From 7090a57cfd9f378634b4985cf283690208e5e894 Mon Sep 17 00:00:00 2001 From: MurtyShikhar Date: Thu, 7 Nov 2024 09:02:25 -0800 Subject: [PATCH 5/9] change to the chat api, llm utils for multi-action sampling --- src/agentlab/analyze/agent_xray.py | 22 +++++++++---- src/agentlab/experiments/task_collections.py | 10 ++++++ src/agentlab/llm/chat_api.py | 12 ++++--- src/agentlab/llm/llm_utils.py | 34 +++++++++++++++++++- 4 files changed, 66 insertions(+), 12 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 228901b3..9fd5443b 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -220,7 +220,7 @@ def run_gradio(results_dir: Path): content. You have to sort back with the Idx column to align the click with the order.""" ) - agent_table = gr.DataFrame(height=500, show_label=False, interactive=False) + agent_table = gr.DataFrame(max_height=500, show_label=False, interactive=False) with gr.Tab("Select Task and Seed", id="Select Task"): with gr.Row(): with gr.Column(scale=4): @@ -236,7 +236,9 @@ def run_gradio(results_dir: Path): ) refresh_results_button = gr.Button("↺", scale=0, size="sm") - task_table = gr.DataFrame(height=500, show_label=False, interactive=False) + task_table = gr.DataFrame( + max_height=500, show_label=False, interactive=False + ) with gr.Column(scale=2): with gr.Accordion("Seed Selector (click for help)", open=False): @@ -249,7 +251,9 @@ def run_gradio(results_dir: Path): the order.""" ) - seed_table = gr.DataFrame(height=500, show_label=False, interactive=False) + seed_table = gr.DataFrame( + max_height=500, show_label=False, interactive=False + ) with gr.Tab("Constants and Variables"): with gr.Row(): @@ -261,7 +265,9 @@ def run_gradio(results_dir: Path): **all** agents. They are displayed as a table with the name and value of the constant.""" ) - constants = gr.DataFrame(height=500, show_label=False, interactive=False) + constants = gr.DataFrame( + max_height=500, show_label=False, interactive=False + ) with gr.Column(scale=2): with gr.Accordion("Variables", open=False): gr.Markdown( @@ -270,9 +276,11 @@ def run_gradio(results_dir: Path): They are displayed as a table with the name, value and count of unique values. 
A maximum of 3 different values are displayed.""" ) - variables = gr.DataFrame(height=500, show_label=False, interactive=False) + variables = gr.DataFrame( + max_height=500, show_label=False, interactive=False + ) with gr.Tab("Global Stats"): - global_stats = gr.DataFrame(height=500, show_label=False, interactive=False) + global_stats = gr.DataFrame(max_height=500, show_label=False, interactive=False) with gr.Row(): episode_info = gr.Markdown(label="Episode Info", elem_classes="my-markdown") @@ -345,7 +353,7 @@ def run_gradio(results_dir: Path): logs = gr.Code(language=None, **code_args) with gr.Tab("Stats") as tab_stats: - stats = gr.DataFrame(height=500, show_label=False, interactive=False) + stats = gr.DataFrame(max_height=500, show_label=False, interactive=False) with gr.Tab("Agent Info HTML") as tab_agent_info_html: with gr.Row(): diff --git a/src/agentlab/experiments/task_collections.py b/src/agentlab/experiments/task_collections.py index 66bf00b7..f6ff2aa6 100644 --- a/src/agentlab/experiments/task_collections.py +++ b/src/agentlab/experiments/task_collections.py @@ -122,6 +122,7 @@ def get_benchmark_env_args( "workarena.l2": 50, "workarena.l3": 50, "webarena": 15, + "webarena_debug": 15, "miniwob": 10, "miniwob_tiny_test": 5, "weblinx": None, @@ -178,6 +179,15 @@ def get_benchmark_env_args( from browsergym.webarena import ALL_WEBARENA_TASK_IDS env_args_list = _make_env_args(ALL_WEBARENA_TASK_IDS, max_steps, n_repeat, rng) + elif benchmark_name == "webarena_debug": + from browsergym.webarena import ALL_WEBARENA_TASK_IDS + + env_args_list = _make_env_args( + [t for i, t in enumerate(ALL_WEBARENA_TASK_IDS) if i % 16 == 0], + max_steps, + n_repeat, + rng, + ) elif benchmark_name.startswith("miniwob"): miniwob_benchmarks_map = { "miniwob": MINIWOB_ALL, diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py index a4df0a97..49e049e7 100644 --- a/src/agentlab/llm/chat_api.py +++ b/src/agentlab/llm/chat_api.py @@ -245,7 +245,7 @@ def __init__( **client_args, ) - def __call__(self, messages: list[dict]) -> dict: + def __call__(self, messages: list[dict], num_samples=1, temperature=None) -> dict: # Initialize retry tracking attributes self.retries = 0 self.success = False @@ -255,12 +255,14 @@ def __call__(self, messages: list[dict]) -> dict: e = None for itr in range(self.max_retry): self.retries += 1 + temp_to_use = temperature if temperature is not None else self.temperature try: completion = self.client.chat.completions.create( model=self.model_name, messages=messages, - temperature=self.temperature, + temperature=temp_to_use, max_tokens=self.max_tokens, + n=num_samples, ) self.success = True break @@ -273,7 +275,6 @@ def __call__(self, messages: list[dict]) -> dict: f"Failed to get a response from the API after {self.max_retry} retries\n" f"Last error: {error_type}" ) - input_tokens = completion.usage.prompt_tokens output_tokens = completion.usage.completion_tokens cost = input_tokens * self.input_cost + output_tokens * self.output_cost @@ -283,7 +284,10 @@ def __call__(self, messages: list[dict]) -> dict: ): tracking.TRACKER.instance(input_tokens, output_tokens, cost) - return make_assistant_message(completion.choices[0].message.content) + if num_samples > 1: + return [make_assistant_message(c.message.content) for c in completion.choices] + else: + return make_assistant_message(completion.choices[0].message.content) def get_stats(self): return { diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py index c3d75009..3b3e7ed3 100644 --- 
a/src/agentlab/llm/llm_utils.py +++ b/src/agentlab/llm/llm_utils.py @@ -40,6 +40,38 @@ class RetryError(ValueError): pass +def retry_multiple( + chat: "ChatModel", + messages: list[dict], + n_retry: int, + parser: callable, + log: bool = True, + num_samples: int = 10, +): + """Same as retry except we will generate multiple samples for each retry. And issue a parse error if none of the samples are valid.""" + tries = 0 + while tries < n_retry: + answer_list = chat(messages, num_samples=num_samples, temperature=1.0) + # try to parse each answer + parsed_answers = [] + errors = [] + for answer in answer_list: + try: + parsed_answers.append(parser(answer["content"])) + except ParseError as parsing_error: + errors.append(str(parsing_error)) + # if we have a valid answer, return it + if parsed_answers: + return parsed_answers, tries + else: + tries += 1 + if log: + msg = f"Query failed. Retrying {tries}/{n_retry}.\n[LLM]:\n{answer_list}\n[User]:\n{errors}" + logging.info(msg) + messages.append(dict(role="user", content=str(errors))) + raise ParseError(f"Could not parse a valid value after {n_retry} retries.") + + def retry( chat: "ChatModel", messages: list[dict], @@ -69,6 +101,7 @@ def retry( after RateLimtError. will try to parse the wait time from the error message. rate_limit_max_wait_time (int): the maximum wait time in seconds + num_samples (int): number of samples to generate for each retry. Returns: dict: the parsed value, with a string at key "action". @@ -81,7 +114,6 @@ def retry( while tries < n_retry: answer = chat(messages) messages.append(answer) # TODO: could we change this to not use inplace modifications ? - try: return parser(answer["content"]) except ParseError as parsing_error: From 8fbbe8ed33ca89a54d2cbb043d20ea2e81b18b42 Mon Sep 17 00:00:00 2001 From: MurtyShikhar Date: Mon, 11 Nov 2024 15:46:47 -0800 Subject: [PATCH 6/9] small changes to make things compatible with openended-webarena --- src/agentlab/experiments/launch_exp.py | 3 +++ src/agentlab/experiments/reproducibility_util.py | 2 ++ src/agentlab/experiments/study_generators.py | 14 ++++++++++++-- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index fd7d2b6b..a9e3788f 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -20,6 +20,7 @@ def run_experiments( exp_args_list: list[ExpArgs], study_dir, parallel_backend="joblib", + save_transition_history=False, ): """Run a list of ExpArgs in parallel. @@ -35,6 +36,8 @@ def run_experiments( Directory where the experiments will be saved. parallel_backend: str Parallel backend to use. Either "joblib", "dask" or "sequential". + save_transition_history: bool + If True, save the transition history from the agent. 
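+
+    Example (sketch; exp_args_list and study_dir are assumed to be built elsewhere,
+    e.g. by a Study):
+
+        run_experiments(
+            n_jobs=4,
+            exp_args_list=exp_args_list,
+            study_dir=study_dir,
+            parallel_backend="joblib",
+            save_transition_history=True,
+        )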
""" if len(exp_args_list) == 0: diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index 3ef7d8ef..6aeb1fe2 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -35,6 +35,8 @@ def _get_benchmark_version(benchmark_name): return metadata.distribution("weblinx_browsergym").version except metadata.PackageNotFoundError: return "0.0.1rc1" + elif benchmark_name.startswith("nnetnav"): + return "1.0" else: raise ValueError(f"Unknown benchmark {benchmark_name}") diff --git a/src/agentlab/experiments/study_generators.py b/src/agentlab/experiments/study_generators.py index 3a2567d5..c077f91d 100644 --- a/src/agentlab/experiments/study_generators.py +++ b/src/agentlab/experiments/study_generators.py @@ -44,7 +44,12 @@ class Study: dir: Path = None suffix: str = "" # used for adding a personnal comment to the study name - def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False): + def run( + self, + n_jobs=1, + parallel_backend="joblib", + strict_reproducibility=False, + ): """Run all experiments in the study in parallel when possible. Args: @@ -64,7 +69,12 @@ def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False) self.make_dir() self.write_reproducibility_info(strict_reproducibility=strict_reproducibility) - run_experiments(n_jobs, self.exp_args_list, self.dir, parallel_backend=parallel_backend) + run_experiments( + n_jobs, + self.exp_args_list, + self.dir, + parallel_backend=parallel_backend, + ) report_df = self.get_report(ignore_cache=True) logging.info(f"Study {self.name} finished.") logging.info("\n" + str(report_df)) From ee0d5e27fdf63e4ad3d6241cfe1230d4243df249 Mon Sep 17 00:00:00 2001 From: optimass Date: Fri, 15 Nov 2024 19:43:38 +0000 Subject: [PATCH 7/9] quick fix for edge case --- src/agentlab/analyze/agent_xray.py | 4 +++- src/agentlab/analyze/inspect_results.py | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index fa6635e6..28731d3c 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -4,6 +4,7 @@ from logging import warning from pathlib import Path +from finetuning.data import data_collection_library import gradio as gr import matplotlib.patches as patches import matplotlib.pyplot as plt @@ -1096,7 +1097,8 @@ def plot_profiling(ax, step_info_list: list[StepInfo], summary_info: dict, progr def main(): - run_gradio(RESULTS_DIR) + # run_gradio(RESULTS_DIR) + run_gradio(data_collection_library.WORKARENA_V1_TRACES_PATHS[0]) if __name__ == "__main__": diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 2015c07d..3e9bd388 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -348,9 +348,13 @@ def summarize(sub_df, use_bootstrap=False, separate_val_test=True): n_err=err.sum(skipna=True), ) if separate_val_test: - record["avg_reward_valid"] = sub_df["cum_reward_valid"].mean(skipna=True).round(3) + record["avg_reward_valid"] = sub_df["cum_reward_valid"].mean(skipna=True) + if not np.isnan(record["avg_reward_valid"]): + record["avg_reward_valid"] = record["avg_reward_valid"].round(3) record["std_err_valid"] = std_reward_valid - record["avg_reward_test"] = sub_df["cum_reward_test"].mean(skipna=True).round(3) + record["avg_reward_test"] = sub_df["cum_reward_test"].mean(skipna=True) + if not 
np.isnan(record["avg_reward_test"]): + record["avg_reward_test"] = record["avg_reward_test"].round(3) record["std_err_test"] = std_reward_test return pd.Series(record) From b42aaf5f59ceed71a1c16bc0823f37bd8f79b1c6 Mon Sep 17 00:00:00 2001 From: MurtyShikhar Date: Wed, 20 Nov 2024 10:16:24 -0800 Subject: [PATCH 8/9] add Azure version of GPT-4o-mini --- src/agentlab/llm/llm_configs.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index 30889be3..b4f37fce 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -70,6 +70,13 @@ max_input_tokens=40_000, max_new_tokens=4_000, ), + "azure/gpt-4o-mini-2024-07-18": AzureModelArgs( + model_name="gpt-4o-mini", + deployment_name="gpt-4o-mini-2024-07-18", + max_total_tokens=16_384, + max_input_tokens=15_000, + max_new_tokens=1_000, + ), "azure/gpt-4o-2024-08-06": AzureModelArgs( model_name="gpt-4o", deployment_name="gpt-4o-2024-08-06", From e7d99a704edc4cdd22792cce295f4aad5031f7bf Mon Sep 17 00:00:00 2001 From: MurtyShikhar Date: Sat, 23 Nov 2024 16:16:15 -0800 Subject: [PATCH 9/9] add support for vLLM --- .../agents/generic_agent/generic_agent.py | 5 -- src/agentlab/llm/chat_api.py | 49 +++++++++++++++++++ src/agentlab/llm/llm_configs.py | 8 +++ 3 files changed, 57 insertions(+), 5 deletions(-) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index a22492dd..f947cef3 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -1,5 +1,3 @@ -import logging -import re from dataclasses import asdict, dataclass from functools import partial from warnings import warn @@ -10,9 +8,6 @@ from agentlab.agents.agent_args import AgentArgs from agentlab.llm.chat_api import BaseModelArgs, make_system_message, make_user_message from agentlab.llm.llm_utils import ParseError, retry -from agentlab.agents.utils import openai_monitored_agent -from agentlab.llm.chat_api import BaseModelArgs -from agentlab.llm.llm_utils import RetryError, retry_raise, ParseError from agentlab.llm.tracking import cost_tracker_decorator from .generic_agent_prompt import GenericPromptFlags, MainPrompt diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py index 49e049e7..87e5920c 100644 --- a/src/agentlab/llm/chat_api.py +++ b/src/agentlab/llm/chat_api.py @@ -13,6 +13,10 @@ import agentlab.llm.tracking as tracking from agentlab.llm.base_api import AbstractChatModel, BaseModelArgs from agentlab.llm.huggingface_utils import HFBaseChatModel +from agentlab.llm.langchain_utils import ( + HuggingFaceAPIChatModel, + HuggingFaceURLChatModel, +) def make_system_message(content: str) -> dict: @@ -131,6 +135,14 @@ def make_model(self): max_new_tokens=self.max_new_tokens, n_retry_server=self.n_retry_server, ) + elif self.backend == "vllm": + return VLLMChatModel( + model_name=self.model_name, + temperature=self.temperature, + max_tokens=self.max_new_tokens, + max_retry=4, + min_retry_wait_time=60, + ) else: raise ValueError(f"Backend {self.backend} is not supported") @@ -296,6 +308,30 @@ def get_stats(self): } +class VLLMChatModel(ChatModel): + def __init__( + self, + model_name, + api_key=None, + temperature=0.5, + max_tokens=100, + max_retry=4, + min_retry_wait_time=60, + ): + super().__init__( + model_name=model_name, + api_key=api_key, + temperature=temperature, + max_tokens=max_tokens, + max_retry=max_retry, + 
min_retry_wait_time=min_retry_wait_time, + api_key_env_var="VLLM_API_KEY", + client_class=OpenAI, + client_args={"base_url": "http://0.0.0.0:8000/v1"}, + pricing_func=None, + ) + + class OpenAIChatModel(ChatModel): def __init__( self, @@ -400,3 +436,16 @@ def __init__( self.llm = partial( client.text_generation, temperature=temperature, max_new_tokens=max_new_tokens ) + + +@dataclass +class HuggingFaceModelArgs(BaseModelArgs): + """Serializable object for instantiating a generic chat model with a HuggingFace model.""" + + def make_model(self): + return HuggingFaceAPIChatModel( + model_name=self.model_name, + temperature=self.temperature, + max_new_tokens=self.max_new_tokens, + n_retry_server=4, + ) diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index b4f37fce..ce0369b7 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -101,6 +101,14 @@ backend="huggingface", **default_oss_llms_args, ), + "vllm/meta-llama/Meta-Llama-3-8B-Instruct": SelfHostedModelArgs( + model_name="meta-llama/Meta-Llama-3-8B-Instruct", + max_total_tokens=16_384, + max_input_tokens=16_384 - 512, + max_new_tokens=512, + backend="vllm", + **default_oss_llms_args, + ), "mistralai/Mixtral-8x22B-Instruct-v0.1": SelfHostedModelArgs( model_name="mistralai/Mixtral-8x22B-Instruct-v0.1", max_total_tokens=32_000,