diff --git a/.github/workflows/contrib-openai.yml b/.github/workflows/contrib-openai.yml index f0d497b3cd6..467d5270c8e 100644 --- a/.github/workflows/contrib-openai.yml +++ b/.github/workflows/contrib-openai.yml @@ -176,3 +176,41 @@ jobs: with: file: ./coverage.xml flags: unittests + AgentBuilder: + strategy: + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.11" ] + runs-on: ${{ matrix.os }} + environment: openai1 + steps: + # checkout to pr branch + - name: Checkout + uses: actions/checkout@v3 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install packages and dependencies + run: | + docker --version + python -m pip install --upgrade pip wheel + pip install -e . + python -c "import autogen" + pip install coverage pytest-asyncio + - name: Coverage + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} + AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }} + OAI_CONFIG_LIST: ${{ secrets.OAI_CONFIG_LIST }} + run: | + coverage run -a -m pytest test/agentchat/contrib/test_agent_builder.py + coverage xml + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + with: + file: ./coverage.xml + flags: unittests diff --git a/autogen/agentchat/contrib/agent_builder.py b/autogen/agentchat/contrib/agent_builder.py new file mode 100644 index 00000000000..50ef776c04c --- /dev/null +++ b/autogen/agentchat/contrib/agent_builder.py @@ -0,0 +1,408 @@ +import autogen +import time +import subprocess as sp +import socket +import os +import json +import hashlib +from typing import Optional, List, Dict, Tuple, Union + + +class AgentBuilder: + """ + AgentBuilder can help user build an automatic task solving process powered by multi-agent system. + Specifically, our building pipeline includes initialize and build. + In build(), we prompt a gpt-4 model to create multiple participant agents, and specify whether + this task need programming to solve. + User can save the built agents' config by calling save(), and load the saved configs by load(), which can skip the + building process. + """ + + openai_server_name = "openai" + max_tokens = 945 + max_agents = 5 # maximum number of agents build manager can create. + + CODING_PROMPT = """Does the following task need programming (i.e., access external API or tool by coding) to solve, + or use program may help the following task become easier? + + TASK: {task} + + Hint: + # Answer only YES or NO. + """ + + AGENT_NAME_PROMPT = """To complete the following task, what positions/jobs should be set to maximize the efficiency? + + TASK: {task} + + Hint: + # Considering the effort, the position in this task should be no more then {max_agents}, less is better. + # Answer the name of those positions/jobs, separated by comma and use "_" instead of space. For example: Product_manager,Programmer + # Only return the list of positions. + """ + + AGENT_SYS_MSG_PROMPT = """Considering the following position and corresponding task: + + TASK: {task} + POSITION: {position} + + Modify the following position requirement, let it more suitable for the above task and position: + + REQUIREMENT: {default_sys_msg} + + Hint: + # The modified requirement should not contain the code interpreter skill. + # Coding skill is limited to Python. + # Your answer should omit the word "REQUIREMENT". 
+ # Your should let them reply "TERMINATE" in the end when the task complete (user's need has been satisfied). + """ + + def __init__( + self, + config_path: Optional[str] = "OAI_CONFIG_LIST", + builder_model: Optional[str] = "gpt-4-1106-preview", + agent_model: Optional[str] = "gpt-4-1106-preview", + host: Optional[str] = "localhost", + endpoint_building_timeout: Optional[int] = 600, + ): + """ + Args: + config_path: path of the OpenAI api configs. + builder_model: specify a model as the backbone of build manager. + host: endpoint host. + endpoint_building_timeout: timeout for building up an endpoint server. + """ + self.host = host + self.builder_model = builder_model + self.agent_model = agent_model + self.config_path = config_path + self.endpoint_building_timeout = endpoint_building_timeout + + self.building_task: str = None + self.agent_configs: List[Dict] = [] + self.open_ports: List[str] = [] + self.agent_procs: Dict[str, Tuple[sp.Popen, str]] = {} + self.agent_procs_assign: Dict[str, Tuple[autogen.ConversableAgent, str]] = {} + self.cached_configs: Dict = {} + + for port in range(8000, 65535): + if self._is_port_open(host, port): + self.open_ports.append(str(port)) + + @staticmethod + def _is_port_open(host, port): + """Check if a tcp port is open.""" + try: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.settimeout(10) + s.bind((host, int(port))) + s.close() + return True + except OSError: + return False + + def _create_agent( + self, + agent_name: str, + model_name_or_hf_repo: str, + llm_config: dict, + system_message: Optional[str] = autogen.AssistantAgent.DEFAULT_SYSTEM_MESSAGE, + use_oai_assistant: Optional[bool] = False, + world_size: Optional[int] = 1, + ) -> autogen.AssistantAgent: + """ + Create a group chat participant agent. + + If the agent rely on an open-source model, this function will automatically set up an endpoint for that agent. + The API address of that endpoint will be "localhost:{free port}". + + Args: + agent_name: the name that identify the function of the agent (e.g., Coder, Product Manager,...) + model_name_or_hf_repo: + llm_config: specific configs for LLM (e.g., config_list, seed, temperature, ...). + system_message: system prompt use to format an agent's behavior. + use_oai_assistant: use OpenAI assistant api instead of self-constructed agent. + world_size: the max size of parallel tensors (in most of the cases, this is identical to the amount of GPUs). + + Returns: + agent: a set-up agent. + """ + config_list = autogen.config_list_from_json(self.config_path, filter_dict={"model": [model_name_or_hf_repo]}) + if "gpt-" in model_name_or_hf_repo: + server_id = self.openai_server_name + else: + model_name = model_name_or_hf_repo.split("/")[-1] + server_id = f"{model_name}_{self.host}" + if self.agent_procs.get(server_id, None) is None: + while True: + port = self.open_ports.pop() + if self._is_port_open(self.host, port): + break + + # Use vLLM to set up a server with OpenAI API support. 
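+                # Launch vLLM's OpenAI-compatible API server for the requested Hugging Face model.
+                # --tensor-parallel-size mirrors world_size; the endpoint it exposes
+                # (http://{host}:{port}/v1) is what this agent's config_list will point at.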
+ agent_proc = sp.Popen( + [ + "python", + "-m", + "vllm.entrypoints.openai.api_server", + "--host", + f"{self.host}", + "--port", + f"{port}", + "--model", + f"{model_name_or_hf_repo}", + "--tensor-parallel-size", + f"{world_size}", + ], + stdout=sp.PIPE, + stderr=sp.STDOUT, + ) + timeout_start = time.time() + + while True: + server_stdout = agent_proc.stdout.readline() + if server_stdout != b"": + print(server_stdout) + timeout_end = time.time() + if b"running" in server_stdout: + print( + f"Running {model_name_or_hf_repo} on http://{self.host}:{port} " + f"with tensor parallel size {world_size}." + ) + break + elif b"address already in use" in server_stdout: + raise RuntimeError( + f"{self.host}:{port} already in use. Fail to set up the endpoint for " + f"{model_name_or_hf_repo} on {self.host}:{port}." + ) + elif timeout_end - timeout_start > self.endpoint_building_timeout: + raise RuntimeError( + f"Timeout exceed. Fail to set up the endpoint for " + f"{model_name_or_hf_repo} on {self.host}:{port}." + ) + self.agent_procs[server_id] = (agent_proc, port) + else: + port = self.agent_procs[server_id][1] + + config_list[0]["base_url"] = f"http://{self.host}:{port}/v1" + + current_config = llm_config.copy() + current_config.update( + {"config_list": config_list, "model": model_name_or_hf_repo, "max_tokens": self.max_tokens} + ) + if use_oai_assistant: + from autogen.agentchat.contrib.gpt_assistant_agent import GPTAssistantAgent + + agent = GPTAssistantAgent( + name=agent_name, + llm_config={**current_config, "assistant_id": None}, + instructions=system_message, + overwrite_instructions=False, + ) + else: + agent = autogen.AssistantAgent( + name=agent_name, llm_config=current_config.copy(), system_message=system_message + ) + self.agent_procs_assign[agent_name] = (agent, server_id) + return agent + + def clear_agent(self, agent_name: str, recycle_endpoint: Optional[bool] = True): + """ + Clear a specific agent by name. + + Args: + agent_name: the name of agent. + recycle_endpoint: trigger for recycle the endpoint server. If true, the endpoint will be recycled + when there is no agent depending on. + """ + _, server_id = self.agent_procs_assign[agent_name] + del self.agent_procs_assign[agent_name] + if recycle_endpoint: + if server_id == self.openai_server_name: + return + else: + for _, iter_sid in self.agent_procs_assign.values(): + if server_id == iter_sid: + return + self.agent_procs[server_id][0].terminate() + self.open_ports.append(server_id.split("_")[-1]) + print(f"Agent {agent_name} has been cleared.") + + def clear_all_agents(self, recycle_endpoint: Optional[bool] = True): + """ + Clear all cached agents. + """ + for agent_name in [agent_name for agent_name in self.agent_procs_assign.keys()]: + self.clear_agent(agent_name, recycle_endpoint) + print("All agents have been cleared.") + + def build( + self, + building_task: Optional[str] = None, + default_llm_config: Optional[Dict] = None, + coding: Optional[bool] = None, + cached_configs: Optional[Dict] = None, + use_oai_assistant: Optional[bool] = False, + code_execution_config: Optional[Dict] = None, + **kwargs, + ): + """ + Auto build agents based on the building task. + + Args: + building_task: instruction that helps build manager (gpt-4) to decide what agent should be built. + default_llm_config: specific configs for LLM (e.g., config_list, seed, temperature, ...). + coding: use to identify if the user proxy (a code interpreter) should be added. + cached_configs: previously saved agent configs. 
+ use_oai_assistant: use OpenAI assistant api instead of self-constructed agent. + code_execution_config: specific configs for user proxy (e.g., last_n_messages, work_dir, ...). + """ + use_api = False + + if code_execution_config is None: + code_execution_config = { + "last_n_messages": 2, + "work_dir": "groupchat", + "use_docker": False, + "timeout": 60, + } + + if cached_configs is None: + use_api = True + agent_configs = [] + self.building_task = building_task + else: + self.building_task = building_task = cached_configs["building_task"] + default_llm_config = cached_configs["default_llm_config"] + coding = cached_configs["coding"] + agent_configs = cached_configs["agent_configs"] + + config_list = autogen.config_list_from_json(self.config_path, filter_dict={"model": [self.builder_model]}) + build_manager = autogen.OpenAIWrapper(config_list=config_list) + + if use_api: + print("Generating agents...") + resp_agent_name = ( + build_manager.create( + messages=[ + { + "role": "user", + "content": self.AGENT_NAME_PROMPT.format(task=building_task, max_agents=self.max_agents), + } + ] + ) + .choices[0] + .message.content + ) + agent_name_list = resp_agent_name.split(",") + print(f"{resp_agent_name} are generated.") + + agent_sys_msg_list = [] + for name in agent_name_list: + print(f"Preparing configuration for {name}...") + resp_agent_sys_msg = ( + build_manager.create( + messages=[ + { + "role": "user", + "content": self.AGENT_SYS_MSG_PROMPT.format( + task=building_task, + position=name, + default_sys_msg=autogen.AssistantAgent.DEFAULT_SYSTEM_MESSAGE, + ), + } + ] + ) + .choices[0] + .message.content + ) + agent_sys_msg_list.append(resp_agent_sys_msg) + + for i in range(len(agent_name_list)): + agent_configs.append( + {"name": agent_name_list[i], "model": self.agent_model, "system_message": agent_sys_msg_list[i]} + ) + + for config in agent_configs: + print(f"Creating agent {config['name']} with backbone {config['model']}...") + self._create_agent( + config["name"], + config["model"], + default_llm_config, + system_message=config["system_message"], + use_oai_assistant=use_oai_assistant, + **kwargs, + ) + agent_list = [agent_config[0] for agent_config in self.agent_procs_assign.values()] + + if coding is None: + resp = ( + build_manager.create( + messages=[{"role": "user", "content": self.CODING_PROMPT.format(task=building_task)}] + ) + .choices[0] + .message.content + ) + coding = True if resp == "YES" else False + + if coding is True: + print("Adding user console proxy...") + agent_list = [ + autogen.UserProxyAgent( + name="User_console_and_Python_code_interpreter", + is_termination_msg=lambda x: "TERMINATE" in x.get("content"), + system_message="User console with a python code interpreter interface.", + code_execution_config=code_execution_config, + human_input_mode="NEVER", + ) + ] + agent_list + + self.cached_configs.update( + { + "building_task": building_task, + "agent_configs": agent_configs, + "coding": coding, + "default_llm_config": default_llm_config, + "code_execution_config": code_execution_config, + } + ) + + return agent_list, self.cached_configs.copy() + + def save(self, filepath: Optional[str] = None) -> str: + """ + Save building configs. If the filepath is not specific, this function will create a filename by encrypt the + building_task string by md5 with "save_config_" prefix, and save config to the local path. + + Args: + filepath: save path. + + Return: + filepath: path save. 
+ """ + if filepath is None: + filepath = f'./save_config_{hashlib.md5(self.building_task.encode("utf-8")).hexdigest()}.json' + with open(filepath, "w") as save_file: + json.dump(self.cached_configs, save_file, indent=4) + print(f"Building config saved to {filepath}") + + return filepath + + def load( + self, + filepath: str, + **kwargs, + ): + """ + Load building configs and call the build function to complete building without calling online LLMs' api. + + Args: + filepath: filepath for the save config. + """ + try: + print(f"Loding config from {filepath}") + cached_configs = json.load(open(filepath)) + except FileNotFoundError: + raise FileNotFoundError(f"Config file {filepath} does not exist.") + + return self.build(cached_configs=cached_configs, **kwargs) diff --git a/notebook/agentchat_autobuild.ipynb b/notebook/agentchat_autobuild.ipynb new file mode 100644 index 00000000000..02c592b5bb3 --- /dev/null +++ b/notebook/agentchat_autobuild.ipynb @@ -0,0 +1,1144 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c1004af6a7fbfcd8", + "metadata": { + "collapsed": false + }, + "source": [ + "# AutoBuild\n", + "AutoGen offers conversable agents powered by LLM, tool, or human, which can be used to perform tasks collectively via automated chat. This framework allows tool use and human participation through multi-agent conversation.\n", + "Please find documentation about this feature [here](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat).\n", + "\n", + "In this notebook, we introduce a new class, `AgentBuilder`, to help user build an automatic task solving process powered by multi-agent system. Specifically, in `build()`, we prompt a LLM to create multiple participant agent and initialize a group chat, and specify whether this task need programming to solve. AgentBuilder also support open-source LLMs by [vLLM](https://docs.vllm.ai/en/latest/index.html) and [Fastchat](https://github.com/lm-sys/FastChat). Check the supported model list [here](https://docs.vllm.ai/en/latest/models/supported_models.html)." + ] + }, + { + "cell_type": "markdown", + "id": "ec78dda8e3826d8a", + "metadata": { + "collapsed": false + }, + "source": [ + "## Requirement\n", + "\n", + "AutoBuild need the latest version of AutoGen.\n", + "You can install AutoGen by the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8e9ae50658be975", + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "!pip install pyautogen" + ] + }, + { + "cell_type": "markdown", + "id": "7d0e63ab3604bdb9", + "metadata": { + "collapsed": false + }, + "source": [ + "## Step 1: prepare configuration\n", + "Prepare a `config_path` for assistant agent to limit the choice of LLM you want to use in this task. This config can be a path of json file or a name of environment variable. A `default_llm_config` is also required for initialize the specific config of LLMs like seed, temperature, etc..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2505f029423b21ab", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-12-03T13:31:44.147211100Z", + "start_time": "2023-12-03T13:31:44.121842300Z" + } + }, + "outputs": [], + "source": [ + "config_path = 'OAI_CONFIG_LIST' # modify path\n", + "default_llm_config = {\n", + " 'temperature': 0\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "c2d6586c68fa425b", + "metadata": { + "collapsed": false + }, + "source": [ + "## Step 2: create a AgentBuilder\n", + "Create a `AgentBuilder` with the specified `config_path`. AgentBuilder will use GPT-4 in default to complete the whole process, you can also change the `builder_model` to other OpenAI model if you want. You can also specify a OpenAI or open-source LLM as agent backbone, see blog for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bfa67c771a0fed37", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-12-03T13:31:44.996307300Z", + "start_time": "2023-12-03T13:31:44.743284700Z" + } + }, + "outputs": [], + "source": [ + "from autogen.agentchat.contrib.agent_builder import AgentBuilder\n", + "\n", + "builder = AgentBuilder(config_path=config_path, builder_model='gpt-4-1106-preview', agent_model='gpt-4-1106-preview')" + ] + }, + { + "cell_type": "markdown", + "id": "2e6a655fb6618324", + "metadata": { + "collapsed": false + }, + "source": [ + "## Step 3: specify a building task\n", + "\n", + "Specify a building task with a general description. Building task will help build manager (a LLM) decide what agents should be build." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "68315f6ec912c58a", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-12-03T13:31:45.444044500Z", + "start_time": "2023-12-03T13:31:45.429483200Z" + } + }, + "outputs": [], + "source": [ + "building_task = \"Find a paper on arxiv by programming, and analyze its application in some domain. For example, find a recent paper about gpt-4 on arxiv and find its potential applications in software.\"" + ] + }, + { + "cell_type": "markdown", + "id": "5782dd5ecb6c217a", + "metadata": { + "collapsed": false + }, + "source": [ + "## Step 4: build group chat agents\n", + "Use `build()` to let build manager (the specified `builder_model`) complete the group chat agents generation. If you think coding is necessary in your task, you can use `coding=True` to add a user proxy (an automatic code interpreter) into the agent list, like: \n", + "```python\n", + "builder.build(building_task, default_llm_config, coding=True)\n", + "```\n", + "If `coding` is not specified, AgentBuilder will determine on its own whether the user proxy should be added or not according to the task." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ab490fdbe46c0473", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-12-03T13:32:45.887656900Z", + "start_time": "2023-12-03T13:31:46.822373400Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating agents...\n", + "Data_scientist,Research_analyst,Software_developer are generated.\n", + "Preparing configuration for Data_scientist...\n", + "Preparing configuration for Research_analyst...\n", + "Preparing configuration for Software_developer...\n", + "Creating agent Data_scientist with backbone gpt-4-1106-preview...\n", + "Creating agent Research_analyst with backbone gpt-4-1106-preview...\n", + "Creating agent Software_developer with backbone gpt-4-1106-preview...\n", + "Adding user console proxy...\n" + ] + } + ], + "source": [ + "agent_list, agent_configs = builder.build(building_task, default_llm_config)" + ] + }, + { + "cell_type": "markdown", + "id": "e00dd99880a4bf7b", + "metadata": { + "collapsed": false + }, + "source": [ + "## Step 5: execute task\n", + "Let agents generated in `build()` to complete the task collaboratively in a group chat." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7d52e3d9a1bf91cb", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-12-03T13:33:42.369660600Z", + "start_time": "2023-12-03T13:32:45.881740500Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser_console_and_Python_code_interpreter\u001B[0m (to chat_manager):\n", + "Find a recent paper about gpt-4 on arxiv and find its potential applications in software.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001B[33mData_scientist\u001B[0m (to chat_manager):\n", + "\n", + "To begin, we'll write a Python script that uses the `arxiv` library to search for recent papers on arXiv related to GPT-4. The script will automate the process of searching for the papers, downloading the metadata, and then extracting the relevant information to identify potential applications in software.\n", + "\n", + "First, you'll need to install the `arxiv` library if you haven't already. You can do this by running `pip install arxiv`.\n", + "\n", + "Here's a Python script that will perform the search and print out the title, authors, summary, and publication date of the most recent papers related to GPT-4. 
Save this script to a file and run it in your Python environment.\n", + "\n", + "```python\n", + "# filename: arxiv_search_gpt4.py\n", + "\n", + "import arxiv\n", + "import datetime\n", + "\n", + "# Define the search query and parameters\n", + "search_query = 'all:\"GPT-4\"'\n", + "max_results = 5 # You can adjust this number based on how many results you want\n", + "\n", + "# Search arXiv for papers related to GPT-4\n", + "search = arxiv.Search(\n", + " query=search_query,\n", + " max_results=max_results,\n", + " sort_by=arxiv.SortCriterion.SubmittedDate\n", + ")\n", + "\n", + "# Fetch the results\n", + "results = list(search.results())\n", + "\n", + "# Print the details of the most recent papers\n", + "for result in results:\n", + " published = result.published.strftime('%Y-%m-%d')\n", + " print(f\"Title: {result.title}\\nAuthors: {', '.join(author.name for author in result.authors)}\\nPublished: {published}\\nSummary: {result.summary}\\n\")\n", + "\n", + "# Note: This script does not download the full paper, only the metadata.\n", + "```\n", + "\n", + "After running this script, you will have a list of recent papers related to GPT-4. You can then read through the summaries to identify potential applications in software. If you need to download the full papers, you can modify the script to fetch the PDFs using the URLs provided in the metadata.\n", + "\n", + "Once you have the summaries or full papers, you can use your analytical skills to discern the potential applications of GPT-4 in software. Look for keywords such as \"software engineering\", \"application\", \"tool\", \"framework\", \"integration\", \"development\", and \"automation\" to find relevant information.\n", + "\n", + "Please execute the above script to retrieve the recent papers on GPT-4 from arXiv. After that, I can guide you through the analysis of their content to identify potential applications in software.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001B[31m\n", + ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001B[0m\n", + "\u001B[33mUser_console_and_Python_code_interpreter\u001B[0m (to chat_manager):\n", + "\n", + "exitcode: 0 (execution succeeded)\n", + "Code output: \n", + "Title: Unnatural Error Correction: GPT-4 Can Almost Perfectly Handle Unnatural Scrambled Text\n", + "Authors: Qi Cao, Takeshi Kojima, Yutaka Matsuo, Yusuke Iwasawa\n", + "Published: 2023-11-30\n", + "Summary: While Large Language Models (LLMs) have achieved remarkable performance in\n", + "many tasks, much about their inner workings remains unclear. In this study, we\n", + "present novel experimental insights into the resilience of LLMs, particularly\n", + "GPT-4, when subjected to extensive character-level permutations. To investigate\n", + "this, we first propose the Scrambled Bench, a suite designed to measure the\n", + "capacity of LLMs to handle scrambled input, in terms of both recovering\n", + "scrambled sentences and answering questions given scrambled context. The\n", + "experimental results indicate that most powerful LLMs demonstrate the\n", + "capability akin to typoglycemia, a phenomenon where humans can understand the\n", + "meaning of words even when the letters within those words are scrambled, as\n", + "long as the first and last letters remain in place. 
More surprisingly, we found\n", + "that only GPT-4 nearly flawlessly processes inputs with unnatural errors, even\n", + "under the extreme condition, a task that poses significant challenges for other\n", + "LLMs and often even for humans. Specifically, GPT-4 can almost perfectly\n", + "reconstruct the original sentences from scrambled ones, decreasing the edit\n", + "distance by 95%, even when all letters within each word are entirely scrambled.\n", + "It is counter-intuitive that LLMs can exhibit such resilience despite severe\n", + "disruption to input tokenization caused by scrambled text.\n", + "\n", + "Title: Language Model Agents Suffer from Compositional Generalization in Web Automation\n", + "Authors: Hiroki Furuta, Yutaka Matsuo, Aleksandra Faust, Izzeddin Gur\n", + "Published: 2023-11-30\n", + "Summary: Language model agents (LMA) recently emerged as a promising paradigm on\n", + "muti-step decision making tasks, often outperforming humans and other\n", + "reinforcement learning agents. Despite the promise, their performance on\n", + "real-world applications that often involve combinations of tasks is still\n", + "underexplored. In this work, we introduce a new benchmark, called CompWoB -- 50\n", + "new compositional web automation tasks reflecting more realistic assumptions.\n", + "We show that while existing prompted LMAs (gpt-3.5-turbo or gpt-4) achieve\n", + "94.0% average success rate on base tasks, their performance degrades to 24.9%\n", + "success rate on compositional tasks. On the other hand, transferred LMAs\n", + "(finetuned only on base tasks) show less generalization gap, dropping from\n", + "85.4% to 54.8%. By balancing data distribution across tasks, we train a new\n", + "model, HTML-T5++, that surpasses human-level performance (95.2%) on MiniWoB,\n", + "and achieves the best zero-shot performance on CompWoB (61.5%). While these\n", + "highlight the promise of small-scale finetuned and transferred models for\n", + "compositional generalization, their performance further degrades under\n", + "different instruction compositions changing combinational order. In contrast to\n", + "the recent remarkable success of LMA, our benchmark and detailed analysis\n", + "emphasize the necessity of building LMAs that are robust and generalizable to\n", + "task compositionality for real-world deployment.\n", + "\n", + "Title: AlignBench: Benchmarking Chinese Alignment of Large Language Models\n", + "Authors: Xiao Liu, Xuanyu Lei, Shengyuan Wang, Yue Huang, Zhuoer Feng, Bosi Wen, Jiale Cheng, Pei Ke, Yifan Xu, Weng Lam Tam, Xiaohan Zhang, Lichao Sun, Hongning Wang, Jing Zhang, Minlie Huang, Yuxiao Dong, Jie Tang\n", + "Published: 2023-11-30\n", + "Summary: Alignment has become a critical step for instruction-tuned Large Language\n", + "Models (LLMs) to become helpful assistants. However, effective evaluation of\n", + "alignment for emerging Chinese LLMs is still significantly lacking, calling for\n", + "real-scenario grounded, open-ended, challenging and automatic evaluations\n", + "tailored for alignment. To fill in this gap, we introduce AlignBench, a\n", + "comprehensive multi-dimensional benchmark for evaluating LLMs' alignment in\n", + "Chinese. Equipped with a human-in-the-loop data curation pipeline, our\n", + "benchmark employs a rule-calibrated multi-dimensional LLM-as-Judge with\n", + "Chain-of-Thought to generate explanations and final ratings as evaluations,\n", + "ensuring high reliability and interpretability. 
Furthermore, we developed a\n", + "dedicated companion evaluator LLM -- CritiqueLLM, which recovers 95\\% of\n", + "GPT-4's evaluation ability and will be provided via public APIs to researchers\n", + "for evaluation of alignment in Chinese LLMs. All evaluation codes, data, and\n", + "LLM generations are available at \\url{https://github.com/THUDM/AlignBench}.\n", + "\n", + "Title: CritiqueLLM: Scaling LLM-as-Critic for Effective and Explainable Evaluation of Large Language Model Generation\n", + "Authors: Pei Ke, Bosi Wen, Zhuoer Feng, Xiao Liu, Xuanyu Lei, Jiale Cheng, Shengyuan Wang, Aohan Zeng, Yuxiao Dong, Hongning Wang, Jie Tang, Minlie Huang\n", + "Published: 2023-11-30\n", + "Summary: Since the natural language processing (NLP) community started to make large\n", + "language models (LLMs), such as GPT-4, act as a critic to evaluate the quality\n", + "of generated texts, most of them only train a critique generation model of a\n", + "specific scale on specific datasets. We argue that a comprehensive\n", + "investigation on the key factor of LLM-based evaluation models, such as scaling\n", + "properties, is lacking, so that it is still inconclusive whether these models\n", + "have potential to replace GPT-4's evaluation in practical scenarios. In this\n", + "paper, we propose a new critique generation model called CritiqueLLM, which\n", + "includes a dialogue-based prompting method for high-quality referenced /\n", + "reference-free evaluation data. Experimental results show that our model can\n", + "achieve comparable evaluation performance to GPT-4 especially in system-level\n", + "correlations, and even outperform GPT-4 in 3 out of 8 tasks in a challenging\n", + "reference-free setting. We conduct detailed analysis to show promising scaling\n", + "properties of our model in the quality of generated critiques. We also\n", + "demonstrate that our generated critiques can act as scalable feedback to\n", + "directly improve the generation quality of LLMs.\n", + "\n", + "Title: AviationGPT: A Large Language Model for the Aviation Domain\n", + "Authors: Liya Wang, Jason Chou, Xin Zhou, Alex Tien, Diane M Baumgartner\n", + "Published: 2023-11-29\n", + "Summary: The advent of ChatGPT and GPT-4 has captivated the world with large language\n", + "models (LLMs), demonstrating exceptional performance in question-answering,\n", + "summarization, and content generation. The aviation industry is characterized\n", + "by an abundance of complex, unstructured text data, replete with technical\n", + "jargon and specialized terminology. Moreover, labeled data for model building\n", + "are scarce in this domain, resulting in low usage of aviation text data. The\n", + "emergence of LLMs presents an opportunity to transform this situation, but\n", + "there is a lack of LLMs specifically designed for the aviation domain. To\n", + "address this gap, we propose AviationGPT, which is built on open-source LLaMA-2\n", + "and Mistral architectures and continuously trained on a wealth of carefully\n", + "curated aviation datasets. Experimental results reveal that AviationGPT offers\n", + "users multiple advantages, including the versatility to tackle diverse natural\n", + "language processing (NLP) problems (e.g., question-answering, summarization,\n", + "document writing, information extraction, report querying, data cleaning, and\n", + "interactive data exploration). 
It also provides accurate and contextually\n", + "relevant responses within the aviation domain and significantly improves\n", + "performance (e.g., over a 40% performance gain in tested cases). With\n", + "AviationGPT, the aviation industry is better equipped to address more complex\n", + "research problems and enhance the efficiency and safety of National Airspace\n", + "System (NAS) operations.\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001B[33mData_scientist\u001B[0m (to chat_manager):\n", + "\n", + "Based on the output, we have several recent papers related to GPT-4. Let's analyze their summaries to identify potential applications in software:\n", + "\n", + "1. **Unnatural Error Correction: GPT-4 Can Almost Perfectly Handle Unnatural Scrambled Text**\n", + " - **Potential Application**: This paper suggests that GPT-4 has a remarkable ability to correct and understand scrambled text. This could be applied in software for error correction, data cleaning, and improving resilience against data corruption or obfuscation.\n", + "\n", + "2. **Language Model Agents Suffer from Compositional Generalization in Web Automation**\n", + " - **Potential Application**: The paper discusses the performance of GPT-4 in web automation tasks and highlights its limitations in compositional generalization. This indicates that while GPT-4 can be used in web automation software, there is room for improvement, especially in tasks that require understanding and combining different instructions.\n", + "\n", + "3. **AlignBench: Benchmarking Chinese Alignment of Large Language Models**\n", + " - **Potential Application**: This paper introduces a benchmark for evaluating the alignment of Chinese LLMs, including GPT-4. The potential application here is in developing software tools for evaluating and improving the alignment of language models, particularly for non-English languages, which is crucial for creating more inclusive and effective NLP applications.\n", + "\n", + "4. **CritiqueLLM: Scaling LLM-as-Critic for Effective and Explainable Evaluation of Large Language Model Generation**\n", + " - **Potential Application**: The research presents a model for evaluating the quality of text generated by LLMs. Software applications could include automated quality control for content generation, providing feedback for improving language models, and developing more explainable AI systems.\n", + "\n", + "5. **AviationGPT: A Large Language Model for the Aviation Domain**\n", + " - **Potential Application**: The paper proposes a domain-specific LLM for aviation. This model could be applied in software for various NLP tasks within the aviation industry, such as question-answering, summarization, document writing, information extraction, report querying, data cleaning, and interactive data exploration, leading to improved efficiency and safety in aviation operations.\n", + "\n", + "These summaries provide a glimpse into the diverse applications of GPT-4 in software. 
From error correction and web automation to domain-specific applications and model evaluation, GPT-4's capabilities can be leveraged to enhance various aspects of software development and deployment.\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "import autogen\n", + "\n", + "def start_task(execution_task: str, agent_list: list, llm_config: dict):\n", + " config_list = autogen.config_list_from_json(config_path, filter_dict={\"model\": [\"gpt-4-1106-preview\"]})\n", + " \n", + " group_chat = autogen.GroupChat(agents=agent_list, messages=[], max_round=12)\n", + " manager = autogen.GroupChatManager(\n", + " groupchat=group_chat, llm_config={\"config_list\": config_list, **llm_config}\n", + " )\n", + " agent_list[0].initiate_chat(manager, message=execution_task)\n", + "\n", + "start_task(\n", + " execution_task=\"Find a recent paper about gpt-4 on arxiv and find its potential applications in software.\",\n", + " agent_list=agent_list,\n", + " llm_config=default_llm_config\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "22a30e4b4297edd1", + "metadata": { + "collapsed": false + }, + "source": [ + "## Step 6 (Optional): clear all agents and prepare for the next task\n", + "You can clear all agents generated in this task by the following code if your task is completed or the next task is largely different from the current task. If the agent's backbone is an open-source LLM, this process will also shutdown the endpoint server. If necessary, you can use `recycle_endpoint=False` to retain the previous open-source LLMs' endpoint server." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "7fb0bfff01dd1330", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-12-03T13:34:08.429248500Z", + "start_time": "2023-12-03T13:34:08.364799400Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All agents have been cleared.\n" + ] + } + ], + "source": [ + "builder.clear_all_agents(recycle_endpoint=True)" + ] + }, + { + "cell_type": "markdown", + "id": "bbb098638a086898", + "metadata": { + "collapsed": false + }, + "source": [ + "## Save & load configs\n", + "\n", + "You can save all necessary information of the built group chat agents. Here is a case for those agents generated in the above task:\n", + "```json\n", + "{\n", + " \"building_task\": \"Find a paper on arxiv by programming, and analysis its application in some domain. For example, find a latest paper about gpt-4 on arxiv and find its potential applications in software.\",\n", + " \"agent_configs\": [\n", + " {\n", + " \"name\": \"Data_scientist\",\n", + " \"model\": \"gpt-4-1106-preview\",\n", + " \"system_message\": \"As a Data Scientist, you are tasked with automating the retrieval and analysis of academic papers from arXiv. Utilize your Python programming acumen to develop scripts for gathering necessary information such as searching for relevant papers, downloading them, and processing their contents. Apply your analytical and language skills to interpret the data and deduce the applications of the research within specific domains.\\n\\n1. To compile information, write and implement Python scripts that search and interact with online resources, download and read files, extract content from documents, and perform other information-gathering tasks. Use the printed output as the foundation for your subsequent analysis.\\n\\n2. 
Execute tasks programmatically with Python scripts when possible, ensuring results are directly displayed. Approach each task with efficiency and strategic thinking.\\n\\nProgress through tasks systematically. In instances where a strategy is not provided, outline your plan before executing. Clearly distinguish between tasks handled via code and those utilizing your analytical expertise.\\n\\nWhen providing code, include only Python scripts meant to be run without user alterations. Users should execute your script as is, without modifications:\\n\\n```python\\n# filename: \\n# Python script\\nprint(\\\"Your output\\\")\\n```\\n\\nUsers should not perform any actions other than running the scripts you provide. Avoid presenting partial or incomplete scripts that require user adjustments. Refrain from requesting users to copy-paste results; instead, use the 'print' function when suitable to display outputs. Monitor the execution results they share.\\n\\nIf an error surfaces, supply corrected scripts for a re-run. If the strategy fails to resolve the issue, reassess your assumptions, gather additional details as needed, and explore alternative approaches.\\n\\nUpon successful completion of a task and verification of the results, confirm the achievement of the stated objective. Ensuring accuracy and validity of the findings is paramount. Evidence supporting your conclusions should be provided when feasible.\\n\\nUpon satisfying the user's needs and ensuring all tasks are finalized, conclude your assistance with \\\"TERMINATE\\\".\"\n", + " },\n", + " {\n", + " \"name\": \"Research_analyst\",\n", + " \"model\": \"gpt-4-1106-preview\",\n", + " \"system_message\": \"As a Research Analyst, you are expected to be a proficient AI assistant possessing a strong grasp of programming, specifically in Python, and robust analytical capabilities. Your primary responsibilities will include:\\n\\n1. Conducting comprehensive searches and retrieving information autonomously through Python scripts, such as querying databases, accessing web services (like arXiv), downloading and reading files, and retrieving system information.\\n2. Analyzing the content of the retrieved documents, particularly academic papers, and extracting insights regarding their application in specific domains, such as the potential uses of GPT-4 in software development.\\n3. Presenting your findings in a clear, detailed manner, explaining the implications of the research and its relevance to the assigned task.\\n4. Employing your programming skills to automate tasks where possible, ensuring the output is delivered through Python code with clear, executable instructions. Your code will be designed for the user to execute without amendment or additional input.\\n5. Verifying the results of information gathering and analysis to ensure accuracy and completeness, providing evidence to support your conclusions when available.\\n6. Communicating the completion of each task and confirming that the user's needs have been satisfied through a clear and conclusive statement, followed by the word \\\"TERMINATE\\\" to signal the end of the interaction.\"\n", + " },\n", + " {\n", + " \"name\": \"Software_developer\",\n", + " \"model\": \"gpt-4-1106-preview\",\n", + " \"system_message\": \"As a dedicated AI assistant for a software developer, your role involves employing your Python programming prowess and proficiency in natural language processing to facilitate the discovery and analysis of scholarly articles on arXiv. 
Your tasks include crafting Python scripts to automatically search, retrieve, and present information regarding the latest research, with a focus on applicable advancements in technology such as GPT-4 and its potential impact on the domain of software development.\\n\\n1. Utilize Python to programmatically seek out and extract pertinent data, for example, navigating or probing the web, downloading/ingesting documents, or showcasing content from web pages or files. When enough information has been accumulated to proceed, you will then analyze and interpret the findings.\\n\\n2. When there's a need to perform an operation programmatically, your Python code should accomplish the task and manifest the outcome. Progress through the task incrementally and systematically.\\n\\nProvide a clear plan outlining each stage of the task, specifying which components will be executed through Python coding and which through your linguistic capabilities. When proposing Python code, remember to:\\n\\n- Label the script type within the code block\\n- Avoid suggesting code that the user would need to alter\\n- Refrain from including more than one code block in your response\\n- Circumvent requesting the user to manually transcribe any results; utilize 'print' statements where applicable\\n- Examine the user's reported execution outcomes\\n\\nIf an error arises, your responsibility is to rectify the issue and submit the corrected script. Should an error remain unresolvable, or if the task remains incomplete post successful code execution, re-evaluate the scenario, gather any further required information, and formulate an alternative approach.\\n\\nUpon confirming that the task has been satisfactorily accomplished and the user's requirements have been met, indicate closure of the procedure with a concluding statement.\"\n", + " }\n", + " ],\n", + " \"coding\": true,\n", + " \"default_llm_config\": {\n", + " \"temperature\": 0\n", + " }\n", + "}\n", + "```\n", + "These information will be saved in JSON format. You can provide a specific filename, otherwise, AgentBuilder will save config to the current path with a generated filename 'save_config_TASK_MD5.json'." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e4b88a5d482ceba4", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-12-03T13:34:09.652503400Z", + "start_time": "2023-12-03T13:34:09.639760500Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Building config saved to ./save_config_2a87f83309bd8b5ebb0423f5570385f5.json\n" + ] + } + ], + "source": [ + "saved_path = builder.save()" + ] + }, + { + "cell_type": "markdown", + "id": "a35620c10ee42be", + "metadata": { + "collapsed": false + }, + "source": [ + "After that, you can load the saved config and skip the building process. AgentBuilder will create agents with those information without prompting the builder manager." 
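+    "\n",
+    "For example, assuming `saved_path` from the previous step, here is a sketch of reusing the loaded agents with the `start_task` helper defined earlier:\n",
+    "```python\n",
+    "new_agent_list, new_agent_configs = builder.load(saved_path)\n",
+    "start_task(\n",
+    "    execution_task=\"Find a recent paper about Llava on arxiv and find its potential applications in computer vision.\",\n",
+    "    agent_list=new_agent_list,\n",
+    "    llm_config=default_llm_config\n",
+    ")\n",
+    "```"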
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "34addd498e5ab174", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-12-03T13:34:56.857918300Z", + "start_time": "2023-12-03T13:34:11.108958800Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loding config from ./save_config_2a87f83309bd8b5ebb0423f5570385f5.json\n", + "Creating agent Data_scientist with backbone gpt-4-1106-preview...\n", + "Creating agent Research_analyst with backbone gpt-4-1106-preview...\n", + "Creating agent Software_developer with backbone gpt-4-1106-preview...\n", + "Adding user console proxy...\n", + "\u001B[33mUser_console_and_Python_code_interpreter\u001B[0m (to chat_manager):\n", + "Find a recent paper about Llava on arxiv and find its potential applications in computer vision.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001B[33mData_scientist\u001B[0m (to chat_manager):\n", + "\n", + "To find a recent paper about \"Llava\" on arXiv, we can use the arXiv API to search for papers that match this keyword. However, it's important to note that \"Llava\" might be a typo or a less common term. If you meant \"Lava\" or another term, please correct me. Assuming \"Llava\" is the correct term, I will proceed with that.\n", + "\n", + "Here's a Python script that uses the `arxiv` library to search for papers related to \"Llava\". If the `arxiv` library is not installed on your system, you can install it using `pip install arxiv`.\n", + "\n", + "```python\n", + "# filename: arxiv_search.py\n", + "\n", + "import arxiv\n", + "\n", + "# Define the search query and parameters\n", + "search_query = 'all:Llava'\n", + "max_results = 10\n", + "\n", + "# Search arXiv for papers related to the search query\n", + "def search_papers(query, max_results):\n", + " search = arxiv.Search(\n", + " query=query,\n", + " max_results=max_results,\n", + " sort_by=arxiv.SortCriterion.SubmittedDate\n", + " )\n", + " for result in search.results():\n", + " print(f\"Title: {result.title}\")\n", + " print(f\"Authors: {', '.join(author.name for author in result.authors)}\")\n", + " print(f\"Abstract: {result.summary}\")\n", + " print(f\"URL: {result.entry_id}\")\n", + " print(f\"Published: {result.published}\")\n", + " print(\"\")\n", + "\n", + "# Run the search and print the results\n", + "search_papers(search_query, max_results)\n", + "```\n", + "\n", + "To execute this script, save it to a file named `arxiv_search.py` and run it using a Python interpreter. The script will print out the titles, authors, abstracts, and URLs of up to 10 recent papers related to \"Llava\".\n", + "\n", + "Once we have the papers, we can analyze their abstracts to determine potential applications in computer vision. However, this part of the task will require human analysis and cannot be fully automated, as it involves understanding and interpreting the content of the papers. 
If the script finds relevant papers, I will proceed with the analysis based on the abstracts provided.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001B[31m\n", + ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001B[0m\n", + "\u001B[33mUser_console_and_Python_code_interpreter\u001B[0m (to chat_manager):\n", + "\n", + "exitcode: 0 (execution succeeded)\n", + "Code output: \n", + "Title: Contrastive Vision-Language Alignment Makes Efficient Instruction Learner\n", + "Authors: Lizhao Liu, Xinyu Sun, Tianhang Xiang, Zhuangwei Zhuang, Liuren Yin, Mingkui Tan\n", + "Abstract: We study the task of extending the large language model (LLM) into a\n", + "vision-language instruction-following model. This task is crucial but\n", + "challenging since the LLM is trained on text modality only, making it hard to\n", + "effectively digest the visual modality. To address this, existing methods\n", + "typically train a visual adapter to align the representation between a\n", + "pre-trained vision transformer (ViT) and the LLM by a generative image\n", + "captioning loss. However, we find that the generative objective can only\n", + "produce weak alignment for vision and language, making the aligned\n", + "vision-language model very hungry for the instruction fine-tuning data. In this\n", + "paper, we propose CG-VLM that applies both Contrastive and Generative alignment\n", + "objectives to effectively align the representation of ViT and LLM. Different\n", + "from image level and sentence level alignment in common contrastive learning\n", + "settings, CG-VLM aligns the image-patch level features and text-token level\n", + "embeddings, which, however, is very hard to achieve as no explicit grounding\n", + "patch-token relation provided in standard image captioning datasets. To address\n", + "this issue, we propose to maximize the averaged similarity between pooled\n", + "image-patch features and text-token embeddings. Extensive experiments\n", + "demonstrate that the proposed CG-VLM produces strong vision-language alignment\n", + "and is an efficient instruction learner. For example, using only 10%\n", + "instruction tuning data, we reach 95% performance of state-of-the-art method\n", + "LLaVA [29] on the zero-shot ScienceQA-Image benchmark.\n", + "URL: http://arxiv.org/abs/2311.17945v1\n", + "Published: 2023-11-29 03:29:46+00:00\n", + "\n", + "Title: Mitigating Hallucination in Visual Language Models with Visual Supervision\n", + "Authors: Zhiyang Chen, Yousong Zhu, Yufei Zhan, Zhaowen Li, Chaoyang Zhao, Jinqiao Wang, Ming Tang\n", + "Abstract: Large vision-language models (LVLMs) suffer from hallucination a lot,\n", + "generating responses that apparently contradict to the image content\n", + "occasionally. The key problem lies in its weak ability to comprehend detailed\n", + "content in a multi-modal context, which can be mainly attributed to two factors\n", + "in training data and loss function. The vision instruction dataset primarily\n", + "focuses on global description, and the auto-regressive loss function favors\n", + "text modeling rather than image understanding. In this paper, we bring more\n", + "detailed vision annotations and more discriminative vision models to facilitate\n", + "the training of LVLMs, so that they can generate more precise responses without\n", + "encounter hallucination. On one hand, we generate image-text pairs with\n", + "detailed relationship annotations in panoptic scene graph dataset (PSG). 
These\n", + "conversations pay more attention on detailed facts in the image, encouraging\n", + "the model to answer questions based on multi-modal contexts. On the other hand,\n", + "we integrate SAM and mask prediction loss as auxiliary supervision, forcing the\n", + "LVLMs to have the capacity to identify context-related objects, so that they\n", + "can generate more accurate responses, mitigating hallucination. Moreover, to\n", + "provide a deeper evaluation on the hallucination in LVLMs, we propose a new\n", + "benchmark, RAH-Bench. It divides vision hallucination into three different\n", + "types that contradicts the image with wrong categories, attributes or\n", + "relations, and introduces False Positive Rate as detailed sub-metric for each\n", + "type. In this benchmark, our approach demonstrates an +8.4% enhancement\n", + "compared to original LLaVA and achieves widespread performance improvements\n", + "across other models.\n", + "URL: http://arxiv.org/abs/2311.16479v1\n", + "Published: 2023-11-27 09:30:02+00:00\n", + "\n", + "Title: PG-Video-LLaVA: Pixel Grounding Large Video-Language Models\n", + "Authors: Shehan Munasinghe, Rusiru Thushara, Muhammad Maaz, Hanoona Abdul Rasheed, Salman Khan, Mubarak Shah, Fahad Khan\n", + "Abstract: Extending image-based Large Multimodal Models (LMM) to videos is challenging\n", + "due to the inherent complexity of video data. The recent approaches extending\n", + "image-based LMM to videos either lack the grounding capabilities (e.g.,\n", + "VideoChat, Video-ChatGPT, Video-LLaMA) or do not utilize the audio-signals for\n", + "better video understanding (e.g., Video-ChatGPT). Addressing these gaps, we\n", + "propose Video-LLaVA, the first LMM with pixel-level grounding capability,\n", + "integrating audio cues by transcribing them into text to enrich video-context\n", + "understanding. Our framework uses an off-the-shelf tracker and a novel\n", + "grounding module, enabling it to spatially and temporally localize objects in\n", + "videos following user instructions. We evaluate Video-LLaVA using video-based\n", + "generative and question-answering benchmarks and introduce new benchmarks\n", + "specifically designed to measure prompt-based object grounding performance in\n", + "videos. Further, we propose the use of Vicuna over GPT-3.5, as utilized in\n", + "Video-ChatGPT, for video-based conversation benchmarking, ensuring\n", + "reproducibility of results which is a concern with the proprietary nature of\n", + "GPT-3.5. Our framework builds on SoTA image-based LLaVA model and extends its\n", + "advantages to the video domain, delivering promising gains on video-based\n", + "conversation and grounding tasks. Project Page:\n", + "https://github.com/mbzuai-oryx/Video-LLaVA\n", + "URL: http://arxiv.org/abs/2311.13435v1\n", + "Published: 2023-11-22 14:48:30+00:00\n", + "\n", + "Title: ShareGPT4V: Improving Large Multi-Modal Models with Better Captions\n", + "Authors: Lin Chen, Jinsong Li, Xiaoyi Dong, Pan Zhang, Conghui He, Jiaqi Wang, Feng Zhao, Dahua Lin\n", + "Abstract: In the realm of large multi-modal models (LMMs), efficient modality alignment\n", + "is crucial yet often constrained by the scarcity of high-quality image-text\n", + "data. 
To address this bottleneck, we introduce the ShareGPT4V dataset, a\n", + "pioneering large-scale resource featuring 1.2 million highly descriptive\n", + "captions, which surpasses existing datasets in diversity and information\n", + "content, covering world knowledge, object properties, spatial relationships,\n", + "and aesthetic evaluations. Specifically, ShareGPT4V originates from a curated\n", + "100K high-quality captions collected from advanced GPT4-Vision and has been\n", + "expanded to 1.2M with a superb caption model trained on this subset. ShareGPT4V\n", + "first demonstrates its effectiveness for the Supervised Fine-Tuning (SFT)\n", + "phase, by substituting an equivalent quantity of detailed captions in existing\n", + "SFT datasets with a subset of our high-quality captions, significantly\n", + "enhancing the LMMs like LLaVA-7B, LLaVA-1.5-13B, and Qwen-VL-Chat-7B on the MME\n", + "and MMBench benchmarks, with respective gains of 222.8/22.0/22.3 and\n", + "2.7/1.3/1.5. We further incorporate ShareGPT4V data into both the pre-training\n", + "and SFT phases, obtaining ShareGPT4V-7B, a superior LMM based on a simple\n", + "architecture that has remarkable performance across a majority of the\n", + "multi-modal benchmarks. This project is available at\n", + "https://ShareGPT4V.github.io to serve as a pivotal resource for advancing the\n", + "LMMs community.\n", + "URL: http://arxiv.org/abs/2311.12793v2\n", + "Published: 2023-11-21 18:58:11+00:00\n", + "\n", + "Title: VLM-Eval: A General Evaluation on Video Large Language Models\n", + "Authors: Shuailin Li, Yuang Zhang, Yucheng Zhao, Qiuyue Wang, Fan Jia, Yingfei Liu, Tiancai Wang\n", + "Abstract: Despite the rapid development of video Large Language Models (LLMs), a\n", + "comprehensive evaluation is still absent. In this paper, we introduce a unified\n", + "evaluation that encompasses multiple video tasks, including captioning,\n", + "question and answering, retrieval, and action recognition. In addition to\n", + "conventional metrics, we showcase how GPT-based evaluation can match human-like\n", + "performance in assessing response quality across multiple aspects. We propose a\n", + "simple baseline: Video-LLaVA, which uses a single linear projection and\n", + "outperforms existing video LLMs. Finally, we evaluate video LLMs beyond\n", + "academic datasets, which show encouraging recognition and reasoning\n", + "capabilities in driving scenarios with only hundreds of video-instruction pairs\n", + "for fine-tuning. We hope our work can serve as a unified evaluation for video\n", + "LLMs, and help expand more practical scenarios. The evaluation code will be\n", + "available soon.\n", + "URL: http://arxiv.org/abs/2311.11865v1\n", + "Published: 2023-11-20 16:02:10+00:00\n", + "\n", + "Title: Video-LLaVA: Learning United Visual Representation by Alignment Before Projection\n", + "Authors: Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, Li Yuan\n", + "Abstract: The Large Vision-Language Model (LVLM) has enhanced the performance of\n", + "various downstream tasks in visual-language understanding. Most existing\n", + "approaches encode images and videos into separate feature spaces, which are\n", + "then fed as inputs to large language models. However, due to the lack of\n", + "unified tokenization for images and videos, namely misalignment before\n", + "projection, it becomes challenging for a Large Language Model (LLM) to learn\n", + "multi-modal interactions from several poor projection layers. 
In this work, we\n", + "unify visual representation into the language feature space to advance the\n", + "foundational LLM towards a unified LVLM. As a result, we establish a simple but\n", + "robust LVLM baseline, Video-LLaVA, which learns from a mixed dataset of images\n", + "and videos, mutually enhancing each other. Video-LLaVA achieves superior\n", + "performances on a broad range of 9 image benchmarks across 5 image\n", + "question-answering datasets and 4 image benchmark toolkits. Additionally, our\n", + "Video-LLaVA also outperforms Video-ChatGPT by 5.8%, 9.9%, 18.6%, and 10.1% on\n", + "MSRVTT, MSVD, TGIF, and ActivityNet, respectively. Notably, extensive\n", + "experiments demonstrate that Video-LLaVA mutually benefits images and videos\n", + "within a unified visual representation, outperforming models designed\n", + "specifically for images or videos. We aim for this work to provide modest\n", + "insights into the multi-modal inputs for the LLM.\n", + "URL: http://arxiv.org/abs/2311.10122v2\n", + "Published: 2023-11-16 10:59:44+00:00\n", + "\n", + "Title: Unlock the Power: Competitive Distillation for Multi-Modal Large Language Models\n", + "Authors: Xinwei Li, Li Lin, Shuai Wang, Chen Qian\n", + "Abstract: Recently, multi-modal content generation has attracted lots of attention from\n", + "researchers by investigating the utilization of visual instruction tuning based\n", + "on large language models (LLMs). To enhance the performance and generalization\n", + "ability of such LLMs, the practice of distilling knowledge from pretrained\n", + "multi-modal models (a.k.a. teachers) to more compact multi-modal LLMs\n", + "(students) has gained considerable interest. However, the prevailing paradigm\n", + "of instructiontuning in multi-modal LLMs knowledge distillation is\n", + "resource-intensive and unidirectional, neglecting the potential for mutual\n", + "feedback between the student and teacher models. Thus, we propose an innovative\n", + "Competitive Multi-modal Distillation framework (CoMD), which captures\n", + "bidirectional feedback between teacher and student models and continually\n", + "updates the multi-modal capabilities that the student model has learned. It\n", + "comprises two stages: multi-modal pre-training and multi-modal competitive\n", + "distillation. The first stage pre-trains the student model on a large number of\n", + "filtered multi-modal datasets. The second stage facilitates a bidirectional\n", + "knowledge transfer between the student and teacher models. Our experimental\n", + "analysis of diverse datasets shows that our knowledge transfer method\n", + "consistently improves the capabilities of the student model. Finally, the\n", + "7B-sized student model after four distillations surpassed the current\n", + "state-of-the-art model LLaVA-13B on the ScienceQA and LLaVA Test dataset, also\n", + "outperforms other strong baselines in the zero-shot setting.\n", + "URL: http://arxiv.org/abs/2311.08213v1\n", + "Published: 2023-11-14 14:49:46+00:00\n", + "\n", + "Title: Improving hateful memes detection via learning hatefulness-aware embedding space through retrieval-guided contrastive learning\n", + "Authors: Jingbiao Mei, Jinghong Chen, Weizhe Lin, Bill Byrne, Marcus Tomalin\n", + "Abstract: Hateful memes have emerged as a significant concern on the Internet. These\n", + "memes, which are a combination of image and text, often convey messages vastly\n", + "different from their individual meanings. 
Thus, detecting hateful memes\n", + "requires the system to jointly understand the visual and textual modalities.\n", + "However, our investigation reveals that the embedding space of existing\n", + "CLIP-based systems lacks sensitivity to subtle differences in memes that are\n", + "vital for correct hatefulness classification. To address this issue, we propose\n", + "constructing a hatefulness-aware embedding space through retrieval-guided\n", + "contrastive training. Specifically, we add an auxiliary loss that utilizes hard\n", + "negative and pseudo-gold samples to train the embedding space. Our approach\n", + "achieves state-of-the-art performance on the HatefulMemes dataset with an AUROC\n", + "of 86.7. Notably, our approach outperforms much larger fine-tuned Large\n", + "Multimodal Models like Flamingo and LLaVA. Finally, we demonstrate a\n", + "retrieval-based hateful memes detection system, which is capable of making\n", + "hatefulness classification based on data unseen in training from a database.\n", + "This allows developers to update the hateful memes detection system by simply\n", + "adding new data without retraining, a desirable feature for real services in\n", + "the constantly-evolving landscape of hateful memes on the Internet.\n", + "URL: http://arxiv.org/abs/2311.08110v1\n", + "Published: 2023-11-14 12:14:54+00:00\n", + "\n", + "Title: To See is to Believe: Prompting GPT-4V for Better Visual Instruction Tuning\n", + "Authors: Junke Wang, Lingchen Meng, Zejia Weng, Bo He, Zuxuan Wu, Yu-Gang Jiang\n", + "Abstract: Existing visual instruction tuning methods typically prompt large language\n", + "models with textual descriptions to generate instruction-following data.\n", + "Despite the promising performance achieved, these descriptions are derived from\n", + "image annotations, which are oftentimes coarse-grained. Furthermore, the\n", + "instructions might even contradict the visual content without observing the\n", + "entire visual context. To address this challenge, we introduce a fine-grained\n", + "visual instruction dataset, LVIS-Instruct4V, which contains 220K visually\n", + "aligned and context-aware instructions produced by prompting the powerful\n", + "GPT-4V with images from LVIS. Through experimental validation and case studies,\n", + "we demonstrate that high-quality visual instructional data could improve the\n", + "performance of LLaVA-1.5, a state-of-the-art large multimodal model, across a\n", + "wide spectrum of benchmarks by clear margins. Notably, by simply replacing the\n", + "LLaVA-Instruct with our LVIS-Instruct4V, we achieve better results than LLaVA\n", + "on most challenging LMM benchmarks, e.g., LLaVA$^w$ (76.7 vs. 70.7) and MM-Vet\n", + "(40.2 vs. 35.4). We release our data and model at\n", + "https://github.com/X2FD/LVIS-INSTRUCT4V.\n", + "URL: http://arxiv.org/abs/2311.07574v2\n", + "Published: 2023-11-13 18:59:31+00:00\n", + "\n", + "Title: MEGAVERSE: Benchmarking Large Language Models Across Languages, Modalities, Models and Tasks\n", + "Authors: Sanchit Ahuja, Divyanshu Aggarwal, Varun Gumma, Ishaan Watts, Ashutosh Sathe, Millicent Ochieng, Rishav Hada, Prachi Jain, Maxamed Axmed, Kalika Bali, Sunayana Sitaram\n", + "Abstract: Recently, there has been a rapid advancement in research on Large Language\n", + "Models (LLMs), resulting in significant progress in several Natural Language\n", + "Processing (NLP) tasks. Consequently, there has been a surge in LLM evaluation\n", + "research to comprehend the models' capabilities and limitations. 
However, much\n", + "of this research has been confined to the English language, leaving LLM\n", + "building and evaluation for non-English languages relatively unexplored. There\n", + "has been an introduction of several new LLMs, necessitating their evaluation on\n", + "non-English languages. This study aims to expand our MEGA benchmarking suite by\n", + "including six new datasets to form the MEGAVERSE benchmark. The benchmark\n", + "comprises 22 datasets covering 81 languages, including low-resource African\n", + "languages. We evaluate several state-of-the-art LLMs like GPT-3.5-Turbo, GPT4,\n", + "PaLM2, and Llama2 on the MEGAVERSE datasets. Additionally, we include two\n", + "multimodal datasets in the benchmark and assess the performance of the\n", + "LLaVa-v1.5 model. Our experiments suggest that GPT4 and PaLM2 outperform the\n", + "Llama models on various tasks, notably on low-resource languages, with GPT4\n", + "outperforming PaLM2 on more datasets than vice versa. However, issues such as\n", + "data contamination must be addressed to obtain an accurate assessment of LLM\n", + "performance on non-English languages.\n", + "URL: http://arxiv.org/abs/2311.07463v1\n", + "Published: 2023-11-13 16:45:37+00:00\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001B[33mData_scientist\u001B[0m (to chat_manager):\n", + "\n", + "Based on the search results, it appears that \"LLaVA\" is a term related to Large Language Models (LLMs) and their applications in vision-language tasks. The papers listed discuss various aspects of LLaVA and its applications, including instruction learning, hallucination mitigation, video understanding, and more.\n", + "\n", + "From the abstracts, we can see that LLaVA and its variants are being used to improve the alignment between visual and language representations, which is crucial for tasks such as image captioning, visual question answering, and video understanding. These models are designed to process and understand multi-modal data, combining visual information with textual instructions or queries.\n", + "\n", + "For example, the paper titled \"Contrastive Vision-Language Alignment Makes Efficient Instruction Learner\" discusses how to align the representation of a Vision Transformer (ViT) with an LLM to create an efficient instruction learner for vision-language tasks. Another paper, \"PG-Video-LLaVA: Pixel Grounding Large Video-Language Models,\" extends the capabilities of LLaVA to videos, enabling the model to spatially and temporally localize objects in videos following user instructions.\n", + "\n", + "The potential applications in computer vision are vast and include:\n", + "\n", + "1. Image and video captioning: Generating descriptive text for images and videos.\n", + "2. Visual question answering: Answering questions based on visual content.\n", + "3. Object detection and localization: Identifying and locating objects in images and videos.\n", + "4. Video understanding: Interpreting actions, events, and narratives in video content.\n", + "5. Hallucination mitigation: Reducing instances where the model generates responses that contradict the visual content.\n", + "\n", + "These applications are crucial for developing more intelligent and interactive AI systems that can understand and respond to visual content in a human-like manner. 
The research on LLaVA and related models is contributing to the advancement of multi-modal AI, which can have significant implications for fields such as autonomous vehicles, assistive technologies, content moderation, and more.\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n", + "All agents have been cleared.\n" + ] + } + ], + "source": [ + "new_builder = AgentBuilder(config_path=config_path)\n", + "agent_list, agent_configs = new_builder.load(saved_path) # load previous agent configs\n", + "start_task(\n", + " execution_task=\"Find a recent paper about Llava on arxiv and find its potential applications in computer vision.\",\n", + " agent_list=agent_list,\n", + " llm_config=default_llm_config\n", + ")\n", + "new_builder.clear_all_agents()" + ] + }, + { + "cell_type": "markdown", + "id": "32e0cf8f09eef5cd", + "metadata": { + "collapsed": false + }, + "source": [ + "## Use OpenAI Assistant\n", + "\n", + "[The Assistants API](https://platform.openai.com/docs/assistants/overview) allows you to build AI assistants within your own applications. An Assistant has instructions and can leverage models, tools, and knowledge to respond to user queries.\n", + "AutoBuild also support assistant api by adding `use_oai_assistant=True` to `build()`." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "4051c25b2cd1918c", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-12-03T13:59:35.497212500Z", + "start_time": "2023-12-03T13:47:45.765859300Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating agents...\n", + "Data_scientist,Machine_learning_engineer,Research_analyst are generated.\n", + "Preparing configuration for Data_scientist...\n", + "Preparing configuration for Machine_learning_engineer...\n", + "Preparing configuration for Research_analyst...\n", + "Creating agent Data_scientist with backbone gpt-4-1106-preview...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Multiple assistants with name Data_scientist found. Using the first assistant in the list. Please specify the assistant ID in llm_config to use a specific assistant.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating agent Machine_learning_engineer with backbone gpt-4-1106-preview...\n", + "Creating agent Research_analyst with backbone gpt-4-1106-preview...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Multiple assistants with name Research_analyst found. Using the first assistant in the list. Please specify the assistant ID in llm_config to use a specific assistant.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Adding user console proxy...\n", + "\u001B[33mUser_console_and_Python_code_interpreter\u001B[0m (to chat_manager):\n", + "Find a recent paper about XAI on arxiv and find its potential applications in medical.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001B[33mData_scientist\u001B[0m (to chat_manager):\n", + "\n", + "We will start by searching for a recent paper about Explainable Artificial Intelligence (XAI) on arXiv. To do this, I'll write a Python script that uses the arXiv API to fetch relevant papers. We will then look at the most recent paper and analyze its content to identify potential applications within the medical domain. 
\n", + "\n", + "Let's start by writing the script to search for papers on arXiv. This code will fetch papers related to XAI that are latest:\n", + "\n", + "```python\n", + "import urllib\n", + "import feedparser\n", + "\n", + "# Base api query url\n", + "base_url = 'http://export.arxiv.org/api/query?'\n", + "\n", + "# Search parameters\n", + "search_query = 'all:XAI AND cat:cs.AI' # search for XAI in the field of computer science artificial intelligence\n", + "start = 0 # start at the first result\n", + "total_results = 1 # want only the most recent result\n", + "results_per_iteration = 1 # results per query fetch\n", + "wait_time = 5 # seconds to wait between fetches, as a courtesy to arXiv's servers\n", + "\n", + "# Form the full query url\n", + "query = f'search_query={search_query}&start={start}&max_results={total_results}'\n", + "\n", + "# Perform a GET request to the arXiv API and parse the results\n", + "response = urllib.request.urlopen(base_url+query).read()\n", + "feed = feedparser.parse(response)\n", + "\n", + "# Print out the information from the most recent paper on XAI\n", + "for entry in feed.entries:\n", + " print('Title: %s' % entry.title)\n", + " print('Authors: %s' % ', '.join(author.name for author in entry.authors))\n", + " print('Published: %s' % entry.published)\n", + " print('Summary: %s' % entry.summary)\n", + " print('arXiv ID: %s' % entry.id.split('/abs/')[-1])\n", + " print('Link: %s' % entry.id)\n", + "```\n", + "\n", + "Please run this script in a Python environment to fetch the information about the most recent paper on XAI. After running this code, we will get the title, authors, publication date, summary, arXiv ID, and a link to the paper. Once we have the paper information, we can proceed to analyze it and discuss potential medical applications.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001B[31m\n", + ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001B[0m\n", + "\u001B[33mUser_console_and_Python_code_interpreter\u001B[0m (to chat_manager):\n", + "\n", + "exitcode: 1 (execution failed)\n", + "Code output: \n", + "Traceback (most recent call last):\n", + " File \"\", line 18, in \n", + " response = urllib.request.urlopen(base_url+query).read()\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/elpis_ubuntu/anaconda3/envs/llm/lib/python3.11/urllib/request.py\", line 216, in urlopen\n", + " return opener.open(url, data, timeout)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/elpis_ubuntu/anaconda3/envs/llm/lib/python3.11/urllib/request.py\", line 519, in open\n", + " response = self._open(req, data)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/elpis_ubuntu/anaconda3/envs/llm/lib/python3.11/urllib/request.py\", line 536, in _open\n", + " result = self._call_chain(self.handle_open, protocol, protocol +\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/elpis_ubuntu/anaconda3/envs/llm/lib/python3.11/urllib/request.py\", line 496, in _call_chain\n", + " result = func(*args)\n", + " ^^^^^^^^^^^\n", + " File \"/home/elpis_ubuntu/anaconda3/envs/llm/lib/python3.11/urllib/request.py\", line 1377, in http_open\n", + " return self.do_open(http.client.HTTPConnection, req)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/elpis_ubuntu/anaconda3/envs/llm/lib/python3.11/urllib/request.py\", line 1348, in do_open\n", + " h.request(req.get_method(), req.selector, req.data, headers,\n", + " File 
\"/home/elpis_ubuntu/anaconda3/envs/llm/lib/python3.11/http/client.py\", line 1286, in request\n", + " self._send_request(method, url, body, headers, encode_chunked)\n", + " File \"/home/elpis_ubuntu/anaconda3/envs/llm/lib/python3.11/http/client.py\", line 1297, in _send_request\n", + " self.putrequest(method, url, **skips)\n", + " File \"/home/elpis_ubuntu/anaconda3/envs/llm/lib/python3.11/http/client.py\", line 1131, in putrequest\n", + " self._validate_path(url)\n", + " File \"/home/elpis_ubuntu/anaconda3/envs/llm/lib/python3.11/http/client.py\", line 1231, in _validate_path\n", + " raise InvalidURL(f\"URL can't contain control characters. {url!r} \"\n", + "http.client.InvalidURL: URL can't contain control characters. '/api/query?search_query=all:XAI AND cat:cs.AI&start=0&max_results=1' (found at least ' ')\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001B[33mData_scientist\u001B[0m (to chat_manager):\n", + "\n", + "It seems there's an issue with the URL encoding in the script when making the request to the arXiv API. The query parameters need to be properly encoded to ensure that spaces and special characters are handled correctly. Let's correct the script by encoding the query parameters.\n", + "\n", + "Below is the corrected Python script. Please run it to fetch the information about the most recent paper on Explainable Artificial Intelligence (XAI) from arXiv:\n", + "\n", + "```python\n", + "import urllib.parse\n", + "import urllib.request\n", + "import feedparser\n", + "\n", + "# Base api query url\n", + "base_url = 'http://export.arxiv.org/api/query?'\n", + "\n", + "# Search parameters\n", + "search_query = 'all:XAI AND cat:cs.AI' # search for XAI in the field of computer science artificial intelligence\n", + "start = 0 # start at the first result\n", + "total_results = 1 # want only the most recent result\n", + "results_per_iteration = 1 # results per query fetch\n", + "\n", + "# Encode the query parameters\n", + "query = f'search_query={urllib.parse.quote(search_query)}&start={start}&max_results={total_results}'\n", + "\n", + "# Perform a GET request to the arXiv API and parse the results\n", + "response = urllib.request.urlopen(base_url+query).read()\n", + "feed = feedparser.parse(response)\n", + "\n", + "# Print out the information from the most recent paper on XAI\n", + "for entry in feed.entries:\n", + " print('Title:', entry.title)\n", + " print('Authors:', ', '.join(author.name for author in entry.authors))\n", + " print('Published:', entry.published)\n", + " print('Summary:', entry.summary)\n", + " print('arXiv ID:', entry.id.split('/abs/')[-1])\n", + " print('Link:', entry.id)\n", + "```\n", + "\n", + "This updated script ensures that the search query is properly encoded before making the request to arXiv. Once you run the updated version, you should be able to retrieve the details of the most recent paper on XAI. 
We can then move to the analysis part and discuss potential applications in the medical field.\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001B[31m\n", + ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001B[0m\n", + "\u001B[33mUser_console_and_Python_code_interpreter\u001B[0m (to chat_manager):\n", + "\n", + "exitcode: 0 (execution succeeded)\n", + "Code output: \n", + "Title: A Critical Survey on Fairness Benefits of XAI\n", + "Authors: Luca Deck, Jakob Schoeffer, Maria De-Arteaga, Niklas Kühl\n", + "Published: 2023-10-15T08:17:45Z\n", + "Summary: In this critical survey, we analyze typical claims on the relationship\n", + "between explainable AI (XAI) and fairness to disentangle the multidimensional\n", + "relationship between these two concepts. Based on a systematic literature\n", + "review and a subsequent qualitative content analysis, we identify seven\n", + "archetypal claims from 175 papers on the alleged fairness benefits of XAI. We\n", + "present crucial caveats with respect to these claims and provide an entry point\n", + "for future discussions around the potentials and limitations of XAI for\n", + "specific fairness desiderata. While the literature often suggests XAI to be an\n", + "enabler for several fairness desiderata, we notice a divide between these\n", + "desiderata and the capabilities of XAI. We encourage to conceive XAI as one of\n", + "many tools to approach the multidimensional, sociotechnical challenge of\n", + "algorithmic fairness and to be more specific about how exactly what kind of XAI\n", + "method enables whom to address which fairness desideratum.\n", + "arXiv ID: 2310.13007v4\n", + "Link: http://arxiv.org/abs/2310.13007v4\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001B[33mResearch_analyst\u001B[0m (to chat_manager):\n", + "\n", + "Based on the output provided, the most recent paper on Explainable Artificial Intelligence (XAI) from arXiv is titled \"A Critical Survey on Fairness Benefits of XAI.\" It was authored by Luca Deck, Jakob Schoeffer, Maria De-Arteaga, and Niklas Kühl, and published on October 15, 2023.\n", + "\n", + "The summary discusses a critical survey conducted to analyze claims about the relationship between XAI and fairness. Through a systematic literature review and qualitative content analysis, the authors identified seven archetypal claims from 175 papers about the supposed fairness benefits of XAI. They present significant limitations and caveats regarding these claims, challenging the notion that XAI is a straightforward solution for fairness issues. The paper suggests reconsidering the role of XAI as one of the many tools to address the complex, sociotechnical challenge of algorithmic fairness. It emphasizes the importance of being specific about how certain XAI methods enable stakeholders to address particular fairness desiderata.\n", + "\n", + "Regarding potential applications in the medical field, one can infer from the summary that while the paper itself may not be directly focused on medical applications, its insights could be relevant. In healthcare, fairness is a critical concern due to the potential impact of biased algorithms on patient outcomes. 
XAI could help medical professionals and policymakers understand how AI models make predictions, which can be essential for identifying and mitigating biases in high-stakes decisions such as diagnosis, treatment planning, or resource allocation.\n", + "\n", + "While the summary does not provide explicit applications of XAI in medicine, understanding the interplay between AI explainability and fairness is undoubtedly beneficial in the context of ethical AI deployment in healthcare. Increased transparency through XAI could lead to more equitable healthcare algorithms, but this requires careful consideration of how the explainability ties into fairness outcomes, as indicated by the authors.\n", + "\n", + "For further analysis, I would recommend reading the full paper to extract detailed discussions of these issues, which might highlight more specific applications or considerations for the medical field.\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n", + "All agents have been cleared.\n" + ] + } + ], + "source": [ + "new_builder = AgentBuilder(config_path=config_path)\n", + "agent_list, agent_configs = new_builder.build(building_task, default_llm_config, use_oai_assistant=True) # Transfer to OpenAI assistant API.\n", + "start_task(\n", + " execution_task=\"Find a recent paper about XAI on arxiv and find its potential applications in medical.\",\n", + " agent_list=agent_list,\n", + " llm_config=default_llm_config\n", + ")\n", + "new_builder.clear_all_agents()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + }, + "id": "8fbfef9268fc5191" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/test/agentchat/contrib/example_test_agent_builder_config.json b/test/agentchat/contrib/example_test_agent_builder_config.json new file mode 100644 index 00000000000..109ea0f20be --- /dev/null +++ b/test/agentchat/contrib/example_test_agent_builder_config.json @@ -0,0 +1,25 @@ +{ + "building_task": "Find a paper on arxiv by programming, and analyze its application in some domain. For example, find a recent paper about gpt-4 on arxiv and find its potential applications in software.", + "agent_configs": [ + { + "name": "Data_scientist", + "model": "gpt-4-1106-preview", + "system_message": "As a Data Scientist, you will:\n\n- Utilize your advanced coding skills specifically in Python to automate information gathering from various sources including web scraping, file downloads, and parsing data. 
This may include writing Python scripts to retrieve and present the latest research papers from preprint services like arXiv.\n- Apply your analytical acumen to conduct thorough examinations of the technical materials you gather, especially focusing on their practical applications within different domains, such as software development in the case of GPT-4 research papers.\n- Perform data processing tasks that may involve complex algorithmic work, statistical analysis, or machine learning methodologies to extract insights and build models based on the gathered information, executing Python code as necessary to accomplish these tasks.\n- Present findings with clarity, extracting and interpreting results solely from the execution of Python scripts you've crafted. Use 'print' functions adequately in your Python code to ensure all results are clear and interpretable.\n- Be diligent in checking the viability and correctness of your code and analysis. When errors occur, address them promptly and provide corrected Python code for execution.\n- Remain adaptive to the dynamic field of data science, continually seeking additional relevant information when required, and revising your approach to problem-solving as needed.\n- Persistently strive for the successful completion of the task at hand, ready to pursue alternative strategies in case initial methods fall short of fulfilling the task's requirements.\n- Conclude any sequence of task-related interactions with a final confirmation that the user's needs have been met, signifying the end of the process by replying \"TERMINATE\"." + }, + { + "name": "Domain_expert", + "model": "gpt-4-1106-preview", + "system_message": "As a Domain Expert, you leverage your deep understanding and analytical abilities to provide insights and applications of new findings in scholarly articles. Your role focuses on identifying, interpreting, and discussing the implications of cutting-edge research in a specific domain. You will:\n\n1. Employ Python programming to autonomously locate and retrieve academic papers from databases such as arXiv. This involves formulating queries, processing search results, and downloading relevant documents using automated scripts.\n\n2. Analyze and synthesize the information contained within the located papers, with a particular emphasis on assessing their applications in the specified domain. Your language skills will be pivotal in understanding complex scientific texts and elucidating their potential impact on real-world problems and industry practices.\n\n3. Clearly communicate your findings and developed applications, providing comprehensive insights into how the content of the research paper can be utilized or integrated into existing systems or processes within your domain of expertise.\n\n4. Your work will be structured and systematic, starting from the initial programming stage to the final analysis and communication. Each phase should be clearly demarcated, with an explanation of your methodology and steps taken.\n\n5. Ensure all coding is provided in Python, and your guidance will be executed directly without the need for user modifications or intervention beyond the execution of provided scripts.\n\n6. You will manage any encountered issues during the process, including correcting errors in code and revising your approach based on the results obtained from script execution.\n\n7. 
Upon completing your task and providing a thorough analysis, confirm your final output and conclude the interaction with the statement \"TERMINATE,\" signaling the successful satisfaction of the user's need." + }, + { + "name": "Software_engineer", + "model": "gpt-4-1106-preview", + "system_message": "As a skilled Software Engineer, your primary role is to leverage your coding expertise, particularly in Python, to facilitate the discovery and analysis of academic papers on arXiv, and to evaluate their real-world applications. \n\n1. You are expected to craft Python scripts capable of web tasks such as searching for academic papers, downloading and reading files, extracting and presenting content, as well as recognizing the current date/time and operating system details. Your script should output all necessary information for task completion.\n\n2. You should use Python scripts to accomplish specific tasks, ensuring that the script completes the task autonomously and provides the results to the user.\n\nYour responsibilities involve executing tasks in a systematic manner, clarifying your approach when a plan is not provided. Clearly distinguish between steps that involve executing Python code and those that engage your analytical skills. \n\nAlways present your Python code within a code block, ensuring it is ready for immediate execution without requiring modifications from the user. Here is how you should format a code suggestion:\n```python\n# Python code goes here\n```\n\nIf a script is to be saved before execution, indicate the filename at the beginning of the code block. Do not include multiple code blocks in a single interaction or ask users to manually copy results \u2014 use the `print` function within the script to display outputs. After providing a script, review the user's execution result. In case of an error, deliver a corrected script. If the task remains unsolved despite error-free execution, reassess your approach, gather more information if needed, and try a different strategy.\n\nEnsure that your solution is methodically verified and, where possible, supported by verifiable evidence.\n\nConclude your interaction by replying \u201cTERMINATE\u201d once the task is complete and the user\u2019s need has been satisfied. \n\nRemember, while your role is to assist with a task, it is also to enable and educate, ultimately fostering a user's understanding and their ability to independently solve similar problems in the future." 
+ } + ], + "manager_system_message": "Group chat manager.", + "coding": true, + "default_llm_config": { + "temperature": 0 + } +} diff --git a/test/agentchat/contrib/test_agent_builder.py b/test/agentchat/contrib/test_agent_builder.py new file mode 100644 index 00000000000..3f0ee7cbbfc --- /dev/null +++ b/test/agentchat/contrib/test_agent_builder.py @@ -0,0 +1,134 @@ +import pytest +import os +import json +import sys +from packaging.requirements import Requirement +from autogen.agentchat.contrib.agent_builder import AgentBuilder +from autogen import UserProxyAgent + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST # noqa: E402 + +here = os.path.abspath(os.path.dirname(__file__)) +oai_config_path = OAI_CONFIG_LIST + +# openai>=1 required +try: + from openai import OpenAI, APIError + from openai.types.chat import ChatCompletion + from openai.types.chat.chat_completion import ChatCompletionMessage, Choice + from openai.types.completion import Completion + from openai.types.completion_usage import CompletionUsage + import diskcache + + OPENAI_INSTALLED = True +except ImportError: + OPENAI_INSTALLED = False + + +@pytest.mark.skipif( + not OPENAI_INSTALLED, + reason="do not run when dependency is not installed", +) +def test_build(): + builder = AgentBuilder(config_path=oai_config_path, builder_model="gpt-4", agent_model="gpt-4") + building_task = ( + "Find a paper on arxiv by programming, and analyze its application in some domain. " + "For example, find a recent paper about gpt-4 on arxiv " + "and find its potential applications in software." + ) + builder.build( + building_task=building_task, + default_llm_config={"temperature": 0}, + user_proxy_work_dir=f"{here}/test_agent_scripts", + docker="python:3", + ) + + # check number of agents + assert len(builder.agent_procs_assign.keys()) <= builder.max_agents + + # check system message + for agent, proc in builder.agent_procs_assign.values(): + assert "TERMINATE" in agent.system_message + + +@pytest.mark.skipif( + not OPENAI_INSTALLED, + reason="do not run when dependency is not installed", +) +def test_save(): + builder = AgentBuilder(config_path=oai_config_path, builder_model="gpt-4", agent_model="gpt-4") + building_task = ( + "Find a paper on arxiv by programming, and analyze its application in some domain. " + "For example, find a recent paper about gpt-4 on arxiv " + "and find its potential applications in software." 
+    )
+
+    builder.build(
+        building_task=building_task,
+        default_llm_config={"temperature": 0},
+        user_proxy_work_dir=f"{here}/test_agent_scripts",
+        docker="python:3",
+    )
+    saved_files = builder.save(f"{here}/example_save_agent_builder_config.json")
+
+    # check config file path
+    assert os.path.isfile(saved_files)
+
+    saved_configs = json.load(open(saved_files))
+
+    # check config format
+    assert saved_configs.get("building_task", None) is not None
+    assert saved_configs.get("agent_configs", None) is not None
+    assert saved_configs.get("coding", None) is not None
+    assert saved_configs.get("default_llm_config", None) is not None
+
+
+@pytest.mark.skipif(
+    not OPENAI_INSTALLED,
+    reason="do not run when dependency is not installed",
+)
+def test_load():
+    builder = AgentBuilder(config_path=oai_config_path, builder_model="gpt-4", agent_model="gpt-4")
+
+    config_save_path = f"{here}/example_test_agent_builder_config.json"
+    configs = json.load(open(config_save_path))
+    agent_configs = {
+        e["name"]: {"model": e["model"], "system_message": e["system_message"]} for e in configs["agent_configs"]
+    }
+
+    agent_list, loaded_agent_configs = builder.load(
+        config_save_path,
+        user_proxy_work_dir=f"{here}/test_agent_scripts",
+        docker="python:3",
+    )
+
+    # check config loading
+    assert loaded_agent_configs["coding"] == configs["coding"]
+    if loaded_agent_configs["coding"] is True:
+        assert isinstance(agent_list[0], UserProxyAgent)
+        agent_list = agent_list[1:]
+    for agent in agent_list:
+        agent_name = agent.name
+        assert agent_configs.get(agent_name, None) is not None
+        assert agent_configs[agent_name]["model"] == agent.llm_config["model"]
+        assert agent_configs[agent_name]["system_message"] == agent.system_message
+
+
+@pytest.mark.skipif(
+    not OPENAI_INSTALLED,
+    reason="do not run when dependency is not installed",
+)
+def test_clear_agent():
+    builder = AgentBuilder(config_path=oai_config_path, builder_model="gpt-4", agent_model="gpt-4")
+
+    config_save_path = f"{here}/example_test_agent_builder_config.json"
+    builder.load(
+        config_save_path,
+        user_proxy_work_dir=f"{here}/test_agent_scripts",
+        docker="python:3",
+    )
+    builder.clear_all_agents()
+
+    # check if the agents are cleared
+    assert len(builder.agent_procs_assign) == 0
diff --git a/website/blog/2023-11-26-Agent-AutoBuild/img/agent_autobuild.png b/website/blog/2023-11-26-Agent-AutoBuild/img/agent_autobuild.png
new file mode 100644
index 00000000000..a6645b109d2
Binary files /dev/null and b/website/blog/2023-11-26-Agent-AutoBuild/img/agent_autobuild.png differ
diff --git a/website/blog/2023-11-26-Agent-AutoBuild/index.mdx b/website/blog/2023-11-26-Agent-AutoBuild/index.mdx
new file mode 100644
index 00000000000..5b9c9804b21
--- /dev/null
+++ b/website/blog/2023-11-26-Agent-AutoBuild/index.mdx
@@ -0,0 +1,152 @@
+
+---
+title: Agent AutoBuild - Automatically Building Multi-agent Systems
+authors:
+  - LinxinS97
+  - jieyuz2
+tags: [LLM, research]
+---
+
+![Overall structure of AutoBuild](img/agent_autobuild.png)
+
+**TL;DR:**
+Introducing **AutoBuild**, which builds multi-agent systems automatically, quickly, and easily for complex tasks with minimal
+user prompting, powered by a newly designed class, **AgentBuilder**. AgentBuilder also supports open-source LLMs by
+leveraging [vLLM](https://docs.vllm.ai/en/latest/index.html) and [FastChat](https://github.com/lm-sys/FastChat).
+Check out the example notebook and source file for reference:
+
+- [AutoBuild Examples](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_autobuild.ipynb)
+- [AgentBuilder](https://github.com/microsoft/autogen/blob/main/autogen/agentchat/contrib/agent_builder.py)
+
+## Introduction
+In this blog, we introduce **AutoBuild**, a pipeline that can automatically build a multi-agent system for a complex task.
+Specifically, we design a new class called **AgentBuilder**, which automatically generates the participant expert agents
+and constructs the group chat once the user provides descriptions of a building task and an execution task.
+
+AgentBuilder supports open-source models on Hugging Face, powered by [vLLM](https://docs.vllm.ai/en/latest/index.html)
+and [FastChat](https://github.com/lm-sys/FastChat). If the user chooses an open-source LLM, AgentBuilder sets
+up an endpoint server automatically, with no extra work from the user.
+
+## Installation
+- AutoGen:
+```bash
+pip install pyautogen==0.2.0b5
+```
+- (Optional: if you want to use open-source LLMs) vLLM and FastChat
+```bash
+pip install vllm fastchat
+```
+
+## Basic Example
+In this section, we provide a step-by-step example of how to use AgentBuilder to build a multi-agent system for a specific task.
+
+### Step 1: prepare configurations
+First, prepare the agent configurations.
+Specifically, you need a config path that contains the model name and API key, and a default LLM config for each agent.
+```python
+config_path = 'OAI_CONFIG_LIST'  # modify this to the path of your config file
+default_llm_config = {
+    'temperature': 0
+}
+```
+
+### Step 2: create an AgentBuilder instance
+Then, create an AgentBuilder instance with the config path and default config.
+You can also specify the builder model and agent model, which are the LLMs used by the build manager and by the generated agents, respectively.
+```python
+from autogen.agentchat.contrib.agent_builder import AgentBuilder
+
+builder = AgentBuilder(config_path=config_path, builder_model='gpt-4-1106-preview', agent_model='gpt-4-1106-preview')
+```
+
+### Step 3: specify the building task
+Specify a building task with a general description. The building task helps the build manager (an LLM) decide which agents should be built.
+Keep the description general; adding a few specific examples helps.
+```python
+building_task = "Find a paper on arxiv by programming, and analyze its application in some domain. For example, find a recent paper about gpt-4 on arxiv and find its potential applications in software."
+```
+
+### Step 4: build group chat agents
+Use `build()` to let the build manager (with `builder_model` as its backbone) generate the group chat agents.
+If you think coding is necessary for your task, you can use `coding=True` to add a user proxy (a local code interpreter) to the agent list:
+```python
+builder.build(building_task, default_llm_config, coding=True)
+```
+If `coding` is not specified, AgentBuilder decides on its own whether a user proxy should be added, based on the task.
+
+### Step 5: execute the task
+Let the agents generated by `build()` complete the task collaboratively in a group chat.
+```python
+execution_task = "Find a recent paper about gpt-4 on arxiv and find its potential applications in software."
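+# start() launches a group chat among the generated agents (plus the user
+# proxy, when coding is enabled) so they can work on the task collaboratively.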
+builder.start(task=execution_task)
+```
+
+### Step 6 (Optional): clear all agents and prepare for the next task
+If your task is complete, or the next task differs substantially from the current one, you can clear all the agents generated for this task with the following code.
+```python
+builder.clear_all_agents(recycle_endpoint=True)
+```
+If an agent's backbone is an open-source LLM, this process also shuts down its endpoint server; more details are given in the next section.
+If necessary, you can use `recycle_endpoint=False` to keep the previous open-source LLM's endpoint server running.
+
+## Save and Load
+You can save all the necessary information about the built group chat agents with
+```python
+saved_path = builder.save()
+```
+Configs are saved in JSON format with the following content:
+```json
+// FILENAME: save_config_TASK_MD5.json
+{
+    "building_task": "Find a paper on arxiv by programming, and analyze its application in some domain. For example, find a recent paper about gpt-4 on arxiv and find its potential applications in software.",
+    "agent_configs": [
+        {
+            "name": "...",
+            "model": "...",
+            "system_message": "..."
+        },
+        ...
+    ],
+    "manager_system_message": "...",
+    "coding": true,
+    "default_llm_config": {
+        "temperature": 0
+    }
+}
+```
+You can provide a specific filename; otherwise, AgentBuilder saves the config to the current path with the generated filename `save_config_TASK_MD5.json`.
+
+You can load the saved config and skip the building process. AgentBuilder creates the agents from that information without prompting the build manager.
+```python
+new_builder = AgentBuilder(config_path=config_path).load(saved_path)
+new_builder.start()
+```
+
+## Use Open-source LLM
+AutoBuild supports open-source LLMs via [vLLM](https://docs.vllm.ai/en/latest/index.html) and [FastChat](https://github.com/lm-sys/FastChat).
+Check the supported model list [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
+After satisfying the requirements, add the open-source LLM's Hugging Face repository to the config file,
+```json
+// Add the LLM's huggingface repo to your config file and use EMPTY as the api_key.
+[
+    ...
+    {
+        "model": "meta-llama/Llama-2-13b-chat-hf",
+        "api_key": "EMPTY"
+    }
+]
+```
+and specify it when initializing AgentBuilder (see the sketch in the appendix at the end of this post for how this might look).
+AgentBuilder will automatically set up an endpoint server for the open-source LLM. Make sure you have sufficient GPU resources.
+
+## Use GPTs
+[GPTs](https://openai.com/blog/introducing-gpts) let users create an assistant with a simple instruction describing the task. GPTs offer plugin support, which allows ChatGPT to complete complex instructions, and the assistant's instruction can optionally be updated so it adapts to new tasks or improves on the current task.
+AutoBuild also supports the GPTs API: simply add `use_gpts=True` to the `build()` call.
+```python
+# Transfer to GPTs API.
+new_builder.build(building_task, default_llm_config, use_gpts=True)
+```
+
+## Summary
+We propose AutoBuild, powered by a new class, `AgentBuilder`. AutoBuild helps users solve complex tasks with an automatically built multi-agent system. It supports open-source LLMs and the GPTs API, giving users more flexibility to choose their favorite models.
+More related features are coming soon.
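+
+## Appendix: Open-source LLM sketch
+As a reference for the "Use Open-source LLM" section above, here is a minimal sketch of how specifying an open-source
+model might look. It assumes the Hugging Face repo name is passed as `agent_model` (mirroring the `agent_model`
+parameter shown in Step 2) and that the repo has been added to your config file with `"EMPTY"` as the API key; adjust
+it to your own setup.
+```python
+from autogen.agentchat.contrib.agent_builder import AgentBuilder
+
+config_path = 'OAI_CONFIG_LIST'  # config list that includes the "EMPTY"-keyed open-source entry
+default_llm_config = {'temperature': 0}
+
+# The build manager can stay on an OpenAI model while the generated agents use
+# the open-source model; AgentBuilder is expected to spin up a local
+# vLLM/FastChat endpoint for it automatically.
+builder = AgentBuilder(
+    config_path=config_path,
+    builder_model='gpt-4-1106-preview',
+    agent_model='meta-llama/Llama-2-13b-chat-hf',
+)
+
+building_task = "Find a paper on arxiv by programming, and analyze its application in some domain."
+builder.build(building_task, default_llm_config, coding=True)
+builder.start(task="Find a recent paper about gpt-4 on arxiv and find its potential applications in software.")
+builder.clear_all_agents(recycle_endpoint=True)  # also shuts down the open-source endpoint server
+```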
diff --git a/website/blog/authors.yml b/website/blog/authors.yml index a487086cb22..7f6f7dc482b 100644 --- a/website/blog/authors.yml +++ b/website/blog/authors.yml @@ -87,3 +87,9 @@ narabzad: title: PhD student at the University of Waterloo url: https://www.negara.me/ image_url: https://github.com/Narabzad.png + +LinxinS97: + name: Linxin Song + title: MS student at Waseda University + url: https://linxins97.github.io/ + image_url: https://github.com/LinxinS97.png diff --git a/website/docs/Examples.md b/website/docs/Examples.md index 8e313668e01..0a0324303a4 100644 --- a/website/docs/Examples.md +++ b/website/docs/Examples.md @@ -61,6 +61,8 @@ Links to notebook examples: - Conversations with Chat History Compression Enabled - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_compression.ipynb) 1. **Evaluation and Assessment** - AgentEval: A Multi-Agent System for Assess Utility of LLM-powered Applications - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agenteval_cq_math.ipynb) +1. **Automatic Agent Building** + - Automatically Build Multi-agent System with AgentBuilder - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_autobuild.ipynb) ## Enhanced Inferences ### Utilities