diff --git a/.github/workflows/contrib-openai.yml b/.github/workflows/contrib-openai.yml index 4eda8d93071..2d6d64dc814 100644 --- a/.github/workflows/contrib-openai.yml +++ b/.github/workflows/contrib-openai.yml @@ -245,6 +245,15 @@ jobs: pip install -e .[websurfer] python -c "import autogen" pip install coverage pytest + - name: Setup edge + uses: browser-actions/setup-edge@latest + if: ${{ matrix.browsers == 'edge' }} + - name: Setup firefox + uses: browser-actions/setup-firefox@latest + if: ${{ matrix.browsers == 'firefox' }} + - name: Setup chrome + uses: browser-actions/setup-chrome@latest + if: ${{ matrix.browsers == 'chrome' }} - name: Coverage env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -253,7 +262,7 @@ jobs: OAI_CONFIG_LIST: ${{ secrets.OAI_CONFIG_LIST }} BING_API_KEY: ${{ secrets.BING_API_KEY }} run: | - coverage run -a -m pytest test/agentchat/contrib/test_web_surfer.py + coverage run -a -m pytest test/agentchat/contrib/test_web_surfer.py test/agentchat/contrib/test_web_surfer_selenium.py test/agentchat/contrib/test_web_archiver_agent.py coverage xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml index ced35dc115b..997ba20a6fc 100644 --- a/.github/workflows/contrib-tests.yml +++ b/.github/workflows/contrib-tests.yml @@ -201,6 +201,15 @@ jobs: - name: Install packages and dependencies for WebSurfer run: | pip install -e .[websurfer] + - name: Setup edge + uses: browser-actions/setup-edge@latest + if: ${{ matrix.browsers == 'edge' }} + - name: Setup firefox + uses: browser-actions/setup-firefox@latest + if: ${{ matrix.browsers == 'firefox' }} + - name: Setup chrome + uses: browser-actions/setup-chrome@latest + if: ${{ matrix.browsers == 'chrome' }} - name: Set AUTOGEN_USE_DOCKER based on OS shell: bash run: | @@ -210,7 +219,7 @@ jobs: - name: Coverage run: | pip install coverage>=5.3 - coverage run -a -m pytest test/test_browser_utils.py 
test/agentchat/contrib/test_web_surfer.py --skip-openai + coverage run -a -m pytest test/test_browser_utils.py test/agentchat/contrib/test_web_surfer.py --skip-openai test/agentchat/contrib/test_web_surfer_selenium.py --skip-openai test/agentchat/contrib/test_web_archiver_agent.py --skip-openai coverage xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/autogen/agentchat/contrib/web_archiver_agent.py b/autogen/agentchat/contrib/web_archiver_agent.py new file mode 100644 index 00000000000..f3e1c9c232c --- /dev/null +++ b/autogen/agentchat/contrib/web_archiver_agent.py @@ -0,0 +1,589 @@ +from ..agent import Agent +from ..conversable_agent import ConversableAgent +from ..assistant_agent import AssistantAgent +from ...browser_utils import ( + SeleniumBrowser, + download_using_requests, + get_domain, + get_scheme, + get_path, + get_last_path, + github_path_rule, + get_file_path_from_url, + fix_missing_protocol, + extract_pdf_text, +) +from typing import List, Union, Any, Tuple, Dict +import os +import re +import json +import traceback +import requests +from collections import deque +from urllib.parse import urlparse, urlunparse +from bs4 import BeautifulSoup +from io import BytesIO +from PIL import Image +import base64 + +# Import the arxiv library if it is available +IS_ARXIV_CAPABLE = False +try: + import arxiv + + IS_ARXIV_CAPABLE = True +except ModuleNotFoundError: + print("The 'arxiv' library was not found in this environment, but can be installed with 'pip install arxiv'.") + pass + + +class WebArchiverAgent(ConversableAgent): + def __init__( + self, + silent: bool = True, + storage_path: str = "./content", + max_depth: int = 1, + page_load_time: float = 6, + *args, + **kwargs, + ): + """ + WebArchiverAgent: Custom LLM agent for collecting online content. + + The WebArchiverAgent class is a custom Autogen agent that can be used to collect and store online content from different + web pages. 
It extends the ConversableAgent class and provides additional functionality for managing a list of + additional links, storing collected content in local directories, and customizing request headers. WebArchiverAgent + uses deque to manage a list of additional links for further exploration, with a maximum depth limit set by max_depth + parameter. The collected content is stored in the specified storage path (storage_path) using local directories. + WebArchiverAgent can be customized with request_kwargs and llm_config parameters during instantiation. The default + User-Agent header is used for requests, but it can be overridden by providing a new dictionary of headers under + request_kwargs. + + Parameters: + silent (bool): If True, the agent operates in silent mode with minimal output. Defaults to True. + storage_path (str): The path where the collected content will be stored. Defaults to './content'. + max_depth (int): Maximum depth limit for exploring additional links from a web page. This defines how deep + the agent will go into linked pages from the starting point. Defaults to 1. + page_load_time (float): Time in seconds to wait for loading each web page. This ensures that dynamic content + has time to load before the page is processed. Defaults to 6 seconds. + *args, **kwargs: Additional arguments and keyword arguments to be passed to the parent class `ConversableAgent`. + These can be used to configure underlying behaviors of the agent that are not explicitly + covered by the constructor's parameters. + + Note: + The `silent` parameter can be useful for controlling the verbosity of the agent's operations, particularly + in environments where logging or output needs to be minimized for performance or clarity. 
def classifier_to_collector_reply(
    self,
    recipient: "Agent",
    messages: Union[List[str], str],
    sender: "Agent",
    config: dict,
) -> Tuple[bool, str]:
    """Reduce a classifier agent's free-text reply to a "True"/"False" label.

    Takes the most recent message of the conversation, asks *recipient* to
    generate a reply for it, and maps that reply onto a boolean-like string.
    The classification is case-insensitive and conservative: anything that
    does not contain "true" is labelled "False".

    Parameters:
        recipient: Agent exposing `generate_oai_reply(messages, sender)`.
        messages: Conversation history (list) or a single message; only the
            last entry is classified.
        sender: The entity that sent the message.
        config: Reply-generation configuration (unused here, kept for the
            registered-reply signature).

    Returns:
        Tuple[bool, str]: (always True, "True" or "False").
    """
    if isinstance(messages, list):
        latest = messages[-1]
    else:
        latest = messages

    _, raw_reply = recipient.generate_oai_reply([latest], sender)

    # Case-insensitive match; default to "False" when neither token appears.
    verdict = "True" if "true" in raw_reply.lower() else "False"
    return True, verdict
+ ) + + # Define the metadata classifier + self.metadata_classifier = AssistantAgent( + "Metadata Classifier", + system_message=self.metadata_classifier_system_msg, + llm_config=self.small_llm_config, + max_consecutive_auto_reply=0, + ) + self.metadata_classifier.register_reply(self, self.classifier_to_collector_reply, 1) + + # Define the html content classifier + self.content_classifier = AssistantAgent( + "Content Classifier", + system_message=self.content_classifier_system_msg, + llm_config=self.small_llm_config, + max_consecutive_auto_reply=0, + ) + self.content_classifier.register_reply(self, self.classifier_to_collector_reply, 1) + + def _fetch_content(self, link: str) -> Tuple[str, str]: + """ + Fetches content from a given URL. + + Parameters: + - link (str): The URL from which to fetch content. + + Returns: + - Tuple[str, str]: Content type and fetched content or error message. + """ + # Parse the link + parsed_url = urlparse(link) + + # A special case for arxiv links + if "arxiv" in link and IS_ARXIV_CAPABLE: + return "pdf", self._fetch_arxiv_content(parsed_url) + + elif parsed_url.path.endswith(".pdf"): + return "pdf", self._fetch_pdf_content(link) + + else: + return "html", self._fetch_html_content(link) + + def _fetch_html_content(self, link: str) -> str: + """ + Handles the fetching of HTML content from a web page. + + Parameters: + - link (str): The URL of the web page. 
def _fetch_html_content(self, link: str) -> str:
    """Fetch, archive, and parse an HTML page with a Selenium-driven browser.

    Saves a screenshot, the raw HTML, classified text content, metadata,
    links, and images under a per-URL directory, and queues promising links
    for one additional level of crawling.

    Parameters:
        link: The URL of the web page.

    Returns:
        str: "success" (errors are handled at the higher level).
    """
    sd = {}  # submission data for this page
    sd["url"] = link

    # Establish the downloads folder
    sd["local_path"] = os.path.join(self.local_dir, get_file_path_from_url(link, self.domain_path_rules))
    os.makedirs(sd["local_path"], exist_ok=True)

    # We can instantiate the browser now that we know where files/downloads go
    self.browser = SeleniumBrowser(browser=self.browser_kwargs["browser"], download_dir=sd["local_path"])

    if "github.com" in link and "README.md" not in link:
        # Small patch to facilitate github repos
        link = os.path.join(link, "README.md")

    self.browser.get(link)
    self.browser.maximize_window()
    self.browser.implicitly_wait(self.page_load_time)

    # Define where the screenshot is stored
    sd["browser_screenshot_path"] = os.path.join(sd["local_path"], "screenshot.png")

    # Save a screenshot of the browser window
    if self.browser_kwargs["browser"] == "firefox":
        self.browser.save_full_page_screenshot(sd["browser_screenshot_path"])
    else:
        # Other drivers lack full-page capture: grow the window to page height first
        page_height = self.browser.execute_script("return window.pageYOffset + window.innerHeight")
        self.browser.set_window_size(1920, page_height)
        self.browser.save_screenshot(sd["browser_screenshot_path"])

    sd["title"] = self.browser.title
    sd["html"] = self.browser.page_source

    # Write the HTML to disk for archival purposes
    with open(os.path.join(sd["local_path"], "index.html"), "w", encoding="utf-8") as f:
        f.write(str(self.browser.page_source))

    # Store the BS object
    sd["soup"] = BeautifulSoup(sd["html"], "html.parser")

    sd["content"] = self._identify_content(sd["soup"])

    # Save the content to a text file on disk
    with open(os.path.join(sd["local_path"], "content.txt"), "w", encoding="utf-8") as f:
        for data in sd["content"]:
            f.write(data + "\n")

    # Save the original URL for convenience elsewhere (when parsing images)
    sd["soup"].url = link

    # Parse and store the Metadata
    sd["meta"] = self._identify_metadata(sd["soup"])

    with open(os.path.join(sd["local_path"], "metadata.txt"), "w", encoding="utf-8") as f:
        for data in sd["meta"]:
            f.write(json.dumps(data) + "\n")

    # Parse and store the links. BUGFIX: use a distinct loop variable -- the
    # original comprehension reused `link`, shadowing the URL parameter.
    sd["links"] = [
        {"text": anchor.get_text().strip(), "href": anchor["href"]}
        for anchor in sd["soup"].find_all("a")
        if anchor.has_attr("href") and "/" in anchor["href"]
    ]

    with open(os.path.join(sd["local_path"], "links.txt"), "w", encoding="utf-8") as f:
        for entry in sd["links"]:
            f.write(json.dumps(entry) + "\n")

    # Recursive link checking, up to 1 level deep past the root.
    # BUGFIX: the original inspected the leftover `link` variable (a str at
    # this point) with dict subscripts instead of iterating sd["links"], and
    # used `href[:-3] != ".md"` where an endswith test was intended.
    if self.link_depth < 1:
        for entry in sd["links"]:
            href = entry["href"]
            interesting = "project" in entry["text"] or "paper" in entry["text"] or "code" in entry["text"]
            # NOTE(review): "marktekpost" looks like a typo of "marktechpost" -- confirm intent
            if interesting and "marktekpost" in href.lower():
                self.additional_links.append(href)
            elif "arxiv" in href or (
                "github.com" in href and (not href.endswith(".md") or os.path.basename(href) == "README.md")
            ):
                self.additional_links.append(href)

    # Parse and store the images
    self._collect_images(sd["soup"], sd["local_path"])

    # Close down the browser
    self.browser.quit()

    # Log the processed link, motivated by the unit test
    self.process_history[sd["url"]] = sd

    return "success"
def _fetch_pdf_content(self, link: str) -> str:
    """Download a PDF from a URL, archive it, and extract its text.

    Parameters:
        link: The URL from which to fetch the PDF content.

    Returns:
        str: Extracted text content, or None if the download failed.
    """
    local_pdf_path = os.path.join(
        self.local_dir, get_file_path_from_url(link, self.domain_path_rules), link.split("/")[-1]
    )
    # BUGFIX: create the parent directory, not a directory at the PDF path
    # itself -- the original makedirs(local_pdf_path) made the subsequent
    # open(local_pdf_path, "wb") fail with IsADirectoryError.
    os.makedirs(os.path.dirname(local_pdf_path), exist_ok=True)

    # BUGFIX: send the headers as request headers; the original passed them
    # via `params=`, i.e. as a query string, so the User-Agent override in
    # self.request_kwargs never applied.
    response = requests.get(link, headers=self.request_kwargs["headers"])

    if response.status_code != 200:
        return None

    with open(local_pdf_path, "wb") as f:
        f.write(response.content)

    # Extract text from the PDF file
    text = extract_pdf_text(local_pdf_path)

    # Store the extracted text next to the PDF for later access.
    # BUGFIX: swap only the extension; replace("pdf", "txt") would also
    # rewrite any "pdf" occurring in directory names or the file stem.
    with open(os.path.splitext(local_pdf_path)[0] + ".txt", "w", encoding="utf-8") as f:
        f.write(text)

    return text
def _identify_content(self, soup: "BeautifulSoup") -> List[str]:
    """Collect page text elements that the content classifier deems relevant.

    Each h1/h2/h3/p element's text is normalized and sent, together with the
    page title, to the content-classifier agent; snippets labelled "True"
    are kept.

    Parameters:
        soup: BeautifulSoup object of the web page.

    Returns:
        list: All text content classified as relevant.
    """
    # Page title gives the classifier context for each snippet.
    # NOTE(review): assumes <head><title> exists; a missing title raises
    # AttributeError -- confirm upstream pages always provide one.
    page_title = soup.find("head").find("title").string

    relevant_content = []
    for element in soup.find_all(["h1", "h2", "h3", "p"]):
        # Collapse all runs of whitespace (tabs/newlines/spaces) to single
        # spaces. BUGFIX: the original's replace-loop collapsed nothing as
        # written and could spin forever.
        text = " ".join(element.text.split())
        if not text:
            continue

        prompt = self.content_classifier_prompt(page_title, text)
        relevant = self.initiate_chat(
            self.content_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent
        ).chat_history[-1]["content"]

        if relevant == "True":
            relevant_content.append(text)
            if not self.silent:
                print(element)

    return relevant_content
+ """ + soup.find("head").find("title").string + relevant_content = [] + for data in soup.find_all("meta"): + relevant = False + + prompt = self.metadata_classifier_prompt(data.attrs) + + if "content" in data.attrs and "http" in data.attrs["content"]: + relevant = True + elif "content" in data.attrs: + data.attrs["content"] = data.attrs["content"].strip() + relevant = self.initiate_chat( + self.metadata_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent + ).chat_history[-1]["content"] + elif "property" in data.attrs: + data.attrs["property"] = data.attrs["property"].strip() + relevant = self.initiate_chat( + self.metadata_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent + ).chat_history[-1]["content"] + elif "name" in data.attrs: + data.attrs["name"] = data.attrs["name"].strip() + relevant = self.initiate_chat( + self.metadata_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent + ).chat_history[-1]["content"] + + if relevant == "True": + relevant_content.append(data.attrs) + if verbose: + print(data.attrs) + + return relevant_content + + def _collect_images(self, soup: BeautifulSoup, local_path: str, verbose: bool = False) -> None: + """ + Collects and saves images from the web page to a local path. + + Parameters: + - soup (BeautifulSoup): BeautifulSoup object of the web page. + - local_path (str): The local directory path where images will be saved. + - verbose (bool): Flag to enable verbose logging. 
def _collect_images(self, soup: "BeautifulSoup", local_path: str, verbose: bool = False) -> None:
    """Collect and save images referenced by the page to a local path.

    Inline base64 PNGs are decoded and written directly; other raster
    formats are downloaded through the browser session. Images with alt
    text also get a sibling .txt description file. "logo" images are
    skipped as page furniture.

    Parameters:
        soup: BeautifulSoup object of the web page (soup.url must be set).
        local_path: The local directory path where images will be saved.
        verbose: Flag to enable verbose logging.

    Returns:
        None
    """
    import hashlib  # local import: used only for naming decoded inline images

    def get_basename(filename):
        return os.path.splitext(os.path.basename(filename))[0]

    raster_markers = ("png", "jpg", "jpeg", "webp", "avif", "heif", "heic", "svg")

    for img in soup.find_all("img"):
        img_alt = img.attrs.get("alt", "")
        img_src = img.attrs["src"].lower()

        if "png;base64" in img_src:
            # Decode the inline data-URI payload
            encoded_data = img.attrs["src"].split(",")[1]
            image_data = base64.b64decode(encoded_data)
            image = Image.open(BytesIO(image_data))
            # BUGFIX: the original named the file after a slice of the base64
            # payload (which may contain '/') and wrote it to the current
            # working directory; use a content hash inside local_path instead.
            digest = hashlib.sha1(image_data).hexdigest()[:28]
            image.save(os.path.join(local_path, f"{digest}.png"))

        elif "logo" in img_src:
            continue

        elif any(marker in img_src for marker in raster_markers):
            file_name = img_src.split("/")[-1]  # there are other ways to do this
            local_image_description_path = os.path.join(local_path, get_basename(file_name) + ".txt")
            local_image_path = os.path.join(local_path, file_name)
            if len(img_alt) > 0 and not os.path.exists(local_image_description_path):
                with open(local_image_description_path, "w", encoding="utf-8") as f:
                    f.write(img_alt)
            if not os.path.exists(local_image_path):
                image_url = fix_missing_protocol(img.attrs["src"], soup.url)
                try:
                    download_using_requests(self.browser, image_url, local_image_path)
                except Exception:
                    print(image_url, img.attrs["src"])
                    traceback.print_exc()
def collect_content(
    self,
    recipient: "Agent",
    messages: Union[List[str], str],
    sender: "Agent",
    config: dict,
) -> Tuple[bool, str]:
    """Collect and archive content from links found in messages.

    Scans user messages for URLs, fetches and archives each one under
    self.local_dir, then follows any links queued during fetching for up to
    self.max_depth additional levels.

    Parameters:
        recipient: The agent designated to receive the content.
        messages: A list of message dicts (or a single message) containing URLs.
        sender: The agent sending the content.
        config: Configuration parameters for content fetching and archiving.

    Returns:
        Tuple[bool, str]: (success flag, outcome message, or the error
        traceback on failure).
    """
    try:
        all_links = []
        for message in messages:
            if message.get("role") == "user":
                links = re.findall(
                    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
                    message.get("content"),
                )
                all_links.extend(links)

        # Process the links provided by the user
        for link in all_links:
            self._fetch_content(link)

        # Root level done; walk any links queued during fetching.
        self.link_depth = 1
        if self.link_depth <= self.max_depth:
            while len(self.additional_links) > 0:
                additional_link = self.additional_links.pop()
                self._fetch_content(additional_link)
                # BUGFIX: record the followed link; the original appended the
                # list to itself (all_links.append(all_links)), corrupting the
                # summary message below.
                all_links.append(additional_link)

        self.link_depth = 0
        return (
            True,
            f"Success: archived the following links in your chosen location {self.local_dir}/ <-- {', '.join(all_links)}",
        )
    except Exception:
        # Return traceback information in case of an exception
        error_log = traceback.format_exc()
        return False, f"Failed to collect content due to an error: {error_log}"
b/autogen/agentchat/contrib/web_surfer.py @@ -5,10 +5,18 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union, Callable, Literal, Tuple from typing_extensions import Annotated -from ... import Agent, ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat, OpenAIWrapper -from ...browser_utils import SimpleTextBrowser -from ...code_utils import content_str from datetime import datetime +from ..agent import Agent +from .. import ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat +from ...oai.client import OpenAIWrapper +from ...browser_utils import ( + SimpleTextBrowser, + SeleniumBrowserWrapper, + IS_SELENIUM_CAPABLE, + display_binary_image, + generate_png_filename, +) +from ...code_utils import content_str from ...token_count_utils import count_token, get_max_token_limit from ...oai.openai_utils import filter_config @@ -55,8 +63,21 @@ def __init__( self._create_summarizer_client(summarizer_llm_config, llm_config) + # Determine if the user has requested the Selenium browser or not + browser_type = browser_config.pop("type", "text") + # Create the browser - self.browser = SimpleTextBrowser(**(browser_config if browser_config else {})) + if browser_type != "text" and IS_SELENIUM_CAPABLE: + self.browser = SeleniumBrowserWrapper(**(browser_config if browser_config else {})) + self.is_graphical_browser = True + else: + # Cleanup any arguments specific to the desktop browser + if "web_driver" in browser_config: + browser_config.pop("web_driver") + if "render_text" in browser_config: + browser_config.pop("render_text") + self.browser = SimpleTextBrowser(**(browser_config if browser_config else {})) + self.is_graphical_browser = False inner_llm_config = copy.deepcopy(llm_config) @@ -84,6 +105,18 @@ def __init__( self.register_reply([Agent, None], ConversableAgent.generate_function_call_reply) self.register_reply([Agent, None], ConversableAgent.check_termination_and_human_reply) + @property 
+ def text_content(self): + return self.browser.page_content + + @property + def render_text(self): + self.browser.set_page_content(self.browser.page_content) + return self.browser.page_content + + def close_the_browser(self): + self.browser.driver.quit() + def _create_summarizer_client(self, summarizer_llm_config: Dict[str, Any], llm_config: Dict[str, Any]) -> None: # If the summarizer_llm_config is None, we copy it from the llm_config if summarizer_llm_config is None: @@ -182,6 +215,41 @@ def _page_down() -> str: header, content = _browser_state() return header.strip() + "\n=======================\n" + content + if self.is_graphical_browser: + + @self._user_proxy.register_for_execution() + @self._assistant.register_for_llm( + name="get_screenshot", + description="Captures and displays a screenshot of the current web page as seen by the browser.", + ) + def _get_screenshot( + url: Annotated[Optional[str], "[Optional] The url of the page. (Defaults to the current page)"] = None, + ) -> str: + if url is not None and url != self.browser.address: + self.browser.visit_page(url) + else: + url = self.browser.address + + self.screenshot = self.browser.driver.get_screenshot_as_png() + display_binary_image(self.screenshot) + + @self._user_proxy.register_for_execution() + @self._assistant.register_for_llm( + name="save_screenshot", + description="Saves a screenshot of the current web page as seen by the browser.", + ) + def _save_screenshot( + url: Annotated[Optional[str], "[Optional] The url of the page. 
(Defaults to the current page)"] = None, + ) -> str: + if url is not None and url != self.browser.address: + self.browser.visit_page(url) + else: + url = self.browser.address + + png_filename = generate_png_filename(url) + self.screenshot = self.browser.driver.save_screenshot(png_filename) + # display_binary_image(self.screenshot) + if self.summarization_client is not None: @self._user_proxy.register_for_execution() diff --git a/autogen/browser_utils.py b/autogen/browser_utils.py index 41d2d62f825..88a389b0352 100644 --- a/autogen/browser_utils.py +++ b/autogen/browser_utils.py @@ -1,14 +1,22 @@ import json import os import requests +import traceback import re import markdownify import io import uuid import mimetypes -from urllib.parse import urljoin, urlparse +import hashlib # Used for generating a content ID from the URL (currently unused) +import random +import string +import tempfile +from math import ceil # to determine the total number of pages +from typing import Any, Dict, List, Optional, Union, Tuple, Callable +from urllib.parse import ParseResult, urljoin, urlparse from bs4 import BeautifulSoup -from typing import Any, Dict, List, Optional, Union, Tuple +from PIL import Image +from IPython.core.display_functions import display # Optional PDF support IS_PDF_CAPABLE = False @@ -26,6 +34,25 @@ except ModuleNotFoundError: pass +# The Selenium package is used to automate web browser interaction from Python +try: + from selenium import webdriver + from selenium.common.exceptions import TimeoutException + + # from selenium.webdriver.support.ui import WebDriverWait # We might implement this next + from selenium.webdriver.common.by import By + from selenium.webdriver.common.desired_capabilities import DesiredCapabilities + from selenium.webdriver.common.keys import Keys + from selenium.webdriver.edge.service import Service as EdgeService + from selenium.webdriver.edge.options import Options as EdgeOptions + from selenium.webdriver.firefox.options import Options as 
FirefoxOptions + from selenium.webdriver.firefox.firefox_profile import FirefoxProfile + from selenium.webdriver.chrome.options import Options as ChromeOptions + + IS_SELENIUM_CAPABLE = True +except: + IS_SELENIUM_CAPABLE = False + class SimpleTextBrowser: """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use.""" @@ -61,7 +88,7 @@ def set_address(self, uri_or_path: str) -> None: # Handle special URIs if uri_or_path == "about:blank": - self._set_page_content("") + self.set_page_content("") elif uri_or_path.startswith("bing:"): self._bing_search(uri_or_path[len("bing:") :].strip()) else: @@ -83,7 +110,7 @@ def page_content(self) -> str: """Return the full contents of the current page.""" return self._page_content - def _set_page_content(self, content: str) -> None: + def set_page_content(self, content: str) -> None: """Sets the text content of the current page.""" self._page_content = content self._split_pages() @@ -179,7 +206,7 @@ def _bing_search(self, query: str) -> None: ) if len(news_snippets) > 0: content += "\n\n## News Results:\n" + "\n\n".join(news_snippets) - self._set_page_content(content) + self.set_page_content(content) def _fetch_page(self, url: str) -> None: try: @@ -234,7 +261,7 @@ def _fetch_page(self, url: str) -> None: # Remove excessive blank lines self.page_title = soup.title.string - self._set_page_content(re.sub(r"\n{2,}", "\n\n", webpage_text).strip()) + self.set_page_content(re.sub(r"\n{2,}", "\n\n", webpage_text).strip()) elif content_type == "text/plain": # Get the content of the response plain_text = "" @@ -242,11 +269,11 @@ def _fetch_page(self, url: str) -> None: plain_text += chunk self.page_title = None - self._set_page_content(plain_text) + self.set_page_content(plain_text) elif IS_PDF_CAPABLE and content_type == "application/pdf": pdf_data = io.BytesIO(response.raw.read()) self.page_title = None - self._set_page_content(pdfminer.high_level.extract_text(pdf_data)) + 
def get_scheme(url: Union[str, ParseResult]) -> str:
    """Return the scheme component ('http', 'https', ...) of *url*.

    Accepts either a raw URL string or an already-parsed ParseResult.
    """
    parsed = urlparse(url) if isinstance(url, str) else url
    return parsed.scheme


def get_domain(url: Union[str, ParseResult]) -> str:
    """Return the network-location (domain) component of *url*.

    Accepts either a raw URL string or an already-parsed ParseResult.
    """
    parsed = urlparse(url) if isinstance(url, str) else url
    return parsed.netloc
+ """ + return urlparse(url).netloc if isinstance(url, str) else url.netloc + + +def get_path(url: Union[str, ParseResult]) -> str: + """ + Extracts the path component from a URL. + + This function processes both strings and ParseResult objects to return the path segment of the URL. + The path is the part of the URL that follows the domain but precedes any query parameters or fragment identifiers. + + Args: + url (Union[str, ParseResult]): The URL from which to extract the path. Can be a string or a ParseResult object. + + Returns: + str: The path of the URL (e.g., '/path/to/resource'). + """ + return urlparse(url).path if isinstance(url, str) else url.path + + +def get_last_path(url: Union[str, ParseResult]) -> str: + """ + Retrieves the last component of the path from a URL. + + This function is useful for extracting the final part of the path, often representing a specific resource or page. + It handles both string URLs and ParseResult objects. For string URLs, it parses the URL to extract the path and then + retrieves the last component. + + Args: + url (Union[str, ParseResult]): The URL from which to extract the last path component. Can be a string or a ParseResult object. + + Returns: + str: The last component of the path (e.g., 'resource.html'). 
+ """ + return ( + os.path.basename(urlparse(url).path.rstrip("/")) + if isinstance(url, str) + else os.path.basename(url.path.rstrip("/")) + ) + + +def github_path_rule(parsed_url: ParseResult) -> str: + """Specific rule for GitHub URLs.""" + return os.path.join(parsed_url.netloc.replace("www.", ""), parsed_url.path.lstrip("/")) + + +def default_path_rule(parsed_url: ParseResult) -> str: + """Fallback rule for general URLs.""" + return os.path.join(parsed_url.netloc.replace("www.", ""), get_last_path(parsed_url.path)) + + +def get_file_path_from_url( + url: Union[str, ParseResult], + domain_rules: Optional[Dict[str, Callable[[ParseResult], str]]] = None, + default_path_rule: Optional[Callable[[ParseResult], str]] = None, +) -> str: + """ + Converts a URL into a corresponding local file path, allowing for domain-specific customization. + + This function takes a URL, either as a string or a ParseResult object, and generates a path that represents + the URL's location in a hypothetical local file system structure. It supports domain-specific rules for + customizable path generation, with a default rule applied to URLs from domains not explicitly configured. + + Parameters: + url (Union[str, ParseResult]): The URL to be converted into a local file path. + domain_rules (Optional[Dict[str, Callable[[ParseResult], str]]]): A dictionary mapping domains to functions + that define how to construct file paths for URLs from those domains. + default_path_rule (Optional[Callable[[ParseResult], str]]): A function to construct file paths for URLs + from domains not covered by `domain_rules`. + + Returns: + str: The generated local file path, which omits the protocol and optionally adjusts for specific domain structures. 
+ """ + # Parse the URL if not already + parsed_url = urlparse(url) if isinstance(url, str) else url + canonical_url = parsed_url.netloc.replace("www.", "") + + # Determine the appropriate path rule to use + if domain_rules and canonical_url in domain_rules: + path_rule = domain_rules[canonical_url] + else: + path_rule = ( + default_path_rule + if default_path_rule + else lambda u: os.path.join(u.netloc.replace("www.", ""), get_last_path(u.path.rstrip("/"))) + ) + + # Generate the relative path using the selected rule + relative_path = path_rule(parsed_url) + + # Remove any preceding forward slash for consistency + relative_path = relative_path.lstrip("/") + + return relative_path + + +def fix_missing_protocol(img_url: str, source_url: str) -> str: + """ + Ensures that an image URL has a proper protocol specified, using the protocol of a source URL as a reference. + + This function checks if the given image URL lacks a protocol (http or https) and, if so, fixes the URL by + prepending it with the protocol from the source URL. This is useful for fixing relative URLs or those missing + a scheme. + + Parameters: + img_url (str): The image URL to be corrected. It can be a relative URL or one missing a protocol. + source_url (str): The source URL from which to extract the protocol and, if necessary, the domain. + + Returns: + str: The corrected image URL with a protocol. + + Note: + The function handles URLs starting with "//" by directly adding the protocol. If the domain is missing + from `img_url`, the function constructs the full URL using the protocol and domain from `source_url`. 
+ """ + protocol = get_scheme(source_url) + domain = get_domain(source_url) + + if img_url.startswith("//"): # If the URL starts with "//" + img_url = f"{protocol}:{img_url}" # Add "https:" before it + + elif not bool(get_domain(img_url)): # domain not in img_url: + img_url = f"{protocol}://{domain}/{img_url}" + + return img_url + + +def extract_pdf_text(local_pdf_path: str): # Returns the extracted text content from a local PDF file + """ + Extracts the text content from a local PDF file and returns it as a string. + + Parameters: + - local_pdf_path (str): The path to the local PDF file from which the text will be extracted. + + Returns: + - str: A string containing the text content of the provided PDF file. + """ + + try: + text = pdfminer.high_level.extract_text(local_pdf_path) + except Exception: + traceback.print_exc() + text = "" + + return text + + +def download_using_requests( + driver: Union[ + webdriver.edge.webdriver.WebDriver, webdriver.firefox.webdriver.WebDriver, webdriver.chrome.webdriver.WebDriver + ], + download_url: str, + save_path: str, +) -> None: + """ + This function takes a Selenium WebDriver instance, a URL to download a file, and a path where you want to save the downloaded file. + + It first retrieves cookies from the given driver, converts them into a format suitable for use with the `requests` library, and then uses these cookies to successfully download the specified file using the `requests.get()` function. The `User-Agent` header is also set to match that used by the WebDriver instance. + + Args: + driver (webdriver.edge.webdriver.WebDriver): A Selenium WebDriver instance, typically obtained from selenium.webdriver.Edge() or another appropriate method for your browser of choice. + download_url (str): The URL to the file you want to download. + save_path (str): The path where you would like the downloaded file to be saved. 
+ + Returns: + None, but successfully downloads a file from the given URL using the cookies and headers obtained from the WebDriver instance. + + Raises: + Exception: If the file cannot be downloaded due to an error in the `requests.get()` call. + """ + + def get_cookies(driver): + return driver.get_cookies() + + def convert_cookies_to_requests_format(cookies): + cookie_dict = {} + for cookie in cookies: + cookie_dict[cookie["name"]] = cookie["value"] + return cookie_dict + + def download_file_with_cookies(url, session_cookies, save_path, user_agent=None): + headers = { + "User-Agent": user_agent + if user_agent + else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15" + } + + response = requests.get(url, cookies=session_cookies, headers=headers, stream=True) + if response.status_code == 200: + with open(save_path, "wb") as file: + for chunk in response.iter_content(1024): + file.write(chunk) + + # Extract cookies from WebDriver + cookies = get_cookies(driver) + + # Convert cookies for use with requests + session_cookies = convert_cookies_to_requests_format(cookies) + + # Define the user-agent if you want to match the one used by your WebDriver + user_agent = driver.execute_script("return navigator.userAgent;") + + # Download file using requests with the same session cookies and headers + download_file_with_cookies(download_url, session_cookies, save_path, user_agent=user_agent) + + +def display_binary_image(binary_data): + """ + display_binary_image(binary_data): + This function displays the binary image data in Jupyter notebook cells or shows it in non-notebook environments. + + Args: + - binary_data (bytes): A bytes object containing the PNG image data. + + Returns: + - Nothing, but in non-notebook environment, it displays the image. 
+ """ + img = Image.open(io.BytesIO(binary_data)) + try: + __IPYTHON__ + display(img) + except NameError: + img.show() + + +def generate_png_filename(url: str): # Function to help provide a PNG filename (with relative path) + """ + Generates a PNG filename based on the provided URL, along with a small random hash. + + Args: + url (str): The URL from which to create a filename. + + Returns: + str: A unique PNG filename based on the URL and a random hash. + """ + + # Split the URL into its components + parsed_url = urlparse(url) + + # Generate a 4-character random hash from lowercase letters and digits + random_hash = "".join(random.choices(string.ascii_lowercase + string.digits, k=6)) + + return f"{'.'.join(parsed_url.netloc.split('.')[-2:])}-{random_hash}.png" + + +def SeleniumBrowser(**kwargs): # Function that loads the web driver + """ + This function launches a headless Selenium browser based on the specified 'browser'. The available options are 'edge', 'firefox', and 'chrome'. + + Parameters: + browser (str): A string specifying which browser to launch. Defaults to 'firefox'. + download_dir (str): A path to where downloaded files are stored. Defaults to None + resolution (tuple): A tuple of size 2 for screen resolution in the order of width and height. Defaults to (1920,1080) + + Returns: + webdriver: An instance of the Selenium WebDriver based on the specified browser. User can open a new page by `webdriver.get('https://www.microsoft.com')`. 
+ """ + + # Load the arguments from kwargs + browser = kwargs.get("browser", "edge") + download_dir = kwargs.get("download_dir", tempfile.gettempdir()) + if not download_dir: + download_dir = tempfile.gettempdir() + + browser_res = kwargs.get("resolution", (1920, 1080)) + + def get_headless_options(download_dir, options): + options.headless = True + options.add_argument("--headless") + options.add_argument(f"--window-size={browser_res[0]},{browser_res[1]}") + options.add_argument("--downloadsEnabled") + if download_dir: + options.set_preference("download.default_directory", download_dir) + return options + + if browser.lower() == "edge": + options = EdgeOptions() + options.use_chromium = True # Ensure we're using the Chromium-based version of Edge + options.headless = True + options.add_argument("--headless") + options.add_argument(f"--window-size={browser_res[0]},{browser_res[1]}") + options.add_argument("--downloadsEnabled") + + prefs = { + "download.default_directory": download_dir, + "download.prompt_for_download": False, # Disable download prompt + "download.directory_upgrade": True, # Enable directory upgrade + "safebrowsing.enabled": True, # Enable safe browsing + } + options.add_experimental_option("prefs", prefs) + # Instantiate the EdgeService object + edge_service = EdgeService() + # Instantiate the Edge WebDriver with the configured options + driver = webdriver.Edge(options=options, service=edge_service) + + elif browser.lower() == "firefox": + # Instantiate the Firefox Profile to specify options + profile = FirefoxProfile() + profile.set_preference("browser.download.folderList", 2) # Custom location + profile.set_preference("browser.download.dir", download_dir) + profile.set_preference("browser.download.useDownloadDir", True) + profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") # MIME type + profile.set_preference("javascript.enabled", False) + profile.update_preferences() + options = FirefoxOptions() + options.profile 
= profile + options.set_capability("se:downloadsEnabled", True) + + # Instantiate the Firefox WebDriver with the configured options + driver = webdriver.Firefox(options=get_headless_options(download_dir, options)) + + elif browser.lower() == "chrome": + # Instantiate the Chrome Options + options = ChromeOptions() + prefs = { + "download.default_directory": download_dir, + "download.prompt_for_download": False, # Disable download prompt + "download.directory_upgrade": True, # Enable directory upgrade + "safebrowsing.enabled": True, # Enable safe browsing + } + options.add_experimental_option("prefs", prefs) + # Instantiate the Chrome WebDriver with the configured options + driver = webdriver.Chrome(options=get_headless_options(download_dir, options)) + + else: + raise (f"Unknown browser type {browser}") + + # Ensure that downloads are permitted + driver.capabilities["se:downloadsEnablead"] = True + # Ensure that the window is at the expected size + driver.set_window_size(browser_res[0], browser_res[1]) + + return driver + + +class SeleniumBrowserWrapper: # A wrapper to bridge compatibility between SimpleTextBrowser and SeleniumBrowser + """ + SeleniumBrowserWrapper class is a wrapper that manages the interaction with a Selenium web driver. + It provides methods to control the browser, set up the viewport size, and download files. + + Parameters: + - start_page (Optional[str]): The initial URL of the web page to load. Defaults to "about:blank". + - viewport_size (Optional[int]): The width of the viewport in pixels. Defaults to 1024 * 8. + - downloads_folder (Optional[Union[str, None]]): The directory where downloaded files will be saved. If set to `None`, default downloads folder will be used. + - bing_api_key (Optional[Union[str, None]]): The API key for Bing search engine. + - request_kwargs (Optional[Union[Dict[str, Any], None]]): Additional keyword arguments that can be passed for customization. + - web_driver (Optional[str]): The type of web driver to use. 
Defaults to 'edge'. + + Attributes: + - start_page (str): The initial URL of the web page to load. + - viewport_size (int): The width of the viewport in pixels. + - downloads_folder (Union[str, None]): The directory where downloaded files will be saved. + - history (List[str]): A list containing the URLs visited by the browser. + - page_title (Optional[str]): The title of the current web page. + - viewport_current_page (int): The index of the current web page in relation to all pages loaded. + - viewport_pages (List[Tuple[int, int]]): A list containing tuples of width and height for each viewed web page. + - bing_api_key (Optional[str]): The API key for Bing search engine. + - request_kwargs (Optional[Union[Dict[str, Any], None]]): Additional keyword arguments passed during instantiation. + - _page_content (str): The content of the current web page. + - driver: An instance of SeleniumBrowser class that manages the browser interaction. + + Notes: + - Viewport Size and Pages: The concept of viewport size and pagination doesn't directly apply to Selenium as it does in a text-based browser. Selenium interacts with the whole page. However, actions like scrolling can be simulated. + - Downloads Folder: This is handled through ChromeOptions if you need to set a default download directory. + - History Management: This wrapper maintains a simple history of visited URLs for compatibility with the SimpleTextBrowser's API. + - Page Content: Selenium's page_source property provides the HTML content of the current page, making the distinction between viewport and page content less relevant. 
+ + """ + + def __init__( + self, + start_page: Optional[str] = None, + viewport_size: Optional[int] = 1024 * 8, + downloads_folder: Optional[Union[str, None]] = None, + bing_api_key: Optional[Union[str, None]] = None, + request_kwargs: Optional[Union[Dict[str, Any], None]] = None, + browser: Optional[str] = "edge", + page_load_time: Optional[int] = 6, + resolution: Optional[Tuple] = (1920, 1080), + render_text: Optional[bool] = False, + ): + self.start_page: str = start_page if start_page else "about:blank" + self.downloads_folder = downloads_folder + self.history: List[str] = list() + self.page_title: Optional[str] = None + self.viewport_current_page = 0 + self.viewport_pages: List[Tuple[int, int]] = list() + self.bing_api_key = bing_api_key + self.request_kwargs = request_kwargs + self.page_load_time = page_load_time + self._page_content = "" + self.window_width = resolution[0] + self.window_height = resolution[1] + self.viewport_size = resolution[1] # We override this from SimpleTextBrowser to match the browser window height + self.render_text = render_text # Just in case for functionality purposes + + # Initialize the WebDriver + self.driver = SeleniumBrowser(browser=browser, download_dir=downloads_folder, resolution=resolution) + if start_page: + self.set_address(self.start_page) + + @property + def address(self) -> str: + """Return the address of the current page.""" + return self.history[-1] if self.history else "about:blank" + + @property + def viewport(self) -> str: + """Return the content of the current viewport.""" + return self._page_content + + @property + def page_content(self) -> str: + """Return the full contents of the current page.""" + return self.viewport # In Selenium, viewport essentially contains the full page content + + def set_address(self, uri_or_path: str) -> None: + """Navigate to a given URI and update history.""" + if not uri_or_path.startswith("http:") and not uri_or_path.startswith("https:"): + uri_or_path = urljoin(self.address, 
uri_or_path) + + self.history.append(uri_or_path) + + # Handle special URIs + if uri_or_path == "about:blank": + self.set_page_content("") + elif uri_or_path.startswith("bing:"): + self._bing_search(uri_or_path[len("bing:") :].strip()) + else: + if not uri_or_path.startswith("http:") and not uri_or_path.startswith("https:"): + uri_or_path = urljoin(self.address, uri_or_path) + self.history[-1] = uri_or_path # Update the address with the fully-qualified path + # Navigate to the specified URI or path + self._fetch_page(uri_or_path) + + self.viewport_current_page = 0 + self._split_pages() + + def visit_page(self, path_or_uri: str) -> str: + """Navigate to a page and return its content.""" + self.set_address(path_or_uri) + return self.viewport + + def page_down(self) -> None: + """Simulate page down action.""" + # Simulate pressing Page Down key + self.driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN) + + def page_up(self) -> None: + """Simulate page up action.""" + # Simulate pressing Page Up key + self.driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_UP) + + def _update_page_content(self) -> None: + """Update internal content state, including page title.""" + self.page_title = self.driver.title + + def close(self): + """Close the browser.""" + self.driver.quit() + + def _split_pages(self) -> None: + # Page scroll position + int(self.driver.execute_script("return document.documentElement.scrollHeight")) + + # Grab the current page height based on the scrollbar + self.page_height = self.driver.execute_script("return window.pageYOffset + window.innerHeight") + + # Calculate the total number of pages currently rendered + self.page_count = ceil(self.window_height / self.page_height) + + # Split only regular pages + if not self.address.startswith("http:") and not self.address.startswith("https:"): + self.viewport_pages = [(0, len(self._page_content))] + return + + # Handle empty pages + if len(self._page_content) == 0: + self.viewport_pages = 
[(0, 0)] + return + + # Break the viewport into pages + self.viewport_pages = [] + start_idx = 0 + while start_idx < self.page_height: + end_idx = min(start_idx + self.viewport_size, self.page_height) # type: ignore[operator] + self.viewport_pages.append((start_idx, end_idx)) + start_idx = end_idx + + return + + def _bing_api_call(self, query: str) -> Dict[str, Dict[str, List[Dict[str, Union[str, Dict[str, str]]]]]]: + # Make sure the key was set + if self.bing_api_key is None: + raise ValueError("Missing Bing API key.") + + # Prepare the request parameters + request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {} + + if "headers" not in request_kwargs: + request_kwargs["headers"] = {} + request_kwargs["headers"]["Ocp-Apim-Subscription-Key"] = self.bing_api_key + + if "params" not in request_kwargs: + request_kwargs["params"] = {} + request_kwargs["params"]["q"] = query + request_kwargs["params"]["textDecorations"] = False + request_kwargs["params"]["textFormat"] = "raw" + request_kwargs["stream"] = False + + # Make the request + response = requests.get("https://api.bing.microsoft.com/v7.0/search", **request_kwargs) + response.raise_for_status() + results = response.json() + + return results # type: ignore[no-any-return] + + def _bing_search(self, query: str) -> None: + results = self._bing_api_call(query) + self.bing_results = results + web_snippets: List[str] = list() + idx = 0 + for page in results["webPages"]["value"]: + idx += 1 + web_snippets.append(f"{idx}. [{page['name']}]({page['url']})\n{page['snippet']}") + if "deepLinks" in page: + for dl in page["deepLinks"]: + idx += 1 + web_snippets.append( + f"{idx}. [{dl['name']}]({dl['url']})\n{dl['snippet'] if 'snippet' in dl else ''}" # type: ignore[index] + ) + + news_snippets = list() + if "news" in results: + for page in results["news"]["value"]: + idx += 1 + news_snippets.append(f"{idx}. 
[{page['name']}]({page['url']})\n{page['description']}") + + self.page_title = f"{query} - Search" + + content = ( + f"A Bing search for '{query}' found {len(web_snippets) + len(news_snippets)} results:\n\n## Web Results\n" + + "\n\n".join(web_snippets) + ) + if len(news_snippets) > 0: + content += "\n\n## News Results:\n" + "\n\n".join(news_snippets) + + self.set_page_content(content) + + def set_page_content(self, content): + """Sets the text content of the current page.""" + self._page_content = content + + # Your custom HTML content + custom_html_content = "" + content.replace("\n", "
") + "" + + # Create a temporary HTML file + with tempfile.NamedTemporaryFile("w", delete=False, suffix=".html") as tmp_file: + tmp_file.write(custom_html_content) + html_file_path = tmp_file.name + + # Navigate to the file + self.driver.get(f"file://{html_file_path}") + + def download(self, uri_or_path: str) -> None: + """Download from a given URI""" + download_using_requests(self.driver, self.downloads_folder, os.path.basename(uri_or_path.rstrip("/"))) + + def _get_headers(self): + def parse_list_to_dict(lst): + result_dict = {} + for item in lst: + key, value = item.split(": ", 1) + # Attempt to load JSON content if present + try: + value_json = json.loads(value) + result_dict[key] = value_json + except json.JSONDecodeError: + # Handle non-JSON value + result_dict[key] = value + return result_dict + + headers = self.driver.execute_script( + "var req = new XMLHttpRequest();req.open('GET', document.location, false);req.send(null);return req.getAllResponseHeaders()" + ) + headers = headers.splitlines() + headers = parse_list_to_dict(headers) + return headers + + def _fetch_page(self, url: str) -> None: + try: + self.driver.get(url) + self.driver.implicitly_wait(self.page_load_time) + self.history.append(url) + headers = self._get_headers() + + self.page_title = self.driver.title + + # We can't get response codes without using a proxy or using requests in a double call + content_type = headers.get("content-type", "") + for ct in ["text/html", "text/plain", "application/pdf"]: + if ct in content_type.lower(): + content_type = ct + break + + if content_type == "text/html": + html = self.driver.page_source + soup = BeautifulSoup(html, "html.parser") + + # Remove javascript and style blocks + for script in soup(["script", "style"]): + script.extract() + + # Convert to markdown -- Wikipedia gets special attention to get a clean version of the page + if url.startswith("https://en.wikipedia.org/"): + body_elm = soup.find("div", {"id": "mw-content-text"}) + title_elm = 
soup.find("span", {"class": "mw-page-title-main"}) + + if body_elm: + # What's the title + main_title = soup.title.string + if title_elm and len(title_elm) > 0: + main_title = title_elm.string + webpage_text = ( + "# " + main_title + "\n\n" + markdownify.MarkdownConverter().convert_soup(body_elm) + ) + else: + webpage_text = markdownify.MarkdownConverter().convert_soup(soup) + else: + webpage_text = markdownify.MarkdownConverter().convert_soup(soup) + + # Convert newlines + webpage_text = re.sub(r"\r\n", "\n", webpage_text) + + # Remove excessive blank lines + if self.render_text: + self.page_title = soup.title.string + self.set_page_content(webpage_text.strip()) + else: + self._page_content = webpage_text + + elif content_type == "text/plain": + html = self.driver.page_source + soup = BeautifulSoup(html, "html.parser") + plain_text = soup.prettify() + if self.render_text: + self.page_title = None + self.set_page_content(plain_text) + else: + self._page_content = plain_text + + elif IS_PDF_CAPABLE and content_type == "application/pdf": + download_using_requests(self.driver, self.downloads_folder, os.path.basename(url)) + plain_text = extract_pdf_text(os.path.join(self.downloads_folder, os.path.basename(url))) + if self.render_text: + self.page_title = None + self.set_page_content(plain_text) + else: + self._page_content = plain_text + + elif self.downloads_folder is not None: + # Try producing a safe filename + fname = None + try: + fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip() + except NameError: + pass + + # No suitable name, so make one + if fname is None: + extension = mimetypes.guess_extension(content_type) + if extension is None: + extension = ".download" + fname = str(uuid.uuid4()) + extension + + # Open a file for writing + download_path = os.path.abspath(os.path.join(self.downloads_folder, fname)) + download_using_requests(self.driver, self.downloads_folder, fname) + + # Return a page describing what just happened + 
if self.render_text: + self.page_title = "Download complete." + self.set_page_content(f"Downloaded '{url}' to '{download_path}'.") + else: + self._page_content = f"Downloaded '{url}' to '{download_path}'." + + elif self.render_text: + self.page_title = f"Error - Unsupported Content-Type '{content_type}'" + self.set_page_content(self.page_title) + else: + self._page_content = None + except requests.exceptions.RequestException as e: self.page_title = "Error" - self._set_page_content(str(e)) + self.set_page_content(str(e)) diff --git a/notebook/agentchat_surfer_edge.ipynb b/notebook/agentchat_surfer_edge.ipynb new file mode 100644 index 00000000000..ce4015f7f60 --- /dev/null +++ b/notebook/agentchat_surfer_edge.ipynb @@ -0,0 +1,796 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# WebSurfer Agent with Headless GUI-based Browsing\n", + "\n", + "This notebook is derived from the standard [WebSurferAgent Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_surfer.ipynb) for the purposes of demonstrating coverage." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Requirements\n", + "\n", + "AutoGen requires `Python>=3.8`. To run this notebook example, please install:\n", + "```bash\n", + "pip install pyautogen selenium markdownify pillow pdfminer.six beautifulsoup4 arxiv\n", + "```\n", + "or\n", + "```bash\n", + "pip install \"pyautogen[websurfer]\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ensure that we have the WebDrivers present for Selenium\n", + "\n", + "*EDIT*:\n", + "[Selenium Manager](https://www.selenium.dev/documentation/selenium_manager/) states:\n", + "\"Selenium Manager is a command-line tool implemented in Rust that provides automated driver and browser management for Selenium. 
Selenium bindings use this tool by default, so you do not need to download it or add anything to your code or do anything else to use it.\"\n", + "\n", + "Therefore the folling instructions should not be needed:\n", + "Following the instructions in [Selenium Documentation](https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location/#download-the-driver), \n", + "we first download the web driver for our browser of choice, or all 3: [Edge](https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/?form=MA13LH#downloads), [Firefox](https://github.com/mozilla/geckodriver/releases), [Chrome](https://chromedriver.chromium.org/downloads).~~" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Neither powershell nor pwsh is installed.\n" + ] + } + ], + "source": [ + "# %%capture --no-stderr\n", + "import os\n", + "import logging\n", + "import autogen\n", + "from time import sleep\n", + "\n", + "from autogen.agentchat.contrib.web_surfer import WebSurferAgent\n", + "from autogen.agentchat.conversable_agent import ConversableAgent\n", + "from autogen.agentchat.user_proxy_agent import UserProxyAgent\n", + "from autogen.oai import config_list_from_json\n", + "from autogen.browser_utils import display_binary_image\n", + "\n", + "# Get the logger instance for the current module (__name__).\n", + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set your API Endpoint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The [`config_list_from_json`](https://microsoft.github.io/autogen/docs/reference/oai/openai_utils#config_list_from_json) function loads a list of configurations from an environment variable or a json file.\n", + "\n", + "It first looks for environment variable \"OAI_CONFIG_LIST\" which needs to be a valid json string. 
If that variable is not found, it then looks for a json file named \"OAI_CONFIG_LIST\". It filters the configs by models (you can filter by other keys as well).\n", + "\n", + "The WebSurferAgent uses a combination of models. GPT-4 and GPT-3.5-turbo-16 are recommended.\n", + "\n", + "Your json config should look something like the following:\n", + "```json\n", + "[\n", + " {\n", + " \"model\": \"gpt-4\",\n", + " \"api_key\": \"\"\n", + " },\n", + " {\n", + " \"model\": \"gpt-3.5-turbo-16k\",\n", + " \"api_key\": \"\"\n", + " }\n", + "]\n", + "```\n", + "\n", + "If you open this notebook in colab, you can upload your files by clicking the file icon on the left panel and then choose \"upload file\" icon.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "llm_config = {\n", + " \"timeout\": 600,\n", + " \"cache_seed\": 44, # change the seed for different trials\n", + " \"config_list\": config_list_from_json(\n", + " \"OAI_CONFIG_LIST\",\n", + " filter_dict={\"model\": [\"gpt-3.5-turbo\"]},\n", + " ),\n", + " \"temperature\": 0,\n", + "}\n", + "\n", + "summarizer_llm_config = {\n", + " \"timeout\": 600,\n", + " \"cache_seed\": 44, # change the seed for different trials\n", + " \"config_list\": config_list_from_json(\n", + " \"OAI_CONFIG_LIST\",\n", + " filter_dict={\"model\": [\"gpt-3.5-turbo\"]},\n", + " ),\n", + " \"temperature\": 0,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure Bing\n", + "\n", + "For WebSurferAgent to be reasonably useful, it needs to be able to search the web -- and that means it needs a Bing API key. \n", + "You can read more about how to get an API on the [Bing Web Search API](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) page.\n", + "\n", + "Once you have your key, either set it as the `BING_API_KEY` system environment variable, or simply input your key below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "bing_api_key = os.environ[\"BING_API_KEY\"] if \"BING_API_KEY\" in os.environ else \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Construct Agents\n", + "\n", + "We now create out WebSurferAgent, and a UserProxyAgent to surf the web, but using a graphical based browser required for many use-cases. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "web_surfer = WebSurferAgent(\n", + " \"web_surfer\",\n", + " llm_config=llm_config,\n", + " summarizer_llm_config=summarizer_llm_config,\n", + " browser_config={\n", + " \"type\": \"selenium\", # *NEW* Here we specify that we intend to use our headless GUI browser. The default setting is \"text\".\n", + " \"browser\": \"edge\", # *NEW* We'll use the edge browser for these tests. Choices include 'edge', 'firefox', and 'chrome'\n", + " \"resolution\": (1400, 900), # *NEW* we specify the browser window size. 
The default is (1920,5200)\n", + " \"render_text\": False, # *NEW* We still have the option to convert the output to text and render it in the browser\n", + " \"bing_api_key\": bing_api_key,\n", + " },\n", + ")\n", + "\n", + "user_proxy = UserProxyAgent(\n", + " \"user_proxy\",\n", + " human_input_mode=\"NEVER\",\n", + " code_execution_config=False,\n", + " default_auto_reply=\"\",\n", + " is_termination_msg=lambda x: True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook Content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Part 1: Search, summarize\n", + "- Search for information about Microsoft AutoGen\n", + "- Summarize the results\n", + "- Visit the Getting Started Docs page" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to web_surfer):\n", + "\n", + "\n", + "Search the web for information about Microsoft AutoGen\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION informational_web_search...\u001b[0m\n", + "\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n", + "\n", + "Address: bing: Microsoft AutoGen\n", + "Title: Microsoft AutoGen - Search\n", + "Viewport position: Showing page 1 of 1.\n", + "=======================\n", + "A Bing search for 'Microsoft AutoGen' found 8 results:\n", + "\n", + "## Web Results\n", + "1. 
[AutoGen: Enabling next-generation large language model applications](https://www.microsoft.com/en-us/research/blog/autogen-enabling-next-generation-large-language-model-applications/)\n", + "AutoGen is a Python package that simplifies the orchestration, optimization, and automation of large language model applications. It enables customizable and conversable agents that integrate with humans, tools, and other agents to solve tasks using GPT-4 and other advanced LLMs. Learn how to use AutoGen for code-based question answering, supply-chain optimization, conversational chess, and more.\n", + "\n", + "2. [GitHub - microsoft/autogen: Enable Next-Gen Large Language Model ...](https://github.com/microsoft/autogen)\n", + "AutoGen is a framework that enables the development of large language model applications using multiple agents that can converse with each other to solve tasks. It supports diverse conversation patterns, enhanced LLM inference, and customizable and conversable agents.\n", + "\n", + "3. [Getting Started | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/docs/Getting-Started/)\n", + "AutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. They can operate in various modes that employ combinations of LLMs, human inputs, and tools. Main Features\n", + "\n", + "4. [AutoGen | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/)\n", + "AutoGen is a tool that enables next-gen large language model applications by providing a high-level abstraction for building diverse and enhanced LLM workflows. It offers a collection of working systems for various domains and complexities, as well as enhanced LLM inference and optimization APIs.\n", + "\n", + "5. 
[AutoGen Studio: Interactively Explore Multi-Agent Workflows](https://microsoft.github.io/autogen/blog/2023/12/01/AutoGenStudio/)\n", + "AutoGen has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: AutoGen Studio.\n", + "\n", + "6. [[2308.08155] AutoGen: Enabling Next-Gen LLM Applications via Multi ...](https://arxiv.org/abs/2308.08155)\n", + "AutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to perform tasks using various types of language models (LLMs). The framework supports natural language and code-based conversation patterns, and is effective for diverse applications such as mathematics, coding, question answering, and more.\n", + "\n", + "7. [Mastering AutoGen: A Comprehensive Guide to Next-Generation ... - Medium](https://medium.com/@krtarunsingh/mastering-autogen-a-comprehensive-guide-to-next-generation-language-model-applications-b375d9b4dc6d)\n", + "AutoGen is a framework by Microsoft that allows you to create applications that leverage large language models (LLMs) with multi-agent conversations, diverse patterns, and enhanced inference. Learn how to set up AutoGen, use its architecture, and apply its features in this comprehensive guide by Tarun Singh.\n", + "\n", + "8. [arXiv:2308.08155v2 [cs.AI] 3 Oct 2023](https://arxiv.org/pdf/2308.08155.pdf)\n", + "AutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to solve tasks using multiple languages, tools, and human inputs. 
The framework supports flexible conversation patterns and natural or code-based programming for diverse applications of complexities and LLM capacities.\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': '\\nSearch the web for information about Microsoft AutoGen\\n', 'role': 'assistant'}, {'content': \"Address: bing: Microsoft AutoGen\\nTitle: Microsoft AutoGen - Search\\nViewport position: Showing page 1 of 1.\\n=======================\\nA Bing search for 'Microsoft AutoGen' found 8 results:\\n\\n## Web Results\\n1. [AutoGen: Enabling next-generation large language model applications](https://www.microsoft.com/en-us/research/blog/autogen-enabling-next-generation-large-language-model-applications/)\\nAutoGen is a Python package that simplifies the orchestration, optimization, and automation of large language model applications. It enables customizable and conversable agents that integrate with humans, tools, and other agents to solve tasks using GPT-4 and other advanced LLMs. Learn how to use AutoGen for code-based question answering, supply-chain optimization, conversational chess, and more.\\n\\n2. [GitHub - microsoft/autogen: Enable Next-Gen Large Language Model ...](https://github.com/microsoft/autogen)\\nAutoGen is a framework that enables the development of large language model applications using multiple agents that can converse with each other to solve tasks. It supports diverse conversation patterns, enhanced LLM inference, and customizable and conversable agents.\\n\\n3. [Getting Started | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/docs/Getting-Started/)\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. 
They can operate in various modes that employ combinations of LLMs, human inputs, and tools. Main Features\\n\\n4. [AutoGen | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/)\\nAutoGen is a tool that enables next-gen large language model applications by providing a high-level abstraction for building diverse and enhanced LLM workflows. It offers a collection of working systems for various domains and complexities, as well as enhanced LLM inference and optimization APIs.\\n\\n5. [AutoGen Studio: Interactively Explore Multi-Agent Workflows](https://microsoft.github.io/autogen/blog/2023/12/01/AutoGenStudio/)\\nAutoGen has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: AutoGen Studio.\\n\\n6. [[2308.08155] AutoGen: Enabling Next-Gen LLM Applications via Multi ...](https://arxiv.org/abs/2308.08155)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to perform tasks using various types of language models (LLMs). The framework supports natural language and code-based conversation patterns, and is effective for diverse applications such as mathematics, coding, question answering, and more.\\n\\n7. [Mastering AutoGen: A Comprehensive Guide to Next-Generation ... - Medium](https://medium.com/@krtarunsingh/mastering-autogen-a-comprehensive-guide-to-next-generation-language-model-applications-b375d9b4dc6d)\\nAutoGen is a framework by Microsoft that allows you to create applications that leverage large language models (LLMs) with multi-agent conversations, diverse patterns, and enhanced inference. Learn how to set up AutoGen, use its architecture, and apply its features in this comprehensive guide by Tarun Singh.\\n\\n8. 
[arXiv:2308.08155v2 [cs.AI] 3 Oct 2023](https://arxiv.org/pdf/2308.08155.pdf)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to solve tasks using multiple languages, tools, and human inputs. The framework supports flexible conversation patterns and natural or code-based programming for diverse applications of complexities and LLM capacities.\", 'role': 'user'}], summary=\"Address: bing: Microsoft AutoGen\\nTitle: Microsoft AutoGen - Search\\nViewport position: Showing page 1 of 1.\\n=======================\\nA Bing search for 'Microsoft AutoGen' found 8 results:\\n\\n## Web Results\\n1. [AutoGen: Enabling next-generation large language model applications](https://www.microsoft.com/en-us/research/blog/autogen-enabling-next-generation-large-language-model-applications/)\\nAutoGen is a Python package that simplifies the orchestration, optimization, and automation of large language model applications. It enables customizable and conversable agents that integrate with humans, tools, and other agents to solve tasks using GPT-4 and other advanced LLMs. Learn how to use AutoGen for code-based question answering, supply-chain optimization, conversational chess, and more.\\n\\n2. [GitHub - microsoft/autogen: Enable Next-Gen Large Language Model ...](https://github.com/microsoft/autogen)\\nAutoGen is a framework that enables the development of large language model applications using multiple agents that can converse with each other to solve tasks. It supports diverse conversation patterns, enhanced LLM inference, and customizable and conversable agents.\\n\\n3. [Getting Started | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/docs/Getting-Started/)\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. 
They can operate in various modes that employ combinations of LLMs, human inputs, and tools. Main Features\\n\\n4. [AutoGen | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/)\\nAutoGen is a tool that enables next-gen large language model applications by providing a high-level abstraction for building diverse and enhanced LLM workflows. It offers a collection of working systems for various domains and complexities, as well as enhanced LLM inference and optimization APIs.\\n\\n5. [AutoGen Studio: Interactively Explore Multi-Agent Workflows](https://microsoft.github.io/autogen/blog/2023/12/01/AutoGenStudio/)\\nAutoGen has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: AutoGen Studio.\\n\\n6. [[2308.08155] AutoGen: Enabling Next-Gen LLM Applications via Multi ...](https://arxiv.org/abs/2308.08155)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to perform tasks using various types of language models (LLMs). The framework supports natural language and code-based conversation patterns, and is effective for diverse applications such as mathematics, coding, question answering, and more.\\n\\n7. [Mastering AutoGen: A Comprehensive Guide to Next-Generation ... - Medium](https://medium.com/@krtarunsingh/mastering-autogen-a-comprehensive-guide-to-next-generation-language-model-applications-b375d9b4dc6d)\\nAutoGen is a framework by Microsoft that allows you to create applications that leverage large language models (LLMs) with multi-agent conversations, diverse patterns, and enhanced inference. Learn how to set up AutoGen, use its architecture, and apply its features in this comprehensive guide by Tarun Singh.\\n\\n8. 
[arXiv:2308.08155v2 [cs.AI] 3 Oct 2023](https://arxiv.org/pdf/2308.08155.pdf)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to solve tasks using multiple languages, tools, and human inputs. The framework supports flexible conversation patterns and natural or code-based programming for diverse applications of complexities and LLM capacities.\", cost=({'total_cost': 0}, {'total_cost': 0}), human_input=[])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Bing search is a special case and we return the text in the same way as the SimpleTextBrowser\n", + "\n", + "task1 = \"\"\"\n", + "Search the web for information about Microsoft AutoGen\n", + "\"\"\"\n", + "\n", + "user_proxy.initiate_chat(web_surfer, message=task1)\n", + "\n", + "# Note that these results are also accessible in JSON format with `web_surfer.browser.bing_results`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to web_surfer):\n", + "\n", + "Summarize these results\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION summarize_page...\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The token limit (4096) of the WebSurferAgent.summarizer_llm_config, is below the recommended 16k.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n", + "\n", + "AutoGen is a framework developed by Microsoft Research to simplify the orchestration, optimization, and automation of large language model 
(LLM) workflows. The framework offers customizable and conversable agents that utilize advanced LLM capabilities, such as GPT-4, while also integrating with humans and tools to address limitations and enhance performance. As developers create more complex LLM-based applications, the workflows become intricate, requiring significant effort and expertise to design and implement. Automating these workflows using AutoGen can streamline the process and improve efficiency, enabling the creation of next-generation applications that leverage the full potential of LLMs. The framework supports conversations between multiple agents through automated chat, providing a solution to the challenge of orchestrating optimal workflows in a vast and complex design space.\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': '\\nSearch the web for information about Microsoft AutoGen\\n', 'role': 'assistant'}, {'content': \"Address: bing: Microsoft AutoGen\\nTitle: Microsoft AutoGen - Search\\nViewport position: Showing page 1 of 1.\\n=======================\\nA Bing search for 'Microsoft AutoGen' found 8 results:\\n\\n## Web Results\\n1. [AutoGen: Enabling next-generation large language model applications](https://www.microsoft.com/en-us/research/blog/autogen-enabling-next-generation-large-language-model-applications/)\\nAutoGen is a Python package that simplifies the orchestration, optimization, and automation of large language model applications. It enables customizable and conversable agents that integrate with humans, tools, and other agents to solve tasks using GPT-4 and other advanced LLMs. Learn how to use AutoGen for code-based question answering, supply-chain optimization, conversational chess, and more.\\n\\n2. 
[GitHub - microsoft/autogen: Enable Next-Gen Large Language Model ...](https://github.com/microsoft/autogen)\\nAutoGen is a framework that enables the development of large language model applications using multiple agents that can converse with each other to solve tasks. It supports diverse conversation patterns, enhanced LLM inference, and customizable and conversable agents.\\n\\n3. [Getting Started | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/docs/Getting-Started/)\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. They can operate in various modes that employ combinations of LLMs, human inputs, and tools. Main Features\\n\\n4. [AutoGen | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/)\\nAutoGen is a tool that enables next-gen large language model applications by providing a high-level abstraction for building diverse and enhanced LLM workflows. It offers a collection of working systems for various domains and complexities, as well as enhanced LLM inference and optimization APIs.\\n\\n5. [AutoGen Studio: Interactively Explore Multi-Agent Workflows](https://microsoft.github.io/autogen/blog/2023/12/01/AutoGenStudio/)\\nAutoGen has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: AutoGen Studio.\\n\\n6. [[2308.08155] AutoGen: Enabling Next-Gen LLM Applications via Multi ...](https://arxiv.org/abs/2308.08155)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to perform tasks using various types of language models (LLMs). 
The framework supports natural language and code-based conversation patterns, and is effective for diverse applications such as mathematics, coding, question answering, and more.\\n\\n7. [Mastering AutoGen: A Comprehensive Guide to Next-Generation ... - Medium](https://medium.com/@krtarunsingh/mastering-autogen-a-comprehensive-guide-to-next-generation-language-model-applications-b375d9b4dc6d)\\nAutoGen is a framework by Microsoft that allows you to create applications that leverage large language models (LLMs) with multi-agent conversations, diverse patterns, and enhanced inference. Learn how to set up AutoGen, use its architecture, and apply its features in this comprehensive guide by Tarun Singh.\\n\\n8. [arXiv:2308.08155v2 [cs.AI] 3 Oct 2023](https://arxiv.org/pdf/2308.08155.pdf)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to solve tasks using multiple languages, tools, and human inputs. The framework supports flexible conversation patterns and natural or code-based programming for diverse applications of complexities and LLM capacities.\", 'role': 'user'}, {'content': 'Summarize these results', 'role': 'assistant'}, {'content': 'AutoGen is a framework developed by Microsoft Research to simplify the orchestration, optimization, and automation of large language model (LLM) workflows. The framework offers customizable and conversable agents that utilize advanced LLM capabilities, such as GPT-4, while also integrating with humans and tools to address limitations and enhance performance. As developers create more complex LLM-based applications, the workflows become intricate, requiring significant effort and expertise to design and implement. Automating these workflows using AutoGen can streamline the process and improve efficiency, enabling the creation of next-generation applications that leverage the full potential of LLMs. 
The framework supports conversations between multiple agents through automated chat, providing a solution to the challenge of orchestrating optimal workflows in a vast and complex design space.', 'role': 'user'}], summary='AutoGen is a framework developed by Microsoft Research to simplify the orchestration, optimization, and automation of large language model (LLM) workflows. The framework offers customizable and conversable agents that utilize advanced LLM capabilities, such as GPT-4, while also integrating with humans and tools to address limitations and enhance performance. As developers create more complex LLM-based applications, the workflows become intricate, requiring significant effort and expertise to design and implement. Automating these workflows using AutoGen can streamline the process and improve efficiency, enabling the creation of next-generation applications that leverage the full potential of LLMs. The framework supports conversations between multiple agents through automated chat, providing a solution to the challenge of orchestrating optimal workflows in a vast and complex design space.', cost=({'total_cost': 0}, {'total_cost': 0}), human_input=[])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task2 = \"Summarize these results\"\n", + "user_proxy.initiate_chat(web_surfer, message=task2, clear_history=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to web_surfer):\n", + "\n", + "Click the 'Getting Started' result\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION visit_page...\u001b[0m\n", + 
"\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n", + "\n", + "Address: https://microsoft.github.io/autogen/docs/Getting-Started/\n", + "Title: Getting Started | AutoGen\n", + "Viewport position: Showing page 1 of 1.\n", + "=======================\n", + "\n", + "\n", + "\n", + "Getting Started | AutoGen\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\n", + "* [Gallery](/autogen/docs/Gallery)\n", + "[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\n", + "[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\n", + "* [Installation](/autogen/docs/installation/)\n", + "* [LLM Configuration](/autogen/docs/llm_configuration)\n", + "* [Use Cases](#)\n", + "* [Contributing](/autogen/docs/Contribute)\n", + "* [Research](/autogen/docs/Research)\n", + "* [Migration Guide](/autogen/docs/Migration-Guide)\n", + "* \n", + "* Getting Started\n", + "On this pageGetting Started\n", + "===============\n", + "\n", + "\n", + "AutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. 
They can operate in various modes that employ combinations of LLMs, human inputs, and tools.\n", + "\n", + "\n", + "![AutoGen Overview](/autogen/assets/images/autogen_agentchat-250ca64b77b87e70d34766a080bf6ba8.png)\n", + "\n", + "\n", + "### Main Features[​](#main-features \"Direct link to Main Features\")\n", + "\n", + "\n", + "* AutoGen enables building next-gen LLM applications based on [multi-agent conversations](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) with minimal effort. It simplifies the orchestration, automation, and optimization of a complex LLM workflow. It maximizes the performance of LLM models and overcomes their weaknesses.\n", + "* It supports [diverse conversation patterns](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#supporting-diverse-conversation-patterns) for complex workflows. With customizable and conversable agents, developers can use AutoGen to build a wide range of conversation patterns concerning conversation autonomy,\n", + "the number of agents, and agent conversation topology.\n", + "* It provides a collection of working systems with different complexities. These systems span a [wide range of applications](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#diverse-applications-implemented-with-autogen) from various domains and complexities. This demonstrates how AutoGen can easily support diverse conversation patterns.\n", + "* AutoGen provides [enhanced LLM inference](https://microsoft.github.io/autogen/docs/Use-Cases/enhanced_inference#api-unification). 
It offers utilities like API unification and caching, and advanced usage patterns, such as error handling, multi-config inference, context programming, etc.\n", + "\n", + "\n", + "AutoGen is powered by collaborative [research studies](/autogen/docs/Research) from Microsoft, Penn State University, and University of Washington.\n", + "\n", + "\n", + "### Quickstart[​](#quickstart \"Direct link to Quickstart\")\n", + "\n", + "\n", + "Install from pip: `pip install pyautogen`. Find more options in [Installation](/autogen/docs/installation/).\n", + "For [code execution](/autogen/docs/FAQ#code-execution), we strongly recommend installing the python docker package, and using docker.\n", + "\n", + "\n", + "#### Multi-Agent Conversation Framework[​](#multi-agent-conversation-framework \"Direct link to Multi-Agent Conversation Framework\")\n", + "\n", + "\n", + "Autogen enables the next-gen LLM applications with a generic multi-agent conversation framework. It offers customizable and conversable agents which integrate LLMs, tools, and humans.\n", + "By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. 
For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\n", + "\n", + "\n", + "\n", + "```\n", + "from autogen import AssistantAgent, UserProxyAgent, config\\_list\\_from\\_json \n", + " \n", + "# Load LLM inference endpoints from an env variable or a file \n", + "# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints \n", + "# and OAI\\_CONFIG\\_LIST\\_sample.json \n", + "config\\_list = config\\_list\\_from\\_json(env\\_or\\_file=\"OAI\\_CONFIG\\_LIST\") \n", + "assistant = AssistantAgent(\"assistant\", llm\\_config={\"config\\_list\": config\\_list}) \n", + "user\\_proxy = UserProxyAgent(\"user\\_proxy\", code\\_execution\\_config={\"work\\_dir\": \"coding\", \"use\\_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended \n", + "user\\_proxy.initiate\\_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\") \n", + "# This initiates an automated chat between the two agents to solve the task \n", + "\n", + "```\n", + "\n", + "The figure below shows an example conversation flow with AutoGen.\n", + "![Agent Chat Example](/autogen/assets/images/chat_example-da70a7420ebc817ef9826fa4b1e80951.png)\n", + "\n", + "\n", + "* [Code examples](/autogen/docs/Examples).\n", + "* [Documentation](/autogen/docs/Use-Cases/agent_chat).\n", + "\n", + "\n", + "#### Enhanced LLM Inferences[​](#enhanced-llm-inferences \"Direct link to Enhanced LLM Inferences\")\n", + "\n", + "\n", + "Autogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers enhanced LLM inference with powerful functionalities like tuning, caching, error handling, templating. 
For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.\n", + "\n", + "\n", + "\n", + "```\n", + "# perform tuning for openai<1 \n", + "config, analysis = autogen.Completion.tune( \n", + " data=tune\\_data, \n", + " metric=\"success\", \n", + " mode=\"max\", \n", + " eval\\_func=eval\\_func, \n", + " inference\\_budget=0.05, \n", + " optimization\\_budget=3, \n", + " num\\_samples=-1, \n", + ") \n", + "# perform inference for a test instance \n", + "response = autogen.Completion.create(context=test\\_instance, \\*\\*config) \n", + "\n", + "```\n", + "\n", + "* [Code examples](/autogen/docs/Examples).\n", + "* [Documentation](/autogen/docs/Use-Cases/enhanced_inference).\n", + "\n", + "\n", + "### Where to Go Next ?[​](#where-to-go-next- \"Direct link to Where to Go Next ?\")\n", + "\n", + "\n", + "* Understand the use cases for [multi-agent conversation](/autogen/docs/Use-Cases/agent_chat) and [enhanced LLM inference](/autogen/docs/Use-Cases/enhanced_inference).\n", + "* Find [code examples](/autogen/docs/Examples).\n", + "* Read [SDK](/autogen/docs/reference/agentchat/conversable_agent/).\n", + "* Learn about [research](/autogen/docs/Research) around AutoGen.\n", + "* [Roadmap](https://github.com/orgs/microsoft/projects/989/views/3)\n", + "* Chat on [Discord](https://discord.gg/pAbnFJrkgZ).\n", + "* Follow on [Twitter](https://twitter.com/pyautogen).\n", + "\n", + "\n", + "If you like our project, please give it a [star](https://github.com/microsoft/autogen/stargazers) on GitHub. 
If you are interested in contributing, please read [Contributor's Guide](/autogen/docs/Contribute).\n", + "\n", + "\n", + "[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Getting-Started.md)[NextInstallation](/autogen/docs/installation/)* [Main Features](#main-features)\n", + "* [Quickstart](#quickstart)\n", + "* [Where to Go Next ?](#where-to-go-next-)\n", + "Community* [Discord](https://discord.gg/pAbnFJrkgZ)\n", + "* [Twitter](https://twitter.com/pyautogen)\n", + "Copyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': '\\nSearch the web for information about Microsoft AutoGen\\n', 'role': 'assistant'}, {'content': \"Address: bing: Microsoft AutoGen\\nTitle: Microsoft AutoGen - Search\\nViewport position: Showing page 1 of 1.\\n=======================\\nA Bing search for 'Microsoft AutoGen' found 8 results:\\n\\n## Web Results\\n1. [AutoGen: Enabling next-generation large language model applications](https://www.microsoft.com/en-us/research/blog/autogen-enabling-next-generation-large-language-model-applications/)\\nAutoGen is a Python package that simplifies the orchestration, optimization, and automation of large language model applications. It enables customizable and conversable agents that integrate with humans, tools, and other agents to solve tasks using GPT-4 and other advanced LLMs. Learn how to use AutoGen for code-based question answering, supply-chain optimization, conversational chess, and more.\\n\\n2. [GitHub - microsoft/autogen: Enable Next-Gen Large Language Model ...](https://github.com/microsoft/autogen)\\nAutoGen is a framework that enables the development of large language model applications using multiple agents that can converse with each other to solve tasks. 
It supports diverse conversation patterns, enhanced LLM inference, and customizable and conversable agents.\\n\\n3. [Getting Started | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/docs/Getting-Started/)\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. They can operate in various modes that employ combinations of LLMs, human inputs, and tools. Main Features\\n\\n4. [AutoGen | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/)\\nAutoGen is a tool that enables next-gen large language model applications by providing a high-level abstraction for building diverse and enhanced LLM workflows. It offers a collection of working systems for various domains and complexities, as well as enhanced LLM inference and optimization APIs.\\n\\n5. [AutoGen Studio: Interactively Explore Multi-Agent Workflows](https://microsoft.github.io/autogen/blog/2023/12/01/AutoGenStudio/)\\nAutoGen has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: AutoGen Studio.\\n\\n6. [[2308.08155] AutoGen: Enabling Next-Gen LLM Applications via Multi ...](https://arxiv.org/abs/2308.08155)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to perform tasks using various types of language models (LLMs). The framework supports natural language and code-based conversation patterns, and is effective for diverse applications such as mathematics, coding, question answering, and more.\\n\\n7. [Mastering AutoGen: A Comprehensive Guide to Next-Generation ... 
- Medium](https://medium.com/@krtarunsingh/mastering-autogen-a-comprehensive-guide-to-next-generation-language-model-applications-b375d9b4dc6d)\\nAutoGen is a framework by Microsoft that allows you to create applications that leverage large language models (LLMs) with multi-agent conversations, diverse patterns, and enhanced inference. Learn how to set up AutoGen, use its architecture, and apply its features in this comprehensive guide by Tarun Singh.\\n\\n8. [arXiv:2308.08155v2 [cs.AI] 3 Oct 2023](https://arxiv.org/pdf/2308.08155.pdf)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to solve tasks using multiple languages, tools, and human inputs. The framework supports flexible conversation patterns and natural or code-based programming for diverse applications of complexities and LLM capacities.\", 'role': 'user'}, {'content': 'Summarize these results', 'role': 'assistant'}, {'content': 'AutoGen is a framework developed by Microsoft Research to simplify the orchestration, optimization, and automation of large language model (LLM) workflows. The framework offers customizable and conversable agents that utilize advanced LLM capabilities, such as GPT-4, while also integrating with humans and tools to address limitations and enhance performance. As developers create more complex LLM-based applications, the workflows become intricate, requiring significant effort and expertise to design and implement. Automating these workflows using AutoGen can streamline the process and improve efficiency, enabling the creation of next-generation applications that leverage the full potential of LLMs. 
The framework supports conversations between multiple agents through automated chat, providing a solution to the challenge of orchestrating optimal workflows in a vast and complex design space.', 'role': 'user'}, {'content': \"Click the 'Getting Started' result\", 'role': 'assistant'}, {'content': 'Address: https://microsoft.github.io/autogen/docs/Getting-Started/\\nTitle: Getting Started | AutoGen\\nViewport position: Showing page 1 of 1.\\n=======================\\n\\n\\n\\nGetting Started | AutoGen\\n\\n\\n\\n\\n\\n\\n\\n[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\\n* [Gallery](/autogen/docs/Gallery)\\n[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\\n[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\\n* [Installation](/autogen/docs/installation/)\\n* [LLM Configuration](/autogen/docs/llm_configuration)\\n* [Use Cases](#)\\n* [Contributing](/autogen/docs/Contribute)\\n* [Research](/autogen/docs/Research)\\n* [Migration Guide](/autogen/docs/Migration-Guide)\\n* \\n* Getting Started\\nOn this pageGetting Started\\n===============\\n\\n\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. 
They can operate in various modes that employ combinations of LLMs, human inputs, and tools.\\n\\n\\n![AutoGen Overview](/autogen/assets/images/autogen_agentchat-250ca64b77b87e70d34766a080bf6ba8.png)\\n\\n\\n### Main Features[\\u200b](#main-features \"Direct link to Main Features\")\\n\\n\\n* AutoGen enables building next-gen LLM applications based on [multi-agent conversations](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) with minimal effort. It simplifies the orchestration, automation, and optimization of a complex LLM workflow. It maximizes the performance of LLM models and overcomes their weaknesses.\\n* It supports [diverse conversation patterns](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#supporting-diverse-conversation-patterns) for complex workflows. With customizable and conversable agents, developers can use AutoGen to build a wide range of conversation patterns concerning conversation autonomy,\\nthe number of agents, and agent conversation topology.\\n* It provides a collection of working systems with different complexities. These systems span a [wide range of applications](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#diverse-applications-implemented-with-autogen) from various domains and complexities. This demonstrates how AutoGen can easily support diverse conversation patterns.\\n* AutoGen provides [enhanced LLM inference](https://microsoft.github.io/autogen/docs/Use-Cases/enhanced_inference#api-unification). It offers utilities like API unification and caching, and advanced usage patterns, such as error handling, multi-config inference, context programming, etc.\\n\\n\\nAutoGen is powered by collaborative [research studies](/autogen/docs/Research) from Microsoft, Penn State University, and University of Washington.\\n\\n\\n### Quickstart[\\u200b](#quickstart \"Direct link to Quickstart\")\\n\\n\\nInstall from pip: `pip install pyautogen`. 
Find more options in [Installation](/autogen/docs/installation/).\\nFor [code execution](/autogen/docs/FAQ#code-execution), we strongly recommend installing the python docker package, and using docker.\\n\\n\\n#### Multi-Agent Conversation Framework[\\u200b](#multi-agent-conversation-framework \"Direct link to Multi-Agent Conversation Framework\")\\n\\n\\nAutogen enables the next-gen LLM applications with a generic multi-agent conversation framework. It offers customizable and conversable agents which integrate LLMs, tools, and humans.\\nBy automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\\n\\n\\n\\n```\\nfrom autogen import AssistantAgent, UserProxyAgent, config\\\\_list\\\\_from\\\\_json \\n \\n# Load LLM inference endpoints from an env variable or a file \\n# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints \\n# and OAI\\\\_CONFIG\\\\_LIST\\\\_sample.json \\nconfig\\\\_list = config\\\\_list\\\\_from\\\\_json(env\\\\_or\\\\_file=\"OAI\\\\_CONFIG\\\\_LIST\") \\nassistant = AssistantAgent(\"assistant\", llm\\\\_config={\"config\\\\_list\": config\\\\_list}) \\nuser\\\\_proxy = UserProxyAgent(\"user\\\\_proxy\", code\\\\_execution\\\\_config={\"work\\\\_dir\": \"coding\", \"use\\\\_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended \\nuser\\\\_proxy.initiate\\\\_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\") \\n# This initiates an automated chat between the two agents to solve the task \\n\\n```\\n\\nThe figure below shows an example conversation flow with AutoGen.\\n![Agent Chat Example](/autogen/assets/images/chat_example-da70a7420ebc817ef9826fa4b1e80951.png)\\n\\n\\n* [Code examples](/autogen/docs/Examples).\\n* 
[Documentation](/autogen/docs/Use-Cases/agent_chat).\\n\\n\\n#### Enhanced LLM Inferences[\\u200b](#enhanced-llm-inferences \"Direct link to Enhanced LLM Inferences\")\\n\\n\\nAutogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers enhanced LLM inference with powerful functionalities like tuning, caching, error handling, templating. For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.\\n\\n\\n\\n```\\n# perform tuning for openai<1 \\nconfig, analysis = autogen.Completion.tune( \\n data=tune\\\\_data, \\n metric=\"success\", \\n mode=\"max\", \\n eval\\\\_func=eval\\\\_func, \\n inference\\\\_budget=0.05, \\n optimization\\\\_budget=3, \\n num\\\\_samples=-1, \\n) \\n# perform inference for a test instance \\nresponse = autogen.Completion.create(context=test\\\\_instance, \\\\*\\\\*config) \\n\\n```\\n\\n* [Code examples](/autogen/docs/Examples).\\n* [Documentation](/autogen/docs/Use-Cases/enhanced_inference).\\n\\n\\n### Where to Go Next ?[\\u200b](#where-to-go-next- \"Direct link to Where to Go Next ?\")\\n\\n\\n* Understand the use cases for [multi-agent conversation](/autogen/docs/Use-Cases/agent_chat) and [enhanced LLM inference](/autogen/docs/Use-Cases/enhanced_inference).\\n* Find [code examples](/autogen/docs/Examples).\\n* Read [SDK](/autogen/docs/reference/agentchat/conversable_agent/).\\n* Learn about [research](/autogen/docs/Research) around AutoGen.\\n* [Roadmap](https://github.com/orgs/microsoft/projects/989/views/3)\\n* Chat on [Discord](https://discord.gg/pAbnFJrkgZ).\\n* Follow on [Twitter](https://twitter.com/pyautogen).\\n\\n\\nIf you like our project, please give it a [star](https://github.com/microsoft/autogen/stargazers) on GitHub. 
If you are interested in contributing, please read [Contributor\\'s Guide](/autogen/docs/Contribute).\\n\\n\\n[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Getting-Started.md)[NextInstallation](/autogen/docs/installation/)* [Main Features](#main-features)\\n* [Quickstart](#quickstart)\\n* [Where to Go Next ?](#where-to-go-next-)\\nCommunity* [Discord](https://discord.gg/pAbnFJrkgZ)\\n* [Twitter](https://twitter.com/pyautogen)\\nCopyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\\n', 'role': 'user'}], summary='Address: https://microsoft.github.io/autogen/docs/Getting-Started/\\nTitle: Getting Started | AutoGen\\nViewport position: Showing page 1 of 1.\\n=======================\\n\\n\\n\\nGetting Started | AutoGen\\n\\n\\n\\n\\n\\n\\n\\n[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\\n* [Gallery](/autogen/docs/Gallery)\\n[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\\n[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\\n* [Installation](/autogen/docs/installation/)\\n* [LLM Configuration](/autogen/docs/llm_configuration)\\n* [Use Cases](#)\\n* [Contributing](/autogen/docs/Contribute)\\n* [Research](/autogen/docs/Research)\\n* [Migration Guide](/autogen/docs/Migration-Guide)\\n* \\n* Getting Started\\nOn this pageGetting Started\\n===============\\n\\n\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. 
They can operate in various modes that employ combinations of LLMs, human inputs, and tools.\\n\\n\\n![AutoGen Overview](/autogen/assets/images/autogen_agentchat-250ca64b77b87e70d34766a080bf6ba8.png)\\n\\n\\n### Main Features[\\u200b](#main-features \"Direct link to Main Features\")\\n\\n\\n* AutoGen enables building next-gen LLM applications based on [multi-agent conversations](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) with minimal effort. It simplifies the orchestration, automation, and optimization of a complex LLM workflow. It maximizes the performance of LLM models and overcomes their weaknesses.\\n* It supports [diverse conversation patterns](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#supporting-diverse-conversation-patterns) for complex workflows. With customizable and conversable agents, developers can use AutoGen to build a wide range of conversation patterns concerning conversation autonomy,\\nthe number of agents, and agent conversation topology.\\n* It provides a collection of working systems with different complexities. These systems span a [wide range of applications](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#diverse-applications-implemented-with-autogen) from various domains and complexities. This demonstrates how AutoGen can easily support diverse conversation patterns.\\n* AutoGen provides [enhanced LLM inference](https://microsoft.github.io/autogen/docs/Use-Cases/enhanced_inference#api-unification). It offers utilities like API unification and caching, and advanced usage patterns, such as error handling, multi-config inference, context programming, etc.\\n\\n\\nAutoGen is powered by collaborative [research studies](/autogen/docs/Research) from Microsoft, Penn State University, and University of Washington.\\n\\n\\n### Quickstart[\\u200b](#quickstart \"Direct link to Quickstart\")\\n\\n\\nInstall from pip: `pip install pyautogen`. 
Find more options in [Installation](/autogen/docs/installation/).\\nFor [code execution](/autogen/docs/FAQ#code-execution), we strongly recommend installing the python docker package, and using docker.\\n\\n\\n#### Multi-Agent Conversation Framework[\\u200b](#multi-agent-conversation-framework \"Direct link to Multi-Agent Conversation Framework\")\\n\\n\\nAutogen enables the next-gen LLM applications with a generic multi-agent conversation framework. It offers customizable and conversable agents which integrate LLMs, tools, and humans.\\nBy automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\\n\\n\\n\\n```\\nfrom autogen import AssistantAgent, UserProxyAgent, config\\\\_list\\\\_from\\\\_json \\n \\n# Load LLM inference endpoints from an env variable or a file \\n# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints \\n# and OAI\\\\_CONFIG\\\\_LIST\\\\_sample.json \\nconfig\\\\_list = config\\\\_list\\\\_from\\\\_json(env\\\\_or\\\\_file=\"OAI\\\\_CONFIG\\\\_LIST\") \\nassistant = AssistantAgent(\"assistant\", llm\\\\_config={\"config\\\\_list\": config\\\\_list}) \\nuser\\\\_proxy = UserProxyAgent(\"user\\\\_proxy\", code\\\\_execution\\\\_config={\"work\\\\_dir\": \"coding\", \"use\\\\_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended \\nuser\\\\_proxy.initiate\\\\_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\") \\n# This initiates an automated chat between the two agents to solve the task \\n\\n```\\n\\nThe figure below shows an example conversation flow with AutoGen.\\n![Agent Chat Example](/autogen/assets/images/chat_example-da70a7420ebc817ef9826fa4b1e80951.png)\\n\\n\\n* [Code examples](/autogen/docs/Examples).\\n* 
[Documentation](/autogen/docs/Use-Cases/agent_chat).\\n\\n\\n#### Enhanced LLM Inferences[\\u200b](#enhanced-llm-inferences \"Direct link to Enhanced LLM Inferences\")\\n\\n\\nAutogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers enhanced LLM inference with powerful functionalities like tuning, caching, error handling, templating. For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.\\n\\n\\n\\n```\\n# perform tuning for openai<1 \\nconfig, analysis = autogen.Completion.tune( \\n data=tune\\\\_data, \\n metric=\"success\", \\n mode=\"max\", \\n eval\\\\_func=eval\\\\_func, \\n inference\\\\_budget=0.05, \\n optimization\\\\_budget=3, \\n num\\\\_samples=-1, \\n) \\n# perform inference for a test instance \\nresponse = autogen.Completion.create(context=test\\\\_instance, \\\\*\\\\*config) \\n\\n```\\n\\n* [Code examples](/autogen/docs/Examples).\\n* [Documentation](/autogen/docs/Use-Cases/enhanced_inference).\\n\\n\\n### Where to Go Next ?[\\u200b](#where-to-go-next- \"Direct link to Where to Go Next ?\")\\n\\n\\n* Understand the use cases for [multi-agent conversation](/autogen/docs/Use-Cases/agent_chat) and [enhanced LLM inference](/autogen/docs/Use-Cases/enhanced_inference).\\n* Find [code examples](/autogen/docs/Examples).\\n* Read [SDK](/autogen/docs/reference/agentchat/conversable_agent/).\\n* Learn about [research](/autogen/docs/Research) around AutoGen.\\n* [Roadmap](https://github.com/orgs/microsoft/projects/989/views/3)\\n* Chat on [Discord](https://discord.gg/pAbnFJrkgZ).\\n* Follow on [Twitter](https://twitter.com/pyautogen).\\n\\n\\nIf you like our project, please give it a [star](https://github.com/microsoft/autogen/stargazers) on GitHub. 
If you are interested in contributing, please read [Contributor\\'s Guide](/autogen/docs/Contribute).\\n\\n\\n[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Getting-Started.md)[NextInstallation](/autogen/docs/installation/)* [Main Features](#main-features)\\n* [Quickstart](#quickstart)\\n* [Where to Go Next ?](#where-to-go-next-)\\nCommunity* [Discord](https://discord.gg/pAbnFJrkgZ)\\n* [Twitter](https://twitter.com/pyautogen)\\nCopyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\\n', cost=({'total_cost': 0}, {'total_cost': 0}), human_input=[])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task3 = \"Click the 'Getting Started' result\"\n", + "user_proxy.initiate_chat(web_surfer, message=task3, clear_history=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Part 2: Let's look at the actual page rendered" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display_binary_image(web_surfer.browser.driver.get_screenshot_as_png())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's scroll down and look again" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "task5 = \"\"\"Scroll down.\"\"\"\n", + "user_proxy.initiate_chat(web_surfer, message=task5, clear_history=False)\n", + "\n", + "# We give it few seconds before viewing the browser\n", + "sleep(3)\n", + "display_binary_image(web_surfer.browser.driver.get_screenshot_as_png())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's test our navigation using the rendered page\n", + "Note: this does require vision capabilities" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to web_surfer):\n", + "\n", + "Click the 'research studies' link\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION visit_page...\u001b[0m\n", + "\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n", + "\n", + "Address: https://microsoft.github.io/autogen/docs/Research\n", + "Title: Research | AutoGen\n", + "Viewport position: Showing page 1 of 1.\n", + "=======================\n", + "\n", + "\n", + "\n", + "Research | AutoGen\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\n", + "* [Gallery](/autogen/docs/Gallery)\n", + "[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\n", + "[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\n", + "* [Installation](/autogen/docs/installation/)\n", + "* [LLM Configuration](/autogen/docs/llm_configuration)\n", + "* [Use Cases](#)\n", + "* [Contributing](/autogen/docs/Contribute)\n", + "* [Research](/autogen/docs/Research)\n", + "* [Migration Guide](/autogen/docs/Migration-Guide)\n", + "* \n", + "* Research\n", + "Research\n", + "========\n", + "\n", + "\n", + "For technical details, please check our technical report and research publications.\n", + "\n", + "\n", + "* [AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework](https://arxiv.org/abs/2308.08155). 
Qingyun Wu, Gagan Bansal, Jieyu Zhang, Yiran Wu, Shaokun Zhang, Erkang Zhu, Beibin Li, Li Jiang, Xiaoyun Zhang and Chi Wang. ArXiv 2023.\n", + "\n", + "\n", + "\n", + "```\n", + "@inproceedings{wu2023autogen, \n", + " title={AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework}, \n", + " author={Qingyun Wu and Gagan Bansal and Jieyu Zhang and Yiran Wu and Shaokun Zhang and Erkang Zhu and Beibin Li and Li Jiang and Xiaoyun Zhang and Chi Wang}, \n", + " year={2023}, \n", + " eprint={2308.08155}, \n", + " archivePrefix={arXiv}, \n", + " primaryClass={cs.AI} \n", + "} \n", + "\n", + "```\n", + "\n", + "* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. AutoML'23.\n", + "\n", + "\n", + "\n", + "```\n", + "@inproceedings{wang2023EcoOptiGen, \n", + " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference}, \n", + " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah}, \n", + " year={2023}, \n", + " booktitle={AutoML'23}, \n", + "} \n", + "\n", + "```\n", + "\n", + "* [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\n", + "\n", + "\n", + "\n", + "```\n", + "@inproceedings{wu2023empirical, \n", + " title={An Empirical Study on Challenging Math Problem Solving with GPT-4}, \n", + " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang}, \n", + " year={2023}, \n", + " booktitle={ArXiv preprint arXiv:2306.01337}, \n", + "} \n", + "\n", + "```\n", + "\n", + "* [EcoAssistant: Using LLM Assistant More Affordably and Accurately](https://arxiv.org/abs/2310.03046). 
Jieyu Zhang, Ranjay Krishna, Ahmed H. Awadallah, Chi Wang. ArXiv preprint arXiv:2310.03046 (2023).\n", + "\n", + "\n", + "\n", + "```\n", + "@inproceedings{zhang2023ecoassistant, \n", + " title={EcoAssistant: Using LLM Assistant More Affordably and Accurately}, \n", + " author={Zhang, Jieyu and Krishna, Ranjay and Awadallah, Ahmed H and Wang, Chi}, \n", + " year={2023}, \n", + " booktitle={ArXiv preprint arXiv:2310.03046}, \n", + "} \n", + "\n", + "```\n", + "[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Research.md)[PreviousContributing](/autogen/docs/Contribute)[NextMigration Guide](/autogen/docs/Migration-Guide)Community* [Discord](https://discord.gg/pAbnFJrkgZ)\n", + "* [Twitter](https://twitter.com/pyautogen)\n", + "Copyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': '\\nSearch the web for information about Microsoft AutoGen\\n', 'role': 'assistant'}, {'content': \"Address: bing: Microsoft AutoGen\\nTitle: Microsoft AutoGen - Search\\nViewport position: Showing page 1 of 1.\\n=======================\\nA Bing search for 'Microsoft AutoGen' found 8 results:\\n\\n## Web Results\\n1. [AutoGen: Enabling next-generation large language model applications](https://www.microsoft.com/en-us/research/blog/autogen-enabling-next-generation-large-language-model-applications/)\\nAutoGen is a Python package that simplifies the orchestration, optimization, and automation of large language model applications. It enables customizable and conversable agents that integrate with humans, tools, and other agents to solve tasks using GPT-4 and other advanced LLMs. Learn how to use AutoGen for code-based question answering, supply-chain optimization, conversational chess, and more.\\n\\n2. 
[GitHub - microsoft/autogen: Enable Next-Gen Large Language Model ...](https://github.com/microsoft/autogen)\\nAutoGen is a framework that enables the development of large language model applications using multiple agents that can converse with each other to solve tasks. It supports diverse conversation patterns, enhanced LLM inference, and customizable and conversable agents.\\n\\n3. [Getting Started | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/docs/Getting-Started/)\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. They can operate in various modes that employ combinations of LLMs, human inputs, and tools. Main Features\\n\\n4. [AutoGen | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/)\\nAutoGen is a tool that enables next-gen large language model applications by providing a high-level abstraction for building diverse and enhanced LLM workflows. It offers a collection of working systems for various domains and complexities, as well as enhanced LLM inference and optimization APIs.\\n\\n5. [AutoGen Studio: Interactively Explore Multi-Agent Workflows](https://microsoft.github.io/autogen/blog/2023/12/01/AutoGenStudio/)\\nAutoGen has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: AutoGen Studio.\\n\\n6. [[2308.08155] AutoGen: Enabling Next-Gen LLM Applications via Multi ...](https://arxiv.org/abs/2308.08155)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to perform tasks using various types of language models (LLMs). 
The framework supports natural language and code-based conversation patterns, and is effective for diverse applications such as mathematics, coding, question answering, and more.\\n\\n7. [Mastering AutoGen: A Comprehensive Guide to Next-Generation ... - Medium](https://medium.com/@krtarunsingh/mastering-autogen-a-comprehensive-guide-to-next-generation-language-model-applications-b375d9b4dc6d)\\nAutoGen is a framework by Microsoft that allows you to create applications that leverage large language models (LLMs) with multi-agent conversations, diverse patterns, and enhanced inference. Learn how to set up AutoGen, use its architecture, and apply its features in this comprehensive guide by Tarun Singh.\\n\\n8. [arXiv:2308.08155v2 [cs.AI] 3 Oct 2023](https://arxiv.org/pdf/2308.08155.pdf)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to solve tasks using multiple languages, tools, and human inputs. The framework supports flexible conversation patterns and natural or code-based programming for diverse applications of complexities and LLM capacities.\", 'role': 'user'}, {'content': 'Summarize these results', 'role': 'assistant'}, {'content': 'AutoGen is a framework developed by Microsoft Research to simplify the orchestration, optimization, and automation of large language model (LLM) workflows. The framework offers customizable and conversable agents that utilize advanced LLM capabilities, such as GPT-4, while also integrating with humans and tools to address limitations and enhance performance. As developers create more complex LLM-based applications, the workflows become intricate, requiring significant effort and expertise to design and implement. Automating these workflows using AutoGen can streamline the process and improve efficiency, enabling the creation of next-generation applications that leverage the full potential of LLMs. 
The framework supports conversations between multiple agents through automated chat, providing a solution to the challenge of orchestrating optimal workflows in a vast and complex design space.', 'role': 'user'}, {'content': \"Click the 'Getting Started' result\", 'role': 'assistant'}, {'content': 'Address: https://microsoft.github.io/autogen/docs/Getting-Started/\\nTitle: Getting Started | AutoGen\\nViewport position: Showing page 1 of 1.\\n=======================\\n\\n\\n\\nGetting Started | AutoGen\\n\\n\\n\\n\\n\\n\\n\\n[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\\n* [Gallery](/autogen/docs/Gallery)\\n[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\\n[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\\n* [Installation](/autogen/docs/installation/)\\n* [LLM Configuration](/autogen/docs/llm_configuration)\\n* [Use Cases](#)\\n* [Contributing](/autogen/docs/Contribute)\\n* [Research](/autogen/docs/Research)\\n* [Migration Guide](/autogen/docs/Migration-Guide)\\n* \\n* Getting Started\\nOn this pageGetting Started\\n===============\\n\\n\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. 
They can operate in various modes that employ combinations of LLMs, human inputs, and tools.\\n\\n\\n![AutoGen Overview](/autogen/assets/images/autogen_agentchat-250ca64b77b87e70d34766a080bf6ba8.png)\\n\\n\\n### Main Features[\\u200b](#main-features \"Direct link to Main Features\")\\n\\n\\n* AutoGen enables building next-gen LLM applications based on [multi-agent conversations](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) with minimal effort. It simplifies the orchestration, automation, and optimization of a complex LLM workflow. It maximizes the performance of LLM models and overcomes their weaknesses.\\n* It supports [diverse conversation patterns](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#supporting-diverse-conversation-patterns) for complex workflows. With customizable and conversable agents, developers can use AutoGen to build a wide range of conversation patterns concerning conversation autonomy,\\nthe number of agents, and agent conversation topology.\\n* It provides a collection of working systems with different complexities. These systems span a [wide range of applications](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#diverse-applications-implemented-with-autogen) from various domains and complexities. This demonstrates how AutoGen can easily support diverse conversation patterns.\\n* AutoGen provides [enhanced LLM inference](https://microsoft.github.io/autogen/docs/Use-Cases/enhanced_inference#api-unification). It offers utilities like API unification and caching, and advanced usage patterns, such as error handling, multi-config inference, context programming, etc.\\n\\n\\nAutoGen is powered by collaborative [research studies](/autogen/docs/Research) from Microsoft, Penn State University, and University of Washington.\\n\\n\\n### Quickstart[\\u200b](#quickstart \"Direct link to Quickstart\")\\n\\n\\nInstall from pip: `pip install pyautogen`. 
Find more options in [Installation](/autogen/docs/installation/).\\nFor [code execution](/autogen/docs/FAQ#code-execution), we strongly recommend installing the python docker package, and using docker.\\n\\n\\n#### Multi-Agent Conversation Framework[\\u200b](#multi-agent-conversation-framework \"Direct link to Multi-Agent Conversation Framework\")\\n\\n\\nAutogen enables the next-gen LLM applications with a generic multi-agent conversation framework. It offers customizable and conversable agents which integrate LLMs, tools, and humans.\\nBy automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\\n\\n\\n\\n```\\nfrom autogen import AssistantAgent, UserProxyAgent, config\\\\_list\\\\_from\\\\_json \\n \\n# Load LLM inference endpoints from an env variable or a file \\n# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints \\n# and OAI\\\\_CONFIG\\\\_LIST\\\\_sample.json \\nconfig\\\\_list = config\\\\_list\\\\_from\\\\_json(env\\\\_or\\\\_file=\"OAI\\\\_CONFIG\\\\_LIST\") \\nassistant = AssistantAgent(\"assistant\", llm\\\\_config={\"config\\\\_list\": config\\\\_list}) \\nuser\\\\_proxy = UserProxyAgent(\"user\\\\_proxy\", code\\\\_execution\\\\_config={\"work\\\\_dir\": \"coding\", \"use\\\\_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended \\nuser\\\\_proxy.initiate\\\\_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\") \\n# This initiates an automated chat between the two agents to solve the task \\n\\n```\\n\\nThe figure below shows an example conversation flow with AutoGen.\\n![Agent Chat Example](/autogen/assets/images/chat_example-da70a7420ebc817ef9826fa4b1e80951.png)\\n\\n\\n* [Code examples](/autogen/docs/Examples).\\n* 
[Documentation](/autogen/docs/Use-Cases/agent_chat).\\n\\n\\n#### Enhanced LLM Inferences[\\u200b](#enhanced-llm-inferences \"Direct link to Enhanced LLM Inferences\")\\n\\n\\nAutogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers enhanced LLM inference with powerful functionalities like tuning, caching, error handling, templating. For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.\\n\\n\\n\\n```\\n# perform tuning for openai<1 \\nconfig, analysis = autogen.Completion.tune( \\n data=tune\\\\_data, \\n metric=\"success\", \\n mode=\"max\", \\n eval\\\\_func=eval\\\\_func, \\n inference\\\\_budget=0.05, \\n optimization\\\\_budget=3, \\n num\\\\_samples=-1, \\n) \\n# perform inference for a test instance \\nresponse = autogen.Completion.create(context=test\\\\_instance, \\\\*\\\\*config) \\n\\n```\\n\\n* [Code examples](/autogen/docs/Examples).\\n* [Documentation](/autogen/docs/Use-Cases/enhanced_inference).\\n\\n\\n### Where to Go Next ?[\\u200b](#where-to-go-next- \"Direct link to Where to Go Next ?\")\\n\\n\\n* Understand the use cases for [multi-agent conversation](/autogen/docs/Use-Cases/agent_chat) and [enhanced LLM inference](/autogen/docs/Use-Cases/enhanced_inference).\\n* Find [code examples](/autogen/docs/Examples).\\n* Read [SDK](/autogen/docs/reference/agentchat/conversable_agent/).\\n* Learn about [research](/autogen/docs/Research) around AutoGen.\\n* [Roadmap](https://github.com/orgs/microsoft/projects/989/views/3)\\n* Chat on [Discord](https://discord.gg/pAbnFJrkgZ).\\n* Follow on [Twitter](https://twitter.com/pyautogen).\\n\\n\\nIf you like our project, please give it a [star](https://github.com/microsoft/autogen/stargazers) on GitHub. 
If you are interested in contributing, please read [Contributor\\'s Guide](/autogen/docs/Contribute).\\n\\n\\n[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Getting-Started.md)[NextInstallation](/autogen/docs/installation/)* [Main Features](#main-features)\\n* [Quickstart](#quickstart)\\n* [Where to Go Next ?](#where-to-go-next-)\\nCommunity* [Discord](https://discord.gg/pAbnFJrkgZ)\\n* [Twitter](https://twitter.com/pyautogen)\\nCopyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\\n', 'role': 'user'}, {'content': 'Scroll down.', 'role': 'assistant'}, {'content': 'Address: https://microsoft.github.io/autogen/docs/Getting-Started/\\nTitle: Getting Started | AutoGen\\nViewport position: Showing page 1 of 1.\\n=======================\\n\\n\\n\\nGetting Started | AutoGen\\n\\n\\n\\n\\n\\n\\n\\n[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\\n* [Gallery](/autogen/docs/Gallery)\\n[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\\n[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\\n* [Installation](/autogen/docs/installation/)\\n* [LLM Configuration](/autogen/docs/llm_configuration)\\n* [Use Cases](#)\\n* [Contributing](/autogen/docs/Contribute)\\n* [Research](/autogen/docs/Research)\\n* [Migration Guide](/autogen/docs/Migration-Guide)\\n* \\n* Getting Started\\nOn this pageGetting Started\\n===============\\n\\n\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. 
AutoGen agents are customizable, conversable, and seamlessly allow human participation. They can operate in various modes that employ combinations of LLMs, human inputs, and tools.\\n\\n\\n![AutoGen Overview](/autogen/assets/images/autogen_agentchat-250ca64b77b87e70d34766a080bf6ba8.png)\\n\\n\\n### Main Features[\\u200b](#main-features \"Direct link to Main Features\")\\n\\n\\n* AutoGen enables building next-gen LLM applications based on [multi-agent conversations](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) with minimal effort. It simplifies the orchestration, automation, and optimization of a complex LLM workflow. It maximizes the performance of LLM models and overcomes their weaknesses.\\n* It supports [diverse conversation patterns](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#supporting-diverse-conversation-patterns) for complex workflows. With customizable and conversable agents, developers can use AutoGen to build a wide range of conversation patterns concerning conversation autonomy,\\nthe number of agents, and agent conversation topology.\\n* It provides a collection of working systems with different complexities. These systems span a [wide range of applications](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#diverse-applications-implemented-with-autogen) from various domains and complexities. This demonstrates how AutoGen can easily support diverse conversation patterns.\\n* AutoGen provides [enhanced LLM inference](https://microsoft.github.io/autogen/docs/Use-Cases/enhanced_inference#api-unification). 
It offers utilities like API unification and caching, and advanced usage patterns, such as error handling, multi-config inference, context programming, etc.\\n\\n\\nAutoGen is powered by collaborative [research studies](/autogen/docs/Research) from Microsoft, Penn State University, and University of Washington.\\n\\n\\n### Quickstart[\\u200b](#quickstart \"Direct link to Quickstart\")\\n\\n\\nInstall from pip: `pip install pyautogen`. Find more options in [Installation](/autogen/docs/installation/).\\nFor [code execution](/autogen/docs/FAQ#code-execution), we strongly recommend installing the python docker package, and using docker.\\n\\n\\n#### Multi-Agent Conversation Framework[\\u200b](#multi-agent-conversation-framework \"Direct link to Multi-Agent Conversation Framework\")\\n\\n\\nAutogen enables the next-gen LLM applications with a generic multi-agent conversation framework. It offers customizable and conversable agents which integrate LLMs, tools, and humans.\\nBy automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. 
For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\\n\\n\\n\\n```\\nfrom autogen import AssistantAgent, UserProxyAgent, config\\\\_list\\\\_from\\\\_json \\n \\n# Load LLM inference endpoints from an env variable or a file \\n# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints \\n# and OAI\\\\_CONFIG\\\\_LIST\\\\_sample.json \\nconfig\\\\_list = config\\\\_list\\\\_from\\\\_json(env\\\\_or\\\\_file=\"OAI\\\\_CONFIG\\\\_LIST\") \\nassistant = AssistantAgent(\"assistant\", llm\\\\_config={\"config\\\\_list\": config\\\\_list}) \\nuser\\\\_proxy = UserProxyAgent(\"user\\\\_proxy\", code\\\\_execution\\\\_config={\"work\\\\_dir\": \"coding\", \"use\\\\_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended \\nuser\\\\_proxy.initiate\\\\_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\") \\n# This initiates an automated chat between the two agents to solve the task \\n\\n```\\n\\nThe figure below shows an example conversation flow with AutoGen.\\n![Agent Chat Example](/autogen/assets/images/chat_example-da70a7420ebc817ef9826fa4b1e80951.png)\\n\\n\\n* [Code examples](/autogen/docs/Examples).\\n* [Documentation](/autogen/docs/Use-Cases/agent_chat).\\n\\n\\n#### Enhanced LLM Inferences[\\u200b](#enhanced-llm-inferences \"Direct link to Enhanced LLM Inferences\")\\n\\n\\nAutogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers enhanced LLM inference with powerful functionalities like tuning, caching, error handling, templating. 
For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.\\n\\n\\n\\n```\\n# perform tuning for openai<1 \\nconfig, analysis = autogen.Completion.tune( \\n data=tune\\\\_data, \\n metric=\"success\", \\n mode=\"max\", \\n eval\\\\_func=eval\\\\_func, \\n inference\\\\_budget=0.05, \\n optimization\\\\_budget=3, \\n num\\\\_samples=-1, \\n) \\n# perform inference for a test instance \\nresponse = autogen.Completion.create(context=test\\\\_instance, \\\\*\\\\*config) \\n\\n```\\n\\n* [Code examples](/autogen/docs/Examples).\\n* [Documentation](/autogen/docs/Use-Cases/enhanced_inference).\\n\\n\\n### Where to Go Next ?[\\u200b](#where-to-go-next- \"Direct link to Where to Go Next ?\")\\n\\n\\n* Understand the use cases for [multi-agent conversation](/autogen/docs/Use-Cases/agent_chat) and [enhanced LLM inference](/autogen/docs/Use-Cases/enhanced_inference).\\n* Find [code examples](/autogen/docs/Examples).\\n* Read [SDK](/autogen/docs/reference/agentchat/conversable_agent/).\\n* Learn about [research](/autogen/docs/Research) around AutoGen.\\n* [Roadmap](https://github.com/orgs/microsoft/projects/989/views/3)\\n* Chat on [Discord](https://discord.gg/pAbnFJrkgZ).\\n* Follow on [Twitter](https://twitter.com/pyautogen).\\n\\n\\nIf you like our project, please give it a [star](https://github.com/microsoft/autogen/stargazers) on GitHub. 
If you are interested in contributing, please read [Contributor\\'s Guide](/autogen/docs/Contribute).\\n\\n\\n[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Getting-Started.md)[NextInstallation](/autogen/docs/installation/)* [Main Features](#main-features)\\n* [Quickstart](#quickstart)\\n* [Where to Go Next ?](#where-to-go-next-)\\nCommunity* [Discord](https://discord.gg/pAbnFJrkgZ)\\n* [Twitter](https://twitter.com/pyautogen)\\nCopyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\\n', 'role': 'user'}, {'content': \"Click the 'research studies' link\", 'role': 'assistant'}, {'content': \"Address: https://microsoft.github.io/autogen/docs/Research\\nTitle: Research | AutoGen\\nViewport position: Showing page 1 of 1.\\n=======================\\n\\n\\n\\nResearch | AutoGen\\n\\n\\n\\n\\n\\n\\n\\n[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\\n* [Gallery](/autogen/docs/Gallery)\\n[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\\n[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\\n* [Installation](/autogen/docs/installation/)\\n* [LLM Configuration](/autogen/docs/llm_configuration)\\n* [Use Cases](#)\\n* [Contributing](/autogen/docs/Contribute)\\n* [Research](/autogen/docs/Research)\\n* [Migration Guide](/autogen/docs/Migration-Guide)\\n* \\n* Research\\nResearch\\n========\\n\\n\\nFor technical details, please check our technical report and research publications.\\n\\n\\n* [AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework](https://arxiv.org/abs/2308.08155). 
Qingyun Wu, Gagan Bansal, Jieyu Zhang, Yiran Wu, Shaokun Zhang, Erkang Zhu, Beibin Li, Li Jiang, Xiaoyun Zhang and Chi Wang. ArXiv 2023.\\n\\n\\n\\n```\\n@inproceedings{wu2023autogen, \\n title={AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework}, \\n author={Qingyun Wu and Gagan Bansal and Jieyu Zhang and Yiran Wu and Shaokun Zhang and Erkang Zhu and Beibin Li and Li Jiang and Xiaoyun Zhang and Chi Wang}, \\n year={2023}, \\n eprint={2308.08155}, \\n archivePrefix={arXiv}, \\n primaryClass={cs.AI} \\n} \\n\\n```\\n\\n* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. AutoML'23.\\n\\n\\n\\n```\\n@inproceedings{wang2023EcoOptiGen, \\n title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference}, \\n author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah}, \\n year={2023}, \\n booktitle={AutoML'23}, \\n} \\n\\n```\\n\\n* [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\\n\\n\\n\\n```\\n@inproceedings{wu2023empirical, \\n title={An Empirical Study on Challenging Math Problem Solving with GPT-4}, \\n author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang}, \\n year={2023}, \\n booktitle={ArXiv preprint arXiv:2306.01337}, \\n} \\n\\n```\\n\\n* [EcoAssistant: Using LLM Assistant More Affordably and Accurately](https://arxiv.org/abs/2310.03046). Jieyu Zhang, Ranjay Krishna, Ahmed H. Awadallah, Chi Wang. 
ArXiv preprint arXiv:2310.03046 (2023).\\n\\n\\n\\n```\\n@inproceedings{zhang2023ecoassistant, \\n title={EcoAssistant: Using LLM Assistant More Affordably and Accurately}, \\n author={Zhang, Jieyu and Krishna, Ranjay and Awadallah, Ahmed H and Wang, Chi}, \\n year={2023}, \\n booktitle={ArXiv preprint arXiv:2310.03046}, \\n} \\n\\n```\\n[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Research.md)[PreviousContributing](/autogen/docs/Contribute)[NextMigration Guide](/autogen/docs/Migration-Guide)Community* [Discord](https://discord.gg/pAbnFJrkgZ)\\n* [Twitter](https://twitter.com/pyautogen)\\nCopyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\\n\", 'role': 'user'}], summary=\"Address: https://microsoft.github.io/autogen/docs/Research\\nTitle: Research | AutoGen\\nViewport position: Showing page 1 of 1.\\n=======================\\n\\n\\n\\nResearch | AutoGen\\n\\n\\n\\n\\n\\n\\n\\n[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\\n* [Gallery](/autogen/docs/Gallery)\\n[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\\n[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\\n* [Installation](/autogen/docs/installation/)\\n* [LLM Configuration](/autogen/docs/llm_configuration)\\n* [Use Cases](#)\\n* [Contributing](/autogen/docs/Contribute)\\n* [Research](/autogen/docs/Research)\\n* [Migration Guide](/autogen/docs/Migration-Guide)\\n* \\n* Research\\nResearch\\n========\\n\\n\\nFor technical details, please check our technical report and research publications.\\n\\n\\n* [AutoGen: Enabling Next-Gen LLM Applications via 
Multi-Agent Conversation Framework](https://arxiv.org/abs/2308.08155). Qingyun Wu, Gagan Bansal, Jieyu Zhang, Yiran Wu, Shaokun Zhang, Erkang Zhu, Beibin Li, Li Jiang, Xiaoyun Zhang and Chi Wang. ArXiv 2023.\\n\\n\\n\\n```\\n@inproceedings{wu2023autogen, \\n title={AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework}, \\n author={Qingyun Wu and Gagan Bansal and Jieyu Zhang and Yiran Wu and Shaokun Zhang and Erkang Zhu and Beibin Li and Li Jiang and Xiaoyun Zhang and Chi Wang}, \\n year={2023}, \\n eprint={2308.08155}, \\n archivePrefix={arXiv}, \\n primaryClass={cs.AI} \\n} \\n\\n```\\n\\n* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. AutoML'23.\\n\\n\\n\\n```\\n@inproceedings{wang2023EcoOptiGen, \\n title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference}, \\n author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah}, \\n year={2023}, \\n booktitle={AutoML'23}, \\n} \\n\\n```\\n\\n* [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\\n\\n\\n\\n```\\n@inproceedings{wu2023empirical, \\n title={An Empirical Study on Challenging Math Problem Solving with GPT-4}, \\n author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang}, \\n year={2023}, \\n booktitle={ArXiv preprint arXiv:2306.01337}, \\n} \\n\\n```\\n\\n* [EcoAssistant: Using LLM Assistant More Affordably and Accurately](https://arxiv.org/abs/2310.03046). Jieyu Zhang, Ranjay Krishna, Ahmed H. Awadallah, Chi Wang. 
ArXiv preprint arXiv:2310.03046 (2023).\\n\\n\\n\\n```\\n@inproceedings{zhang2023ecoassistant, \\n title={EcoAssistant: Using LLM Assistant More Affordably and Accurately}, \\n author={Zhang, Jieyu and Krishna, Ranjay and Awadallah, Ahmed H and Wang, Chi}, \\n year={2023}, \\n booktitle={ArXiv preprint arXiv:2310.03046}, \\n} \\n\\n```\\n[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Research.md)[PreviousContributing](/autogen/docs/Contribute)[NextMigration Guide](/autogen/docs/Migration-Guide)Community* [Discord](https://discord.gg/pAbnFJrkgZ)\\n* [Twitter](https://twitter.com/pyautogen)\\nCopyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\\n\", cost=({'total_cost': 0}, {'total_cost': 0}), human_input=[])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task6 = \"Click the 'research studies' link\"\n", + "user_proxy.initiate_chat(web_surfer, message=task6, clear_history=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Show us the results of that action" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display_binary_image(web_surfer.browser.driver.get_screenshot_as_png())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Amazing! 
Agent navigation on the web still works with the full desktop browser which is great news!\n", + "### And we can always still display the text on screen if our use-case benefited from that" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display_binary_image(web_surfer.browser.driver.get_screenshot_as_png())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleanup process\n", + "To ensure that we have no lingering processes in the background, we can shutdown the browser" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Gracefully shut down our headless desktop browser\n", + "web_surfer.close_the_browser()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebook/agentchat_web_archiver_agent.ipynb b/notebook/agentchat_web_archiver_agent.ipynb new file mode 100644 index 00000000000..f61a9e6496e --- /dev/null +++ b/notebook/agentchat_web_archiver_agent.ipynb @@ -0,0 +1,1746 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Content Collection Tasks with WebArchiverAgent\n", + "\n", + "### Why would we want this?\n", + "As part of a larger pipeline, `WebArchiverAgent` accomplishes the task of automatic retrieval and storage of online content for numerous downstream tasks. \n", + "This task is facilitated by a headless Selenium webdriver. \n", + "\n", + "\n", + "## Requirements\n", + "\n", + "AutoGen requires `Python>=3.8`. 
To run this notebook example, please install:\n", + "```bash\n", + "pip install \"pyautogen[websurfer]\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ensure that we have the WebDrivers present for Selenium\n", + "Following the instructions in [Selenium Documentation](https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location/#download-the-driver), \n", + "we first download the web driver for our browser of choice, or all 3: [Edge](https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/?form=MA13LH#downloads), [Firefox](https://github.com/mozilla/geckodriver/releases), [Chrome](https://chromedriver.chromium.org/downloads)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Neither powershell nor pwsh is installed.\n" + ] + } + ], + "source": [ + "# %%capture --no-stderr\n", + "import os\n", + "import logging\n", + "import autogen\n", + "from PIL import Image\n", + "from IPython.core.display_functions import display\n", + "from autogen.agentchat.contrib.web_archiver_agent import WebArchiverAgent\n", + "from autogen.agentchat.user_proxy_agent import UserProxyAgent\n", + "from autogen.oai import config_list_from_json\n", + "from autogen.browser_utils import display_binary_image\n", + "from autogen.browser_utils import get_file_path_from_url\n", + "\n", + "# Get the logger instance for the current module (__name__).\n", + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set your API Endpoint\n", + "\n", + "The [`config_list_from_json`](https://microsoft.github.io/autogen/docs/reference/oai/openai_utils#config_list_from_json) function loads a list of configurations from an environment variable or a json file.\n", + "\n", + "It first looks for environment variable \"OAI_CONFIG_LIST\" which needs to be a valid 
json string. If that variable is not found, it then looks for a json file named \"OAI_CONFIG_LIST\". It filters the configs by models (you can filter by other keys as well).\n", + "\n", + "The WebArchiverAgent uses a combination of models. GPT-4 and GPT-3.5-turbo-16k are recommended.\n", + "\n", + "Your json config should look something like the following:\n", + "```json\n", + "[\n", + " {\n", + " \"model\": \"gpt-4\",\n", + " \"api_key\": \"\"\n", + " },\n", + " {\n", + " \"model\": \"gpt-3.5-turbo-16k\",\n", + " \"api_key\": \"\"\n", + " }\n", + "]\n", + "```\n", + "\n", + "If you open this notebook in colab, you can upload your files by clicking the file icon on the left panel and then choose \"upload file\" icon.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "llm_config = {\n", + " \"timeout\": 600,\n", + " \"cache_seed\": 44, # change the seed for different trials\n", + " \"config_list\": config_list_from_json(\n", + " \"OAI_CONFIG_LIST\",\n", + " # filter_dict={\"model\": [\"Sakura-SOLAR-Instruct-f16\"]},\n", + " filter_dict={\n", + " \"model\": [\"gpt-3.5-turbo\"]\n", + " }, # , \"gpt-4\", \"gpt-4-0613\", \"gpt-4-32k\", \"gpt-4-32k-0613\", \"gpt-4-1106-preview\"]},\n", + " ),\n", + " \"temperature\": 0,\n", + "}\n", + "\n", + "summarizer_llm_config = {\n", + " \"timeout\": 600,\n", + " \"cache_seed\": 44, # change the seed for different trials\n", + " \"config_list\": config_list_from_json(\n", + " \"OAI_CONFIG_LIST\",\n", + " # filter_dict={\"model\": [\"Sakura-SOLAR-Instruct-f16\"]},\n", + " filter_dict={\"model\": [\"gpt-3.5-turbo\"]},\n", + " ),\n", + " \"temperature\": 0,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure Bing\n", + "\n", + "For WebArchiverAgent to be reasonably useful, it needs to be able to search the web -- and that means it needs a Bing API key. 
\n", + "You can read more about how to get an API key on the [Bing Web Search API](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) page.\n", + "\n", + "Once you have your key, either set it as the `BING_API_KEY` system environment variable, or simply input your key below." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "bing_api_key = os.environ[\"BING_API_KEY\"] if \"BING_API_KEY\" in os.environ else \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define our agents" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Specify where our web content will be stored, we'll use this at the end of the notebook\n", + "storage_path = \"./content\"\n", + "\n", + "web_archiver_agent = WebArchiverAgent(\n", + " name=\"ContentAgent\", # Choose any name you prefer\n", + " system_message=\"You are data collection agent specializing in content on the web.\",\n", + " max_depth=0,\n", + " llm_config=llm_config,\n", + " max_consecutive_auto_reply=0,\n", + " silent=False, # *NEW* In case we want to hear the inner-conversation,\n", + " storage_path=storage_path, # *NEW* This is where our archived content is stored, defaulting to `./content`\n", + " browser_config={\n", + " \"bing_api_key\": bing_api_key,\n", + " \"type\": \"selenium\", # *NEW* Here we specify that we intend to use our headless GUI browser. The default setting is \"text\".\n", + " \"browser\": \"edge\", # *NEW* We'll use the edge browser for these tests. Choices include 'edge', 'firefox', and 'chrome'\n", + " # \"resolution\": (1400,900), # *NEW* we specify the browser window size. 
The default is (1920,5200)\n", + " \"render_text\": False, # *NEW* We still have the option to convert the output to text and render it on the screen\n", + " },\n", + ")\n", + "\n", + "# Define the user agent\n", + "user_proxy = autogen.agentchat.UserProxyAgent(\n", + " \"user_proxy\",\n", + " human_input_mode=\"NEVER\",\n", + " code_execution_config=False,\n", + " default_auto_reply=\"\",\n", + " is_termination_msg=lambda x: True,\n", + " max_consecutive_auto_reply=0,\n", + ")\n", + "\n", + "# We register our collection function as the default response\n", + "web_archiver_agent.register_reply(user_proxy, web_archiver_agent.collect_content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's take it for a spin! \n", + "The Autogen open-source framework has an academic paper on arxiv.org! We'd certainly be interested to have that in our archives for later retrieval" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to ContentAgent):\n", + "\n", + "https://arxiv.org/abs/2308.08155\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to user_proxy):\n", + "\n", + "Success: archived the following links in your chosen location ./content/ <-- https://arxiv.org/abs/2308.08155\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': 'https://arxiv.org/abs/2308.08155', 'role': 'assistant'}, {'content': 'Success: archived the following links in your chosen location ./content/ <-- https://arxiv.org/abs/2308.08155', 'role': 'user'}], summary='Success: archived the following links in your chosen location ./content/ <-- https://arxiv.org/abs/2308.08155', cost=({'total_cost': 0}, {'total_cost': 0}), 
human_input=[])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "link = \"https://arxiv.org/abs/2308.08155\"\n", + "\n", + "user_proxy.initiate_chat(web_archiver_agent, message=link)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### We'll try another, this time the examples page from the Autogen official website" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to ContentAgent):\n", + "\n", + "https://microsoft.github.io/autogen/docs/Examples\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Examples`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Automated Multi Agent Chat​`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```AutoGen offers conversable agents powered by LLM, tool or human, which can be used to perform tasks collectively via automated chat. This framework allows tool use and human participation via multi-agent conversation. 
Please find documentation about this feature here.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

AutoGen offers conversable agents powered by LLM, tool or human, which can be used to perform tasks collectively via automated chat. This framework allows tool use and human participation via multi-agent conversation.\n", + "Please find documentation about this feature here.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Links to notebook examples:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Code Generation, Execution, and Debugging`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Code Generation, Execution, and Debugging

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Multi-Agent Collaboration (>3 Agents)`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Sequential Multi-Agent Chats`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Applications`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Tool Use`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Human Involvement`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", 
+ "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Agent Teaching and Learning`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Multi-Agent Chat with OpenAI Assistants in the loop`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Multimodal Agent`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Long Context Handling`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Evaluation and Assessment`\n", + "\n", + 
"--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Automatic Agent Building`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Enhanced Inferences​`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Utilities​`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Inference Hyperparameters Tuning​`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content 
Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```AutoGen offers a cost-effective hyperparameter optimization technique EcoOptiGen for tuning Large Language Models. The research study finds that tuning hyperparameters can significantly improve the utility of them. Please find documentation about this feature here.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

AutoGen offers a cost-effective hyperparameter optimization technique EcoOptiGen for tuning Large Language Models. The research study finds that tuning hyperparameters can significantly improve the utility of them.\n", + "Please find documentation about this feature here.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Links to notebook examples:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'generator', 'content': 'Docusaurus v3.1.1'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'data-rh': 'true', 'name': 'twitter:card', 'content': 'summary_large_image'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'data-rh': 'true', 'property': 'og:locale', 'content': 'en'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'data-rh': 'true', 'name': 'docusaurus_locale', 'content': 'en'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'data-rh': 'true', 'name': 'docsearch:language', 'content': 'en'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'viewport', 'content': 'width=device-width, initial-scale=1.0', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'docusaurus_version', 'content': 'current', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'docusaurus_tag', 'content': 'docs-default-current', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'docsearch:version', 'content': 'current', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'docsearch:docusaurus_tag', 'content': 'docs-default-current', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:title', 'content': 'Examples | AutoGen', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'description', 'content': 'Automated Multi Agent Chat', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:description', 'content': 'Automated Multi Agent Chat', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to user_proxy):\n", + "\n", + "Success: archived the following links in your chosen location ./content/ <-- https://microsoft.github.io/autogen/docs/Examples\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': 'https://microsoft.github.io/autogen/docs/Examples', 'role': 'assistant'}, {'content': 'Success: archived the following links in your chosen location ./content/ <-- https://microsoft.github.io/autogen/docs/Examples', 'role': 'user'}], summary='Success: archived the following links in your chosen location ./content/ <-- https://microsoft.github.io/autogen/docs/Examples', cost=({'total_cost': 0}, {'total_cost': 0}), human_input=[])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "link = \"https://microsoft.github.io/autogen/docs/Examples\"\n", + "user_proxy.initiate_chat(web_archiver_agent, message=link)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "We see a lot of communication taking place when listening to the inner-dialog. The agent needs to confirm relevance of various pieces of content so its not storing advertisements or content not associated with the page topic.\n", + "\n", + "### We'll collect one more recent and very interesting publication by the good scientists at Microsoft" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to ContentAgent):\n", + "\n", + "https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Global`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Microsoft Research Blog`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```GraphRAG: Unlocking LLM discovery on narrative private data`\n", + "\n", 
+ "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Published February 13, 2024`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

\n", + "\t\t\t\tPublished\t\t\t\t\n", + "

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```By Jonathan Larson , Senior Principal Data Architect Steven Truitt , Principal Program Manager`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Share this page`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Perhaps the greatest challenge – and opportunity – of LLMs is extending their powerful capabilities to solve problems beyond the data on which they have been trained, and to achieve comparable results with data the LLM has never seen.  This opens new possibilities in data investigation, such as identifying themes and semantic concepts with context and grounding on datasets.  In this post, we introduce GraphRAG, created by Microsoft Research, as a significant advance in enhancing the capability of LLMs.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Perhaps the greatest challenge – and opportunity – of LLMs is extending their powerful capabilities to solve problems beyond the data on which they have been trained, and to achieve comparable results with data the LLM has never seen.  This opens new possibilities in data investigation, such as identifying themes and semantic concepts with context and grounding on datasets.  In this post, we introduce GraphRAG, created by Microsoft Research, as a significant advance in enhancing the capability of LLMs.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Retrieval-Augmented Generation (RAG) is a technique to search for information based on a user query and provide the results as reference for an AI answer to be generated. This technique is an important part of most LLM-based tools and the majority of RAG approaches use vector similarity as the search technique. GraphRAG uses LLM-generated knowledge graphs to provide substantial improvements in question-and-answer performance when conducting document analysis of complex information.  This builds upon our recent research, which points to the power of prompt augmentation when performing discovery on private datasets. Here, we define private dataset as data that the LLM is not trained on and has never seen before, such as an enterprise’s proprietary research, business documents, or communications. Baseline RAG1 was created to help solve this problem, but we observe situations where baseline RAG performs very poorly. For example:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Retrieval-Augmented Generation (RAG) is a technique to search for information based on a user query and provide the results as reference for an AI answer to be generated. This technique is an important part of most LLM-based tools and the majority of RAG approaches use vector similarity as the search technique. GraphRAG uses LLM-generated knowledge graphs to provide substantial improvements in question-and-answer performance when conducting document analysis of complex information.  This builds upon our recent research, which points to the power of prompt augmentation when performing discovery on private datasets. Here, we define private dataset as data that the LLM is not trained on and has never seen before, such as an enterprise’s proprietary research, business documents, or communications. Baseline RAG1 was created to help solve this problem, but we observe situations where baseline RAG performs very poorly. For example:

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```To address this, the tech community is working to develop methods that extend and enhance RAG (e.g., LlamaIndex (opens in new tab)).  Microsoft Research’s new approach, GraphRAG, uses the LLM to create a knowledge graph based on the private dataset.  This graph is then used alongside graph machine learning to perform prompt augmentation at query time.  GraphRAG shows substantial improvement in answering the two classes of questions described above, demonstrating intelligence or mastery that outperforms other approaches previously applied to private datasets.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

To address this, the tech community is working to develop methods that extend and enhance RAG (e.g., LlamaIndex (opens in new tab)).  Microsoft Research’s new approach, GraphRAG, uses the LLM to create a knowledge graph based on the private dataset.  This graph is then used alongside graph machine learning to perform prompt augmentation at query time.  GraphRAG shows substantial improvement in answering the two classes of questions described above, demonstrating intelligence or mastery that outperforms other approaches previously applied to private datasets.   

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Applying RAG to private datasets`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```To demonstrate the effectiveness of GraphRAG, let’s start with an investigation using the Violent Incident Information from News Articles (VIINA) dataset (opens in new tab).  This dataset was chosen due to its complexity and the presence of differing opinions and partial information.  It is a messy real-world test case that was recent enough not to be included in the LLM base model’s training.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

To demonstrate the effectiveness of GraphRAG, let’s start with an investigation using the Violent Incident Information from News Articles (VIINA) dataset (opens in new tab).  This dataset was chosen due to its complexity and the presence of differing opinions and partial information.  It is a messy real-world test case that was recent enough not to be included in the LLM base model’s training.  

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```For this research, we use thousands of news articles from both Russian and Ukrainian news sources for the month of June 2023, translated into English, to create a private dataset on which we will perform our LLM-based retrieval.  The dataset is far too large to fit into an LLM context window, thus demanding a RAG approach.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

For this research, we use thousands of news articles from both Russian and Ukrainian news sources for the month of June 2023, translated into English, to create a private dataset on which we will perform our LLM-based retrieval.  The dataset is far too large to fit into an LLM context window, thus demanding a RAG approach.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```We start with an exploratory query, which we pose to both a baseline RAG system and to our new approach, GraphRAG:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

We start with an exploratory query, which we pose to both a baseline RAG system and to our new approach, GraphRAG:

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Query: “What is Novorossiya?”`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```In these results, we can see both systems perform well – highlighting a class of query on which baseline RAG performs well.  Let’s try a query that requires connecting the dots:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

In these results, we can see both systems perform well – highlighting a class of query on which baseline RAG performs well.  Let’s try a query that requires connecting the dots:

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Query: “What has Novorossiya done?”`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Baseline RAG fails to answer this question.  Looking at the source documents inserted into the context window (Figure 1), none of the text segments discuss Novorossiya, resulting in this failure.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Baseline RAG fails to answer this question.  Looking at the source documents inserted into the context window (Figure 1), none of the text segments discuss Novorossiya, resulting in this failure.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```In comparison, the GraphRAG approach discovered an entity in the query, Novorossiya.  This allows the LLM to ground itself in the graph and results in a superior answer that contains provenance through links to the original supporting text.  For example, Figure 2 below shows the exact content the LLM used for the LLM-generated statement, “Novorossiya has been implicated in plans to blow up ATMs.” We see the snippet from the raw source documents (after English translation) that the LLM used to support the assertion that a specific bank was a target for Novorossiya via the relationship that exists between the two entities in the graph.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

In comparison, the GraphRAG approach discovered an entity in the query, Novorossiya.  This allows the LLM to ground itself in the graph and results in a superior answer that contains provenance through links to the original supporting text.  For example, Figure 2 below shows the exact content the LLM used for the LLM-generated statement, “Novorossiya has been implicated in plans to blow up ATMs.” We see the snippet from the raw source documents (after English translation) that the LLM used to support the assertion that a specific bank was a target for Novorossiya via the relationship that exists between the two entities in the graph. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```By using the LLM-generated knowledge graph, GraphRAG vastly improves the “retrieval” portion of RAG, populating the context window with higher relevance content, resulting in better answers and capturing evidence provenance.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

By using the LLM-generated knowledge graph, GraphRAG vastly improves the “retrieval” portion of RAG, populating the context window with higher relevance content, resulting in better answers and capturing evidence provenance. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Being able to trust and verify LLM-generated results is always important.  We care that the results are factually correct, coherent, and accurately represent content found in the source material. GraphRAG provides the provenance, or source grounding information, as it generates each response.  It demonstrates that an answer is grounded in the dataset.  Having the cited source for each assertion readily available also enables a human user to quickly and accurately audit the LLM’s output directly against the original source material.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Being able to trust and verify LLM-generated results is always important.  We care that the results are factually correct, coherent, and accurately represent content found in the source material. GraphRAG provides the provenance, or source grounding information, as it generates each response.  It demonstrates that an answer is grounded in the dataset.  Having the cited source for each assertion readily available also enables a human user to quickly and accurately audit the LLM’s output directly against the original source material.   

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```However, this isn’t all that’s possible using GraphRAG.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

However, this isn’t all that’s possible using GraphRAG. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Whole dataset reasoning`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Whole dataset reasoning 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Baseline RAG struggles with queries that require aggregation of information across the dataset to compose an answer. Queries such as “What are the top 5 themes in the data?” perform terribly because baseline RAG relies on a vector search of semantically similar text content within the dataset. There is nothing in the query to direct it to the correct information.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Baseline RAG struggles with queries that require aggregation of information across the dataset to compose an answer. Queries such as “What are the top 5 themes in the data?” perform terribly because baseline RAG relies on a vector search of semantically similar text content within the dataset. There is nothing in the query to direct it to the correct information. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```However, with GraphRAG we can answer such questions, because the structure of the LLM-generated knowledge graph tells us about the structure (and thus themes) of the dataset as a whole.  This allows the private dataset to be organized into meaningful semantic clusters that are pre-summarized.  The LLM uses these clusters to summarize these themes when responding to a user query.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

However, with GraphRAG we can answer such questions, because the structure of the LLM-generated knowledge graph tells us about the structure (and thus themes) of the dataset as a whole.  This allows the private dataset to be organized into meaningful semantic clusters that are pre-summarized.  The LLM uses these clusters to summarize these themes when responding to a user query. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```We illustrate whole-dataset reasoning abilities by posing the following question to the two systems:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

We illustrate whole-dataset reasoning abilities by posing the following question to the two systems: 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Query: “What are the top 5 themes in the data?“`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Looking at the results from baseline RAG, we see that none of the listed themes has much to do with the war between the two countries.  As anticipated, the vector search retrieved irrelevant text, which was inserted into the LLM’s context window.  Results that were included were likely keying on the word “theme,” resulting in a less than useful assessment of what is going on in the dataset.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Looking at the results from baseline RAG, we see that none of the listed themes has much to do with the war between the two countries.  As anticipated, the vector search retrieved irrelevant text, which was inserted into the LLM’s context window.  Results that were included were likely keying on the word “theme,” resulting in a less than useful assessment of what is going on in the dataset. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Observing the results from GraphRAG, we can clearly see that the results are far more aligned with what is going on in the dataset as a whole.  The answer provides the five main themes as well as supporting details that are observed in the dataset.  The referenced reports are pre-generated by the LLM for each semantic cluster in GraphRAG and, in turn, provide provenance back to original source material.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Observing the results from GraphRAG, we can clearly see that the results are far more aligned with what is going on in the dataset as a whole.  The answer provides the five main themes as well as supporting details that are observed in the dataset.  The referenced reports are pre-generated by the LLM for each semantic cluster in GraphRAG and, in turn, provide provenance back to original source material.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Spotlight: On-demand video`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```AI Explainer: Foundation models ​and the next era of AI`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Explore how the transformer architecture, larger models and more data, and in-context learning have helped advance AI from perception to creation.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Explore how the transformer architecture, larger models and more data, and in-context learning have helped advance AI from perception to creation.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Creating LLM-generated knowledge graphs`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Creating LLM-generated knowledge graphs

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```We note the basic flow that underpins GraphRAG, which builds upon our prior research (opens in new tab) and repositories (opens in new tab) using graph machine learning:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

We note the basic flow that underpins GraphRAG, which builds upon our prior research (opens in new tab) and repositories (opens in new tab) using graph machine learning: 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```An example visualization of the graph is shown in Figure 3.  Each circle is an entity (e.g., a person, place, or organization), with the entity size representing the number of relationships that entity has, and the color representing groupings of similar entities.  The color partitioning is a bottom-up clustering method built on top of the graph structure, which enables us to answer questions at varying levels of abstraction.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

An example visualization of the graph is shown in Figure 3.  Each circle is an entity (e.g., a person, place, or organization), with the entity size representing the number of relationships that entity has, and the color representing groupings of similar entities.  The color partitioning is a bottom-up clustering method built on top of the graph structure, which enables us to answer questions at varying levels of abstraction.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Result metrics`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```The illustrative examples above are representative of GraphRAG’s consistent improvement across multiple datasets in different subject domains.  We assess this improvement by performing an evaluation using an LLM grader to determine a pairwise winner between GraphRAG and baseline RAG.  We use a set of qualitative metrics, including comprehensiveness (completeness within the framing of the implied context of the question), human enfranchisement (provision of supporting source material or other contextual information), and diversity (provision of differing viewpoints or angles on the question posed). Initial results show that GraphRAG consistently outperforms baseline RAG on these metrics.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

The illustrative examples above are representative of GraphRAG’s consistent improvement across multiple datasets in different subject domains.  We assess this improvement by performing an evaluation using an LLM grader to determine a pairwise winner between GraphRAG and baseline RAG.  We use a set of qualitative metrics, including comprehensiveness (completeness within the framing of the implied context of the question), human enfranchisement (provision of supporting source material or other contextual information), and diversity (provision of differing viewpoints or angles on the question posed). Initial results show that GraphRAG consistently outperforms baseline RAG on these metrics.  

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```In addition to relative comparisons, we also use SelfCheckGPT (opens in new tab) to perform an absolute measurement of faithfulness to help ensure factual, coherent results grounded in the source material. Results show that GraphRAG achieves a similar level of faithfulness to baseline RAG. We are currently developing an evaluation framework to measure performance on the class of problems above.  This will include more robust mechanisms for generating question-answer test sets as well as additional metrics, such as accuracy and context relevance.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

In addition to relative comparisons, we also use SelfCheckGPT (opens in new tab) to perform an absolute measurement of faithfulness to help ensure factual, coherent results grounded in the source material. Results show that GraphRAG achieves a similar level of faithfulness to baseline RAG. We are currently developing an evaluation framework to measure performance on the class of problems above.  This will include more robust mechanisms for generating question-answer test sets as well as additional metrics, such as accuracy and context relevance. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Next steps`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```By combining LLM-generated knowledge graphs and graph machine learning, GraphRAG enables us to answer important classes of questions that we cannot attempt with baseline RAG alone.  We have seen promising results after applying this technology to a variety of scenarios, including social media, news articles, workplace productivity, and chemistry.  Looking forward, we plan to work closely with customers on a variety of new domains as we continue to apply this technology while working on metrics and robust evaluation. We look forward to sharing more as our research continues.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

By combining LLM-generated knowledge graphs and graph machine learning, GraphRAG enables us to answer important classes of questions that we cannot attempt with baseline RAG alone.  We have seen promising results after applying this technology to a variety of scenarios, including social media, news articles, workplace productivity, and chemistry.  Looking forward, we plan to work closely with customers on a variety of new domains as we continue to apply this technology while working on metrics and robust evaluation. We look forward to sharing more as our research continues.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```1As baseline RAG in this comparison we use LangChain’s Q&A (opens in new tab), a well-known representative example of this class of RAG tools in widespread use today.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Related publications`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

\n", + "\n", + "Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine\n", + "\n", + "\n", + "

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Meet the authors`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Jonathan Larson`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Senior Principal Data Architect`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Steven Truitt`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: 
`GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Principal Program Manager`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Continue reading`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Research Areas`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Related tools`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Follow 
us:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Share this page:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Notifications`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'viewport', 'content': 'width=device-width, initial-scale=1'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'twitter:dnt', 'content': 'on'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-product', 'content': 'MSR'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-stv', 'content': '8.5.0'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-sitesection', 'content': ''}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'awa-pageType', 'content': 'Post'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-market', 'content': 'en-us'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-env', 'content': 'Production'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa‐asst', 'content': '1005408'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'awa-pgidx', 'content': '1'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-pgtot', 'content': '-1'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-pgtop', 'content': 'Artificial intelligence'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'robots', 'content': 'index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. 
Should we hold onto this item? {'property': 'og:locale', 'content': 'en_US'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:type', 'content': 'article'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:title', 'content': 'GraphRAG: A new approach for discovery using complex information'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:description', 'content': 'Microsoft is transforming retrieval-augmented generation with GraphRAG, using LLM-generated knowledge graphs to significantly improve Q&A when analyzing complex information and consistently outperforming baseline RAG. 
Get the details.'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:site_name', 'content': 'Microsoft Research'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'article:published_time', 'content': '2024-02-13T20:00:00+00:00'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'article:modified_time', 'content': '2024-02-13T16:50:07+00:00'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. 
Should we hold onto this item? {'property': 'og:image:width', 'content': '1200'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:image:height', 'content': '627'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:image:type', 'content': 'image/jpeg'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'author', 'content': 'Brenda Potts'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. 
Should we hold onto this item? {'name': 'twitter:card', 'content': 'summary_large_image'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'twitter:title', 'content': 'GraphRAG: A new approach for discovery using complex information'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'twitter:description', 'content': 'Microsoft is transforming retrieval-augmented generation with GraphRAG, using LLM-generated knowledge graphs to significantly improve Q&A when analyzing complex information and consistently outperforming baseline RAG. Get the details.'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'twitter:creator', 'content': '@MSFTResearch'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'twitter:site', 'content': '@MSFTResearch'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'generator', 'content': 'WordPress 6.4.3'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'research-area', 'content': 'Artificial intelligence'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'itemprop': 'width', 'content': '216'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'itemprop': 'height', 'content': '46'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'itemprop': 'name', 'content': 'Microsoft'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'itemprop': 'width', 'content': '1024'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'itemprop': 'height', 'content': '576'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to user_proxy):\n", + "\n", + "Success: archived the following links in your chosen location ./content/ <-- https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': 'https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/', 'role': 'assistant'}, {'content': 'Success: archived the following links in your chosen location ./content/ <-- https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/', 'role': 'user'}], summary='Success: archived the following links in your chosen location ./content/ <-- https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/', cost=({'total_cost': 0}, {'total_cost': 0}), human_input=[])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "link = \"https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/\"\n", + "user_proxy.initiate_chat(web_archiver_agent, message=link)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "aiex01_blog_hero_1400x788.png\n", + "aiex01_blog_hero_1400x788.txt\n", + 
"amit_emre_podcast_hero_feature_1400x788.jpg\n", + "amit_emre_podcast_hero_feature_1400x788.txt\n", + "content.txt\n", + "emnlp-2023-blogherofeature-1400x788-1.png\n", + "emnlp-2023-blogherofeature-1400x788-1.txt\n", + "graphrag-blogherofeature-1400x788-1.png\n", + "graphrag-blogherofeature-1400x788-1.txt\n", + "graphrag-figure3.jpg\n", + "graphrag-figure3.txt\n", + "graphrag_figure1.png\n", + "graphrag_figure1.txt\n", + "graphrag_figure2.png\n", + "graphrag_figure2.txt\n", + "headshot150px.png\n", + "headshot150px.txt\n", + "index.html\n", + "links.txt\n", + "metadata.txt\n", + "msr-ai-2x.png\n", + "newsplitwise-jan-24-blogherofeature-1400x788-1.jpg\n", + "newsplitwise-jan-24-blogherofeature-1400x788-1.txt\n", + "screenshot.png\n", + "sot-blogherofeature-1400x788-1.jpg\n", + "sot-blogherofeature-1400x788-1.txt\n", + "steven-truitt_360x360.jpg\n", + "steven-truitt_360x360.txt\n" + ] + } + ], + "source": [ + "!ls {storage_path}/microsoft.com/graphrag-unlocking-llm-discovery-on-narrative-private-data/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Just for reference, what did the page look like?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "last_page = list(web_archiver_agent.process_history.keys())[-1]\n", + "\n", + "local_path = f\"{storage_path}/{get_file_path_from_url(last_page)}\"\n", + "screenshot_path = os.path.join(local_path, \"screenshot.png\")\n", + "assert os.path.exists(screenshot_path)\n", + "\n", + "# Load the image\n", + "image = Image.open(screenshot_path)\n", + "\n", + "# Display the image\n", + "display(image)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It seems the bottom was cropped, but using the 'firefox' browser for our agent will trigger the \"full page screenshot\" function.
\n", + "But not to worry, everything is also stored to disk in its original form, including the source HTML as it was loaded in the desktop browser.\n", + "\n", + "Below we confirm that our Autogen Agent successfully cataloged all of the content into the file." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We located our search term on line 14 out of a total 27 lines\n", + "\n", + "The last 3 lines stored in content were:\n", + "\n", + "In addition to relative comparisons, we also use SelfCheckGPT (opens in new tab) to perform an absolute measurement of faithfulness to help ensure factual, coherent results grounded in the source material. Results show that GraphRAG achieves a similar level of faithfulness to baseline RAG. We are currently developing an evaluation framework to measure performance on the class of problems above.  This will include more robust mechanisms for generating question-answer test sets as well as additional metrics, such as accuracy and context relevance.\n", + "\n", + "By combining LLM-generated knowledge graphs and graph machine learning, GraphRAG enables us to answer important classes of questions that we cannot attempt with baseline RAG alone.  We have seen promising results after applying this technology to a variety of scenarios, including social media, news articles, workplace productivity, and chemistry.  Looking forward, we plan to work closely with customers on a variety of new domains as we continue to apply this technology while working on metrics and robust evaluation. We look forward to sharing more as our research continues.\n", + "\n", + "Can Generalist Foundation Models Outcompete Special-Purpose Tuning? 
Case Study in Medicine\n", + "\n" + ] + } + ], + "source": [ + "with open(f\"{local_path}/content.txt\") as f:\n", + " content = f.readlines()\n", + "for idx, line in enumerate(content):\n", + " if \"What are the top 5\" in line:\n", + " break\n", + "print(f\"We located our search term on line {idx} out of a total {len(content)} lines\\n\")\n", + "print(\"The last 3 lines stored in content were:\\n\")\n", + "for i in reversed(range(1, 4)):\n", + " print(content[-i])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Thanks for looking at our new WebArchiverAgent:\n", + "### Stay tuned for more updates from Autogen!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/setup.py b/setup.py index 4f94ad5e589..d567da946ed 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,16 @@ "teachable": ["chromadb"], "lmm": ["replicate", "pillow"], "graph": ["networkx", "matplotlib"], - "websurfer": ["beautifulsoup4", "markdownify", "pdfminer.six", "pathvalidate"], + "websurfer": [ + "beautifulsoup4", + "markdownify", + "pdfminer.six", + "pathvalidate", + "selenium", + "arxiv", + "requests", + "pillow", + ], "redis": ["redis"], "jupyter-executor": jupyter_executor, "types": ["mypy==1.9.0"] + jupyter_executor, diff --git a/test/agentchat/contrib/test_web_archiver_agent.py b/test/agentchat/contrib/test_web_archiver_agent.py new file mode 100644 index 00000000000..080274ded24 --- /dev/null +++ b/test/agentchat/contrib/test_web_archiver_agent.py @@ -0,0 +1,130 @@ +import os +import sys +import re +import tempfile +import pytest +from autogen.agentchat import UserProxyAgent 
+from autogen.oai.openai_utils import filter_config, config_list_from_json +from autogen.cache import Cache + +sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) +from conftest import MOCK_OPEN_AI_API_KEY, skip_openai # noqa: E402 + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST # noqa: E402 + +try: + from openai import OpenAI +except ImportError: + skip_oai = True +else: + skip_oai = False or skip_openai + +if not skip_oai: + config_list = config_list_from_json(env_or_file=OAI_CONFIG_LIST, file_location=KEY_LOC) + from autogen.agentchat.contrib.web_archiver_agent import WebArchiverAgent + + +@pytest.mark.skipif( + skip_oai, + reason="do not run if oai is not installed", +) +def test_content_agent() -> None: + browser = "edge" + llm_config = {"config_list": config_list, "timeout": 180, "cache_seed": 42} + + model = ["gpt-3.5-turbo"] + model += [m.replace(".", "") for m in model] + + assert len(llm_config["config_list"]) > 0 # type: ignore[arg-type] + + # Define the temporary storage location + temporary_content_storage = os.path.join(tempfile.gettempdir(), "test_content_agent_storage") + print(f"Storing temporary test files in {temporary_content_storage}") + + # Define the system message for the WebArchiverAgent + content_agent_system_msg = "You are data collection agent specializing in content on the web." 
+ + # Instantiate the WebArchiverAgent + content_agent = WebArchiverAgent( + name="WebArchiverAgent", + system_message=content_agent_system_msg, + llm_config=llm_config, + max_consecutive_auto_reply=0, + # Below are the arguments specific to the WebArchiverAgent + silent=True, + storage_path=temporary_content_storage, + browser_config={"browser": browser}, + max_depth=0, + ) + + # Instantiate the User Proxy Agent + user_proxy = UserProxyAgent( + "user_proxy", + human_input_mode="NEVER", + code_execution_config=False, + default_auto_reply="", + is_termination_msg=lambda x: True, + ) + + # Register the collection process as the default reply to the user + content_agent.register_reply(user_proxy, content_agent.collect_content) + + # Define the links used during the testing process + links = ["https://microsoft.github.io/autogen/docs/Examples"] + + with Cache.disk(): + for link in links: + # Collect the content from the requested link + user_proxy.initiate_chat(content_agent, message=link) + + assert ( + content_agent.process_history[link]["url"] == link + ), "Investigate why the correct not link was reported" + + assert os.path.exists( + content_agent.process_history[link]["local_path"] + ), "The content storage path was not found" + + assert len(content_agent.process_history[link]["content"]) > 0, "No content was identified or stored" + + assert os.path.exists( + os.path.join(content_agent.process_history[link]["local_path"], "content.txt") + ), "The file path for content.txt was not found" + + assert os.path.exists( + os.path.join(content_agent.process_history[link]["local_path"], "metadata.txt") + ), "The file path for metadata.txt was not found" + + assert os.path.exists( + os.path.join(content_agent.process_history[link]["local_path"], "index.html") + ), "The file path for index.html was not found" + + assert os.path.exists( + os.path.join(content_agent.process_history[link]["local_path"], "screenshot.png") + ), "The file path for screenshot.png was not found" + 
+ assert os.path.exists( + os.path.join(content_agent.process_history[link]["local_path"], "links.txt") + ), "The file path for links.txt was not found" + + assert ( + os.path.getsize(os.path.join(content_agent.process_history[link]["local_path"], "links.txt")) > 0 + ), "The file size of links.txt was zero" + assert ( + os.path.getsize(os.path.join(content_agent.process_history[link]["local_path"], "content.txt")) > 0 + ), "The file size of content.txt was zero" + assert ( + os.path.getsize(os.path.join(content_agent.process_history[link]["local_path"], "metadata.txt")) > 0 + ), "The file size of metadata.txt was zero" + assert ( + os.path.getsize(os.path.join(content_agent.process_history[link]["local_path"], "index.html")) > 0 + ), "The file size of index.html was zero" + assert ( + os.path.getsize(os.path.join(content_agent.process_history[link]["local_path"], "screenshot.png")) > 0 + ), "The file size of screenshot.png was zero" + + +if __name__ == "__main__": + """Runs this file's tests from the command line.""" + test_content_agent() diff --git a/test/agentchat/contrib/test_web_surfer_selenium.py b/test/agentchat/contrib/test_web_surfer_selenium.py new file mode 100644 index 00000000000..1e858466a5b --- /dev/null +++ b/test/agentchat/contrib/test_web_surfer_selenium.py @@ -0,0 +1,175 @@ +import os +import sys +import re +import pytest +from autogen.agentchat import UserProxyAgent +from autogen.oai.openai_utils import filter_config, config_list_from_json +from autogen.cache import Cache + +sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) +from conftest import MOCK_OPEN_AI_API_KEY, skip_openai # noqa: E402 + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST # noqa: E402 + +BLOG_POST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" +BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? 
- A Case Study for MATH | AutoGen" +BING_QUERY = "Microsoft" + +try: + from autogen.agentchat.contrib.web_surfer import WebSurferAgent +except ImportError: + skip_all = True + print("THERE WAS AN ERROR") +else: + skip_all = False + +try: + from openai import OpenAI +except ImportError: + skip_oai = True +else: + skip_oai = False or skip_openai + +try: + BING_API_KEY = os.environ["BING_API_KEY"] +except KeyError: + skip_bing = True +else: + skip_bing = False + +if not skip_oai: + config_list = config_list_from_json(env_or_file=OAI_CONFIG_LIST, file_location=KEY_LOC) + + +@pytest.mark.skipif( + skip_all, + reason="do not run if dependency is not installed", +) +def test_web_surfer() -> None: + browser = "edge" # can be 'edge', 'firefox', or 'chrome' + with pytest.MonkeyPatch.context() as mp: + # we mock the API key so we can register functions (llm_config must be present for this to work) + mp.setenv("OPENAI_API_KEY", MOCK_OPEN_AI_API_KEY) + page_size = 4096 + web_surfer = WebSurferAgent( + "web_surfer", + llm_config={"model": "gpt-3.5-turbo", "config_list": []}, + browser_config={"viewport_size": page_size, "type": "selenium", "browser": browser}, + ) + + # Sneak a peak at the function map, allowing us to call the functions for testing here + function_map = web_surfer._user_proxy._function_map + + # Test some basic navigations + response = function_map["visit_page"](BLOG_POST_URL) + assert f"Address: {BLOG_POST_URL}".strip() in response + assert f"Title: {BLOG_POST_TITLE}".strip() in response + + # Test web search -- we don't have a key in this case, so we expect it to raise an error (but it means the code path is correct) + with pytest.raises(ValueError, match="Missing Bing API key."): + response = function_map["informational_web_search"](BING_QUERY) + + with pytest.raises(ValueError, match="Missing Bing API key."): + response = function_map["navigational_web_search"](BING_QUERY) + + # Test Q&A and summarization -- we don't have a key so we expect it to fail (but 
it means the code path is correct) + with pytest.raises(IndexError): + response = function_map["answer_from_page"]("When was it founded?") + + with pytest.raises(IndexError): + response = function_map["summarize_page"]() + + +@pytest.mark.skipif( + skip_oai, + reason="do not run if oai is not installed", +) +def test_web_surfer_oai() -> None: + browser = "edge" # can be 'edge', 'firefox', or 'chrome' + llm_config = {"config_list": config_list, "timeout": 180, "cache_seed": 42} + + # adding Azure name variations to the model list + model = ["gpt-3.5-turbo"] + model += [m.replace(".", "") for m in model] + + summarizer_llm_config = { + "config_list": filter_config(config_list, dict(model=model)), # type: ignore[no-untyped-call] + "timeout": 180, + } + + assert len(llm_config["config_list"]) > 0 # type: ignore[arg-type] + assert len(summarizer_llm_config["config_list"]) > 0 + + page_size = 4096 + web_surfer = WebSurferAgent( + "web_surfer", + llm_config=llm_config, + summarizer_llm_config=summarizer_llm_config, + browser_config={"viewport_size": page_size, "type": "selenium", "browser": browser}, + ) + + user_proxy = UserProxyAgent( + "user_proxy", + human_input_mode="NEVER", + code_execution_config=False, + default_auto_reply="", + is_termination_msg=lambda x: True, + ) + + with Cache.disk(): + # Make some requests that should test function calling + user_proxy.initiate_chat(web_surfer, message="Please visit the page 'https://en.wikipedia.org/wiki/Microsoft'") + + user_proxy.initiate_chat(web_surfer, message="Please scroll down.") + + user_proxy.initiate_chat(web_surfer, message="Please scroll up.") + + user_proxy.initiate_chat(web_surfer, message="When was it founded?") + + +@pytest.mark.skipif( + skip_bing, + reason="do not run if bing api key is not available", +) +def test_web_surfer_bing() -> None: + browser = "edge" # can be 'edge', 'firefox', or 'chrome' + page_size = 4096 + web_surfer = WebSurferAgent( + "web_surfer", + llm_config={ + "config_list": [ + { + 
"model": "gpt-3.5-turbo", + "api_key": "sk-PLACEHOLDER_KEY", + } + ] + }, + browser_config={ + "viewport_size": page_size, + "bing_api_key": BING_API_KEY, + "type": "selenium", + "browser": browser, + }, + ) + + # Sneak a peak at the function map, allowing us to call the functions for testing here + function_map = web_surfer._user_proxy._function_map + + # Test informational queries + response = function_map["informational_web_search"](BING_QUERY) + assert f"Address: bing: {BING_QUERY}" in response + assert f"Title: {BING_QUERY} - Search" in response + assert f"A Bing search for '{BING_QUERY}' found " in response + + # Test informational queries + response = function_map["navigational_web_search"](BING_QUERY + " Wikipedia") + assert "Address: https://en.wikipedia.org/wiki/" in response + + +if __name__ == "__main__": + """Runs this file's tests from the command line.""" + + test_web_surfer() + test_web_surfer_oai() + test_web_surfer_bing()