From 23ee1450bcee02c8395628cc34a44347119a0cd3 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 15:40:24 -0600 Subject: [PATCH 01/36] Update test_web_surfer.py Tests for the new Selenium WebDriver addition --- test/agentchat/contrib/test_web_surfer.py | 85 +++++++++++++++-------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/test/agentchat/contrib/test_web_surfer.py b/test/agentchat/contrib/test_web_surfer.py index d5dae0beb1c..37983ffb980 100644 --- a/test/agentchat/contrib/test_web_surfer.py +++ b/test/agentchat/contrib/test_web_surfer.py @@ -2,8 +2,8 @@ import sys import re import pytest -from autogen import UserProxyAgent, config_list_from_json -from autogen.oai.openai_utils import filter_config +from autogen.agentchat import UserProxyAgent +from autogen.oai.openai_utils import filter_config, config_list_from_json from autogen.cache import Cache sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) @@ -17,9 +17,10 @@ BING_QUERY = "Microsoft" try: - from autogen.agentchat.contrib.web_surfer import WebSurferAgent + from autogen.agentchat.contrib.web_surfer import WebSurferAgent, IS_SELENIUM_CAPABLE except ImportError: skip_all = True + print("THERE WAS AN ERROR") else: skip_all = False @@ -45,7 +46,7 @@ skip_all, reason="do not run if dependency is not installed", ) -def test_web_surfer() -> None: +def test_web_surfer(browser_type='text', web_driver=None) -> None: with pytest.MonkeyPatch.context() as mp: # we mock the API key so we can register functions (llm_config must be present for this to work) mp.setenv("OPENAI_API_KEY", MOCK_OPEN_AI_API_KEY) @@ -53,7 +54,7 @@ def test_web_surfer() -> None: web_surfer = WebSurferAgent( "web_surfer", llm_config={"model": "gpt-4", "config_list": []}, - browser_config={"viewport_size": page_size}, + browser_config={"viewport_size": page_size, 'type': browser_type, 'web_driver': web_driver}, ) # Sneak a peak at the function map, 
allowing us to call the functions for testing here @@ -69,28 +70,33 @@ def test_web_surfer() -> None: total_pages = int(m.group(1)) # type: ignore[union-attr] response = function_map["page_down"]() - assert ( - f"Viewport position: Showing page 2 of {total_pages}." in response - ) # Assumes the content is longer than one screen + if browser_type=='text': + assert ( + f"Viewport position: Showing page 2 of {total_pages}." in response + ) # Assumes the content is longer than one screen response = function_map["page_up"]() - assert f"Viewport position: Showing page 1 of {total_pages}." in response + if browser_type=='text': + assert f"Viewport position: Showing page 1 of {total_pages}." in response # Try to scroll too far back up response = function_map["page_up"]() - assert f"Viewport position: Showing page 1 of {total_pages}." in response + if browser_type=='text': + assert f"Viewport position: Showing page 1 of {total_pages}." in response # Try to scroll too far down for i in range(0, total_pages + 1): response = function_map["page_down"]() - assert f"Viewport position: Showing page {total_pages} of {total_pages}." in response + if browser_type=='text': + assert f"Viewport position: Showing page {total_pages} of {total_pages}." 
in response - # Test web search -- we don't have a key in this case, so we expect it to raise an error (but it means the code path is correct) - with pytest.raises(ValueError, match="Missing Bing API key."): - response = function_map["informational_web_search"](BING_QUERY) + if not skip_bing: + # Test web search -- we don't have a key in this case, so we expect it to raise an error (but it means the code path is correct) + with pytest.raises(ValueError, match="Missing Bing API key."): + response = function_map["informational_web_search"](BING_QUERY) - with pytest.raises(ValueError, match="Missing Bing API key."): - response = function_map["navigational_web_search"](BING_QUERY) + with pytest.raises(ValueError, match="Missing Bing API key."): + response = function_map["navigational_web_search"](BING_QUERY) # Test Q&A and summarization -- we don't have a key so we expect it to fail (but it means the code path is correct) with pytest.raises(IndexError): @@ -104,17 +110,19 @@ def test_web_surfer() -> None: skip_oai, reason="do not run if oai is not installed", ) -def test_web_surfer_oai() -> None: +def test_web_surfer_oai(browser_type='text', web_driver=None) -> None: llm_config = {"config_list": config_list, "timeout": 180, "cache_seed": 42} # adding Azure name variations to the model list - model = ["gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-16k"] + model = ["gpt-3.5-turbo", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-16k"] model += [m.replace(".", "") for m in model] summarizer_llm_config = { "config_list": filter_config(config_list, dict(model=model)), # type: ignore[no-untyped-call] "timeout": 180, } + # import ipdb + # ipdb.set_trace() assert len(llm_config["config_list"]) > 0 # type: ignore[arg-type] assert len(summarizer_llm_config["config_list"]) > 0 @@ -124,7 +132,7 @@ def test_web_surfer_oai() -> None: "web_surfer", llm_config=llm_config, summarizer_llm_config=summarizer_llm_config, - browser_config={"viewport_size": 
page_size}, + browser_config={"viewport_size": page_size, 'type': browser_type, 'web_driver': web_driver}, ) user_proxy = UserProxyAgent( @@ -135,23 +143,24 @@ def test_web_surfer_oai() -> None: is_termination_msg=lambda x: True, ) - # Make some requests that should test function calling - user_proxy.initiate_chat(web_surfer, message="Please visit the page 'https://en.wikipedia.org/wiki/Microsoft'") + with Cache.disk(): + # Make some requests that should test function calling + user_proxy.initiate_chat(web_surfer, message="Please visit the page 'https://en.wikipedia.org/wiki/Microsoft'") - user_proxy.initiate_chat(web_surfer, message="Please scroll down.") + user_proxy.initiate_chat(web_surfer, message="Please scroll down.") - user_proxy.initiate_chat(web_surfer, message="Please scroll up.") + user_proxy.initiate_chat(web_surfer, message="Please scroll up.") - user_proxy.initiate_chat(web_surfer, message="When was it founded?") + user_proxy.initiate_chat(web_surfer, message="When was it founded?") - user_proxy.initiate_chat(web_surfer, message="What's this page about?") + user_proxy.initiate_chat(web_surfer, message="What's this page about?") @pytest.mark.skipif( skip_bing, reason="do not run if bing api key is not available", ) -def test_web_surfer_bing() -> None: +def test_web_surfer_bing(browser_type='text', web_driver=None) -> None: page_size = 4096 web_surfer = WebSurferAgent( "web_surfer", @@ -163,7 +172,7 @@ def test_web_surfer_bing() -> None: } ] }, - browser_config={"viewport_size": page_size, "bing_api_key": BING_API_KEY}, + browser_config={"viewport_size": page_size, "bing_api_key": BING_API_KEY, 'type': browser_type, 'web_driver': web_driver}, ) # Sneak a peak at the function map, allowing us to call the functions for testing here @@ -183,6 +192,24 @@ def test_web_surfer_bing() -> None: if __name__ == "__main__": """Runs this file's tests from the command line.""" + test_web_surfer() - test_web_surfer_oai() - test_web_surfer_bing() + + if not skip_oai: 
+ test_web_surfer_oai() + + if not skip_bing: + test_web_surfer_bing() + + if IS_SELENIUM_CAPABLE: # Test the selenium browser if installed + + # Todo: automatically determine which is available in order to avoid unnecessary errors + selected_driver = 'edge' # can be 'edge', 'firefox', or 'chrome' + + test_web_surfer(browser_type='selenium', web_driver=selected_driver) + + if not skip_oai: + test_web_surfer_oai(browser_type='selenium', web_driver=selected_driver) + + if not skip_bing: + test_web_surfer_bing(browser_type='selenium', web_driver=selected_driver) From 2daec154b4b67623652ef18dab43d04f77f86139 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 15:46:26 -0600 Subject: [PATCH 02/36] Update browser_utils.py Inclusions of `SeleniumBrowserWrapper`, `SeleniumBrowser`, and several required helper functions that are part of the upcoming `ContentCollector` class and the `WebCollectionAgent`. --- autogen/browser_utils.py | 408 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 408 insertions(+) diff --git a/autogen/browser_utils.py b/autogen/browser_utils.py index 41d2d62f825..877793d6279 100644 --- a/autogen/browser_utils.py +++ b/autogen/browser_utils.py @@ -1,6 +1,7 @@ import json import os import requests +import traceback import re import markdownify import io @@ -26,6 +27,19 @@ except ModuleNotFoundError: pass +# The Selenium package is used to automate web browser interaction from Python +IS_SELENIUM_CAPABLE = False +try: + from selenium import webdriver + from selenium.webdriver.common.by import By + from selenium.webdriver.common.keys import Keys + + IS_SELENIUM_CAPABLE = True +except ImportError as e: + print(f"The module/package '{e.name}' is not available.") + print("Try running 'pip install selenium'. 
You may need to run 'sudo easy_install selenium' on Linux or MacOS") + print("Official selenium installation documentation: https://www.selenium.dev/documentation/webdriver/getting_started/install_library/") + raise e class SimpleTextBrowser: """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use.""" @@ -280,3 +294,397 @@ def _fetch_page(self, url: str) -> None: except requests.exceptions.RequestException as e: self.page_title = "Error" self._set_page_content(str(e)) + +get_scheme = lambda url: urlparse(url).scheme if isinstance(url,str) else url.scheme +get_domain = lambda url: urlparse(url).netloc if isinstance(url,str) else url.netloc +get_path = lambda url: urlparse(url).path if isinstance(url, str) else url.path +get_last_path = lambda url: os.path.basename(urlparse(url).path) if isinstance(url, str) else os.path.basename(url.path) + +def get_file_path_from_url(url): # URL to Directory function + """ + get_file_path_from_url function: This function takes a URL as input and returns the corresponding local file path as a string. + + Parameters: + url (str | ParseResult): The URL of the file for which the local path is to be obtained. + + Returns: + str: The local file path on the system as a string. 
+ """ + + # Remove any trailing forward slash + url = url[:-1] if url[-1] == '/' else url + + # Parse the URL + parsed_url = urlparse(url) if isinstance(url, str) else url + canonical_url = parsed_url.netloc.replace("www.","") + + if 'github' in url and len(parsed_url.path.split('/')) >= 2: + relative_path = os.path.join(canonical_url, parsed_url.path) + elif len(parsed_url.path.split('/')) >= 1: + relative_path = os.path.join(canonical_url, get_last_path(parsed_url)) + + # Remove any preceding forward slash + relative_path = relative_path[1:] if relative_path[0] == '/' else relative_path + + return relative_path + +def fix_missing_protocol(img_url, source_url): # Correct a url if it's missing the protocol + """ + Fixes a URL by adding the missing protocol (http or https) based on the provided domain. + + Parameters: + - img_url (str): The input image URL to be fixed. + - domain (str): The domain of the image URL which is used to determine the protocol. + + Returns: + - str: A corrected URL string with the missing protocol added. + """ + + protocol = get_scheme(source_url) + domain = get_domain(source_url) + + if img_url.startswith('//'): # If the URL starts with "//" + img_url = f"{protocol}:{img_url}" # Add "https:" before it + + elif not bool(domain): # domain not in img_url: + img_url = f"{protocol}://{domain}/{img_url}" + + return img_url + +def extract_pdf_text(local_pdf_path): # Returns the extracted text content from a local PDF file + """ + Extracts the text content from a local PDF file and returns it as a string. + + Parameters: + - local_pdf_path (str): The path to the local PDF file from which the text will be extracted. + + Returns: + - str: A string containing the text content of the provided PDF file. 
+ """ + + try: + text = pdfminer.high_level.extract_text(local_pdf_path) + except Exception: + traceback.print_exc() + text = '' + + return text + +def download_using_requests(driver, download_url, save_path): # `requests` downloads assisted by selenium webdriver cookies + """ + This function takes a Selenium WebDriver instance, a URL to download a file, and a path where you want to save the downloaded file. + + It first retrieves cookies from the given driver, converts them into a format suitable for use with the `requests` library, and then uses these cookies to successfully download the specified file using the `requests.get()` function. The `User-Agent` header is also set to match that used by the WebDriver instance. + + Args: + driver (webdriver.chrome.webdriver.WebDriver): A Selenium WebDriver instance, typically obtained from selenium.webdriver.Chrome() or another appropriate method for your browser of choice. + download_url (str): The URL to the file you want to download. + save_path (str): The path where you would like the downloaded file to be saved. + + Returns: + None, but successfully downloads a file from the given URL using the cookies and headers obtained from the WebDriver instance. + + Raises: + Exception: If the file cannot be downloaded due to an error in the `requests.get()` call. 
+ """ + + def get_cookies(driver): + return driver.get_cookies() + + def convert_cookies_to_requests_format(cookies): + cookie_dict = {} + for cookie in cookies: + cookie_dict[cookie['name']] = cookie['value'] + return cookie_dict + + def download_file_with_cookies(url, session_cookies, save_path, user_agent=None): + headers = { + 'User-Agent': user_agent if user_agent else 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' + } + + response = requests.get(url, cookies=session_cookies, headers=headers, stream=True) + if response.status_code == 200: + with open(save_path, 'wb') as file: + for chunk in response.iter_content(1024): + file.write(chunk) + + # Extract cookies from WebDriver + cookies = get_cookies(driver) + + # Convert cookies for use with requests + session_cookies = convert_cookies_to_requests_format(cookies) + + # Define the user-agent if you want to match the one used by your WebDriver + user_agent = driver.execute_script("return navigator.userAgent;") + + # Download file using requests with the same session cookies and headers + download_file_with_cookies(download_url, session_cookies, save_path, user_agent=user_agent) + +def SeleniumBrowser(**kwargs): # Function that loads the web driver + """ + This function launches a headless Selenium browser based on the specified 'browser'. The available options are 'edge', 'firefox', and 'chrome'. + + Parameters: + browser (str): A string specifying which browser to launch. Defaults to 'firefox'. + download_dir (str): A path to where downloaded files are stored. Defaults to None + + Returns: + webdriver: An instance of the Selenium WebDriver based on the specified browser. User can open a new page by `webdriver.get('https://www.microsoft.com')`. + + Raises: + ImportError: If selenium package is not installed, it raises an ImportError with a message suggesting to install it using pip. 
+ """ + + # Load the argumnets from kwargs + browser = kwargs.get('browser', 'edge') + download_dir = kwargs.get('download_dir', None) + + try: + from selenium import webdriver + except ImportError as e: + import logging + logger = logging.getLogger(__name__) + logger.fatal("Failed to import selenium. Try running 'pip install selenium'. You may need to run 'sudo easy_install selenium' on Linux or MacOS") + raise e + + def get_headless_options(download_dir): + options = Options() + options.headless = True + options.add_argument('--headless') + options.add_argument("--window-size=1920,5200") + options.add_argument('--downloadsEnabled') + if download_dir: + options.set_preference("download.default_directory",download_dir) + return options + + if browser.lower()=='firefox': + from selenium.webdriver.firefox.options import Options + driver = webdriver.Firefox(options=get_headless_options(download_dir)) + elif browser.lower()=='chrome': + from selenium.webdriver.chrome.options import Options + driver = webdriver.Chrome(options=get_headless_options(download_dir)) + elif browser.lower()=='edge': + from selenium.webdriver.edge.options import Options + driver = webdriver.Edge(options=get_headless_options(download_dir)) + driver.capabilities['se:downloadsEnablead'] = True + + return driver + +class SeleniumBrowserWrapper: # A wrapper to bridge compatability between SimpleTextBrowser and SeleniumBrowser + """ + SeleniumBrowserWrapper class is a wrapper that manages the interaction with a Selenium web driver. + It provides methods to control the browser, set up the viewport size, and download files. + + Parameters: + - start_page (Optional[str]): The initial URL of the web page to load. Defaults to "about:blank". + - viewport_size (Optional[int]): The width of the viewport in pixels. Defaults to 1024 * 8. + - downloads_folder (Optional[Union[str, None]]): The directory where downloaded files will be saved. If set to `None`, default downloads folder will be used. 
+ - bing_api_key (Optional[Union[str, None]]): The API key for Bing search engine. + - request_kwargs (Optional[Union[Dict[str, Any], None]]): Additional keyword arguments that can be passed for customization. + - web_driver (Optional[str]): The type of web driver to use. Defaults to 'edge'. + + Attributes: + - start_page (str): The initial URL of the web page to load. + - viewport_size (int): The width of the viewport in pixels. + - downloads_folder (Union[str, None]): The directory where downloaded files will be saved. + - history (List[str]): A list containing the URLs visited by the browser. + - page_title (Optional[str]): The title of the current web page. + - viewport_current_page (int): The index of the current web page in relation to all pages loaded. + - viewport_pages (List[Tuple[int, int]]): A list containing tuples of width and height for each viewed web page. + - bing_api_key (Optional[str]): The API key for Bing search engine. + - request_kwargs (Optional[Union[Dict[str, Any], None]]): Additional keyword arguments passed during instantiation. + - _page_content (str): The content of the current web page. + - driver: An instance of SeleniumBrowser class that manages the browser interaction. + + Notes: + - Viewport Size and Pages: The concept of viewport size and pagination doesn't directly apply to Selenium as it does in a text-based browser. Selenium interacts with the whole page. However, actions like scrolling can be simulated. + - Downloads Folder: This is handled through ChromeOptions if you need to set a default download directory. + - History Management: This wrapper maintains a simple history of visited URLs for compatibility with the SimpleTextBrowser's API. + - Page Content: Selenium's page_source property provides the HTML content of the current page, making the distinction between viewport and page content less relevant. 
+ + """ + + def __init__( + self, + start_page: Optional[str] = None, + viewport_size: Optional[int] = 1024 * 8, + downloads_folder: Optional[Union[str, None]] = None, + bing_api_key: Optional[Union[str, None]] = None, + request_kwargs: Optional[Union[Dict[str, Any], None]] = None, + web_driver: Optional[str] = 'edge', + ): + self.start_page: str = start_page if start_page else "about:blank" + self.viewport_size = viewport_size # Applies only to the standard uri types + self.downloads_folder = downloads_folder + self.history: List[str] = list() + self.page_title: Optional[str] = None + self.viewport_current_page = 0 + self.viewport_pages: List[Tuple[int, int]] = list() + self.bing_api_key = bing_api_key + self.request_kwargs = request_kwargs + + self._page_content = "" + + # Initialize the WebDriver + self.driver = SeleniumBrowser(browser=web_driver, download_dir=downloads_folder) + if start_page: + self.set_address(self.start_page) + + @property + def address(self) -> str: + """Return the address of the current page.""" + return self.history[-1] if self.history else "about:blank" + + @property + def viewport(self) -> str: + """Return the content of the current viewport.""" + return self.driver.page_source # Selenium directly interacts with the page, no viewport concept + + @property + def page_content(self) -> str: + """Return the full contents of the current page.""" + return self.viewport # In Selenium, viewport essentially contains the full page content + + def set_address(self, uri_or_path: str) -> None: + """Navigate to a given URI and update history.""" + if not uri_or_path.startswith("http:") and not uri_or_path.startswith("https:"): + uri_or_path = urljoin(self.address, uri_or_path) + self.driver.get(uri_or_path) + self.history.append(uri_or_path) + self._update_page_content() + + def visit_page(self, path_or_uri: str) -> str: + """Navigate to a page and return its content.""" + self.set_address(path_or_uri) + return self.viewport + + def page_down(self) 
-> None: + """Simulate page down action.""" + # Simulate pressing Page Down key + self.driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN) + + def page_up(self) -> None: + """Simulate page up action.""" + # Simulate pressing Page Up key + self.driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_UP) + + def _update_page_content(self) -> None: + """Update internal content state, including page title.""" + self.page_title = self.driver.title + + def close(self): + """Close the browser.""" + self.driver.quit() + + def _split_pages(self) -> None: + # # Split only regular pages + # if not self.address.startswith("http:") and not self.address.startswith("https:"): + # self.viewport_pages = [(0, len(self._page_content))] + # return + + # # Handle empty pages + # if len(self._page_content) == 0: + # self.viewport_pages = [(0, 0)] + # return + + # # Break the viewport into pages + # self.viewport_pages = [] + # start_idx = 0 + # while start_idx < len(self._page_content): + # end_idx = min(start_idx + self.viewport_size, len(self._page_content)) # type: ignore[operator] + # # Adjust to end on a space + # while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]: + # end_idx += 1 + # self.viewport_pages.append((start_idx, end_idx)) + # start_idx = end_idx + return + + def _bing_api_call(self, query: str) -> Dict[str, Dict[str, List[Dict[str, Union[str, Dict[str, str]]]]]]: + # Make sure the key was set + if self.bing_api_key is None: + raise ValueError("Missing Bing API key.") + + # Prepare the request parameters + request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {} + + if "headers" not in request_kwargs: + request_kwargs["headers"] = {} + request_kwargs["headers"]["Ocp-Apim-Subscription-Key"] = self.bing_api_key + + if "params" not in request_kwargs: + request_kwargs["params"] = {} + request_kwargs["params"]["q"] = query + request_kwargs["params"]["textDecorations"] = 
False + request_kwargs["params"]["textFormat"] = "raw" + + request_kwargs["stream"] = False + + # Make the request + response = requests.get("https://api.bing.microsoft.com/v7.0/search", **request_kwargs) + response.raise_for_status() + results = response.json() + + return results # type: ignore[no-any-return] + + def _bing_search(self, query: str) -> None: + results = self._bing_api_call(query) + + web_snippets: List[str] = list() + idx = 0 + for page in results["webPages"]["value"]: + idx += 1 + web_snippets.append(f"{idx}. [{page['name']}]({page['url']})\n{page['snippet']}") + if "deepLinks" in page: + for dl in page["deepLinks"]: + idx += 1 + web_snippets.append( + f"{idx}. [{dl['name']}]({dl['url']})\n{dl['snippet'] if 'snippet' in dl else ''}" # type: ignore[index] + ) + + news_snippets = list() + if "news" in results: + for page in results["news"]["value"]: + idx += 1 + news_snippets.append(f"{idx}. [{page['name']}]({page['url']})\n{page['description']}") + + self.page_title = f"{query} - Search" + + content = ( + f"A Bing search for '{query}' found {len(web_snippets) + len(news_snippets)} results:\n\n## Web Results\n" + + "\n\n".join(web_snippets) + ) + if len(news_snippets) > 0: + content += "\n\n## News Results:\n" + "\n\n".join(news_snippets) + self._set_page_content(content) + + def download(self, uri_or_path: str) -> None: # TODO: update this based on the new method + """Download from a given URI""" + self.driver.get(uri_or_path) + + def _fetch_page(self, url: str) -> None: + from selenium.common.exceptions import TimeoutException + try: + self.driver.get(url) + self.page_title = self.driver.title + + # Selenium WebDriver directly accesses the rendered page, + # so we don't need to manually fetch or process the HTML. + # However, you can still manipulate or extract content from the page using Selenium methods. 
+ content_type = "text/html" # Default to text/html since Selenium renders web pages + + # Example of extracting and cleaning the page content + if "wikipedia.org" in url: + body_elm = self.driver.find_element_by_css_selector("div#mw-content-text") + main_title = self.driver.title + webpage_text = "# " + main_title + "\n\n" + markdownify.MarkdownConverter().convert_soup(body_elm.get_attribute('innerHTML')) + else: + webpage_text = self.driver.find_element_by_tag_name('body').get_attribute('innerText') + + # Convert newlines, remove excessive blank lines + webpage_text = re.sub(r"\r\n", "\n", webpage_text) + self._set_page_content(re.sub(r"\n{2,}", "\n\n", webpage_text).strip()) + + except TimeoutException as e: + self.page_title = "Error" + self._set_page_content("Timeout while retrieving " + url) + From 9efb29709c554e571acdbec3df9a5bfe2228a13f Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 15:49:38 -0600 Subject: [PATCH 03/36] Update web_surfer.py Provides an optional drop-in replacement for `SimpleTextBrowser` with `SeleniumBrowserWrapper` for use-cases including pages that depend on JavaScript and others that prevent calls from `requests`. Nearly all compatibility is held through with the exception of page numbering. --- autogen/agentchat/contrib/web_surfer.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/autogen/agentchat/contrib/web_surfer.py b/autogen/agentchat/contrib/web_surfer.py index 9b7320f092f..e0342c29718 100644 --- a/autogen/agentchat/contrib/web_surfer.py +++ b/autogen/agentchat/contrib/web_surfer.py @@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional, Union, Callable, Literal, Tuple from typing_extensions import Annotated from ... 
import Agent, ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat, OpenAIWrapper -from ...browser_utils import SimpleTextBrowser +from ...browser_utils import SimpleTextBrowser, SeleniumBrowserWrapper, IS_SELENIUM_CAPABLE from ...code_utils import content_str from datetime import datetime from ...token_count_utils import count_token, get_max_token_limit @@ -55,8 +55,15 @@ def __init__( self._create_summarizer_client(summarizer_llm_config, llm_config) + # Determine if the user has requested the Selenium browser or not + browser_type = browser_config.pop('type', 'simple') + web_driver = browser_config.pop('web_driver', 'edge') + # Create the browser - self.browser = SimpleTextBrowser(**(browser_config if browser_config else {})) + if browser_type != 'text' and IS_SELENIUM_CAPABLE: + self.browser = SeleniumBrowserWrapper(**(browser_config if browser_config else {})) + else: + self.browser = SimpleTextBrowser(**(browser_config if browser_config else {})) inner_llm_config = copy.deepcopy(llm_config) From 217ed91dad9540c044c6368e77c801a3f40948eb Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 16:04:20 -0600 Subject: [PATCH 04/36] ContentAgent: Custom LLM agent for collecting online content. The ContentAgent class is a custom Autogen agent that can be used to collect and store online content from different web pages. It extends the ConversableAgent class and provides additional functionality for managing a list of additional links, storing collected content in local directories, and customizing request headers. ContentAgent uses deque to manage a list of additional links for further exploration, with a maximum depth limit set by max_depth parameter. The collected content is stored in the specified storage path (storage_path) using local directories. ContentAgent can be customized with request_kwargs and llm_config parameters during instantiation. 
The default User-Agent header is used for requests, but it can be overridden by providing a new dictionary of headers under request_kwargs. --- autogen/agentchat/contrib/content_agent.py | 379 +++++++++++++++++++++ 1 file changed, 379 insertions(+) create mode 100644 autogen/agentchat/contrib/content_agent.py diff --git a/autogen/agentchat/contrib/content_agent.py b/autogen/agentchat/contrib/content_agent.py new file mode 100644 index 00000000000..e99f16aaa8c --- /dev/null +++ b/autogen/agentchat/contrib/content_agent.py @@ -0,0 +1,379 @@ +import os +import re +import json +import traceback +import requests +from collections import deque +from fileinput import filename +from urllib.parse import urlparse, urlunparse +from bs4 import BeautifulSoup + + +from ...browser_utils import ( + SeleniumBrowser, download_using_requests, + get_domain, get_scheme, get_path, get_last_path, get_file_path_from_url, fix_missing_protocol, + extract_pdf_text # perhaps there is a more logical location to hold this +) + + +from autogen.agentchat.conversable_agent import ConversableAgent + +class ContentAgent(ConversableAgent): + """ + ContentAgent: Custom LLM agent for collecting online content. + + The ContentAgent class is a custom Autogen agent that can be used to collect and store online content from different web pages. It extends the ConversableAgent class and provides additional functionality for managing a list of additional links, storing collected content in local directories, and customizing request headers. + ContentAgent uses deque to manage a list of additional links for further exploration, with a maximum depth limit set by max_depth parameter. The collected content is stored in the specified storage path (storage_path) using local directories. + ContentAgent can be customized with request_kwargs and llm_config parameters during instantiation. 
The default User-Agent header is used for requests, but it can be overridden by providing a new dictionary of headers under request_kwargs. + + Parameters: + request_kwargs (dict): A dictionary containing key-value pairs used to configure request parameters such as headers and other options. + storage_path (str): The path where the collected content will be stored. Defaults to './content'. + max_depth (int): Maximum depth limit for exploring additional links from a web page. Defaults to 1. + page_loading_time (float): Time in seconds to wait before loading each web page. Defaults to 5. + *args, **kwargs: Additional arguments and keyword arguments to be passed to the parent class ConversableAgent. + + Software Dependencies: + - beautifulsoup4 + - pdfminer + - selenium + - arxiv + - pillow + + """ + def __init__(self, silent=True, storage_path='./content', max_depth=1, page_loading_time=5, *args, **kwargs): #request_kwargs, + super().__init__(*args, **kwargs) + + from collections import deque + self.additional_links = deque() + self.link_depth = 0 + self.max_depth = max_depth + self.local_dir = storage_path + self.page_load_time = page_loading_time + self.silent = silent + self.browser_kwargs = kwargs.get('browser_kwargs', {"browser": "firefox"}) # {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.3; rv:122.0) Gecko/20100101 Firefox/122.0"} }) + self.small_llm_config = kwargs['llm_config'] + + # Define the classifiers + self.define_classifiers() + + def classifier_to_collector_reply(self, recipient, messages, sender, config): # replacement for classify_content + last_message = messages[-1] if isinstance(messages, list) else messages + # print(last_message) + _, rep = recipient.generate_oai_reply([last_message], sender) + if 'false' in rep.lower(): rep = 'False' + elif 'true' in rep.lower(): rep = 'True' + else: rep = 'False' + return True, rep + + def define_classifiers(self): + from autogen.agentchat.assistant_agent import AssistantAgent + + # Define the 
system messages for the classifiers + self.metadata_classifier_system_msg = "Help the user identify if the metadata contains potentially useful information such as: author, title, description, a date, etc. Respond True for useful, False for not." + self.content_classifier_system_msg = "You are to classify web data as content or other (such as an adversitement) based on the page title. Respond True if it is content, False if not." + + # Define the prompt templates for the classifiers + self.content_classifier_prompt = lambda title, content: f"Title: `{title}`, Data: ```{content}`" + self.metadata_classifier_prompt = lambda content: f"We are parsing html metadata to extract useful data. Should we hold onto this item? {content}." + + # Define the metadata classifier + self.metadata_classifier = AssistantAgent( + "Metadata Classifier", + system_message=self.metadata_classifier_system_msg, + llm_config=self.small_llm_config, + max_consecutive_auto_reply=0, + ) + self.metadata_classifier.register_reply(self, self.classifier_to_collector_reply, 1) + + # Define the html content classifier + self.content_classifier = AssistantAgent( + "Content Classifier", + system_message=self.content_classifier_system_msg, + llm_config=self.small_llm_config, + max_consecutive_auto_reply=0, + ) + self.content_classifier.register_reply(self, self.classifier_to_collector_reply, 1) + + + # Main entry point + def collect_content(self, recipient, messages, sender, config): + content_type, content = '', '' + success = False + all_links = [] + for message in messages: + if message.get("role") == "user": + links = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', message.get("content")) + for link in links: + all_links.append(link) + + # Process the links provided by the user + for link in all_links: + content_type, content = self.fetch_content(link) + + # Inform self that it has completed the root level of link(s) + self.link_depth = 1 + if 
self.link_depth <= self.max_depth:
+            while len(self.additional_links) > 0:
+                additional_link = self.additional_links.pop()
+                content_type, content = self.fetch_content( additional_link )
+                all_links.append(additional_link)
+
+        self.link_depth = 0
+        return True, f"Success: archived the following links in your chosen location {self.local_dir}/ <-- {', '.join(all_links)}"
+
+    def fetch_content(self, link):
+
+        # Parse the link
+        parsed_url = urlparse(link)
+
+        # A special case for arxiv links
+        if 'arxiv' in link:
+            return 'pdf', self.fetch_arxiv_content(parsed_url)
+
+        elif parsed_url.path.endswith('.pdf'):
+            return 'pdf', self.fetch_pdf_content(link)
+
+        else:
+            return 'html', self.fetch_html_content(link)
+
+    def fetch_html_content(self, link):
+
+        # Handle web page content (html)
+
+        sd = {} # submission_data
+        sd['url'] = link
+
+        # Establish the downloads folder
+        sd['local_path'] = os.path.join( self.local_dir, get_file_path_from_url(link) )
+        os.makedirs(sd['local_path'], exist_ok=True)
+
+        # We can instantiate the browser now that we know where the files and downloads will go
+        self.browser = SeleniumBrowser(browser=self.browser_kwargs['browser'], download_dir=sd['local_path'])
+
+        if 'github' in link and 'README.md' not in link:
+            # Small patch to facilitate github repos
+            link = os.path.join(link, 'README.md')
+
+        self.browser.get(link)
+        self.browser.maximize_window()
+        self.browser.implicitly_wait(self.page_load_time)
+
+        # Define where the screenshot is stored
+        sd['browser_screenshot_path'] = os.path.join( sd['local_path'], "screenshot.png" )
+
+        # Save a screenshot of the browser window
+        self.browser.save_full_page_screenshot(sd['browser_screenshot_path'])
+
+        sd['title'] = self.browser.title
+        sd['html'] = self.browser.page_source
+
+        # Write the HTML to disk for archival purposes
+        with open(os.path.join(sd['local_path'],'index.html'), 'w', encoding='utf-8') as f:
+            f.write(str(self.browser.page_source))
+
+        # Store the BS object
+        sd['soup'] =
BeautifulSoup(sd['html'], 'html.parser') + + sd['content'] = self.identify_content( sd['soup'] ) + + # Save the content to a text file on disk + with open(os.path.join(sd['local_path'], "content.txt"), "w") as f: + for data in sd['content']: # Iterate over each record + f.write(data + "\n") # Write the content to the file + + # Save the original URL for convenience elsewhere (when parsing images) + sd['soup'].url = link + + # Parse and store the Metadata + sd['meta'] = self.identify_metadata(sd['soup']) # [ data.attrs for data in sd['soup'].find_all("meta") ] + + # Open a file to write the metadata to + with open(os.path.join(sd['local_path'], "metadata.txt"), "w") as f: + for data in sd['meta']: # Iterate over each record + f.write(json.dumps(data) + "\n") # Write the link to the file + + # Parse and store the links + sd['links'] = [{'text': link.get_text().strip(), 'href': link['href']} for link in sd['soup'].find_all('a') if link.has_attr('href') and '/' in link['href']] + + # Open a file to write the link URLs to + with open(os.path.join(sd['local_path'], "links.txt"), "w") as f: + for link in sd['links']: # Iterate over each link + f.write(json.dumps(link) + "\n") # Write the link to the file + + # Recursive link checking, up to 1 level deep past the root + if self.link_depth < 1: + # Check if we find any useful relevant links that we should catalog + if ('project' in link['text'] or 'paper' in link['text'] or 'code' in link['text']) and 'marktekpost' in link['href'].lower(): + self.additional_links.append(link['href']) + elif 'arxiv' in link['href'] or ( 'github' in link['href'] and (link['href'][:-3] != ".md" or os.path.basename(link['href'])=='README.md') ): + self.additional_links.append(link['href']) + + # Parse and store the images + self.collect_images(sd['soup'], sd['local_path']) + + # Close down the browser + self.browser.quit() + + # # Deallocate the variable contents + # self.browser = None + + return 'success' + + def fetch_pdf_content(self, 
link): + + local_pdf_path = os.path.join( self.local_dir, + os.path.join( get_file_path_from_url(link), link.split('/')[-1] ) + ) + os.makedirs(local_pdf_path, exist_ok=True) + + + response = requests.get(link, params={'headers': self.request_kwargs}) + + if response.status_code == 200: + with open(local_pdf_path, 'wb') as f: + f.write(response.content) + + # Extract text from the PDF file + text = extract_pdf_text(local_pdf_path) + + # Let's store the content to disk for later access + with open(local_pdf_path.replace('pdf','txt'), 'w') as f: + f.write(text) + + return text + else: + return None + + def fetch_arxiv_content(self, link): + # Import the arxiv library + import arxiv # todo: add try/catch + + # Identify the paper identification + arxiv_id = link.path.split('/')[-1] + + # Define the local directory + local_base_path = os.path.join( self.local_dir, get_file_path_from_url(link) ) + os.makedirs(local_base_path, exist_ok=True) + + local_pdf_path = os.path.join( local_base_path, f"{arxiv_id}.pdf" ) + + # Download the paper if we don't already have it + if not os.path.exists(local_pdf_path): + # Define the record belonging to the paper + paper = next(arxiv.Client().results(arxiv.Search(id_list=[arxiv_id]))) + + # Download the archive to the local downloads folder. + paper.download_pdf(dirpath=local_base_path, filename=f"{arxiv_id}.pdf") + + # Download the archive to the local downloads folder. 
+ paper.download_source(dirpath=local_base_path, filename=f"{arxiv_id}.tar.gz") + + text = extract_pdf_text(local_pdf_path) + + # Let's store the content to disk for later access + with open(local_pdf_path.replace('pdf','txt'), 'w') as f: + f.write(text) + + return text + + def identify_content(self, soup): + + # Get the page title for use with the queries + page_title = soup.find('head').find('title').string + + # Find and extract relevant content from soup based on the title + relevant_content = [] + + for element in soup.find_all(True): + if element.name in ["h1", "h2", "h3", "p"]: + text = element.text.strip().replace("\t"," ").replace("\n"," ") + if len(text) > 0: + while text.find(" ") != -1: + text = text.replace(" "," ") + prompt = self.content_classifier_prompt(page_title, text) + relevant = self.initiate_chat(self.content_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent).chat_history[-1]['content'] + if relevant == 'True': + relevant_content.append(text.strip()) + if not self.silent: + print(element) + + return relevant_content + + def identify_metadata(self, soup, verbose=False): + page_title = soup.find('head').find('title').string + relevant_content = [] + for data in soup.find_all("meta"): + relevant = False + + prompt = self.metadata_classifier_prompt(data.attrs) + + if 'content' in data.attrs and 'http' in data.attrs['content']: + relevant = True + elif 'content' in data.attrs: + data.attrs['content'] = data.attrs['content'].strip() + relevant = self.initiate_chat(self.metadata_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent).chat_history[-1]['content'] + elif 'property' in data.attrs: + data.attrs['property'] = data.attrs['property'].strip() + relevant = self.initiate_chat(self.metadata_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent).chat_history[-1]['content'] + elif 'name' in data.attrs: + data.attrs['name'] = data.attrs['name'].strip() + relevant = 
self.initiate_chat(self.metadata_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent).chat_history[-1]['content'] + + if relevant == 'True': + relevant_content.append(data.attrs) + if verbose: print(data.attrs) + + return relevant_content + + def collect_images(self, soup, local_path, verbose=False): + import os + def get_basename(filename): + return os.path.splitext(os.path.basename(filename))[0] + + for img in soup.find_all('img'): + + relevant = False + img_alt = img.attrs['alt'] if 'alt' in img.attrs else "" + img_src = img.attrs['src'].lower() + + if 'png;base64' in img_src: + from io import BytesIO + from PIL import Image + import base64 + + # Step 1: Strip the prefix to get the Base64 data + encoded_data = img.attrs['src'].split(",")[1] + + # Step 2: Decode the Base64 string + image_data = base64.b64decode(encoded_data) + + # Step 3: Create a BytesIO buffer from the decoded data + image_buffer = BytesIO(image_data) + + # Step 4: Open the image using PIL + image = Image.open(image_buffer) + + # Save the image to a file + image.save(f"{img_src.replace('data:image/png;base64','')[:28]}.png") + + elif 'logo' in img_src: + continue + + elif 'png' in img_src or 'jpg' in img_src or 'jpeg' in img_src or 'webp' in img_src or 'avif' in img_src or 'heif' in img_src or 'heic' in img_src or 'svg' in img_src: + + file_name = img_src.split("/")[-1] # there are other ways to do this + local_image_description_path = os.path.join(local_path, get_basename(file_name) + ".txt") + local_image_path = os.path.join(local_path, file_name) + if len(img_alt) > 0 and not os.path.exists(local_image_description_path): + with open(local_image_description_path, 'w') as f: + f.write(img_alt) + if not os.path.exists(local_image_path): + + image_url = fix_missing_protocol(img.attrs['src'], soup.url) + try: + # response = requests.get(image_url, params={'headers': self.request_kwargs}) + download_using_requests(self.browser, image_url, local_image_path) + except Exception: 
+ print(image_url, img.attrs['src']) + traceback.print_exc() From 72a165aec33fb3ca17d63fa0a094ef82ce0d4639 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 17:58:49 -0600 Subject: [PATCH 05/36] Update content_agent.py Very minor updates prior to submitting a PR --- autogen/agentchat/contrib/content_agent.py | 32 ++++++++++++---------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/autogen/agentchat/contrib/content_agent.py b/autogen/agentchat/contrib/content_agent.py index e99f16aaa8c..518654015cd 100644 --- a/autogen/agentchat/contrib/content_agent.py +++ b/autogen/agentchat/contrib/content_agent.py @@ -8,7 +8,15 @@ from urllib.parse import urlparse, urlunparse from bs4 import BeautifulSoup - +# Import the arxiv library if it is available +IS_ARXIV_CAPABLE = False +try: + import arxiv + IS_ARXIV_CAPABLE = True +except ModuleNotFoundError: + print("The 'arxiv' library was not found in this environment, but can be installed with 'pip install arxiv'.") + pass + from ...browser_utils import ( SeleniumBrowser, download_using_requests, get_domain, get_scheme, get_path, get_last_path, get_file_path_from_url, fix_missing_protocol, @@ -41,7 +49,7 @@ class ContentAgent(ConversableAgent): - pillow """ - def __init__(self, silent=True, storage_path='./content', max_depth=1, page_loading_time=5, *args, **kwargs): #request_kwargs, + def __init__(self, silent=True, storage_path='./content', max_depth=1, page_loading_time=5, *args, **kwargs): super().__init__(*args, **kwargs) from collections import deque @@ -51,15 +59,16 @@ def __init__(self, silent=True, storage_path='./content', max_depth=1, page_load self.local_dir = storage_path self.page_load_time = page_loading_time self.silent = silent - self.browser_kwargs = kwargs.get('browser_kwargs', {"browser": "firefox"}) # {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.3; rv:122.0) Gecko/20100101 Firefox/122.0"} }) + self.browser_kwargs = 
kwargs.get('browser_kwargs', {"browser": "firefox"}) + self.request_kwargs = {'headers': { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15"} } self.small_llm_config = kwargs['llm_config'] # Define the classifiers self.define_classifiers() - def classifier_to_collector_reply(self, recipient, messages, sender, config): # replacement for classify_content + def classifier_to_collector_reply(self, recipient, messages, sender, config): + # Inner dialogue reply for boolean classification results last_message = messages[-1] if isinstance(messages, list) else messages - # print(last_message) _, rep = recipient.generate_oai_reply([last_message], sender) if 'false' in rep.lower(): rep = 'False' elif 'true' in rep.lower(): rep = 'True' @@ -128,7 +137,7 @@ def fetch_content(self, link): parsed_url = urlparse(link) # A special case for arxiv links - if 'arxiv' in link: + if 'arxiv' in link and IS_ARXIV_CAPABLE: return 'pdf', self.fetch_arxiv_content(parsed_url) elif parsed_url.path.endswith('.pdf'): @@ -214,10 +223,7 @@ def fetch_html_content(self, link): # Close down the browser self.browser.quit() - - # # Deallocate the variable contents - # self.browser = None - + return 'success' def fetch_pdf_content(self, link): @@ -227,8 +233,8 @@ def fetch_pdf_content(self, link): ) os.makedirs(local_pdf_path, exist_ok=True) - - response = requests.get(link, params={'headers': self.request_kwargs}) + # This could be replaced with `download_using_requests` + response = requests.get(link, params={'headers': self.request_kwargs['headers']}) if response.status_code == 200: with open(local_pdf_path, 'wb') as f: @@ -246,8 +252,6 @@ def fetch_pdf_content(self, link): return None def fetch_arxiv_content(self, link): - # Import the arxiv library - import arxiv # todo: add try/catch # Identify the paper identification arxiv_id = link.path.split('/')[-1] From 46b24249dc78731e120daa605b2b40a859b3203e Mon Sep 17 
00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 18:41:26 -0600 Subject: [PATCH 06/36] Update browser_utils.py small fix in the `fix_missing_protocol` function --- autogen/browser_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autogen/browser_utils.py b/autogen/browser_utils.py index 877793d6279..744d9950add 100644 --- a/autogen/browser_utils.py +++ b/autogen/browser_utils.py @@ -346,7 +346,7 @@ def fix_missing_protocol(img_url, source_url): # Correct a url if it's missing t if img_url.startswith('//'): # If the URL starts with "//" img_url = f"{protocol}:{img_url}" # Add "https:" before it - elif not bool(domain): # domain not in img_url: + elif not bool(get_domain(img_url)): # domain not in img_url: img_url = f"{protocol}://{domain}/{img_url}" return img_url From d34ae1b13da07ca58a642ad4ccad53be71ea16cc Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 18:47:20 -0600 Subject: [PATCH 07/36] Update content_agent.py Small addition to maintain a dictionary of processed html content, referenced by the source URL (Uniform Resource Locator) --- autogen/agentchat/contrib/content_agent.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/autogen/agentchat/contrib/content_agent.py b/autogen/agentchat/contrib/content_agent.py index 518654015cd..055b7ea677b 100644 --- a/autogen/agentchat/contrib/content_agent.py +++ b/autogen/agentchat/contrib/content_agent.py @@ -62,6 +62,7 @@ def __init__(self, silent=True, storage_path='./content', max_depth=1, page_load self.browser_kwargs = kwargs.get('browser_kwargs', {"browser": "firefox"}) self.request_kwargs = {'headers': { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15"} } self.small_llm_config = kwargs['llm_config'] + self.process_history = {} # Define the classifiers self.define_classifiers() @@ 
-224,6 +225,9 @@ def fetch_html_content(self, link): # Close down the browser self.browser.quit() + # Log the processed link, motivated by the unit test + self.process_history[link] = sd + return 'success' def fetch_pdf_content(self, link): From 1ba9e05eb837d4758e3e3daa29e0f6db3bb7b8d1 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 18:56:25 -0600 Subject: [PATCH 08/36] Update content_agent.py --- autogen/agentchat/contrib/content_agent.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autogen/agentchat/contrib/content_agent.py b/autogen/agentchat/contrib/content_agent.py index 055b7ea677b..e120f1a3b06 100644 --- a/autogen/agentchat/contrib/content_agent.py +++ b/autogen/agentchat/contrib/content_agent.py @@ -161,7 +161,7 @@ def fetch_html_content(self, link): # We can instantiate the browser now that we know where the files and downloads will go self.browser = SeleniumBrowser(browser=self.browser_kwargs['browser'], download_dir=sd['local_path']) - if 'github' in link and 'README.md' not in link: + if 'github.com' in link and 'README.md' not in link: # Small patch to facilitate github repos link = os.path.join(link, 'README.md') @@ -216,7 +216,7 @@ def fetch_html_content(self, link): # Check if we find any useful relevant links that we should catalog if ('project' in link['text'] or 'paper' in link['text'] or 'code' in link['text']) and 'marktekpost' in link['href'].lower(): self.additional_links.append(link['href']) - elif 'arxiv' in link['href'] or ( 'github' in link['href'] and (link['href'][:-3] != ".md" or os.path.basename(link['href'])=='README.md') ): + elif 'arxiv' in link['href'] or ( 'github.com' in link['href'] and (link['href'][:-3] != ".md" or os.path.basename(link['href'])=='README.md') ): self.additional_links.append(link['href']) # Parse and store the images @@ -226,7 +226,7 @@ def fetch_html_content(self, link): self.browser.quit() # Log the processed link, 
motivated by the unit test - self.process_history[link] = sd + self.process_history[ sd['url'] ] = sd return 'success' From 84fa1b8b41232b9ba24c772fb38b396ed3dce1f3 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 20:18:31 -0600 Subject: [PATCH 09/36] Unit Tests for the ContentAgent We cover a small sample of websites, asserting expectations against a number of measurements performed on the collected content. The assertions include, but are not limited to: - the expected variables contain values - the presence of the expected output files - that the expected output files are not empty Further improvements can include: - evaluation against all choices of WebDriver to confirm functionality - evaluation against a larger sample of websites - --- test/agentchat/contrib/test_content_agent.py | 129 +++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 test/agentchat/contrib/test_content_agent.py diff --git a/test/agentchat/contrib/test_content_agent.py b/test/agentchat/contrib/test_content_agent.py new file mode 100644 index 00000000000..b8754e4f9a5 --- /dev/null +++ b/test/agentchat/contrib/test_content_agent.py @@ -0,0 +1,129 @@ +import os +import sys +import re +import tempfile +import pytest +from autogen.agentchat import UserProxyAgent +from autogen.agentchat.contrib.content_agent import ContentAgent +from autogen.oai.openai_utils import filter_config, config_list_from_json +from autogen.cache import Cache + +sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) +from conftest import MOCK_OPEN_AI_API_KEY, skip_openai # noqa: E402 + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST # noqa: E402 + +try: + from openai import OpenAI +except ImportError: + skip_oai = True +else: + skip_oai = False or skip_openai + +if not skip_oai: + config_list = config_list_from_json(env_or_file=OAI_CONFIG_LIST, 
file_location=KEY_LOC) + + +@pytest.mark.skipif( + skip_oai, + reason="do not run if oai is not installed", +) +def test_content_agent(browser:str) -> None: + + llm_config = {"config_list": config_list, "timeout": 180, "cache_seed": 42} + + model = ["gpt-3.5-turbo"] + model += [m.replace(".", "") for m in model] + + # model = ['dolphin-mistral:7b-v2.6-q8_0'] + assert len(llm_config["config_list"]) > 0 # type: ignore[arg-type] + + # Define the temporary storage location + temporary_content_storage = os.path.join( tempfile.gettempdir(), "test_content_agent_storage") + print( f"Storing temporary test files in {temporary_content_storage}" ) + + # Define the system message for the ContentAgent + content_agent_system_msg = "You are data collection agent specializing in content on the web." + + # Instantiate the ContentAgent + content_agent = ContentAgent( + name="ContentAgent", + system_message=content_agent_system_msg, + llm_config=llm_config, + max_consecutive_auto_reply=0, + silent=False, + + # Below are the arguments specific to the ContentAgent + storage_path=temporary_content_storage, + browser_kwargs={"browser": browser}, + max_depth=0, + ) + + # Instantiate the User Proxy Agent + user_proxy = UserProxyAgent( + "user_proxy", + human_input_mode="NEVER", + code_execution_config=False, + default_auto_reply="", + is_termination_msg=lambda x: True, + ) + + # Register the collection process as the default reply to the user + content_agent.register_reply(user_proxy, content_agent.collect_content) + + # Define the links used during the testing process + links = [ + "https://microsoft.github.io/autogen/docs/Examples", + "https://microsoft.github.io/autogen/docs/Getting-Started", + "https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/", + ] + + + with Cache.disk(): + + for link in links: + + # Collect the content from the requested link + user_proxy.initiate_chat(content_agent, message=link) + + assert 
content_agent.process_history[link]['url'] == link, "Investigate why the correct not link was reported" + + assert os.path.exists( content_agent.process_history[link]['local_path'] ), "The content storage path was not found" + + assert len(content_agent.process_history[link]['content']) > 0, "No content was identified or stored" + + assert os.path.exists( + os.path.join( content_agent.process_history[link]['local_path'], 'content.txt') + ), "The file path for content.txt was not found" + + assert os.path.exists( + os.path.join( content_agent.process_history[link]['local_path'], 'metadata.txt') + ), "The file path for metadata.txt was not found" + + assert os.path.exists( + os.path.join( content_agent.process_history[link]['local_path'], 'index.html') + ), "The file path for index.html was not found" + + assert os.path.exists( + os.path.join( content_agent.process_history[link]['local_path'], 'screenshot.png') + ), "The file path for screenshot.png was not found" + + assert os.path.exists( + os.path.join( content_agent.process_history[link]['local_path'], 'links.txt') + ), "The file path for links.txt was not found" + + assert os.path.getsize( os.path.join( content_agent.process_history[link]['local_path'], 'links.txt') ) > 0, "The file size of links.txt was zero" + assert os.path.getsize( os.path.join( content_agent.process_history[link]['local_path'], 'content.txt') ) > 0, "The file size of content.txt was zero" + assert os.path.getsize( os.path.join( content_agent.process_history[link]['local_path'], 'metadata.txt') ) > 0, "The file size of metadata.txt was zero" + assert os.path.getsize( os.path.join( content_agent.process_history[link]['local_path'], 'index.html') ) > 0, "The file size of index.html was zero" + assert os.path.getsize( os.path.join( content_agent.process_history[link]['local_path'], 'screenshot.png') ) > 0, "The file size of screenshot.png was zero" + + print() + print( f"All done, feel free to browse the collected content at: 
{temporary_content_storage}" ) + +if __name__ == "__main__": + """Runs this file's tests from the command line.""" + + if not skip_oai: + test_content_agent(browser="firefox") From 67f95bffcca041bccf383896be76ae1e30ea87da Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 20:33:34 -0600 Subject: [PATCH 10/36] Update browser_utils.py It's noted that `_set_page_content`, `_split_pages`, and `viewport` are likely not yet compatible but seemingly not necessary at this time for the selenium browser wrapper class. --- autogen/browser_utils.py | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/autogen/browser_utils.py b/autogen/browser_utils.py index 744d9950add..0eecfb473df 100644 --- a/autogen/browser_utils.py +++ b/autogen/browser_utils.py @@ -318,7 +318,7 @@ def get_file_path_from_url(url): # URL to Directory function parsed_url = urlparse(url) if isinstance(url, str) else url canonical_url = parsed_url.netloc.replace("www.","") - if 'github' in url and len(parsed_url.path.split('/')) >= 2: + if 'github.com' in url and len(parsed_url.path.split('/')) >= 2: relative_path = os.path.join(canonical_url, parsed_url.path) elif len(parsed_url.path.split('/')) >= 1: relative_path = os.path.join(canonical_url, get_last_path(parsed_url)) @@ -577,26 +577,7 @@ def close(self): self.driver.quit() def _split_pages(self) -> None: - # # Split only regular pages - # if not self.address.startswith("http:") and not self.address.startswith("https:"): - # self.viewport_pages = [(0, len(self._page_content))] - # return - - # # Handle empty pages - # if len(self._page_content) == 0: - # self.viewport_pages = [(0, 0)] - # return - - # # Break the viewport into pages - # self.viewport_pages = [] - # start_idx = 0 - # while start_idx < len(self._page_content): - # end_idx = min(start_idx + self.viewport_size, len(self._page_content)) # type: ignore[operator] - # # Adjust to end on a 
space
-        #     while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
-        #         end_idx += 1
-        #     self.viewport_pages.append((start_idx, end_idx))
-        #     start_idx = end_idx
+        # This is not implemented with the selenium.webdriver wrapper
         return

     def _bing_api_call(self, query: str) -> Dict[str, Dict[str, List[Dict[str, Union[str, Dict[str, str]]]]]]:
@@ -674,12 +655,13 @@ def _fetch_page(self, url: str) -> None:

         # Example of extracting and cleaning the page content
         if "wikipedia.org" in url:
-            body_elm = self.driver.find_element_by_css_selector("div#mw-content-text")
+
+            body_elm = self.driver.find_element(By.CSS_SELECTOR, 'div#mw-content-text')
             main_title = self.driver.title
             webpage_text = "# " + main_title + "\n\n" + markdownify.MarkdownConverter().convert_soup(body_elm.get_attribute('innerHTML'))
         else:
-            webpage_text = self.driver.find_element_by_tag_name('body').get_attribute('innerText')
-
+            webpage_text = self.driver.find_element(By.TAG_NAME,'body').get_attribute('innerText')
+
         # Convert newlines, remove excessive blank lines
         webpage_text = re.sub(r"\r\n", "\n", webpage_text)
         self._set_page_content(re.sub(r"\n{2,}", "\n\n", webpage_text).strip())

From 08f8ff90d1886d584cab9e16c3c921562ccd864a Mon Sep 17 00:00:00 2001
From: signalprime <15487280+signalprime@users.noreply.github.com>
Date: Mon, 19 Feb 2024 20:38:59 -0600
Subject: [PATCH 11/36] Update web_surfer.py

Small updates on imports that have been recently refactored to other locations. Specifically:
```
from ..agent import Agent
from ..
import ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat from ...oai.client import OpenAIWrapper ``` --- autogen/agentchat/contrib/web_surfer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/autogen/agentchat/contrib/web_surfer.py b/autogen/agentchat/contrib/web_surfer.py index e0342c29718..a4ee63bfece 100644 --- a/autogen/agentchat/contrib/web_surfer.py +++ b/autogen/agentchat/contrib/web_surfer.py @@ -5,7 +5,9 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union, Callable, Literal, Tuple from typing_extensions import Annotated -from ... import Agent, ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat, OpenAIWrapper +from ..agent import Agent +from .. import ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat +from ...oai.client import OpenAIWrapper from ...browser_utils import SimpleTextBrowser, SeleniumBrowserWrapper, IS_SELENIUM_CAPABLE from ...code_utils import content_str from datetime import datetime From 39544125bc14e48bbbd979e68a80a07b873c9989 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 20:44:28 -0600 Subject: [PATCH 12/36] Update content_agent.py A small change to declaring `self.browser_kwargs` prior to initializing the parent class (ConversableAgent). This is done to avoid triggering an unexpected argument error for `browser_kwargs`. 
--- autogen/agentchat/contrib/content_agent.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autogen/agentchat/contrib/content_agent.py b/autogen/agentchat/contrib/content_agent.py index e120f1a3b06..0180077b2b6 100644 --- a/autogen/agentchat/contrib/content_agent.py +++ b/autogen/agentchat/contrib/content_agent.py @@ -50,6 +50,8 @@ class ContentAgent(ConversableAgent): """ def __init__(self, silent=True, storage_path='./content', max_depth=1, page_loading_time=5, *args, **kwargs): + + self.browser_kwargs = kwargs.pop('browser_kwargs', {"browser": "firefox"}) super().__init__(*args, **kwargs) from collections import deque @@ -59,7 +61,6 @@ def __init__(self, silent=True, storage_path='./content', max_depth=1, page_load self.local_dir = storage_path self.page_load_time = page_loading_time self.silent = silent - self.browser_kwargs = kwargs.get('browser_kwargs', {"browser": "firefox"}) self.request_kwargs = {'headers': { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15"} } self.small_llm_config = kwargs['llm_config'] self.process_history = {} From 749a55696047ad19edbf6d94d4d3d3d0f85ba66c Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 23:19:30 -0600 Subject: [PATCH 13/36] Update content_agent.py fixing the following pre-commit errors: autogen/agentchat/contrib/content_agent.py:21:1: E402 Module level import not at top of file autogen/agentchat/contrib/content_agent.py:34:1: E402 Module level import not at top of file autogen/agentchat/contrib/content_agent.py:65:33: F811 Redefinition of unused `deque` from line 6 autogen/agentchat/contrib/content_agent.py:374:26: F811 Redefinition of unused `filename` from line 7 --- autogen/agentchat/contrib/content_agent.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/autogen/agentchat/contrib/content_agent.py 
b/autogen/agentchat/contrib/content_agent.py index 0180077b2b6..0ff7e7c8a33 100644 --- a/autogen/agentchat/contrib/content_agent.py +++ b/autogen/agentchat/contrib/content_agent.py @@ -1,3 +1,11 @@ +from ..conversable_agent import ConversableAgent +from ..assistant_agent import AssistantAgent +from ...browser_utils import ( + SeleniumBrowser, download_using_requests, + get_domain, get_scheme, get_path, get_last_path, get_file_path_from_url, fix_missing_protocol, + extract_pdf_text +) + import os import re import json @@ -17,14 +25,7 @@ print("The 'arxiv' library was not found in this environment, but can be installed with 'pip install arxiv'.") pass -from ...browser_utils import ( - SeleniumBrowser, download_using_requests, - get_domain, get_scheme, get_path, get_last_path, get_file_path_from_url, fix_missing_protocol, - extract_pdf_text # perhaps there is a more logical location to hold this -) - -from autogen.agentchat.conversable_agent import ConversableAgent class ContentAgent(ConversableAgent): """ @@ -54,7 +55,6 @@ def __init__(self, silent=True, storage_path='./content', max_depth=1, page_load self.browser_kwargs = kwargs.pop('browser_kwargs', {"browser": "firefox"}) super().__init__(*args, **kwargs) - from collections import deque self.additional_links = deque() self.link_depth = 0 self.max_depth = max_depth @@ -78,7 +78,7 @@ def classifier_to_collector_reply(self, recipient, messages, sender, config): return True, rep def define_classifiers(self): - from autogen.agentchat.assistant_agent import AssistantAgent + f # Define the system messages for the classifiers self.metadata_classifier_system_msg = "Help the user identify if the metadata contains potentially useful information such as: author, title, description, a date, etc. Respond True for useful, False for not." 
From 818a010646f836b3c0d8a22409c0939c06e4053d Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 23:31:25 -0600 Subject: [PATCH 14/36] Update browser_utils.py Fixing the redundant import of selenium webdriver within `SeleniumBrowser` --- autogen/browser_utils.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/autogen/browser_utils.py b/autogen/browser_utils.py index 0eecfb473df..7ea24608780 100644 --- a/autogen/browser_utils.py +++ b/autogen/browser_utils.py @@ -439,14 +439,6 @@ def SeleniumBrowser(**kwargs): # Function that loads the web driver browser = kwargs.get('browser', 'edge') download_dir = kwargs.get('download_dir', None) - try: - from selenium import webdriver - except ImportError as e: - import logging - logger = logging.getLogger(__name__) - logger.fatal("Failed to import selenium. Try running 'pip install selenium'. You may need to run 'sudo easy_install selenium' on Linux or MacOS") - raise e - def get_headless_options(download_dir): options = Options() options.headless = True From 643bad03c682d092ea1be46f965831307733b225 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Mon, 19 Feb 2024 23:48:09 -0600 Subject: [PATCH 15/36] Update content_agent.py Small corrections based on pre-commit errors, both resulting in removed code: content_agent.py:94:9: F821 Undefined name `f` content_agent.py:371:26: F811 Redefinition of unused `filename` from line 21 --- autogen/agentchat/contrib/content_agent.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/autogen/agentchat/contrib/content_agent.py b/autogen/agentchat/contrib/content_agent.py index 0ff7e7c8a33..0e4ffc649ac 100644 --- a/autogen/agentchat/contrib/content_agent.py +++ b/autogen/agentchat/contrib/content_agent.py @@ -12,7 +12,6 @@ import traceback import requests from collections import deque -from fileinput import filename from urllib.parse import urlparse, 
urlunparse from bs4 import BeautifulSoup @@ -78,7 +77,6 @@ def classifier_to_collector_reply(self, recipient, messages, sender, config): return True, rep def define_classifiers(self): - f # Define the system messages for the classifiers self.metadata_classifier_system_msg = "Help the user identify if the metadata contains potentially useful information such as: author, title, description, a date, etc. Respond True for useful, False for not." @@ -336,7 +334,7 @@ def identify_metadata(self, soup, verbose=False): return relevant_content def collect_images(self, soup, local_path, verbose=False): - import os + def get_basename(filename): return os.path.splitext(os.path.basename(filename))[0] From 20cd2a67da3bf7a39f680c804281a2a184148f1a Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Tue, 20 Feb 2024 00:44:36 -0600 Subject: [PATCH 16/36] Update browser_utils.py pre-commit fixes for: autogen/browser_utils.py:455: argumnets ==> arguments autogen/browser_utils.py:486: compatability ==> compatibility --- autogen/browser_utils.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/autogen/browser_utils.py b/autogen/browser_utils.py index 7ea24608780..68266b9e089 100644 --- a/autogen/browser_utils.py +++ b/autogen/browser_utils.py @@ -33,6 +33,9 @@ from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys + from selenium.webdriver.edge.options import Options as EdgeOptions + from selenium.webdriver.firefox.options import Options as FirefoxOptions + from selenium.webdriver.chrome.options import Options as ChromeOptions IS_SELENIUM_CAPABLE = True except ImportError as e: @@ -439,8 +442,7 @@ def SeleniumBrowser(**kwargs): # Function that loads the web driver browser = kwargs.get('browser', 'edge') download_dir = kwargs.get('download_dir', None) - def get_headless_options(download_dir): - options = Options() + def 
get_headless_options(download_dir, options): options.headless = True options.add_argument('--headless') options.add_argument("--window-size=1920,5200") @@ -449,15 +451,13 @@ def get_headless_options(download_dir): options.set_preference("download.default_directory",download_dir) return options - if browser.lower()=='firefox': - from selenium.webdriver.firefox.options import Options - driver = webdriver.Firefox(options=get_headless_options(download_dir)) + if browser.lower()=='edge': + driver = webdriver.Edge(options=get_headless_options(download_dir, EdgeOptions())) + elif browser.lower()=='firefox': + driver = webdriver.Firefox(options=get_headless_options(download_dir, FirefoxOptions())) elif browser.lower()=='chrome': - from selenium.webdriver.chrome.options import Options - driver = webdriver.Chrome(options=get_headless_options(download_dir)) - elif browser.lower()=='edge': - from selenium.webdriver.edge.options import Options - driver = webdriver.Edge(options=get_headless_options(download_dir)) + driver = webdriver.Chrome(options=get_headless_options(download_dir, ChromeOptions())) + driver.capabilities['se:downloadsEnablead'] = True return driver From 0389387f527dfc7c7e074e3ac43908013c5fc56c Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Tue, 20 Feb 2024 02:02:55 -0600 Subject: [PATCH 17/36] Update test_web_surfer.py Still a bit new to the unit test framework and had to remove some conditional statements that are covered elsewhere --- test/agentchat/contrib/test_web_surfer.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/test/agentchat/contrib/test_web_surfer.py b/test/agentchat/contrib/test_web_surfer.py index 37983ffb980..69f9a3f0dd7 100644 --- a/test/agentchat/contrib/test_web_surfer.py +++ b/test/agentchat/contrib/test_web_surfer.py @@ -194,12 +194,8 @@ def test_web_surfer_bing(browser_type='text', web_driver=None) -> None: """Runs this file's tests from the command 
line.""" test_web_surfer() - - if not skip_oai: - test_web_surfer_oai() - - if not skip_bing: - test_web_surfer_bing() + test_web_surfer_oai() + test_web_surfer_bing() if IS_SELENIUM_CAPABLE: # Test the selenium browser if installed @@ -207,9 +203,5 @@ def test_web_surfer_bing(browser_type='text', web_driver=None) -> None: selected_driver = 'edge' # can be 'edge', 'firefox', or 'chrome' test_web_surfer(browser_type='selenium', web_driver=selected_driver) - - if not skip_oai: - test_web_surfer_oai(browser_type='selenium', web_driver=selected_driver) - - if not skip_bing: - test_web_surfer_bing(browser_type='selenium', web_driver=selected_driver) + test_web_surfer_oai(browser_type='selenium', web_driver=selected_driver) + test_web_surfer_bing(browser_type='selenium', web_driver=selected_driver) From be89b9bcb458962dc68f3759b01776005eabe944 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Tue, 20 Feb 2024 09:07:27 +0000 Subject: [PATCH 18/36] Updates to include selenium in websurfer extras, webdrivers in the python-package.yml workflow, and additional small fixes to bring the PR into compliance --- .github/workflows/python-package.yml | 12 + autogen/agentchat/contrib/content_agent.py | 325 ++++++++++--------- autogen/agentchat/contrib/web_surfer.py | 6 +- autogen/browser_utils.py | 157 +++++---- setup.py | 2 +- test/agentchat/contrib/test_content_agent.py | 100 +++--- test/agentchat/contrib/test_web_surfer.py | 40 ++- 7 files changed, 364 insertions(+), 278 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4f57c10ef70..0db7c66b7fe 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -54,3 +54,15 @@ jobs: TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} shell: pwsh run: twine upload dist/* + + - name: Setup edge + uses: browser-actions/setup-edge@latest + if: ${{ matrix.browsers == 'edge' }} + + - name: Setup firefox + uses: 
browser-actions/setup-firefox@latest + if: ${{ matrix.browsers == 'firefox' }} + + - name: Setup chrome + uses: browser-actions/setup-chrome@latest + if: ${{ matrix.browsers == 'chrome' }} diff --git a/autogen/agentchat/contrib/content_agent.py b/autogen/agentchat/contrib/content_agent.py index 0e4ffc649ac..c20cb1f0cb0 100644 --- a/autogen/agentchat/contrib/content_agent.py +++ b/autogen/agentchat/contrib/content_agent.py @@ -1,9 +1,15 @@ from ..conversable_agent import ConversableAgent from ..assistant_agent import AssistantAgent from ...browser_utils import ( - SeleniumBrowser, download_using_requests, - get_domain, get_scheme, get_path, get_last_path, get_file_path_from_url, fix_missing_protocol, - extract_pdf_text + SeleniumBrowser, + download_using_requests, + get_domain, + get_scheme, + get_path, + get_last_path, + get_file_path_from_url, + fix_missing_protocol, + extract_pdf_text, ) import os @@ -19,11 +25,11 @@ IS_ARXIV_CAPABLE = False try: import arxiv + IS_ARXIV_CAPABLE = True except ModuleNotFoundError: print("The 'arxiv' library was not found in this environment, but can be installed with 'pip install arxiv'.") pass - class ContentAgent(ConversableAgent): @@ -48,43 +54,51 @@ class ContentAgent(ConversableAgent): - arxiv - pillow - """ - def __init__(self, silent=True, storage_path='./content', max_depth=1, page_loading_time=5, *args, **kwargs): + """ - self.browser_kwargs = kwargs.pop('browser_kwargs', {"browser": "firefox"}) + def __init__(self, silent=True, storage_path="./content", max_depth=1, page_loading_time=5, *args, **kwargs): + self.browser_kwargs = kwargs.pop("browser_kwargs", {"browser": "firefox"}) super().__init__(*args, **kwargs) self.additional_links = deque() - self.link_depth = 0 - self.max_depth = max_depth - self.local_dir = storage_path - self.page_load_time = page_loading_time - self.silent = silent - self.request_kwargs = {'headers': { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like 
Gecko) Version/17.2.1 Safari/605.1.15"} } - self.small_llm_config = kwargs['llm_config'] - self.process_history = {} + self.link_depth = 0 + self.max_depth = max_depth + self.local_dir = storage_path + self.page_load_time = page_loading_time + self.silent = silent + self.request_kwargs = { + "headers": { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15" + } + } + self.small_llm_config = kwargs["llm_config"] + self.process_history = {} # Define the classifiers self.define_classifiers() def classifier_to_collector_reply(self, recipient, messages, sender, config): # Inner dialogue reply for boolean classification results - last_message = messages[-1] if isinstance(messages, list) else messages + last_message = messages[-1] if isinstance(messages, list) else messages _, rep = recipient.generate_oai_reply([last_message], sender) - if 'false' in rep.lower(): rep = 'False' - elif 'true' in rep.lower(): rep = 'True' - else: rep = 'False' + if "false" in rep.lower(): + rep = "False" + elif "true" in rep.lower(): + rep = "True" + else: + rep = "False" return True, rep - - def define_classifiers(self): + def define_classifiers(self): # Define the system messages for the classifiers self.metadata_classifier_system_msg = "Help the user identify if the metadata contains potentially useful information such as: author, title, description, a date, etc. Respond True for useful, False for not." - self.content_classifier_system_msg = "You are to classify web data as content or other (such as an adversitement) based on the page title. Respond True if it is content, False if not." + self.content_classifier_system_msg = "You are to classify web data as content or other (such as an adversitement) based on the page title. Respond True if it is content, False if not." 
# Define the prompt templates for the classifiers - self.content_classifier_prompt = lambda title, content: f"Title: `{title}`, Data: ```{content}`" - self.metadata_classifier_prompt = lambda content: f"We are parsing html metadata to extract useful data. Should we hold onto this item? {content}." + self.content_classifier_prompt = lambda title, content: f"Title: `{title}`, Data: ```{content}`" + self.metadata_classifier_prompt = ( + lambda content: f"We are parsing html metadata to extract useful data. Should we hold onto this item? {content}." + ) # Define the metadata classifier self.metadata_classifier = AssistantAgent( @@ -104,172 +118,179 @@ def define_classifiers(self): ) self.content_classifier.register_reply(self, self.classifier_to_collector_reply, 1) - # Main entry point def collect_content(self, recipient, messages, sender, config): - content_type, content = '', '' - success = False + content_type, content = "", "" all_links = [] for message in messages: if message.get("role") == "user": - links = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', message.get("content")) + links = re.findall( + r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", + message.get("content"), + ) for link in links: all_links.append(link) # Process the links provided by the user for link in all_links: content_type, content = self.fetch_content(link) - + # Inform self that it has completed the root level of link(s) self.link_depth = 1 if self.link_depth <= self.max_depth: while len(self.additional_links) > 0: additional_link = self.additional_links.pop() - content_type, content = self.fetch_content( additional_link ) + content_type, content = self.fetch_content(additional_link) all_links.append(all_links) self.link_depth = 0 - return True, f"Success: archived the following links in your chosen location {self.local_dir}/ <-- {', '.join(all_links)}" - - def fetch_content(self, link): + return ( + True, 
+ f"Success: archived the following links in your chosen location {self.local_dir}/ <-- {', '.join(all_links)}", + ) - # Parse the link + def fetch_content(self, link): + # Parse the link parsed_url = urlparse(link) # A special case for arxiv links - if 'arxiv' in link and IS_ARXIV_CAPABLE: - return 'pdf', self.fetch_arxiv_content(parsed_url) - - elif parsed_url.path.endswith('.pdf'): - return 'pdf', self.fetch_pdf_content(link) - + if "arxiv" in link and IS_ARXIV_CAPABLE: + return "pdf", self.fetch_arxiv_content(parsed_url) + + elif parsed_url.path.endswith(".pdf"): + return "pdf", self.fetch_pdf_content(link) + else: - return 'html', self.fetch_html_content(link) + return "html", self.fetch_html_content(link) def fetch_html_content(self, link): - # Handle web page content (html) - sd = {} # submission_data - sd['url'] = link + sd = {} # submission_data + sd["url"] = link # Establish the downloads folder - sd['local_path'] = os.path.join( self.local_dir, get_file_path_from_url(link) ) - os.makedirs(sd['local_path'], exist_ok=True) + sd["local_path"] = os.path.join(self.local_dir, get_file_path_from_url(link)) + os.makedirs(sd["local_path"], exist_ok=True) # We can instantiate the browser now that we know where the files and downloads will go - self.browser = SeleniumBrowser(browser=self.browser_kwargs['browser'], download_dir=sd['local_path']) + self.browser = SeleniumBrowser(browser=self.browser_kwargs["browser"], download_dir=sd["local_path"]) - if 'github.com' in link and 'README.md' not in link: + if "github.com" in link and "README.md" not in link: # Small patch to facilitate github repos - link = os.path.join(link, 'README.md') + link = os.path.join(link, "README.md") self.browser.get(link) self.browser.maximize_window() self.browser.implicitly_wait(self.page_load_time) - + # Define where the screeshot is stored - sd['browser_screenshot_path'] = os.path.join( sd['local_path'], "screenshot.png" ) + sd["browser_screenshot_path"] = 
os.path.join(sd["local_path"], "screenshot.png") # Save a screenshot of the browser window - self.browser.save_full_page_screenshot(sd['browser_screenshot_path']) - - sd['title'] = self.browser.title - sd['html'] = self.browser.page_source + self.browser.save_full_page_screenshot(sd["browser_screenshot_path"]) + + sd["title"] = self.browser.title + sd["html"] = self.browser.page_source # Write the HTML to disk for archival purposes - with open(os.path.join(sd['local_path'],'index.html'), 'w', encoding='utf-8') as f: + with open(os.path.join(sd["local_path"], "index.html"), "w", encoding="utf-8") as f: f.write(str(self.browser.page_source)) # Store the BS object - sd['soup'] = BeautifulSoup(sd['html'], 'html.parser') - - sd['content'] = self.identify_content( sd['soup'] ) - + sd["soup"] = BeautifulSoup(sd["html"], "html.parser") + + sd["content"] = self.identify_content(sd["soup"]) + # Save the content to a text file on disk - with open(os.path.join(sd['local_path'], "content.txt"), "w") as f: - for data in sd['content']: # Iterate over each record - f.write(data + "\n") # Write the content to the file + with open(os.path.join(sd["local_path"], "content.txt"), "w") as f: + for data in sd["content"]: # Iterate over each record + f.write(data + "\n") # Write the content to the file # Save the original URL for convenience elsewhere (when parsing images) - sd['soup'].url = link + sd["soup"].url = link # Parse and store the Metadata - sd['meta'] = self.identify_metadata(sd['soup']) # [ data.attrs for data in sd['soup'].find_all("meta") ] - + sd["meta"] = self.identify_metadata(sd["soup"]) # [ data.attrs for data in sd['soup'].find_all("meta") ] + # Open a file to write the metadata to - with open(os.path.join(sd['local_path'], "metadata.txt"), "w") as f: - for data in sd['meta']: # Iterate over each record - f.write(json.dumps(data) + "\n") # Write the link to the file + with open(os.path.join(sd["local_path"], "metadata.txt"), "w") as f: + for data in sd["meta"]: # 
Iterate over each record + f.write(json.dumps(data) + "\n") # Write the link to the file # Parse and store the links - sd['links'] = [{'text': link.get_text().strip(), 'href': link['href']} for link in sd['soup'].find_all('a') if link.has_attr('href') and '/' in link['href']] - + sd["links"] = [ + {"text": link.get_text().strip(), "href": link["href"]} + for link in sd["soup"].find_all("a") + if link.has_attr("href") and "/" in link["href"] + ] + # Open a file to write the link URLs to - with open(os.path.join(sd['local_path'], "links.txt"), "w") as f: - for link in sd['links']: # Iterate over each link - f.write(json.dumps(link) + "\n") # Write the link to the file + with open(os.path.join(sd["local_path"], "links.txt"), "w") as f: + for link in sd["links"]: # Iterate over each link + f.write(json.dumps(link) + "\n") # Write the link to the file # Recursive link checking, up to 1 level deep past the root if self.link_depth < 1: # Check if we find any useful relevant links that we should catalog - if ('project' in link['text'] or 'paper' in link['text'] or 'code' in link['text']) and 'marktekpost' in link['href'].lower(): - self.additional_links.append(link['href']) - elif 'arxiv' in link['href'] or ( 'github.com' in link['href'] and (link['href'][:-3] != ".md" or os.path.basename(link['href'])=='README.md') ): - self.additional_links.append(link['href']) - + if ( + "project" in link["text"] or "paper" in link["text"] or "code" in link["text"] + ) and "marktekpost" in link["href"].lower(): + self.additional_links.append(link["href"]) + elif "arxiv" in link["href"] or ( + "github.com" in link["href"] + and (link["href"][:-3] != ".md" or os.path.basename(link["href"]) == "README.md") + ): + self.additional_links.append(link["href"]) + # Parse and store the images - self.collect_images(sd['soup'], sd['local_path']) - + self.collect_images(sd["soup"], sd["local_path"]) + # Close down the browser self.browser.quit() - + # Log the processed link, motivated by the unit 
test - self.process_history[ sd['url'] ] = sd + self.process_history[sd["url"]] = sd - return 'success' - - def fetch_pdf_content(self, link): + return "success" - local_pdf_path = os.path.join( self.local_dir, - os.path.join( get_file_path_from_url(link), link.split('/')[-1] ) - ) + def fetch_pdf_content(self, link): + local_pdf_path = os.path.join(self.local_dir, os.path.join(get_file_path_from_url(link), link.split("/")[-1])) os.makedirs(local_pdf_path, exist_ok=True) - # This could be replaced with `download_using_requests` - response = requests.get(link, params={'headers': self.request_kwargs['headers']}) + # This could be replaced with `download_using_requests` + response = requests.get(link, params={"headers": self.request_kwargs["headers"]}) if response.status_code == 200: - with open(local_pdf_path, 'wb') as f: + with open(local_pdf_path, "wb") as f: f.write(response.content) - + # Extract text from the PDF file text = extract_pdf_text(local_pdf_path) # Let's store the content to disk for later access - with open(local_pdf_path.replace('pdf','txt'), 'w') as f: - f.write(text) + with open(local_pdf_path.replace("pdf", "txt"), "w") as f: + f.write(text) return text else: return None def fetch_arxiv_content(self, link): - # Identify the paper identification - arxiv_id = link.path.split('/')[-1] + arxiv_id = link.path.split("/")[-1] # Define the local directory - local_base_path = os.path.join( self.local_dir, get_file_path_from_url(link) ) + local_base_path = os.path.join(self.local_dir, get_file_path_from_url(link)) os.makedirs(local_base_path, exist_ok=True) - local_pdf_path = os.path.join( local_base_path, f"{arxiv_id}.pdf" ) + local_pdf_path = os.path.join(local_base_path, f"{arxiv_id}.pdf") # Download the paper if we don't already have it if not os.path.exists(local_pdf_path): - # Define the record belonging to the paper + # Define the record belonging to the paper paper = next(arxiv.Client().results(arxiv.Search(id_list=[arxiv_id]))) - + # Download the 
archive to the local downloads folder. paper.download_pdf(dirpath=local_base_path, filename=f"{arxiv_id}.pdf") @@ -279,78 +300,83 @@ def fetch_arxiv_content(self, link): text = extract_pdf_text(local_pdf_path) # Let's store the content to disk for later access - with open(local_pdf_path.replace('pdf','txt'), 'w') as f: - f.write(text) + with open(local_pdf_path.replace("pdf", "txt"), "w") as f: + f.write(text) return text - + def identify_content(self, soup): - # Get the page title for use with the queries - page_title = soup.find('head').find('title').string + page_title = soup.find("head").find("title").string # Find and extract relevant content from soup based on the title relevant_content = [] for element in soup.find_all(True): if element.name in ["h1", "h2", "h3", "p"]: - text = element.text.strip().replace("\t"," ").replace("\n"," ") + text = element.text.strip().replace("\t", " ").replace("\n", " ") if len(text) > 0: while text.find(" ") != -1: - text = text.replace(" "," ") + text = text.replace(" ", " ") prompt = self.content_classifier_prompt(page_title, text) - relevant = self.initiate_chat(self.content_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent).chat_history[-1]['content'] - if relevant == 'True': - relevant_content.append(text.strip()) - if not self.silent: + relevant = self.initiate_chat( + self.content_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent + ).chat_history[-1]["content"] + if relevant == "True": + relevant_content.append(text.strip()) + if not self.silent: print(element) return relevant_content def identify_metadata(self, soup, verbose=False): - page_title = soup.find('head').find('title').string + soup.find("head").find("title").string relevant_content = [] for data in soup.find_all("meta"): relevant = False prompt = self.metadata_classifier_prompt(data.attrs) - - if 'content' in data.attrs and 'http' in data.attrs['content']: + + if "content" in data.attrs and "http" in 
data.attrs["content"]: relevant = True - elif 'content' in data.attrs: - data.attrs['content'] = data.attrs['content'].strip() - relevant = self.initiate_chat(self.metadata_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent).chat_history[-1]['content'] - elif 'property' in data.attrs: - data.attrs['property'] = data.attrs['property'].strip() - relevant = self.initiate_chat(self.metadata_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent).chat_history[-1]['content'] - elif 'name' in data.attrs: - data.attrs['name'] = data.attrs['name'].strip() - relevant = self.initiate_chat(self.metadata_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent).chat_history[-1]['content'] - - if relevant == 'True': - relevant_content.append(data.attrs) - if verbose: print(data.attrs) + elif "content" in data.attrs: + data.attrs["content"] = data.attrs["content"].strip() + relevant = self.initiate_chat( + self.metadata_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent + ).chat_history[-1]["content"] + elif "property" in data.attrs: + data.attrs["property"] = data.attrs["property"].strip() + relevant = self.initiate_chat( + self.metadata_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent + ).chat_history[-1]["content"] + elif "name" in data.attrs: + data.attrs["name"] = data.attrs["name"].strip() + relevant = self.initiate_chat( + self.metadata_classifier, message=prompt, max_turns=1, max_tokens=8, silent=self.silent + ).chat_history[-1]["content"] + + if relevant == "True": + relevant_content.append(data.attrs) + if verbose: + print(data.attrs) return relevant_content - - def collect_images(self, soup, local_path, verbose=False): + def collect_images(self, soup, local_path, verbose=False): def get_basename(filename): return os.path.splitext(os.path.basename(filename))[0] - for img in soup.find_all('img'): - - relevant = False - img_alt = img.attrs['alt'] if 'alt' in 
img.attrs else "" - img_src = img.attrs['src'].lower() + for img in soup.find_all("img"): + img_alt = img.attrs["alt"] if "alt" in img.attrs else "" + img_src = img.attrs["src"].lower() - if 'png;base64' in img_src: + if "png;base64" in img_src: from io import BytesIO from PIL import Image import base64 # Step 1: Strip the prefix to get the Base64 data - encoded_data = img.attrs['src'].split(",")[1] + encoded_data = img.attrs["src"].split(",")[1] # Step 2: Decode the Base64 string image_data = base64.b64decode(encoded_data) @@ -364,23 +390,30 @@ def get_basename(filename): # Save the image to a file image.save(f"{img_src.replace('data:image/png;base64','')[:28]}.png") - elif 'logo' in img_src: + elif "logo" in img_src: continue - elif 'png' in img_src or 'jpg' in img_src or 'jpeg' in img_src or 'webp' in img_src or 'avif' in img_src or 'heif' in img_src or 'heic' in img_src or 'svg' in img_src: - - file_name = img_src.split("/")[-1] # there are other ways to do this + elif ( + "png" in img_src + or "jpg" in img_src + or "jpeg" in img_src + or "webp" in img_src + or "avif" in img_src + or "heif" in img_src + or "heic" in img_src + or "svg" in img_src + ): + file_name = img_src.split("/")[-1] # there are other ways to do this local_image_description_path = os.path.join(local_path, get_basename(file_name) + ".txt") local_image_path = os.path.join(local_path, file_name) if len(img_alt) > 0 and not os.path.exists(local_image_description_path): - with open(local_image_description_path, 'w') as f: - f.write(img_alt) + with open(local_image_description_path, "w") as f: + f.write(img_alt) if not os.path.exists(local_image_path): - - image_url = fix_missing_protocol(img.attrs['src'], soup.url) + image_url = fix_missing_protocol(img.attrs["src"], soup.url) try: # response = requests.get(image_url, params={'headers': self.request_kwargs}) download_using_requests(self.browser, image_url, local_image_path) except Exception: - print(image_url, img.attrs['src']) + print(image_url, 
img.attrs["src"]) traceback.print_exc() diff --git a/autogen/agentchat/contrib/web_surfer.py b/autogen/agentchat/contrib/web_surfer.py index a4ee63bfece..6979ad51dd3 100644 --- a/autogen/agentchat/contrib/web_surfer.py +++ b/autogen/agentchat/contrib/web_surfer.py @@ -58,11 +58,11 @@ def __init__( self._create_summarizer_client(summarizer_llm_config, llm_config) # Determine if the user has requested the Selenium browser or not - browser_type = browser_config.pop('type', 'simple') - web_driver = browser_config.pop('web_driver', 'edge') + browser_type = browser_config.pop("type", "simple") + browser_config.pop("web_driver", "edge") # Create the browser - if browser_type != 'text' and IS_SELENIUM_CAPABLE: + if browser_type != "text" and IS_SELENIUM_CAPABLE: self.browser = SeleniumBrowserWrapper(**(browser_config if browser_config else {})) else: self.browser = SimpleTextBrowser(**(browser_config if browser_config else {})) diff --git a/autogen/browser_utils.py b/autogen/browser_utils.py index 68266b9e089..1bd7baf84b8 100644 --- a/autogen/browser_utils.py +++ b/autogen/browser_utils.py @@ -41,9 +41,12 @@ except ImportError as e: print(f"The module/package '{e.name}' is not available.") print("Try running 'pip install selenium'. You may need to run 'sudo easy_install selenium' on Linux or MacOS") - print("Official selenium installation documentation: https://www.selenium.dev/documentation/webdriver/getting_started/install_library/") + print( + "Official selenium installation documentation: https://www.selenium.dev/documentation/webdriver/getting_started/install_library/" + ) raise e + class SimpleTextBrowser: """(In preview) An extremely simple text-based web browser comparable to Lynx. 
Suitable for Agentic use.""" @@ -298,12 +301,24 @@ def _fetch_page(self, url: str) -> None: self.page_title = "Error" self._set_page_content(str(e)) -get_scheme = lambda url: urlparse(url).scheme if isinstance(url,str) else url.scheme -get_domain = lambda url: urlparse(url).netloc if isinstance(url,str) else url.netloc -get_path = lambda url: urlparse(url).path if isinstance(url, str) else url.path -get_last_path = lambda url: os.path.basename(urlparse(url).path) if isinstance(url, str) else os.path.basename(url.path) -def get_file_path_from_url(url): # URL to Directory function +def get_scheme(url): + return urlparse(url).scheme if isinstance(url, str) else url.scheme + + +def get_domain(url): + return urlparse(url).netloc if isinstance(url, str) else url.netloc + + +def get_path(url): + return urlparse(url).path if isinstance(url, str) else url.path + + +def get_last_path(url): + return os.path.basename(urlparse(url).path) if isinstance(url, str) else os.path.basename(url.path) + + +def get_file_path_from_url(url): # URL to Directory function """ get_file_path_from_url function: This function takes a URL as input and returns the corresponding local file path as a string. 
@@ -315,23 +330,24 @@ def get_file_path_from_url(url): # URL to Directory function """ # Remove any trailing forward slash - url = url[:-1] if url[-1] == '/' else url + url = url[:-1] if url[-1] == "/" else url # Parse the URL - parsed_url = urlparse(url) if isinstance(url, str) else url - canonical_url = parsed_url.netloc.replace("www.","") + parsed_url = urlparse(url) if isinstance(url, str) else url + canonical_url = parsed_url.netloc.replace("www.", "") - if 'github.com' in url and len(parsed_url.path.split('/')) >= 2: + if "github.com" in url and len(parsed_url.path.split("/")) >= 2: relative_path = os.path.join(canonical_url, parsed_url.path) - elif len(parsed_url.path.split('/')) >= 1: + elif len(parsed_url.path.split("/")) >= 1: relative_path = os.path.join(canonical_url, get_last_path(parsed_url)) # Remove any preceding forward slash - relative_path = relative_path[1:] if relative_path[0] == '/' else relative_path - + relative_path = relative_path[1:] if relative_path[0] == "/" else relative_path + return relative_path -def fix_missing_protocol(img_url, source_url): # Correct a url if it's missing the protocol + +def fix_missing_protocol(img_url, source_url): # Correct a url if it's missing the protocol """ Fixes a URL by adding the missing protocol (http or https) based on the provided domain. 
@@ -344,17 +360,18 @@ def fix_missing_protocol(img_url, source_url): # Correct a url if it's missing t """ protocol = get_scheme(source_url) - domain = get_domain(source_url) + domain = get_domain(source_url) + + if img_url.startswith("//"): # If the URL starts with "//" + img_url = f"{protocol}:{img_url}" # Add "https:" before it - if img_url.startswith('//'): # If the URL starts with "//" - img_url = f"{protocol}:{img_url}" # Add "https:" before it - - elif not bool(get_domain(img_url)): # domain not in img_url: + elif not bool(get_domain(img_url)): # domain not in img_url: img_url = f"{protocol}://{domain}/{img_url}" - + return img_url -def extract_pdf_text(local_pdf_path): # Returns the extracted text content from a local PDF file + +def extract_pdf_text(local_pdf_path): # Returns the extracted text content from a local PDF file """ Extracts the text content from a local PDF file and returns it as a string. @@ -369,11 +386,14 @@ def extract_pdf_text(local_pdf_path): # Returns the extracted text content from text = pdfminer.high_level.extract_text(local_pdf_path) except Exception: traceback.print_exc() - text = '' + text = "" return text -def download_using_requests(driver, download_url, save_path): # `requests` downloads assisted by selenium webdriver cookies + +def download_using_requests( + driver, download_url, save_path +): # `requests` downloads assisted by selenium webdriver cookies """ This function takes a Selenium WebDriver instance, a URL to download a file, and a path where you want to save the downloaded file. 
@@ -397,17 +417,19 @@ def get_cookies(driver): def convert_cookies_to_requests_format(cookies): cookie_dict = {} for cookie in cookies: - cookie_dict[cookie['name']] = cookie['value'] + cookie_dict[cookie["name"]] = cookie["value"] return cookie_dict def download_file_with_cookies(url, session_cookies, save_path, user_agent=None): headers = { - 'User-Agent': user_agent if user_agent else 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' + "User-Agent": user_agent + if user_agent + else "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" } - + response = requests.get(url, cookies=session_cookies, headers=headers, stream=True) if response.status_code == 200: - with open(save_path, 'wb') as file: + with open(save_path, "wb") as file: for chunk in response.iter_content(1024): file.write(chunk) @@ -423,48 +445,50 @@ def download_file_with_cookies(url, session_cookies, save_path, user_agent=None) # Download file using requests with the same session cookies and headers download_file_with_cookies(download_url, session_cookies, save_path, user_agent=user_agent) -def SeleniumBrowser(**kwargs): # Function that loads the web driver + +def SeleniumBrowser(**kwargs): # Function that loads the web driver """ This function launches a headless Selenium browser based on the specified 'browser'. The available options are 'edge', 'firefox', and 'chrome'. - + Parameters: - browser (str): A string specifying which browser to launch. Defaults to 'firefox'. + browser (str): A string specifying which browser to launch. Defaults to 'firefox'. download_dir (str): A path to where downloaded files are stored. Defaults to None Returns: webdriver: An instance of the Selenium WebDriver based on the specified browser. User can open a new page by `webdriver.get('https://www.microsoft.com')`. 
- + Raises: ImportError: If selenium package is not installed, it raises an ImportError with a message suggesting to install it using pip. - """ - - # Load the argumnets from kwargs - browser = kwargs.get('browser', 'edge') - download_dir = kwargs.get('download_dir', None) + """ + + # Load the arguments from kwargs + browser = kwargs.get("browser", "edge") + download_dir = kwargs.get("download_dir", None) def get_headless_options(download_dir, options): options.headless = True - options.add_argument('--headless') + options.add_argument("--headless") options.add_argument("--window-size=1920,5200") - options.add_argument('--downloadsEnabled') + options.add_argument("--downloadsEnabled") if download_dir: - options.set_preference("download.default_directory",download_dir) + options.set_preference("download.default_directory", download_dir) return options - if browser.lower()=='edge': + if browser.lower() == "edge": driver = webdriver.Edge(options=get_headless_options(download_dir, EdgeOptions())) - elif browser.lower()=='firefox': + elif browser.lower() == "firefox": driver = webdriver.Firefox(options=get_headless_options(download_dir, FirefoxOptions())) - elif browser.lower()=='chrome': + elif browser.lower() == "chrome": driver = webdriver.Chrome(options=get_headless_options(download_dir, ChromeOptions())) - - driver.capabilities['se:downloadsEnablead'] = True - - return driver -class SeleniumBrowserWrapper: # A wrapper to bridge compatability between SimpleTextBrowser and SeleniumBrowser + driver.capabilities["se:downloadsEnablead"] = True + + return driver + + +class SeleniumBrowserWrapper: # A wrapper to bridge compatibility between SimpleTextBrowser and SeleniumBrowser """ - SeleniumBrowserWrapper class is a wrapper that manages the interaction with a Selenium web driver. + SeleniumBrowserWrapper class is a wrapper that manages the interaction with a Selenium web driver. It provides methods to control the browser, set up the viewport size, and download files. 
Parameters: @@ -503,7 +527,7 @@ def __init__( downloads_folder: Optional[Union[str, None]] = None, bing_api_key: Optional[Union[str, None]] = None, request_kwargs: Optional[Union[Dict[str, Any], None]] = None, - web_driver: Optional[str] = 'edge', + web_driver: Optional[str] = "edge", ): self.start_page: str = start_page if start_page else "about:blank" self.viewport_size = viewport_size # Applies only to the standard uri types @@ -521,7 +545,7 @@ def __init__( self.driver = SeleniumBrowser(browser=web_driver, download_dir=downloads_folder) if start_page: self.set_address(self.start_page) - + @property def address(self) -> str: """Return the address of the current page.""" @@ -553,12 +577,12 @@ def visit_page(self, path_or_uri: str) -> str: def page_down(self) -> None: """Simulate page down action.""" # Simulate pressing Page Down key - self.driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN) + self.driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN) def page_up(self) -> None: """Simulate page up action.""" # Simulate pressing Page Up key - self.driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_UP) + self.driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_UP) def _update_page_content(self) -> None: """Update internal content state, including page title.""" @@ -630,35 +654,38 @@ def _bing_search(self, query: str) -> None: content += "\n\n## News Results:\n" + "\n\n".join(news_snippets) self._set_page_content(content) - def download(self, uri_or_path: str) -> None: # TODO: update this based on the new method + def download(self, uri_or_path: str) -> None: # TODO: update this based on the new method """Download from a given URI""" self.driver.get(uri_or_path) def _fetch_page(self, url: str) -> None: from selenium.common.exceptions import TimeoutException + try: self.driver.get(url) self.page_title = self.driver.title - - # Selenium WebDriver directly accesses the rendered page, + + # Selenium WebDriver directly 
accesses the rendered page, # so we don't need to manually fetch or process the HTML. # However, you can still manipulate or extract content from the page using Selenium methods. - content_type = "text/html" # Default to text/html since Selenium renders web pages - + # Example of extracting and cleaning the page content if "wikipedia.org" in url: - - body_elm = self.driver.find_element(By.cssSelector, 'div#mw-content-text') + body_elm = self.driver.find_element(By.cssSelector, "div#mw-content-text") main_title = self.driver.title - webpage_text = "# " + main_title + "\n\n" + markdownify.MarkdownConverter().convert_soup(body_elm.get_attribute('innerHTML')) + webpage_text = ( + "# " + + main_title + + "\n\n" + + markdownify.MarkdownConverter().convert_soup(body_elm.get_attribute("innerHTML")) + ) else: - webpage_text = self.driver.find_element(By.TAG_NAME,'body').get_attribute('innerText') - + webpage_text = self.driver.find_element(By.TAG_NAME, "body").get_attribute("innerText") + # Convert newlines, remove excessive blank lines webpage_text = re.sub(r"\r\n", "\n", webpage_text) self._set_page_content(re.sub(r"\n{2,}", "\n\n", webpage_text).strip()) - - except TimeoutException as e: + + except TimeoutException: self.page_title = "Error" self._set_page_content("Timeout while retrieving " + url) - diff --git a/setup.py b/setup.py index a2577271cc1..2f8ecfdf5fb 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ "teachable": ["chromadb"], "lmm": ["replicate", "pillow"], "graph": ["networkx", "matplotlib"], - "websurfer": ["beautifulsoup4", "markdownify", "pdfminer.six", "pathvalidate"], + "websurfer": ["beautifulsoup4", "markdownify", "pdfminer.six", "pathvalidate", "selenium"], "redis": ["redis"], "ipython": ["jupyter-client>=8.6.0", "ipykernel>=6.29.0"], }, diff --git a/test/agentchat/contrib/test_content_agent.py b/test/agentchat/contrib/test_content_agent.py index b8754e4f9a5..6a5aeec3e5d 100644 --- a/test/agentchat/contrib/test_content_agent.py +++ 
b/test/agentchat/contrib/test_content_agent.py @@ -29,8 +29,7 @@ skip_oai, reason="do not run if oai is not installed", ) -def test_content_agent(browser:str) -> None: - +def test_content_agent(browser: str) -> None: llm_config = {"config_list": config_list, "timeout": 180, "cache_seed": 42} model = ["gpt-3.5-turbo"] @@ -38,14 +37,14 @@ def test_content_agent(browser:str) -> None: # model = ['dolphin-mistral:7b-v2.6-q8_0'] assert len(llm_config["config_list"]) > 0 # type: ignore[arg-type] - + # Define the temporary storage location - temporary_content_storage = os.path.join( tempfile.gettempdir(), "test_content_agent_storage") - print( f"Storing temporary test files in {temporary_content_storage}" ) + temporary_content_storage = os.path.join(tempfile.gettempdir(), "test_content_agent_storage") + print(f"Storing temporary test files in {temporary_content_storage}") # Define the system message for the ContentAgent content_agent_system_msg = "You are data collection agent specializing in content on the web." 
- + # Instantiate the ContentAgent content_agent = ContentAgent( name="ContentAgent", @@ -53,7 +52,6 @@ def test_content_agent(browser:str) -> None: llm_config=llm_config, max_consecutive_auto_reply=0, silent=False, - # Below are the arguments specific to the ContentAgent storage_path=temporary_content_storage, browser_kwargs={"browser": browser}, @@ -74,56 +72,68 @@ def test_content_agent(browser:str) -> None: # Define the links used during the testing process links = [ - "https://microsoft.github.io/autogen/docs/Examples", + "https://microsoft.github.io/autogen/docs/Examples", "https://microsoft.github.io/autogen/docs/Getting-Started", "https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/", ] - with Cache.disk(): - for link in links: - - # Collect the content from the requested link + # Collect the content from the requested link user_proxy.initiate_chat(content_agent, message=link) - assert content_agent.process_history[link]['url'] == link, "Investigate why the correct not link was reported" - - assert os.path.exists( content_agent.process_history[link]['local_path'] ), "The content storage path was not found" - - assert len(content_agent.process_history[link]['content']) > 0, "No content was identified or stored" - - assert os.path.exists( - os.path.join( content_agent.process_history[link]['local_path'], 'content.txt') - ), "The file path for content.txt was not found" - - assert os.path.exists( - os.path.join( content_agent.process_history[link]['local_path'], 'metadata.txt') - ), "The file path for metadata.txt was not found" - - assert os.path.exists( - os.path.join( content_agent.process_history[link]['local_path'], 'index.html') - ), "The file path for index.html was not found" - - assert os.path.exists( - os.path.join( content_agent.process_history[link]['local_path'], 'screenshot.png') - ), "The file path for screenshot.png was not found" - - assert os.path.exists( - os.path.join( 
content_agent.process_history[link]['local_path'], 'links.txt') - ), "The file path for links.txt was not found" - - assert os.path.getsize( os.path.join( content_agent.process_history[link]['local_path'], 'links.txt') ) > 0, "The file size of links.txt was zero" - assert os.path.getsize( os.path.join( content_agent.process_history[link]['local_path'], 'content.txt') ) > 0, "The file size of content.txt was zero" - assert os.path.getsize( os.path.join( content_agent.process_history[link]['local_path'], 'metadata.txt') ) > 0, "The file size of metadata.txt was zero" - assert os.path.getsize( os.path.join( content_agent.process_history[link]['local_path'], 'index.html') ) > 0, "The file size of index.html was zero" - assert os.path.getsize( os.path.join( content_agent.process_history[link]['local_path'], 'screenshot.png') ) > 0, "The file size of screenshot.png was zero" + assert ( + content_agent.process_history[link]["url"] == link + ), "Investigate why the correct not link was reported" + + assert os.path.exists( + content_agent.process_history[link]["local_path"] + ), "The content storage path was not found" + + assert len(content_agent.process_history[link]["content"]) > 0, "No content was identified or stored" + + assert os.path.exists( + os.path.join(content_agent.process_history[link]["local_path"], "content.txt") + ), "The file path for content.txt was not found" + + assert os.path.exists( + os.path.join(content_agent.process_history[link]["local_path"], "metadata.txt") + ), "The file path for metadata.txt was not found" + + assert os.path.exists( + os.path.join(content_agent.process_history[link]["local_path"], "index.html") + ), "The file path for index.html was not found" + + assert os.path.exists( + os.path.join(content_agent.process_history[link]["local_path"], "screenshot.png") + ), "The file path for screenshot.png was not found" + + assert os.path.exists( + os.path.join(content_agent.process_history[link]["local_path"], "links.txt") + ), "The file 
path for links.txt was not found" + + assert ( + os.path.getsize(os.path.join(content_agent.process_history[link]["local_path"], "links.txt")) > 0 + ), "The file size of links.txt was zero" + assert ( + os.path.getsize(os.path.join(content_agent.process_history[link]["local_path"], "content.txt")) > 0 + ), "The file size of content.txt was zero" + assert ( + os.path.getsize(os.path.join(content_agent.process_history[link]["local_path"], "metadata.txt")) > 0 + ), "The file size of metadata.txt was zero" + assert ( + os.path.getsize(os.path.join(content_agent.process_history[link]["local_path"], "index.html")) > 0 + ), "The file size of index.html was zero" + assert ( + os.path.getsize(os.path.join(content_agent.process_history[link]["local_path"], "screenshot.png")) > 0 + ), "The file size of screenshot.png was zero" print() - print( f"All done, feel free to browse the collected content at: {temporary_content_storage}" ) + print(f"All done, feel free to browse the collected content at: {temporary_content_storage}") + if __name__ == "__main__": """Runs this file's tests from the command line.""" - + if not skip_oai: test_content_agent(browser="firefox") diff --git a/test/agentchat/contrib/test_web_surfer.py b/test/agentchat/contrib/test_web_surfer.py index 69f9a3f0dd7..7cfd30669ee 100644 --- a/test/agentchat/contrib/test_web_surfer.py +++ b/test/agentchat/contrib/test_web_surfer.py @@ -46,7 +46,7 @@ skip_all, reason="do not run if dependency is not installed", ) -def test_web_surfer(browser_type='text', web_driver=None) -> None: +def test_web_surfer(browser_type="text", web_driver=None) -> None: with pytest.MonkeyPatch.context() as mp: # we mock the API key so we can register functions (llm_config must be present for this to work) mp.setenv("OPENAI_API_KEY", MOCK_OPEN_AI_API_KEY) @@ -54,7 +54,7 @@ def test_web_surfer(browser_type='text', web_driver=None) -> None: web_surfer = WebSurferAgent( "web_surfer", llm_config={"model": "gpt-4", "config_list": []}, - 
browser_config={"viewport_size": page_size, 'type': browser_type, 'web_driver': web_driver}, + browser_config={"viewport_size": page_size, "type": browser_type, "web_driver": web_driver}, ) # Sneak a peak at the function map, allowing us to call the functions for testing here @@ -70,24 +70,24 @@ def test_web_surfer(browser_type='text', web_driver=None) -> None: total_pages = int(m.group(1)) # type: ignore[union-attr] response = function_map["page_down"]() - if browser_type=='text': + if browser_type == "text": assert ( f"Viewport position: Showing page 2 of {total_pages}." in response ) # Assumes the content is longer than one screen response = function_map["page_up"]() - if browser_type=='text': + if browser_type == "text": assert f"Viewport position: Showing page 1 of {total_pages}." in response # Try to scroll too far back up response = function_map["page_up"]() - if browser_type=='text': + if browser_type == "text": assert f"Viewport position: Showing page 1 of {total_pages}." in response # Try to scroll too far down for i in range(0, total_pages + 1): response = function_map["page_down"]() - if browser_type=='text': + if browser_type == "text": assert f"Viewport position: Showing page {total_pages} of {total_pages}." 
in response if not skip_bing: @@ -110,7 +110,7 @@ def test_web_surfer(browser_type='text', web_driver=None) -> None: skip_oai, reason="do not run if oai is not installed", ) -def test_web_surfer_oai(browser_type='text', web_driver=None) -> None: +def test_web_surfer_oai(browser_type="text", web_driver=None) -> None: llm_config = {"config_list": config_list, "timeout": 180, "cache_seed": 42} # adding Azure name variations to the model list @@ -132,7 +132,7 @@ def test_web_surfer_oai(browser_type='text', web_driver=None) -> None: "web_surfer", llm_config=llm_config, summarizer_llm_config=summarizer_llm_config, - browser_config={"viewport_size": page_size, 'type': browser_type, 'web_driver': web_driver}, + browser_config={"viewport_size": page_size, "type": browser_type, "web_driver": web_driver}, ) user_proxy = UserProxyAgent( @@ -160,7 +160,7 @@ def test_web_surfer_oai(browser_type='text', web_driver=None) -> None: skip_bing, reason="do not run if bing api key is not available", ) -def test_web_surfer_bing(browser_type='text', web_driver=None) -> None: +def test_web_surfer_bing(browser_type="text", web_driver=None) -> None: page_size = 4096 web_surfer = WebSurferAgent( "web_surfer", @@ -172,7 +172,12 @@ def test_web_surfer_bing(browser_type='text', web_driver=None) -> None: } ] }, - browser_config={"viewport_size": page_size, "bing_api_key": BING_API_KEY, 'type': browser_type, 'web_driver': web_driver}, + browser_config={ + "viewport_size": page_size, + "bing_api_key": BING_API_KEY, + "type": browser_type, + "web_driver": web_driver, + }, ) # Sneak a peak at the function map, allowing us to call the functions for testing here @@ -192,16 +197,15 @@ def test_web_surfer_bing(browser_type='text', web_driver=None) -> None: if __name__ == "__main__": """Runs this file's tests from the command line.""" - + test_web_surfer() test_web_surfer_oai() test_web_surfer_bing() - - if IS_SELENIUM_CAPABLE: # Test the selenium browser if installed - + + if IS_SELENIUM_CAPABLE: # Test 
the selenium browser if installed # Todo: automatically determine which is available in order to avoid unnecessary errors - selected_driver = 'edge' # can be 'edge', 'firefox', or 'chrome' + selected_driver = "edge" # can be 'edge', 'firefox', or 'chrome' - test_web_surfer(browser_type='selenium', web_driver=selected_driver) - test_web_surfer_oai(browser_type='selenium', web_driver=selected_driver) - test_web_surfer_bing(browser_type='selenium', web_driver=selected_driver) + test_web_surfer(browser_type="selenium", web_driver=selected_driver) + test_web_surfer_oai(browser_type="selenium", web_driver=selected_driver) + test_web_surfer_bing(browser_type="selenium", web_driver=selected_driver) From 0a4076379d7df59c67f432df27c564430e3b290d Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Thu, 22 Feb 2024 06:10:31 +0000 Subject: [PATCH 19/36] Added the websurfer with desktop browser demo notebook --- .github/workflows/contrib-openai.yml | 9 + .github/workflows/contrib-tests.yml | 9 + .github/workflows/python-package.yml | 12 - autogen/agentchat/contrib/content_agent.py | 163 +- autogen/agentchat/contrib/web_surfer.py | 67 +- autogen/browser_utils.py | 502 +++++- notebook/agentchat_content_agent.ipynb | 1762 ++++++++++++++++++++ notebook/agentchat_custom_model.ipynb | 1 + notebook/agentchat_lmm_gpt-4v.ipynb | 2 - notebook/agentchat_surfer_edge.ipynb | 796 +++++++++ setup.py | 2 +- 11 files changed, 3192 insertions(+), 133 deletions(-) create mode 100644 notebook/agentchat_content_agent.ipynb create mode 100644 notebook/agentchat_surfer_edge.ipynb diff --git a/.github/workflows/contrib-openai.yml b/.github/workflows/contrib-openai.yml index a8cedb29a3c..04a22be58ff 100644 --- a/.github/workflows/contrib-openai.yml +++ b/.github/workflows/contrib-openai.yml @@ -245,6 +245,15 @@ jobs: pip install -e .[websurfer] python -c "import autogen" pip install coverage pytest + - name: Setup edge + uses: 
browser-actions/setup-edge@latest + if: ${{ matrix.browsers == 'edge' }} + - name: Setup firefox + uses: browser-actions/setup-firefox@latest + if: ${{ matrix.browsers == 'firefox' }} + - name: Setup chrome + uses: browser-actions/setup-chrome@latest + if: ${{ matrix.browsers == 'chrome' }} - name: Coverage env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml index b82fe1baf03..81eaad453d3 100644 --- a/.github/workflows/contrib-tests.yml +++ b/.github/workflows/contrib-tests.yml @@ -200,6 +200,15 @@ jobs: - name: Install packages and dependencies for WebSurfer run: | pip install -e .[websurfer] + - name: Setup edge + uses: browser-actions/setup-edge@latest + if: ${{ matrix.browsers == 'edge' }} + - name: Setup firefox + uses: browser-actions/setup-firefox@latest + if: ${{ matrix.browsers == 'firefox' }} + - name: Setup chrome + uses: browser-actions/setup-chrome@latest + if: ${{ matrix.browsers == 'chrome' }} - name: Set AUTOGEN_USE_DOCKER based on OS shell: bash run: | diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 0db7c66b7fe..4f57c10ef70 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -54,15 +54,3 @@ jobs: TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} shell: pwsh run: twine upload dist/* - - - name: Setup edge - uses: browser-actions/setup-edge@latest - if: ${{ matrix.browsers == 'edge' }} - - - name: Setup firefox - uses: browser-actions/setup-firefox@latest - if: ${{ matrix.browsers == 'firefox' }} - - - name: Setup chrome - uses: browser-actions/setup-chrome@latest - if: ${{ matrix.browsers == 'chrome' }} diff --git a/autogen/agentchat/contrib/content_agent.py b/autogen/agentchat/contrib/content_agent.py index c20cb1f0cb0..49b6fee7211 100644 --- a/autogen/agentchat/contrib/content_agent.py +++ b/autogen/agentchat/contrib/content_agent.py @@ -1,3 +1,4 @@ +from ..agent import Agent 
from ..conversable_agent import ConversableAgent from ..assistant_agent import AssistantAgent from ...browser_utils import ( @@ -7,11 +8,12 @@ get_scheme, get_path, get_last_path, + github_path_rule, get_file_path_from_url, fix_missing_protocol, extract_pdf_text, ) - +from typing import List, Union, Any, Tuple import os import re import json @@ -20,6 +22,9 @@ from collections import deque from urllib.parse import urlparse, urlunparse from bs4 import BeautifulSoup +from io import BytesIO +from PIL import Image +import base64 # Import the arxiv library if it is available IS_ARXIV_CAPABLE = False @@ -33,38 +38,59 @@ class ContentAgent(ConversableAgent): - """ - ContentAgent: Custom LLM agent for collecting online content. - - The ContentAgent class is a custom Autogen agent that can be used to collect and store online content from different web pages. It extends the ConversableAgent class and provides additional functionality for managing a list of additional links, storing collected content in local directories, and customizing request headers. - ContentAgent uses deque to manage a list of additional links for further exploration, with a maximum depth limit set by max_depth parameter. The collected content is stored in the specified storage path (storage_path) using local directories. - ContentAgent can be customized with request_kwargs and llm_config parameters during instantiation. The default User-Agent header is used for requests, but it can be overridden by providing a new dictionary of headers under request_kwargs. - - Parameters: - request_kwargs (dict): A dictionary containing key-value pairs used to configure request parameters such as headers and other options. - storage_path (str): The path where the collected content will be stored. Defaults to './content'. - max_depth (int): Maximum depth limit for exploring additional links from a web page. Defaults to 1. - page_loading_time (float): Time in seconds to wait before loading each web page. Defaults to 5. 
- *args, **kwargs: Additional arguments and keyword arguments to be passed to the parent class ConversableAgent. - - Software Dependencies: - - beautifulsoup4 - - pdfminer - - selenium - - arxiv - - pillow - - """ - - def __init__(self, silent=True, storage_path="./content", max_depth=1, page_loading_time=5, *args, **kwargs): - self.browser_kwargs = kwargs.pop("browser_kwargs", {"browser": "firefox"}) + def __init__( + self, + silent: bool = True, + storage_path: str = "./content", + max_depth: int = 1, + page_load_time: float = 6, + *args, + **kwargs, + ): + """ + ContentAgent: Custom LLM agent for collecting online content. + + The ContentAgent class is a custom Autogen agent that can be used to collect and store online content from different + web pages. It extends the ConversableAgent class and provides additional functionality for managing a list of + additional links, storing collected content in local directories, and customizing request headers. ContentAgent + uses deque to manage a list of additional links for further exploration, with a maximum depth limit set by max_depth + parameter. The collected content is stored in the specified storage path (storage_path) using local directories. + ContentAgent can be customized with request_kwargs and llm_config parameters during instantiation. The default + User-Agent header is used for requests, but it can be overridden by providing a new dictionary of headers under + request_kwargs. + + Parameters: + silent (bool): If True, the agent operates in silent mode with minimal output. Defaults to True. + storage_path (str): The path where the collected content will be stored. Defaults to './content'. + max_depth (int): Maximum depth limit for exploring additional links from a web page. This defines how deep + the agent will go into linked pages from the starting point. Defaults to 1. + page_load_time (float): Time in seconds to wait for loading each web page. 
This ensures that dynamic content + has time to load before the page is processed. Defaults to 6 seconds. + *args, **kwargs: Additional arguments and keyword arguments to be passed to the parent class `ConversableAgent`. + These can be used to configure underlying behaviors of the agent that are not explicitly + covered by the constructor's parameters. + + Note: + The `silent` parameter can be useful for controlling the verbosity of the agent's operations, particularly + in environments where logging or output needs to be minimized for performance or clarity. + + Software Dependencies: + - requests + - beautifulsoup4 + - pdfminer + - selenium + - arxiv + - pillow + """ + + self.browser_kwargs = kwargs.pop("browser_config", {"browser": "firefox"}) super().__init__(*args, **kwargs) self.additional_links = deque() self.link_depth = 0 self.max_depth = max_depth self.local_dir = storage_path - self.page_load_time = page_loading_time + self.page_load_time = page_load_time self.silent = silent self.request_kwargs = { "headers": { @@ -73,21 +99,66 @@ def __init__(self, silent=True, storage_path="./content", max_depth=1, page_load } self.small_llm_config = kwargs["llm_config"] self.process_history = {} + self.browser = None + self.domain_path_rules = { + "github.com": github_path_rule, + # Add more domain rules as needed + } # Define the classifiers self.define_classifiers() - def classifier_to_collector_reply(self, recipient, messages, sender, config): - # Inner dialogue reply for boolean classification results + # def classifier_to_collector_reply(self, recipient, messages, sender, config): + # # Inner dialogue reply for boolean classification results + # last_message = messages[-1] if isinstance(messages, list) else messages + # _, rep = recipient.generate_oai_reply([last_message], sender) + # if "false" in rep.lower(): + # rep = "False" + # elif "true" in rep.lower(): + # rep = "True" + # else: + # rep = "False" + # return True, rep + def classifier_to_collector_reply( 
+ self, + recipient: Agent, # Assuming no specific type is enforced; otherwise, replace Any with the specific class type + messages: Union[List[str], str], + sender: Agent, # Replace Any if the sender has a specific type + config: dict, + ) -> Tuple[bool, str]: + """ + Processes the last message in a conversation to generate a boolean classification response. + + This method takes the most recent message from a conversation, uses the recipient's method to generate a reply, + and classifies the reply as either "True" or "False" based on its content. It is designed for scenarios where + the reply is expected to represent a boolean value, simplifying downstream processing. + + Parameters: + recipient (Agent): The agent or object responsible for generating replies. Must have a method `generate_oai_reply` + that accepts a list of messages, a sender, and optionally a configuration, and returns a tuple + where the second element is the reply string. + messages (Union[List[str], str]): A list of messages or a single message string from the conversation. The last message + in this list is used to generate the reply. + sender (Agent): The entity that sent the message. This could be an identifier, an object, or any representation + that the recipient's reply generation method expects. + config (dict): Configuration parameters for the reply generation process, if required by the recipient's method. + + Returns: + Tuple[bool, str]: A tuple containing a boolean status (always True in this implementation) and the classification result + as "True" or "False" based on the content of the generated reply. + + Note: + The classification is case-insensitive and defaults to "False" if the reply does not explicitly contain + "true" or "false". This behavior ensures a conservative approach to classification. 
+ """ last_message = messages[-1] if isinstance(messages, list) else messages _, rep = recipient.generate_oai_reply([last_message], sender) - if "false" in rep.lower(): - rep = "False" - elif "true" in rep.lower(): - rep = "True" - else: - rep = "False" - return True, rep + + # Streamlined classification logic + rep_lower = rep.lower() + classified_reply = "True" if "true" in rep_lower else "False" + + return True, classified_reply def define_classifiers(self): # Define the system messages for the classifiers @@ -170,7 +241,7 @@ def fetch_html_content(self, link): sd["url"] = link # Establish the downloads folder - sd["local_path"] = os.path.join(self.local_dir, get_file_path_from_url(link)) + sd["local_path"] = os.path.join(self.local_dir, get_file_path_from_url(link, self.domain_path_rules)) os.makedirs(sd["local_path"], exist_ok=True) # We can instantiate the browser now that we know where the files and downloads will go @@ -188,7 +259,13 @@ def fetch_html_content(self, link): sd["browser_screenshot_path"] = os.path.join(sd["local_path"], "screenshot.png") # Save a screenshot of the browser window - self.browser.save_full_page_screenshot(sd["browser_screenshot_path"]) + if self.browser_kwargs["browser"] == "firefox": + # save_full_page_screenshot + self.browser.save_full_page_screenshot(sd["browser_screenshot_path"]) + else: + page_height = self.browser.execute_script("return window.pageYOffset + window.innerHeight") + self.browser.set_window_size(1920, page_height) + self.browser.save_screenshot(sd["browser_screenshot_path"]) sd["title"] = self.browser.title sd["html"] = self.browser.page_source @@ -255,7 +332,9 @@ def fetch_html_content(self, link): return "success" def fetch_pdf_content(self, link): - local_pdf_path = os.path.join(self.local_dir, os.path.join(get_file_path_from_url(link), link.split("/")[-1])) + local_pdf_path = os.path.join( + self.local_dir, os.path.join(get_file_path_from_url(link, self.domain_path_rules), link.split("/")[-1]) + ) 
os.makedirs(local_pdf_path, exist_ok=True) # This could be replaced with `download_using_requests` @@ -281,7 +360,7 @@ def fetch_arxiv_content(self, link): arxiv_id = link.path.split("/")[-1] # Define the local directory - local_base_path = os.path.join(self.local_dir, get_file_path_from_url(link)) + local_base_path = os.path.join(self.local_dir, get_file_path_from_url(link, self.domain_path_rules)) os.makedirs(local_base_path, exist_ok=True) local_pdf_path = os.path.join(local_base_path, f"{arxiv_id}.pdf") @@ -371,10 +450,6 @@ def get_basename(filename): img_src = img.attrs["src"].lower() if "png;base64" in img_src: - from io import BytesIO - from PIL import Image - import base64 - # Step 1: Strip the prefix to get the Base64 data encoded_data = img.attrs["src"].split(",")[1] diff --git a/autogen/agentchat/contrib/web_surfer.py b/autogen/agentchat/contrib/web_surfer.py index 6979ad51dd3..024149d7b44 100644 --- a/autogen/agentchat/contrib/web_surfer.py +++ b/autogen/agentchat/contrib/web_surfer.py @@ -5,12 +5,18 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union, Callable, Literal, Tuple from typing_extensions import Annotated +from datetime import datetime from ..agent import Agent from .. 
import ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat from ...oai.client import OpenAIWrapper -from ...browser_utils import SimpleTextBrowser, SeleniumBrowserWrapper, IS_SELENIUM_CAPABLE +from ...browser_utils import ( + SimpleTextBrowser, + SeleniumBrowserWrapper, + IS_SELENIUM_CAPABLE, + display_binary_image, + generate_png_filename, +) from ...code_utils import content_str -from datetime import datetime from ...token_count_utils import count_token, get_max_token_limit from ...oai.openai_utils import filter_config @@ -58,14 +64,20 @@ def __init__( self._create_summarizer_client(summarizer_llm_config, llm_config) # Determine if the user has requested the Selenium browser or not - browser_type = browser_config.pop("type", "simple") - browser_config.pop("web_driver", "edge") + browser_type = browser_config.pop("type", "text") # Create the browser if browser_type != "text" and IS_SELENIUM_CAPABLE: self.browser = SeleniumBrowserWrapper(**(browser_config if browser_config else {})) + self.is_graphical_browser = True else: + # Cleanup any arguments specific to the desktop browser + if "web_driver" in browser_config: + browser_config.pop("web_driver") + if "render_text" in browser_config: + browser_config.pop("render_text") self.browser = SimpleTextBrowser(**(browser_config if browser_config else {})) + self.is_graphical_browser = False inner_llm_config = copy.deepcopy(llm_config) @@ -94,6 +106,18 @@ def __init__( self.register_reply([Agent, None], ConversableAgent.generate_function_call_reply) self.register_reply([Agent, None], ConversableAgent.check_termination_and_human_reply) + @property + def text_content(self): + return self.browser.page_content + + @property + def render_text(self): + self.browser._set_page_content(self.browser.page_content) + return self.browser.page_content + + def close_the_browser(self): + self.browser.driver.quit() + def _create_summarizer_client(self, summarizer_llm_config: Dict[str, Any], llm_config: Dict[str, 
Any]) -> None: # If the summarizer_llm_config is None, we copy it from the llm_config if summarizer_llm_config is None: @@ -192,6 +216,41 @@ def _page_down() -> str: header, content = _browser_state() return header.strip() + "\n=======================\n" + content + if self.is_graphical_browser: + + @self._user_proxy.register_for_execution() + @self._assistant.register_for_llm( + name="get_screenshot", + description="Captures and displays a screenshot of the current web page as seen by the browser.", + ) + def _get_screenshot( + url: Annotated[Optional[str], "[Optional] The url of the page. (Defaults to the current page)"] = None, + ) -> str: + if url is not None and url != self.browser.address: + self.browser.visit_page(url) + else: + url = self.browser.address + + self.screenshot = self.browser.driver.get_screenshot_as_png() + display_binary_image(self.screenshot) + + @self._user_proxy.register_for_execution() + @self._assistant.register_for_llm( + name="save_screenshot", + description="Saves a screenshot of the current web page as seen by the browser.", + ) + def _save_screenshot( + url: Annotated[Optional[str], "[Optional] The url of the page. 
(Defaults to the current page)"] = None, + ) -> str: + if url is not None and url != self.browser.address: + self.browser.visit_page(url) + else: + url = self.browser.address + + png_filename = generate_png_filename(url) + self.screenshot = self.browser.driver.save_screenshot(png_filename) + # display_binary_image(self.screenshot) + if self.summarization_client is not None: @self._user_proxy.register_for_execution() diff --git a/autogen/browser_utils.py b/autogen/browser_utils.py index 1bd7baf84b8..f06e09564d1 100644 --- a/autogen/browser_utils.py +++ b/autogen/browser_utils.py @@ -7,9 +7,16 @@ import io import uuid import mimetypes -from urllib.parse import urljoin, urlparse +import hashlib # Used for generating a content ID from the URL (currently unused) +import random +import string +import tempfile +from math import ceil # to determine the total number of pages +from typing import Any, Dict, List, Optional, Union, Tuple, Callable +from urllib.parse import ParseResult, urljoin, urlparse from bs4 import BeautifulSoup -from typing import Any, Dict, List, Optional, Union, Tuple +from PIL import Image +from IPython.core.display_functions import display # Optional PDF support IS_PDF_CAPABLE = False @@ -31,10 +38,16 @@ IS_SELENIUM_CAPABLE = False try: from selenium import webdriver + from selenium.common.exceptions import TimeoutException + + # from selenium.webdriver.support.ui import WebDriverWait # We might implement this next from selenium.webdriver.common.by import By + from selenium.webdriver.common.desired_capabilities import DesiredCapabilities from selenium.webdriver.common.keys import Keys + from selenium.webdriver.edge.service import Service as EdgeService from selenium.webdriver.edge.options import Options as EdgeOptions from selenium.webdriver.firefox.options import Options as FirefoxOptions + from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.chrome.options import Options as ChromeOptions IS_SELENIUM_CAPABLE = 
True @@ -302,63 +315,149 @@ def _fetch_page(self, url: str) -> None: self._set_page_content(str(e)) -def get_scheme(url): +def get_scheme(url: Union[str, ParseResult]) -> str: + """ + Extracts the scheme component from a given URL. + + This function supports both string URLs and ParseResult objects. For string URLs, it parses + the URL and extracts the scheme part. For ParseResult objects, it directly accesses the scheme attribute. + + Args: + url (Union[str, ParseResult]): The URL from which to extract the scheme. Can be a string or a ParseResult object. + + Returns: + str: The scheme of the URL (e.g., 'http', 'https'). + """ return urlparse(url).scheme if isinstance(url, str) else url.scheme -def get_domain(url): +def get_domain(url: Union[str, ParseResult]) -> str: + """ + Retrieves the domain (network location) component from a URL. + + Similar to `get_scheme`, this function can handle both string representations of URLs and + ParseResult objects. It extracts the network location part from the URL. + + Args: + url (Union[str, ParseResult]): The URL from which to extract the domain. Can be a string or a ParseResult object. + + Returns: + str: The domain of the URL (e.g., 'www.example.com'). + """ return urlparse(url).netloc if isinstance(url, str) else url.netloc -def get_path(url): - return urlparse(url).path if isinstance(url, str) else url.path +def get_path(url: Union[str, ParseResult]) -> str: + """ + Extracts the path component from a URL. + + This function processes both strings and ParseResult objects to return the path segment of the URL. + The path is the part of the URL that follows the domain but precedes any query parameters or fragment identifiers. + Args: + url (Union[str, ParseResult]): The URL from which to extract the path. Can be a string or a ParseResult object. 
-def get_last_path(url): - return os.path.basename(urlparse(url).path) if isinstance(url, str) else os.path.basename(url.path) + Returns: + str: The path of the URL (e.g., '/path/to/resource'). + """ + return urlparse(url).path if isinstance(url, str) else url.path -def get_file_path_from_url(url): # URL to Directory function +def get_last_path(url: Union[str, ParseResult]) -> str: """ - get_file_path_from_url function: This function takes a URL as input and returns the corresponding local file path as a string. + Retrieves the last component of the path from a URL. - Parameters: - url (str | ParseResult): The URL of the file for which the local path is to be obtained. + This function is useful for extracting the final part of the path, often representing a specific resource or page. + It handles both string URLs and ParseResult objects. For string URLs, it parses the URL to extract the path and then + retrieves the last component. + + Args: + url (Union[str, ParseResult]): The URL from which to extract the last path component. Can be a string or a ParseResult object. Returns: - str: The local file path on the system as a string. + str: The last component of the path (e.g., 'resource.html'). 
+ """ + return ( + os.path.basename(urlparse(url).path.rstrip("/")) + if isinstance(url, str) + else os.path.basename(url.path.rstrip("/")) + ) + + +def github_path_rule(parsed_url: ParseResult) -> str: + """Specific rule for GitHub URLs.""" + return os.path.join(parsed_url.netloc.replace("www.", ""), parsed_url.path.lstrip("/")) + + +def default_path_rule(parsed_url: ParseResult) -> str: + """Fallback rule for general URLs.""" + return os.path.join(parsed_url.netloc.replace("www.", ""), get_last_path(parsed_url.path)) + + +def get_file_path_from_url( + url: Union[str, ParseResult], + domain_rules: Optional[Dict[str, Callable[[ParseResult], str]]] = None, + default_path_rule: Optional[Callable[[ParseResult], str]] = None, +) -> str: """ + Converts a URL into a corresponding local file path, allowing for domain-specific customization. + + This function takes a URL, either as a string or a ParseResult object, and generates a path that represents + the URL's location in a hypothetical local file system structure. It supports domain-specific rules for + customizable path generation, with a default rule applied to URLs from domains not explicitly configured. - # Remove any trailing forward slash - url = url[:-1] if url[-1] == "/" else url + Parameters: + url (Union[str, ParseResult]): The URL to be converted into a local file path. + domain_rules (Optional[Dict[str, Callable[[ParseResult], str]]]): A dictionary mapping domains to functions + that define how to construct file paths for URLs from those domains. + default_path_rule (Optional[Callable[[ParseResult], str]]): A function to construct file paths for URLs + from domains not covered by `domain_rules`. - # Parse the URL + Returns: + str: The generated local file path, which omits the protocol and optionally adjusts for specific domain structures. 
+ """ + # Parse the URL if not already parsed_url = urlparse(url) if isinstance(url, str) else url canonical_url = parsed_url.netloc.replace("www.", "") - if "github.com" in url and len(parsed_url.path.split("/")) >= 2: - relative_path = os.path.join(canonical_url, parsed_url.path) - elif len(parsed_url.path.split("/")) >= 1: - relative_path = os.path.join(canonical_url, get_last_path(parsed_url)) + # Determine the appropriate path rule to use + if domain_rules and canonical_url in domain_rules: + path_rule = domain_rules[canonical_url] + else: + path_rule = ( + default_path_rule + if default_path_rule + else lambda u: os.path.join(u.netloc.replace("www.", ""), get_last_path(u.path.rstrip("/"))) + ) + + # Generate the relative path using the selected rule + relative_path = path_rule(parsed_url) - # Remove any preceding forward slash - relative_path = relative_path[1:] if relative_path[0] == "/" else relative_path + # Remove any preceding forward slash for consistency + relative_path = relative_path.lstrip("/") return relative_path -def fix_missing_protocol(img_url, source_url): # Correct a url if it's missing the protocol +def fix_missing_protocol(img_url: str, source_url: str) -> str: """ - Fixes a URL by adding the missing protocol (http or https) based on the provided domain. + Ensures that an image URL has a proper protocol specified, using the protocol of a source URL as a reference. + + This function checks if the given image URL lacks a protocol (http or https) and, if so, fixes the URL by + prepending it with the protocol from the source URL. This is useful for fixing relative URLs or those missing + a scheme. Parameters: - - img_url (str): The input image URL to be fixed. - - domain (str): The domain of the image URL which is used to determine the protocol. + img_url (str): The image URL to be corrected. It can be a relative URL or one missing a protocol. + source_url (str): The source URL from which to extract the protocol and, if necessary, the domain. 
Returns: - - str: A corrected URL string with the missing protocol added. - """ + str: The corrected image URL with a protocol. + Note: + The function handles URLs starting with "//" by directly adding the protocol. If the domain is missing + from `img_url`, the function constructs the full URL using the protocol and domain from `source_url`. + """ protocol = get_scheme(source_url) domain = get_domain(source_url) @@ -371,7 +470,7 @@ def fix_missing_protocol(img_url, source_url): # Correct a url if it's missing return img_url -def extract_pdf_text(local_pdf_path): # Returns the extracted text content from a local PDF file +def extract_pdf_text(local_pdf_path: str): # Returns the extracted text content from a local PDF file """ Extracts the text content from a local PDF file and returns it as a string. @@ -392,15 +491,19 @@ def extract_pdf_text(local_pdf_path): # Returns the extracted text content from def download_using_requests( - driver, download_url, save_path -): # `requests` downloads assisted by selenium webdriver cookies + driver: Union[ + webdriver.edge.webdriver.WebDriver, webdriver.firefox.webdriver.WebDriver, webdriver.chrome.webdriver.WebDriver + ], + download_url: str, + save_path: str, +) -> None: """ This function takes a Selenium WebDriver instance, a URL to download a file, and a path where you want to save the downloaded file. It first retrieves cookies from the given driver, converts them into a format suitable for use with the `requests` library, and then uses these cookies to successfully download the specified file using the `requests.get()` function. The `User-Agent` header is also set to match that used by the WebDriver instance. Args: - driver (webdriver.chrome.webdriver.WebDriver): A Selenium WebDriver instance, typically obtained from selenium.webdriver.Chrome() or another appropriate method for your browser of choice. 
+ driver (webdriver.edge.webdriver.WebDriver): A Selenium WebDriver instance, typically obtained from selenium.webdriver.Edge() or another appropriate method for your browser of choice. download_url (str): The URL to the file you want to download. save_path (str): The path where you would like the downloaded file to be saved. @@ -424,7 +527,7 @@ def download_file_with_cookies(url, session_cookies, save_path, user_agent=None) headers = { "User-Agent": user_agent if user_agent - else "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" + else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15" } response = requests.get(url, cookies=session_cookies, headers=headers, stream=True) @@ -446,6 +549,45 @@ def download_file_with_cookies(url, session_cookies, save_path, user_agent=None) download_file_with_cookies(download_url, session_cookies, save_path, user_agent=user_agent) +def display_binary_image(binary_data): + """ + display_binary_image(binary_data): + This function displays the binary image data in Jupyter notebook cells or shows it in non-notebook environments. + + Args: + - binary_data (bytes): A bytes object containing the PNG image data. + + Returns: + - Nothing, but in non-notebook environment, it displays the image. + """ + img = Image.open(io.BytesIO(binary_data)) + try: + __IPYTHON__ + display(img) + except NameError: + img.show() + + +def generate_png_filename(url: str): # Function to help provide a PNG filename (with relative path) + """ + Generates a PNG filename based on the provided URL, along with a small random hash. + + Args: + url (str): The URL from which to create a filename. + + Returns: + str: A unique PNG filename based on the URL and a random hash. 
+ """ + + # Split the URL into its components + parsed_url = urlparse(url) + + # Generate a 4-character random hash from lowercase letters and digits + random_hash = "".join(random.choices(string.ascii_lowercase + string.digits, k=6)) + + return f"{'.'.join(parsed_url.netloc.split('.')[-2:])}-{random_hash}.png" + + def SeleniumBrowser(**kwargs): # Function that loads the web driver """ This function launches a headless Selenium browser based on the specified 'browser'. The available options are 'edge', 'firefox', and 'chrome'. @@ -463,25 +605,82 @@ def SeleniumBrowser(**kwargs): # Function that loads the web driver # Load the arguments from kwargs browser = kwargs.get("browser", "edge") - download_dir = kwargs.get("download_dir", None) + download_dir = kwargs.get("download_dir", tempfile.gettempdir()) + if not download_dir: + download_dir = tempfile.gettempdir() + + browser_res = kwargs.get("resolution", (1920, 5200)) def get_headless_options(download_dir, options): options.headless = True options.add_argument("--headless") - options.add_argument("--window-size=1920,5200") + options.add_argument(f"--window-size={browser_res[0]},{browser_res[1]}") options.add_argument("--downloadsEnabled") if download_dir: options.set_preference("download.default_directory", download_dir) return options if browser.lower() == "edge": - driver = webdriver.Edge(options=get_headless_options(download_dir, EdgeOptions())) + options = EdgeOptions() + options.use_chromium = True # Ensure we're using the Chromium-based version of Edge + options.headless = True + options.add_argument("--headless") + options.add_argument(f"--window-size={browser_res[0]},{browser_res[1]}") + options.add_argument("--downloadsEnabled") + + prefs = { + "download.default_directory": download_dir, + "download.prompt_for_download": False, # Disable download prompt + "download.directory_upgrade": True, # Enable directory upgrade + "safebrowsing.enabled": True, # Enable safe browsing + } + 
options.add_experimental_option("prefs", prefs) + # Instantiate the EdgeService object + edge_service = EdgeService() + # Instantiate the Edge WebDriver with the configured options + driver = webdriver.Edge(options=options, service=edge_service) + elif browser.lower() == "firefox": - driver = webdriver.Firefox(options=get_headless_options(download_dir, FirefoxOptions())) + # Instantiate the Firefox Profile to specify options + profile = FirefoxProfile() + profile.set_preference("browser.download.folderList", 2) # Custom location + profile.set_preference("browser.download.dir", download_dir) + profile.set_preference("browser.download.useDownloadDir", True) + profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") # MIME type + # profile.set_preference("pdfjs.disabled", True) # Disable PDF viewer + profile.set_preference("javascript.enabled", False) + # profile.set_preference("browser.startup.homepage", "https://microsoft.com") + profile.update_preferences() + options = FirefoxOptions() + options.profile = profile + options.set_capability("se:downloadsEnabled", True) + + # Instantiate the Firefox WebDriver with the configured options + driver = webdriver.Firefox( + options=get_headless_options(download_dir, options) + ) # , service_log_path=f'{tempfile.tempdir}/geckodriver.log') + driver.capabilities["moz:processID"] + elif browser.lower() == "chrome": - driver = webdriver.Chrome(options=get_headless_options(download_dir, ChromeOptions())) + # Instantiate the Chrome Options + options = ChromeOptions() + prefs = { + "download.default_directory": download_dir, + "download.prompt_for_download": False, # Disable download prompt + "download.directory_upgrade": True, # Enable directory upgrade + "safebrowsing.enabled": True, # Enable safe browsing + } + options.add_experimental_option("prefs", prefs) + # Instantiate the Chrome WebDriver with the configured options + driver = webdriver.Chrome(options=get_headless_options(download_dir, options)) + 
else: + raise (f"Unknown browser type {browser}") + + # Ensure that downloads are permitted driver.capabilities["se:downloadsEnablead"] = True + # Ensure that the window is at the expected size + driver.set_window_size(browser_res[0], browser_res[1]) return driver @@ -527,10 +726,12 @@ def __init__( downloads_folder: Optional[Union[str, None]] = None, bing_api_key: Optional[Union[str, None]] = None, request_kwargs: Optional[Union[Dict[str, Any], None]] = None, - web_driver: Optional[str] = "edge", + browser: Optional[str] = "edge", + page_load_time: Optional[int] = 6, + resolution: Optional[Tuple] = (1920, 1080), + render_text: Optional[bool] = False, ): self.start_page: str = start_page if start_page else "about:blank" - self.viewport_size = viewport_size # Applies only to the standard uri types self.downloads_folder = downloads_folder self.history: List[str] = list() self.page_title: Optional[str] = None @@ -538,11 +739,15 @@ def __init__( self.viewport_pages: List[Tuple[int, int]] = list() self.bing_api_key = bing_api_key self.request_kwargs = request_kwargs - + self.page_load_time = page_load_time self._page_content = "" + self.window_width = resolution[0] + self.window_height = resolution[1] + self.viewport_size = resolution[1] # We override this from SimpleTextBrowser to match the browser window height + self.render_text = render_text # Just in case for functionality purposes # Initialize the WebDriver - self.driver = SeleniumBrowser(browser=web_driver, download_dir=downloads_folder) + self.driver = SeleniumBrowser(browser=browser, download_dir=downloads_folder, resolution=resolution) if start_page: self.set_address(self.start_page) @@ -554,7 +759,11 @@ def address(self) -> str: @property def viewport(self) -> str: """Return the content of the current viewport.""" - return self.driver.page_source # Selenium directly interacts with the page, no viewport concept + # display_binary_image(self.driver.get_screenshot_as_png()) + # self._page_content # or 
self.driver.page_source + # Image.open(io.BytesIO(self.driver.get_screenshot_as_png())) + # if self._page_content and len(self._page_content) > 0 + return self._page_content @property def page_content(self) -> str: @@ -565,9 +774,24 @@ def set_address(self, uri_or_path: str) -> None: """Navigate to a given URI and update history.""" if not uri_or_path.startswith("http:") and not uri_or_path.startswith("https:"): uri_or_path = urljoin(self.address, uri_or_path) - self.driver.get(uri_or_path) + self.history.append(uri_or_path) - self._update_page_content() + + # Handle special URIs + if uri_or_path == "about:blank": + self._set_page_content("") + elif uri_or_path.startswith("bing:"): + self._bing_search(uri_or_path[len("bing:") :].strip()) + else: + if not uri_or_path.startswith("http:") and not uri_or_path.startswith("https:"): + uri_or_path = urljoin(self.address, uri_or_path) + self.history[-1] = uri_or_path # Update the address with the fully-qualified path + # Navigate to the specified URI or path + self._fetch_page(uri_or_path) # Implemented, but not needed + # self.driver.get(uri_or_path) + # self.driver.implicitly_wait(self.page_load_time) + self.viewport_current_page = 0 + self._split_pages() def visit_page(self, path_or_uri: str) -> str: """Navigate to a page and return its content.""" @@ -593,7 +817,33 @@ def close(self): self.driver.quit() def _split_pages(self) -> None: - # This is not implemented with the selenium.webdirver wrapper + # Page scroll position + int(self.driver.execute_script("return document.documentElement.scrollHeight")) + + # Grab the current page height based on the scrollbar + self.page_height = self.driver.execute_script("return window.pageYOffset + window.innerHeight") + + # Calculate the total number of pages currently rendered + self.page_count = ceil(self.window_height / self.page_height) + + # Split only regular pages + if not self.address.startswith("http:") and not self.address.startswith("https:"): + self.viewport_pages = 
[(0, len(self._page_content))] + return + + # Handle empty pages + if len(self._page_content) == 0: + self.viewport_pages = [(0, 0)] + return + + # Break the viewport into pages + self.viewport_pages = [] + start_idx = 0 + while start_idx < self.page_height: + end_idx = min(start_idx + self.viewport_size, self.page_height) # type: ignore[operator] + self.viewport_pages.append((start_idx, end_idx)) + start_idx = end_idx + return def _bing_api_call(self, query: str) -> Dict[str, Dict[str, List[Dict[str, Union[str, Dict[str, str]]]]]]: @@ -625,7 +875,7 @@ def _bing_api_call(self, query: str) -> Dict[str, Dict[str, List[Dict[str, Union def _bing_search(self, query: str) -> None: results = self._bing_api_call(query) - + self.bing_results = results web_snippets: List[str] = list() idx = 0 for page in results["webPages"]["value"]: @@ -652,40 +902,152 @@ def _bing_search(self, query: str) -> None: ) if len(news_snippets) > 0: content += "\n\n## News Results:\n" + "\n\n".join(news_snippets) + self._set_page_content(content) + def _set_page_content(self, content): + """Sets the text content of the current page.""" + self._page_content = content + + # Your custom HTML content + custom_html_content = "" + content.replace("\n", "
") + "" + + # Create a temporary HTML file + with tempfile.NamedTemporaryFile("w", delete=False, suffix=".html") as tmp_file: + tmp_file.write(custom_html_content) + html_file_path = tmp_file.name + + # Navigate to the file + self.driver.get(f"file://{html_file_path}") + def download(self, uri_or_path: str) -> None: # TODO: update this based on the new method """Download from a given URI""" self.driver.get(uri_or_path) - def _fetch_page(self, url: str) -> None: - from selenium.common.exceptions import TimeoutException + def _get_headers(self): + def parse_list_to_dict(lst): + result_dict = {} + for item in lst: + key, value = item.split(": ", 1) + # Attempt to load JSON content if present + try: + value_json = json.loads(value) + result_dict[key] = value_json + except json.JSONDecodeError: + # Handle non-JSON value + result_dict[key] = value + return result_dict + + headers = self.driver.execute_script( + "var req = new XMLHttpRequest();req.open('GET', document.location, false);req.send(null);return req.getAllResponseHeaders()" + ) + headers = headers.splitlines() + headers = parse_list_to_dict(headers) + return headers + def _fetch_page(self, url: str) -> None: try: self.driver.get(url) + self.driver.implicitly_wait(self.page_load_time) + self.history.append(url) + headers = self._get_headers() + self.page_title = self.driver.title - # Selenium WebDriver directly accesses the rendered page, - # so we don't need to manually fetch or process the HTML. - # However, you can still manipulate or extract content from the page using Selenium methods. 
- - # Example of extracting and cleaning the page content - if "wikipedia.org" in url: - body_elm = self.driver.find_element(By.cssSelector, "div#mw-content-text") - main_title = self.driver.title - webpage_text = ( - "# " - + main_title - + "\n\n" - + markdownify.MarkdownConverter().convert_soup(body_elm.get_attribute("innerHTML")) - ) - else: - webpage_text = self.driver.find_element(By.TAG_NAME, "body").get_attribute("innerText") + # We can't get response codes without using a proxy or using requests in a double call + content_type = headers.get("content-type", "") + for ct in ["text/html", "text/plain", "application/pdf"]: + if ct in content_type.lower(): + content_type = ct + break + + if content_type == "text/html": + html = self.driver.page_source + soup = BeautifulSoup(html, "html.parser") + + # Remove javascript and style blocks + for script in soup(["script", "style"]): + script.extract() + + # Convert to markdown -- Wikipedia gets special attention to get a clean version of the page + if url.startswith("https://en.wikipedia.org/"): + body_elm = soup.find("div", {"id": "mw-content-text"}) + title_elm = soup.find("span", {"class": "mw-page-title-main"}) + + if body_elm: + # What's the title + main_title = soup.title.string + if title_elm and len(title_elm) > 0: + main_title = title_elm.string + webpage_text = ( + "# " + main_title + "\n\n" + markdownify.MarkdownConverter().convert_soup(body_elm) + ) + else: + webpage_text = markdownify.MarkdownConverter().convert_soup(soup) + else: + webpage_text = markdownify.MarkdownConverter().convert_soup(soup) - # Convert newlines, remove excessive blank lines - webpage_text = re.sub(r"\r\n", "\n", webpage_text) - self._set_page_content(re.sub(r"\n{2,}", "\n\n", webpage_text).strip()) + # Convert newlines + webpage_text = re.sub(r"\r\n", "\n", webpage_text) - except TimeoutException: + # Remove excessive blank lines + if self.render_text: + self.page_title = soup.title.string + 
self._set_page_content(webpage_text.strip()) + else: + self._page_content = webpage_text + + elif content_type == "text/plain": + html = self.driver.page_source + soup = BeautifulSoup(html, "html.parser") + plain_text = soup.prettify() + if self.render_text: + self.page_title = None + self._set_page_content(plain_text) + else: + self._page_content = plain_text + + elif IS_PDF_CAPABLE and content_type == "application/pdf": + download_using_requests(self.driver, self.downloads_folder, os.path.basename(url)) + plain_text = extract_pdf_text(os.path.join(self.downloads_folder, os.path.basename(url))) + if self.render_text: + self.page_title = None + self._set_page_content(plain_text) + else: + self._page_content = plain_text + + elif self.downloads_folder is not None: + # Try producing a safe filename + fname = None + try: + fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip() + except NameError: + pass + + # No suitable name, so make one + if fname is None: + extension = mimetypes.guess_extension(content_type) + if extension is None: + extension = ".download" + fname = str(uuid.uuid4()) + extension + + # Open a file for writing + download_path = os.path.abspath(os.path.join(self.downloads_folder, fname)) + download_using_requests(self.driver, self.downloads_folder, fname) + + # Return a page describing what just happened + if self.render_text: + self.page_title = "Download complete." + self._set_page_content(f"Downloaded '{url}' to '{download_path}'.") + else: + self._page_content = f"Downloaded '{url}' to '{download_path}'." 
+ + elif self.render_text: + self.page_title = f"Error - Unsupported Content-Type '{content_type}'" + self._set_page_content(self.page_title) + else: + self._page_content = None + + except requests.exceptions.RequestException as e: self.page_title = "Error" - self._set_page_content("Timeout while retrieving " + url) + self._set_page_content(str(e)) diff --git a/notebook/agentchat_content_agent.ipynb b/notebook/agentchat_content_agent.ipynb new file mode 100644 index 00000000000..94383714bcc --- /dev/null +++ b/notebook/agentchat_content_agent.ipynb @@ -0,0 +1,1762 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Content Collection Tasks with ContentAgent\n", + "\n", + "### Why would we want this?\n", + "As part of a larger pipeline, `ContentAgent` accomplishes the task of automatic retrieval and storage of online content for numerous downstream tasks. \n", + "This task is facilitated by a headless Selenium webdriver. \n", + "\n", + "\n", + "## Requirements\n", + "\n", + "AutoGen requires `Python>=3.8`. 
To run this notebook example, please install:\n", + "```bash\n", + "pip install pyautogen, selenium, markdownify, pillow, pdfminer.six, beautifulsoup4\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# %pip install pyautogen selenium markdownify pillow pdfminer.six beautifulsoup4 arxiv\n", + "## or\n", + "# %pip install \"pyautogen[websurfer]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ensure that we have the WebDrivers present for Selenium\n", + "Following the instructions in [Selenium Documentation](https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location/#download-the-driver), \n", + "we first download the web driver for our browser of choice, or all 3: [Edge](https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/?form=MA13LH#downloads), [Firefox](https://github.com/mozilla/geckodriver/releases), [Chrome](https://chromedriver.chromium.org/downloads)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Neither powershell nor pwsh is installed.\n" + ] + } + ], + "source": [ + "# %%capture --no-stderr\n", + "import os\n", + "import logging\n", + "import autogen\n", + "from PIL import Image\n", + "from IPython.core.display_functions import display\n", + "from autogen.agentchat.contrib.content_agent import ContentAgent\n", + "from autogen.agentchat.user_proxy_agent import UserProxyAgent\n", + "from autogen.oai import config_list_from_json\n", + "from autogen.browser_utils import display_binary_image\n", + "from autogen.browser_utils import get_file_path_from_url\n", + "\n", + "# Get the logger instance for the current module (__name__).\n", + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set your API Endpoint\n", + "\n", + "The [`config_list_from_json`](https://microsoft.github.io/autogen/docs/reference/oai/openai_utils#config_list_from_json) function loads a list of configurations from an environment variable or a json file.\n", + "\n", + "It first looks for environment variable \"OAI_CONFIG_LIST\" which needs to be a valid json string. If that variable is not found, it then looks for a json file named \"OAI_CONFIG_LIST\". It filters the configs by models (you can filter by other keys as well).\n", + "\n", + "The WebSurferAgent uses a combination of models. 
GPT-4 and GPT-3.5-turbo-16k are recommended.\n", + "\n", + "Your json config should look something like the following:\n", + "```json\n", + "[\n", + " {\n", + " \"model\": \"gpt-4\",\n", + " \"api_key\": \"\"\n", + " },\n", + " {\n", + " \"model\": \"gpt-3.5-turbo-16k\",\n", + " \"api_key\": \"\"\n", + " }\n", + "]\n", + "```\n", + "\n", + "If you open this notebook in colab, you can upload your files by clicking the file icon on the left panel and then choose \"upload file\" icon.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "llm_config = {\n", + " \"timeout\": 600,\n", + " \"cache_seed\": 44, # change the seed for different trials\n", + " \"config_list\": config_list_from_json(\n", + " \"OAI_CONFIG_LIST\",\n", + " # filter_dict={\"model\": [\"Sakura-SOLAR-Instruct-f16\"]},\n", + " filter_dict={\n", + " \"model\": [\"gpt-3.5-turbo\"]\n", + " }, # , \"gpt-4\", \"gpt-4-0613\", \"gpt-4-32k\", \"gpt-4-32k-0613\", \"gpt-4-1106-preview\"]},\n", + " ),\n", + " \"temperature\": 0,\n", + "}\n", + "\n", + "summarizer_llm_config = {\n", + " \"timeout\": 600,\n", + " \"cache_seed\": 44, # change the seed for different trials\n", + " \"config_list\": config_list_from_json(\n", + " \"OAI_CONFIG_LIST\",\n", + " # filter_dict={\"model\": [\"Sakura-SOLAR-Instruct-f16\"]},\n", + " filter_dict={\"model\": [\"gpt-3.5-turbo\"]},\n", + " ),\n", + " \"temperature\": 0,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure Bing\n", + "\n", + "For WebSurferAgent to be reasonably useful, it needs to be able to search the web -- and that means it needs a Bing API key. \n", + "You can read more about how to get an API key on the [Bing Web Search API](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) page.\n", + "\n", + "Once you have your key, either set it as the `BING_API_KEY` system environment variable, or simply input your key below."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "bing_api_key = os.environ[\"BING_API_KEY\"] if \"BING_API_KEY\" in os.environ else \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define our agents" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Specify where our web content will be stored, we'll use this at the end of the notebook\n", + "storage_path = \"./content\"\n", + "\n", + "content_agent = ContentAgent(\n", + " name=\"ContentAgent\",\n", + " system_message=\"You are data collection agent specializing in content on the web.\",\n", + " max_depth=0,\n", + " llm_config=llm_config,\n", + " max_consecutive_auto_reply=0,\n", + " silent=False, # *NEW* In case we want to hear the inner-conversation,\n", + " storage_path=storage_path, # *NEW* This is where our archived content is stored, defaulting to `./content`\n", + " browser_config={\n", + " \"bing_api_key\": bing_api_key,\n", + " \"type\": \"selenium\", # *NEW* Here we specify that we intend to use our headless GUI browser. The default setting is \"text\".\n", + " \"browser\": \"edge\", # *NEW* We'll use the edge browser for these tests. Choices include 'edge', 'firefox', and 'chrome'\n", + " # \"resolution\": (1400,900), # *NEW* we specify the browser window size. 
The default is (1920,5200)\n", + " \"render_text\": False, # *NEW* We still have the option to convert the output to text and render it on the screen\n", + " },\n", + ")\n", + "\n", + "# Define the user agent\n", + "user_proxy = autogen.agentchat.UserProxyAgent(\n", + " \"user_proxy\",\n", + " human_input_mode=\"NEVER\",\n", + " code_execution_config=False,\n", + " default_auto_reply=\"\",\n", + " is_termination_msg=lambda x: True,\n", + " max_consecutive_auto_reply=0,\n", + ")\n", + "\n", + "# We register our collection function as the default response\n", + "content_agent.register_reply(user_proxy, content_agent.collect_content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's take it for a spin! \n", + "The Autogen open-source framework has an academic paper on arxiv.org! We'd certainly be interested to have that in our archives for later retrieval" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to ContentAgent):\n", + "\n", + "https://arxiv.org/abs/2308.08155\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to user_proxy):\n", + "\n", + "Success: archived the following links in your chosen location ./content/ <-- https://arxiv.org/abs/2308.08155\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': 'https://arxiv.org/abs/2308.08155', 'role': 'assistant'}, {'content': 'Success: archived the following links in your chosen location ./content/ <-- https://arxiv.org/abs/2308.08155', 'role': 'user'}], summary='Success: archived the following links in your chosen location ./content/ <-- https://arxiv.org/abs/2308.08155', cost=({'total_cost': 0}, {'total_cost': 0}), 
human_input=[])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "link = \"https://arxiv.org/abs/2308.08155\"\n", + "\n", + "user_proxy.initiate_chat(content_agent, message=link)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### We'll try another, this time the examples page from the Autogen official website" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to ContentAgent):\n", + "\n", + "https://microsoft.github.io/autogen/docs/Examples\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Examples`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Automated Multi Agent Chat​`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```AutoGen offers conversable agents powered by LLM, tool or human, which can be used to perform tasks collectively via automated chat. This framework allows tool use and human participation via multi-agent conversation. 
Please find documentation about this feature here.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

AutoGen offers conversable agents powered by LLM, tool or human, which can be used to perform tasks collectively via automated chat. This framework allows tool use and human participation via multi-agent conversation.\n", + "Please find documentation about this feature here.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Links to notebook examples:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Code Generation, Execution, and Debugging`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Code Generation, Execution, and Debugging

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Multi-Agent Collaboration (>3 Agents)`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Sequential Multi-Agent Chats`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Applications`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Tool Use`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Human Involvement`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", 
+ "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Agent Teaching and Learning`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Multi-Agent Chat with OpenAI Assistants in the loop`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Multimodal Agent`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Long Context Handling`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Evaluation and Assessment`\n", + "\n", + 
"--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Automatic Agent Building`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Enhanced Inferences​`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Utilities​`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Inference Hyperparameters Tuning​`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content 
Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```AutoGen offers a cost-effective hyperparameter optimization technique EcoOptiGen for tuning Large Language Models. The research study finds that tuning hyperparameters can significantly improve the utility of them. Please find documentation about this feature here.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

AutoGen offers a cost-effective hyperparameter optimization technique EcoOptiGen for tuning Large Language Models. The research study finds that tuning hyperparameters can significantly improve the utility of them.\n", + "Please find documentation about this feature here.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `Examples | AutoGen`, Data: ```Links to notebook examples:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'generator', 'content': 'Docusaurus v3.1.1'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'data-rh': 'true', 'name': 'twitter:card', 'content': 'summary_large_image'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'data-rh': 'true', 'property': 'og:locale', 'content': 'en'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'data-rh': 'true', 'name': 'docusaurus_locale', 'content': 'en'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'data-rh': 'true', 'name': 'docsearch:language', 'content': 'en'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'viewport', 'content': 'width=device-width, initial-scale=1.0', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'docusaurus_version', 'content': 'current', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'docusaurus_tag', 'content': 'docs-default-current', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'docsearch:version', 'content': 'current', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'docsearch:docusaurus_tag', 'content': 'docs-default-current', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:title', 'content': 'Examples | AutoGen', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'description', 'content': 'Automated Multi Agent Chat', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:description', 'content': 'Automated Multi Agent Chat', 'data-rh': 'true'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to user_proxy):\n", + "\n", + "Success: archived the following links in your chosen location ./content/ <-- https://microsoft.github.io/autogen/docs/Examples\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': 'https://microsoft.github.io/autogen/docs/Examples', 'role': 'assistant'}, {'content': 'Success: archived the following links in your chosen location ./content/ <-- https://microsoft.github.io/autogen/docs/Examples', 'role': 'user'}], summary='Success: archived the following links in your chosen location ./content/ <-- https://microsoft.github.io/autogen/docs/Examples', cost=({'total_cost': 0}, {'total_cost': 0}), human_input=[])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "link = \"https://microsoft.github.io/autogen/docs/Examples\"\n", + "user_proxy.initiate_chat(content_agent, message=link)" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "We see a lot of communication taking place when listening to the inner-dialog. The agent needs to confirm relevance of various pieces of content so its not storing advertisements or content not associated with the page topic.\n", + "\n", + "### We'll collect one more recent and very interesting publication by the good scientists at Microsoft" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to ContentAgent):\n", + "\n", + "https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Global`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Microsoft Research Blog`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```GraphRAG: Unlocking LLM discovery on narrative private data`\n", + "\n", + 
"--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Published February 13, 2024`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

\n", + "\t\t\t\tPublished\t\t\t\t\n", + "

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```By Jonathan Larson , Senior Principal Data Architect Steven Truitt , Principal Program Manager`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Share this page`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Perhaps the greatest challenge – and opportunity – of LLMs is extending their powerful capabilities to solve problems beyond the data on which they have been trained, and to achieve comparable results with data the LLM has never seen.  This opens new possibilities in data investigation, such as identifying themes and semantic concepts with context and grounding on datasets.  In this post, we introduce GraphRAG, created by Microsoft Research, as a significant advance in enhancing the capability of LLMs.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Perhaps the greatest challenge – and opportunity – of LLMs is extending their powerful capabilities to solve problems beyond the data on which they have been trained, and to achieve comparable results with data the LLM has never seen.  This opens new possibilities in data investigation, such as identifying themes and semantic concepts with context and grounding on datasets.  In this post, we introduce GraphRAG, created by Microsoft Research, as a significant advance in enhancing the capability of LLMs.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Retrieval-Augmented Generation (RAG) is a technique to search for information based on a user query and provide the results as reference for an AI answer to be generated. This technique is an important part of most LLM-based tools and the majority of RAG approaches use vector similarity as the search technique. GraphRAG uses LLM-generated knowledge graphs to provide substantial improvements in question-and-answer performance when conducting document analysis of complex information.  This builds upon our recent research, which points to the power of prompt augmentation when performing discovery on private datasets. Here, we define private dataset as data that the LLM is not trained on and has never seen before, such as an enterprise’s proprietary research, business documents, or communications. Baseline RAG1 was created to help solve this problem, but we observe situations where baseline RAG performs very poorly. For example:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Retrieval-Augmented Generation (RAG) is a technique to search for information based on a user query and provide the results as reference for an AI answer to be generated. This technique is an important part of most LLM-based tools and the majority of RAG approaches use vector similarity as the search technique. GraphRAG uses LLM-generated knowledge graphs to provide substantial improvements in question-and-answer performance when conducting document analysis of complex information.  This builds upon our recent research, which points to the power of prompt augmentation when performing discovery on private datasets. Here, we define private dataset as data that the LLM is not trained on and has never seen before, such as an enterprise’s proprietary research, business documents, or communications. Baseline RAG1 was created to help solve this problem, but we observe situations where baseline RAG performs very poorly. For example:

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```To address this, the tech community is working to develop methods that extend and enhance RAG (e.g., LlamaIndex (opens in new tab)).  Microsoft Research’s new approach, GraphRAG, uses the LLM to create a knowledge graph based on the private dataset.  This graph is then used alongside graph machine learning to perform prompt augmentation at query time.  GraphRAG shows substantial improvement in answering the two classes of questions described above, demonstrating intelligence or mastery that outperforms other approaches previously applied to private datasets.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

To address this, the tech community is working to develop methods that extend and enhance RAG (e.g., LlamaIndex (opens in new tab)).  Microsoft Research’s new approach, GraphRAG, uses the LLM to create a knowledge graph based on the private dataset.  This graph is then used alongside graph machine learning to perform prompt augmentation at query time.  GraphRAG shows substantial improvement in answering the two classes of questions described above, demonstrating intelligence or mastery that outperforms other approaches previously applied to private datasets.   

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Applying RAG to private datasets`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```To demonstrate the effectiveness of GraphRAG, let’s start with an investigation using the Violent Incident Information from News Articles (VIINA) dataset (opens in new tab).  This dataset was chosen due to its complexity and the presence of differing opinions and partial information.  It is a messy real-world test case that was recent enough not to be included in the LLM base model’s training.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

To demonstrate the effectiveness of GraphRAG, let’s start with an investigation using the Violent Incident Information from News Articles (VIINA) dataset (opens in new tab).  This dataset was chosen due to its complexity and the presence of differing opinions and partial information.  It is a messy real-world test case that was recent enough not to be included in the LLM base model’s training.  

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```For this research, we use thousands of news articles from both Russian and Ukrainian news sources for the month of June 2023, translated into English, to create a private dataset on which we will perform our LLM-based retrieval.  The dataset is far too large to fit into an LLM context window, thus demanding a RAG approach.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

For this research, we use thousands of news articles from both Russian and Ukrainian news sources for the month of June 2023, translated into English, to create a private dataset on which we will perform our LLM-based retrieval.  The dataset is far too large to fit into an LLM context window, thus demanding a RAG approach.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```We start with an exploratory query, which we pose to both a baseline RAG system and to our new approach, GraphRAG:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

We start with an exploratory query, which we pose to both a baseline RAG system and to our new approach, GraphRAG:

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Query: “What is Novorossiya?”`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```In these results, we can see both systems perform well – highlighting a class of query on which baseline RAG performs well.  Let’s try a query that requires connecting the dots:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

In these results, we can see both systems perform well – highlighting a class of query on which baseline RAG performs well.  Let’s try a query that requires connecting the dots:

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Query: “What has Novorossiya done?”`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Baseline RAG fails to answer this question.  Looking at the source documents inserted into the context window (Figure 1), none of the text segments discuss Novorossiya, resulting in this failure.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Baseline RAG fails to answer this question.  Looking at the source documents inserted into the context window (Figure 1), none of the text segments discuss Novorossiya, resulting in this failure.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```In comparison, the GraphRAG approach discovered an entity in the query, Novorossiya.  This allows the LLM to ground itself in the graph and results in a superior answer that contains provenance through links to the original supporting text.  For example, Figure 2 below shows the exact content the LLM used for the LLM-generated statement, “Novorossiya has been implicated in plans to blow up ATMs.” We see the snippet from the raw source documents (after English translation) that the LLM used to support the assertion that a specific bank was a target for Novorossiya via the relationship that exists between the two entities in the graph.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

In comparison, the GraphRAG approach discovered an entity in the query, Novorossiya.  This allows the LLM to ground itself in the graph and results in a superior answer that contains provenance through links to the original supporting text.  For example, Figure 2 below shows the exact content the LLM used for the LLM-generated statement, “Novorossiya has been implicated in plans to blow up ATMs.” We see the snippet from the raw source documents (after English translation) that the LLM used to support the assertion that a specific bank was a target for Novorossiya via the relationship that exists between the two entities in the graph. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```By using the LLM-generated knowledge graph, GraphRAG vastly improves the “retrieval” portion of RAG, populating the context window with higher relevance content, resulting in better answers and capturing evidence provenance.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

By using the LLM-generated knowledge graph, GraphRAG vastly improves the “retrieval” portion of RAG, populating the context window with higher relevance content, resulting in better answers and capturing evidence provenance. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Being able to trust and verify LLM-generated results is always important.  We care that the results are factually correct, coherent, and accurately represent content found in the source material. GraphRAG provides the provenance, or source grounding information, as it generates each response.  It demonstrates that an answer is grounded in the dataset.  Having the cited source for each assertion readily available also enables a human user to quickly and accurately audit the LLM’s output directly against the original source material.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Being able to trust and verify LLM-generated results is always important.  We care that the results are factually correct, coherent, and accurately represent content found in the source material. GraphRAG provides the provenance, or source grounding information, as it generates each response.  It demonstrates that an answer is grounded in the dataset.  Having the cited source for each assertion readily available also enables a human user to quickly and accurately audit the LLM’s output directly against the original source material.   

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```However, this isn’t all that’s possible using GraphRAG.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

However, this isn’t all that’s possible using GraphRAG. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Whole dataset reasoning`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Whole dataset reasoning 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Baseline RAG struggles with queries that require aggregation of information across the dataset to compose an answer. Queries such as “What are the top 5 themes in the data?” perform terribly because baseline RAG relies on a vector search of semantically similar text content within the dataset. There is nothing in the query to direct it to the correct information.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Baseline RAG struggles with queries that require aggregation of information across the dataset to compose an answer. Queries such as “What are the top 5 themes in the data?” perform terribly because baseline RAG relies on a vector search of semantically similar text content within the dataset. There is nothing in the query to direct it to the correct information. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```However, with GraphRAG we can answer such questions, because the structure of the LLM-generated knowledge graph tells us about the structure (and thus themes) of the dataset as a whole.  This allows the private dataset to be organized into meaningful semantic clusters that are pre-summarized.  The LLM uses these clusters to summarize these themes when responding to a user query.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

However, with GraphRAG we can answer such questions, because the structure of the LLM-generated knowledge graph tells us about the structure (and thus themes) of the dataset as a whole.  This allows the private dataset to be organized into meaningful semantic clusters that are pre-summarized.  The LLM uses these clusters to summarize these themes when responding to a user query. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```We illustrate whole-dataset reasoning abilities by posing the following question to the two systems:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

We illustrate whole-dataset reasoning abilities by posing the following question to the two systems: 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Query: “What are the top 5 themes in the data?“`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Looking at the results from baseline RAG, we see that none of the listed themes has much to do with the war between the two countries.  As anticipated, the vector search retrieved irrelevant text, which was inserted into the LLM’s context window.  Results that were included were likely keying on the word “theme,” resulting in a less than useful assessment of what is going on in the dataset.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Looking at the results from baseline RAG, we see that none of the listed themes has much to do with the war between the two countries.  As anticipated, the vector search retrieved irrelevant text, which was inserted into the LLM’s context window.  Results that were included were likely keying on the word “theme,” resulting in a less than useful assessment of what is going on in the dataset. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Observing the results from GraphRAG, we can clearly see that the results are far more aligned with what is going on in the dataset as a whole.  The answer provides the five main themes as well as supporting details that are observed in the dataset.  The referenced reports are pre-generated by the LLM for each semantic cluster in GraphRAG and, in turn, provide provenance back to original source material.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Observing the results from GraphRAG, we can clearly see that the results are far more aligned with what is going on in the dataset as a whole.  The answer provides the five main themes as well as supporting details that are observed in the dataset.  The referenced reports are pre-generated by the LLM for each semantic cluster in GraphRAG and, in turn, provide provenance back to original source material.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Spotlight: On-demand video`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```AI Explainer: Foundation models ​and the next era of AI`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Explore how the transformer architecture, larger models and more data, and in-context learning have helped advance AI from perception to creation.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Explore how the transformer architecture, larger models and more data, and in-context learning have helped advance AI from perception to creation.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Creating LLM-generated knowledge graphs`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

Creating LLM-generated knowledge graphs

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```We note the basic flow that underpins GraphRAG, which builds upon our prior research (opens in new tab) and repositories (opens in new tab) using graph machine learning:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

We note the basic flow that underpins GraphRAG, which builds upon our prior research (opens in new tab) and repositories (opens in new tab) using graph machine learning: 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```An example visualization of the graph is shown in Figure 3.  Each circle is an entity (e.g., a person, place, or organization), with the entity size representing the number of relationships that entity has, and the color representing groupings of similar entities.  The color partitioning is a bottom-up clustering method built on top of the graph structure, which enables us to answer questions at varying levels of abstraction.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

An example visualization of the graph is shown in Figure 3.  Each circle is an entity (e.g., a person, place, or organization), with the entity size representing the number of relationships that entity has, and the color representing groupings of similar entities.  The color partitioning is a bottom-up clustering method built on top of the graph structure, which enables us to answer questions at varying levels of abstraction.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Result metrics`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```The illustrative examples above are representative of GraphRAG’s consistent improvement across multiple datasets in different subject domains.  We assess this improvement by performing an evaluation using an LLM grader to determine a pairwise winner between GraphRAG and baseline RAG.  We use a set of qualitative metrics, including comprehensiveness (completeness within the framing of the implied context of the question), human enfranchisement (provision of supporting source material or other contextual information), and diversity (provision of differing viewpoints or angles on the question posed). Initial results show that GraphRAG consistently outperforms baseline RAG on these metrics.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

The illustrative examples above are representative of GraphRAG’s consistent improvement across multiple datasets in different subject domains.  We assess this improvement by performing an evaluation using an LLM grader to determine a pairwise winner between GraphRAG and baseline RAG.  We use a set of qualitative metrics, including comprehensiveness (completeness within the framing of the implied context of the question), human enfranchisement (provision of supporting source material or other contextual information), and diversity (provision of differing viewpoints or angles on the question posed). Initial results show that GraphRAG consistently outperforms baseline RAG on these metrics.  

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```In addition to relative comparisons, we also use SelfCheckGPT (opens in new tab) to perform an absolute measurement of faithfulness to help ensure factual, coherent results grounded in the source material. Results show that GraphRAG achieves a similar level of faithfulness to baseline RAG. We are currently developing an evaluation framework to measure performance on the class of problems above.  This will include more robust mechanisms for generating question-answer test sets as well as additional metrics, such as accuracy and context relevance.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

In addition to relative comparisons, we also use SelfCheckGPT (opens in new tab) to perform an absolute measurement of faithfulness to help ensure factual, coherent results grounded in the source material. Results show that GraphRAG achieves a similar level of faithfulness to baseline RAG. We are currently developing an evaluation framework to measure performance on the class of problems above.  This will include more robust mechanisms for generating question-answer test sets as well as additional metrics, such as accuracy and context relevance. 

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Next steps`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```By combining LLM-generated knowledge graphs and graph machine learning, GraphRAG enables us to answer important classes of questions that we cannot attempt with baseline RAG alone.  We have seen promising results after applying this technology to a variety of scenarios, including social media, news articles, workplace productivity, and chemistry.  Looking forward, we plan to work closely with customers on a variety of new domains as we continue to apply this technology while working on metrics and robust evaluation. We look forward to sharing more as our research continues.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

By combining LLM-generated knowledge graphs and graph machine learning, GraphRAG enables us to answer important classes of questions that we cannot attempt with baseline RAG alone.  We have seen promising results after applying this technology to a variety of scenarios, including social media, news articles, workplace productivity, and chemistry.  Looking forward, we plan to work closely with customers on a variety of new domains as we continue to apply this technology while working on metrics and robust evaluation. We look forward to sharing more as our research continues.

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```1As baseline RAG in this comparison we use LangChain’s Q&A (opens in new tab), a well-known representative example of this class of RAG tools in widespread use today.`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Related publications`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "

\n", + "\n", + "Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine\n", + "\n", + "\n", + "

\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Meet the authors`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Jonathan Larson`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Senior Principal Data Architect`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Steven Truitt`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: 
`GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Principal Program Manager`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Continue reading`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Research Areas`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Related tools`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Follow 
us:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Share this page:`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Content Classifier):\n", + "\n", + "Title: `GraphRAG: Unlocking LLM discovery on narrative private data - Microsoft Research`, Data: ```Notifications`\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContent Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'viewport', 'content': 'width=device-width, initial-scale=1'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'twitter:dnt', 'content': 'on'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-product', 'content': 'MSR'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-stv', 'content': '8.5.0'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-sitesection', 'content': ''}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'awa-pageType', 'content': 'Post'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-market', 'content': 'en-us'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-env', 'content': 'Production'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa‐asst', 'content': '1005408'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'awa-pgidx', 'content': '1'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-pgtot', 'content': '-1'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'awa-pgtop', 'content': 'Artificial intelligence'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'robots', 'content': 'index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. 
Should we hold onto this item? {'property': 'og:locale', 'content': 'en_US'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:type', 'content': 'article'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:title', 'content': 'GraphRAG: A new approach for discovery using complex information'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:description', 'content': 'Microsoft is transforming retrieval-augmented generation with GraphRAG, using LLM-generated knowledge graphs to significantly improve Q&A when analyzing complex information and consistently outperforming baseline RAG. 
Get the details.'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:site_name', 'content': 'Microsoft Research'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'article:published_time', 'content': '2024-02-13T20:00:00+00:00'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'article:modified_time', 'content': '2024-02-13T16:50:07+00:00'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. 
Should we hold onto this item? {'property': 'og:image:width', 'content': '1200'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:image:height', 'content': '627'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'property': 'og:image:type', 'content': 'image/jpeg'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'author', 'content': 'Brenda Potts'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. 
Should we hold onto this item? {'name': 'twitter:card', 'content': 'summary_large_image'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'twitter:title', 'content': 'GraphRAG: A new approach for discovery using complex information'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'twitter:description', 'content': 'Microsoft is transforming retrieval-augmented generation with GraphRAG, using LLM-generated knowledge graphs to significantly improve Q&A when analyzing complex information and consistently outperforming baseline RAG. Get the details.'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'name': 'twitter:creator', 'content': '@MSFTResearch'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'twitter:site', 'content': '@MSFTResearch'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'generator', 'content': 'WordPress 6.4.3'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'name': 'research-area', 'content': 'Artificial intelligence'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'itemprop': 'width', 'content': '216'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'itemprop': 'height', 'content': '46'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'itemprop': 'name', 'content': 'Microsoft'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "True\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? {'itemprop': 'width', 'content': '1024'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to Metadata Classifier):\n", + "\n", + "We are parsing html metadata to extract useful data. Should we hold onto this item? 
{'itemprop': 'height', 'content': '576'}.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mMetadata Classifier\u001b[0m (to ContentAgent):\n", + "\n", + "False\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mContentAgent\u001b[0m (to user_proxy):\n", + "\n", + "Success: archived the following links in your chosen location ./content/ <-- https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': 'https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/', 'role': 'assistant'}, {'content': 'Success: archived the following links in your chosen location ./content/ <-- https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/', 'role': 'user'}], summary='Success: archived the following links in your chosen location ./content/ <-- https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/', cost=({'total_cost': 0}, {'total_cost': 0}), human_input=[])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "link = \"https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/\"\n", + "user_proxy.initiate_chat(content_agent, message=link)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "aiex01_blog_hero_1400x788.png\n", + "aiex01_blog_hero_1400x788.txt\n", + 
"amit_emre_podcast_hero_feature_1400x788.jpg\n", + "amit_emre_podcast_hero_feature_1400x788.txt\n", + "content.txt\n", + "emnlp-2023-blogherofeature-1400x788-1.png\n", + "emnlp-2023-blogherofeature-1400x788-1.txt\n", + "graphrag-blogherofeature-1400x788-1.png\n", + "graphrag-blogherofeature-1400x788-1.txt\n", + "graphrag-figure3.jpg\n", + "graphrag-figure3.txt\n", + "graphrag_figure1.png\n", + "graphrag_figure1.txt\n", + "graphrag_figure2.png\n", + "graphrag_figure2.txt\n", + "headshot150px.png\n", + "headshot150px.txt\n", + "index.html\n", + "links.txt\n", + "metadata.txt\n", + "msr-ai-2x.png\n", + "newsplitwise-jan-24-blogherofeature-1400x788-1.jpg\n", + "newsplitwise-jan-24-blogherofeature-1400x788-1.txt\n", + "screenshot.png\n", + "sot-blogherofeature-1400x788-1.jpg\n", + "sot-blogherofeature-1400x788-1.txt\n", + "steven-truitt_360x360.jpg\n", + "steven-truitt_360x360.txt\n" + ] + } + ], + "source": [ + "!ls {storage_path}/microsoft.com/graphrag-unlocking-llm-discovery-on-narrative-private-data/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Just for reference, what did the page look like?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "last_page = list(content_agent.process_history.keys())[-1]\n", + "\n", + "local_path = f\"{storage_path}/{get_file_path_from_url(last_page)}\"\n", + "screenshot_path = os.path.join(local_path, \"screenshot.png\")\n", + "assert os.path.exists(screenshot_path)\n", + "\n", + "# Load the image\n", + "image = Image.open(screenshot_path)\n", + "\n", + "# Display the image\n", + "display(image)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### It seems the bottom was cropped, but using the 'firefox' browser for our agent will trigger the \"full page screenshot\" function, \n", + "And not to worry, everything is also stored to disk in its original form, including the source HTML as it was loaded in the desktop browser.\n", + "\n", + "#### Below we confirm that our Autogen Agent successfully cataloged all of the content into the file." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We located our search term on line 14 out of a total 27 lines\n", + "\n", + "The last 3 lines stored in content were:\n", + "\n", + "In addition to relative comparisons, we also use SelfCheckGPT (opens in new tab) to perform an absolute measurement of faithfulness to help ensure factual, coherent results grounded in the source material. Results show that GraphRAG achieves a similar level of faithfulness to baseline RAG. We are currently developing an evaluation framework to measure performance on the class of problems above.  
This will include more robust mechanisms for generating question-answer test sets as well as additional metrics, such as accuracy and context relevance.\n", + "\n", + "By combining LLM-generated knowledge graphs and graph machine learning, GraphRAG enables us to answer important classes of questions that we cannot attempt with baseline RAG alone.  We have seen promising results after applying this technology to a variety of scenarios, including social media, news articles, workplace productivity, and chemistry.  Looking forward, we plan to work closely with customers on a variety of new domains as we continue to apply this technology while working on metrics and robust evaluation. We look forward to sharing more as our research continues.\n", + "\n", + "Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine\n", + "\n" + ] + } + ], + "source": [ + "with open(f\"{local_path}/content.txt\") as f:\n", + " content = f.readlines()\n", + "for idx, line in enumerate(content):\n", + " if \"What are the top 5\" in line:\n", + " break\n", + "print(f\"We located our search term on line {idx} out of a total {len(content)} lines\\n\")\n", + "print(\"The last 3 lines stored in content were:\\n\")\n", + "for i in reversed(range(1, 4)):\n", + " print(content[-i])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Thanks for looking at our new ContentAgent:\n", + "### Stay tuned for the larger pipeline known as the Athena Agent!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebook/agentchat_custom_model.ipynb b/notebook/agentchat_custom_model.ipynb index c29d3808926..365ff22c038 100644 --- a/notebook/agentchat_custom_model.ipynb +++ b/notebook/agentchat_custom_model.ipynb @@ -383,6 +383,7 @@ "source": [ "# load model here\n", "\n", + "\n", "config = config_list_custom[0]\n", "device = config.get(\"device\", \"cpu\")\n", "loaded_model = AutoModelForCausalLM.from_pretrained(config[\"model\"]).to(device)\n", diff --git a/notebook/agentchat_lmm_gpt-4v.ipynb b/notebook/agentchat_lmm_gpt-4v.ipynb index c56c6e6a1db..b49f4472a50 100644 --- a/notebook/agentchat_lmm_gpt-4v.ipynb +++ b/notebook/agentchat_lmm_gpt-4v.ipynb @@ -637,8 +637,6 @@ } ], "source": [ - "\n", - "\n", "creator = FigureCreator(name=\"Figure Creator~\", llm_config=gpt4_llm_config)\n", "\n", "user_proxy = autogen.UserProxyAgent(\n", diff --git a/notebook/agentchat_surfer_edge.ipynb b/notebook/agentchat_surfer_edge.ipynb new file mode 100644 index 00000000000..ce4015f7f60 --- /dev/null +++ b/notebook/agentchat_surfer_edge.ipynb @@ -0,0 +1,796 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# WebSurfer Agent with Headless GUI-based Browsing\n", + "\n", + "This notebook is derived from the standard [WebSurferAgent Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_surfer.ipynb) for the purposes of demonstrating coverage." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Requirements\n", + "\n", + "AutoGen requires `Python>=3.8`. To run this notebook example, please install:\n", + "```bash\n", + "pip install pyautogen selenium markdownify pillow pdfminer.six beautifulsoup4 arxiv\n", + "```\n", + "or\n", + "```bash\n", + "pip install \"pyautogen[websurfer]\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ensure that we have the WebDrivers present for Selenium\n", + "\n", + "*EDIT*:\n", + "[Selenium Manager](https://www.selenium.dev/documentation/selenium_manager/) states:\n", + "\"Selenium Manager is a command-line tool implemented in Rust that provides automated driver and browser management for Selenium. Selenium bindings use this tool by default, so you do not need to download it or add anything to your code or do anything else to use it.\"\n", + "\n", + "Therefore the following instructions should not be needed:\n", + "Following the instructions in [Selenium Documentation](https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location/#download-the-driver), \n", + "we first download the web driver for our browser of choice, or all 3: [Edge](https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/?form=MA13LH#downloads), [Firefox](https://github.com/mozilla/geckodriver/releases), [Chrome](https://chromedriver.chromium.org/downloads)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Neither powershell nor pwsh is installed.\n" + ] + } + ], + "source": [ + "# %%capture --no-stderr\n", + "import os\n", + "import logging\n", + "import autogen\n", + "from time import sleep\n", + "\n", + "from autogen.agentchat.contrib.web_surfer import WebSurferAgent\n", + "from
autogen.agentchat.conversable_agent import ConversableAgent\n", + "from autogen.agentchat.user_proxy_agent import UserProxyAgent\n", + "from autogen.oai import config_list_from_json\n", + "from autogen.browser_utils import display_binary_image\n", + "\n", + "# Get the logger instance for the current module (__name__).\n", + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set your API Endpoint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The [`config_list_from_json`](https://microsoft.github.io/autogen/docs/reference/oai/openai_utils#config_list_from_json) function loads a list of configurations from an environment variable or a json file.\n", + "\n", + "It first looks for environment variable \"OAI_CONFIG_LIST\" which needs to be a valid json string. If that variable is not found, it then looks for a json file named \"OAI_CONFIG_LIST\". It filters the configs by models (you can filter by other keys as well).\n", + "\n", + "The WebSurferAgent uses a combination of models. 
GPT-4 and GPT-3.5-turbo-16k are recommended.\n", + "\n", + "Your json config should look something like the following:\n", + "```json\n", + "[\n", + " {\n", + " \"model\": \"gpt-4\",\n", + " \"api_key\": \"\"\n", + " },\n", + " {\n", + " \"model\": \"gpt-3.5-turbo-16k\",\n", + " \"api_key\": \"\"\n", + " }\n", + "]\n", + "```\n", + "\n", + "If you open this notebook in colab, you can upload your files by clicking the file icon on the left panel and then choose \"upload file\" icon.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "llm_config = {\n", + " \"timeout\": 600,\n", + " \"cache_seed\": 44, # change the seed for different trials\n", + " \"config_list\": config_list_from_json(\n", + " \"OAI_CONFIG_LIST\",\n", + " filter_dict={\"model\": [\"gpt-3.5-turbo\"]},\n", + " ),\n", + " \"temperature\": 0,\n", + "}\n", + "\n", + "summarizer_llm_config = {\n", + " \"timeout\": 600,\n", + " \"cache_seed\": 44, # change the seed for different trials\n", + " \"config_list\": config_list_from_json(\n", + " \"OAI_CONFIG_LIST\",\n", + " filter_dict={\"model\": [\"gpt-3.5-turbo\"]},\n", + " ),\n", + " \"temperature\": 0,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure Bing\n", + "\n", + "For WebSurferAgent to be reasonably useful, it needs to be able to search the web -- and that means it needs a Bing API key. \n", + "You can read more about how to get an API on the [Bing Web Search API](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) page.\n", + "\n", + "Once you have your key, either set it as the `BING_API_KEY` system environment variable, or simply input your key below."
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "bing_api_key = os.environ[\"BING_API_KEY\"] if \"BING_API_KEY\" in os.environ else \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Construct Agents\n", + "\n", + "We now create our WebSurferAgent, and a UserProxyAgent to surf the web, but using a graphical based browser required for many use-cases. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "web_surfer = WebSurferAgent(\n", + " \"web_surfer\",\n", + " llm_config=llm_config,\n", + " summarizer_llm_config=summarizer_llm_config,\n", + " browser_config={\n", + " \"type\": \"selenium\", # *NEW* Here we specify that we intend to use our headless GUI browser. The default setting is \"text\".\n", + " \"browser\": \"edge\", # *NEW* We'll use the edge browser for these tests. Choices include 'edge', 'firefox', and 'chrome'\n", + " \"resolution\": (1400, 900), # *NEW* we specify the browser window size.
The default is (1920,5200)\n", + " \"render_text\": False, # *NEW* We still have the option to convert the output to text and render it in the browser\n", + " \"bing_api_key\": bing_api_key,\n", + " },\n", + ")\n", + "\n", + "user_proxy = UserProxyAgent(\n", + " \"user_proxy\",\n", + " human_input_mode=\"NEVER\",\n", + " code_execution_config=False,\n", + " default_auto_reply=\"\",\n", + " is_termination_msg=lambda x: True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook Content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Part 1: Search, summarize\n", + "- Search for information about Microsoft AutoGen\n", + "- Summarize the results\n", + "- Visit the Getting Started Docs page" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to web_surfer):\n", + "\n", + "\n", + "Search the web for information about Microsoft AutoGen\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION informational_web_search...\u001b[0m\n", + "\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n", + "\n", + "Address: bing: Microsoft AutoGen\n", + "Title: Microsoft AutoGen - Search\n", + "Viewport position: Showing page 1 of 1.\n", + "=======================\n", + "A Bing search for 'Microsoft AutoGen' found 8 results:\n", + "\n", + "## Web Results\n", + "1.
[AutoGen: Enabling next-generation large language model applications](https://www.microsoft.com/en-us/research/blog/autogen-enabling-next-generation-large-language-model-applications/)\n", + "AutoGen is a Python package that simplifies the orchestration, optimization, and automation of large language model applications. It enables customizable and conversable agents that integrate with humans, tools, and other agents to solve tasks using GPT-4 and other advanced LLMs. Learn how to use AutoGen for code-based question answering, supply-chain optimization, conversational chess, and more.\n", + "\n", + "2. [GitHub - microsoft/autogen: Enable Next-Gen Large Language Model ...](https://github.com/microsoft/autogen)\n", + "AutoGen is a framework that enables the development of large language model applications using multiple agents that can converse with each other to solve tasks. It supports diverse conversation patterns, enhanced LLM inference, and customizable and conversable agents.\n", + "\n", + "3. [Getting Started | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/docs/Getting-Started/)\n", + "AutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. They can operate in various modes that employ combinations of LLMs, human inputs, and tools. Main Features\n", + "\n", + "4. [AutoGen | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/)\n", + "AutoGen is a tool that enables next-gen large language model applications by providing a high-level abstraction for building diverse and enhanced LLM workflows. It offers a collection of working systems for various domains and complexities, as well as enhanced LLM inference and optimization APIs.\n", + "\n", + "5. 
[AutoGen Studio: Interactively Explore Multi-Agent Workflows](https://microsoft.github.io/autogen/blog/2023/12/01/AutoGenStudio/)\n", + "AutoGen has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: AutoGen Studio.\n", + "\n", + "6. [[2308.08155] AutoGen: Enabling Next-Gen LLM Applications via Multi ...](https://arxiv.org/abs/2308.08155)\n", + "AutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to perform tasks using various types of language models (LLMs). The framework supports natural language and code-based conversation patterns, and is effective for diverse applications such as mathematics, coding, question answering, and more.\n", + "\n", + "7. [Mastering AutoGen: A Comprehensive Guide to Next-Generation ... - Medium](https://medium.com/@krtarunsingh/mastering-autogen-a-comprehensive-guide-to-next-generation-language-model-applications-b375d9b4dc6d)\n", + "AutoGen is a framework by Microsoft that allows you to create applications that leverage large language models (LLMs) with multi-agent conversations, diverse patterns, and enhanced inference. Learn how to set up AutoGen, use its architecture, and apply its features in this comprehensive guide by Tarun Singh.\n", + "\n", + "8. [arXiv:2308.08155v2 [cs.AI] 3 Oct 2023](https://arxiv.org/pdf/2308.08155.pdf)\n", + "AutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to solve tasks using multiple languages, tools, and human inputs. 
The framework supports flexible conversation patterns and natural or code-based programming for diverse applications of complexities and LLM capacities.\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': '\\nSearch the web for information about Microsoft AutoGen\\n', 'role': 'assistant'}, {'content': \"Address: bing: Microsoft AutoGen\\nTitle: Microsoft AutoGen - Search\\nViewport position: Showing page 1 of 1.\\n=======================\\nA Bing search for 'Microsoft AutoGen' found 8 results:\\n\\n## Web Results\\n1. [AutoGen: Enabling next-generation large language model applications](https://www.microsoft.com/en-us/research/blog/autogen-enabling-next-generation-large-language-model-applications/)\\nAutoGen is a Python package that simplifies the orchestration, optimization, and automation of large language model applications. It enables customizable and conversable agents that integrate with humans, tools, and other agents to solve tasks using GPT-4 and other advanced LLMs. Learn how to use AutoGen for code-based question answering, supply-chain optimization, conversational chess, and more.\\n\\n2. [GitHub - microsoft/autogen: Enable Next-Gen Large Language Model ...](https://github.com/microsoft/autogen)\\nAutoGen is a framework that enables the development of large language model applications using multiple agents that can converse with each other to solve tasks. It supports diverse conversation patterns, enhanced LLM inference, and customizable and conversable agents.\\n\\n3. [Getting Started | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/docs/Getting-Started/)\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. 
They can operate in various modes that employ combinations of LLMs, human inputs, and tools. Main Features\\n\\n4. [AutoGen | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/)\\nAutoGen is a tool that enables next-gen large language model applications by providing a high-level abstraction for building diverse and enhanced LLM workflows. It offers a collection of working systems for various domains and complexities, as well as enhanced LLM inference and optimization APIs.\\n\\n5. [AutoGen Studio: Interactively Explore Multi-Agent Workflows](https://microsoft.github.io/autogen/blog/2023/12/01/AutoGenStudio/)\\nAutoGen has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: AutoGen Studio.\\n\\n6. [[2308.08155] AutoGen: Enabling Next-Gen LLM Applications via Multi ...](https://arxiv.org/abs/2308.08155)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to perform tasks using various types of language models (LLMs). The framework supports natural language and code-based conversation patterns, and is effective for diverse applications such as mathematics, coding, question answering, and more.\\n\\n7. [Mastering AutoGen: A Comprehensive Guide to Next-Generation ... - Medium](https://medium.com/@krtarunsingh/mastering-autogen-a-comprehensive-guide-to-next-generation-language-model-applications-b375d9b4dc6d)\\nAutoGen is a framework by Microsoft that allows you to create applications that leverage large language models (LLMs) with multi-agent conversations, diverse patterns, and enhanced inference. Learn how to set up AutoGen, use its architecture, and apply its features in this comprehensive guide by Tarun Singh.\\n\\n8. 
[arXiv:2308.08155v2 [cs.AI] 3 Oct 2023](https://arxiv.org/pdf/2308.08155.pdf)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to solve tasks using multiple languages, tools, and human inputs. The framework supports flexible conversation patterns and natural or code-based programming for diverse applications of complexities and LLM capacities.\", 'role': 'user'}], summary=\"Address: bing: Microsoft AutoGen\\nTitle: Microsoft AutoGen - Search\\nViewport position: Showing page 1 of 1.\\n=======================\\nA Bing search for 'Microsoft AutoGen' found 8 results:\\n\\n## Web Results\\n1. [AutoGen: Enabling next-generation large language model applications](https://www.microsoft.com/en-us/research/blog/autogen-enabling-next-generation-large-language-model-applications/)\\nAutoGen is a Python package that simplifies the orchestration, optimization, and automation of large language model applications. It enables customizable and conversable agents that integrate with humans, tools, and other agents to solve tasks using GPT-4 and other advanced LLMs. Learn how to use AutoGen for code-based question answering, supply-chain optimization, conversational chess, and more.\\n\\n2. [GitHub - microsoft/autogen: Enable Next-Gen Large Language Model ...](https://github.com/microsoft/autogen)\\nAutoGen is a framework that enables the development of large language model applications using multiple agents that can converse with each other to solve tasks. It supports diverse conversation patterns, enhanced LLM inference, and customizable and conversable agents.\\n\\n3. [Getting Started | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/docs/Getting-Started/)\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. 
They can operate in various modes that employ combinations of LLMs, human inputs, and tools. Main Features\\n\\n4. [AutoGen | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/)\\nAutoGen is a tool that enables next-gen large language model applications by providing a high-level abstraction for building diverse and enhanced LLM workflows. It offers a collection of working systems for various domains and complexities, as well as enhanced LLM inference and optimization APIs.\\n\\n5. [AutoGen Studio: Interactively Explore Multi-Agent Workflows](https://microsoft.github.io/autogen/blog/2023/12/01/AutoGenStudio/)\\nAutoGen has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: AutoGen Studio.\\n\\n6. [[2308.08155] AutoGen: Enabling Next-Gen LLM Applications via Multi ...](https://arxiv.org/abs/2308.08155)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to perform tasks using various types of language models (LLMs). The framework supports natural language and code-based conversation patterns, and is effective for diverse applications such as mathematics, coding, question answering, and more.\\n\\n7. [Mastering AutoGen: A Comprehensive Guide to Next-Generation ... - Medium](https://medium.com/@krtarunsingh/mastering-autogen-a-comprehensive-guide-to-next-generation-language-model-applications-b375d9b4dc6d)\\nAutoGen is a framework by Microsoft that allows you to create applications that leverage large language models (LLMs) with multi-agent conversations, diverse patterns, and enhanced inference. Learn how to set up AutoGen, use its architecture, and apply its features in this comprehensive guide by Tarun Singh.\\n\\n8. 
[arXiv:2308.08155v2 [cs.AI] 3 Oct 2023](https://arxiv.org/pdf/2308.08155.pdf)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to solve tasks using multiple languages, tools, and human inputs. The framework supports flexible conversation patterns and natural or code-based programming for diverse applications of complexities and LLM capacities.\", cost=({'total_cost': 0}, {'total_cost': 0}), human_input=[])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Bing search is a special case and we return the text in the same way as the SimpleTextBrowser\n", + "\n", + "task1 = \"\"\"\n", + "Search the web for information about Microsoft AutoGen\n", + "\"\"\"\n", + "\n", + "user_proxy.initiate_chat(web_surfer, message=task1)\n", + "\n", + "# Note that these results are also accessable in JSON format with `web_surfer.browser.bing_results`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to web_surfer):\n", + "\n", + "Summarize these results\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION summarize_page...\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The token limit (4096) of the WebSurferAgent.summarizer_llm_config, is below the recommended 16k.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n", + "\n", + "AutoGen is a framework developed by Microsoft Research to simplify the orchestration, optimization, and automation of large language model 
(LLM) workflows. The framework offers customizable and conversable agents that utilize advanced LLM capabilities, such as GPT-4, while also integrating with humans and tools to address limitations and enhance performance. As developers create more complex LLM-based applications, the workflows become intricate, requiring significant effort and expertise to design and implement. Automating these workflows using AutoGen can streamline the process and improve efficiency, enabling the creation of next-generation applications that leverage the full potential of LLMs. The framework supports conversations between multiple agents through automated chat, providing a solution to the challenge of orchestrating optimal workflows in a vast and complex design space.\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': '\\nSearch the web for information about Microsoft AutoGen\\n', 'role': 'assistant'}, {'content': \"Address: bing: Microsoft AutoGen\\nTitle: Microsoft AutoGen - Search\\nViewport position: Showing page 1 of 1.\\n=======================\\nA Bing search for 'Microsoft AutoGen' found 8 results:\\n\\n## Web Results\\n1. [AutoGen: Enabling next-generation large language model applications](https://www.microsoft.com/en-us/research/blog/autogen-enabling-next-generation-large-language-model-applications/)\\nAutoGen is a Python package that simplifies the orchestration, optimization, and automation of large language model applications. It enables customizable and conversable agents that integrate with humans, tools, and other agents to solve tasks using GPT-4 and other advanced LLMs. Learn how to use AutoGen for code-based question answering, supply-chain optimization, conversational chess, and more.\\n\\n2. 
[GitHub - microsoft/autogen: Enable Next-Gen Large Language Model ...](https://github.com/microsoft/autogen)\\nAutoGen is a framework that enables the development of large language model applications using multiple agents that can converse with each other to solve tasks. It supports diverse conversation patterns, enhanced LLM inference, and customizable and conversable agents.\\n\\n3. [Getting Started | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/docs/Getting-Started/)\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. They can operate in various modes that employ combinations of LLMs, human inputs, and tools. Main Features\\n\\n4. [AutoGen | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/)\\nAutoGen is a tool that enables next-gen large language model applications by providing a high-level abstraction for building diverse and enhanced LLM workflows. It offers a collection of working systems for various domains and complexities, as well as enhanced LLM inference and optimization APIs.\\n\\n5. [AutoGen Studio: Interactively Explore Multi-Agent Workflows](https://microsoft.github.io/autogen/blog/2023/12/01/AutoGenStudio/)\\nAutoGen has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: AutoGen Studio.\\n\\n6. [[2308.08155] AutoGen: Enabling Next-Gen LLM Applications via Multi ...](https://arxiv.org/abs/2308.08155)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to perform tasks using various types of language models (LLMs). 
The framework supports natural language and code-based conversation patterns, and is effective for diverse applications such as mathematics, coding, question answering, and more.\\n\\n7. [Mastering AutoGen: A Comprehensive Guide to Next-Generation ... - Medium](https://medium.com/@krtarunsingh/mastering-autogen-a-comprehensive-guide-to-next-generation-language-model-applications-b375d9b4dc6d)\\nAutoGen is a framework by Microsoft that allows you to create applications that leverage large language models (LLMs) with multi-agent conversations, diverse patterns, and enhanced inference. Learn how to set up AutoGen, use its architecture, and apply its features in this comprehensive guide by Tarun Singh.\\n\\n8. [arXiv:2308.08155v2 [cs.AI] 3 Oct 2023](https://arxiv.org/pdf/2308.08155.pdf)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to solve tasks using multiple languages, tools, and human inputs. The framework supports flexible conversation patterns and natural or code-based programming for diverse applications of complexities and LLM capacities.\", 'role': 'user'}, {'content': 'Summarize these results', 'role': 'assistant'}, {'content': 'AutoGen is a framework developed by Microsoft Research to simplify the orchestration, optimization, and automation of large language model (LLM) workflows. The framework offers customizable and conversable agents that utilize advanced LLM capabilities, such as GPT-4, while also integrating with humans and tools to address limitations and enhance performance. As developers create more complex LLM-based applications, the workflows become intricate, requiring significant effort and expertise to design and implement. Automating these workflows using AutoGen can streamline the process and improve efficiency, enabling the creation of next-generation applications that leverage the full potential of LLMs. 
The framework supports conversations between multiple agents through automated chat, providing a solution to the challenge of orchestrating optimal workflows in a vast and complex design space.', 'role': 'user'}], summary='AutoGen is a framework developed by Microsoft Research to simplify the orchestration, optimization, and automation of large language model (LLM) workflows. The framework offers customizable and conversable agents that utilize advanced LLM capabilities, such as GPT-4, while also integrating with humans and tools to address limitations and enhance performance. As developers create more complex LLM-based applications, the workflows become intricate, requiring significant effort and expertise to design and implement. Automating these workflows using AutoGen can streamline the process and improve efficiency, enabling the creation of next-generation applications that leverage the full potential of LLMs. The framework supports conversations between multiple agents through automated chat, providing a solution to the challenge of orchestrating optimal workflows in a vast and complex design space.', cost=({'total_cost': 0}, {'total_cost': 0}), human_input=[])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task2 = \"Summarize these results\"\n", + "user_proxy.initiate_chat(web_surfer, message=task2, clear_history=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to web_surfer):\n", + "\n", + "Click the 'Getting Started' result\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION visit_page...\u001b[0m\n", + 
"\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n", + "\n", + "Address: https://microsoft.github.io/autogen/docs/Getting-Started/\n", + "Title: Getting Started | AutoGen\n", + "Viewport position: Showing page 1 of 1.\n", + "=======================\n", + "\n", + "\n", + "\n", + "Getting Started | AutoGen\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\n", + "* [Gallery](/autogen/docs/Gallery)\n", + "[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\n", + "[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\n", + "* [Installation](/autogen/docs/installation/)\n", + "* [LLM Configuration](/autogen/docs/llm_configuration)\n", + "* [Use Cases](#)\n", + "* [Contributing](/autogen/docs/Contribute)\n", + "* [Research](/autogen/docs/Research)\n", + "* [Migration Guide](/autogen/docs/Migration-Guide)\n", + "* \n", + "* Getting Started\n", + "On this pageGetting Started\n", + "===============\n", + "\n", + "\n", + "AutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. 
They can operate in various modes that employ combinations of LLMs, human inputs, and tools.\n", + "\n", + "\n", + "![AutoGen Overview](/autogen/assets/images/autogen_agentchat-250ca64b77b87e70d34766a080bf6ba8.png)\n", + "\n", + "\n", + "### Main Features[​](#main-features \"Direct link to Main Features\")\n", + "\n", + "\n", + "* AutoGen enables building next-gen LLM applications based on [multi-agent conversations](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) with minimal effort. It simplifies the orchestration, automation, and optimization of a complex LLM workflow. It maximizes the performance of LLM models and overcomes their weaknesses.\n", + "* It supports [diverse conversation patterns](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#supporting-diverse-conversation-patterns) for complex workflows. With customizable and conversable agents, developers can use AutoGen to build a wide range of conversation patterns concerning conversation autonomy,\n", + "the number of agents, and agent conversation topology.\n", + "* It provides a collection of working systems with different complexities. These systems span a [wide range of applications](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#diverse-applications-implemented-with-autogen) from various domains and complexities. This demonstrates how AutoGen can easily support diverse conversation patterns.\n", + "* AutoGen provides [enhanced LLM inference](https://microsoft.github.io/autogen/docs/Use-Cases/enhanced_inference#api-unification). 
It offers utilities like API unification and caching, and advanced usage patterns, such as error handling, multi-config inference, context programming, etc.\n", + "\n", + "\n", + "AutoGen is powered by collaborative [research studies](/autogen/docs/Research) from Microsoft, Penn State University, and University of Washington.\n", + "\n", + "\n", + "### Quickstart[​](#quickstart \"Direct link to Quickstart\")\n", + "\n", + "\n", + "Install from pip: `pip install pyautogen`. Find more options in [Installation](/autogen/docs/installation/).\n", + "For [code execution](/autogen/docs/FAQ#code-execution), we strongly recommend installing the python docker package, and using docker.\n", + "\n", + "\n", + "#### Multi-Agent Conversation Framework[​](#multi-agent-conversation-framework \"Direct link to Multi-Agent Conversation Framework\")\n", + "\n", + "\n", + "Autogen enables the next-gen LLM applications with a generic multi-agent conversation framework. It offers customizable and conversable agents which integrate LLMs, tools, and humans.\n", + "By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. 
For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\n", + "\n", + "\n", + "\n", + "```\n", + "from autogen import AssistantAgent, UserProxyAgent, config\\_list\\_from\\_json \n", + " \n", + "# Load LLM inference endpoints from an env variable or a file \n", + "# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints \n", + "# and OAI\\_CONFIG\\_LIST\\_sample.json \n", + "config\\_list = config\\_list\\_from\\_json(env\\_or\\_file=\"OAI\\_CONFIG\\_LIST\") \n", + "assistant = AssistantAgent(\"assistant\", llm\\_config={\"config\\_list\": config\\_list}) \n", + "user\\_proxy = UserProxyAgent(\"user\\_proxy\", code\\_execution\\_config={\"work\\_dir\": \"coding\", \"use\\_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended \n", + "user\\_proxy.initiate\\_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\") \n", + "# This initiates an automated chat between the two agents to solve the task \n", + "\n", + "```\n", + "\n", + "The figure below shows an example conversation flow with AutoGen.\n", + "![Agent Chat Example](/autogen/assets/images/chat_example-da70a7420ebc817ef9826fa4b1e80951.png)\n", + "\n", + "\n", + "* [Code examples](/autogen/docs/Examples).\n", + "* [Documentation](/autogen/docs/Use-Cases/agent_chat).\n", + "\n", + "\n", + "#### Enhanced LLM Inferences[​](#enhanced-llm-inferences \"Direct link to Enhanced LLM Inferences\")\n", + "\n", + "\n", + "Autogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers enhanced LLM inference with powerful functionalities like tuning, caching, error handling, templating. 
For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.\n", + "\n", + "\n", + "\n", + "```\n", + "# perform tuning for openai<1 \n", + "config, analysis = autogen.Completion.tune( \n", + " data=tune\\_data, \n", + " metric=\"success\", \n", + " mode=\"max\", \n", + " eval\\_func=eval\\_func, \n", + " inference\\_budget=0.05, \n", + " optimization\\_budget=3, \n", + " num\\_samples=-1, \n", + ") \n", + "# perform inference for a test instance \n", + "response = autogen.Completion.create(context=test\\_instance, \\*\\*config) \n", + "\n", + "```\n", + "\n", + "* [Code examples](/autogen/docs/Examples).\n", + "* [Documentation](/autogen/docs/Use-Cases/enhanced_inference).\n", + "\n", + "\n", + "### Where to Go Next ?[​](#where-to-go-next- \"Direct link to Where to Go Next ?\")\n", + "\n", + "\n", + "* Understand the use cases for [multi-agent conversation](/autogen/docs/Use-Cases/agent_chat) and [enhanced LLM inference](/autogen/docs/Use-Cases/enhanced_inference).\n", + "* Find [code examples](/autogen/docs/Examples).\n", + "* Read [SDK](/autogen/docs/reference/agentchat/conversable_agent/).\n", + "* Learn about [research](/autogen/docs/Research) around AutoGen.\n", + "* [Roadmap](https://github.com/orgs/microsoft/projects/989/views/3)\n", + "* Chat on [Discord](https://discord.gg/pAbnFJrkgZ).\n", + "* Follow on [Twitter](https://twitter.com/pyautogen).\n", + "\n", + "\n", + "If you like our project, please give it a [star](https://github.com/microsoft/autogen/stargazers) on GitHub. 
If you are interested in contributing, please read [Contributor's Guide](/autogen/docs/Contribute).\n", + "\n", + "\n", + "[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Getting-Started.md)[NextInstallation](/autogen/docs/installation/)* [Main Features](#main-features)\n", + "* [Quickstart](#quickstart)\n", + "* [Where to Go Next ?](#where-to-go-next-)\n", + "Community* [Discord](https://discord.gg/pAbnFJrkgZ)\n", + "* [Twitter](https://twitter.com/pyautogen)\n", + "Copyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': '\\nSearch the web for information about Microsoft AutoGen\\n', 'role': 'assistant'}, {'content': \"Address: bing: Microsoft AutoGen\\nTitle: Microsoft AutoGen - Search\\nViewport position: Showing page 1 of 1.\\n=======================\\nA Bing search for 'Microsoft AutoGen' found 8 results:\\n\\n## Web Results\\n1. [AutoGen: Enabling next-generation large language model applications](https://www.microsoft.com/en-us/research/blog/autogen-enabling-next-generation-large-language-model-applications/)\\nAutoGen is a Python package that simplifies the orchestration, optimization, and automation of large language model applications. It enables customizable and conversable agents that integrate with humans, tools, and other agents to solve tasks using GPT-4 and other advanced LLMs. Learn how to use AutoGen for code-based question answering, supply-chain optimization, conversational chess, and more.\\n\\n2. [GitHub - microsoft/autogen: Enable Next-Gen Large Language Model ...](https://github.com/microsoft/autogen)\\nAutoGen is a framework that enables the development of large language model applications using multiple agents that can converse with each other to solve tasks. 
It supports diverse conversation patterns, enhanced LLM inference, and customizable and conversable agents.\\n\\n3. [Getting Started | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/docs/Getting-Started/)\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. They can operate in various modes that employ combinations of LLMs, human inputs, and tools. Main Features\\n\\n4. [AutoGen | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/)\\nAutoGen is a tool that enables next-gen large language model applications by providing a high-level abstraction for building diverse and enhanced LLM workflows. It offers a collection of working systems for various domains and complexities, as well as enhanced LLM inference and optimization APIs.\\n\\n5. [AutoGen Studio: Interactively Explore Multi-Agent Workflows](https://microsoft.github.io/autogen/blog/2023/12/01/AutoGenStudio/)\\nAutoGen has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: AutoGen Studio.\\n\\n6. [[2308.08155] AutoGen: Enabling Next-Gen LLM Applications via Multi ...](https://arxiv.org/abs/2308.08155)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to perform tasks using various types of language models (LLMs). The framework supports natural language and code-based conversation patterns, and is effective for diverse applications such as mathematics, coding, question answering, and more.\\n\\n7. [Mastering AutoGen: A Comprehensive Guide to Next-Generation ... 
- Medium](https://medium.com/@krtarunsingh/mastering-autogen-a-comprehensive-guide-to-next-generation-language-model-applications-b375d9b4dc6d)\\nAutoGen is a framework by Microsoft that allows you to create applications that leverage large language models (LLMs) with multi-agent conversations, diverse patterns, and enhanced inference. Learn how to set up AutoGen, use its architecture, and apply its features in this comprehensive guide by Tarun Singh.\\n\\n8. [arXiv:2308.08155v2 [cs.AI] 3 Oct 2023](https://arxiv.org/pdf/2308.08155.pdf)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to solve tasks using multiple languages, tools, and human inputs. The framework supports flexible conversation patterns and natural or code-based programming for diverse applications of complexities and LLM capacities.\", 'role': 'user'}, {'content': 'Summarize these results', 'role': 'assistant'}, {'content': 'AutoGen is a framework developed by Microsoft Research to simplify the orchestration, optimization, and automation of large language model (LLM) workflows. The framework offers customizable and conversable agents that utilize advanced LLM capabilities, such as GPT-4, while also integrating with humans and tools to address limitations and enhance performance. As developers create more complex LLM-based applications, the workflows become intricate, requiring significant effort and expertise to design and implement. Automating these workflows using AutoGen can streamline the process and improve efficiency, enabling the creation of next-generation applications that leverage the full potential of LLMs. 
The framework supports conversations between multiple agents through automated chat, providing a solution to the challenge of orchestrating optimal workflows in a vast and complex design space.', 'role': 'user'}, {'content': \"Click the 'Getting Started' result\", 'role': 'assistant'}, {'content': 'Address: https://microsoft.github.io/autogen/docs/Getting-Started/\\nTitle: Getting Started | AutoGen\\nViewport position: Showing page 1 of 1.\\n=======================\\n\\n\\n\\nGetting Started | AutoGen\\n\\n\\n\\n\\n\\n\\n\\n[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\\n* [Gallery](/autogen/docs/Gallery)\\n[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\\n[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\\n* [Installation](/autogen/docs/installation/)\\n* [LLM Configuration](/autogen/docs/llm_configuration)\\n* [Use Cases](#)\\n* [Contributing](/autogen/docs/Contribute)\\n* [Research](/autogen/docs/Research)\\n* [Migration Guide](/autogen/docs/Migration-Guide)\\n* \\n* Getting Started\\nOn this pageGetting Started\\n===============\\n\\n\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. 
They can operate in various modes that employ combinations of LLMs, human inputs, and tools.\\n\\n\\n![AutoGen Overview](/autogen/assets/images/autogen_agentchat-250ca64b77b87e70d34766a080bf6ba8.png)\\n\\n\\n### Main Features[\\u200b](#main-features \"Direct link to Main Features\")\\n\\n\\n* AutoGen enables building next-gen LLM applications based on [multi-agent conversations](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) with minimal effort. It simplifies the orchestration, automation, and optimization of a complex LLM workflow. It maximizes the performance of LLM models and overcomes their weaknesses.\\n* It supports [diverse conversation patterns](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#supporting-diverse-conversation-patterns) for complex workflows. With customizable and conversable agents, developers can use AutoGen to build a wide range of conversation patterns concerning conversation autonomy,\\nthe number of agents, and agent conversation topology.\\n* It provides a collection of working systems with different complexities. These systems span a [wide range of applications](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#diverse-applications-implemented-with-autogen) from various domains and complexities. This demonstrates how AutoGen can easily support diverse conversation patterns.\\n* AutoGen provides [enhanced LLM inference](https://microsoft.github.io/autogen/docs/Use-Cases/enhanced_inference#api-unification). It offers utilities like API unification and caching, and advanced usage patterns, such as error handling, multi-config inference, context programming, etc.\\n\\n\\nAutoGen is powered by collaborative [research studies](/autogen/docs/Research) from Microsoft, Penn State University, and University of Washington.\\n\\n\\n### Quickstart[\\u200b](#quickstart \"Direct link to Quickstart\")\\n\\n\\nInstall from pip: `pip install pyautogen`. 
Find more options in [Installation](/autogen/docs/installation/).\\nFor [code execution](/autogen/docs/FAQ#code-execution), we strongly recommend installing the python docker package, and using docker.\\n\\n\\n#### Multi-Agent Conversation Framework[\\u200b](#multi-agent-conversation-framework \"Direct link to Multi-Agent Conversation Framework\")\\n\\n\\nAutogen enables the next-gen LLM applications with a generic multi-agent conversation framework. It offers customizable and conversable agents which integrate LLMs, tools, and humans.\\nBy automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\\n\\n\\n\\n```\\nfrom autogen import AssistantAgent, UserProxyAgent, config\\\\_list\\\\_from\\\\_json \\n \\n# Load LLM inference endpoints from an env variable or a file \\n# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints \\n# and OAI\\\\_CONFIG\\\\_LIST\\\\_sample.json \\nconfig\\\\_list = config\\\\_list\\\\_from\\\\_json(env\\\\_or\\\\_file=\"OAI\\\\_CONFIG\\\\_LIST\") \\nassistant = AssistantAgent(\"assistant\", llm\\\\_config={\"config\\\\_list\": config\\\\_list}) \\nuser\\\\_proxy = UserProxyAgent(\"user\\\\_proxy\", code\\\\_execution\\\\_config={\"work\\\\_dir\": \"coding\", \"use\\\\_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended \\nuser\\\\_proxy.initiate\\\\_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\") \\n# This initiates an automated chat between the two agents to solve the task \\n\\n```\\n\\nThe figure below shows an example conversation flow with AutoGen.\\n![Agent Chat Example](/autogen/assets/images/chat_example-da70a7420ebc817ef9826fa4b1e80951.png)\\n\\n\\n* [Code examples](/autogen/docs/Examples).\\n* 
[Documentation](/autogen/docs/Use-Cases/agent_chat).\\n\\n\\n#### Enhanced LLM Inferences[\\u200b](#enhanced-llm-inferences \"Direct link to Enhanced LLM Inferences\")\\n\\n\\nAutogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers enhanced LLM inference with powerful functionalities like tuning, caching, error handling, templating. For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.\\n\\n\\n\\n```\\n# perform tuning for openai<1 \\nconfig, analysis = autogen.Completion.tune( \\n data=tune\\\\_data, \\n metric=\"success\", \\n mode=\"max\", \\n eval\\\\_func=eval\\\\_func, \\n inference\\\\_budget=0.05, \\n optimization\\\\_budget=3, \\n num\\\\_samples=-1, \\n) \\n# perform inference for a test instance \\nresponse = autogen.Completion.create(context=test\\\\_instance, \\\\*\\\\*config) \\n\\n```\\n\\n* [Code examples](/autogen/docs/Examples).\\n* [Documentation](/autogen/docs/Use-Cases/enhanced_inference).\\n\\n\\n### Where to Go Next ?[\\u200b](#where-to-go-next- \"Direct link to Where to Go Next ?\")\\n\\n\\n* Understand the use cases for [multi-agent conversation](/autogen/docs/Use-Cases/agent_chat) and [enhanced LLM inference](/autogen/docs/Use-Cases/enhanced_inference).\\n* Find [code examples](/autogen/docs/Examples).\\n* Read [SDK](/autogen/docs/reference/agentchat/conversable_agent/).\\n* Learn about [research](/autogen/docs/Research) around AutoGen.\\n* [Roadmap](https://github.com/orgs/microsoft/projects/989/views/3)\\n* Chat on [Discord](https://discord.gg/pAbnFJrkgZ).\\n* Follow on [Twitter](https://twitter.com/pyautogen).\\n\\n\\nIf you like our project, please give it a [star](https://github.com/microsoft/autogen/stargazers) on GitHub. 
If you are interested in contributing, please read [Contributor\\'s Guide](/autogen/docs/Contribute).\\n\\n\\n[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Getting-Started.md)[NextInstallation](/autogen/docs/installation/)* [Main Features](#main-features)\\n* [Quickstart](#quickstart)\\n* [Where to Go Next ?](#where-to-go-next-)\\nCommunity* [Discord](https://discord.gg/pAbnFJrkgZ)\\n* [Twitter](https://twitter.com/pyautogen)\\nCopyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\\n', 'role': 'user'}], summary='Address: https://microsoft.github.io/autogen/docs/Getting-Started/\\nTitle: Getting Started | AutoGen\\nViewport position: Showing page 1 of 1.\\n=======================\\n\\n\\n\\nGetting Started | AutoGen\\n\\n\\n\\n\\n\\n\\n\\n[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\\n* [Gallery](/autogen/docs/Gallery)\\n[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\\n[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\\n* [Installation](/autogen/docs/installation/)\\n* [LLM Configuration](/autogen/docs/llm_configuration)\\n* [Use Cases](#)\\n* [Contributing](/autogen/docs/Contribute)\\n* [Research](/autogen/docs/Research)\\n* [Migration Guide](/autogen/docs/Migration-Guide)\\n* \\n* Getting Started\\nOn this pageGetting Started\\n===============\\n\\n\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. 
They can operate in various modes that employ combinations of LLMs, human inputs, and tools.\\n\\n\\n![AutoGen Overview](/autogen/assets/images/autogen_agentchat-250ca64b77b87e70d34766a080bf6ba8.png)\\n\\n\\n### Main Features[\\u200b](#main-features \"Direct link to Main Features\")\\n\\n\\n* AutoGen enables building next-gen LLM applications based on [multi-agent conversations](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) with minimal effort. It simplifies the orchestration, automation, and optimization of a complex LLM workflow. It maximizes the performance of LLM models and overcomes their weaknesses.\\n* It supports [diverse conversation patterns](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#supporting-diverse-conversation-patterns) for complex workflows. With customizable and conversable agents, developers can use AutoGen to build a wide range of conversation patterns concerning conversation autonomy,\\nthe number of agents, and agent conversation topology.\\n* It provides a collection of working systems with different complexities. These systems span a [wide range of applications](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#diverse-applications-implemented-with-autogen) from various domains and complexities. This demonstrates how AutoGen can easily support diverse conversation patterns.\\n* AutoGen provides [enhanced LLM inference](https://microsoft.github.io/autogen/docs/Use-Cases/enhanced_inference#api-unification). It offers utilities like API unification and caching, and advanced usage patterns, such as error handling, multi-config inference, context programming, etc.\\n\\n\\nAutoGen is powered by collaborative [research studies](/autogen/docs/Research) from Microsoft, Penn State University, and University of Washington.\\n\\n\\n### Quickstart[\\u200b](#quickstart \"Direct link to Quickstart\")\\n\\n\\nInstall from pip: `pip install pyautogen`. 
Find more options in [Installation](/autogen/docs/installation/).\\nFor [code execution](/autogen/docs/FAQ#code-execution), we strongly recommend installing the python docker package, and using docker.\\n\\n\\n#### Multi-Agent Conversation Framework[\\u200b](#multi-agent-conversation-framework \"Direct link to Multi-Agent Conversation Framework\")\\n\\n\\nAutogen enables the next-gen LLM applications with a generic multi-agent conversation framework. It offers customizable and conversable agents which integrate LLMs, tools, and humans.\\nBy automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\\n\\n\\n\\n```\\nfrom autogen import AssistantAgent, UserProxyAgent, config\\\\_list\\\\_from\\\\_json \\n \\n# Load LLM inference endpoints from an env variable or a file \\n# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints \\n# and OAI\\\\_CONFIG\\\\_LIST\\\\_sample.json \\nconfig\\\\_list = config\\\\_list\\\\_from\\\\_json(env\\\\_or\\\\_file=\"OAI\\\\_CONFIG\\\\_LIST\") \\nassistant = AssistantAgent(\"assistant\", llm\\\\_config={\"config\\\\_list\": config\\\\_list}) \\nuser\\\\_proxy = UserProxyAgent(\"user\\\\_proxy\", code\\\\_execution\\\\_config={\"work\\\\_dir\": \"coding\", \"use\\\\_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended \\nuser\\\\_proxy.initiate\\\\_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\") \\n# This initiates an automated chat between the two agents to solve the task \\n\\n```\\n\\nThe figure below shows an example conversation flow with AutoGen.\\n![Agent Chat Example](/autogen/assets/images/chat_example-da70a7420ebc817ef9826fa4b1e80951.png)\\n\\n\\n* [Code examples](/autogen/docs/Examples).\\n* 
[Documentation](/autogen/docs/Use-Cases/agent_chat).\\n\\n\\n#### Enhanced LLM Inferences[\\u200b](#enhanced-llm-inferences \"Direct link to Enhanced LLM Inferences\")\\n\\n\\nAutogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers enhanced LLM inference with powerful functionalities like tuning, caching, error handling, templating. For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.\\n\\n\\n\\n```\\n# perform tuning for openai<1 \\nconfig, analysis = autogen.Completion.tune( \\n data=tune\\\\_data, \\n metric=\"success\", \\n mode=\"max\", \\n eval\\\\_func=eval\\\\_func, \\n inference\\\\_budget=0.05, \\n optimization\\\\_budget=3, \\n num\\\\_samples=-1, \\n) \\n# perform inference for a test instance \\nresponse = autogen.Completion.create(context=test\\\\_instance, \\\\*\\\\*config) \\n\\n```\\n\\n* [Code examples](/autogen/docs/Examples).\\n* [Documentation](/autogen/docs/Use-Cases/enhanced_inference).\\n\\n\\n### Where to Go Next ?[\\u200b](#where-to-go-next- \"Direct link to Where to Go Next ?\")\\n\\n\\n* Understand the use cases for [multi-agent conversation](/autogen/docs/Use-Cases/agent_chat) and [enhanced LLM inference](/autogen/docs/Use-Cases/enhanced_inference).\\n* Find [code examples](/autogen/docs/Examples).\\n* Read [SDK](/autogen/docs/reference/agentchat/conversable_agent/).\\n* Learn about [research](/autogen/docs/Research) around AutoGen.\\n* [Roadmap](https://github.com/orgs/microsoft/projects/989/views/3)\\n* Chat on [Discord](https://discord.gg/pAbnFJrkgZ).\\n* Follow on [Twitter](https://twitter.com/pyautogen).\\n\\n\\nIf you like our project, please give it a [star](https://github.com/microsoft/autogen/stargazers) on GitHub. 
If you are interested in contributing, please read [Contributor\\'s Guide](/autogen/docs/Contribute).\\n\\n\\n[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Getting-Started.md)[NextInstallation](/autogen/docs/installation/)* [Main Features](#main-features)\\n* [Quickstart](#quickstart)\\n* [Where to Go Next ?](#where-to-go-next-)\\nCommunity* [Discord](https://discord.gg/pAbnFJrkgZ)\\n* [Twitter](https://twitter.com/pyautogen)\\nCopyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\\n', cost=({'total_cost': 0}, {'total_cost': 0}), human_input=[])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task3 = \"Click the 'Getting Started' result\"\n", + "user_proxy.initiate_chat(web_surfer, message=task3, clear_history=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Part 2: Let's look at the actual page rendered" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display_binary_image(web_surfer.browser.driver.get_screenshot_as_png())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's scroll down and look again" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "task5 = \"\"\"Scroll down.\"\"\"\n", + "user_proxy.initiate_chat(web_surfer, message=task5, clear_history=False)\n", + "\n", + "# We give it few seconds before viewing the browser\n", + "sleep(3)\n", + "display_binary_image(web_surfer.browser.driver.get_screenshot_as_png())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's test our navigation using the rendered page\n", + "Note: this does require vision capabilities" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "\u001b[33muser_proxy\u001b[0m (to web_surfer):\n", + "\n", + "Click the 'research studies' link\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION visit_page...\u001b[0m\n", + "\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n", + "\n", + "Address: https://microsoft.github.io/autogen/docs/Research\n", + "Title: Research | AutoGen\n", + "Viewport position: Showing page 1 of 1.\n", + "=======================\n", + "\n", + "\n", + "\n", + "Research | AutoGen\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\n", + "* [Gallery](/autogen/docs/Gallery)\n", + "[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\n", + "[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\n", + "* [Installation](/autogen/docs/installation/)\n", + "* [LLM Configuration](/autogen/docs/llm_configuration)\n", + "* [Use Cases](#)\n", + "* [Contributing](/autogen/docs/Contribute)\n", + "* [Research](/autogen/docs/Research)\n", + "* [Migration Guide](/autogen/docs/Migration-Guide)\n", + "* \n", + "* Research\n", + "Research\n", + "========\n", + "\n", + "\n", + "For technical details, please check our technical report and research publications.\n", + "\n", + "\n", + "* [AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework](https://arxiv.org/abs/2308.08155). 
Qingyun Wu, Gagan Bansal, Jieyu Zhang, Yiran Wu, Shaokun Zhang, Erkang Zhu, Beibin Li, Li Jiang, Xiaoyun Zhang and Chi Wang. ArXiv 2023.\n", + "\n", + "\n", + "\n", + "```\n", + "@inproceedings{wu2023autogen, \n", + " title={AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework}, \n", + " author={Qingyun Wu and Gagan Bansal and Jieyu Zhang and Yiran Wu and Shaokun Zhang and Erkang Zhu and Beibin Li and Li Jiang and Xiaoyun Zhang and Chi Wang}, \n", + " year={2023}, \n", + " eprint={2308.08155}, \n", + " archivePrefix={arXiv}, \n", + " primaryClass={cs.AI} \n", + "} \n", + "\n", + "```\n", + "\n", + "* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. AutoML'23.\n", + "\n", + "\n", + "\n", + "```\n", + "@inproceedings{wang2023EcoOptiGen, \n", + " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference}, \n", + " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah}, \n", + " year={2023}, \n", + " booktitle={AutoML'23}, \n", + "} \n", + "\n", + "```\n", + "\n", + "* [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\n", + "\n", + "\n", + "\n", + "```\n", + "@inproceedings{wu2023empirical, \n", + " title={An Empirical Study on Challenging Math Problem Solving with GPT-4}, \n", + " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang}, \n", + " year={2023}, \n", + " booktitle={ArXiv preprint arXiv:2306.01337}, \n", + "} \n", + "\n", + "```\n", + "\n", + "* [EcoAssistant: Using LLM Assistant More Affordably and Accurately](https://arxiv.org/abs/2310.03046). 
Jieyu Zhang, Ranjay Krishna, Ahmed H. Awadallah, Chi Wang. ArXiv preprint arXiv:2310.03046 (2023).\n", + "\n", + "\n", + "\n", + "```\n", + "@inproceedings{zhang2023ecoassistant, \n", + " title={EcoAssistant: Using LLM Assistant More Affordably and Accurately}, \n", + " author={Zhang, Jieyu and Krishna, Ranjay and Awadallah, Ahmed H and Wang, Chi}, \n", + " year={2023}, \n", + " booktitle={ArXiv preprint arXiv:2310.03046}, \n", + "} \n", + "\n", + "```\n", + "[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Research.md)[PreviousContributing](/autogen/docs/Contribute)[NextMigration Guide](/autogen/docs/Migration-Guide)Community* [Discord](https://discord.gg/pAbnFJrkgZ)\n", + "* [Twitter](https://twitter.com/pyautogen)\n", + "Copyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "data": { + "text/plain": [ + "ChatResult(chat_history=[{'content': '\\nSearch the web for information about Microsoft AutoGen\\n', 'role': 'assistant'}, {'content': \"Address: bing: Microsoft AutoGen\\nTitle: Microsoft AutoGen - Search\\nViewport position: Showing page 1 of 1.\\n=======================\\nA Bing search for 'Microsoft AutoGen' found 8 results:\\n\\n## Web Results\\n1. [AutoGen: Enabling next-generation large language model applications](https://www.microsoft.com/en-us/research/blog/autogen-enabling-next-generation-large-language-model-applications/)\\nAutoGen is a Python package that simplifies the orchestration, optimization, and automation of large language model applications. It enables customizable and conversable agents that integrate with humans, tools, and other agents to solve tasks using GPT-4 and other advanced LLMs. Learn how to use AutoGen for code-based question answering, supply-chain optimization, conversational chess, and more.\\n\\n2. 
[GitHub - microsoft/autogen: Enable Next-Gen Large Language Model ...](https://github.com/microsoft/autogen)\\nAutoGen is a framework that enables the development of large language model applications using multiple agents that can converse with each other to solve tasks. It supports diverse conversation patterns, enhanced LLM inference, and customizable and conversable agents.\\n\\n3. [Getting Started | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/docs/Getting-Started/)\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. They can operate in various modes that employ combinations of LLMs, human inputs, and tools. Main Features\\n\\n4. [AutoGen | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/)\\nAutoGen is a tool that enables next-gen large language model applications by providing a high-level abstraction for building diverse and enhanced LLM workflows. It offers a collection of working systems for various domains and complexities, as well as enhanced LLM inference and optimization APIs.\\n\\n5. [AutoGen Studio: Interactively Explore Multi-Agent Workflows](https://microsoft.github.io/autogen/blog/2023/12/01/AutoGenStudio/)\\nAutoGen has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: AutoGen Studio.\\n\\n6. [[2308.08155] AutoGen: Enabling Next-Gen LLM Applications via Multi ...](https://arxiv.org/abs/2308.08155)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to perform tasks using various types of language models (LLMs). 
The framework supports natural language and code-based conversation patterns, and is effective for diverse applications such as mathematics, coding, question answering, and more.\\n\\n7. [Mastering AutoGen: A Comprehensive Guide to Next-Generation ... - Medium](https://medium.com/@krtarunsingh/mastering-autogen-a-comprehensive-guide-to-next-generation-language-model-applications-b375d9b4dc6d)\\nAutoGen is a framework by Microsoft that allows you to create applications that leverage large language models (LLMs) with multi-agent conversations, diverse patterns, and enhanced inference. Learn how to set up AutoGen, use its architecture, and apply its features in this comprehensive guide by Tarun Singh.\\n\\n8. [arXiv:2308.08155v2 [cs.AI] 3 Oct 2023](https://arxiv.org/pdf/2308.08155.pdf)\\nAutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to solve tasks using multiple languages, tools, and human inputs. The framework supports flexible conversation patterns and natural or code-based programming for diverse applications of complexities and LLM capacities.\", 'role': 'user'}, {'content': 'Summarize these results', 'role': 'assistant'}, {'content': 'AutoGen is a framework developed by Microsoft Research to simplify the orchestration, optimization, and automation of large language model (LLM) workflows. The framework offers customizable and conversable agents that utilize advanced LLM capabilities, such as GPT-4, while also integrating with humans and tools to address limitations and enhance performance. As developers create more complex LLM-based applications, the workflows become intricate, requiring significant effort and expertise to design and implement. Automating these workflows using AutoGen can streamline the process and improve efficiency, enabling the creation of next-generation applications that leverage the full potential of LLMs. 
The framework supports conversations between multiple agents through automated chat, providing a solution to the challenge of orchestrating optimal workflows in a vast and complex design space.', 'role': 'user'}, {'content': \"Click the 'Getting Started' result\", 'role': 'assistant'}, {'content': 'Address: https://microsoft.github.io/autogen/docs/Getting-Started/\\nTitle: Getting Started | AutoGen\\nViewport position: Showing page 1 of 1.\\n=======================\\n\\n\\n\\nGetting Started | AutoGen\\n\\n\\n\\n\\n\\n\\n\\n[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\\n* [Gallery](/autogen/docs/Gallery)\\n[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\\n[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\\n* [Installation](/autogen/docs/installation/)\\n* [LLM Configuration](/autogen/docs/llm_configuration)\\n* [Use Cases](#)\\n* [Contributing](/autogen/docs/Contribute)\\n* [Research](/autogen/docs/Research)\\n* [Migration Guide](/autogen/docs/Migration-Guide)\\n* \\n* Getting Started\\nOn this pageGetting Started\\n===============\\n\\n\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. 
They can operate in various modes that employ combinations of LLMs, human inputs, and tools.\\n\\n\\n![AutoGen Overview](/autogen/assets/images/autogen_agentchat-250ca64b77b87e70d34766a080bf6ba8.png)\\n\\n\\n### Main Features[\\u200b](#main-features \"Direct link to Main Features\")\\n\\n\\n* AutoGen enables building next-gen LLM applications based on [multi-agent conversations](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) with minimal effort. It simplifies the orchestration, automation, and optimization of a complex LLM workflow. It maximizes the performance of LLM models and overcomes their weaknesses.\\n* It supports [diverse conversation patterns](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#supporting-diverse-conversation-patterns) for complex workflows. With customizable and conversable agents, developers can use AutoGen to build a wide range of conversation patterns concerning conversation autonomy,\\nthe number of agents, and agent conversation topology.\\n* It provides a collection of working systems with different complexities. These systems span a [wide range of applications](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#diverse-applications-implemented-with-autogen) from various domains and complexities. This demonstrates how AutoGen can easily support diverse conversation patterns.\\n* AutoGen provides [enhanced LLM inference](https://microsoft.github.io/autogen/docs/Use-Cases/enhanced_inference#api-unification). It offers utilities like API unification and caching, and advanced usage patterns, such as error handling, multi-config inference, context programming, etc.\\n\\n\\nAutoGen is powered by collaborative [research studies](/autogen/docs/Research) from Microsoft, Penn State University, and University of Washington.\\n\\n\\n### Quickstart[\\u200b](#quickstart \"Direct link to Quickstart\")\\n\\n\\nInstall from pip: `pip install pyautogen`. 
Find more options in [Installation](/autogen/docs/installation/).\\nFor [code execution](/autogen/docs/FAQ#code-execution), we strongly recommend installing the python docker package, and using docker.\\n\\n\\n#### Multi-Agent Conversation Framework[\\u200b](#multi-agent-conversation-framework \"Direct link to Multi-Agent Conversation Framework\")\\n\\n\\nAutogen enables the next-gen LLM applications with a generic multi-agent conversation framework. It offers customizable and conversable agents which integrate LLMs, tools, and humans.\\nBy automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\\n\\n\\n\\n```\\nfrom autogen import AssistantAgent, UserProxyAgent, config\\\\_list\\\\_from\\\\_json \\n \\n# Load LLM inference endpoints from an env variable or a file \\n# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints \\n# and OAI\\\\_CONFIG\\\\_LIST\\\\_sample.json \\nconfig\\\\_list = config\\\\_list\\\\_from\\\\_json(env\\\\_or\\\\_file=\"OAI\\\\_CONFIG\\\\_LIST\") \\nassistant = AssistantAgent(\"assistant\", llm\\\\_config={\"config\\\\_list\": config\\\\_list}) \\nuser\\\\_proxy = UserProxyAgent(\"user\\\\_proxy\", code\\\\_execution\\\\_config={\"work\\\\_dir\": \"coding\", \"use\\\\_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended \\nuser\\\\_proxy.initiate\\\\_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\") \\n# This initiates an automated chat between the two agents to solve the task \\n\\n```\\n\\nThe figure below shows an example conversation flow with AutoGen.\\n![Agent Chat Example](/autogen/assets/images/chat_example-da70a7420ebc817ef9826fa4b1e80951.png)\\n\\n\\n* [Code examples](/autogen/docs/Examples).\\n* 
[Documentation](/autogen/docs/Use-Cases/agent_chat).\\n\\n\\n#### Enhanced LLM Inferences[\\u200b](#enhanced-llm-inferences \"Direct link to Enhanced LLM Inferences\")\\n\\n\\nAutogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers enhanced LLM inference with powerful functionalities like tuning, caching, error handling, templating. For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.\\n\\n\\n\\n```\\n# perform tuning for openai<1 \\nconfig, analysis = autogen.Completion.tune( \\n data=tune\\\\_data, \\n metric=\"success\", \\n mode=\"max\", \\n eval\\\\_func=eval\\\\_func, \\n inference\\\\_budget=0.05, \\n optimization\\\\_budget=3, \\n num\\\\_samples=-1, \\n) \\n# perform inference for a test instance \\nresponse = autogen.Completion.create(context=test\\\\_instance, \\\\*\\\\*config) \\n\\n```\\n\\n* [Code examples](/autogen/docs/Examples).\\n* [Documentation](/autogen/docs/Use-Cases/enhanced_inference).\\n\\n\\n### Where to Go Next ?[\\u200b](#where-to-go-next- \"Direct link to Where to Go Next ?\")\\n\\n\\n* Understand the use cases for [multi-agent conversation](/autogen/docs/Use-Cases/agent_chat) and [enhanced LLM inference](/autogen/docs/Use-Cases/enhanced_inference).\\n* Find [code examples](/autogen/docs/Examples).\\n* Read [SDK](/autogen/docs/reference/agentchat/conversable_agent/).\\n* Learn about [research](/autogen/docs/Research) around AutoGen.\\n* [Roadmap](https://github.com/orgs/microsoft/projects/989/views/3)\\n* Chat on [Discord](https://discord.gg/pAbnFJrkgZ).\\n* Follow on [Twitter](https://twitter.com/pyautogen).\\n\\n\\nIf you like our project, please give it a [star](https://github.com/microsoft/autogen/stargazers) on GitHub. 
If you are interested in contributing, please read [Contributor\\'s Guide](/autogen/docs/Contribute).\\n\\n\\n[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Getting-Started.md)[NextInstallation](/autogen/docs/installation/)* [Main Features](#main-features)\\n* [Quickstart](#quickstart)\\n* [Where to Go Next ?](#where-to-go-next-)\\nCommunity* [Discord](https://discord.gg/pAbnFJrkgZ)\\n* [Twitter](https://twitter.com/pyautogen)\\nCopyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\\n', 'role': 'user'}, {'content': 'Scroll down.', 'role': 'assistant'}, {'content': 'Address: https://microsoft.github.io/autogen/docs/Getting-Started/\\nTitle: Getting Started | AutoGen\\nViewport position: Showing page 1 of 1.\\n=======================\\n\\n\\n\\nGetting Started | AutoGen\\n\\n\\n\\n\\n\\n\\n\\n[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\\n* [Gallery](/autogen/docs/Gallery)\\n[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\\n[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\\n* [Installation](/autogen/docs/installation/)\\n* [LLM Configuration](/autogen/docs/llm_configuration)\\n* [Use Cases](#)\\n* [Contributing](/autogen/docs/Contribute)\\n* [Research](/autogen/docs/Research)\\n* [Migration Guide](/autogen/docs/Migration-Guide)\\n* \\n* Getting Started\\nOn this pageGetting Started\\n===============\\n\\n\\nAutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. 
AutoGen agents are customizable, conversable, and seamlessly allow human participation. They can operate in various modes that employ combinations of LLMs, human inputs, and tools.\\n\\n\\n![AutoGen Overview](/autogen/assets/images/autogen_agentchat-250ca64b77b87e70d34766a080bf6ba8.png)\\n\\n\\n### Main Features[\\u200b](#main-features \"Direct link to Main Features\")\\n\\n\\n* AutoGen enables building next-gen LLM applications based on [multi-agent conversations](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) with minimal effort. It simplifies the orchestration, automation, and optimization of a complex LLM workflow. It maximizes the performance of LLM models and overcomes their weaknesses.\\n* It supports [diverse conversation patterns](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#supporting-diverse-conversation-patterns) for complex workflows. With customizable and conversable agents, developers can use AutoGen to build a wide range of conversation patterns concerning conversation autonomy,\\nthe number of agents, and agent conversation topology.\\n* It provides a collection of working systems with different complexities. These systems span a [wide range of applications](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#diverse-applications-implemented-with-autogen) from various domains and complexities. This demonstrates how AutoGen can easily support diverse conversation patterns.\\n* AutoGen provides [enhanced LLM inference](https://microsoft.github.io/autogen/docs/Use-Cases/enhanced_inference#api-unification). 
It offers utilities like API unification and caching, and advanced usage patterns, such as error handling, multi-config inference, context programming, etc.\\n\\n\\nAutoGen is powered by collaborative [research studies](/autogen/docs/Research) from Microsoft, Penn State University, and University of Washington.\\n\\n\\n### Quickstart[\\u200b](#quickstart \"Direct link to Quickstart\")\\n\\n\\nInstall from pip: `pip install pyautogen`. Find more options in [Installation](/autogen/docs/installation/).\\nFor [code execution](/autogen/docs/FAQ#code-execution), we strongly recommend installing the python docker package, and using docker.\\n\\n\\n#### Multi-Agent Conversation Framework[\\u200b](#multi-agent-conversation-framework \"Direct link to Multi-Agent Conversation Framework\")\\n\\n\\nAutogen enables the next-gen LLM applications with a generic multi-agent conversation framework. It offers customizable and conversable agents which integrate LLMs, tools, and humans.\\nBy automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. 
For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\\n\\n\\n\\n```\\nfrom autogen import AssistantAgent, UserProxyAgent, config\\\\_list\\\\_from\\\\_json \\n \\n# Load LLM inference endpoints from an env variable or a file \\n# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints \\n# and OAI\\\\_CONFIG\\\\_LIST\\\\_sample.json \\nconfig\\\\_list = config\\\\_list\\\\_from\\\\_json(env\\\\_or\\\\_file=\"OAI\\\\_CONFIG\\\\_LIST\") \\nassistant = AssistantAgent(\"assistant\", llm\\\\_config={\"config\\\\_list\": config\\\\_list}) \\nuser\\\\_proxy = UserProxyAgent(\"user\\\\_proxy\", code\\\\_execution\\\\_config={\"work\\\\_dir\": \"coding\", \"use\\\\_docker\": False}) # IMPORTANT: set to True to run code in docker, recommended \\nuser\\\\_proxy.initiate\\\\_chat(assistant, message=\"Plot a chart of NVDA and TESLA stock price change YTD.\") \\n# This initiates an automated chat between the two agents to solve the task \\n\\n```\\n\\nThe figure below shows an example conversation flow with AutoGen.\\n![Agent Chat Example](/autogen/assets/images/chat_example-da70a7420ebc817ef9826fa4b1e80951.png)\\n\\n\\n* [Code examples](/autogen/docs/Examples).\\n* [Documentation](/autogen/docs/Use-Cases/agent_chat).\\n\\n\\n#### Enhanced LLM Inferences[\\u200b](#enhanced-llm-inferences \"Direct link to Enhanced LLM Inferences\")\\n\\n\\nAutogen also helps maximize the utility out of the expensive LLMs such as ChatGPT and GPT-4. It offers enhanced LLM inference with powerful functionalities like tuning, caching, error handling, templating. 
For example, you can optimize generations by LLM with your own tuning data, success metrics and budgets.\\n\\n\\n\\n```\\n# perform tuning for openai<1 \\nconfig, analysis = autogen.Completion.tune( \\n data=tune\\\\_data, \\n metric=\"success\", \\n mode=\"max\", \\n eval\\\\_func=eval\\\\_func, \\n inference\\\\_budget=0.05, \\n optimization\\\\_budget=3, \\n num\\\\_samples=-1, \\n) \\n# perform inference for a test instance \\nresponse = autogen.Completion.create(context=test\\\\_instance, \\\\*\\\\*config) \\n\\n```\\n\\n* [Code examples](/autogen/docs/Examples).\\n* [Documentation](/autogen/docs/Use-Cases/enhanced_inference).\\n\\n\\n### Where to Go Next ?[\\u200b](#where-to-go-next- \"Direct link to Where to Go Next ?\")\\n\\n\\n* Understand the use cases for [multi-agent conversation](/autogen/docs/Use-Cases/agent_chat) and [enhanced LLM inference](/autogen/docs/Use-Cases/enhanced_inference).\\n* Find [code examples](/autogen/docs/Examples).\\n* Read [SDK](/autogen/docs/reference/agentchat/conversable_agent/).\\n* Learn about [research](/autogen/docs/Research) around AutoGen.\\n* [Roadmap](https://github.com/orgs/microsoft/projects/989/views/3)\\n* Chat on [Discord](https://discord.gg/pAbnFJrkgZ).\\n* Follow on [Twitter](https://twitter.com/pyautogen).\\n\\n\\nIf you like our project, please give it a [star](https://github.com/microsoft/autogen/stargazers) on GitHub. 
If you are interested in contributing, please read [Contributor\\'s Guide](/autogen/docs/Contribute).\\n\\n\\n[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Getting-Started.md)[NextInstallation](/autogen/docs/installation/)* [Main Features](#main-features)\\n* [Quickstart](#quickstart)\\n* [Where to Go Next ?](#where-to-go-next-)\\nCommunity* [Discord](https://discord.gg/pAbnFJrkgZ)\\n* [Twitter](https://twitter.com/pyautogen)\\nCopyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\\n', 'role': 'user'}, {'content': \"Click the 'research studies' link\", 'role': 'assistant'}, {'content': \"Address: https://microsoft.github.io/autogen/docs/Research\\nTitle: Research | AutoGen\\nViewport position: Showing page 1 of 1.\\n=======================\\n\\n\\n\\nResearch | AutoGen\\n\\n\\n\\n\\n\\n\\n\\n[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\\n* [Gallery](/autogen/docs/Gallery)\\n[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\\n[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\\n* [Installation](/autogen/docs/installation/)\\n* [LLM Configuration](/autogen/docs/llm_configuration)\\n* [Use Cases](#)\\n* [Contributing](/autogen/docs/Contribute)\\n* [Research](/autogen/docs/Research)\\n* [Migration Guide](/autogen/docs/Migration-Guide)\\n* \\n* Research\\nResearch\\n========\\n\\n\\nFor technical details, please check our technical report and research publications.\\n\\n\\n* [AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework](https://arxiv.org/abs/2308.08155). 
Qingyun Wu, Gagan Bansal, Jieyu Zhang, Yiran Wu, Shaokun Zhang, Erkang Zhu, Beibin Li, Li Jiang, Xiaoyun Zhang and Chi Wang. ArXiv 2023.\\n\\n\\n\\n```\\n@inproceedings{wu2023autogen, \\n title={AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework}, \\n author={Qingyun Wu and Gagan Bansal and Jieyu Zhang and Yiran Wu and Shaokun Zhang and Erkang Zhu and Beibin Li and Li Jiang and Xiaoyun Zhang and Chi Wang}, \\n year={2023}, \\n eprint={2308.08155}, \\n archivePrefix={arXiv}, \\n primaryClass={cs.AI} \\n} \\n\\n```\\n\\n* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. AutoML'23.\\n\\n\\n\\n```\\n@inproceedings{wang2023EcoOptiGen, \\n title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference}, \\n author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah}, \\n year={2023}, \\n booktitle={AutoML'23}, \\n} \\n\\n```\\n\\n* [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\\n\\n\\n\\n```\\n@inproceedings{wu2023empirical, \\n title={An Empirical Study on Challenging Math Problem Solving with GPT-4}, \\n author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang}, \\n year={2023}, \\n booktitle={ArXiv preprint arXiv:2306.01337}, \\n} \\n\\n```\\n\\n* [EcoAssistant: Using LLM Assistant More Affordably and Accurately](https://arxiv.org/abs/2310.03046). Jieyu Zhang, Ranjay Krishna, Ahmed H. Awadallah, Chi Wang. 
ArXiv preprint arXiv:2310.03046 (2023).\\n\\n\\n\\n```\\n@inproceedings{zhang2023ecoassistant, \\n title={EcoAssistant: Using LLM Assistant More Affordably and Accurately}, \\n author={Zhang, Jieyu and Krishna, Ranjay and Awadallah, Ahmed H and Wang, Chi}, \\n year={2023}, \\n booktitle={ArXiv preprint arXiv:2310.03046}, \\n} \\n\\n```\\n[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Research.md)[PreviousContributing](/autogen/docs/Contribute)[NextMigration Guide](/autogen/docs/Migration-Guide)Community* [Discord](https://discord.gg/pAbnFJrkgZ)\\n* [Twitter](https://twitter.com/pyautogen)\\nCopyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\\n\", 'role': 'user'}], summary=\"Address: https://microsoft.github.io/autogen/docs/Research\\nTitle: Research | AutoGen\\nViewport position: Showing page 1 of 1.\\n=======================\\n\\n\\n\\nResearch | AutoGen\\n\\n\\n\\n\\n\\n\\n\\n[Skip to main content](#__docusaurus_skipToContent_fallback)[![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)[Resources](#)* [Ecosystem](/autogen/docs/Ecosystem)\\n* [Gallery](/autogen/docs/Gallery)\\n[Other Languages](#)* [Dotnet](https://microsoft.github.io/autogen-for-net/)\\n[GitHub](https://github.com/microsoft/autogen)`⌘``K`* [Getting Started](/autogen/docs/Getting-Started)\\n* [Installation](/autogen/docs/installation/)\\n* [LLM Configuration](/autogen/docs/llm_configuration)\\n* [Use Cases](#)\\n* [Contributing](/autogen/docs/Contribute)\\n* [Research](/autogen/docs/Research)\\n* [Migration Guide](/autogen/docs/Migration-Guide)\\n* \\n* Research\\nResearch\\n========\\n\\n\\nFor technical details, please check our technical report and research publications.\\n\\n\\n* [AutoGen: Enabling Next-Gen LLM Applications via 
Multi-Agent Conversation Framework](https://arxiv.org/abs/2308.08155). Qingyun Wu, Gagan Bansal, Jieyu Zhang, Yiran Wu, Shaokun Zhang, Erkang Zhu, Beibin Li, Li Jiang, Xiaoyun Zhang and Chi Wang. ArXiv 2023.\\n\\n\\n\\n```\\n@inproceedings{wu2023autogen, \\n title={AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework}, \\n author={Qingyun Wu and Gagan Bansal and Jieyu Zhang and Yiran Wu and Shaokun Zhang and Erkang Zhu and Beibin Li and Li Jiang and Xiaoyun Zhang and Chi Wang}, \\n year={2023}, \\n eprint={2308.08155}, \\n archivePrefix={arXiv}, \\n primaryClass={cs.AI} \\n} \\n\\n```\\n\\n* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. AutoML'23.\\n\\n\\n\\n```\\n@inproceedings{wang2023EcoOptiGen, \\n title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference}, \\n author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah}, \\n year={2023}, \\n booktitle={AutoML'23}, \\n} \\n\\n```\\n\\n* [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\\n\\n\\n\\n```\\n@inproceedings{wu2023empirical, \\n title={An Empirical Study on Challenging Math Problem Solving with GPT-4}, \\n author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang}, \\n year={2023}, \\n booktitle={ArXiv preprint arXiv:2306.01337}, \\n} \\n\\n```\\n\\n* [EcoAssistant: Using LLM Assistant More Affordably and Accurately](https://arxiv.org/abs/2310.03046). Jieyu Zhang, Ranjay Krishna, Ahmed H. Awadallah, Chi Wang. 
ArXiv preprint arXiv:2310.03046 (2023).\\n\\n\\n\\n```\\n@inproceedings{zhang2023ecoassistant, \\n title={EcoAssistant: Using LLM Assistant More Affordably and Accurately}, \\n author={Zhang, Jieyu and Krishna, Ranjay and Awadallah, Ahmed H and Wang, Chi}, \\n year={2023}, \\n booktitle={ArXiv preprint arXiv:2310.03046}, \\n} \\n\\n```\\n[Edit this page](https://github.com/microsoft/autogen/edit/main/website/docs/Research.md)[PreviousContributing](/autogen/docs/Contribute)[NextMigration Guide](/autogen/docs/Migration-Guide)Community* [Discord](https://discord.gg/pAbnFJrkgZ)\\n* [Twitter](https://twitter.com/pyautogen)\\nCopyright © 2024 AutoGen Authors | [Privacy and Cookies](https://go.microsoft.com/fwlink/?LinkId=521839)\\n\", cost=({'total_cost': 0}, {'total_cost': 0}), human_input=[])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task6 = \"Click the 'research studies' link\"\n", + "user_proxy.initiate_chat(web_surfer, message=task6, clear_history=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Show us the results of that action" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display_binary_image(web_surfer.browser.driver.get_screenshot_as_png())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Amazing! 
Agent navigation on the web still works with the full desktop browser which is great news!\n", + "### And we can always still display the text on screen if our use-case benefited from that" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display_binary_image(web_surfer.browser.driver.get_screenshot_as_png())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleanup process\n", + "To ensure that we have no lingering processes in the background, we can shutdown the browser" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Gracefully shut down our headless desktop browser\n", + "web_surfer.close_the_browser()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/setup.py b/setup.py index 2f8ecfdf5fb..3d153af4393 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ "teachable": ["chromadb"], "lmm": ["replicate", "pillow"], "graph": ["networkx", "matplotlib"], - "websurfer": ["beautifulsoup4", "markdownify", "pdfminer.six", "pathvalidate", "selenium"], + "websurfer": ["beautifulsoup4", "markdownify", "pdfminer.six", "pathvalidate", "selenium", "arxiv", "requests"], "redis": ["redis"], "ipython": ["jupyter-client>=8.6.0", "ipykernel>=6.29.0"], }, From 5602958e5af136845642082ba75918ec7e1de8c4 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Thu, 22 Feb 2024 07:33:44 +0000 Subject: [PATCH 20/36] Restored to original form in official main branch. Added for clarity. 
Updated to account for refactoring. All updates now stable and done. Inside Dev Docker, all test files Passed, all pre-commit checks Passed. --- test/agentchat/contrib/test_content_agent.py | 19 +- test/agentchat/contrib/test_web_surfer.py | 77 +++---- .../contrib/test_web_surfer_selenium.py | 208 ++++++++++++++++++ 3 files changed, 241 insertions(+), 63 deletions(-) create mode 100644 test/agentchat/contrib/test_web_surfer_selenium.py diff --git a/test/agentchat/contrib/test_content_agent.py b/test/agentchat/contrib/test_content_agent.py index 6a5aeec3e5d..2d84e3adac6 100644 --- a/test/agentchat/contrib/test_content_agent.py +++ b/test/agentchat/contrib/test_content_agent.py @@ -35,7 +35,6 @@ def test_content_agent(browser: str) -> None: model = ["gpt-3.5-turbo"] model += [m.replace(".", "") for m in model] - # model = ['dolphin-mistral:7b-v2.6-q8_0'] assert len(llm_config["config_list"]) > 0 # type: ignore[arg-type] # Define the temporary storage location @@ -51,10 +50,10 @@ def test_content_agent(browser: str) -> None: system_message=content_agent_system_msg, llm_config=llm_config, max_consecutive_auto_reply=0, - silent=False, # Below are the arguments specific to the ContentAgent + silent=True, storage_path=temporary_content_storage, - browser_kwargs={"browser": browser}, + browser_config={"browser": browser}, max_depth=0, ) @@ -71,11 +70,7 @@ def test_content_agent(browser: str) -> None: content_agent.register_reply(user_proxy, content_agent.collect_content) # Define the links used during the testing process - links = [ - "https://microsoft.github.io/autogen/docs/Examples", - "https://microsoft.github.io/autogen/docs/Getting-Started", - "https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/", - ] + links = ["https://microsoft.github.io/autogen/docs/Examples"] with Cache.disk(): for link in links: @@ -128,12 +123,10 @@ def test_content_agent(browser: str) -> None: 
os.path.getsize(os.path.join(content_agent.process_history[link]["local_path"], "screenshot.png")) > 0 ), "The file size of screenshot.png was zero" - print() - print(f"All done, feel free to browse the collected content at: {temporary_content_storage}") + # print() + # print(f"All done, feel free to browse the collected content at: {temporary_content_storage}") if __name__ == "__main__": """Runs this file's tests from the command line.""" - - if not skip_oai: - test_content_agent(browser="firefox") + test_content_agent(browser="firefox") diff --git a/test/agentchat/contrib/test_web_surfer.py b/test/agentchat/contrib/test_web_surfer.py index 7cfd30669ee..d5dae0beb1c 100644 --- a/test/agentchat/contrib/test_web_surfer.py +++ b/test/agentchat/contrib/test_web_surfer.py @@ -2,8 +2,8 @@ import sys import re import pytest -from autogen.agentchat import UserProxyAgent -from autogen.oai.openai_utils import filter_config, config_list_from_json +from autogen import UserProxyAgent, config_list_from_json +from autogen.oai.openai_utils import filter_config from autogen.cache import Cache sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) @@ -17,10 +17,9 @@ BING_QUERY = "Microsoft" try: - from autogen.agentchat.contrib.web_surfer import WebSurferAgent, IS_SELENIUM_CAPABLE + from autogen.agentchat.contrib.web_surfer import WebSurferAgent except ImportError: skip_all = True - print("THERE WAS AN ERROR") else: skip_all = False @@ -46,7 +45,7 @@ skip_all, reason="do not run if dependency is not installed", ) -def test_web_surfer(browser_type="text", web_driver=None) -> None: +def test_web_surfer() -> None: with pytest.MonkeyPatch.context() as mp: # we mock the API key so we can register functions (llm_config must be present for this to work) mp.setenv("OPENAI_API_KEY", MOCK_OPEN_AI_API_KEY) @@ -54,7 +53,7 @@ def test_web_surfer(browser_type="text", web_driver=None) -> None: web_surfer = WebSurferAgent( "web_surfer", llm_config={"model": "gpt-4", "config_list": []}, - 
browser_config={"viewport_size": page_size, "type": browser_type, "web_driver": web_driver}, + browser_config={"viewport_size": page_size}, ) # Sneak a peak at the function map, allowing us to call the functions for testing here @@ -70,33 +69,28 @@ def test_web_surfer(browser_type="text", web_driver=None) -> None: total_pages = int(m.group(1)) # type: ignore[union-attr] response = function_map["page_down"]() - if browser_type == "text": - assert ( - f"Viewport position: Showing page 2 of {total_pages}." in response - ) # Assumes the content is longer than one screen + assert ( + f"Viewport position: Showing page 2 of {total_pages}." in response + ) # Assumes the content is longer than one screen response = function_map["page_up"]() - if browser_type == "text": - assert f"Viewport position: Showing page 1 of {total_pages}." in response + assert f"Viewport position: Showing page 1 of {total_pages}." in response # Try to scroll too far back up response = function_map["page_up"]() - if browser_type == "text": - assert f"Viewport position: Showing page 1 of {total_pages}." in response + assert f"Viewport position: Showing page 1 of {total_pages}." in response # Try to scroll too far down for i in range(0, total_pages + 1): response = function_map["page_down"]() - if browser_type == "text": - assert f"Viewport position: Showing page {total_pages} of {total_pages}." in response + assert f"Viewport position: Showing page {total_pages} of {total_pages}." 
in response - if not skip_bing: - # Test web search -- we don't have a key in this case, so we expect it to raise an error (but it means the code path is correct) - with pytest.raises(ValueError, match="Missing Bing API key."): - response = function_map["informational_web_search"](BING_QUERY) + # Test web search -- we don't have a key in this case, so we expect it to raise an error (but it means the code path is correct) + with pytest.raises(ValueError, match="Missing Bing API key."): + response = function_map["informational_web_search"](BING_QUERY) - with pytest.raises(ValueError, match="Missing Bing API key."): - response = function_map["navigational_web_search"](BING_QUERY) + with pytest.raises(ValueError, match="Missing Bing API key."): + response = function_map["navigational_web_search"](BING_QUERY) # Test Q&A and summarization -- we don't have a key so we expect it to fail (but it means the code path is correct) with pytest.raises(IndexError): @@ -110,19 +104,17 @@ def test_web_surfer(browser_type="text", web_driver=None) -> None: skip_oai, reason="do not run if oai is not installed", ) -def test_web_surfer_oai(browser_type="text", web_driver=None) -> None: +def test_web_surfer_oai() -> None: llm_config = {"config_list": config_list, "timeout": 180, "cache_seed": 42} # adding Azure name variations to the model list - model = ["gpt-3.5-turbo", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-16k"] + model = ["gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-16k"] model += [m.replace(".", "") for m in model] summarizer_llm_config = { "config_list": filter_config(config_list, dict(model=model)), # type: ignore[no-untyped-call] "timeout": 180, } - # import ipdb - # ipdb.set_trace() assert len(llm_config["config_list"]) > 0 # type: ignore[arg-type] assert len(summarizer_llm_config["config_list"]) > 0 @@ -132,7 +124,7 @@ def test_web_surfer_oai(browser_type="text", web_driver=None) -> None: "web_surfer", llm_config=llm_config, 
summarizer_llm_config=summarizer_llm_config, - browser_config={"viewport_size": page_size, "type": browser_type, "web_driver": web_driver}, + browser_config={"viewport_size": page_size}, ) user_proxy = UserProxyAgent( @@ -143,24 +135,23 @@ def test_web_surfer_oai(browser_type="text", web_driver=None) -> None: is_termination_msg=lambda x: True, ) - with Cache.disk(): - # Make some requests that should test function calling - user_proxy.initiate_chat(web_surfer, message="Please visit the page 'https://en.wikipedia.org/wiki/Microsoft'") + # Make some requests that should test function calling + user_proxy.initiate_chat(web_surfer, message="Please visit the page 'https://en.wikipedia.org/wiki/Microsoft'") - user_proxy.initiate_chat(web_surfer, message="Please scroll down.") + user_proxy.initiate_chat(web_surfer, message="Please scroll down.") - user_proxy.initiate_chat(web_surfer, message="Please scroll up.") + user_proxy.initiate_chat(web_surfer, message="Please scroll up.") - user_proxy.initiate_chat(web_surfer, message="When was it founded?") + user_proxy.initiate_chat(web_surfer, message="When was it founded?") - user_proxy.initiate_chat(web_surfer, message="What's this page about?") + user_proxy.initiate_chat(web_surfer, message="What's this page about?") @pytest.mark.skipif( skip_bing, reason="do not run if bing api key is not available", ) -def test_web_surfer_bing(browser_type="text", web_driver=None) -> None: +def test_web_surfer_bing() -> None: page_size = 4096 web_surfer = WebSurferAgent( "web_surfer", @@ -172,12 +163,7 @@ def test_web_surfer_bing(browser_type="text", web_driver=None) -> None: } ] }, - browser_config={ - "viewport_size": page_size, - "bing_api_key": BING_API_KEY, - "type": browser_type, - "web_driver": web_driver, - }, + browser_config={"viewport_size": page_size, "bing_api_key": BING_API_KEY}, ) # Sneak a peak at the function map, allowing us to call the functions for testing here @@ -197,15 +183,6 @@ def 
test_web_surfer_bing(browser_type="text", web_driver=None) -> None: if __name__ == "__main__": """Runs this file's tests from the command line.""" - test_web_surfer() test_web_surfer_oai() test_web_surfer_bing() - - if IS_SELENIUM_CAPABLE: # Test the selenium browser if installed - # Todo: automatically determine which is available in order to avoid unnecessary errors - selected_driver = "edge" # can be 'edge', 'firefox', or 'chrome' - - test_web_surfer(browser_type="selenium", web_driver=selected_driver) - test_web_surfer_oai(browser_type="selenium", web_driver=selected_driver) - test_web_surfer_bing(browser_type="selenium", web_driver=selected_driver) diff --git a/test/agentchat/contrib/test_web_surfer_selenium.py b/test/agentchat/contrib/test_web_surfer_selenium.py new file mode 100644 index 00000000000..6e031e497ce --- /dev/null +++ b/test/agentchat/contrib/test_web_surfer_selenium.py @@ -0,0 +1,208 @@ +import os +import sys +import re +import pytest +from autogen.agentchat import UserProxyAgent +from autogen.oai.openai_utils import filter_config, config_list_from_json +from autogen.cache import Cache + +sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) +from conftest import MOCK_OPEN_AI_API_KEY, skip_openai # noqa: E402 + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST # noqa: E402 + +BLOG_POST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" +BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? 
- A Case Study for MATH | AutoGen" +BING_QUERY = "Microsoft" + +try: + from autogen.agentchat.contrib.web_surfer import WebSurferAgent, IS_SELENIUM_CAPABLE +except ImportError: + skip_all = True + print("THERE WAS AN ERROR") +else: + skip_all = False + +if not IS_SELENIUM_CAPABLE: + skip_selenium = True +else: + skip_selenium = False + +try: + from openai import OpenAI +except ImportError: + skip_oai = True +else: + skip_oai = False or skip_openai + +try: + BING_API_KEY = os.environ["BING_API_KEY"] +except KeyError: + skip_bing = True +else: + skip_bing = False + +if not skip_oai: + config_list = config_list_from_json(env_or_file=OAI_CONFIG_LIST, file_location=KEY_LOC) + + +@pytest.mark.skipif( + skip_selenium, + reason="do not run if dependency is not installed", +) +def test_web_surfer(browser_type="text", browser=None) -> None: + with pytest.MonkeyPatch.context() as mp: + # we mock the API key so we can register functions (llm_config must be present for this to work) + mp.setenv("OPENAI_API_KEY", MOCK_OPEN_AI_API_KEY) + page_size = 4096 + web_surfer = WebSurferAgent( + "web_surfer", + llm_config={"model": "gpt-4", "config_list": []}, + browser_config={"viewport_size": page_size, "type": browser_type, "browser": browser}, + ) + + # Sneak a peak at the function map, allowing us to call the functions for testing here + function_map = web_surfer._user_proxy._function_map + + # Test some basic navigations + response = function_map["visit_page"](BLOG_POST_URL) + assert f"Address: {BLOG_POST_URL}".strip() in response + assert f"Title: {BLOG_POST_TITLE}".strip() in response + + # Test scrolling + m = re.search(r"\bViewport position: Showing page 1 of (\d+).", response) + total_pages = int(m.group(1)) # type: ignore[union-attr] + + response = function_map["page_down"]() + if browser_type == "text": + assert ( + f"Viewport position: Showing page 2 of {total_pages}." 
in response + ) # Assumes the content is longer than one screen + + response = function_map["page_up"]() + if browser_type == "text": + assert f"Viewport position: Showing page 1 of {total_pages}." in response + + # Try to scroll too far back up + response = function_map["page_up"]() + if browser_type == "text": + assert f"Viewport position: Showing page 1 of {total_pages}." in response + + # Try to scroll too far down + for i in range(0, total_pages + 1): + response = function_map["page_down"]() + if browser_type == "text": + assert f"Viewport position: Showing page {total_pages} of {total_pages}." in response + + if not skip_bing: + # Test web search -- we don't have a key in this case, so we expect it to raise an error (but it means the code path is correct) + with pytest.raises(ValueError, match="Missing Bing API key."): + response = function_map["informational_web_search"](BING_QUERY) + + with pytest.raises(ValueError, match="Missing Bing API key."): + response = function_map["navigational_web_search"](BING_QUERY) + + # Test Q&A and summarization -- we don't have a key so we expect it to fail (but it means the code path is correct) + with pytest.raises(IndexError): + response = function_map["answer_from_page"]("When was it founded?") + + with pytest.raises(IndexError): + response = function_map["summarize_page"]() + + +@pytest.mark.skipif( + skip_oai, + reason="do not run if oai is not installed", +) +def test_web_surfer_oai(browser_type="text", browser=None) -> None: + llm_config = {"config_list": config_list, "timeout": 180, "cache_seed": 42} + + # adding Azure name variations to the model list + model = ["gpt-3.5-turbo"] + model += [m.replace(".", "") for m in model] + + summarizer_llm_config = { + "config_list": filter_config(config_list, dict(model=model)), # type: ignore[no-untyped-call] + "timeout": 180, + } + + assert len(llm_config["config_list"]) > 0 # type: ignore[arg-type] + assert len(summarizer_llm_config["config_list"]) > 0 + + page_size = 4096 
+ web_surfer = WebSurferAgent( + "web_surfer", + llm_config=llm_config, + summarizer_llm_config=summarizer_llm_config, + browser_config={"viewport_size": page_size, "type": browser_type, "browser": browser}, + ) + + user_proxy = UserProxyAgent( + "user_proxy", + human_input_mode="NEVER", + code_execution_config=False, + default_auto_reply="", + is_termination_msg=lambda x: True, + ) + + with Cache.disk(): + # Make some requests that should test function calling + user_proxy.initiate_chat(web_surfer, message="Please visit the page 'https://en.wikipedia.org/wiki/Microsoft'") + + user_proxy.initiate_chat(web_surfer, message="Please scroll down.") + + user_proxy.initiate_chat(web_surfer, message="Please scroll up.") + + user_proxy.initiate_chat(web_surfer, message="When was it founded?") + + # user_proxy.initiate_chat(web_surfer, message="What's this page about?") + + +@pytest.mark.skipif( + skip_bing, + reason="do not run if bing api key is not available", +) +def test_web_surfer_bing(browser_type="text", browser=None) -> None: + page_size = 4096 + web_surfer = WebSurferAgent( + "web_surfer", + llm_config={ + "config_list": [ + { + "model": "gpt-3.5-turbo-", + "api_key": "sk-PLACEHOLDER_KEY", + } + ] + }, + browser_config={ + "viewport_size": page_size, + "bing_api_key": BING_API_KEY, + "type": browser_type, + "browser": browser, + }, + ) + + # Sneak a peak at the function map, allowing us to call the functions for testing here + function_map = web_surfer._user_proxy._function_map + + # Test informational queries + response = function_map["informational_web_search"](BING_QUERY) + assert f"Address: bing: {BING_QUERY}" in response + assert f"Title: {BING_QUERY} - Search" in response + assert "Viewport position: Showing page 1 of 1." 
in response + assert f"A Bing search for '{BING_QUERY}' found " in response + + # Test informational queries + response = function_map["navigational_web_search"](BING_QUERY + " Wikipedia") + assert "Address: https://en.wikipedia.org/wiki/" in response + + +if __name__ == "__main__": + """Runs this file's tests from the command line.""" + + selected_driver = "edge" # can be 'edge', 'firefox', or 'chrome' + + test_web_surfer(browser_type="selenium", browser=selected_driver) + test_web_surfer_oai(browser_type="selenium", browser=selected_driver) + test_web_surfer_bing(browser_type="selenium", browser=selected_driver) From 8954fef1a35c3f8dcbc556563c5385ce06c97a33 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Thu, 22 Feb 2024 08:56:17 +0000 Subject: [PATCH 21/36] Further cleaned the two test files and confirmed they passed using the dev docker and the pytest library --- test/agentchat/contrib/test_content_agent.py | 5 +- .../contrib/test_web_surfer_selenium.py | 54 ++++++------------- 2 files changed, 18 insertions(+), 41 deletions(-) diff --git a/test/agentchat/contrib/test_content_agent.py b/test/agentchat/contrib/test_content_agent.py index 2d84e3adac6..c855d483388 100644 --- a/test/agentchat/contrib/test_content_agent.py +++ b/test/agentchat/contrib/test_content_agent.py @@ -29,7 +29,8 @@ skip_oai, reason="do not run if oai is not installed", ) -def test_content_agent(browser: str) -> None: +def test_content_agent() -> None: + browser = "edge" llm_config = {"config_list": config_list, "timeout": 180, "cache_seed": 42} model = ["gpt-3.5-turbo"] @@ -129,4 +130,4 @@ def test_content_agent(browser: str) -> None: if __name__ == "__main__": """Runs this file's tests from the command line.""" - test_content_agent(browser="firefox") + test_content_agent() diff --git a/test/agentchat/contrib/test_web_surfer_selenium.py b/test/agentchat/contrib/test_web_surfer_selenium.py index 6e031e497ce..8a7ebc0fbf8 100644 --- 
a/test/agentchat/contrib/test_web_surfer_selenium.py +++ b/test/agentchat/contrib/test_web_surfer_selenium.py @@ -51,15 +51,16 @@ skip_selenium, reason="do not run if dependency is not installed", ) -def test_web_surfer(browser_type="text", browser=None) -> None: +def test_web_surfer() -> None: + browser = "edge" # can be 'edge', 'firefox', or 'chrome' with pytest.MonkeyPatch.context() as mp: # we mock the API key so we can register functions (llm_config must be present for this to work) mp.setenv("OPENAI_API_KEY", MOCK_OPEN_AI_API_KEY) page_size = 4096 web_surfer = WebSurferAgent( "web_surfer", - llm_config={"model": "gpt-4", "config_list": []}, - browser_config={"viewport_size": page_size, "type": browser_type, "browser": browser}, + llm_config={"model": "gpt-3.5-turbo", "config_list": []}, + browser_config={"viewport_size": page_size, "type": "selenium", "browser": browser}, ) # Sneak a peak at the function map, allowing us to call the functions for testing here @@ -70,31 +71,6 @@ def test_web_surfer(browser_type="text", browser=None) -> None: assert f"Address: {BLOG_POST_URL}".strip() in response assert f"Title: {BLOG_POST_TITLE}".strip() in response - # Test scrolling - m = re.search(r"\bViewport position: Showing page 1 of (\d+).", response) - total_pages = int(m.group(1)) # type: ignore[union-attr] - - response = function_map["page_down"]() - if browser_type == "text": - assert ( - f"Viewport position: Showing page 2 of {total_pages}." in response - ) # Assumes the content is longer than one screen - - response = function_map["page_up"]() - if browser_type == "text": - assert f"Viewport position: Showing page 1 of {total_pages}." in response - - # Try to scroll too far back up - response = function_map["page_up"]() - if browser_type == "text": - assert f"Viewport position: Showing page 1 of {total_pages}." 
in response - - # Try to scroll too far down - for i in range(0, total_pages + 1): - response = function_map["page_down"]() - if browser_type == "text": - assert f"Viewport position: Showing page {total_pages} of {total_pages}." in response - if not skip_bing: # Test web search -- we don't have a key in this case, so we expect it to raise an error (but it means the code path is correct) with pytest.raises(ValueError, match="Missing Bing API key."): @@ -115,7 +91,8 @@ def test_web_surfer(browser_type="text", browser=None) -> None: skip_oai, reason="do not run if oai is not installed", ) -def test_web_surfer_oai(browser_type="text", browser=None) -> None: +def test_web_surfer_oai() -> None: + browser = "edge" # can be 'edge', 'firefox', or 'chrome' llm_config = {"config_list": config_list, "timeout": 180, "cache_seed": 42} # adding Azure name variations to the model list @@ -135,7 +112,7 @@ def test_web_surfer_oai(browser_type="text", browser=None) -> None: "web_surfer", llm_config=llm_config, summarizer_llm_config=summarizer_llm_config, - browser_config={"viewport_size": page_size, "type": browser_type, "browser": browser}, + browser_config={"viewport_size": page_size, "type": "selenium", "browser": browser}, ) user_proxy = UserProxyAgent( @@ -163,14 +140,15 @@ def test_web_surfer_oai(browser_type="text", browser=None) -> None: skip_bing, reason="do not run if bing api key is not available", ) -def test_web_surfer_bing(browser_type="text", browser=None) -> None: +def test_web_surfer_bing() -> None: + browser = "edge" # can be 'edge', 'firefox', or 'chrome' page_size = 4096 web_surfer = WebSurferAgent( "web_surfer", llm_config={ "config_list": [ { - "model": "gpt-3.5-turbo-", + "model": "gpt-3.5-turbo", "api_key": "sk-PLACEHOLDER_KEY", } ] @@ -178,7 +156,7 @@ def test_web_surfer_bing(browser_type="text", browser=None) -> None: browser_config={ "viewport_size": page_size, "bing_api_key": BING_API_KEY, - "type": browser_type, + "type": "selenium", "browser": browser, 
}, ) @@ -190,7 +168,7 @@ def test_web_surfer_bing(browser_type="text", browser=None) -> None: response = function_map["informational_web_search"](BING_QUERY) assert f"Address: bing: {BING_QUERY}" in response assert f"Title: {BING_QUERY} - Search" in response - assert "Viewport position: Showing page 1 of 1." in response + # assert "Viewport position: Showing page 1 of 1." in response assert f"A Bing search for '{BING_QUERY}' found " in response # Test informational queries @@ -201,8 +179,6 @@ def test_web_surfer_bing(browser_type="text", browser=None) -> None: if __name__ == "__main__": """Runs this file's tests from the command line.""" - selected_driver = "edge" # can be 'edge', 'firefox', or 'chrome' - - test_web_surfer(browser_type="selenium", browser=selected_driver) - test_web_surfer_oai(browser_type="selenium", browser=selected_driver) - test_web_surfer_bing(browser_type="selenium", browser=selected_driver) + test_web_surfer() + test_web_surfer_oai() + test_web_surfer_bing() From 0c2202c10baef99b0e94cdb5fd8d71d09075bd7e Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Thu, 22 Feb 2024 09:05:08 +0000 Subject: [PATCH 22/36] Update after feedback from GitHub build error, with my apologies for all the emails.
--- test/agentchat/contrib/test_web_surfer_selenium.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/test/agentchat/contrib/test_web_surfer_selenium.py b/test/agentchat/contrib/test_web_surfer_selenium.py index 8a7ebc0fbf8..509a4a0b530 100644 --- a/test/agentchat/contrib/test_web_surfer_selenium.py +++ b/test/agentchat/contrib/test_web_surfer_selenium.py @@ -17,18 +17,13 @@ BING_QUERY = "Microsoft" try: - from autogen.agentchat.contrib.web_surfer import WebSurferAgent, IS_SELENIUM_CAPABLE + from autogen.agentchat.contrib.web_surfer import WebSurferAgent except ImportError: skip_all = True print("THERE WAS AN ERROR") else: skip_all = False -if not IS_SELENIUM_CAPABLE: - skip_selenium = True -else: - skip_selenium = False - try: from openai import OpenAI except ImportError: @@ -48,7 +43,7 @@ @pytest.mark.skipif( - skip_selenium, + skip_all, reason="do not run if dependency is not installed", ) def test_web_surfer() -> None: From 13ba006c73aae9186c05df7a3263a49c90e3e0c1 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Thu, 22 Feb 2024 15:43:44 -0600 Subject: [PATCH 23/36] Update contrib-tests.yml for Selenium This update should allow GitHub to use the WebSurfer extras when testing test_web_surfer_selenium.py.
--- .github/workflows/contrib-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml index 81eaad453d3..5bc13f24b22 100644 --- a/.github/workflows/contrib-tests.yml +++ b/.github/workflows/contrib-tests.yml @@ -218,7 +218,7 @@ jobs: - name: Coverage run: | pip install coverage>=5.3 - coverage run -a -m pytest test/test_browser_utils.py test/agentchat/contrib/test_web_surfer.py --skip-openai + coverage run -a -m pytest test/test_browser_utils.py test/agentchat/contrib/test_web_surfer.py --skip-openai test/agentchat/contrib/test_web_surfer_selenium.py --skip-openai coverage xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 From e1e81f6ab7932f82f6858b5c9ee18e1b12d40da6 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Thu, 22 Feb 2024 16:02:09 -0600 Subject: [PATCH 24/36] Update contrib-openai.yml Adding coverage within the Websurfer workflow for this PR: - test/agentchat/contrib/test_web_surfer_selenium.py - test/agentchat/contrib/test_content_agent.py --- .github/workflows/contrib-openai.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/contrib-openai.yml b/.github/workflows/contrib-openai.yml index 04a22be58ff..ca9f75a7a0e 100644 --- a/.github/workflows/contrib-openai.yml +++ b/.github/workflows/contrib-openai.yml @@ -262,7 +262,7 @@ jobs: OAI_CONFIG_LIST: ${{ secrets.OAI_CONFIG_LIST }} BING_API_KEY: ${{ secrets.BING_API_KEY }} run: | - coverage run -a -m pytest test/agentchat/contrib/test_web_surfer.py + coverage run -a -m pytest test/agentchat/contrib/test_web_surfer.py test/agentchat/contrib/test_web_surfer_selenium.py test/agentchat/contrib/test_web_surfer_selenium.py test/agentchat/contrib/test_content_agent.py coverage xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 From 0b5e73350bc02dc3144446fd5d32926dfcbb024e Mon Sep 17 00:00:00 2001 
From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Thu, 22 Feb 2024 16:12:38 -0600 Subject: [PATCH 25/36] Update contrib-tests.yml Adding `test/agentchat/contrib/test_content_agent.py --skip-openai` under the assumption that all test files must be accounted for or they will rely on the default workflow. This test requires openAI calls, but still needs to be registered on this file to avoid build errors. --- .github/workflows/contrib-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml index 5bc13f24b22..ad04353048e 100644 --- a/.github/workflows/contrib-tests.yml +++ b/.github/workflows/contrib-tests.yml @@ -218,7 +218,7 @@ jobs: - name: Coverage run: | pip install coverage>=5.3 - coverage run -a -m pytest test/test_browser_utils.py test/agentchat/contrib/test_web_surfer.py --skip-openai test/agentchat/contrib/test_web_surfer_selenium.py --skip-openai + coverage run -a -m pytest test/test_browser_utils.py test/agentchat/contrib/test_web_surfer.py --skip-openai test/agentchat/contrib/test_web_surfer_selenium.py --skip-openai test/agentchat/contrib/test_content_agent.py --skip-openai coverage xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 From 9099b57a64a506ced2f04e6dfdae31c3d999d7b5 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Thu, 22 Feb 2024 16:19:22 -0600 Subject: [PATCH 26/36] Update contrib-openai.yml removed duplicate entry for test_web_surfer_selenium.py --- .github/workflows/contrib-openai.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/contrib-openai.yml b/.github/workflows/contrib-openai.yml index ca9f75a7a0e..84bf54d5c72 100644 --- a/.github/workflows/contrib-openai.yml +++ b/.github/workflows/contrib-openai.yml @@ -262,7 +262,7 @@ jobs: OAI_CONFIG_LIST: ${{ secrets.OAI_CONFIG_LIST }} BING_API_KEY: ${{ secrets.BING_API_KEY }} 
run: | - coverage run -a -m pytest test/agentchat/contrib/test_web_surfer.py test/agentchat/contrib/test_web_surfer_selenium.py test/agentchat/contrib/test_web_surfer_selenium.py test/agentchat/contrib/test_content_agent.py + coverage run -a -m pytest test/agentchat/contrib/test_web_surfer.py test/agentchat/contrib/test_web_surfer_selenium.py test/agentchat/contrib/test_content_agent.py coverage xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 From 744345864cc4a375eb4b5aa24823ab25867b03e2 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Thu, 22 Feb 2024 16:23:29 -0600 Subject: [PATCH 27/36] Update setup.py Added the missing `pillow` dependency for graphical based web browsing and downstream tasks --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3d153af4393..b4b756753c3 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ "teachable": ["chromadb"], "lmm": ["replicate", "pillow"], "graph": ["networkx", "matplotlib"], - "websurfer": ["beautifulsoup4", "markdownify", "pdfminer.six", "pathvalidate", "selenium", "arxiv", "requests"], + "websurfer": ["beautifulsoup4", "markdownify", "pdfminer.six", "pathvalidate", "selenium", "arxiv", "requests", "pillow"], "redis": ["redis"], "ipython": ["jupyter-client>=8.6.0", "ipykernel>=6.29.0"], }, From 1b87acdd17c636aab0be6804e53164d045d04ddb Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Thu, 22 Feb 2024 16:46:46 -0600 Subject: [PATCH 28/36] Update test_content_agent.py Moving the ContentAgent import to be conditional on "not skip_oai" in the hope that it helps avoid the `markdownify` import error during build tests. 
--- test/agentchat/contrib/test_content_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/agentchat/contrib/test_content_agent.py b/test/agentchat/contrib/test_content_agent.py index c855d483388..49af9b46176 100644 --- a/test/agentchat/contrib/test_content_agent.py +++ b/test/agentchat/contrib/test_content_agent.py @@ -4,7 +4,6 @@ import tempfile import pytest from autogen.agentchat import UserProxyAgent -from autogen.agentchat.contrib.content_agent import ContentAgent from autogen.oai.openai_utils import filter_config, config_list_from_json from autogen.cache import Cache @@ -23,6 +22,7 @@ if not skip_oai: config_list = config_list_from_json(env_or_file=OAI_CONFIG_LIST, file_location=KEY_LOC) + from autogen.agentchat.contrib.content_agent import ContentAgent @pytest.mark.skipif( From 11b00e55ef106369d63f8fc925c705600b2c4ea9 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Thu, 22 Feb 2024 22:58:32 +0000 Subject: [PATCH 29/36] pre-commit fix on setup.py for readability (websurfer extras) --- setup.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b4b756753c3..415871de89f 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,16 @@ "teachable": ["chromadb"], "lmm": ["replicate", "pillow"], "graph": ["networkx", "matplotlib"], - "websurfer": ["beautifulsoup4", "markdownify", "pdfminer.six", "pathvalidate", "selenium", "arxiv", "requests", "pillow"], + "websurfer": [ + "beautifulsoup4", + "markdownify", + "pdfminer.six", + "pathvalidate", + "selenium", + "arxiv", + "requests", + "pillow", + ], "redis": ["redis"], "ipython": ["jupyter-client>=8.6.0", "ipykernel>=6.29.0"], }, From 66ac7bd3235d432be706242fab56a6d0b9d30033 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Thu, 22 Feb 2024 23:33:42 +0000 Subject: [PATCH 30/36] Final cleanup of unnecessary comments within the PR. 
--- autogen/agentchat/contrib/content_agent.py | 11 -------- autogen/browser_utils.py | 27 +++++-------------- test/agentchat/contrib/test_content_agent.py | 3 --- .../contrib/test_web_surfer_selenium.py | 14 ++++------ 4 files changed, 12 insertions(+), 43 deletions(-) diff --git a/autogen/agentchat/contrib/content_agent.py b/autogen/agentchat/contrib/content_agent.py index 49b6fee7211..2edcaa6d4e2 100644 --- a/autogen/agentchat/contrib/content_agent.py +++ b/autogen/agentchat/contrib/content_agent.py @@ -108,17 +108,6 @@ def __init__( # Define the classifiers self.define_classifiers() - # def classifier_to_collector_reply(self, recipient, messages, sender, config): - # # Inner dialogue reply for boolean classification results - # last_message = messages[-1] if isinstance(messages, list) else messages - # _, rep = recipient.generate_oai_reply([last_message], sender) - # if "false" in rep.lower(): - # rep = "False" - # elif "true" in rep.lower(): - # rep = "True" - # else: - # rep = "False" - # return True, rep def classifier_to_collector_reply( self, recipient: Agent, # Assuming no specific type is enforced; otherwise, replace Any with the specific class type diff --git a/autogen/browser_utils.py b/autogen/browser_utils.py index f06e09564d1..43e4ccc0542 100644 --- a/autogen/browser_utils.py +++ b/autogen/browser_utils.py @@ -595,12 +595,10 @@ def SeleniumBrowser(**kwargs): # Function that loads the web driver Parameters: browser (str): A string specifying which browser to launch. Defaults to 'firefox'. download_dir (str): A path to where downloaded files are stored. Defaults to None + resolution (tuple): A tuple of size 2 for screen resolution in the order of width and height. Defaults to (1920,1080) Returns: webdriver: An instance of the Selenium WebDriver based on the specified browser. User can open a new page by `webdriver.get('https://www.microsoft.com')`. 
- - Raises: - ImportError: If selenium package is not installed, it raises an ImportError with a message suggesting to install it using pip. """ # Load the arguments from kwargs @@ -609,7 +607,7 @@ def SeleniumBrowser(**kwargs): # Function that loads the web driver if not download_dir: download_dir = tempfile.gettempdir() - browser_res = kwargs.get("resolution", (1920, 5200)) + browser_res = kwargs.get("resolution", (1920, 1080)) def get_headless_options(download_dir, options): options.headless = True @@ -647,19 +645,14 @@ def get_headless_options(download_dir, options): profile.set_preference("browser.download.dir", download_dir) profile.set_preference("browser.download.useDownloadDir", True) profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") # MIME type - # profile.set_preference("pdfjs.disabled", True) # Disable PDF viewer profile.set_preference("javascript.enabled", False) - # profile.set_preference("browser.startup.homepage", "https://microsoft.com") profile.update_preferences() options = FirefoxOptions() options.profile = profile options.set_capability("se:downloadsEnabled", True) # Instantiate the Firefox WebDriver with the configured options - driver = webdriver.Firefox( - options=get_headless_options(download_dir, options) - ) # , service_log_path=f'{tempfile.tempdir}/geckodriver.log') - driver.capabilities["moz:processID"] + driver = webdriver.Firefox(options=get_headless_options(download_dir, options)) elif browser.lower() == "chrome": # Instantiate the Chrome Options @@ -759,10 +752,6 @@ def address(self) -> str: @property def viewport(self) -> str: """Return the content of the current viewport.""" - # display_binary_image(self.driver.get_screenshot_as_png()) - # self._page_content # or self.driver.page_source - # Image.open(io.BytesIO(self.driver.get_screenshot_as_png())) - # if self._page_content and len(self._page_content) > 0 return self._page_content @property @@ -787,9 +776,8 @@ def set_address(self, uri_or_path: 
str) -> None: uri_or_path = urljoin(self.address, uri_or_path) self.history[-1] = uri_or_path # Update the address with the fully-qualified path # Navigate to the specified URI or path - self._fetch_page(uri_or_path) # Implemented, but not needed - # self.driver.get(uri_or_path) - # self.driver.implicitly_wait(self.page_load_time) + self._fetch_page(uri_or_path) + self.viewport_current_page = 0 self._split_pages() @@ -863,7 +851,6 @@ def _bing_api_call(self, query: str) -> Dict[str, Dict[str, List[Dict[str, Union request_kwargs["params"]["q"] = query request_kwargs["params"]["textDecorations"] = False request_kwargs["params"]["textFormat"] = "raw" - request_kwargs["stream"] = False # Make the request @@ -920,9 +907,9 @@ def _set_page_content(self, content): # Navigate to the file self.driver.get(f"file://{html_file_path}") - def download(self, uri_or_path: str) -> None: # TODO: update this based on the new method + def download(self, uri_or_path: str) -> None: """Download from a given URI""" - self.driver.get(uri_or_path) + download_using_requests(self.driver, self.downloads_folder, os.path.basename(uri_or_path.rstrip("/"))) def _get_headers(self): def parse_list_to_dict(lst): diff --git a/test/agentchat/contrib/test_content_agent.py b/test/agentchat/contrib/test_content_agent.py index 49af9b46176..f8b5d5722a2 100644 --- a/test/agentchat/contrib/test_content_agent.py +++ b/test/agentchat/contrib/test_content_agent.py @@ -124,9 +124,6 @@ def test_content_agent() -> None: os.path.getsize(os.path.join(content_agent.process_history[link]["local_path"], "screenshot.png")) > 0 ), "The file size of screenshot.png was zero" - # print() - # print(f"All done, feel free to browse the collected content at: {temporary_content_storage}") - if __name__ == "__main__": """Runs this file's tests from the command line.""" diff --git a/test/agentchat/contrib/test_web_surfer_selenium.py b/test/agentchat/contrib/test_web_surfer_selenium.py index 509a4a0b530..1e858466a5b 100644 --- 
a/test/agentchat/contrib/test_web_surfer_selenium.py +++ b/test/agentchat/contrib/test_web_surfer_selenium.py @@ -66,13 +66,12 @@ def test_web_surfer() -> None: assert f"Address: {BLOG_POST_URL}".strip() in response assert f"Title: {BLOG_POST_TITLE}".strip() in response - if not skip_bing: - # Test web search -- we don't have a key in this case, so we expect it to raise an error (but it means the code path is correct) - with pytest.raises(ValueError, match="Missing Bing API key."): - response = function_map["informational_web_search"](BING_QUERY) + # Test web search -- we don't have a key in this case, so we expect it to raise an error (but it means the code path is correct) + with pytest.raises(ValueError, match="Missing Bing API key."): + response = function_map["informational_web_search"](BING_QUERY) - with pytest.raises(ValueError, match="Missing Bing API key."): - response = function_map["navigational_web_search"](BING_QUERY) + with pytest.raises(ValueError, match="Missing Bing API key."): + response = function_map["navigational_web_search"](BING_QUERY) # Test Q&A and summarization -- we don't have a key so we expect it to fail (but it means the code path is correct) with pytest.raises(IndexError): @@ -128,8 +127,6 @@ def test_web_surfer_oai() -> None: user_proxy.initiate_chat(web_surfer, message="When was it founded?") - # user_proxy.initiate_chat(web_surfer, message="What's this page about?") - @pytest.mark.skipif( skip_bing, @@ -163,7 +160,6 @@ def test_web_surfer_bing() -> None: response = function_map["informational_web_search"](BING_QUERY) assert f"Address: bing: {BING_QUERY}" in response assert f"Title: {BING_QUERY} - Search" in response - # assert "Viewport position: Showing page 1 of 1." 
in response assert f"A Bing search for '{BING_QUERY}' found " in response # Test informational queries From 6fbe0b8fc51a81d91b232c81892c357536649770 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Thu, 22 Feb 2024 23:58:41 +0000 Subject: [PATCH 31/36] Restored the original copies of the two unrelated notebooks altered by pre-commit. --- notebook/agentchat_custom_model.ipynb | 1 - notebook/agentchat_lmm_gpt-4v.ipynb | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/notebook/agentchat_custom_model.ipynb b/notebook/agentchat_custom_model.ipynb index 365ff22c038..c29d3808926 100644 --- a/notebook/agentchat_custom_model.ipynb +++ b/notebook/agentchat_custom_model.ipynb @@ -383,7 +383,6 @@ "source": [ "# load model here\n", "\n", - "\n", "config = config_list_custom[0]\n", "device = config.get(\"device\", \"cpu\")\n", "loaded_model = AutoModelForCausalLM.from_pretrained(config[\"model\"]).to(device)\n", diff --git a/notebook/agentchat_lmm_gpt-4v.ipynb b/notebook/agentchat_lmm_gpt-4v.ipynb index b49f4472a50..c56c6e6a1db 100644 --- a/notebook/agentchat_lmm_gpt-4v.ipynb +++ b/notebook/agentchat_lmm_gpt-4v.ipynb @@ -637,6 +637,8 @@ } ], "source": [ + "\n", + "\n", "creator = FigureCreator(name=\"Figure Creator~\", llm_config=gpt4_llm_config)\n", "\n", "user_proxy = autogen.UserProxyAgent(\n", From c06f6fd30fb4e959c6e2a5338f592d5eb52a4cd0 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Sun, 25 Feb 2024 21:02:45 +0000 Subject: [PATCH 32/36] Provided a more descriptive name for the agent responsible for collecting web data. Added '_' to internal functions and docstrings to the web_archiver_agent.py file. 
--- .github/workflows/contrib-openai.yml | 2 +- .github/workflows/contrib-tests.yml | 2 +- ...content_agent.py => web_archiver_agent.py} | 210 +++++++++++++----- ...ynb => agentchat_web_archiver_agent.ipynb} | 48 ++-- ...nt_agent.py => test_web_archiver_agent.py} | 12 +- 5 files changed, 182 insertions(+), 92 deletions(-) rename autogen/agentchat/contrib/{content_agent.py => web_archiver_agent.py} (75%) rename notebook/{agentchat_content_agent.ipynb => agentchat_web_archiver_agent.ipynb} (98%) rename test/agentchat/contrib/{test_content_agent.py => test_web_archiver_agent.py} (93%) diff --git a/.github/workflows/contrib-openai.yml b/.github/workflows/contrib-openai.yml index 84bf54d5c72..f16c75db056 100644 --- a/.github/workflows/contrib-openai.yml +++ b/.github/workflows/contrib-openai.yml @@ -262,7 +262,7 @@ jobs: OAI_CONFIG_LIST: ${{ secrets.OAI_CONFIG_LIST }} BING_API_KEY: ${{ secrets.BING_API_KEY }} run: | - coverage run -a -m pytest test/agentchat/contrib/test_web_surfer.py test/agentchat/contrib/test_web_surfer_selenium.py test/agentchat/contrib/test_content_agent.py + coverage run -a -m pytest test/agentchat/contrib/test_web_surfer.py test/agentchat/contrib/test_web_surfer_selenium.py test/agentchat/contrib/test_web_archiver_agent.py coverage xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml index ad04353048e..6fd7535bf08 100644 --- a/.github/workflows/contrib-tests.yml +++ b/.github/workflows/contrib-tests.yml @@ -218,7 +218,7 @@ jobs: - name: Coverage run: | pip install coverage>=5.3 - coverage run -a -m pytest test/test_browser_utils.py test/agentchat/contrib/test_web_surfer.py --skip-openai test/agentchat/contrib/test_web_surfer_selenium.py --skip-openai test/agentchat/contrib/test_content_agent.py --skip-openai + coverage run -a -m pytest test/test_browser_utils.py test/agentchat/contrib/test_web_surfer.py --skip-openai 
test/agentchat/contrib/test_web_surfer_selenium.py --skip-openai test/agentchat/contrib/test_web_archiver_agent.py --skip-openai coverage xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/autogen/agentchat/contrib/content_agent.py b/autogen/agentchat/contrib/web_archiver_agent.py similarity index 75% rename from autogen/agentchat/contrib/content_agent.py rename to autogen/agentchat/contrib/web_archiver_agent.py index 2edcaa6d4e2..f3e1c9c232c 100644 --- a/autogen/agentchat/contrib/content_agent.py +++ b/autogen/agentchat/contrib/web_archiver_agent.py @@ -13,7 +13,7 @@ fix_missing_protocol, extract_pdf_text, ) -from typing import List, Union, Any, Tuple +from typing import List, Union, Any, Tuple, Dict import os import re import json @@ -37,7 +37,7 @@ pass -class ContentAgent(ConversableAgent): +class WebArchiverAgent(ConversableAgent): def __init__( self, silent: bool = True, @@ -48,14 +48,14 @@ def __init__( **kwargs, ): """ - ContentAgent: Custom LLM agent for collecting online content. + WebArchiverAgent: Custom LLM agent for collecting online content. - The ContentAgent class is a custom Autogen agent that can be used to collect and store online content from different + The WebArchiverAgent class is a custom Autogen agent that can be used to collect and store online content from different web pages. It extends the ConversableAgent class and provides additional functionality for managing a list of - additional links, storing collected content in local directories, and customizing request headers. ContentAgent + additional links, storing collected content in local directories, and customizing request headers. WebArchiverAgent uses deque to manage a list of additional links for further exploration, with a maximum depth limit set by max_depth parameter. The collected content is stored in the specified storage path (storage_path) using local directories. 
- ContentAgent can be customized with request_kwargs and llm_config parameters during instantiation. The default + WebArchiverAgent can be customized with request_kwargs and llm_config parameters during instantiation. The default User-Agent header is used for requests, but it can be overridden by providing a new dictionary of headers under request_kwargs. @@ -106,13 +106,13 @@ def __init__( } # Define the classifiers - self.define_classifiers() + self._define_classifiers() def classifier_to_collector_reply( self, - recipient: Agent, # Assuming no specific type is enforced; otherwise, replace Any with the specific class type + recipient: Agent, messages: Union[List[str], str], - sender: Agent, # Replace Any if the sender has a specific type + sender: Agent, config: dict, ) -> Tuple[bool, str]: """ @@ -149,7 +149,16 @@ def classifier_to_collector_reply( return True, classified_reply - def define_classifiers(self): + def _define_classifiers(self): + """ + Defines the agents used for classification tasks. + + Parameters: + - None + + Returns: + - None + """ # Define the system messages for the classifiers self.metadata_classifier_system_msg = "Help the user identify if the metadata contains potentially useful information such as: author, title, description, a date, etc. Respond True for useful, False for not." self.content_classifier_system_msg = "You are to classify web data as content or other (such as an adversitement) based on the page title. Respond True if it is content, False if not." 
@@ -178,52 +187,39 @@ def define_classifiers(self): ) self.content_classifier.register_reply(self, self.classifier_to_collector_reply, 1) - # Main entry point - def collect_content(self, recipient, messages, sender, config): - content_type, content = "", "" - all_links = [] - for message in messages: - if message.get("role") == "user": - links = re.findall( - r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", - message.get("content"), - ) - for link in links: - all_links.append(link) - - # Process the links provided by the user - for link in all_links: - content_type, content = self.fetch_content(link) - - # Inform self that it has completed the root level of link(s) - self.link_depth = 1 - if self.link_depth <= self.max_depth: - while len(self.additional_links) > 0: - additional_link = self.additional_links.pop() - content_type, content = self.fetch_content(additional_link) - all_links.append(all_links) + def _fetch_content(self, link: str) -> Tuple[str, str]: + """ + Fetches content from a given URL. - self.link_depth = 0 - return ( - True, - f"Success: archived the following links in your chosen location {self.local_dir}/ <-- {', '.join(all_links)}", - ) + Parameters: + - link (str): The URL from which to fetch content. - def fetch_content(self, link): + Returns: + - Tuple[str, str]: Content type and fetched content or error message. + """ # Parse the link parsed_url = urlparse(link) # A special case for arxiv links if "arxiv" in link and IS_ARXIV_CAPABLE: - return "pdf", self.fetch_arxiv_content(parsed_url) + return "pdf", self._fetch_arxiv_content(parsed_url) elif parsed_url.path.endswith(".pdf"): - return "pdf", self.fetch_pdf_content(link) + return "pdf", self._fetch_pdf_content(link) else: - return "html", self.fetch_html_content(link) + return "html", self._fetch_html_content(link) + + def _fetch_html_content(self, link: str) -> str: + """ + Handles the fetching of HTML content from a web page. 
+ + Parameters: + - link (str): The URL of the web page. - def fetch_html_content(self, link): + Returns: + - str: Success (errors are handled at the higher level) + """ # Handle web page content (html) sd = {} # submission_data @@ -266,7 +262,7 @@ def fetch_html_content(self, link): # Store the BS object sd["soup"] = BeautifulSoup(sd["html"], "html.parser") - sd["content"] = self.identify_content(sd["soup"]) + sd["content"] = self._identify_content(sd["soup"]) # Save the content to a text file on disk with open(os.path.join(sd["local_path"], "content.txt"), "w") as f: @@ -277,7 +273,7 @@ def fetch_html_content(self, link): sd["soup"].url = link # Parse and store the Metadata - sd["meta"] = self.identify_metadata(sd["soup"]) # [ data.attrs for data in sd['soup'].find_all("meta") ] + sd["meta"] = self._identify_metadata(sd["soup"]) # [ data.attrs for data in sd['soup'].find_all("meta") ] # Open a file to write the metadata to with open(os.path.join(sd["local_path"], "metadata.txt"), "w") as f: @@ -310,7 +306,7 @@ def fetch_html_content(self, link): self.additional_links.append(link["href"]) # Parse and store the images - self.collect_images(sd["soup"], sd["local_path"]) + self._collect_images(sd["soup"], sd["local_path"]) # Close down the browser self.browser.quit() @@ -320,7 +316,16 @@ def fetch_html_content(self, link): return "success" - def fetch_pdf_content(self, link): + def _fetch_pdf_content(self, link: str) -> str: + """ + Fetches PDF content from a given URL. + + Parameters: + - link (str): The URL from which to fetch the PDF content. + + Returns: + - str: Extracted content or None in a failure event + """ local_pdf_path = os.path.join( self.local_dir, os.path.join(get_file_path_from_url(link, self.domain_path_rules), link.split("/")[-1]) ) @@ -344,7 +349,16 @@ def fetch_pdf_content(self, link): else: return None - def fetch_arxiv_content(self, link): + def _fetch_arxiv_content(self, link: str) -> str: + """ + Fetches content specifically from arXiv URLs. 
+ + Parameters: + - link (str): The arXiv URL from which to fetch content. + + Returns: + - str: Extracted text content + """ # Identify the paper identification arxiv_id = link.path.split("/")[-1] @@ -373,7 +387,16 @@ def fetch_arxiv_content(self, link): return text - def identify_content(self, soup): + def _identify_content(self, soup: BeautifulSoup) -> List[str]: + """ + Identifies the title of the web page from the BeautifulSoup object. + + Parameters: + - soup (BeautifulSoup): BeautifulSoup object of the web page. + + Returns: + - list: A list of all text content classified as relevant + """ # Get the page title for use with the queries page_title = soup.find("head").find("title").string @@ -397,7 +420,17 @@ def identify_content(self, soup): return relevant_content - def identify_metadata(self, soup, verbose=False): + def _identify_metadata(self, soup: BeautifulSoup, verbose: bool = False) -> List[Dict]: + """ + Extracts metadata from the web page using BeautifulSoup. + + Parameters: + - soup (BeautifulSoup): BeautifulSoup object of the web page. + - verbose (bool): Flag to enable verbose logging. + + Returns: + - List[Dict]: A list of dictionaries representing the relevant Metadata extracted from the page. + """ soup.find("head").find("title").string relevant_content = [] for data in soup.find_all("meta"): @@ -430,7 +463,19 @@ def identify_metadata(self, soup, verbose=False): return relevant_content - def collect_images(self, soup, local_path, verbose=False): + def _collect_images(self, soup: BeautifulSoup, local_path: str, verbose: bool = False) -> None: + """ + Collects and saves images from the web page to a local path. + + Parameters: + - soup (BeautifulSoup): BeautifulSoup object of the web page. + - local_path (str): The local directory path where images will be saved. + - verbose (bool): Flag to enable verbose logging. 
+ + Returns: + - None + """ + def get_basename(filename): return os.path.splitext(os.path.basename(filename))[0] @@ -481,3 +526,64 @@ def get_basename(filename): except Exception: print(image_url, img.attrs["src"]) traceback.print_exc() + + # Main entry point + def collect_content( + self, + recipient: Agent, + messages: Union[List[str], str], + sender: Agent, + config: dict, + ) -> Tuple[bool, str]: + """ + Collects and archives content from links found in messages. + + This function scans messages for URLs, fetches content from these URLs, + and archives them to a specified local directory. It supports recursive + link fetching up to a defined depth. + + Parameters: + - recipient (Agent): The agent designated to receive the content. + - messages (Union[List[str], str]): A list of messages or a single message containing URLs. + - sender (Agent): The agent sending the content. + - config (dict): Configuration parameters for content fetching and archiving. + + Returns: + - Tuple[bool, str]: A tuple where the first element is a boolean indicating + success or failure, and the second element is a string message detailing + the outcome or providing error logs in case of failure. 
+ """ + + try: + content_type, content = "", "" + all_links = [] + for message in messages: + if message.get("role") == "user": + links = re.findall( + r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", + message.get("content"), + ) + for link in links: + all_links.append(link) + + # Process the links provided by the user + for link in all_links: + content_type, content = self._fetch_content(link) + + # Inform self that it has completed the root level of link(s) + self.link_depth = 1 + if self.link_depth <= self.max_depth: + while len(self.additional_links) > 0: + additional_link = self.additional_links.pop() + content_type, content = self._fetch_content(additional_link) + all_links.append(all_links) + + self.link_depth = 0 + return ( + True, + f"Success: archived the following links in your chosen location {self.local_dir}/ <-- {', '.join(all_links)}", + ) + except Exception: + # Return traceback information in case of an exception + error_log = traceback.format_exc() + return False, f"Failed to collect content due to an error: {error_log}" diff --git a/notebook/agentchat_content_agent.ipynb b/notebook/agentchat_web_archiver_agent.ipynb similarity index 98% rename from notebook/agentchat_content_agent.ipynb rename to notebook/agentchat_web_archiver_agent.ipynb index 94383714bcc..f61a9e6496e 100644 --- a/notebook/agentchat_content_agent.ipynb +++ b/notebook/agentchat_web_archiver_agent.ipynb @@ -4,10 +4,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Content Collection Tasks with ContentAgent\n", + "# Content Collection Tasks with WebArchiverAgent\n", "\n", "### Why would we want this?\n", - "As part of a larger pipeline, `ContentAgent` accomplishes the task of automatic retrieval and storage of online content for numerous downstream tasks. \n", + "As part of a larger pipeline, `WebArchiverAgent` accomplishes the task of automatic retrieval and storage of online content for numerous downstream tasks. 
\n", "This task is facilitated by a headless Selenium webdriver. \n", "\n", "\n", @@ -15,21 +15,10 @@ "\n", "AutoGen requires `Python>=3.8`. To run this notebook example, please install:\n", "```bash\n", - "pip install pyautogen, selenium, markdownify, pillow, pdfminer.six, beautifulsoup4\n", + "pip install \"pyautogen[websurfer]\"\n", "```" ] }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# %pip install pyautogen selenium markdownify pillow pdfminer.six beautifulsoup4 arxiv\n", - "## or\n", - "# %pip install \"pyautogen[websurfer]\"" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -59,7 +48,7 @@ "import autogen\n", "from PIL import Image\n", "from IPython.core.display_functions import display\n", - "from autogen.agentchat.contrib.content_agent import ContentAgent\n", + "from autogen.agentchat.contrib.web_archiver_agent import WebArchiverAgent\n", "from autogen.agentchat.user_proxy_agent import UserProxyAgent\n", "from autogen.oai import config_list_from_json\n", "from autogen.browser_utils import display_binary_image\n", @@ -166,8 +155,8 @@ "# Specify where our web content will be stored, we'll use this at the end of the notebook\n", "storage_path = \"./content\"\n", "\n", - "content_agent = ContentAgent(\n", - " name=\"ContentAgent\",\n", + "web_archiver_agent = WebArchiverAgent(\n", + " name=\"ContentAgent\", # Choose any name you prefer\n", " system_message=\"You are data collection agent specializing in content on the web.\",\n", " max_depth=0,\n", " llm_config=llm_config,\n", @@ -194,7 +183,7 @@ ")\n", "\n", "# We register our collection function as the default response\n", - "content_agent.register_reply(user_proxy, content_agent.collect_content)" + "web_archiver_agent.register_reply(user_proxy, web_archiver_agent.collect_content)" ] }, { @@ -240,7 +229,7 @@ "source": [ "link = \"https://arxiv.org/abs/2308.08155\"\n", "\n", - "user_proxy.initiate_chat(content_agent, message=link)" + 
"user_proxy.initiate_chat(web_archiver_agent, message=link)" ] }, { @@ -629,7 +618,7 @@ ], "source": [ "link = \"https://microsoft.github.io/autogen/docs/Examples\"\n", - "user_proxy.initiate_chat(content_agent, message=link)" + "user_proxy.initiate_chat(web_archiver_agent, message=link)" ] }, { @@ -1600,7 +1589,7 @@ ], "source": [ "link = \"https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/\"\n", - "user_proxy.initiate_chat(content_agent, message=link)" + "user_proxy.initiate_chat(web_archiver_agent, message=link)" ] }, { @@ -1667,7 +1656,7 @@ "metadata": {}, "outputs": [], "source": [ - "last_page = list(content_agent.process_history.keys())[-1]\n", + "last_page = list(web_archiver_agent.process_history.keys())[-1]\n", "\n", "local_path = f\"{storage_path}/{get_file_path_from_url(last_page)}\"\n", "screenshot_path = os.path.join(local_path, \"screenshot.png\")\n", @@ -1684,10 +1673,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### It seems the bottom was cropped, but using the 'firefox' browser for our agent will trigger the \"full page screenshot\" function, \n", - "And not to worry, everything is also stored to disk in its original form, including the source HTML as it was loaded in the desktop browser.\n", + "It seems the bottom was cropped, but using the 'firefox' browser for our agent will trigger the \"full page screenshot\" function.
\n", + "But not to worry, everything is also stored to disk in its original form, including the source HTML as it was loaded in the desktop browser.\n", "\n", - "#### Below we confirm that our Autogen Agent successfully cataloged all of the content into the file." + "Below we confirm that our Autogen Agent successfully cataloged all of the content into the file." ] }, { @@ -1728,14 +1717,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Thanks for looking at our new ContentAgent:\n", - "### Stay tuned for the larger pipeline known as the Athena Agent!" + "## Thanks for looking at our new WebArchiverAgent:\n", + "### Stay tuned for more updates from Autogen!" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] } ], "metadata": { diff --git a/test/agentchat/contrib/test_content_agent.py b/test/agentchat/contrib/test_web_archiver_agent.py similarity index 93% rename from test/agentchat/contrib/test_content_agent.py rename to test/agentchat/contrib/test_web_archiver_agent.py index f8b5d5722a2..080274ded24 100644 --- a/test/agentchat/contrib/test_content_agent.py +++ b/test/agentchat/contrib/test_web_archiver_agent.py @@ -22,7 +22,7 @@ if not skip_oai: config_list = config_list_from_json(env_or_file=OAI_CONFIG_LIST, file_location=KEY_LOC) - from autogen.agentchat.contrib.content_agent import ContentAgent + from autogen.agentchat.contrib.web_archiver_agent import WebArchiverAgent @pytest.mark.skipif( @@ -42,16 +42,16 @@ def test_content_agent() -> None: temporary_content_storage = os.path.join(tempfile.gettempdir(), "test_content_agent_storage") print(f"Storing temporary test files in {temporary_content_storage}") - # Define the system message for the ContentAgent + # Define the system message for the WebArchiverAgent content_agent_system_msg = "You are data collection agent specializing in content on the web." 
- # Instantiate the ContentAgent - content_agent = ContentAgent( - name="ContentAgent", + # Instantiate the WebArchiverAgent + content_agent = WebArchiverAgent( + name="WebArchiverAgent", system_message=content_agent_system_msg, llm_config=llm_config, max_consecutive_auto_reply=0, - # Below are the arguments specific to the ContentAgent + # Below are the arguments specific to the WebArchiverAgent silent=True, storage_path=temporary_content_storage, browser_config={"browser": browser}, From ef7586ef4ad4555e47971ebcdfa31c54a9870a18 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Tue, 26 Mar 2024 03:00:20 -0500 Subject: [PATCH 33/36] Update web_surfer.py change _set_page_content to set_page_content --- autogen/agentchat/contrib/web_surfer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autogen/agentchat/contrib/web_surfer.py b/autogen/agentchat/contrib/web_surfer.py index 024149d7b44..371fc90d2bc 100644 --- a/autogen/agentchat/contrib/web_surfer.py +++ b/autogen/agentchat/contrib/web_surfer.py @@ -112,7 +112,7 @@ def text_content(self): @property def render_text(self): - self.browser._set_page_content(self.browser.page_content) + self.browser.set_page_content(self.browser.page_content) return self.browser.page_content def close_the_browser(self): From 2be44bcf6c4f1c24a9832da94fdd50fccc22c513 Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Tue, 26 Mar 2024 03:01:10 -0500 Subject: [PATCH 34/36] Update browser_utils.py change _set_page_content to set_page_content --- autogen/browser_utils.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/autogen/browser_utils.py b/autogen/browser_utils.py index 43e4ccc0542..0d0c5704da2 100644 --- a/autogen/browser_utils.py +++ b/autogen/browser_utils.py @@ -94,7 +94,7 @@ def set_address(self, uri_or_path: str) -> None: # Handle special URIs if uri_or_path == 
"about:blank": - self._set_page_content("") + self.set_page_content("") elif uri_or_path.startswith("bing:"): self._bing_search(uri_or_path[len("bing:") :].strip()) else: @@ -116,7 +116,7 @@ def page_content(self) -> str: """Return the full contents of the current page.""" return self._page_content - def _set_page_content(self, content: str) -> None: + def set_page_content(self, content: str) -> None: """Sets the text content of the current page.""" self._page_content = content self._split_pages() @@ -212,7 +212,7 @@ def _bing_search(self, query: str) -> None: ) if len(news_snippets) > 0: content += "\n\n## News Results:\n" + "\n\n".join(news_snippets) - self._set_page_content(content) + self.set_page_content(content) def _fetch_page(self, url: str) -> None: try: @@ -267,7 +267,7 @@ def _fetch_page(self, url: str) -> None: # Remove excessive blank lines self.page_title = soup.title.string - self._set_page_content(re.sub(r"\n{2,}", "\n\n", webpage_text).strip()) + self.set_page_content(re.sub(r"\n{2,}", "\n\n", webpage_text).strip()) elif content_type == "text/plain": # Get the content of the response plain_text = "" @@ -275,11 +275,11 @@ def _fetch_page(self, url: str) -> None: plain_text += chunk self.page_title = None - self._set_page_content(plain_text) + self.set_page_content(plain_text) elif IS_PDF_CAPABLE and content_type == "application/pdf": pdf_data = io.BytesIO(response.raw.read()) self.page_title = None - self._set_page_content(pdfminer.high_level.extract_text(pdf_data)) + self.set_page_content(pdfminer.high_level.extract_text(pdf_data)) elif self.downloads_folder is not None: # Try producing a safe filename fname = None @@ -303,16 +303,16 @@ def _fetch_page(self, url: str) -> None: # Return a page describing what just happened self.page_title = "Download complete." 
- self._set_page_content(f"Downloaded '{url}' to '{download_path}'.") + self.set_page_content(f"Downloaded '{url}' to '{download_path}'.") else: self.page_title = f"Error - Unsupported Content-Type '{content_type}'" - self._set_page_content(self.page_title) + self.set_page_content(self.page_title) else: self.page_title = "Error" - self._set_page_content("Failed to retrieve " + url) + self.set_page_content("Failed to retrieve " + url) except requests.exceptions.RequestException as e: self.page_title = "Error" - self._set_page_content(str(e)) + self.set_page_content(str(e)) def get_scheme(url: Union[str, ParseResult]) -> str: @@ -768,7 +768,7 @@ def set_address(self, uri_or_path: str) -> None: # Handle special URIs if uri_or_path == "about:blank": - self._set_page_content("") + self.set_page_content("") elif uri_or_path.startswith("bing:"): self._bing_search(uri_or_path[len("bing:") :].strip()) else: @@ -890,9 +890,9 @@ def _bing_search(self, query: str) -> None: if len(news_snippets) > 0: content += "\n\n## News Results:\n" + "\n\n".join(news_snippets) - self._set_page_content(content) + self.set_page_content(content) - def _set_page_content(self, content): + def set_page_content(self, content): """Sets the text content of the current page.""" self._page_content = content @@ -980,7 +980,7 @@ def _fetch_page(self, url: str) -> None: # Remove excessive blank lines if self.render_text: self.page_title = soup.title.string - self._set_page_content(webpage_text.strip()) + self.set_page_content(webpage_text.strip()) else: self._page_content = webpage_text @@ -990,7 +990,7 @@ def _fetch_page(self, url: str) -> None: plain_text = soup.prettify() if self.render_text: self.page_title = None - self._set_page_content(plain_text) + self.set_page_content(plain_text) else: self._page_content = plain_text @@ -999,7 +999,7 @@ def _fetch_page(self, url: str) -> None: plain_text = extract_pdf_text(os.path.join(self.downloads_folder, os.path.basename(url))) if self.render_text: 
self.page_title = None - self._set_page_content(plain_text) + self.set_page_content(plain_text) else: self._page_content = plain_text @@ -1025,16 +1025,16 @@ def _fetch_page(self, url: str) -> None: # Return a page describing what just happened if self.render_text: self.page_title = "Download complete." - self._set_page_content(f"Downloaded '{url}' to '{download_path}'.") + self.set_page_content(f"Downloaded '{url}' to '{download_path}'.") else: self._page_content = f"Downloaded '{url}' to '{download_path}'." elif self.render_text: self.page_title = f"Error - Unsupported Content-Type '{content_type}'" - self._set_page_content(self.page_title) + self.set_page_content(self.page_title) else: self._page_content = None except requests.exceptions.RequestException as e: self.page_title = "Error" - self._set_page_content(str(e)) + self.set_page_content(str(e)) From e64ae32a2e880b579d19a6ee75faec37108e1d7f Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Tue, 26 Mar 2024 03:05:14 -0500 Subject: [PATCH 35/36] Update browser_utils.py Removing the exception messages related to Selenium --- autogen/browser_utils.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/autogen/browser_utils.py b/autogen/browser_utils.py index 0d0c5704da2..88a389b0352 100644 --- a/autogen/browser_utils.py +++ b/autogen/browser_utils.py @@ -35,7 +35,6 @@ pass # The Selenium package is used to automate web browser interaction from Python -IS_SELENIUM_CAPABLE = False try: from selenium import webdriver from selenium.common.exceptions import TimeoutException @@ -51,13 +50,8 @@ from selenium.webdriver.chrome.options import Options as ChromeOptions IS_SELENIUM_CAPABLE = True -except ImportError as e: - print(f"The module/package '{e.name}' is not available.") - print("Try running 'pip install selenium'. 
You may need to run 'sudo easy_install selenium' on Linux or MacOS") - print( - "Official selenium installation documentation: https://www.selenium.dev/documentation/webdriver/getting_started/install_library/" - ) - raise e +except: + IS_SELENIUM_CAPABLE = False class SimpleTextBrowser: From 3e7cf1878da9a0f8bf6afce9fca82a1d89faacef Mon Sep 17 00:00:00 2001 From: signalprime <15487280+signalprime@users.noreply.github.com> Date: Tue, 26 Mar 2024 03:06:41 -0500 Subject: [PATCH 36/36] Update contrib-openai.yml Minor fix to permit testing --- .github/workflows/contrib-openai.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/contrib-openai.yml b/.github/workflows/contrib-openai.yml index f16c75db056..37d0545f6b4 100644 --- a/.github/workflows/contrib-openai.yml +++ b/.github/workflows/contrib-openai.yml @@ -4,7 +4,7 @@ name: OpenAI4ContribTests on: - pull_request_target: + pull_request: branches: ['main'] paths: - 'autogen/**'