diff --git a/autogen/agentchat/contrib/functions/__init__.py b/autogen/agentchat/contrib/functions/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/autogen/agentchat/contrib/functions/file_utils.py b/autogen/agentchat/contrib/functions/file_utils.py new file mode 100644 index 00000000000..cfde36e3356 --- /dev/null +++ b/autogen/agentchat/contrib/functions/file_utils.py @@ -0,0 +1,229 @@ +from typing import Optional +from .functions_utils import FunctionWithRequirements + + +@FunctionWithRequirements(python_packages=["pdfminer.six", "requests"]) +def read_text_from_pdf(file_path: str) -> str: + """ + Reads text from a PDF file (local path or URL) and returns it as a string. + + Args: + file_path (str): The path or URL of the PDF file. + + Returns: + str: The extracted text from the PDF file. + """ + import io + import requests + from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter + from pdfminer.converter import TextConverter + from pdfminer.pdfpage import PDFPage + + resource_manager = PDFResourceManager() + text_stream = io.StringIO() + converter = TextConverter(resource_manager, text_stream) + interpreter = PDFPageInterpreter(resource_manager, converter) + + # Read the PDF from a remote URL or a local path. + if file_path.startswith("http://") or file_path.startswith("https://"): + response = requests.get(file_path) + file = io.BytesIO(response.content) + else: + file = open(file_path, "rb") + + for page in PDFPage.get_pages(file): + interpreter.process_page(page) + + text = text_stream.getvalue() + converter.close() + text_stream.close() + file.close() + + return text + + +@FunctionWithRequirements(python_packages=["python-docx"]) +def read_text_from_docx(file_path: str) -> str: + """ + Reads text from a DOCX file and returns it as a string. + + Args: + file_path (str): The path to the DOCX file. + + Returns: + str: The extracted text from the DOCX file. + """ + from docx import Document + + doc = Document(file_path) + paragraphs = [p.text for p in doc.paragraphs] + text = "\n".join(paragraphs) + + return text + + +@FunctionWithRequirements(python_packages=["easyocr"]) +def read_text_from_image(file_path: str) -> str: + """ + Reads text from an image file or URL and returns it as a string. + + Warning: EasyOCR requires torch, which is slow to download and install. + TODO: is there a better way to handle large dependencies? + + Args: + file_path (str): The path to the image file or URL. + + Returns: + str: The extracted text from the image file or URL. + """ + import easyocr + + reader = easyocr.Reader(["en"]) # specify the language(s) + output = reader.readtext(file_path) + # The output is a list of tuples, each containing the coordinates of the text and the text itself. + # We join all the text pieces together to get the final text. + text = " ".join([item[1] for item in output]) + return text + + +@FunctionWithRequirements(python_packages=["python-pptx"]) +def read_text_from_pptx(file_path: str) -> str: + """ + Reads text from a PowerPoint file and returns it as a string. + + Args: + file_path (str): The path to the PowerPoint file. + + Returns: + str: The extracted text from the PowerPoint file.
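+ + Note: only shapes that have a text frame are extracted; pictures and other shapes without text are skipped.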
+ """ + from pptx import Presentation + + presentation = Presentation(file_path) + text = "" + + slide_num = 0 + for slide in presentation.slides: + slide_num += 1 + + text += f"\n\n\n" + + for shape in slide.shapes: + if shape.has_text_frame: + text += shape.text + " " + + text = text.strip() + + return text + + +@FunctionWithRequirements(python_packages=["pandas", "openpyxl"]) +def read_text_from_xlsx(file_path: str) -> str: + """ + Reads text from an Excel file and returns it as a string. + + Args: + file_path (str): The path to the Excel file. + + Returns: + str: The extracted text from the Excel file. + """ + import pandas as pd + + df = pd.read_excel(file_path) + text = df.to_string(index=False) + + return text + + +@FunctionWithRequirements(python_packages=["speechrecognition", "requests", "pydub"]) +def read_text_from_audio(file_path: str) -> str: + """ + Reads text from an audio file or a URL and returns it as a string. + + Args: + file_path (str): The path to the audio file or the URL. + + Returns: + str: The extracted text from the audio file or the URL. + """ + import requests + import speech_recognition as sr + import tempfile + + recognizer = sr.Recognizer() + + if file_path.startswith("http"): + response = requests.get(file_path) + with tempfile.NamedTemporaryFile(delete=True, suffix=".wav") as temp_audio: + temp_audio.write(response.content) + temp_audio.seek(0) + with sr.AudioFile(temp_audio.name) as source: + audio = recognizer.record(source) + else: + with sr.AudioFile(file_path) as source: + audio = recognizer.record(source) + + text = recognizer.recognize_google(audio) + + return text + + +@FunctionWithRequirements(python_packages=["openai"], env_vars=["OPENAI_API_KEY"]) +def caption_image_using_gpt4v(file_path_or_url: str, prompt: Optional[str] = None) -> str: + """ + Generates a caption for an image using the GPT-4 Vision model from OpenAI. + + Args: + file_path_or_url (str): The path to the image file or the URL. + prompt (str, optional): The prompt to use for generating the caption. Defaults to "What’s in this image?". + + + Returns: + str: The caption generated for the image. + """ + import os + import base64 + import openai + from openai import OpenAI + + prompt = prompt or "What’s in this image?" 
+ caption = "" + + client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) + + # check if the file_path_or_url is a local file that exists + if os.path.exists(file_path_or_url): + image_path = file_path_or_url + with open(image_path, "rb") as image_file: + image_base64 = base64.b64encode(image_file.read()).decode("utf-8") + file_path_or_url = f"data:image/jpeg;base64,{image_base64}" + + # check if the file_path_or_url is a URL + if ( + file_path_or_url.startswith("http://") + or file_path_or_url.startswith("https://") + or file_path_or_url.startswith("data:") + ): + image_url = file_path_or_url + response = client.chat.completions.create( + model="gpt-4-vision-preview", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + }, + ], + } + ], + max_tokens=300, + ) + caption = response.choices[0].message.content + else: + raise ValueError("Invalid file path or URL") + return caption diff --git a/autogen/agentchat/contrib/functions/functions_utils.py b/autogen/agentchat/contrib/functions/functions_utils.py new file mode 100644 index 00000000000..bf273956a11 --- /dev/null +++ b/autogen/agentchat/contrib/functions/functions_utils.py @@ -0,0 +1,81 @@ +import inspect +import functools +from typing import Callable, List, Optional +from typing_extensions import Protocol, runtime_checkable + + +@runtime_checkable +class UserDefinedFunction(Protocol): + """ + Represents a user-defined function. + + Attributes: + name (str): The name of the function. + docstring (str): The documentation string of the function. + code (str): The source code of the function. + python_packages (List[str]): The Python packages required by the function. + env_vars (List[str]): The environment variables required by the function.
+ """ + + name: str + docstring: str + code: str + python_packages: List[str] + env_vars: List[str] + + def name(self) -> str: + """Returns the name of the function.""" + return self.name + + def docstring(self) -> str: + """Returns the documentation string of the function.""" + return self.docstring + + def code(self) -> str: + """Returns the code of the function.""" + return self.code + + def python_packages(self) -> List[str]: + """Returns the Python packages required by the function.""" + return self.python_packages + + def env_vars(self) -> List[str]: + """Returns the environment variables required by the function.""" + return self.env_vars + + +class FunctionWithRequirements: + """Decorator class that adds requirements and setup functionality to a function.""" + + def __init__(self, python_packages: Optional[List[str]] = None, env_vars: Optional[List[str]] = None): + self.python_packages = python_packages or [] + self.env_vars = env_vars or [] + + def __call__(self, func: callable) -> UserDefinedFunction: + @functools.wraps(func) + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + + wrapper.name = func.__name__ # The name of the function + wrapper.docstring = func.__doc__ + wrapper.code = inspect.getsource(func) + wrapper.python_packages = self.python_packages + wrapper.env_vars = self.env_vars + return wrapper + + +if __name__ == "__main__": + + @FunctionWithRequirements(python_packages=["youtube_transcript_api==0.6.0"]) + def my_function(): + """This is a sample function""" + print("Hello world") + + print(my_function) + print(my_function.name) + print(my_function.docstring) + print(my_function.code) + print(my_function.python_packages) + print(my_function.env_vars) + + print(isinstance(my_function, UserDefinedFunction)) diff --git a/autogen/agentchat/contrib/functions/youtube_utils.py b/autogen/agentchat/contrib/functions/youtube_utils.py new file mode 100644 index 00000000000..3f37463b3e5 --- /dev/null +++ b/autogen/agentchat/contrib/functions/youtube_utils.py @@ -0,0 +1,29 @@ +from .functions_utils import FunctionWithRequirements + + +@FunctionWithRequirements(python_packages=["youtube_transcript_api==0.6.0"]) +def get_youtube_transcript(youtube_link: str) -> str: + """ + Gets the transcript of a YouTube video. + + Args: + youtube_link (str): The link to the YouTube video. + + Returns: + str: The transcript of the YouTube video. + """ + from youtube_transcript_api import YouTubeTranscriptApi + + # Extract video ID from the YouTube link + video_id = youtube_link.split("v=")[1] + + try: + # Get the transcript for the video + transcript_list = YouTubeTranscriptApi.get_transcript(video_id) + + # Combine all parts of the transcript into a single string + transcript = " ".join([part["text"] for part in transcript_list]) + + return transcript + except Exception as e: + return str(e) diff --git a/notebook/agentchat_function_store.ipynb b/notebook/agentchat_function_store.ipynb new file mode 100644 index 00000000000..fd81b89a198 --- /dev/null +++ b/notebook/agentchat_function_store.ipynb @@ -0,0 +1,412 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Intro to Built-In Functions from `contrib.functions`\n", + "\n", + "\n", + "## Initial Setup\n", + "\n", + "Lets first import the necessary modules and define the agents." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from autogen import AssistantAgent, UserProxyAgent\n", + "from autogen.agentchat.contrib.functions import youtube_utils as yt\n", + "from autogen.agentchat.contrib.functions import file_utils as fu" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Functions and Requirements\n", + "\n", + "A Python function can have many requirements, such as third-party Python packages and secrets.\n", + "\n", + "### Accessing requirements\n", + "You can access a function's requirements via its `python_packages` and `env_vars` attributes" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Name: get_youtube_transcript\n", + "Required python packages: ['youtube_transcript_api==0.6.0']\n" + ] + } + ], + "source": [ + "# get the requirements for the youtube transcript function\n", + "print(\"Name: \", yt.get_youtube_transcript.name)\n", + "print(\"Required python packages: \", yt.get_youtube_transcript.python_packages)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Testing and pre-installing requirements\n", + "\n", + "The requirement metadata also makes it possible to test and pre-install a function's Python packages in your execution environment, and to fail fast with an error if required secrets are missing (a sketch of such a helper appears at the end of this document).\n", + "\n", + "This is especially useful when setup is costly and needs to happen before the function is actually invoked in an end task (in this case, use by the agent)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Simple Example" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "config_list = [\n", + " {\n", + " \"model\": \"gpt-4\",\n", + " \"api_key\": os.environ.get(\"OPENAI_API_KEY\"),\n", + " }\n", + "]\n", + "\n", + "assistant = AssistantAgent(name=\"coder\", llm_config={\"config_list\": config_list, \"cache\": None})\n", + "user = UserProxyAgent(\n", + " name=\"user\",\n", + " code_execution_config={\n", + " \"work_dir\": \"/tmp\",\n", + " },\n", + " human_input_mode=\"NEVER\",\n", + " is_termination_msg=lambda x: x.get(\"content\", \"\") and \"TERMINATE\" in x.get(\"content\", \"\"),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser\u001b[0m (to coder):\n", + "\n", + "Please summarize the video: https://www.youtube.com/watch?v=9iqn1HhFJ6c\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mcoder\u001b[0m (to user):\n", + "\n", + "\u001b[32m***** Suggested tool Call (call_zCo0cdMpn3jfN8jlu7LWGqcT): get_youtube_transcript *****\u001b[0m\n", + "Arguments: \n", + "{\n", + "\"youtube_link\": \"https://www.youtube.com/watch?v=9iqn1HhFJ6c\"\n", + "}\n", + "\u001b[32m***************************************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION get_youtube_transcript...\u001b[0m\n", + "\u001b[33muser\u001b[0m (to coder):\n", + "\n", + "\u001b[33muser\u001b[0m (to coder):\n", + "\n", +
"\u001b[32m***** Response from calling tool \"call_zCo0cdMpn3jfN8jlu7LWGqcT\" *****\u001b[0m\n", + "now ai is a great thing because AI will solve all the problems that we have today it will solve employment it will solve disease it will solve poverty but it will also create new problems the problem of fake news is going to be a million times worse cyber attacks will become much more extreme we will have totally automated AI weapons I think AI has the potential to create infinitely stable dictatorships this morning a warning about the the power of artificial intelligence more than 1,300 tech industry leaders researchers and others are now asking for a pause in the development of artificial intelligence to consider the risks [Music] plain God scientists have been accused of playing God for a while but there is a real sense in which we are creating something very different from anything you've created so far yeah I mean we definitely will be able to create completely autonomous beings with their own goals and it will be very important especially as these beings become much smarter than humans it's going to be important to to have these beings the goals of these beings be aligned with our goals what inspires me I like thinking about the very fundamentals the basics what what can our systems not do that humans definitely do almost approach it philosophically questions like what is learning what is experience what is thinking how does the brain [Music] work I feel that technology is a force of nature I feel like there is a lot of similarity between technology and biological evolution it is very easy to understand how biological evolution works you have mutations you have Natural Selections you keep the good ones the ones survive and just through this process you going to have huge complexity in your [Music] organisms we cannot understand how the human body works because we understand Evolution but we understand the process more or less and I think machine learning is in a similar state right now especially deep learning we have very simple a very simple rule that takes the information from the data and puts it into the model and we just keep repeating this process and as a result of this process the complexity from the data gets transformed transferred into the complexity of the model so the resulting model is really complex and we don't really know exactly how it works you need to investigate but the algorithm that did it is very simple chat GPT maybe you've heard of it if you haven't then get ready you describe it as the first spots of rain before a downpour it's something we just need to be very conscious of because I agree at is a watershed moment Well Chad gbt is being heralded as a game changer and in many ways it is its latest Triumph outscoring people a recent study by Microsoft research concludes that gp4 is an early yet still incomplete artificial general intelligence [Music] system artificial general intelligence AGI a computer system that can do any job or any task that a human does but only better there is some probability the AGI is going to happen pretty soon there's also some probability it's going to take much longer but my position is that the probability that a ja would happen soon is high enough that we should take it [Music] seriously and it's going to be very important to make these very smart capable systems be aligned and act in our best interest the very first agis will be basically very very large data centers packed with specialized neural network processors working in 
parallel compact hot power hungry package consuming like 10 million homes worth of energy you're going to see dramatically more intelligent systems and I think it's highly likely that those systems will have completely astronomical impact on society will humans actually benefit and who will benefit who will [Music] not [Music] the beliefs and desires of the first agis will be extremely important and so it's important to program them correctly I think that if this is not done then the nature of evolution of natural selection favor those systems prioritize their own Survival above all else it's not that it's going to actively hate humans and want to harm them but it is going to be too powerful and I think a good analogy would be the way human humans treat animals it's not we hate animals I think humans love animals and have a lot of affection for them but when the time comes to build a highway between two cities we are not asking the animals for permission we just do it because it's important for us and I think by default that's the kind of relationship that's going to be between us and agis which are truly autonomous and operating on their own behalf [Music] tough many machine learning experts people who are very knowledgeable and very experienced have a lot of skepticism about HL about when it could happen and about whether it could happen at all right now this is something that just not that many people have realized yet that the speed of computers for neural networks for AI are going to become maybe 100,000 times faster in a small number of years if you have an arms race Dynamics between multiple teams trying to build the AGI first they will have less time make sure that the AGI that they will build will care deeply for humans cuz the way I imagine it is that there is an avalanche like there is an avalanche of AGI development imagine it this huge Unstoppable force and I think it's pretty likely the entire surface of the Earth will be covered with solar panels and data Cent given these kinds of concerns it will be important that AGI somehow buil as a cooperation between multiple countries the future is going to be good for the AI regardless would be nice if it were good for humans as well\n", + "\u001b[32m**********************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mcoder\u001b[0m (to user):\n", + "\n", + "The video discusses the potential and risks of Artificial Intelligence (AI). The video opens with a claim that while AI has the potential to solve many current issues such as employment, disease, and poverty, it also presents new problems like exacerbating the spread of fake news and escalating cyber attacks. AI could also potentially facilitate the creation of stable dictatorships.\n", + "\n", + "Over 1300 tech industry leaders and researchers are calling for a pause in the development of AI to consider these risks. The video discusses the goal of creating autonomous beings through AI, entities with their own goals which will need to be aligned with human goals. \n", + "\n", + "The video brings up Chat GPT, an evolution in AI that is considered a game changer, heralding an impending downpour of advancements. The video claims that the initial AI would be large data centers packed with neural network processors. 
These systems may have an astronomical impact on humankind, but it raises the question of who will and won't benefit.\n", + "\n", + "There are concerns about the beliefs and desires of these AIs, whether they will prioritize their own survival above all else, possibly treating humans not out of malice but with the disregard humans often have for animals when creating infrastructure. \n", + "\n", + "A contrasting viewpoint offered in the video is skepticism towards the possibility and timeline of AGI (Artificial General Intelligence) development from other machine learning experts. The speaker continues that with the development of AI expected to be exponentially faster within a few years, it could instigate a race to build AGI first, risking the care and consideration for humans in the process.\n", + "\n", + "The video concludes suggesting that the future development of AGI should be collaborative between multiple countries, hoping for a future that is beneficial for both AI and humans. \n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "assistant.register_for_llm(description=\"Fetch transcript of a youtube video\")(yt.get_youtube_transcript)\n", + "user.register_for_execution()(yt.get_youtube_transcript)\n", + "\n", + "result = user.initiate_chat(\n", + " assistant,\n", + " message=\"Please summarize the video: https://www.youtube.com/watch?v=9iqn1HhFJ6c\",\n", + " summary_method=\"last_msg\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced: Registering Multiple Functions\n", + "\n", + "Let's import multiple functions and use them to accomplish more complex tasks." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# register multiple file reading functions\n", + "for foo in [\n", + " # fu.read_text_from_image,\n", + " fu.read_text_from_pdf,\n", + " fu.read_text_from_docx,\n", + " fu.read_text_from_pptx,\n", + " fu.read_text_from_xlsx,\n", + " fu.read_text_from_audio,\n", + "]:\n", + " foo_desc = foo.__doc__ # get the docstring of the function\n", + " assistant.register_for_llm(description=foo_desc)(foo)\n", + " user.register_for_execution()(foo)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser\u001b[0m (to coder):\n", + "\n", + "Please summarize the contents of the following files: https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/Captioned_image_dataset_examples.jpg/1024px-Captioned_image_dataset_examples.jpg https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf https://github.com/realpython/python-speech-recognition/raw/master/audio_files/harvard.wav\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mcoder\u001b[0m (to user):\n", + "\n", + "\u001b[32m***** Suggested tool Call (call_A82Eb6pF1WcBZR6rBlfTEL0h): read_text_from_pdf *****\u001b[0m\n", + "Arguments: \n", + "{\n", + " \"file_path\": \"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf\"\n", + "}\n", + "\u001b[32m***********************************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION read_text_from_pdf...\u001b[0m\n", +
"\u001b[33muser\u001b[0m (to coder):\n", + "\n", + "\u001b[33muser\u001b[0m (to coder):\n", + "\n", + "\u001b[32m***** Response from calling tool \"call_A82Eb6pF1WcBZR6rBlfTEL0h\" *****\u001b[0m\n", + "Dummy PDF file\f\n", + "\u001b[32m**********************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mcoder\u001b[0m (to user):\n", + "\n", + "\u001b[32m***** Suggested tool Call (call_VsuV254dKEwDWuHyDKAxZbtw): read_text_from_audio *****\u001b[0m\n", + "Arguments: \n", + "{\n", + " \"file_path\": \"https://github.com/realpython/python-speech-recognition/raw/master/audio_files/harvard.wav\"\n", + "}\n", + "\u001b[32m*************************************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION read_text_from_audio...\u001b[0m\n", + "\u001b[33muser\u001b[0m (to coder):\n", + "\n", + "\u001b[33muser\u001b[0m (to coder):\n", + "\n", + "\u001b[32m***** Response from calling tool \"call_VsuV254dKEwDWuHyDKAxZbtw\" *****\u001b[0m\n", + "the stale smell of old beer lingers it takes heat to bring out the odor a cold dip restores health and zest a salt pickle taste fine with ham tacos al pastor are my favorite a zestful food is the hot cross bun\n", + "\u001b[32m**********************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mcoder\u001b[0m (to user):\n", + "\n", + "The content of the file at \"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf\" is \"Dummy PDF file\".\n", + "\n", + "The content of the audio file at \"https://github.com/realpython/python-speech-recognition/raw/master/audio_files/harvard.wav\" is \"the stale smell of old beer lingers it takes heat to bring out the odor a cold dip restores health and zest a salt pickle taste fine with ham tacos al pastor are my favorite a zestful food is the hot cross bun\".\n", + "\n", + "As for the image file at \"https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/Captioned_image_dataset_examples.jpg/1024px-Captioned_image_dataset_examples.jpg\", without an appropriate tool, I'm unable to extract or summarize content from images as it may contain non-textual data.\n", + "\n", + "Remember, summarization involves understanding and condensing the content in a textual format. For images, it might be a description of the image or the text contained in it if any. 
But with the present capabilities, the image content can't be summarized.\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "dummy_jpg = \"https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/Captioned_image_dataset_examples.jpg/1024px-Captioned_image_dataset_examples.jpg\"\n", + "dummy_pdf = \"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf\"\n", + "dummy_wav = \"https://github.com/realpython/python-speech-recognition/raw/master/audio_files/harvard.wav\"\n", + "\n", + "result = user.initiate_chat(\n", + " assistant,\n", + " message=f\"Please summarize the contents of the following files: {' '.join([dummy_jpg, dummy_pdf, dummy_wav])}\",\n", + " summary_method=\"last_msg\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced: Functions that Require Secrets\n", + "\n", + "In this example, we will use a function that expects a secret, e.g., an `OPENAI_API_KEY`, in order to work. One such example is the function that uses GPT-4 Vision to perform image understanding." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33muser\u001b[0m (to coder):\n", + "\n", + "Please summarize the contents of the following image using gpt4v: https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/Captioned_image_dataset_examples.jpg/1024px-Captioned_image_dataset_examples.jpg\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mcoder\u001b[0m (to user):\n", + "\n", + "\u001b[32m***** Suggested tool Call (call_PeT4QTSuC1T8BOzGfCswdCc8): caption_image_using_gpt4v *****\u001b[0m\n", + "Arguments: \n", + "{\n", + " \"file_path_or_url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/Captioned_image_dataset_examples.jpg/1024px-Captioned_image_dataset_examples.jpg\"\n", + "}\n", + "\u001b[32m******************************************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION caption_image_using_gpt4v...\u001b[0m\n", + "\u001b[33muser\u001b[0m (to coder):\n", + "\n", + "\u001b[33muser\u001b[0m (to coder):\n", + "\n", + "\u001b[32m***** Response from calling tool \"call_PeT4QTSuC1T8BOzGfCswdCc8\" *****\u001b[0m\n", + "The image appears to be a collage with three rows of photos, each row labeled with a different category: flowers, birds, and COCO.\n", + "\n", + "In the top row, under \"Oxford-102 Flowers,\" there are four images:\n", + "1. A flower with red petals and yellow tips.\n", + "2. A white flower with yellow anthers in the center.\n", + "3. A purple flower with nearly heart-shaped petals and small green receptacles.\n", + "4. A yellow flower with many layers and a small insect on it.\n", + "\n", + "In the middle row, labeled \"CUB-200 Birds,\" there are four images:\n", + "1. A blue bird with white on its head, blue feathers, and a white belly.\n", + "2. A black bird with a large blunt beak.\n", + "3. A bird with a green wing, brown breast, and a red bill.\n", + "4. A bird with long black wings, a white breast, and a short, black bill.\n", + "\n", + "In the bottom row, under \"COCO,\" there are four images:\n", + "1. Two people holding snowboards, standing in the snow.\n", + "2. 
A plate of rice and beans with soup and juice, presumably a meal.\n", + "3. A big rig truck in a parking lot without a trailer.\n", + "4. A group of men traveling on horses in water, which looks like a river crossing. \n", + "\n", + "Each image is accompanied by a brief description, likely for a dataset designed to teach AI systems how to recognize and describe various objects and scenes.\n", + "\u001b[32m**********************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mcoder\u001b[0m (to user):\n", + "\n", + "The image you provided appears to be a collage with three rows of photos, each row labeled with a different category: flowers, birds, and the COCO database. \n", + "\n", + "In the top row, the \"Oxford-102 Flowers\" category includes four images of exemplary flowers:\n", + "1. A flower with red petals and yellow tips.\n", + "2. A white flower with yellow anthers in the center.\n", + "3. A purple flower with nearly heart-shaped petals and small green receptacles.\n", + "4. A yellow flower with many layers and a small insect on it.\n", + "\n", + "In the middle row, under the \"CUB-200 Birds\" label, there are four images of different birds:\n", + "1. A blue bird with white on its head, blue feathers, and a white belly.\n", + "2. A black bird with a large blunt beak.\n", + "3. A bird with a green wing, brown breast, and a red bill.\n", + "4. A bird with long black wings, a white breast, and a short, black bill.\n", + "\n", + "Finally, in the bottom row, under the \"COCO\" label, there are four generalized images presumably from the COCO database:\n", + "1. Two people holding snowboards, standing in the snow.\n", + "2. A plate of rice and beans with soup and juice, what appears to be a meal.\n", + "3. A big rig truck in a parking lot without a trailer.\n", + "4. 
A group of men traveling on horses in water, which looks like a river crossing.\n", + "\n", + "Each image is accompanied by a brief description, likely designed for the purpose of teaching AI systems how to recognize and describe various objects and scenes.\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "assistant.register_for_llm(description=\"Use gpt4 vision to understand an image\")(fu.caption_image_using_gpt4v)\n", + "user.register_for_execution()(fu.caption_image_using_gpt4v)\n", + "\n", + "result = user.initiate_chat(\n", + " assistant,\n", + " message=f\"Please summarize the contents of the following image using gpt4v: {dummy_jpg}\",\n", + " summary_method=\"last_msg\",\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/test/agentchat/contrib/functions/test_file_utils.py b/test/agentchat/contrib/functions/test_file_utils.py new file mode 100644 index 00000000000..d3c95c4a01d --- /dev/null +++ b/test/agentchat/contrib/functions/test_file_utils.py @@ -0,0 +1,60 @@ +import sys +import os +import pytest +from autogen.agentchat.contrib.functions import file_utils as fu + +sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", "..")) +from conftest import skip_openai # noqa: E402 + +try: + from openai import OpenAI +except ImportError: + skip = True +else: + skip = skip_openai + +TESTDIR = os.path.join(os.path.dirname(__file__), "..", "..", "..", "test_files") + + +def test_read_text_from_pdf(): + text = fu.read_text_from_pdf(os.path.join(TESTDIR, "example.pdf")) + assert isinstance(text, str) + + +def test_read_text_from_docx(): + text = fu.read_text_from_docx(os.path.join(TESTDIR, "example.docx")) + assert isinstance(text, str) + + +def test_read_text_from_image(): + for file in ["example.jpg", "example.png"]: + text = fu.read_text_from_image(os.path.join(TESTDIR, file)) + assert isinstance(text, str) + + +def test_read_text_from_pptx(): + text = fu.read_text_from_pptx(os.path.join(TESTDIR, "example.pptx")) + assert isinstance(text, str) + + +def test_read_text_from_xlsx(): + text = fu.read_text_from_xlsx(os.path.join(TESTDIR, "example.xlsx")) + assert isinstance(text, str) + + +# def test_read_text_from_audio(): +# TODO: Needs work + smaller test file +# for file in ["example.wav"]: +# text = fu.read_text_from_audio(os.path.join(TESTDIR, file)) +# print(text) +# assert isinstance(text, str) + + +@pytest.mark.skipif( + sys.platform in ["darwin", "win32"] or skip, + reason="do not run on macOS or Windows, or openai not installed, or requested to skip", +) +def test_caption_image_using_gpt4v(): + for file in ["example.jpg", "example.png"]: + text = fu.caption_image_using_gpt4v(os.path.join(TESTDIR, file)) + assert isinstance(text, str) diff --git a/test/test_files/example.jpg b/test/test_files/example.jpg new file mode 100644 index 00000000000..3bcabab03de Binary files /dev/null and b/test/test_files/example.jpg differ diff --git a/test/test_files/example.png b/test/test_files/example.png new file mode 100644 index 00000000000..93a38697908 Binary files
/dev/null and b/test/test_files/example.png differ diff --git a/test/test_files/example.pptx b/test/test_files/example.pptx new file mode 100644 index 00000000000..643fafac70b Binary files /dev/null and b/test/test_files/example.pptx differ diff --git a/test/test_files/example.xlsx b/test/test_files/example.xlsx new file mode 100644 index 00000000000..dad08d40d5f Binary files /dev/null and b/test/test_files/example.xlsx differ
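The notebook's "Testing and pre-installing requirements" section references an install step that this patch does not yet include. Below is a minimal sketch of what such a helper could look like, relying only on the `python_packages` and `env_vars` attributes that `FunctionWithRequirements` attaches to a decorated function; the helper name `install_function_requirements` is hypothetical and not part of this diff.

import os
import subprocess
import sys


def install_function_requirements(func) -> None:
    """Hypothetical helper: verify env vars and pip-install a decorated function's packages."""
    # Fail fast if any required secret is missing from the environment.
    missing = [var for var in func.env_vars if var not in os.environ]
    if missing:
        raise RuntimeError(f"Missing required environment variables: {missing}")
    # Install the declared third-party packages into the current interpreter.
    if func.python_packages:
        subprocess.check_call([sys.executable, "-m", "pip", "install", *func.python_packages])


# Usage, e.g. in the execution environment before an agent invokes the function:
# from autogen.agentchat.contrib.functions import youtube_utils as yt
# install_function_requirements(yt.get_youtube_transcript)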