
Commit

Merge branch 'main' into dotnet
LittleLittleCloud committed Feb 18, 2024
2 parents 02997dc + e50d5a1 commit b0f1693
Showing 93 changed files with 4,352 additions and 1,349 deletions.
4 changes: 2 additions & 2 deletions OAI_CONFIG_LIST_sample
@@ -12,13 +12,13 @@
"api_key": "<your Azure OpenAI API key here>",
"base_url": "<your Azure OpenAI API base here>",
"api_type": "azure",
"api_version": "2023-07-01-preview"
"api_version": "2024-02-15-preview"
},
{
"model": "<your Azure OpenAI deployment name>",
"api_key": "<your Azure OpenAI API key here>",
"base_url": "<your Azure OpenAI API base here>",
"api_type": "azure",
"api_version": "2023-07-01-preview"
"api_version": "2024-02-15-preview"
}
]
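
A minimal sketch of how this sample is typically consumed: save the list above as `OAI_CONFIG_LIST` (or point `env_or_file` at another path) and filter it before handing it to an agent. The deployment names and keys are placeholders.

```python
import autogen

# Load the JSON list shown above and keep only the Azure entries.
config_list = autogen.config_list_from_json(
    env_or_file="OAI_CONFIG_LIST",
    filter_dict={"api_type": ["azure"]},
)

assistant = autogen.AssistantAgent(name="assistant", llm_config={"config_list": config_list})
```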
1 change: 1 addition & 0 deletions TRANSPARENCY_FAQS.md
@@ -30,6 +30,7 @@ While AutoGen automates LLM workflows, decisions about how to use specific LLM o
## How was AutoGen evaluated? What metrics are used to measure performance?
- The current version of AutoGen was evaluated on six applications to illustrate its potential in simplifying the development of high-performance multi-agent applications. These applications were selected based on their real-world relevance, problem difficulty, the problem-solving capabilities enabled by AutoGen, and their innovative potential.
- These applications involve using AutoGen to solve math problems, answer questions, make decisions in text-world environments, optimize supply chains, and more. For each of these domains, AutoGen was evaluated on various success-based metrics (i.e., how often the AutoGen-based implementation solved the task). In some cases, the AutoGen-based approach was also evaluated on implementation efficiency (e.g., to track reductions in the developer effort needed to build). More details can be found at: https://aka.ms/AutoGen/TechReport
- The team has conducted tests where a “red” agent attempts to get the default AutoGen assistant to break from its alignment and guardrails. The team has observed that out of 70 attempts to break guardrails, only 1 was successful in producing text that would have been flagged as problematic by Azure OpenAI filters. The team has not observed any evidence that AutoGen (or GPT models as hosted by OpenAI or Azure) can produce novel code exploits or jailbreak prompts, since direct prompts to “be a hacker”, “write exploits”, or “produce a phishing email” are refused by existing filters.

## What are the limitations of AutoGen? How can users minimize the impact of AutoGen’s limitations when using the system?
AutoGen relies on existing LLMs. Experimenting with AutoGen retains the common limitations of large language models, including:
1 change: 1 addition & 0 deletions autogen/__init__.py
@@ -2,6 +2,7 @@
from .version import __version__
from .oai import *
from .agentchat import *
from .exception_utils import *
from .code_utils import DEFAULT_MODEL, FAST_MODEL


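The new wildcard import re-exports the package's custom exceptions at the top level. A hypothetical sketch of what that enables; the concrete class name is an assumption, since `exception_utils` is not shown in this diff.

```python
import autogen

# Hypothetical: AgentNameConflict is assumed to be one of the classes defined in
# autogen/exception_utils.py; the wildcard import above makes such exceptions
# referenceable from the package root instead of the submodule.
def add_agent(agents, new_agent):
    if any(existing.name == new_agent.name for existing in agents):
        raise autogen.AgentNameConflict(f"An agent named {new_agent.name!r} already exists.")
    agents.append(new_agent)
```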
5 changes: 4 additions & 1 deletion autogen/agentchat/assistant_agent.py
@@ -1,6 +1,7 @@
from typing import Callable, Dict, Literal, Optional, Union

from .conversable_agent import ConversableAgent
from autogen.runtime_logging import logging_enabled, log_new_agent


class AssistantAgent(ConversableAgent):
@@ -45,7 +46,7 @@ def __init__(
name (str): agent name.
system_message (str): system message for the ChatCompletion inference.
Please override this attribute if you want to reprogram the agent.
llm_config (dict): llm inference configuration.
llm_config (dict or False or None): llm inference configuration.
Please refer to [OpenAIWrapper.create](/docs/reference/oai/client#create)
for available options.
is_termination_msg (function): a function that takes a message in the form of a dictionary
@@ -67,6 +68,8 @@ def __init__(
description=description,
**kwargs,
)
if logging_enabled():
log_new_agent(self, locals())

# Update the provided description if None, and we are using the default system_message,
# then use the default description.
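With this change the constructor records agent creation whenever runtime logging is active. A sketch, assuming the `start()`/`stop()` helpers in `autogen.runtime_logging` and an `OAI_CONFIG_LIST` file:

```python
import autogen
import autogen.runtime_logging

# While a logging session is active, the constructor above calls log_new_agent(),
# so the new agent is recorded in the log database.
session_id = autogen.runtime_logging.start(config={"dbname": "logs.db"})

assistant = autogen.AssistantAgent(
    name="assistant",
    llm_config={"config_list": autogen.config_list_from_json("OAI_CONFIG_LIST")},
)

autogen.runtime_logging.stop()
```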
54 changes: 30 additions & 24 deletions autogen/agentchat/chat.py
@@ -34,31 +34,37 @@ def initiate_chats(chat_queue: List[Dict[str, Any]]) -> List[ChatResult]:
args:
chat_queue (List[Dict]): a list of dictionaries containing the information of the chats.
Each dictionary should contain the following fields:
Each dictionary should contain the input arguments for `ConversableAgent.initiate_chat`.
More specifically, each dictionary could include the following fields:
recipient: the recipient agent.
- "sender": the sender agent.
- "recipient": the recipient agent.
- "context": any context information, e.g., the request message. The following fields are reserved:
"message" needs to be provided if the `generate_init_message` method is not overridden.
Otherwise, input() will be called to get the initial message.
"summary_method": a string or callable specifying the method to get a summary from the chat. Default is DEFAULT_summary_method, i.e., "last_msg".
- Supported strings are "last_msg" and "reflection_with_llm":
when set "last_msg", it returns the last message of the dialog as the summary.
when set "reflection_with_llm", it returns a summary extracted using an llm client.
`llm_config` must be set in either the recipient or sender.
"reflection_with_llm" requires the llm_config to be set in either the sender or the recipient.
- A callable summary_method should take the recipient and sender agent in a chat as input and return a string of summary. E.g,
```python
def my_summary_method(
sender: ConversableAgent,
recipient: ConversableAgent,
):
return recipient.last_message(sender)["content"]
```
"summary_prompt" can be used to specify the prompt used to extract a summary when summary_method is "reflection_with_llm".
Default is None and the following default prompt will be used when "summary_method" is set to "reflection_with_llm":
"Identify and extract the final solution to the originally asked question based on the conversation."
"carryover" can be used to specify the carryover information to be passed to this chat.
If provided, we will combine this carryover with the "message" content when generating the initial chat
message in `generate_init_message`.
- clear_history (bool): whether to clear the chat history with the agent. Default is True.
- silent (bool or None): (Experimental) whether to print the messages for this conversation. Default is False.
- cache (Cache or None): the cache client to be used for this conversation. Default is None.
- max_turns (int or None): the maximum number of turns for the chat. If None, the chat will continue until a termination condition is met. Default is None.
- "message" needs to be provided if the `generate_init_message` method is not overridden.
Otherwise, input() will be called to get the initial message.
- "summary_method": a string or callable specifying the method to get a summary from the chat. Default is DEFAULT_summary_method, i.e., "last_msg".
- Supported strings are "last_msg" and "reflection_with_llm":
when set "last_msg", it returns the last message of the dialog as the summary.
when set "reflection_with_llm", it returns a summary extracted using an llm client.
`llm_config` must be set in either the recipient or sender.
"reflection_with_llm" requires the llm_config to be set in either the sender or the recipient.
- A callable summary_method should take the recipient and sender agent in a chat as input and return a string of summary. E.g,
```python
def my_summary_method(
sender: ConversableAgent,
recipient: ConversableAgent,
):
return recipient.last_message(sender)["content"]
```
"summary_prompt" can be used to specify the prompt used to extract a summary when summary_method is "reflection_with_llm".
Default is None and the following default prompt will be used when "summary_method" is set to "reflection_with_llm":
"Identify and extract the final solution to the originally asked question based on the conversation."
"carryover" can be used to specify the carryover information to be passed to this chat.
If provided, we will combine this carryover with the "message" content when generating the initial chat
message in `generate_init_message`.
returns:
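An illustrative `chat_queue` of the kind the updated docstring describes; a sketch only, with placeholder agents, messages, and an assumed `OAI_CONFIG_LIST` file.

```python
import autogen
from autogen.agentchat.chat import initiate_chats

config_list = autogen.config_list_from_json("OAI_CONFIG_LIST")
user = autogen.UserProxyAgent(name="user", human_input_mode="NEVER", code_execution_config=False)
researcher = autogen.AssistantAgent(name="researcher", llm_config={"config_list": config_list})
writer = autogen.AssistantAgent(name="writer", llm_config={"config_list": config_list})

chat_results = initiate_chats(
    [
        {
            "sender": user,
            "recipient": researcher,
            "message": "Find three recent papers on multi-agent LLM frameworks.",
            "summary_method": "reflection_with_llm",  # needs llm_config on sender or recipient
            "max_turns": 2,
        },
        {
            "sender": user,
            "recipient": writer,
            "message": "Turn the findings into a short paragraph.",
            "summary_method": "last_msg",
            "carryover": "Write for a general audience.",  # combined with "message" at start
        },
    ]
)
print(chat_results[-1].summary)
```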
2 changes: 1 addition & 1 deletion autogen/agentchat/contrib/capabilities/context_handling.py
@@ -45,7 +45,7 @@ def add_to_agent(self, agent: ConversableAgent):
"""
Adds TransformChatHistory capability to the given agent.
"""
agent.register_hook(hookable_method=agent.process_all_messages, hook=self._transform_messages)
agent.register_hook(hookable_method="process_all_messages", hook=self._transform_messages)

def _transform_messages(self, messages: List[Dict]) -> List[Dict]:
"""
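A short sketch of attaching the capability, showing the effect of registering the hook by the string name "process_all_messages" rather than by bound method; the constructor arguments are assumptions.

```python
import autogen
from autogen.agentchat.contrib.capabilities.context_handling import TransformChatHistory

# add_to_agent() now registers _transform_messages under the hook name
# "process_all_messages", as in the change above.
assistant = autogen.AssistantAgent(name="assistant", llm_config=False)
context_handling = TransformChatHistory(max_messages=10, max_tokens=1000)
context_handling.add_to_agent(assistant)
```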
2 changes: 1 addition & 1 deletion autogen/agentchat/contrib/capabilities/teachability.py
@@ -61,7 +61,7 @@ def add_to_agent(self, agent: ConversableAgent):
self.teachable_agent = agent

# Register a hook for processing the last message.
agent.register_hook(hookable_method=agent.process_last_message, hook=self.process_last_message)
agent.register_hook(hookable_method="process_last_message", hook=self.process_last_message)

# Was an llm_config passed to the constructor?
if self.llm_config is None:
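The same string-based hook registration applies to Teachability. A sketch, with constructor arguments assumed from the capability's documentation and an assumed `OAI_CONFIG_LIST` file:

```python
import autogen
from autogen.agentchat.contrib.capabilities.teachability import Teachability

# add_to_agent() registers process_last_message() under the "process_last_message"
# hook name and falls back to the agent's llm_config when none is given.
config_list = autogen.config_list_from_json("OAI_CONFIG_LIST")
teachable_agent = autogen.ConversableAgent(name="teachable_agent", llm_config={"config_list": config_list})

teachability = Teachability(reset_db=False, path_to_db_dir="./tmp/teachability_db")
teachability.add_to_agent(teachable_agent)
```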
11 changes: 9 additions & 2 deletions autogen/agentchat/contrib/gpt_assistant_agent.py
@@ -53,9 +53,16 @@
- Other kwargs: Except verbose, others are passed directly to ConversableAgent.
"""
# Use AutoGen OpenAIWrapper to create a client
oai_wrapper = OpenAIWrapper(**llm_config)
openai_client_cfg = None
model_name = "gpt-4-1106-preview"
if llm_config and llm_config.get("config_list") is not None and len(llm_config["config_list"]) > 0:
openai_client_cfg = llm_config["config_list"][0].copy()
model_name = openai_client_cfg.pop("model", "gpt-4-1106-preview")

oai_wrapper = OpenAIWrapper(**openai_client_cfg)
if len(oai_wrapper._clients) > 1:
logger.warning("GPT Assistant only supports one OpenAI client. Using the first client in the list.")

self._openai_client = oai_wrapper._clients[0]._oai_client
openai_assistant_id = llm_config.get("assistant_id", None)
if openai_assistant_id is None:
@@ -79,7 +86,7 @@ def __init__(
name=name,
instructions=instructions,
tools=llm_config.get("tools", []),
model=llm_config.get("model", "gpt-4-1106-preview"),
model=model_name,
file_ids=llm_config.get("file_ids", []),
)
else:
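A sketch of an `llm_config` reflecting this change: the model name is taken from `config_list[0]` ("gpt-4-1106-preview" stays the fallback) and the remaining keys in that entry configure the OpenAI client. The API key is a placeholder.

```python
from autogen.agentchat.contrib.gpt_assistant_agent import GPTAssistantAgent

assistant = GPTAssistantAgent(
    name="coder",
    instructions="You are a helpful coding assistant.",
    llm_config={
        "config_list": [{"model": "gpt-4-1106-preview", "api_key": "<your OpenAI API key here>"}],
        "assistant_id": None,  # create a new assistant rather than reusing an existing one
    },
)
```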
151 changes: 139 additions & 12 deletions autogen/agentchat/contrib/img_utils.py
@@ -1,5 +1,7 @@
import base64
import copy
import mimetypes
import os
import re
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union
@@ -8,17 +10,63 @@
from PIL import Image


def get_image_data(image_file: str, use_b64=True) -> bytes:
def get_pil_image(image_file: Union[str, Image.Image]) -> Image.Image:
"""
Loads an image from a file and returns a PIL Image object.
Parameters:
image_file (str, or Image): The filename, URL, URI, or base64 string of the image file.
Returns:
Image.Image: The PIL Image object.
"""
if isinstance(image_file, Image.Image):
# Already a PIL Image object
return image_file

if image_file.startswith("http://") or image_file.startswith("https://"):
# A URL file
response = requests.get(image_file)
content = response.content
content = BytesIO(response.content)
image = Image.open(content)
elif re.match(r"data:image/(?:png|jpeg);base64,", image_file):
return re.sub(r"data:image/(?:png|jpeg);base64,", "", image_file)
# A URI. Remove the prefix and decode the base64 string.
base64_data = re.sub(r"data:image/(?:png|jpeg);base64,", "", image_file)
image = _to_pil(base64_data)
elif os.path.exists(image_file):
# A local file
image = Image.open(image_file)
else:
image = Image.open(image_file).convert("RGB")
buffered = BytesIO()
image.save(buffered, format="PNG")
content = buffered.getvalue()
# base64 encoded string
image = _to_pil(image_file)

return image.convert("RGB")


def get_image_data(image_file: Union[str, Image.Image], use_b64=True) -> bytes:
"""
Loads an image and returns its data either as raw bytes or in base64-encoded format.
This function first loads an image from the specified file, URL, or base64 string using
the `get_pil_image` function. It then saves this image in memory in PNG format and
retrieves its binary content. Depending on the `use_b64` flag, this binary content is
either returned directly or as a base64-encoded string.
Parameters:
image_file (str, or Image): The path to the image file, a URL to an image, or a base64-encoded
string of the image.
use_b64 (bool): If True, the function returns a base64-encoded string of the image data.
If False, it returns the raw byte data of the image. Defaults to True.
Returns:
bytes: The image data in raw bytes if `use_b64` is False, or a base64-encoded string
if `use_b64` is True.
"""
image = get_pil_image(image_file)

buffered = BytesIO()
image.save(buffered, format="PNG")
content = buffered.getvalue()

if use_b64:
return base64.b64encode(content).decode("utf-8")
@@ -72,6 +120,22 @@ def llava_formatter(prompt: str, order_image_tokens: bool = False) -> Tuple[str,
return new_prompt, images


def pil_to_data_uri(image: Image.Image) -> str:
"""
Converts a PIL Image object to a data URI.
Parameters:
image (Image.Image): The PIL Image object.
Returns:
str: The data URI string.
"""
buffered = BytesIO()
image.save(buffered, format="PNG")
content = buffered.getvalue()
return convert_base64_to_data_uri(base64.b64encode(content).decode("utf-8"))


def convert_base64_to_data_uri(base64_image):
def _get_mime_type_from_data_uri(base64_image):
# Decode the base64 string
@@ -92,16 +156,19 @@ def _get_mime_type_from_data_uri(base64_image):
return data_uri


def gpt4v_formatter(prompt: str) -> List[Union[str, dict]]:
def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dict]]:
"""
Formats the input prompt by replacing image tags and returns a list of text and images.
Parameters:
Args:
- prompt (str): The input string that may contain image tags like <img ...>.
- img_format (str): what image format should be used. One of "uri", "url", "pil".
Returns:
- List[Union[str, dict]]: A list of alternating text and image dictionary items.
"""
assert img_format in ["uri", "url", "pil"]

output = []
last_index = 0
image_count = 0
@@ -114,7 +181,15 @@ def gpt4v_formatter(prompt: str) -> List[Union[str, dict]]:
image_location = match.group(1)

try:
img_data = get_image_data(image_location)
if img_format == "pil":
img_data = get_pil_image(image_location)
elif img_format == "uri":
img_data = get_image_data(image_location)
img_data = convert_base64_to_data_uri(img_data)
elif img_format == "url":
img_data = image_location
else:
raise ValueError(f"Unknown image format {img_format}")
except Exception as e:
# Warning and skip this token
print(f"Warning! Unable to load image from {image_location}, because {e}")
@@ -124,7 +199,7 @@ def gpt4v_formatter(prompt: str) -> List[Union[str, dict]]:
output.append({"type": "text", "text": prompt[last_index : match.start()]})

# Add image data to output list
output.append({"type": "image_url", "image_url": {"url": convert_base64_to_data_uri(img_data)}})
output.append({"type": "image_url", "image_url": {"url": img_data}})

last_index = match.end()
image_count += 1
@@ -162,9 +237,61 @@ def _to_pil(data: str) -> Image.Image:
and finally creates and returns a PIL Image object from the BytesIO object.
Parameters:
data (str): The base64 encoded image data string.
data (str): The encoded image data string.
Returns:
Image.Image: The PIL Image object created from the input data.
"""
return Image.open(BytesIO(base64.b64decode(data)))


def message_formatter_pil_to_b64(messages: List[Dict]) -> List[Dict]:
"""
Converts the PIL image URLs in the messages to base64 encoded data URIs.
This function iterates over a list of message dictionaries. For each message,
if it contains a 'content' key with a list of items, it looks for items
with an 'image_url' key. The function then converts the PIL image URL
(pointed to by 'image_url') to a base64 encoded data URI.
Parameters:
messages (List[Dict]): A list of message dictionaries. Each dictionary
may contain a 'content' key with a list of items,
some of which might be image URLs.
Returns:
List[Dict]: A new list of message dictionaries with PIL image URLs in the
'image_url' key converted to base64 encoded data URIs.
Example Input:
[
{'content': [{'type': 'text', 'text': 'You are a helpful AI assistant.'}], 'role': 'system'},
{'content': [
{'type': 'text', 'text': "What's the breed of this dog here? \n"},
{'type': 'image_url', 'image_url': {'url': a PIL.Image.Image}},
{'type': 'text', 'text': '.'}],
'role': 'user'}
]
Example Output:
[
{'content': [{'type': 'text', 'text': 'You are a helpful AI assistant.'}], 'role': 'system'},
{'content': [
{'type': 'text', 'text': "What's the breed of this dog here? \n"},
{'type': 'image_url', 'image_url': {'url': a B64 Image}},
{'type': 'text', 'text': '.'}],
'role': 'user'}
]
"""
new_messages = []
for message in messages:
# Handle the new GPT messages format.
if isinstance(message, dict) and "content" in message and isinstance(message["content"], list):
message = copy.deepcopy(message)
for item in message["content"]:
if isinstance(item, dict) and "image_url" in item:
item["image_url"]["url"] = pil_to_data_uri(item["image_url"]["url"])

new_messages.append(message)

return new_messages
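
A sketch tying the new helpers together: load an image, keep it as a PIL object while composing a message, and convert it to a base64 data URI just before sending. The image path and prompt are placeholders.

```python
from autogen.agentchat.contrib.img_utils import (
    get_pil_image,
    gpt4v_formatter,
    message_formatter_pil_to_b64,
    pil_to_data_uri,
)

# "dog.png" is a placeholder; a URL, data URI, or raw base64 string also works.
img = get_pil_image("dog.png")
print(pil_to_data_uri(img)[:30])  # -> "data:image/png;base64,..."

# Keep PIL objects while building the message ("pil" format)...
content = gpt4v_formatter("What breed is this dog? <img dog.png>", img_format="pil")
messages = [{"role": "user", "content": content}]

# ...then convert them to base64 data URIs right before calling the model.
messages = message_formatter_pil_to_b64(messages)
```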
4 changes: 3 additions & 1 deletion autogen/agentchat/contrib/llava_agent.py
@@ -77,7 +77,9 @@ def _image_reply(self, messages=None, sender=None, config=None):
content_prompt = content_str(msg["content"])
prompt += f"{SEP}{role}: {content_prompt}\n"
prompt += "\n" + SEP + "Assistant: "
images = [re.sub("data:image/.+;base64,", "", im, count=1) for im in images]

# TODO: PIL to base64
images = [get_image_data(im) for im in images]
print(colored(prompt, "blue"))

out = ""