Large Multimodal Models in AgentChat #554

Merged · 18 commits · Nov 6, 2023
Changes from 3 commits
64 changes: 64 additions & 0 deletions .github/workflows/contrib-lmm.yml
@@ -0,0 +1,64 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: ContribTests

on:
pull_request:
branches: ['main', 'dev/v0.2']
paths:
- 'autogen/img_utils.py'
- 'autogen/agentchat/multimodal_conversable_agent.py'
- 'autogen/agentchat/contrib/llava_agent.py'
- 'test/test_img_utils.py'
- 'test/agentchat/test_lmm.py'
- 'test/agentchat/contrib/test_llava.py'
      - '.github/workflows/contrib-lmm.yml'
- 'setup.py'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
build:

runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-2019]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install packages and dependencies for all tests
run: |
python -m pip install --upgrade pip wheel
pip install pytest
      - name: Install qdrant_client when python-version is 3.8 or 3.10
if: matrix.python-version == '3.10' || matrix.python-version == '3.8'
run: |
pip install qdrant_client[fastembed]
- name: Install packages and dependencies for LMM
run: |
pip install -e .[lmm]
pip uninstall -y openai
- name: Test LMM and LLaVA
run: |
pytest test/test_img_utils.py test/agentchat/test_lmm.py test/agentchat/contrib/test_llava.py
- name: Coverage
if: matrix.python-version == '3.10'
run: |
          pip install "coverage>=5.3"
coverage run -a -m pytest test/test_img_utils.py test/agentchat/test_lmm.py test/agentchat/contrib/test_llava.py
coverage xml
- name: Upload coverage to Codecov
if: matrix.python-version == '3.10'
uses: codecov/codecov-action@v3
with:
file: ./coverage.xml
flags: unittests
6 changes: 4 additions & 2 deletions autogen/agentchat/__init__.py
@@ -1,8 +1,9 @@
from .agent import Agent
from .conversable_agent import ConversableAgent
from .assistant_agent import AssistantAgent
from .user_proxy_agent import UserProxyAgent
from .conversable_agent import ConversableAgent
from .groupchat import GroupChat, GroupChatManager
from .multimodal_conversable_agent import MultimodalConversableAgent
from .user_proxy_agent import UserProxyAgent

__all__ = [
"Agent",
@@ -11,4 +12,5 @@
"UserProxyAgent",
"GroupChat",
"GroupChatManager",
"MultimodalConversableAgent",
]
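
With this change, the new agent is exported from `autogen.agentchat`, so it can be imported alongside the existing agents, e.g.:

from autogen.agentchat import AssistantAgent, MultimodalConversableAgent, UserProxyAgent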
175 changes: 175 additions & 0 deletions autogen/agentchat/contrib/llava_agent.py
@@ -0,0 +1,175 @@
import json
import logging
import os
from typing import Any, Dict, List, Optional, Tuple, Union

import replicate
import requests

from autogen.agentchat.agent import Agent
from autogen.agentchat.multimodal_conversable_agent import MultimodalConversableAgent
from autogen.img_utils import get_image_data, lmm_formater

try:
from termcolor import colored
except ImportError:

def colored(x, *args, **kwargs):
return x


logger = logging.getLogger(__name__)

# we will override the following variables later.
SEP = "###"

DEFAULT_LLAVA_SYS_MSG = "You are an AI agent and you can view images."


class LLaVAAgent(MultimodalConversableAgent):
def __init__(
self,
name: str,
        system_message: Optional[Union[str, List]] = DEFAULT_LLAVA_SYS_MSG,
*args,
**kwargs,
):
"""
Args:
name (str): agent name.
system_message (str): system message for the ChatCompletion inference.
Please override this attribute if you want to reprogram the agent.
**kwargs (dict): Please refer to other kwargs in
[ConversableAgent](conversable_agent#__init__).
"""
super().__init__(
name,
system_message=system_message,
*args,
**kwargs,
)

assert self.llm_config is not None, "llm_config must be provided."
self.register_reply([Agent, None], reply_func=LLaVAAgent._image_reply, position=0)

def _image_reply(self, messages=None, sender=None, config=None):
# Note: we did not use "llm_config" yet.
# TODO 1: make the LLaVA API design compatible with llm_config

if all((messages is None, sender is None)):
error_msg = f"Either {messages=} or {sender=} must be provided."
logger.error(error_msg)
raise AssertionError(error_msg)

if messages is None:
messages = self._oai_messages[sender]

# The formats for LLaVA and GPT are different. So, we manually handle them here.
# TODO: format the images from the history accordingly.
images = []
prompt = self._content_str(self.system_message) + "\n"
for msg in messages:
role = "Human" if msg["role"] == "user" else "Assistant"
images += [d["image"] for d in msg["content"] if isinstance(d, dict)]
content_prompt = self._content_str(msg["content"])
prompt += f"{SEP}{role}: {content_prompt}\n"
prompt += "\n" + SEP + "Assistant: "
print(colored(prompt, "blue"))

out = ""
retry = 10
while len(out) == 0 and retry > 0:
# image names will be inferred automatically from llava_call
out = llava_call_binary(
prompt=prompt,
images=images,
config_list=self.llm_config["config_list"],
temperature=self.llm_config.get("temperature", 0.5),
max_new_tokens=self.llm_config.get("max_new_tokens", 2000),
)
retry -= 1

assert out != "", "Empty response from LLaVA."

return True, out


def _llava_call_binary_with_config(
prompt: str, images: list, config: dict, max_new_tokens: int = 1000, temperature: float = 0.5, seed: int = 1
):
if config["base_url"].find("0.0.0.0") >= 0 or config["base_url"].find("localhost") >= 0:
llava_mode = "local"
else:
llava_mode = "remote"

if llava_mode == "local":
headers = {"User-Agent": "LLaVA Client"}
pload = {
"model": config["model"],
"prompt": prompt,
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"stop": SEP,
"images": images,
}

response = requests.post(
config["base_url"].rstrip("/") + "/worker_generate_stream", headers=headers, json=pload, stream=False
)

        output = ""  # ensure `output` is defined even if the stream yields no chunks
        for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
if chunk:
data = json.loads(chunk.decode("utf-8"))
output = data["text"].split(SEP)[-1]
elif llava_mode == "remote":
        # The Replicate version of the model only supports one image for now.
img = "data:image/jpeg;base64," + images[0]
response = replicate.run(
config["base_url"], input={"image": img, "prompt": prompt.replace("<image>", " "), "seed": seed}
)
# The yorickvp/llava-13b model can stream output as it's running.
# The predict method returns an iterator, and you can iterate over that output.
output = ""
for item in response:
# https://replicate.com/yorickvp/llava-13b/versions/2facb4a474a0462c15041b78b1ad70952ea46b5ec6ad29583c0b29dbd4249591/api#output-schema
output += item

    # Remove the echoed prompt and surrounding whitespace.
    output = output.replace(prompt, "").strip()
return output


def llava_call_binary(
prompt: str, images: list, config_list: list, max_new_tokens: int = 1000, temperature: float = 0.5, seed: int = 1
):
# TODO 1: add caching around the LLaVA call to save compute and cost
# TODO 2: add `seed` to ensure reproducibility. The seed is not working now.

for config in config_list:
try:
return _llava_call_binary_with_config(prompt, images, config, max_new_tokens, temperature, seed)
except Exception as e:
print(f"Error: {e}")
continue


def llava_call(prompt: str, llm_config: dict) -> str:
"""
Makes a call to the LLaVA service to generate text based on a given prompt
"""

prompt, images = lmm_formater(prompt, order_image_tokens=False)

for im in images:
if len(im) == 0:
raise RuntimeError("An image is empty!")

return llava_call_binary(
prompt,
images,
config_list=llm_config["config_list"],
max_new_tokens=llm_config.get("max_new_tokens", 2000),
temperature=llm_config.get("temperature", 0.5),
seed=llm_config.get("seed", None),
)
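
For reviewers who want to exercise the new module, here is a minimal usage sketch (not part of the diff). It assumes a locally served LLaVA worker reachable at the placeholder `base_url`, and that images are embedded in prompts with the `<img ...>` tag syntax handled by `lmm_formater`; the model name and image URL are illustrative only.

# Usage sketch for the new llava_agent module (assumptions: a local LLaVA worker is
# serving at base_url, and <img ...> tags are the syntax parsed by lmm_formater).
from autogen.agentchat.contrib.llava_agent import LLaVAAgent, llava_call

llava_config_list = [
    {
        "model": "llava-v1.5-13b",           # placeholder model name
        "base_url": "http://0.0.0.0:10000",  # placeholder local worker endpoint
    }
]

# Direct call without an agent:
answer = llava_call(
    "Describe this image in one sentence: <img https://example.com/photo.jpg>",
    llm_config={"config_list": llava_config_list, "temperature": 0.5, "max_new_tokens": 300},
)
print(answer)

# Or wrap the same config in the new agent class:
image_agent = LLaVAAgent(
    name="image-explainer",
    llm_config={"config_list": llava_config_list, "temperature": 0.5},
    max_consecutive_auto_reply=1,
)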
118 changes: 118 additions & 0 deletions autogen/agentchat/multimodal_conversable_agent.py
@@ -0,0 +1,118 @@
from typing import Callable, Dict, List, Optional, Tuple, Union

from autogen import oai
from autogen.img_utils import gpt4v_formatter

from .agent import Agent
from .conversable_agent import ConversableAgent

try:
from termcolor import colored
except ImportError:

def colored(x, *args, **kwargs):
return x


DEFAULT_LMM_SYS_MSG = """You are a helpful AI assistant.
You can also view images, where the "<image i>" represents the i-th image you received."""


class MultimodalConversableAgent(ConversableAgent):
def __init__(
self,
name: str,
        system_message: Optional[Union[str, List]] = DEFAULT_LMM_SYS_MSG,
is_termination_msg=None,
*args,
**kwargs,
):
"""
Args:
name (str): agent name.
system_message (str): system message for the ChatCompletion inference.
Please override this attribute if you want to reprogram the agent.
**kwargs (dict): Please refer to other kwargs in
[ConversableAgent](conversable_agent#__init__).
"""
super().__init__(
name,
system_message,
is_termination_msg=is_termination_msg,
*args,
**kwargs,
)

self.update_system_message(system_message)
self._is_termination_msg = (
is_termination_msg if is_termination_msg is not None else (lambda x: x.get("content")[-1] == "TERMINATE")
)

@property
def system_message(self) -> List:
"""Return the system message."""
return self._oai_system_message[0]["content"]

def update_system_message(self, system_message: Union[Dict, List, str]):
"""Update the system message.

Args:
system_message (str): system message for the ChatCompletion inference.
"""
self._oai_system_message[0]["content"] = self._message_to_dict(system_message)["content"]
self._oai_system_message[0]["role"] = "system"

@staticmethod
def _message_to_dict(message: Union[Dict, List, str]):
"""Convert a message to a dictionary.

        The message can be a string, a list of content items, or a dictionary. A string is parsed into a content list; a list is put in the "content" field of the new dictionary.
"""
if isinstance(message, str):
return {"content": gpt4v_formatter(message)}
if isinstance(message, list):
return {"content": message}
else:
return message

def _content_str(self, content: List) -> str:
rst = ""
for item in content:
if isinstance(item, str):
rst += item
else:
assert isinstance(item, dict) and "image" in item, "Wrong content format."
rst += "<image>"
return rst

def _print_received_message(self, message: Union[Dict, str], sender: Agent):
# print the message received
print(colored(sender.name, "yellow"), "(to", f"{self.name}):\n", flush=True)
if message.get("role") == "function":
func_print = f"***** Response from calling function \"{message['name']}\" *****"
print(colored(func_print, "green"), flush=True)
print(self._content_str(message["content"]), flush=True)
print(colored("*" * len(func_print), "green"), flush=True)
else:
content = message.get("content")
if content is not None:
if "context" in message:
content = oai.ChatCompletion.instantiate(
content,
message["context"],
self.llm_config and self.llm_config.get("allow_format_str_template", False),
)
print(self._content_str(content), flush=True)
if "function_call" in message:
func_print = f"***** Suggested function Call: {message['function_call'].get('name', '(No function name found)')} *****"
print(colored(func_print, "green"), flush=True)
print(
"Arguments: \n",
message["function_call"].get("arguments", "(No arguments found)"),
flush=True,
sep="",
)
print(colored("*" * len(func_print), "green"), flush=True)
print("\n", "-" * 80, flush=True, sep="")

# TODO: we may want to update `generate_code_execution_reply` or `extract_code` for the "content" type change.
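
A matching end-to-end sketch for the GPT-4V-style agent (not part of the diff): it assumes an OAI_CONFIG_LIST entry for a vision-capable model and the same `<img ...>` tag convention expanded by `gpt4v_formatter`; the image URL is a placeholder.

# Conversation sketch with the new MultimodalConversableAgent (assumptions: OAI_CONFIG_LIST
# contains a vision-capable model, and <img ...> tags are expanded by gpt4v_formatter).
import autogen
from autogen.agentchat import MultimodalConversableAgent, UserProxyAgent

config_list = autogen.config_list_from_json("OAI_CONFIG_LIST")  # placeholder config source

image_agent = MultimodalConversableAgent(
    name="image-explainer",
    llm_config={"config_list": config_list, "temperature": 0.5},
    max_consecutive_auto_reply=1,
)

user_proxy = UserProxyAgent(
    name="user",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=0,
    code_execution_config=False,
)

user_proxy.initiate_chat(
    image_agent,
    message="What is shown in this image? <img https://example.com/photo.jpg>",
)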