microsoft · thinkall · Dec 13, 2023 · Nov 10, 2023 · Nov 10, 2023 · Nov 10, 2023
diff --git a/autogen/__init__.py b/autogen/__init__.py
@@ -4,7 +4,6 @@
 from .agentchat import *
 from .code_utils import DEFAULT_MODEL, FAST_MODEL
 
-
 # Set the root logger.
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
diff --git a/autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py b/autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py
@@ -1,7 +1,7 @@
 from typing import Callable, Dict, List, Optional
 
 from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
-from autogen.retrieve_utils import get_files_from_dir, split_files_to_chunks, TEXT_FORMATS
+from autogen.agentchat.contrib.retriever.retrieve_utils import get_files_from_dir, split_files_to_chunks, TEXT_FORMATS
 import logging
 
 logger = logging.getLogger(__name__)

diff --git a/autogen/agentchat/contrib/retrieve_user_proxy_agent.py b/autogen/agentchat/contrib/retrieve_user_proxy_agent.py
@@ -1,14 +1,12 @@
 import re
 
-try:
-    import chromadb
-except ImportError:
-    raise ImportError("Please install dependencies first. `pip install pyautogen[retrievechat]`")
 from autogen.agentchat.agent import Agent
 from autogen.agentchat import UserProxyAgent
-from autogen.retrieve_utils import create_vector_db_from_dir, query_vector_db, TEXT_FORMATS
+from autogen.agentchat.contrib.retriever.retrieve_utils import TEXT_FORMATS
 from autogen.token_count_utils import count_token
 from autogen.code_utils import extract_code
+from autogen.agentchat.contrib.retriever import get_retriever
+
 from autogen import logger
 
 from typing import Callable, Dict, Optional, Union, List, Tuple, Any
@@ -96,10 +94,9 @@ def __init__(
                 To use default config, set to None. Otherwise, set to a dictionary with the following keys:
                 - task (Optional, str): the task of the retrieve chat. Possible values are "code", "qa" and "default". System
                     prompt will be different for different tasks. The default value is `default`, which supports both code and qa.
-                - client (Optional, chromadb.Client): the chromadb client. If key not provided, a default client `chromadb.Client()`
-                    will be used. If you want to use other vector db, extend this class and override the `retrieve_docs` function.
-                - docs_path (Optional, Union[str, List[str]]): the path to the docs directory. It can also be the path to a single file,
-                    the url to a single file or a list of directories, files and urls. Default is None, which works only if the collection is already created.
+                - client (Optional, Any): the vectordb client/connection. If key not provided, the Retriever class should handle it.
+                - docs_path (Optional, str): the path to the docs directory. It can also be the path to a single file,
+                    or the url to a single file. Default is None, which works only if the collection is already created.
                 - collection_name (Optional, str): the name of the collection.
                     If key not provided, a default name `autogen-docs` will be used.
                 - model (Optional, str): the model to use for the retrieve chat.
@@ -123,8 +120,11 @@ def __init__(
                 - customized_answer_prefix (Optional, str): the customized answer prefix for the retrieve chat. Default is "".
                     If not "" and the customized_answer_prefix is not in the answer, `Update Context` will be triggered.
                 - update_context (Optional, bool): if False, will not apply `Update Context` for interactive retrieval. Default is True.
-                - get_or_create (Optional, bool): if True, will create/return a collection for the retrieve chat. This is the same as that used in chromadb.
-                    Default is False. Will raise ValueError if the collection already exists and get_or_create is False. Will be set to True if docs_path is None.
+                - db_mode (Optional, str): the mode to create the vector db. Possible values are "get", "recreate", "create". Default is "recreate" to
+                    keep the workflow less error-prone. If "get", will try to get an existing collection. If "recreate", will recreate a collection
+                    if the collection already exists. If "create", will create a collection if the collection doesn't exist.
+                - get_or_create (Optional, bool): [Deprecated] if True, will create/recreate a collection for the retrieve chat.
+                    This is the same as that used in retriever. Default is False. Will be set to False if docs_path is None.
                 - custom_token_count_function (Optional, Callable): a custom function to count the number of tokens in a string.
                     The function should take (text:str, model:str) as input and return the token_count(int). the retrieve_config["model"] will be passed in the function.
                     Default is autogen.token_count_utils.count_token that uses tiktoken, which may not be accurate for non-OpenAI models.
@@ -136,7 +136,7 @@ def __init__(
             **kwargs (dict): other kwargs in [UserProxyAgent](../user_proxy_agent#__init__).
 
         Example of overriding retrieve_docs:
-        If you have set up a customized vector db, and it's not compatible with chromadb, you can easily plug in it with below code.
+        If you have set up a customized vector db, and it's not compatible with retriever, you can easily plug in it with below code.
         ```python
         class MyRetrieveUserProxyAgent(RetrieveUserProxyAgent):
             def query_vector_db(
@@ -166,10 +166,12 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
             human_input_mode=human_input_mode,
             **kwargs,
         )
-
+        self.retriever = None
         self._retrieve_config = {} if retrieve_config is None else retrieve_config
+        self._retriever_type = self._retrieve_config.get("retriever_type")
+        self._retriever_path = self._retrieve_config.get("retriever_path", "~/autogen")
         self._task = self._retrieve_config.get("task", "default")
-        self._client = self._retrieve_config.get("client", chromadb.Client())
+        self._client = self._retrieve_config.get("client", None)
         self._docs_path = self._retrieve_config.get("docs_path", None)
         self._collection_name = self._retrieve_config.get("collection_name", "autogen-docs")
         if "docs_path" not in self._retrieve_config:
@@ -188,7 +190,6 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
         self.customized_prompt = self._retrieve_config.get("customized_prompt", None)
         self.customized_answer_prefix = self._retrieve_config.get("customized_answer_prefix", "").upper()
         self.update_context = self._retrieve_config.get("update_context", True)
-        self._get_or_create = self._retrieve_config.get("get_or_create", False) if self._docs_path is not None else True
         self.custom_token_count_function = self._retrieve_config.get("custom_token_count_function", count_token)
         self.custom_text_split_function = self._retrieve_config.get("custom_text_split_function", None)
         self._custom_text_types = self._retrieve_config.get("custom_text_types", TEXT_FORMATS)
@@ -202,6 +203,26 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
         self._doc_contents = []  # the contents of the current used doc
         self._doc_ids = []  # the ids of the current used doc
         self._search_string = ""  # the search string used in the current query
+        self._db_mode = self._retrieve_config.get("db_mode")
+        self._get_or_create = self._retrieve_config.get("get_or_create")
+        if self._db_mode and self._get_or_create:
+            logger.warning(
+                colored(
+                    "Warning: db_mode and get_or_create are both set. get_or_create will be ignored. get_or_create is depricated",
+                    "yellow",
+                )
+            )
+            self._get_or_create = None
+        elif self._db_mode is None and self._get_or_create is None:  # if both not set, set db_mode's default value
+            self._db_mode = "recreate"
+        elif self._get_or_create:
+            logger.warning(
+                colored(
+                    "Warning: get_or_create is depricated and will be removed from future versions. Use `db_mode` instead",
+                    "yellow",
+                )
+            )
+
         # update the termination message function
         self._is_termination_msg = (
             self._is_termination_msg_retrievechat if is_termination_msg is None else is_termination_msg
@@ -362,13 +383,9 @@ def _generate_retrieve_user_reply(
 
     def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str = ""):
         """Retrieve docs based on the given problem and assign the results to the class property `_results`.
-        In case you want to customize the retrieval process, such as using a different vector db whose APIs are not
-        compatible with chromadb or filter results with metadata, you can override this function. Just keep the current
-        parameters and add your own parameters with default values, and keep the results in below type.
 
         Type of the results: Dict[str, List[List[Any]]], should have keys "ids" and "documents", "ids" for the ids of
-        the retrieved docs and "documents" for the contents of the retrieved docs. Any other keys are optional. Refer
-        to `chromadb.api.types.QueryResult` as an example.
+        the retrieved docs and "documents" for the contents of the retrieved docs. Any other keys are optional.
             ids: List[string]
             documents: List[List[string]]
 
@@ -377,33 +394,57 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
             n_results (int): the number of results to be retrieved. Default is 20.
             search_string (str): only docs that contain an exact match of this string will be retrieved. Default is "".
         """
-        if not self._collection or not self._get_or_create:
-            print("Trying to create collection.")
-            self._client = create_vector_db_from_dir(
-                dir_path=self._docs_path,
+        if not self.retriever:
+            retriever_class = get_retriever(self._retriever_type)
+            self.retriever = retriever_class(
+                path=self._retriever_path,
+                name=self._collection_name,
+                embedding_model_name=self._embedding_model,
+                embedding_function=self._embedding_function,
                 max_tokens=self._chunk_token_size,
-                client=self._client,
-                collection_name=self._collection_name,
                 chunk_mode=self._chunk_mode,
                 must_break_at_empty_line=self._must_break_at_empty_line,
-                embedding_model=self._embedding_model,
-                get_or_create=self._get_or_create,
-                embedding_function=self._embedding_function,
                 custom_text_split_function=self.custom_text_split_function,
+                client=self._client,
                 custom_text_types=self._custom_text_types,
                 recursive=self._recursive,
             )
-            self._collection = True
-            self._get_or_create = True
-
-        results = query_vector_db(
-            query_texts=[problem],
-            n_results=n_results,
-            search_string=search_string,
-            client=self._client,
-            collection_name=self._collection_name,
-            embedding_model=self._embedding_model,
-            embedding_function=self._embedding_function,
+        if self._db_mode:
+            if self._db_mode not in ["get", "recreate", "create"]:
+                raise ValueError(
+                    f"db_mode {self._db_mode} is not supported. Possible values are 'get', 'recreate', 'create'."
+                )
+            if self._db_mode == "get":
+                if (
+                    not self.retriever.index_exists
+                ):  # warn users if the index doesn't exist. Maybe we can even raise here
+                    raise ValueError("The index doesn't exist. Please set db_mode to 'recreate' or 'create'.")
+                self.retriever.use_existing_index()
+            elif self._db_mode == "recreate":
+                logger.info("Trying to create index. If the index already exists, it will be recreated.")
+                self.retriever.ingest_data(self._docs_path, overwrite=True)
+            elif self._db_mode == "create":
+                logger.info("Trying to create index.")
+                if self.retriever.index_exists:
+                    raise ValueError("The index already exists. Please set db_mode to 'get' or 'recreate'.")
+                self.retriever.ingest_data(self._docs_path, overwrite=False)
+
+        elif self._get_or_create:
+            if not self.retriever.index_exists or self._get_or_create:
+                if not self.retriever.index_exists:
+                    logger.info("Trying to create index.")
+                    self.retriever.ingest_data(self._docs_path, overwrite=False)
+                else:
+                    logger.info("Trying to recreate index.")
+                    self.retriever.ingest_data(self._docs_path, overwrite=True)
+            else:
+                logger.info("Trying to use existing collection.")
+                self.retriever.use_existing_index()
+
+        results = self.retriever.query(
+            texts=[problem],
+            top_k=n_results,
+            filter=search_string,
         )
         self._search_string = search_string
         self._results = results

diff --git a/autogen/agentchat/contrib/retriever/__init__.py b/autogen/agentchat/contrib/retriever/__init__.py
@@ -0,0 +1 @@
+from .retrieve_utils import get_retriever
diff --git a/autogen/agentchat/contrib/retriever/base.py b/autogen/agentchat/contrib/retriever/base.py
@@ -0,0 +1,85 @@
+from abc import ABC, abstractmethod
+from typing import List, Union, Callable, Any
+
+
+class Retriever(ABC):
+    """Abstract interface for a vector-store backend; `__init__` stores the config and calls `init_db`."""
+
+    def __init__(
+        self,
+        path="./db",
+        name="vectorstore",
+        embedding_model_name="all-MiniLM-L6-v2",
+        embedding_function=None,
+        max_tokens: int = 4000,
+        chunk_mode: str = "multi_lines",
+        must_break_at_empty_line: bool = True,
+        custom_text_split_function: Callable = None,
+        client=None,
+        # TODO: add support for custom text types and recursive
+        custom_text_types: List[str] = None,
+        recursive: bool = True,
+    ):
+        """
+        Args:
+            path: path to the folder where the database is stored
+            name: name of the database
+            embedding_model_name: name of the embedding model to use
+            embedding_function: function to use to embed the text
+            max_tokens: maximum number of tokens to embed
+            chunk_mode: mode to chunk the text. Can be "multi_lines" or "one_line"
+            must_break_at_empty_line: chunk will only break at empty line if True. Default is True.
+                    If chunk_mode is "one_line", this parameter will be ignored.
+            custom_text_split_function: custom function to split the text into chunks
+            client: optional existing database client/connection, stored as `self.client`
+            custom_text_types: stored as `self.custom_text_types`; not used by this base class
+            recursive: stored as `self.recursive`; not used by this base class
+        """
+        self.path = path
+        self.name = name
+        self.embedding_model_name = embedding_model_name
+        self.embedding_function = embedding_function
+        self.max_tokens = max_tokens
+        self.chunk_mode = chunk_mode
+        self.must_break_at_empty_line = must_break_at_empty_line
+        self.custom_text_split_function = custom_text_split_function
+        self.client = client
+        self.custom_text_types = custom_text_types
+        self.recursive = recursive
+
+        # Must stay last: init_db implementations read the attributes assigned above.
+        self.init_db()
+
+    @abstractmethod
+    def ingest_data(self, data_dir, overwrite: bool = False):
+        """
+        Create a vector database from a directory of files.
+        Args:
+            data_dir: path to the directory containing the text files
+            overwrite: overwrite the existing database if True
+        """
+
+    @abstractmethod
+    def use_existing_index(self):
+        """Open an existing index."""
+
+    @abstractmethod
+    def query(self, texts: List[str], top_k: int = 10, filter: Any = None):
+        """
+        Query the database.
+        Args:
+            texts: list of query strings
+            top_k: number of results to return
+            filter: optional backend-specific restriction on the results
+        """
+
+    @abstractmethod
+    def init_db(self):
+        """
+        Initialize the database. Called at the end of `__init__`, once every attribute is set.
+        """
+
+    @property
+    @abstractmethod
+    def index_exists(self):
+        """Check if the index exists in the database."""
diff --git a/autogen/agentchat/contrib/retriever/chromadb.py b/autogen/agentchat/contrib/retriever/chromadb.py
@@ -0,0 +1,85 @@
+from typing import List
+from .base import Retriever
+from .retrieve_utils import split_text_to_chunks, extract_text_from_pdf, split_files_to_chunks, get_files_from_dir
+
+try:
+    import chromadb
+
+    try:  # probe for ClientAPI (the >= 0.4.15 name); comparing version strings is lexicographic and misorders them
+        from chromadb.api import ClientAPI as API
+    except ImportError:
+        from chromadb.api import API
+    from chromadb.api.types import QueryResult
+    import chromadb.utils.embedding_functions as ef
+except ImportError:
+    raise ImportError("Please install chromadb: pip install chromadb")
+
+
+class ChromaDB(Retriever):
+    """Retriever backed by a chromadb collection named `self.name`."""
+
+    def init_db(self):
+        """Create the client (unless the caller supplied one) and the default embedding function."""
+        import os  # local import so this class does not depend on the module's import block
+        if self.client is None:
+            self.client = chromadb.PersistentClient(path=os.path.expanduser(self.path))
+        if self.embedding_function is None:
+            self.embedding_function = ef.SentenceTransformerEmbeddingFunction(self.embedding_model_name)
+        self.collection = None
+
+    def ingest_data(self, data_dir, overwrite: bool = False):
+        """
+        Create a vector database from a directory of files.
+        Args:
+            data_dir: path to the directory containing the text files
+            overwrite: passed as chromadb's `get_or_create`; an existing collection is reused, not dropped
+        """
+        self.collection = self.client.create_collection(
+            self.name,
+            embedding_function=self.embedding_function,
+            get_or_create=overwrite,
+            # https://github.com/nmslib/hnswlib#supported-distances
+            # https://github.com/chroma-core/chroma/blob/566bc80f6c8ee29f7d99b6322654f32183c368c4/chromadb/segment/impl/vector/local_hnsw.py#L184
+            # https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
+            metadata={"hnsw:space": "ip", "hnsw:construction_ef": 30, "hnsw:M": 32},  # ip, l2, cosine
+        )
+
+        files = get_files_from_dir(data_dir)
+        if self.custom_text_split_function is not None:
+            chunks = split_files_to_chunks(files, custom_text_split_function=self.custom_text_split_function)
+        else:
+            chunks = split_files_to_chunks(files, self.max_tokens, self.chunk_mode, self.must_break_at_empty_line)
+        print(f"Found {len(chunks)} chunks.")
+        # Upsert in batches of at most 40000; the constant step keeps range() valid when there are zero chunks
+        for i in range(0, len(chunks), 40000):
+            end_idx = min(i + 40000, len(chunks))
+            self.collection.upsert(
+                documents=chunks[i:end_idx],
+                ids=[f"doc_{j}" for j in range(i, end_idx)],  # unique for each doc
+            )
+
+    def use_existing_index(self):
+        """Bind `self.collection` to the collection named `self.name` fetched from the client."""
+        self.collection = self.client.get_collection(name=self.name, embedding_function=self.embedding_function)
+
+    def query(self, texts: List[str], top_k: int = 10, filter: str = None):
+        """Query the collection with `top_k` results per text; a non-empty `filter` is sent as a `$contains` clause."""
+        # the collection's embedding function is always the default one, but we want to use the one we used to create the
+        # collection. So we compute the embeddings ourselves and pass it to the query function.
+        query_embeddings = self.embedding_function(texts)
+        # Query/search n most similar results. You can also .get by id
+        return self.collection.query(
+            query_embeddings=query_embeddings,
+            n_results=top_k,
+            where_document={"$contains": filter} if filter else None,  # optional filter
+        )
+
+    @property
+    def index_exists(self):
+        """Return True if `get_collection` succeeds for `self.name`, False if it raises."""
+        try:
+            self.client.get_collection(name=self.name, embedding_function=self.embedding_function)
+            # Not sure if there's an explicit way to check if a collection exists for chromadb
+            return True
+        except Exception:
+            return False