diff --git a/.github/workflows/contrib-openai.yml b/.github/workflows/contrib-openai.yml index 467d5270c8e..78770792fb4 100644 --- a/.github/workflows/contrib-openai.yml +++ b/.github/workflows/contrib-openai.yml @@ -42,6 +42,7 @@ jobs: pip install docker pip install qdrant_client[fastembed] pip install -e .[retrievechat] + pip install chromadb - name: Coverage env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -49,7 +50,7 @@ jobs: AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }} OAI_CONFIG_LIST: ${{ secrets.OAI_CONFIG_LIST }} run: | - coverage run -a -m pytest test/agentchat/contrib/test_retrievechat.py test/agentchat/contrib/test_qdrant_retrievechat.py + coverage run -a -m pytest test/agentchat/contrib/retrievers coverage xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml index 7e1eaa7d85b..a042b93dec2 100644 --- a/.github/workflows/contrib-tests.yml +++ b/.github/workflows/contrib-tests.yml @@ -45,15 +45,16 @@ jobs: - name: Install packages and dependencies for RetrieveChat run: | pip install -e .[retrievechat] + pip install chromadb pip uninstall -y openai - name: Test RetrieveChat run: | - pytest test/test_retrieve_utils.py test/agentchat/contrib/test_retrievechat.py test/agentchat/contrib/test_qdrant_retrievechat.py + pytest test/agentchat/contrib/retrievers - name: Coverage if: matrix.python-version == '3.10' run: | pip install coverage>=5.3 - coverage run -a -m pytest test/test_retrieve_utils.py test/agentchat/contrib + coverage run -a -m pytest test/agentchat/contrib/retrievers coverage xml - name: Upload coverage to Codecov if: matrix.python-version == '3.10' diff --git a/autogen/__init__.py b/autogen/__init__.py index 3002ad5df8e..5d3a8a14b5e 100644 --- a/autogen/__init__.py +++ b/autogen/__init__.py @@ -4,7 +4,6 @@ from .agentchat import * from .code_utils import DEFAULT_MODEL, FAST_MODEL - # Set the root logger. logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) diff --git a/autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py b/autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py index 88a2d318aa9..9a8b141cd30 100644 --- a/autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py +++ b/autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py @@ -1,7 +1,7 @@ from typing import Callable, Dict, List, Optional from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent -from autogen.retrieve_utils import get_files_from_dir, split_files_to_chunks, TEXT_FORMATS +from autogen.agentchat.contrib.retriever.retrieve_utils import get_files_from_dir, split_files_to_chunks, TEXT_FORMATS import logging logger = logging.getLogger(__name__) diff --git a/autogen/agentchat/contrib/retrieve_user_proxy_agent.py b/autogen/agentchat/contrib/retrieve_user_proxy_agent.py index 28879d31563..280a7e0d7c3 100644 --- a/autogen/agentchat/contrib/retrieve_user_proxy_agent.py +++ b/autogen/agentchat/contrib/retrieve_user_proxy_agent.py @@ -1,14 +1,12 @@ import re -try: - import chromadb -except ImportError: - raise ImportError("Please install dependencies first. 
`pip install pyautogen[retrievechat]`") from autogen.agentchat.agent import Agent from autogen.agentchat import UserProxyAgent -from autogen.retrieve_utils import create_vector_db_from_dir, query_vector_db, TEXT_FORMATS +from autogen.agentchat.contrib.retriever.retrieve_utils import TEXT_FORMATS from autogen.token_count_utils import count_token from autogen.code_utils import extract_code +from autogen.agentchat.contrib.retriever import get_retriever + from autogen import logger from typing import Callable, Dict, Optional, Union, List, Tuple, Any @@ -94,12 +92,14 @@ def __init__( The dict can contain the following keys: "content", "role", "name", "function_call". retrieve_config (dict or None): config for the retrieve agent. To use default config, set to None. Otherwise, set to a dictionary with the following keys: + - retriever_type (Optional, str): the type of the retriever. + - retriever_path (Optional, str): the path to use for retriever-realted operations. Default is `~/autogen`. - task (Optional, str): the task of the retrieve chat. Possible values are "code", "qa" and "default". System prompt will be different for different tasks. The default value is `default`, which supports both code and qa. - - client (Optional, chromadb.Client): the chromadb client. If key not provided, a default client `chromadb.Client()` - will be used. If you want to use other vector db, extend this class and override the `retrieve_docs` function. + - client (Optional, Any): the vectordb client/connection. If key not provided, the Retreiver class should handle it. - docs_path (Optional, Union[str, List[str]]): the path to the docs directory. It can also be the path to a single file, - the url to a single file or a list of directories, files and urls. Default is None, which works only if the collection is already created. + the url to a single file or a list of directories, files and urls. + Default is None, which works only if the collection is already created. - collection_name (Optional, str): the name of the collection. If key not provided, a default name `autogen-docs` will be used. - model (Optional, str): the model to use for the retrieve chat. @@ -123,8 +123,14 @@ def __init__( - customized_answer_prefix (Optional, str): the customized answer prefix for the retrieve chat. Default is "". If not "" and the customized_answer_prefix is not in the answer, `Update Context` will be triggered. - update_context (Optional, bool): if False, will not apply `Update Context` for interactive retrieval. Default is True. - - get_or_create (Optional, bool): if True, will create/return a collection for the retrieve chat. This is the same as that used in chromadb. - Default is False. Will raise ValueError if the collection already exists and get_or_create is False. Will be set to True if docs_path is None. + - db_mode (Optional, str): the mode to create the vector db. Possible values are "get", "recreate", "create". Default is "recreate" to + keep the workflow less error-prone. If "get", will try to get an existing collection. If "recreate", will recreate a collection + if the collection already exists. If "create", will create a collection if the collection doesn't exist. + Raises ValueError if: + * the collection doesn't exist and "get" is used. + * the collection already exists and "create" is used. + - get_or_create (Optional, bool): [Depricated] if True, will create/recreate a collection for the retrieve chat. + This is the same as that used in retriever. Default is False. Will be set to False if docs_path is None. 
- custom_token_count_function (Optional, Callable): a custom function to count the number of tokens in a string.
                    The function should take (text:str, model:str) as input and return the token_count(int). the retrieve_config["model"] will be passed in the function.
                    Default is autogen.token_count_utils.count_token that uses tiktoken, which may not be accurate for non-OpenAI models.
@@ -136,7 +142,7 @@ def __init__(
                **kwargs (dict): other kwargs in [UserProxyAgent](../user_proxy_agent#__init__).

        Example of overriding retrieve_docs:
-        If you have set up a customized vector db, and it's not compatible with chromadb, you can easily plug in it with below code.
+        If you want to set up a customized vector db that is not compatible with the built-in retrievers, you can easily plug it in with the code below.
        ```python
        class MyRetrieveUserProxyAgent(RetrieveUserProxyAgent):
            def query_vector_db(
@@ -166,10 +172,12 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
            human_input_mode=human_input_mode,
            **kwargs,
        )
-
+        self.retriever = None
         self._retrieve_config = {} if retrieve_config is None else retrieve_config
+        self._retriever_type = self._retrieve_config.get("retriever_type")
+        self._retriever_path = self._retrieve_config.get("retriever_path", "~/autogen")
         self._task = self._retrieve_config.get("task", "default")
-        self._client = self._retrieve_config.get("client", chromadb.Client())
+        self._client = self._retrieve_config.get("client", None)
         self._docs_path = self._retrieve_config.get("docs_path", None)
         self._collection_name = self._retrieve_config.get("collection_name", "autogen-docs")
         if "docs_path" not in self._retrieve_config:
@@ -188,7 +196,6 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
         self.customized_prompt = self._retrieve_config.get("customized_prompt", None)
         self.customized_answer_prefix = self._retrieve_config.get("customized_answer_prefix", "").upper()
         self.update_context = self._retrieve_config.get("update_context", True)
-        self._get_or_create = self._retrieve_config.get("get_or_create", False) if self._docs_path is not None else True
         self.custom_token_count_function = self._retrieve_config.get("custom_token_count_function", count_token)
         self.custom_text_split_function = self._retrieve_config.get("custom_text_split_function", None)
         self._custom_text_types = self._retrieve_config.get("custom_text_types", TEXT_FORMATS)
@@ -202,6 +209,26 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str =
         self._doc_contents = []  # the contents of the current used doc
         self._doc_ids = []  # the ids of the current used doc
         self._search_string = ""  # the search string used in the current query
+        self._db_mode = self._retrieve_config.get("db_mode")
+        self._get_or_create = self._retrieve_config.get("get_or_create")
+        if self._db_mode is not None and self._get_or_create is not None:
+            logger.warning(
+                colored(
+                    "Warning: db_mode and get_or_create are both set. get_or_create is deprecated and will be ignored.",
+                    "yellow",
+                )
+            )
+            self._get_or_create = None
+        elif self._db_mode is None and self._get_or_create is None:  # if both are unset, use db_mode's default value
+            self._db_mode = "recreate"
+        elif self._get_or_create:
+            logger.warning(
+                colored(
+                    "Warning: get_or_create is deprecated and will be removed in future versions.
Use `db_mode` instead", + "yellow", + ) + ) + # update the termination message function self._is_termination_msg = ( self._is_termination_msg_retrievechat if is_termination_msg is None else is_termination_msg @@ -362,13 +389,9 @@ def _generate_retrieve_user_reply( def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str = ""): """Retrieve docs based on the given problem and assign the results to the class property `_results`. - In case you want to customize the retrieval process, such as using a different vector db whose APIs are not - compatible with chromadb or filter results with metadata, you can override this function. Just keep the current - parameters and add your own parameters with default values, and keep the results in below type. Type of the results: Dict[str, List[List[Any]]], should have keys "ids" and "documents", "ids" for the ids of - the retrieved docs and "documents" for the contents of the retrieved docs. Any other keys are optional. Refer - to `chromadb.api.types.QueryResult` as an example. + the retrieved docs and "documents" for the contents of the retrieved docs. Any other keys are optional. ids: List[string] documents: List[List[string]] @@ -377,33 +400,51 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str = n_results (int): the number of results to be retrieved. Default is 20. search_string (str): only docs that contain an exact match of this string will be retrieved. Default is "". """ - if not self._collection or not self._get_or_create: - print("Trying to create collection.") - self._client = create_vector_db_from_dir( - dir_path=self._docs_path, + if not self.retriever: + retriever_class = get_retriever(self._retriever_type) + self.retriever = retriever_class( + path=self._retriever_path, + name=self._collection_name, + embedding_model_name=self._embedding_model, + embedding_function=self._embedding_function, max_tokens=self._chunk_token_size, - client=self._client, - collection_name=self._collection_name, chunk_mode=self._chunk_mode, must_break_at_empty_line=self._must_break_at_empty_line, - embedding_model=self._embedding_model, - get_or_create=self._get_or_create, - embedding_function=self._embedding_function, custom_text_split_function=self.custom_text_split_function, + client=self._client, custom_text_types=self._custom_text_types, recursive=self._recursive, ) - self._collection = True - self._get_or_create = True - - results = query_vector_db( - query_texts=[problem], - n_results=n_results, - search_string=search_string, - client=self._client, - collection_name=self._collection_name, - embedding_model=self._embedding_model, - embedding_function=self._embedding_function, + if self._db_mode: + if self._db_mode not in ["get", "recreate", "create"]: + raise ValueError( + f"db_mode {self._db_mode} is not supported. Possible values are 'get', 'recreate', 'create'." + ) + if self._db_mode == "get": + if not self.retriever.index_exists: + raise ValueError("The index doesn't exist. Please set db_mode to 'recreate' or 'create'.") + self.retriever.use_existing_index() + elif self._db_mode == "recreate": + logger.info("Trying to create index. If the index already exists, it will be recreated.") + self.retriever.ingest_data(self._docs_path, overwrite=True) + elif self._db_mode == "create": + logger.info("Trying to create index.") + if self.retriever.index_exists: + raise ValueError("The index already exists. 
Please set db_mode to 'get' or 'recreate'.")
+                self.retriever.ingest_data(self._docs_path, overwrite=False)
+
+        elif self._get_or_create is not None:
+            if self._get_or_create and self.retriever.index_exists:
+                logger.info("Trying to use existing collection.")
+                self.retriever.use_existing_index()
+            else:
+                logger.info("Trying to create index.")
+                self.retriever.ingest_data(self._docs_path, overwrite=False)
+
+        results = self.retriever.query(
+            texts=[problem],
+            top_k=n_results,
+            search_string=search_string,  # `Retriever.query` expects `search_string`, not `filter`
         )
         self._search_string = search_string
         self._results = results
diff --git a/autogen/agentchat/contrib/retriever/__init__.py b/autogen/agentchat/contrib/retriever/__init__.py
new file mode 100644
index 00000000000..389bed28dd0
--- /dev/null
+++ b/autogen/agentchat/contrib/retriever/__init__.py
@@ -0,0 +1 @@
+from .retrieve_utils import get_retriever
diff --git a/autogen/agentchat/contrib/retriever/base.py b/autogen/agentchat/contrib/retriever/base.py
new file mode 100644
index 00000000000..f9a8331b145
--- /dev/null
+++ b/autogen/agentchat/contrib/retriever/base.py
@@ -0,0 +1,91 @@
+from abc import ABC, abstractmethod
+from typing import List, Union, Callable, Any
+
+
+class Retriever(ABC):
+    def __init__(
+        self,
+        path="./db",
+        name="vectorstore",
+        embedding_model_name="all-MiniLM-L6-v2",
+        embedding_function=None,
+        max_tokens: int = 4000,
+        chunk_mode: str = "multi_lines",
+        must_break_at_empty_line: bool = True,
+        custom_text_split_function: Callable = None,
+        client=None,
+        # TODO: add support for custom text types and recursive ingestion
+        custom_text_types: List[str] = None,
+        recursive: bool = True,
+    ):
+        """
+        Args:
+            path: path to the folder where the database is stored
+            name: name of the database
+            embedding_model_name: name of the embedding model to use
+            embedding_function: function to use to embed the text
+            max_tokens: maximum number of tokens to embed
+            chunk_mode: mode to chunk the text. Can be "multi_lines" or "one_line"
+            must_break_at_empty_line: chunk will only break at empty line if True. Default is True.
+                If chunk_mode is "one_line", this parameter will be ignored.
+            custom_text_split_function: custom function to split the text into chunks
+            client: client to use to connect to the database
+            custom_text_types: custom text types to ingest
+            recursive: whether to recursively ingest the files in the directory
+        """
+        self.path = path
+        self.name = name
+        self.embedding_model_name = embedding_model_name
+        self.embedding_function = embedding_function
+        self.max_tokens = max_tokens
+        self.chunk_mode = chunk_mode
+        self.must_break_at_empty_line = must_break_at_empty_line
+        self.custom_text_split_function = custom_text_split_function
+        self.client = client
+        self.custom_text_types = custom_text_types
+        self.recursive = recursive
+
+        self.init_db()
+
+    @abstractmethod
+    def ingest_data(self, data_dir, overwrite: bool = False):
+        """
+        Create a vector database from a directory of files.
+        Args:
+            data_dir: path to the directory containing the text files
+            overwrite: overwrite the existing database if True
+        """
+        pass
+
+    @abstractmethod
+    def use_existing_index(self):
+        """
+        Open an existing index.
+        """
+        pass
+
+    @abstractmethod
+    def query(self, texts: List[str], top_k: int = 10, search_string: Any = None):
+        """
+        Query the database.
+        Args:
+            texts: list of texts to query
+            top_k: number of results to return
+            search_string: string to filter the results
+        """
+        pass
+
+    @abstractmethod
+    def init_db(self):
+        """
+        Initialize the database.
+ """ + pass + + @property + @abstractmethod + def index_exists(self): + """ + Check if the index exists in the database. + """ + pass diff --git a/autogen/agentchat/contrib/retriever/chromadb.py b/autogen/agentchat/contrib/retriever/chromadb.py new file mode 100644 index 00000000000..dc1e81db028 --- /dev/null +++ b/autogen/agentchat/contrib/retriever/chromadb.py @@ -0,0 +1,87 @@ +from typing import List +from .base import Retriever +from .retrieve_utils import split_text_to_chunks, extract_text_from_pdf, split_files_to_chunks, get_files_from_dir + +try: + import chromadb + + if chromadb.__version__ < "0.4.15": + from chromadb.api import API + else: + from chromadb.api import ClientAPI as API + from chromadb.api.types import QueryResult + import chromadb.utils.embedding_functions as ef +except ImportError: + raise ImportError("Please install chromadb: pip install chromadb") + + +class ChromaDB(Retriever): + def init_db(self): + self.client = chromadb.PersistentClient(path=self.path) + self.embedding_function = ( + ef.SentenceTransformerEmbeddingFunction(self.embedding_model_name) + if self.embedding_function is None + else self.embedding_function + ) + self.collection = None + + def ingest_data(self, data_dir, overwrite: bool = False): + """ + Create a vector database from a directory of files. + Args: + data_dir: path to the directory containing the text files + """ + if overwrite is True and self.index_exists: + self.client.delete_collection(name=self.name) + + self.collection = self.client.create_collection( + self.name, + embedding_function=self.embedding_function, + get_or_create=overwrite, + # https://github.com/nmslib/hnswlib#supported-distances + # https://github.com/chroma-core/chroma/blob/566bc80f6c8ee29f7d99b6322654f32183c368c4/chromadb/segment/impl/vector/local_hnsw.py#L184 + # https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md + metadata={"hnsw:space": "ip", "hnsw:construction_ef": 30, "hnsw:M": 32}, # ip, l2, cosine + ) + + if self.custom_text_split_function is not None: + chunks = split_files_to_chunks( + get_files_from_dir(data_dir), custom_text_split_function=self.custom_text_split_function + ) + else: + chunks = split_files_to_chunks( + get_files_from_dir(data_dir), self.max_tokens, self.chunk_mode, self.must_break_at_empty_line + ) + print(f"Found {len(chunks)} chunks.") # + # Upsert in batch of 40000 or less if the total number of chunks is less than 40000 + for i in range(0, len(chunks), min(40000, len(chunks))): + end_idx = i + min(40000, len(chunks) - i) + self.collection.upsert( + documents=chunks[i:end_idx], + ids=[f"doc_{j}" for j in range(i, end_idx)], # unique for each doc + ) + + def use_existing_index(self): + self.collection = self.client.get_collection(name=self.name, embedding_function=self.embedding_function) + + def query(self, texts: List[str], top_k: int = 10, search_string: str = None): + # the collection's embedding function is always the default one, but we want to use the one we used to create the + # collection. So we compute the embeddings ourselves and pass it to the query function. + + query_embeddings = self.embedding_function(texts) + # Query/search n most similar results. 
You can also .get by id + results = self.collection.query( + query_embeddings=query_embeddings, + n_results=top_k, + where_document={"$contains": search_string} if search_string else None, # optional filter + ) + return results + + @property + def index_exists(self): + try: + self.client.get_collection(name=self.name, embedding_function=self.embedding_function) + # Not sure if there's an explicit way to check if a collection exists for chromadb + return True + except Exception: + return False diff --git a/autogen/agentchat/contrib/retriever/lancedb.py b/autogen/agentchat/contrib/retriever/lancedb.py new file mode 100644 index 00000000000..88dd3ccba6d --- /dev/null +++ b/autogen/agentchat/contrib/retriever/lancedb.py @@ -0,0 +1,97 @@ +from typing import Callable, List +from collections import defaultdict +from .base import Retriever +from .retrieve_utils import split_text_to_chunks, extract_text_from_pdf, split_files_to_chunks, get_files_from_dir + +try: + import lancedb + from lancedb.embeddings import get_registry, EmbeddingFunction, with_embeddings + from lancedb.pydantic import LanceModel, Vector + import pyarrow as pa +except ImportError: + raise ImportError("Please install lancedb: pip install lancedb") + + +class LanceDB(Retriever): + db = None + table = None + + def init_db(self): + self.db = lancedb.connect(self.path) + self.embedding_function = ( + get_registry().get("sentence-transformers").create(name=self.embedding_model_name, show_progress_bar=True) + if self.embedding_function is None + else self.embedding_function + ) + + def ingest_data(self, data_dir, overwrite: bool = False): + """ + Create a vector database from a directory of files. + Args: + data_dir: path to the directory containing the text files + """ + schema = self._get_schema(self.embedding_function) + self.table = self.db.create_table(self.name, schema=schema, mode="overwrite" if overwrite else "create") + + if self.custom_text_split_function is not None: + chunks = split_files_to_chunks( + get_files_from_dir(data_dir), custom_text_split_function=self.custom_text_split_function + ) + else: + chunks = split_files_to_chunks( + get_files_from_dir(data_dir), self.max_tokens, self.chunk_mode, self.must_break_at_empty_line + ) + print(f"Found {len(chunks)} chunks.") # + data = [{"documents": docs, "ids": idx} for idx, docs in enumerate(chunks)] + if isinstance(self.embedding_function, EmbeddingFunction): # this means we are using embedding API + self.table.add(data) + elif isinstance(self.embedding_function, Callable): + pa_table = pa.Table.from_pylist(data) + data = with_embeddings(self.embedding_function, pa_table, column="documents") + self.table.add(data) + + def use_existing_index(self): + self.table = self.db.open_table(self.name) + + def query(self, texts: List[str], top_k: int = 10, search_string: str = None): + if self.db is None: + self.init_db() + texts = [texts] if isinstance(texts, str) else texts + results = defaultdict(list) + for text in texts: + query = self.embedding_function(text) if isinstance(self.embedding_function, Callable) else text + print("query: ", query) + result = self.table.search(query) + if search_string is not None: + result = result.where(f"documents LIKE '%{search_string}%'") + result = result.limit(top_k).to_arrow().to_pydict() + for k, v in result.items(): + results[k].append(v) + + return results + + @property + def index_exists(self): + return self.name in self.db.table_names() + + def _get_schema(self, embedding_function): + if isinstance(embedding_function, EmbeddingFunction): 
+ + class Schema(LanceModel): + vector: Vector(embedding_function.ndims()) = embedding_function.VectorField() + documents: str = embedding_function.SourceField() + ids: str + + return Schema + elif isinstance(embedding_function, Callable): + dim = embedding_function("test")[0].shape[0] # TODO: check this + schema = pa.schema( + [ + pa.field("vector", pa.list_(pa.float32(), dim)), + pa.field("documents", pa.string()), + pa.field("ids", pa.string()), + ] + ) + return schema + else: + raise ValueError("embedding_function should be a callable or an EmbeddingFunction instance") diff --git a/autogen/agentchat/contrib/retriever/retrieve_utils.py b/autogen/agentchat/contrib/retriever/retrieve_utils.py new file mode 100644 index 00000000000..3b68fa84758 --- /dev/null +++ b/autogen/agentchat/contrib/retriever/retrieve_utils.py @@ -0,0 +1,240 @@ +from typing import List, Union, Callable, Optional +import os +import requests +from urllib.parse import urlparse +import glob +import logging +from autogen.token_count_utils import count_token + +try: + from unstructured.partition.auto import partition + + HAS_UNSTRUCTURED = True +except ImportError: + HAS_UNSTRUCTURED = False + +logger = logging.getLogger(__name__) +TEXT_FORMATS = [ + "txt", + "json", + "csv", + "tsv", + "md", + "html", + "htm", + "rtf", + "rst", + "jsonl", + "log", + "xml", + "yaml", + "yml", + "pdf", +] +UNSTRUCTURED_FORMATS = [ + "docx", + "doc", + "odt", + "pptx", + "ppt", + "xlsx", + "eml", + "msg", + "epub", +] # These formats will be parsed by the 'unstructured' library, if installed. +if HAS_UNSTRUCTURED: + TEXT_FORMATS += UNSTRUCTURED_FORMATS + TEXT_FORMATS = list(set(TEXT_FORMATS)) +VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"}) + + +def split_text_to_chunks( + text: str, + max_tokens: int = 4000, + chunk_mode: str = "multi_lines", + must_break_at_empty_line: bool = True, + overlap: int = 10, +): + """Split a long text into chunks of max_tokens.""" + if chunk_mode not in VALID_CHUNK_MODES: + raise AssertionError + if chunk_mode == "one_line": + must_break_at_empty_line = False + chunks = [] + lines = text.split("\n") + lines_tokens = [count_token(line) for line in lines] + sum_tokens = sum(lines_tokens) + while sum_tokens > max_tokens: + if chunk_mode == "one_line": + estimated_line_cut = 2 + else: + estimated_line_cut = int(max_tokens / sum_tokens * len(lines)) + 1 + cnt = 0 + prev = "" + for cnt in reversed(range(estimated_line_cut)): + if must_break_at_empty_line and lines[cnt].strip() != "": + continue + if sum(lines_tokens[:cnt]) <= max_tokens: + prev = "\n".join(lines[:cnt]) + break + if cnt == 0: + logger.warning( + f"max_tokens is too small to fit a single line of text. Breaking this line:\n\t{lines[0][:100]} ..." 
+ ) + if not must_break_at_empty_line: + split_len = int(max_tokens / lines_tokens[0] * 0.9 * len(lines[0])) + prev = lines[0][:split_len] + lines[0] = lines[0][split_len:] + lines_tokens[0] = count_token(lines[0]) + else: + logger.warning("Failed to split docs with must_break_at_empty_line being True, set to False.") + must_break_at_empty_line = False + chunks.append(prev) if len(prev) > 10 else None # don't add chunks less than 10 characters + lines = lines[cnt:] + lines_tokens = lines_tokens[cnt:] + sum_tokens = sum(lines_tokens) + text_to_chunk = "\n".join(lines) + chunks.append(text_to_chunk) if len(text_to_chunk) > 10 else None # don't add chunks less than 10 characters + return chunks + + +def extract_text_from_pdf(file: str) -> str: + """Extract text from PDF files""" + import pypdf # optional dependency + + text = "" + with open(file, "rb") as f: + reader = pypdf.PdfReader(f) + if reader.is_encrypted: # Check if the PDF is encrypted + try: + reader.decrypt("") + except pypdf.errors.FileNotDecryptedError as e: + logger.warning(f"Could not decrypt PDF {file}, {e}") + return text # Return empty text if PDF could not be decrypted + + for page_num in range(len(reader.pages)): + page = reader.pages[page_num] + text += page.extract_text() + + if not text.strip(): # Debugging line to check if text is empty + logger.warning(f"Could not decrypt PDF {file}") + + return text + + +def split_files_to_chunks( + files: list, + max_tokens: int = 4000, + chunk_mode: str = "multi_lines", + must_break_at_empty_line: bool = True, + custom_text_split_function: Callable = None, +): + """Split a list of files into chunks of max_tokens.""" + + chunks = [] + + for file in files: + _, file_extension = os.path.splitext(file) + file_extension = file_extension.lower() + + if HAS_UNSTRUCTURED and file_extension[1:] in UNSTRUCTURED_FORMATS: + text = partition(file) + text = "\n".join([t.text for t in text]) if len(text) > 0 else "" + elif file_extension == ".pdf": + text = extract_text_from_pdf(file) + else: # For non-PDF text-based files + with open(file, "r", encoding="utf-8", errors="ignore") as f: + text = f.read() + + if not text.strip(): # Debugging line to check if text is empty after reading + logger.warning(f"No text available in file: {file}") + continue # Skip to the next file if no text is available + + if custom_text_split_function is not None: + chunks += custom_text_split_function(text) + else: + chunks += split_text_to_chunks(text, max_tokens, chunk_mode, must_break_at_empty_line) + + return chunks + + +def get_files_from_dir(dir_path: Union[str, List[str]], types: list = TEXT_FORMATS, recursive: bool = True): + """Return a list of all the files in a given directory.""" + if len(types) == 0: + raise ValueError("types cannot be empty.") + types = [t[1:].lower() if t.startswith(".") else t.lower() for t in set(types)] + types += [t.upper() for t in types] + + files = [] + # If the path is a list of files or urls, process and return them + if isinstance(dir_path, list): + for item in dir_path: + if os.path.isfile(item): + files.append(item) + elif is_url(item): + files.append(get_file_from_url(item)) + else: + logger.warning(f"File {item} does not exist. 
Skipping.") + return files + + # If the path is a file, return it + if os.path.isfile(dir_path): + return [dir_path] + + # If the path is a url, download it and return the downloaded file + if is_url(dir_path): + return [get_file_from_url(dir_path)] + + if os.path.exists(dir_path): + for type in types: + if recursive: + files += glob.glob(os.path.join(dir_path, f"**/*.{type}"), recursive=True) + else: + files += glob.glob(os.path.join(dir_path, f"*.{type}"), recursive=False) + else: + logger.error(f"Directory {dir_path} does not exist.") + raise ValueError(f"Directory {dir_path} does not exist.") + return files + + +def get_file_from_url(url: str, save_path: str = None): + """Download a file from a URL.""" + if save_path is None: + os.makedirs("/tmp/chromadb", exist_ok=True) + save_path = os.path.join("/tmp/chromadb", os.path.basename(url)) + else: + os.makedirs(os.path.dirname(save_path), exist_ok=True) + with requests.get(url, stream=True) as r: + r.raise_for_status() + with open(save_path, "wb") as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + return save_path + + +def is_url(string: str): + """Return True if the string is a valid URL.""" + try: + result = urlparse(string) + return all([result.scheme, result.netloc]) + except ValueError: + return False + + +AVILABLE_RETRIEVERS = ["lanchedb", "chromadb"] +DEFAULT_RETRIEVER = "lancedb" + + +def get_retriever(type: Optional[str] = None): + """Return a retriever instance.""" + type = type or DEFAULT_RETRIEVER + if type == "chromadb": + from .chromadb import ChromaDB + + return ChromaDB + elif type == "lancedb": + from .lancedb import LanceDB + + return LanceDB + else: + raise ValueError(f"Unknown retriever type {type}") diff --git a/autogen/retrieve_utils.py b/autogen/retrieve_utils.py index d65fabb440b..6e92a938c27 100644 --- a/autogen/retrieve_utils.py +++ b/autogen/retrieve_utils.py @@ -1,374 +1,4 @@ -from typing import List, Union, Callable -import os -import requests -from urllib.parse import urlparse -import glob -import chromadb +from . import logger +from .agentchat.contrib.retriever.retrieve_utils import * -if chromadb.__version__ < "0.4.15": - from chromadb.api import API -else: - from chromadb.api import ClientAPI as API -from chromadb.api.types import QueryResult -import chromadb.utils.embedding_functions as ef -import logging -import pypdf -from autogen.token_count_utils import count_token - -try: - from unstructured.partition.auto import partition - - HAS_UNSTRUCTURED = True -except ImportError: - HAS_UNSTRUCTURED = False - -logger = logging.getLogger(__name__) -TEXT_FORMATS = [ - "txt", - "json", - "csv", - "tsv", - "md", - "html", - "htm", - "rtf", - "rst", - "jsonl", - "log", - "xml", - "yaml", - "yml", - "pdf", -] -UNSTRUCTURED_FORMATS = [ - "doc", - "docx", - "epub", - "msg", - "odt", - "org", - "pdf", - "ppt", - "pptx", - "rtf", - "rst", - "xlsx", -] # These formats will be parsed by the 'unstructured' library, if installed. 
-if HAS_UNSTRUCTURED: - TEXT_FORMATS += UNSTRUCTURED_FORMATS - TEXT_FORMATS = list(set(TEXT_FORMATS)) -VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"}) - - -def split_text_to_chunks( - text: str, - max_tokens: int = 4000, - chunk_mode: str = "multi_lines", - must_break_at_empty_line: bool = True, - overlap: int = 10, -): - """Split a long text into chunks of max_tokens.""" - if chunk_mode not in VALID_CHUNK_MODES: - raise AssertionError - if chunk_mode == "one_line": - must_break_at_empty_line = False - chunks = [] - lines = text.split("\n") - lines_tokens = [count_token(line) for line in lines] - sum_tokens = sum(lines_tokens) - while sum_tokens > max_tokens: - if chunk_mode == "one_line": - estimated_line_cut = 2 - else: - estimated_line_cut = int(max_tokens / sum_tokens * len(lines)) + 1 - cnt = 0 - prev = "" - for cnt in reversed(range(estimated_line_cut)): - if must_break_at_empty_line and lines[cnt].strip() != "": - continue - if sum(lines_tokens[:cnt]) <= max_tokens: - prev = "\n".join(lines[:cnt]) - break - if cnt == 0: - logger.warning( - f"max_tokens is too small to fit a single line of text. Breaking this line:\n\t{lines[0][:100]} ..." - ) - if not must_break_at_empty_line: - split_len = int(max_tokens / lines_tokens[0] * 0.9 * len(lines[0])) - prev = lines[0][:split_len] - lines[0] = lines[0][split_len:] - lines_tokens[0] = count_token(lines[0]) - else: - logger.warning("Failed to split docs with must_break_at_empty_line being True, set to False.") - must_break_at_empty_line = False - chunks.append(prev) if len(prev) > 10 else None # don't add chunks less than 10 characters - lines = lines[cnt:] - lines_tokens = lines_tokens[cnt:] - sum_tokens = sum(lines_tokens) - text_to_chunk = "\n".join(lines) - chunks.append(text_to_chunk) if len(text_to_chunk) > 10 else None # don't add chunks less than 10 characters - return chunks - - -def extract_text_from_pdf(file: str) -> str: - """Extract text from PDF files""" - text = "" - with open(file, "rb") as f: - reader = pypdf.PdfReader(f) - if reader.is_encrypted: # Check if the PDF is encrypted - try: - reader.decrypt("") - except pypdf.errors.FileNotDecryptedError as e: - logger.warning(f"Could not decrypt PDF {file}, {e}") - return text # Return empty text if PDF could not be decrypted - - for page_num in range(len(reader.pages)): - page = reader.pages[page_num] - text += page.extract_text() - - if not text.strip(): # Debugging line to check if text is empty - logger.warning(f"Could not decrypt PDF {file}") - - return text - - -def split_files_to_chunks( - files: list, - max_tokens: int = 4000, - chunk_mode: str = "multi_lines", - must_break_at_empty_line: bool = True, - custom_text_split_function: Callable = None, -): - """Split a list of files into chunks of max_tokens.""" - - chunks = [] - - for file in files: - _, file_extension = os.path.splitext(file) - file_extension = file_extension.lower() - - if HAS_UNSTRUCTURED and file_extension[1:] in UNSTRUCTURED_FORMATS: - text = partition(file) - text = "\n".join([t.text for t in text]) if len(text) > 0 else "" - elif file_extension == ".pdf": - text = extract_text_from_pdf(file) - else: # For non-PDF text-based files - with open(file, "r", encoding="utf-8", errors="ignore") as f: - text = f.read() - - if not text.strip(): # Debugging line to check if text is empty after reading - logger.warning(f"No text available in file: {file}") - continue # Skip to the next file if no text is available - - if custom_text_split_function is not None: - chunks += 
custom_text_split_function(text) - else: - chunks += split_text_to_chunks(text, max_tokens, chunk_mode, must_break_at_empty_line) - - return chunks - - -def get_files_from_dir(dir_path: Union[str, List[str]], types: list = TEXT_FORMATS, recursive: bool = True): - """Return a list of all the files in a given directory, a url, a file path or a list of them.""" - if len(types) == 0: - raise ValueError("types cannot be empty.") - types = [t[1:].lower() if t.startswith(".") else t.lower() for t in set(types)] - types += [t.upper() for t in types] - - files = [] - # If the path is a list of files or urls, process and return them - if isinstance(dir_path, list): - for item in dir_path: - if os.path.isfile(item): - files.append(item) - elif is_url(item): - files.append(get_file_from_url(item)) - elif os.path.exists(item): - try: - files.extend(get_files_from_dir(item, types, recursive)) - except ValueError: - logger.warning(f"Directory {item} does not exist. Skipping.") - else: - logger.warning(f"File {item} does not exist. Skipping.") - return files - - # If the path is a file, return it - if os.path.isfile(dir_path): - return [dir_path] - - # If the path is a url, download it and return the downloaded file - if is_url(dir_path): - return [get_file_from_url(dir_path)] - - if os.path.exists(dir_path): - for type in types: - if recursive: - files += glob.glob(os.path.join(dir_path, f"**/*.{type}"), recursive=True) - else: - files += glob.glob(os.path.join(dir_path, f"*.{type}"), recursive=False) - else: - logger.error(f"Directory {dir_path} does not exist.") - raise ValueError(f"Directory {dir_path} does not exist.") - return files - - -def get_file_from_url(url: str, save_path: str = None): - """Download a file from a URL.""" - if save_path is None: - os.makedirs("/tmp/chromadb", exist_ok=True) - save_path = os.path.join("/tmp/chromadb", os.path.basename(url)) - else: - os.makedirs(os.path.dirname(save_path), exist_ok=True) - with requests.get(url, stream=True) as r: - r.raise_for_status() - with open(save_path, "wb") as f: - for chunk in r.iter_content(chunk_size=8192): - f.write(chunk) - return save_path - - -def is_url(string: str): - """Return True if the string is a valid URL.""" - try: - result = urlparse(string) - return all([result.scheme, result.netloc]) - except ValueError: - return False - - -def create_vector_db_from_dir( - dir_path: Union[str, List[str]], - max_tokens: int = 4000, - client: API = None, - db_path: str = "/tmp/chromadb.db", - collection_name: str = "all-my-documents", - get_or_create: bool = False, - chunk_mode: str = "multi_lines", - must_break_at_empty_line: bool = True, - embedding_model: str = "all-MiniLM-L6-v2", - embedding_function: Callable = None, - custom_text_split_function: Callable = None, - custom_text_types: List[str] = TEXT_FORMATS, - recursive: bool = True, -) -> API: - """Create a vector db from all the files in a given directory, the directory can also be a single file or a url to - a single file. We support chromadb compatible APIs to create the vector db, this function is not required if - you prepared your own vector db. - - Args: - dir_path (Union[str, List[str]]): the path to the directory, file, url or a list of them. - max_tokens (Optional, int): the maximum number of tokens per chunk. Default is 4000. - client (Optional, API): the chromadb client. Default is None. - db_path (Optional, str): the path to the chromadb. Default is "/tmp/chromadb.db". - collection_name (Optional, str): the name of the collection. Default is "all-my-documents". 
- get_or_create (Optional, bool): Whether to get or create the collection. Default is False. If True, the collection - will be returned if it already exists. Will raise ValueError if the collection already exists and get_or_create is False. - chunk_mode (Optional, str): the chunk mode. Default is "multi_lines". - must_break_at_empty_line (Optional, bool): Whether to break at empty line. Default is True. - embedding_model (Optional, str): the embedding model to use. Default is "all-MiniLM-L6-v2". Will be ignored if - embedding_function is not None. - embedding_function (Optional, Callable): the embedding function to use. Default is None, SentenceTransformer with - the given `embedding_model` will be used. If you want to use OpenAI, Cohere, HuggingFace or other embedding - functions, you can pass it here, follow the examples in `https://docs.trychroma.com/embeddings`. - custom_text_split_function (Optional, Callable): a custom function to split a string into a list of strings. - Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`. - custom_text_types (Optional, List[str]): a list of file types to be processed. Default is TEXT_FORMATS. - recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True. - - Returns: - API: the chromadb client. - """ - if client is None: - client = chromadb.PersistentClient(path=db_path) - try: - embedding_function = ( - ef.SentenceTransformerEmbeddingFunction(embedding_model) - if embedding_function is None - else embedding_function - ) - collection = client.create_collection( - collection_name, - get_or_create=get_or_create, - embedding_function=embedding_function, - # https://github.com/nmslib/hnswlib#supported-distances - # https://github.com/chroma-core/chroma/blob/566bc80f6c8ee29f7d99b6322654f32183c368c4/chromadb/segment/impl/vector/local_hnsw.py#L184 - # https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md - metadata={"hnsw:space": "ip", "hnsw:construction_ef": 30, "hnsw:M": 32}, # ip, l2, cosine - ) - - if custom_text_split_function is not None: - chunks = split_files_to_chunks( - get_files_from_dir(dir_path, custom_text_types, recursive), - custom_text_split_function=custom_text_split_function, - ) - else: - chunks = split_files_to_chunks( - get_files_from_dir(dir_path, custom_text_types, recursive), - max_tokens, - chunk_mode, - must_break_at_empty_line, - ) - logger.info(f"Found {len(chunks)} chunks.") - # Upsert in batch of 40000 or less if the total number of chunks is less than 40000 - for i in range(0, len(chunks), min(40000, len(chunks))): - end_idx = i + min(40000, len(chunks) - i) - collection.upsert( - documents=chunks[i:end_idx], - ids=[f"doc_{j}" for j in range(i, end_idx)], # unique for each doc - ) - except ValueError as e: - logger.warning(f"{e}") - return client - - -def query_vector_db( - query_texts: List[str], - n_results: int = 10, - client: API = None, - db_path: str = "/tmp/chromadb.db", - collection_name: str = "all-my-documents", - search_string: str = "", - embedding_model: str = "all-MiniLM-L6-v2", - embedding_function: Callable = None, -) -> QueryResult: - """Query a vector db. We support chromadb compatible APIs, it's not required if you prepared your own vector db - and query function. - - Args: - query_texts (List[str]): the list of strings which will be used to query the vector db. - n_results (Optional, int): the number of results to return. Default is 10. - client (Optional, API): the chromadb compatible client. 
Default is None, a chromadb client will be used. - db_path (Optional, str): the path to the vector db. Default is "/tmp/chromadb.db". - collection_name (Optional, str): the name of the collection. Default is "all-my-documents". - search_string (Optional, str): the search string. Only docs that contain an exact match of this string will be retrieved. Default is "". - embedding_model (Optional, str): the embedding model to use. Default is "all-MiniLM-L6-v2". Will be ignored if - embedding_function is not None. - embedding_function (Optional, Callable): the embedding function to use. Default is None, SentenceTransformer with - the given `embedding_model` will be used. If you want to use OpenAI, Cohere, HuggingFace or other embedding - functions, you can pass it here, follow the examples in `https://docs.trychroma.com/embeddings`. - - Returns: - QueryResult: the query result. The format is: - class QueryResult(TypedDict): - ids: List[IDs] - embeddings: Optional[List[List[Embedding]]] - documents: Optional[List[List[Document]]] - metadatas: Optional[List[List[Metadata]]] - distances: Optional[List[List[float]]] - """ - if client is None: - client = chromadb.PersistentClient(path=db_path) - # the collection's embedding function is always the default one, but we want to use the one we used to create the - # collection. So we compute the embeddings ourselves and pass it to the query function. - collection = client.get_collection(collection_name) - embedding_function = ( - ef.SentenceTransformerEmbeddingFunction(embedding_model) if embedding_function is None else embedding_function - ) - query_embeddings = embedding_function(query_texts) - # Query/search n most similar results. You can also .get by id - results = collection.query( - query_embeddings=query_embeddings, - n_results=n_results, - where_document={"$contains": search_string} if search_string else None, # optional filter - ) - return results +logger.warning("This module is deprecated. Please use autogen.agentchat.contrib.retriever.retrieve_utils instead.") diff --git a/notebook/agentchat_RetrieveChat.ipynb b/notebook/agentchat_RetrieveChat.ipynb index 79db55191ff..d4b743e413f 100644 --- a/notebook/agentchat_RetrieveChat.ipynb +++ b/notebook/agentchat_RetrieveChat.ipynb @@ -67,14 +67,14 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "models to use: ['gpt-35-turbo']\n" + "models to use: ['gpt-4']\n" ] } ], @@ -82,7 +82,7 @@ "import autogen\n", "\n", "config_list = autogen.config_list_from_json(\n", - " env_or_file=\"OAI_CONFIG_LIST\",\n", + " env_or_file=\"../OAI_CONFIG_LIST\",\n", " file_location=\".\",\n", " filter_dict={\n", " \"model\": {\n", @@ -148,15 +148,22 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 2, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "This module is deprecated. 
Please use autogen.agentchat.contrib.retriever.retrieve_utils instead.\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "Accepted file formats for `docs_path`:\n", - "['xml', 'htm', 'msg', 'docx', 'org', 'pptx', 'jsonl', 'txt', 'tsv', 'yml', 'json', 'md', 'pdf', 'xlsx', 'csv', 'html', 'log', 'yaml', 'doc', 'odt', 'rtf', 'ppt', 'epub', 'rst']\n" + "['txt', 'json', 'csv', 'tsv', 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml', 'pdf']\n" ] } ], @@ -171,13 +178,12 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent\n", "from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n", - "import chromadb\n", "import os\n", "\n", "# 1. create an RetrieveAssistantAgent instance named \"assistant\"\n", @@ -212,12 +218,12 @@ " \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md\",\n", " os.path.join(os.path.abspath(''), \"..\", \"website\", \"docs\"),\n", " ],\n", + " \"retriever_path\": \"~/test\",\n", " \"custom_text_types\": [\"mdx\"],\n", " \"chunk_token_size\": 2000,\n", " \"model\": config_list[0][\"model\"],\n", - " \"client\": chromadb.PersistentClient(path=\"/tmp/chromadb\"),\n", " \"embedding_model\": \"all-mpnet-base-v2\",\n", - " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection, but you'll need to remove the collection manually\n", + " \"db_mode\": \"recreate\", # \"get\", \"create\", \"recreate\".\n", " },\n", " code_execution_config=False, # set to False if you don't want to execute the code\n", ")" @@ -240,36 +246,31 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:autogen.retrieve_utils:Found 2 chunks.\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "Trying to create collection.\n" + "Trying to create index. If the index already exists, it will be recreated.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n" + "File /Users/ayushchaurasia/Documents/autogen/autogen/notebook/../website/docs does not exist. Skipping.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "doc_ids: [['doc_0']]\n", - "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", + "Found 2 chunks.\n", + "query: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\n", + "doc_ids: [['0']]\n", + "\u001b[32mAdding doc_id 0 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", @@ -409,42 +410,51 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "You can use FLAML's `lgbm_spark` estimator for classification tasks and activate Spark as the parallel backend during training by setting `use_spark` to `True`. Here is an example code snippet:\n", + "You can use the provided FLAML API along with Spark for distributed training and parallel jobs. 
FLAML integrates Spark ML estimators for AutoML and offers utilities to prepare your data in the required format. This includes the `to_pandas_on_spark` function for converting your data into a pandas-on-spark dataframe, and the `VectorAssembler` for merging all feature columns into a single vector column.\n", + "\n", + "Here's an example of how you can perform a classification task with FLAML and Spark and force cancel jobs if the time limit is reached:\n", "\n", "```python\n", - "import flaml\n", + "import pandas as pd\n", "from flaml.automl.spark.utils import to_pandas_on_spark\n", "from pyspark.ml.feature import VectorAssembler\n", + "import flaml\n", + "\n", + "# Creating a dictionary\n", + "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n", + "\n", + "# Creating a pandas DataFrame\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"Price\"\n", "\n", - "# Assuming you have a Spark DataFrame named 'df' that contains your data\n", - "dataframe = df.toPandas()\n", - "label = \"target\"\n", + "# Convert to pandas-on-spark dataframe\n", "psdf = to_pandas_on_spark(dataframe)\n", "\n", + "# Prepare features using VectorAssembler\n", "columns = psdf.columns\n", "feature_cols = [col for col in columns if col != label]\n", "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", - "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\", label]\n", "\n", - "# configure and run AutoML\n", - "automl = flaml.AutoML()\n", - "settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"accuracy\",\n", - " \"estimator_list\": [\"lgbm_spark\"],\n", - " \"task\": \"classification\",\n", - " \"n_jobs\": -1, # Use all available CPUs\n", - " \"use_spark\": True, # Use Spark as the parallel backend\n", - " \"force_cancel\": True # Halt Spark jobs that run for longer than the time budget\n", + "# Define FLAML settings\n", + "automl_settings = {\n", + " \"time_budget\": 30, # Train for 30 seconds\n", + " \"metric\": \"accuracy\", # Evaluation metric\n", + " \"task\": \"classification\", # Type of task\n", + " \"n_concurrent_trials\": 2, # Number of concurrent trials\n", + " \"use_spark\": True, # Use spark for parallel training\n", + " \"force_cancel\": True, # Force cancel jobs if time limit is reached\n", "}\n", - "automl.fit(\n", - " dataframe=psdf,\n", - " label=label,\n", - " **settings,\n", - ")\n", + "\n", + "automl = flaml.AutoML()\n", + "\n", + "# Train with FLAML and Spark with a classification task\n", + "automl.fit(dataframe=psdf, label=label, **automl_settings)\n", "```\n", "\n", - "Note that you should not use `use_spark` if you are working with Spark data, because SparkML models already run in parallel.\n", + "Please note that this is a basic example. FLAML has many more options available for tuning the models such as the estimator_list option for specifying desired models to try. 
Also note that the Spark environment needs to be properly set up for running this code.\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", @@ -457,61 +467,80 @@ "UPDATE CONTEXT\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n" + "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", + "Trying to create index. If the index already exists, it will be recreated.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 60 is greater than number of elements in index 2, updating n_results = 2\n", - "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 100 is greater than number of elements in index 2, updating n_results = 2\n" + "File /Users/ayushchaurasia/Documents/autogen/autogen/notebook/../website/docs does not exist. Skipping.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "doc_ids: [['doc_0']]\n", - "doc_ids: [['doc_0']]\n" + "Found 2 chunks.\n", + "query: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\n", + "doc_ids: [['0']]\n", + "Trying to create index. If the index already exists, it will be recreated.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 140 is greater than number of elements in index 2, updating n_results = 2\n" + "File /Users/ayushchaurasia/Documents/autogen/autogen/notebook/../website/docs does not exist. Skipping.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "doc_ids: [['doc_0']]\n" + "Found 2 chunks.\n", + "query: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\n", + "doc_ids: [['0']]\n", + "Trying to create index. If the index already exists, it will be recreated.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 180 is greater than number of elements in index 2, updating n_results = 2\n" + "File /Users/ayushchaurasia/Documents/autogen/autogen/notebook/../website/docs does not exist. Skipping.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "doc_ids: [['doc_0']]\n", + "Found 2 chunks.\n", + "query: How can I use FLAML to perform a classification task and use spark to do parallel training. Train 30 seconds and force cancel jobs if time limit is reached.\n", + "doc_ids: [['0']]\n", + "Trying to create index. If the index already exists, it will be recreated.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "File /Users/ayushchaurasia/Documents/autogen/autogen/notebook/../website/docs does not exist. Skipping.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 2 chunks.\n", + "query: How can I use FLAML to perform a classification task and use spark to do parallel training. 
Train 30 seconds and force cancel jobs if time limit is reached.\n", + "doc_ids: [['0']]\n", "\u001b[32mNo more context, will terminate.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "TERMINATE\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "TERMINATE\n", - "\n", "--------------------------------------------------------------------------------\n" ] } @@ -545,23 +574,32 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trying to create index. If the index already exists, it will be recreated.\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n" + "File /Users/ayushchaurasia/Documents/autogen/autogen/notebook/../website/docs does not exist. Skipping.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "doc_ids: [['doc_0', 'doc_1']]\n", - "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", + "Found 2 chunks.\n", + "query: Who is the author of FLAML?\n", + "doc_ids: [['0', '1']]\n", + "\u001b[32mAdding doc_id 0 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", @@ -814,7 +852,65 @@ "\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", + "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", + "\n", + "The primary authors of FLAML, or Fast Lightweight AutoML, are Chi Wang and Qingyun Wu. They developed this library at Microsoft Research. Other contributors include Markus Weimer and Erkang Zhu. They have published several research papers on various aspects of FLAML, which further discuss the technical details and innovative techniques used in this AutoML library.\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "# reset the assistant. Always reset the assistant before starting a new conversation.\n", + "assistant.reset()\n", + "\n", + "qa_problem = \"Who is the author of FLAML?\"\n", + "ragproxyagent.initiate_chat(assistant, problem=qa_problem)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Example 3\n", + "\n", + "[back to top](#toc)\n", + "\n", + "Use RetrieveChat to help generate sample code and ask for human-in-loop feedbacks.\n", + "\n", + "Problem: how to build a time series forecasting model for stock price using FLAML?" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trying to create index. If the index already exists, it will be recreated.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "File /Users/ayushchaurasia/Documents/autogen/autogen/notebook/../website/docs does not exist. 
Skipping.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 2 chunks.\n", + "query: how to build a time series forecasting model for stock price using FLAML?\n", + "doc_ids: [['0', '1']]\n", + "\u001b[32mAdding doc_id 0 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", @@ -827,7 +923,7 @@ "# your code\n", "```\n", "\n", - "User's question is: Who is the author of FLAML?\n", + "User's question is: how to build a time series forecasting model for stock price using FLAML?\n", "\n", "Context is: # Integrate - Spark\n", "\n", @@ -1069,18 +1165,71 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "The authors of FLAML are Chi Wang, Qingyun Wu, Markus Weimer, and Erkang Zhu.\n", + "To build a forecasting model for stock price using FLAML, we first need to ensure we have time-series data related to stock price in a pandas data frame format. Then, we convert it to pandas-on-spark data frame using the function `to_pandas_on_spark` provided by FLAML. And then, we use `VectorAssembler` to merge all feature columns into one vector column.\n", + "\n", + "Following the data preprocessing, we integrate Spark ML with the FLAML AutoML model. The provided FLAML AutoML model using Spark is `lgbm_spark`. Set up the settings such as time budget, metric, and task among the others, and then call the `fit()` method of FLAML AutoML model.\n", + "\n", + "Below is the example in Python:\n", + "\n", + "```python\n", + "import pandas as pd\n", + "from flaml.automl import AutoML\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "from pyspark.ml.feature import VectorAssembler\n", + "\n", + "# Assuming data is a dataframe containing your time series data\n", + "\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(data)\n", + "\n", + "# Assume label is the name of the column in data you want to predict\n", + "label = \"Price\"\n", + "\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "\n", + "# Transform data\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\", label]\n", + "\n", + "# Initialize and setup automl object\n", + "automl = AutoML()\n", + "settings = {\n", + " \"time_budget\": 100, # in seconds\n", + " \"metric\": \"mae\",\n", + " \"task\": \"forecast\",\n", + " \"estimator_list\": [\"lgbm_spark\"],\n", + "}\n", + "\n", + "# Fit automl model\n", + "automl.fit(dataframe=psdf, label=label, **settings)\n", + "```\n", + "\n", + "Replace `data` with the pandas dataframe containing time series information related to the stock price. \"Price\" should be replaced by the column you want to predict in the data frame.\n", + "\n", + "Please note, the SparkML LightGBM model `lgbm_spark` is more suitable for regression tasks. If your stock price dataset is a time series classification task, you might want to choose a different model that supports classification task.\n", + "\n", + "Additionally, FLAML AutoML settings are flexible so you can adjust the settings as needed. 
For example, you can increase the time budget if you want FLAML to try more configurations, or change the metric to another error metric like \"mse\".\n", "\n", "--------------------------------------------------------------------------------\n" ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: exit\n" + ] } ], "source": [ "# reset the assistant. Always reset the assistant before starting a new conversation.\n", "assistant.reset()\n", "\n", - "qa_problem = \"Who is the author of FLAML?\"\n", - "ragproxyagent.initiate_chat(assistant, problem=qa_problem)" + "# set `human_input_mode` to be `ALWAYS`, so the agent will ask for human input at every step.\n", + "ragproxyagent.human_input_mode = \"ALWAYS\"\n", + "code_problem = \"how to build a time series forecasting model for stock price using FLAML?\"\n", + "ragproxyagent.initiate_chat(assistant, problem=code_problem)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "### Example 3\n", + "\n", + "### Example 4\n", "\n", "[back to top](#toc)\n", "\n", - "Use RetrieveChat to help generate sample code and ask for human-in-loop feedbacks.\n", + "Use RetrieveChat to answer a question and ask for human-in-the-loop feedback.\n", "\n", - "Problem: how to build a time series forecasting model for stock price using FLAML?" + "Problem: Is there a function named `tune_automl` in FLAML?" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 7, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "doc_ids: [['doc_0', 'doc_1']]\n", - "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", + "Trying to use existing collection.\n", + "query: Is there a function named `tune_automl` in FLAML?\n", + "doc_ids: [['0', '1']]\n", + "\u001b[32mAdding doc_id 0 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", @@ -1129,7 +1273,7 @@ "# your code\n", "```\n", "\n", - "User's question is: how to build a time series forecasting model for stock price using FLAML?\n", + "User's question is: Is there a function named `tune_automl` in FLAML?\n", "\n", "Context is: # Integrate - Spark\n", "\n", @@ -1371,34 +1515,17 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "To build a time series forecasting model for stock price using FLAML, you can use the `lgbm_spark` estimator and organize your data in the required format. First, use `to_pandas_on_spark` function to convert your data into a pandas-on-spark dataframe/series, which Spark estimators require. Next, you should use `VectorAssembler` to merge all feature columns into a single vector column. Finally, use `flaml.AutoML` to try different configurations for the `lgbm_spark` model. 
Here is an example code snippet: \n", + "From the provided context, it does not appear that there is a function named `tune_automl` in FLAML. Instead, the `AutoML` class is instantiated and its `fit` method is used to conduct the automated machine learning process. This process includes hyperparameter tuning, but it is not conducted with a standalone `tune_automl` function. This is true for both the general use case of FLAML, as well as the specific case of using FLAML with Spark, as outlined in the provided context.\n", + "\n", + "If more context is given or if you are referring to a different version or extension of FLAML, the answer might be different.\n", "\n", "```python\n", "import flaml\n", - "import pandas as pd\n", - "from flaml.automl.spark.utils import to_pandas_on_spark\n", - "from pyspark.ml.feature import VectorAssembler\n", - "\n", - "# load your stock price data into a pandas dataframe\n", - "data = pd.read_csv('stock_price.csv')\n", - "\n", - "# specify label column name\n", - "label = 'price'\n", - "\n", - "# convert pandas dataframe to pandas-on-spark dataframe\n", - "psdf = to_pandas_on_spark(data)\n", - "\n", - "# merge feature columns as a single vector column\n", - "feature_cols = [col for col in psdf.columns if col != label]\n", - "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", - "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", - "\n", - "# start an AutoML experiment with lgbm_spark estimator\n", "automl = flaml.AutoML()\n", "settings = {\n", " \"time_budget\": 30,\n", " \"metric\": \"r2\",\n", - " \"estimator_list\": [\"lgbm_spark\"],\n", + " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", " \"task\": \"regression\",\n", "}\n", "\n", @@ -1413,77 +1540,10 @@ ] }, { - "name": "stdout", + "name": "stdin", "output_type": "stream", "text": [ - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "I want the time_budget to be 10 mins\n", - "\n", - "--------------------------------------------------------------------------------\n", - "I want the time_budget to be 10 mins\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", - "\n", - "You can change the `time_budget` parameter in the `settings` dictionary to 10 minutes (600 seconds) like this:\n", - "\n", - "```python\n", - "import flaml\n", - "import pandas as pd\n", - "from flaml.automl.spark.utils import to_pandas_on_spark\n", - "from pyspark.ml.feature import VectorAssembler\n", - "\n", - "# load your stock price data into a pandas dataframe\n", - "data = pd.read_csv('stock_price.csv')\n", - "\n", - "# specify label column name\n", - "label = 'price'\n", - "\n", - "# convert pandas dataframe to pandas-on-spark dataframe\n", - "psdf = to_pandas_on_spark(data)\n", - "\n", - "# merge feature columns as a single vector column\n", - "feature_cols = [col for col in psdf.columns if col != label]\n", - "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", - "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", - "\n", - "# start an AutoML experiment with lgbm_spark estimator and time_budget of 10 mins\n", - "automl = flaml.AutoML()\n", - "settings = {\n", - " \"time_budget\": 600, # time_budget in seconds\n", - " \"metric\": \"r2\",\n", - " \"estimator_list\": [\"lgbm_spark\"],\n", - " \"task\": \"regression\",\n", - "}\n", - "\n", - 
"automl.fit(\n", - " dataframe=psdf,\n", - " label=label,\n", - " **settings,\n", - ")\n", - "```\n", - "\n", - "\n", - "In this example, the `time_budget` parameter is set to 600, which represents the number of seconds the FLAML AutoML experiment will run. You can adjust this value to control the total time spent on the experiment.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> NO HUMAN INPUT RECEIVED.\u001b[0m\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", - "\n", - "Is there anything else I can help you with?\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> NO HUMAN INPUT RECEIVED.\u001b[0m\n" + "Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: exit\n" ] } ], @@ -1493,8 +1553,8 @@ "\n", "# set `human_input_mode` to be `ALWAYS`, so the agent will ask for human input at every step.\n", "ragproxyagent.human_input_mode = \"ALWAYS\"\n", - "code_problem = \"how to build a time series forecasting model for stock price using FLAML?\"\n", - "ragproxyagent.initiate_chat(assistant, problem=code_problem)" + "qa_problem = \"Is there a function named `tune_automl` in FLAML?\"\n", + "ragproxyagent.initiate_chat(assistant, problem=qa_problem) # type \"exit\" to exit the conversation" ] }, { @@ -1502,578 +1562,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "### Example 4\n", + "\n", + "### Example 5\n", "\n", "[back to top](#toc)\n", "\n", - "Use RetrieveChat to answer a question and ask for human-in-loop feedbacks.\n", + "Use RetrieveChat to answer questions for [NaturalQuestion](https://ai.google.com/research/NaturalQuestions) dataset.\n", "\n", - "Problem: Is there a function named `tune_automl` in FLAML?" + "First, we will create a new document collection which includes all the contextual corpus. Then, we will choose some questions and utilize RetrieveChat to answer them. For this particular example, we will be using the `gpt-3.5-turbo` model, and we will demonstrate RetrieveChat's feature of automatically updating context in case the documents retrieved do not contain sufficient information." ] }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "doc_ids: [['doc_0', 'doc_1']]\n", - "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", - "context provided by the user.\n", - "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", - "For code generation, you must obey the following rules:\n", - "Rule 1. 
You MUST NOT install any packages because all the packages needed are already installed.\n", - "Rule 2. You must follow the formats below to write your code:\n", - "```language\n", - "# your code\n", - "```\n", - "\n", - "User's question is: Is there a function named `tune_automl` in FLAML?\n", - "\n", - "Context is: # Integrate - Spark\n", - "\n", - "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", - "- Use Spark ML estimators for AutoML.\n", - "- Use Spark to run training in parallel spark jobs.\n", - "\n", - "## Spark ML Estimators\n", - "\n", - "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", - "\n", - "### Data\n", - "\n", - "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", - "\n", - "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", - "\n", - "This function also accepts optional arguments `index_col` and `default_index_type`.\n", - "- `index_col` is the column name to use as the index, default is None.\n", - "- `default_index_type` is the default index type, default is \"distributed-sequence\". More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", - "\n", - "Here is an example code snippet for Spark Data:\n", - "\n", - "```python\n", - "import pandas as pd\n", - "from flaml.automl.spark.utils import to_pandas_on_spark\n", - "# Creating a dictionary\n", - "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", - " \"Age_Years\": [20, 15, 10, 7, 25],\n", - " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n", - "\n", - "# Creating a pandas DataFrame\n", - "dataframe = pd.DataFrame(data)\n", - "label = \"Price\"\n", - "\n", - "# Convert to pandas-on-spark dataframe\n", - "psdf = to_pandas_on_spark(dataframe)\n", - "```\n", - "\n", - "To use Spark ML models you need to format your data appropriately. 
Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", - "\n", - "Here is an example of how to use it:\n", - "```python\n", - "from pyspark.ml.feature import VectorAssembler\n", - "columns = psdf.columns\n", - "feature_cols = [col for col in columns if col != label]\n", - "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", - "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", - "```\n", - "\n", - "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", - "\n", - "### Estimators\n", - "#### Model List\n", - "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", - "\n", - "#### Usage\n", - "First, prepare your data in the required format as described in the previous section.\n", - "\n", - "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", - "\n", - "Here is an example code snippet using SparkML models in AutoML:\n", - "\n", - "```python\n", - "import flaml\n", - "# prepare your data in pandas-on-spark format as we previously mentioned\n", - "\n", - "automl = flaml.AutoML()\n", - "settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", - " \"task\": \"regression\",\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=psdf,\n", - " label=label,\n", - " **settings,\n", - ")\n", - "```\n", - "\n", - "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", - "\n", - "## Parallel Spark Jobs\n", - "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", - "\n", - "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", - "\n", - "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n", - "\n", - "\n", - "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. 
However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n", - "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performes parallel tuning.\n", - "- `force_cancel`: boolean, default=False | Whether to forcely cancel Spark jobs if the search time exceeded the time budget. Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", - "\n", - "An example code snippet for using parallel Spark jobs:\n", - "```python\n", - "import flaml\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"task\": \"regression\",\n", - " \"n_concurrent_trials\": 2,\n", - " \"use_spark\": True,\n", - " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=dataframe,\n", - " label=label,\n", - " **automl_settings,\n", - ")\n", - "```\n", - "\n", - "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", - "\n", - "# Research\n", - "\n", - "For technical details, please check our research publications.\n", - "\n", - "* [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2021flaml,\n", - " title={FLAML: A Fast and Lightweight AutoML Library},\n", - " author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n", - " year={2021},\n", - " booktitle={MLSys},\n", - "}\n", - "```\n", - "\n", - "* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2021cfo,\n", - " title={Frugal Optimization for Cost-related Hyperparameters},\n", - " author={Qingyun Wu and Chi Wang and Silu Huang},\n", - " year={2021},\n", - " booktitle={AAAI},\n", - "}\n", - "```\n", - "\n", - "* [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2021blendsearch,\n", - " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", - " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", - " year={2021},\n", - " booktitle={ICLR},\n", - "}\n", - "```\n", - "\n", - "* [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. 
ACL 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{liuwang2021hpolm,\n", - " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", - " author={Susan Xueqing Liu and Chi Wang},\n", - " year={2021},\n", - " booktitle={ACL},\n", - "}\n", - "```\n", - "\n", - "* [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. ICML 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2021chacha,\n", - " title={ChaCha for Online AutoML},\n", - " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", - " year={2021},\n", - " booktitle={ICML},\n", - "}\n", - "```\n", - "\n", - "* [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2111.06495 (2021).\n", - "\n", - "```bibtex\n", - "@inproceedings{wuwang2021fairautoml,\n", - " title={Fair AutoML},\n", - " author={Qingyun Wu and Chi Wang},\n", - " year={2021},\n", - " booktitle={ArXiv preprint arXiv:2111.06495},\n", - "}\n", - "```\n", - "\n", - "* [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022).\n", - "\n", - "```bibtex\n", - "@inproceedings{kayaliwang2022default,\n", - " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", - " author={Moe Kayali and Chi Wang},\n", - " year={2022},\n", - " booktitle={ArXiv preprint arXiv:2202.09927},\n", - "}\n", - "```\n", - "\n", - "* [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. ICLR 2023 (notable-top-5%).\n", - "\n", - "```bibtex\n", - "@inproceedings{zhang2023targeted,\n", - " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", - " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", - " booktitle={International Conference on Learning Representations},\n", - " year={2023},\n", - " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", - "}\n", - "```\n", - "\n", - "* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. ArXiv preprint arXiv:2303.04673 (2023).\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2023EcoOptiGen,\n", - " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", - " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n", - " year={2023},\n", - " booktitle={ArXiv preprint arXiv:2303.04673},\n", - "}\n", - "```\n", - "\n", - "* [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. 
ArXiv preprint arXiv:2306.01337 (2023).\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2023empirical,\n", - " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", - " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", - " year={2023},\n", - " booktitle={ArXiv preprint arXiv:2306.01337},\n", - "}\n", - "```\n", - "\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", - "context provided by the user.\n", - "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", - "For code generation, you must obey the following rules:\n", - "Rule 1. You MUST NOT install any packages because all the packages needed are already installed.\n", - "Rule 2. You must follow the formats below to write your code:\n", - "```language\n", - "# your code\n", - "```\n", - "\n", - "User's question is: Is there a function named `tune_automl` in FLAML?\n", - "\n", - "Context is: # Integrate - Spark\n", - "\n", - "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", - "- Use Spark ML estimators for AutoML.\n", - "- Use Spark to run training in parallel spark jobs.\n", - "\n", - "## Spark ML Estimators\n", - "\n", - "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", - "\n", - "### Data\n", - "\n", - "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", - "\n", - "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", - "\n", - "This function also accepts optional arguments `index_col` and `default_index_type`.\n", - "- `index_col` is the column name to use as the index, default is None.\n", - "- `default_index_type` is the default index type, default is \"distributed-sequence\". 
More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", - "\n", - "Here is an example code snippet for Spark Data:\n", - "\n", - "```python\n", - "import pandas as pd\n", - "from flaml.automl.spark.utils import to_pandas_on_spark\n", - "# Creating a dictionary\n", - "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", - " \"Age_Years\": [20, 15, 10, 7, 25],\n", - " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n", - "\n", - "# Creating a pandas DataFrame\n", - "dataframe = pd.DataFrame(data)\n", - "label = \"Price\"\n", - "\n", - "# Convert to pandas-on-spark dataframe\n", - "psdf = to_pandas_on_spark(dataframe)\n", - "```\n", - "\n", - "To use Spark ML models you need to format your data appropriately. Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", - "\n", - "Here is an example of how to use it:\n", - "```python\n", - "from pyspark.ml.feature import VectorAssembler\n", - "columns = psdf.columns\n", - "feature_cols = [col for col in columns if col != label]\n", - "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", - "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", - "```\n", - "\n", - "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", - "\n", - "### Estimators\n", - "#### Model List\n", - "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", - "\n", - "#### Usage\n", - "First, prepare your data in the required format as described in the previous section.\n", - "\n", - "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", - "\n", - "Here is an example code snippet using SparkML models in AutoML:\n", - "\n", - "```python\n", - "import flaml\n", - "# prepare your data in pandas-on-spark format as we previously mentioned\n", - "\n", - "automl = flaml.AutoML()\n", - "settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", - " \"task\": \"regression\",\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=psdf,\n", - " label=label,\n", - " **settings,\n", - ")\n", - "```\n", - "\n", - "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", - "\n", - "## Parallel Spark Jobs\n", - "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. 
FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", - "\n", - "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", - "\n", - "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n", - "\n", - "\n", - "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n", - "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performes parallel tuning.\n", - "- `force_cancel`: boolean, default=False | Whether to forcely cancel Spark jobs if the search time exceeded the time budget. Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", - "\n", - "An example code snippet for using parallel Spark jobs:\n", - "```python\n", - "import flaml\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"task\": \"regression\",\n", - " \"n_concurrent_trials\": 2,\n", - " \"use_spark\": True,\n", - " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=dataframe,\n", - " label=label,\n", - " **automl_settings,\n", - ")\n", - "```\n", - "\n", - "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", - "\n", - "# Research\n", - "\n", - "For technical details, please check our research publications.\n", - "\n", - "* [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2021flaml,\n", - " title={FLAML: A Fast and Lightweight AutoML Library},\n", - " author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n", - " year={2021},\n", - " booktitle={MLSys},\n", - "}\n", - "```\n", - "\n", - "* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. 
AAAI 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2021cfo,\n", - " title={Frugal Optimization for Cost-related Hyperparameters},\n", - " author={Qingyun Wu and Chi Wang and Silu Huang},\n", - " year={2021},\n", - " booktitle={AAAI},\n", - "}\n", - "```\n", - "\n", - "* [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2021blendsearch,\n", - " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", - " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", - " year={2021},\n", - " booktitle={ICLR},\n", - "}\n", - "```\n", - "\n", - "* [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{liuwang2021hpolm,\n", - " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", - " author={Susan Xueqing Liu and Chi Wang},\n", - " year={2021},\n", - " booktitle={ACL},\n", - "}\n", - "```\n", - "\n", - "* [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. ICML 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2021chacha,\n", - " title={ChaCha for Online AutoML},\n", - " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", - " year={2021},\n", - " booktitle={ICML},\n", - "}\n", - "```\n", - "\n", - "* [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2111.06495 (2021).\n", - "\n", - "```bibtex\n", - "@inproceedings{wuwang2021fairautoml,\n", - " title={Fair AutoML},\n", - " author={Qingyun Wu and Chi Wang},\n", - " year={2021},\n", - " booktitle={ArXiv preprint arXiv:2111.06495},\n", - "}\n", - "```\n", - "\n", - "* [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022).\n", - "\n", - "```bibtex\n", - "@inproceedings{kayaliwang2022default,\n", - " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", - " author={Moe Kayali and Chi Wang},\n", - " year={2022},\n", - " booktitle={ArXiv preprint arXiv:2202.09927},\n", - "}\n", - "```\n", - "\n", - "* [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. ICLR 2023 (notable-top-5%).\n", - "\n", - "```bibtex\n", - "@inproceedings{zhang2023targeted,\n", - " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", - " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", - " booktitle={International Conference on Learning Representations},\n", - " year={2023},\n", - " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", - "}\n", - "```\n", - "\n", - "* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. 
ArXiv preprint arXiv:2303.04673 (2023).\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2023EcoOptiGen,\n", - " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", - " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n", - " year={2023},\n", - " booktitle={ArXiv preprint arXiv:2303.04673},\n", - "}\n", - "```\n", - "\n", - "* [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2023empirical,\n", - " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", - " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", - " year={2023},\n", - " booktitle={ArXiv preprint arXiv:2306.01337},\n", - "}\n", - "```\n", - "\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", - "\n", - "There is no function named `tune_automl` in FLAML. However, FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark: \n", - "- Use Spark ML Estimators for AutoML.\n", - "- Use Spark to run training in parallel Spark jobs.\n", - "\n", - "--------------------------------------------------------------------------------\n" - ] - } - ], - "source": [ - "# reset the assistant. Always reset the assistant before starting a new conversation.\n", - "assistant.reset()\n", - "\n", - "# set `human_input_mode` to be `ALWAYS`, so the agent will ask for human input at every step.\n", - "ragproxyagent.human_input_mode = \"ALWAYS\"\n", - "qa_problem = \"Is there a function named `tune_automl` in FLAML?\"\n", - "ragproxyagent.initiate_chat(assistant, problem=qa_problem) # type \"exit\" to exit the conversation" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### Example 5\n", - "\n", - "[back to top](#toc)\n", - "\n", - "Use RetrieveChat to answer questions for [NaturalQuestion](https://ai.google.com/research/NaturalQuestions) dataset.\n", - "\n", - "First, we will create a new document collection which includes all the contextual corpus. Then, we will choose some questions and utilize RetrieveChat to answer them. For this particular example, we will be using the `gpt-3.5-turbo` model, and we will demonstrate RetrieveChat's feature of automatically updating context in case the documents retrieved do not contain sufficient information." 
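Note on the updated configuration: the cells below no longer pass a `chromadb` client; the retriever creates or reuses the index itself (hence the "Trying to create index" / "Trying to use existing collection" messages in the outputs). As a minimal sketch, assuming `config_list_from_json` and a local corpus path as stand-ins for values defined in earlier cells of the notebook (both placeholders, plus the `task` and `human_input_mode` choices, are illustrative and not part of this diff), the retriever-agnostic setup for this example looks roughly like:

```python
import autogen
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

# Illustrative stand-ins for values defined in earlier cells of the notebook.
config_list = autogen.config_list_from_json(env_or_file="OAI_CONFIG_LIST")
corpus_file = "/tmp/nq_corpus.txt"  # hypothetical path to the NaturalQuestions corpus file

# No vectordb client is constructed here: the retriever opens and manages its own index.
ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",  # illustrative; Examples 3 and 4 above set this to "ALWAYS"
    retrieve_config={
        "task": "qa",
        "docs_path": corpus_file,
        "chunk_token_size": 2000,
        "model": config_list[0]["model"],
        "collection_name": "natural-questions",
        "chunk_mode": "one_line",  # one chunk per corpus line
        "embedding_model": "all-MiniLM-L6-v2",
        "get_or_create": True,  # reuse the collection on re-runs instead of raising
    },
)
```

With `chunk_mode` set to `"one_line"`, corpus lines longer than the token limit appear to be what triggered the repeated "max_tokens is too small" warnings visible in the removed output below.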
- ] - }, - { - "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -2082,7 +1583,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -2099,17 +1600,17 @@ " \"docs_path\": corpus_file,\n", " \"chunk_token_size\": 2000,\n", " \"model\": config_list[0][\"model\"],\n", - " \"client\": chromadb.PersistentClient(path=\"/tmp/chromadb\"),\n", " \"collection_name\": \"natural-questions\",\n", " \"chunk_mode\": \"one_line\",\n", " \"embedding_model\": \"all-MiniLM-L6-v2\",\n", + " \"get_or_create\": True\n", " },\n", ")" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -2140,7 +1641,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -2152,130 +1653,28 @@ ">>>>>>>>>>>> Below are outputs of Case 1 <<<<<<<<<<<<\n", "\n", "\n", - "Trying to create collection.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "max_tokens is too small to fit a single line of text. Breaking this line:\n", - "\t
[... ~40 further truncated table-row fragments elided: the same "max_tokens is too small to fit a single line of text. Breaking this line:" warning repeats for each over-long corpus line (film statistics, country and city rankings, sports records, blood-type distributions, airport and population tables, and similar flattened HTML tables) ...]
    ...\n", - "max_tokens is too small to fit a single line of text. Breaking this line:\n", - "\t ...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "doc_ids: [['doc_0', 'doc_3334', 'doc_720', 'doc_2732', 'doc_2510', 'doc_5084', 'doc_5068', 'doc_3727', 'doc_1938', 'doc_4689', 'doc_5249', 'doc_1751', 'doc_480', 'doc_3989', 'doc_2115', 'doc_1233', 'doc_2264', 'doc_633', 'doc_2376', 'doc_2293', 'doc_5274', 'doc_5213', 'doc_3991', 'doc_2880', 'doc_2737', 'doc_1257', 'doc_1748', 'doc_2038', 'doc_4073', 'doc_2876']]\n", - "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3334 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_720 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2732 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2510 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_5084 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_5068 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3727 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1938 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4689 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_5249 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1751 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_480 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3989 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3334 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_720 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2732 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2510 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_5084 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_5068 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3727 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1938 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4689 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_5249 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1751 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_480 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3989 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2115 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1233 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2264 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_633 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2376 to context.\u001b[0m\n", + "Trying to use existing collection.\n", + "query: what is non controlling interest on balance sheet\n", + "doc_ids: [['0', '3334', '720', '2732', '2510', '5084', '5068', '3727', '1938', '4689', '5249', '1751', '480', '3989', '2115', '1233', '2264', '633', '2376', '2293', '5274', '4842', '5213', '3991', '2880', '2737', '1257', '1748', '2038', '4073']]\n", + "\u001b[32mAdding doc_id 0 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3334 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 720 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2732 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2510 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 5084 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 5068 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3727 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1938 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 4689 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 5249 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1751 to context.\u001b[0m\n", + "\u001b[32mAdding 
doc_id 480 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3989 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2115 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1233 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2264 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 633 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2376 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the\n", @@ -2310,7 +1709,7 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "Non controlling interest on balance sheet refers to the portion of a subsidiary corporation's stock that is not owned by the parent corporation. It represents ownership of less than 50% of the outstanding shares. It is shown as a separate line item in the equity section of the balance sheet.\n", + "Non-controlling interest, also known as minority interest, on a balance sheet is the portion of a subsidiary corporation's stock not owned by the parent corporation.\n", "\n", "--------------------------------------------------------------------------------\n", "\n", @@ -2318,32 +1717,37 @@ ">>>>>>>>>>>> Below are outputs of Case 2 <<<<<<<<<<<<\n", "\n", "\n", - "doc_ids: [['doc_1', 'doc_1097', 'doc_4221', 'doc_4972', 'doc_1352', 'doc_96', 'doc_988', 'doc_2370', 'doc_2414', 'doc_5038', 'doc_302', 'doc_1608', 'doc_980', 'doc_2112', 'doc_562', 'doc_4204', 'doc_3298', 'doc_2995', 'doc_3978', 'doc_1258', 'doc_2971', 'doc_2171', 'doc_1065', 'doc_17', 'doc_2683', 'doc_87', 'doc_1767', 'doc_158', 'doc_482', 'doc_3850']]\n", - "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1097 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4221 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4972 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1352 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_96 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_988 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2370 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2414 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_5038 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_302 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1608 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_980 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2112 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_562 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4204 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3298 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2995 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3978 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1258 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2971 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2171 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1065 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_17 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2683 to context.\u001b[0m\n", + "Trying to use existing collection.\n", + "query: how many episodes are in chicago fire season 4\n", + "doc_ids: [['1', '1097', '4221', '4972', '1352', '4974', '96', '4301', '988', '2370', '2414', '5038', '302', '1608', '980', '2112', '1699', '562', '4204', '3298', '2995', '3978', '1258', '2971', 
'2171', '1065', '17', '2683', '87', '1767']]\n", + "\u001b[32mAdding doc_id 1 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1097 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 4221 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 4972 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1352 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 4974 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 96 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 4301 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 988 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2370 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2414 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 5038 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 302 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1608 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 980 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2112 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1699 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 562 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 4204 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3298 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2995 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3978 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1258 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2971 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2171 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1065 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 17 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2683 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the\n", @@ -2358,7 +1762,9 @@ "

    The fourth season began airing on October 10 , 2017 , on The CW .

    \n", "

    The fifth season of Chicago P.D. , an American police drama television series with executive producer Dick Wolf , and producers Derek Haas , Michael Brandt , and Rick Eid , premiered on September 27 , 2017 . This season featured its 100th episode .

    \n", "

    This was the city of Chicago 's first professional sports championship since the Chicago Fire won MLS Cup ' 98 ( which came four months after the Chicago Bulls ' sixth NBA championship that year ) . The next major Chicago sports championship came in 2010 , when the NHL 's Chicago Blackhawks ended a 49 - year Stanley Cup title drought . With the Chicago Bears ' win in Super Bowl XX and the Chicago Cubs ' own World Series championship in 2016 , all Chicago sports teams have won at least one major championship since 1985 . Meanwhile , the Astros themselves made it back to the World Series in 2017 , but this time as an AL team , where they defeated the Los Angeles Dodgers in seven games , resulting in Houston 's first professional sports championship since the 2006 -- 07 Houston Dynamo won their back - to - back MLS Championships .

    \n", + "
    No . Athlete Nation Sport Years
    Chicago P.D. ( season 5 )
    Chicago P.D. Season 5 poster
    Country of origin United States
    No. of episodes 20
    Release
    Original network NBC
    Original release September 27 , 2017 ( 2017 - 09 - 27 ) -- present
    Season chronology
    ← Previous Season 4
    List of Chicago P.D. episodes
    \n", "

    The season was ordered in May 2017 , and production began the following month . Ben McKenzie stars as Gordon , alongside Donal Logue , David Mazouz , Morena Baccarin , Sean Pertwee , Robin Lord Taylor , Erin Richards , Camren Bicondova , Cory Michael Smith , Jessica Lucas , Chris Chalk , Drew Powell , Crystal Reed and Alexander Siddig . The fourth season premiered on September 21 , 2017 , on Fox , while the second half premiered on March 1 , 2018 .

    \n", + "

    The Eagle Creek Fire was a destructive wildfire in the Columbia River Gorge in the U.S. states of Oregon and Washington . The fire was started on September 2 , 2017 , reportedly caused by teenagers igniting fireworks during a burn ban . In mid-September , highway closures and local evacuations were gradually being lifted . As of September 28 , 2017 , the fire had consumed 48,831 acres ( 19,761 ha ) and was 46 % contained . In late October , fire growth was slowed by rain . On November 30 , 2017 , the fire was declared fully contained but not yet completely out .

    \n", "

    As of May 24 , 2017 , 58 episodes of The 100 have aired , concluding the fourth season . In March 2017 , The CW renewed the series for a fifth season , set to premiere on April 24 , 2018 .

    \n", "

    The fifth book , River of Fire , is scheduled to be released on April 10 , 2018 .

    \n", "

    On September 10 , 2013 , AMC officially cancelled the series after 38 episodes and three seasons . However , on November 15 , 2013 , Netflix ordered a fourth and final season of six episodes , that was released on Netflix on August 1 , 2014 .

    \n", @@ -2367,6 +1773,7 @@ "

    The first season consisted of eight one - hour - long episodes which were released worldwide on Netflix on July 15 , 2016 , in Ultra HD 4K . The second season , consisting of nine episodes , was released on October 27 , 2017 in HDR . A teaser for the second season , which also announced the release date , aired during Super Bowl LI .

    \n", "

    `` Two Days Before the Day After Tomorrow '' is the eighth episode in the ninth season of the American animated television series South Park . The 133rd overall episode overall , it originally aired on Comedy Central in the United States on October 19 , 2005 . In the episode , Stan and Cartman accidentally destroy a dam , causing the town of Beaverton to be destroyed .

    \n", "

    The fourth season consists of a double order of twenty episodes , split into two parts of ten episodes ; the second half premiered on November 30 , 2016 . The season follows the battles between Ragnar and Rollo in Francia , Bjorn 's raid into the Mediterranean , and the Viking invasion of England . It concluded in its entirety on February 1 , 2017 .

    \n", + "
    • Elizabeth Banks as Gail Abernathy - McKadden - Feinberger , an a cappella commentator making an insulting documentary about The Bellas
    • John Michael Higgins as John Smith , an a cappella commentator making an insulting documentary about The Bellas
    • John Lithgow as Fergus Hobart , Fat Amy 's estranged criminal father
    • Matt Lanter as Chicago Walp , a U.S. soldier guiding the Bellas during the tour , and Chloe 's love interest .
    • Guy Burnet as Theo , DJ Khaled 's music producer , who takes a liking to Beca
    • DJ Khaled as himself
    • Troy Ian Hall as Zeke , a U.S. soldier , partners with Chicago
    • Michael Rose as Aubrey 's father
    • Jessica Chaffin as Evan
    • Moises Arias as Pimp - Lo
    • Ruby Rose , Andy Allo , Venzella Joy Williams , and Hannah Fairlight as Calamity , Serenity , Charity , and Veracity , respectively , members of the band Evermoist
    • Whiskey Shivers as Saddle Up , a country - bluegrass - based band competing against the Bellas
    • Trinidad James and D.J. Looney as Young Sparrow and DJ Dragon Nutz , respectively
    \n", "

    This is an episode list for Sabrina the Teenage Witch , an American sitcom that debuted on ABC in 1996 . From Season 5 , the program was aired on The WB . The series ran for seven seasons totaling 163 episodes . It originally premiered on September 27 , 1996 on ABC and ended on April 24 , 2003 on The WB .

    \n", "

    Hart of Dixie was renewed by The CW for 10 episode season on May 8 , 2014 . The show 's fourth and final season premiered on November 15 , 2014 . The series was later cancelled on May 7 , 2015 .

    \n", "

    The Burning Maze is the third book in the series . It is scheduled to be released on May 1 , 2018 .

    \n", @@ -2384,7 +1791,7 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "There are 23 episodes in Chicago Fire season 4.\n", + "The fourth season of Chicago Fire contained 23 episodes.\n", "\n", "--------------------------------------------------------------------------------\n", "\n", @@ -2392,28 +1799,30 @@ ">>>>>>>>>>>> Below are outputs of Case 3 <<<<<<<<<<<<\n", "\n", "\n", - "doc_ids: [['doc_47', 'doc_45', 'doc_2570', 'doc_2851', 'doc_4033', 'doc_5320', 'doc_3849', 'doc_4172', 'doc_3202', 'doc_2282', 'doc_1896', 'doc_949', 'doc_103', 'doc_1552', 'doc_2791', 'doc_392', 'doc_1175', 'doc_5315', 'doc_832', 'doc_3185', 'doc_2532', 'doc_3409', 'doc_824', 'doc_4075', 'doc_1201', 'doc_4116', 'doc_1448', 'doc_2545', 'doc_2251', 'doc_2485']]\n", - "\u001b[32mAdding doc_id doc_47 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_45 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2570 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2851 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4033 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_5320 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3849 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4172 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3202 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2282 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1896 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_949 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_103 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1552 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2791 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_392 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1175 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_5315 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_832 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3185 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2532 to context.\u001b[0m\n", + "Trying to use existing collection.\n", + "query: what are bulls used for on a farm\n", + "doc_ids: [['47', '45', '2570', '2851', '4033', '5320', '3849', '4172', '3202', '2282', '1896', '949', '103', '1552', '2791', '392', '1175', '5315', '832', '3185', '2532', '3409', '824', '2814', '4075', '2815', '1201', '4116', '1448', '5293']]\n", + "\u001b[32mAdding doc_id 47 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 45 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2570 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2851 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 4033 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 5320 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3849 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 4172 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3202 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2282 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1896 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 949 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 103 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1552 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2791 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 392 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1175 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 5315 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 832 to context.\u001b[0m\n", + 
"\u001b[32mAdding doc_id 3185 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2532 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the\n", @@ -2450,7 +1859,7 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "Bulls are used for breeding and often kept for their semen to sell for AI purposes. Some male cattle are also kept as work oxen for haulage. The vast majority, however, are slaughtered for meat before the age of three years.\n", + "Bulls on a farm are used for breeding purposes, with their semen often used for artificial insemination (AI) to maintain and improve the quality of the herd. Some bulls are kept specifically for semen collection, which can be profitable. Additionally, majority of male cattle, including bulls, are slaughtered for meat before the age of three years, providing leaner muscle compared to castrated males or females.\n", "\n", "--------------------------------------------------------------------------------\n", "\n", @@ -2458,16 +1867,18 @@ ">>>>>>>>>>>> Below are outputs of Case 4 <<<<<<<<<<<<\n", "\n", "\n", - "doc_ids: [['doc_3031', 'doc_819', 'doc_4521', 'doc_3980', 'doc_3423', 'doc_5275', 'doc_745', 'doc_753', 'doc_3562', 'doc_4139', 'doc_3678', 'doc_4931', 'doc_2347', 'doc_1115', 'doc_2806', 'doc_5204', 'doc_2707', 'doc_3653', 'doc_1122', 'doc_2398', 'doc_309', 'doc_3891', 'doc_2087', 'doc_330', 'doc_4844', 'doc_2155', 'doc_2674', 'doc_5357', 'doc_1581', 'doc_9']]\n", - "\u001b[32mAdding doc_id doc_3031 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_819 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4521 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3980 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3423 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_5275 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_745 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_753 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3562 to context.\u001b[0m\n", + "Trying to use existing collection.\n", + "query: has been honoured with the wisden leading cricketer in the world award for 2016\n", + "doc_ids: [['3031', '819', '4521', '3980', '3423', '5275', '745', '753', '3562', '4139', '3678', '4931', '2347', '1115', '2806', '5204', '2707', '3653', '1122', '2398', '309', '3891', '2087', '330', '4844', '2155', '2987', '2674', '5357', '1581']]\n", + "\u001b[32mAdding doc_id 3031 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 819 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 4521 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3980 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3423 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 5275 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 745 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 753 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3562 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented chatbot. 
You answer user's questions based on your own knowledge and the\n", @@ -2492,19 +1903,19 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "I'm sorry, I couldn't find any information about who has been honoured with the Wisden Leading Cricketer in the World award for 2016. UPDATE CONTEXT.\n", + "UPDATE CONTEXT\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4139 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3678 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4931 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2347 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1115 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2806 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_5204 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2707 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3653 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 4139 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3678 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 4931 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2347 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1115 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2806 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 5204 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2707 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3653 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the\n", @@ -2529,17 +1940,17 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "UPDATE CONTEXT. The current context does not provide information related to the question.\n", + "UPDATE CONTEXT\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1122 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2398 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_309 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3891 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2087 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_330 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4844 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1122 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2398 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 309 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3891 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2087 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 330 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 4844 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented chatbot. 
You answer user's questions based on your own knowledge and the\n", @@ -2562,7 +1973,7 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "I'm sorry, the provided context doesn't contain information about any cricketer being honored with the Wisden Leading Cricketer in the World award for 2016. UPDATE CONTEXT if you have any other query.\n", + "Virat Kohli\n", "\n", "--------------------------------------------------------------------------------\n", "\n", @@ -2570,20 +1981,23 @@ ">>>>>>>>>>>> Below are outputs of Case 5 <<<<<<<<<<<<\n", "\n", "\n", - "doc_ids: [['doc_20', 'doc_2943', 'doc_2059', 'doc_3293', 'doc_4056', 'doc_1914', 'doc_2749', 'doc_1796', 'doc_3468', 'doc_1793', 'doc_876', 'doc_2577', 'doc_27', 'doc_366', 'doc_321', 'doc_3103', 'doc_715', 'doc_3534', 'doc_142', 'doc_5337', 'doc_2426', 'doc_5346', 'doc_3021', 'doc_1596', 'doc_316', 'doc_1103', 'doc_1602', 'doc_1677', 'doc_1670', 'doc_2853']]\n", - "\u001b[32mAdding doc_id doc_20 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2943 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2059 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3293 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_4056 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1914 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2749 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1796 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_3468 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1793 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_876 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_2577 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_27 to context.\u001b[0m\n", + "Trying to use existing collection.\n", + "query: who carried the usa flag in opening ceremony\n", + "doc_ids: [['20', '2943', '2059', '3293', '4056', '1914', '2749', '1796', '3468', '1793', '876', '2577', '27', '2780', '366', '2574', '321', '3103', '715', '3534', '142', '5337', '2426', '5346', '3021', '1596', '316', '2343', '1103', '1602']]\n", + "\u001b[32mAdding doc_id 20 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2943 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2059 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3293 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 4056 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1914 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2749 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1796 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 3468 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 1793 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 876 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2577 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 27 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2780 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the\n", @@ -2606,17 +2020,19 @@ "

    The United States Oath of Allegiance , officially referred to as the `` Oath of Allegiance , '' 8 C.F.R. Part 337 ( 2008 ) , is an allegiance oath that must be taken by all immigrants who wish to become United States citizens .

    \n", "

    During the first half of the 19th century , seven stars were added to the flag to represent the seven signatories to the Venezuelan declaration of independence , being the provinces of Caracas , Cumaná , Barcelona , Barinas , Margarita , Mérida , and Trujillo .

    \n", "

    With the annexation of Hawaii in 1898 and the seizure of Guam and the Philippines during the Spanish -- American War that same year , the United States began to consider unclaimed and uninhabited Wake Island , located approximately halfway between Honolulu and Manila , as a good location for a telegraph cable station and coaling station for refueling warships of the rapidly expanding United States Navy and passing merchant and passenger steamships . On July 4 , 1898 , United States Army Brigadier General Francis V. Greene of the 2nd Brigade , Philippine Expeditionary Force , of the Eighth Army Corps , stopped at Wake Island and raised the American flag while en route to the Philippines on the steamship liner SS China .

    \n", + "

    On Opening Day , April 9 , 1965 , a sold - out crowd of 47,879 watched an exhibition game between the Houston Astros and the New York Yankees . President Lyndon B. Johnson and his wife Lady Bird were in attendance , as well as Texas Governor John Connally and Houston Mayor Louie Welch . Governor Connally tossed out the first ball for the first game ever played indoors . Dick `` Turk '' Farrell of the Astros threw the first pitch . Mickey Mantle had both the first hit ( a single ) and the first home run in the Astrodome . The Astros beat the Yankees that night , 2 - 1 .

    \n", "\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "I don't have the answer with the provided context. UPDATE CONTEXT.\n", + "UPDATE CONTEXT\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_366 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 366 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 2574 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the\n", @@ -2627,13 +2043,14 @@ "User's question is: who carried the usa flag in opening ceremony\n", "\n", "Context is: \n", + "

    The opening ceremony of the 2018 Winter Olympics was held at the Pyeongchang Olympic Stadium in Pyeongchang , South Korea on 9 February 2018 . It began at 20 : 00 KST and finished at approximately 22 : 20 KST . The Games were officially opened by President of the Republic of Korea Moon Jae - in .

    \n", "\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "Erin Hamlin carried the USA flag in the opening ceremony.\n", + "Erin Hamlin\n", "\n", "--------------------------------------------------------------------------------\n" ] @@ -2675,7 +2092,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -2720,7 +2137,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -2737,19 +2154,19 @@ " \"docs_path\": corpus_file,\n", " \"chunk_token_size\": 2000,\n", " \"model\": config_list[0][\"model\"],\n", - " \"client\": chromadb.PersistentClient(path=\"/tmp/chromadb\"),\n", " \"collection_name\": \"2wikimultihopqa\",\n", " \"chunk_mode\": \"one_line\",\n", " \"embedding_model\": \"all-MiniLM-L6-v2\",\n", " \"customized_prompt\": PROMPT_MULTIHOP,\n", " \"customized_answer_prefix\": \"the answer is\",\n", + " \"get_or_create\": True\n", " },\n", ")" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -2777,7 +2194,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -2789,13 +2206,14 @@ ">>>>>>>>>>>> Below are outputs of Case 1 <<<<<<<<<<<<\n", "\n", "\n", - "Trying to create collection.\n" + "Trying to create index.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ + "[2023-11-26T19:32:19Z WARN lance::dataset] No existing dataset at /Users/ayushchaurasia/autogen/2wikimultihopqa.lance, it will be created\n", "max_tokens is too small to fit a single line of text. Breaking this line:\n", "\tClyde Thompson: Clyde Thompson( 1910 – July 1, 1979) was an American prisoner turned chaplain. He is ...\n", "max_tokens is too small to fit a single line of text. Breaking this line:\n", @@ -2806,77 +2224,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "doc_ids: [['doc_12', 'doc_11', 'doc_16', 'doc_19', 'doc_13116', 'doc_14', 'doc_13', 'doc_18', 'doc_977', 'doc_10']]\n", - "\u001b[32mAdding doc_id doc_12 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_11 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_16 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_19 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_13116 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_14 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_13 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_18 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_977 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_10 to context.\u001b[0m\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the context provided by the user. You must think step-by-step.\n", - "First, please learn the following examples of context and question pairs and their corresponding answers.\n", - "\n", - "Context:\n", - "Kurram Garhi: Kurram Garhi is a small village located near the city of Bannu, which is the part of Khyber Pakhtunkhwa province of Pakistan. 
Its population is approximately 35000.\n", - "Trojkrsti: Trojkrsti is a village in Municipality of Prilep, Republic of Macedonia.\n", - "Q: Are both Kurram Garhi and Trojkrsti located in the same country?\n", - "A: Kurram Garhi is located in the country of Pakistan. Trojkrsti is located in the country of Republic of Macedonia. Thus, they are not in the same country. So the answer is: no.\n", - "\n", - "\n", - "Context:\n", - "Early Side of Later: Early Side of Later is the third studio album by English singer- songwriter Matt Goss. It was released on 21 June 2004 by Concept Music and reached No. 78 on the UK Albums Chart.\n", - "What's Inside: What's Inside is the fourteenth studio album by British singer- songwriter Joan Armatrading.\n", - "Q: Which album was released earlier, What'S Inside or Cassandra'S Dream (Album)?\n", - "A: What's Inside was released in the year 1995. Cassandra's Dream (album) was released in the year 2008. Thus, of the two, the album to release earlier is What's Inside. So the answer is: What's Inside.\n", - "\n", - "\n", - "Context:\n", - "Maria Alexandrovna (Marie of Hesse): Maria Alexandrovna , born Princess Marie of Hesse and by Rhine (8 August 1824 – 3 June 1880) was Empress of Russia as the first wife of Emperor Alexander II.\n", - "Grand Duke Alexei Alexandrovich of Russia: Grand Duke Alexei Alexandrovich of Russia,(Russian: Алексей Александрович; 14 January 1850 (2 January O.S.) in St. Petersburg – 14 November 1908 in Paris) was the fifth child and the fourth son of Alexander II of Russia and his first wife Maria Alexandrovna (Marie of Hesse).\n", - "Q: What is the cause of death of Grand Duke Alexei Alexandrovich Of Russia's mother?\n", - "A: The mother of Grand Duke Alexei Alexandrovich of Russia is Maria Alexandrovna. Maria Alexandrovna died from tuberculosis. So the answer is: tuberculosis.\n", - "\n", - "\n", - "Context:\n", - "Laughter in Hell: Laughter in Hell is a 1933 American Pre-Code drama film directed by Edward L. Cahn and starring Pat O'Brien. The film's title was typical of the sensationalistic titles of many Pre-Code films.\n", - "Edward L. Cahn: Edward L. Cahn (February 12, 1899 – August 25, 1963) was an American film director.\n", - "Q: When did the director of film Laughter In Hell die?\n", - "A: The film Laughter In Hell was directed by Edward L. Cahn. Edward L. Cahn died on August 25, 1963. So the answer is: August 25, 1963.\n", - "\n", - "Second, please complete the answer by thinking step-by-step.\n", - "\n", - "Context:\n", - "The Mask of Fu Manchu: The Mask of Fu Manchu is a 1932 pre-Code adventure film directed by Charles Brabin. It was written by Irene Kuhn, Edgar Allan Woolf and John Willard based on the 1932 novel of the same name by Sax Rohmer. Starring Boris Karloff as Fu Manchu, and featuring Myrna Loy as his depraved daughter, the movie revolves around Fu Manchu's quest for the golden sword and mask of Genghis Khan. Lewis Stone plays his nemesis. Dr. Petrie is absent from this film.\n", - "The Mysterious Dr. Fu Manchu: The Mysterious Dr. Fu Manchu is a 1929 American pre-Code drama film directed by Rowland V. Lee and starring Warner Oland as Dr. Fu Manchu. It was the first Fu Manchu film of the talkie era. Since this was during the transition period to sound, a silent version was also released in the United States.\n", - "The Face of Fu Manchu: The Face of Fu Manchu is a 1965 thriller film directed by Don Sharp and based on the characters created by Sax Rohmer. 
It stars Christopher Lee as the eponymous villain, a Chinese criminal mastermind, and Nigel Green as his pursuing rival Nayland Smith, a Scotland Yard detective. The film was a British- West German co-production, and was the first in a five- part series starring Lee and produced by Harry Alan Towers for Constantin Film, the second of which was\" The Brides of Fu Manchu\" released the next year, with the final entry being\" The Castle of Fu Manchu\" in 1969. It was shot in Technicolor and Techniscope, on- location in County Dublin, Ireland.\n", - "The Return of Dr. Fu Manchu: The Return of Dr. Fu Manchu is a 1930 American pre-Code film directed by Rowland V. Lee. It is the second of three films starring Warner Oland as the fiendish Fu Manchu, who returns from apparent death in the previous film,\" The Mysterious Dr. Fu Manchu\"( 1929), to seek revenge on those he holds responsible for the death of his wife and child.\n", - "The Vengeance of Fu Manchu: The Vengeance of Fu Manchu is a 1967 British film directed by Jeremy Summers and starring Christopher Lee, Horst Frank, Douglas Wilmer and Tsai Chin. It was the third British/ West German Constantin Film co-production of the Dr. Fu Manchu series and the first to be filmed in Hong Kong. It was generally released in the U.K. through Warner- Pathé( as a support feature to the Lindsay Shonteff film\" The Million Eyes of Sumuru\") on 3 December 1967.\n", - "The Brides of Fu Manchu: The Brides of Fu Manchu is a 1966 British/ West German Constantin Film co-production adventure crime film based on the fictional Chinese villain Dr. Fu Manchu, created by Sax Rohmer. It was the second film in a series, and was preceded by\" The Face of Fu ManchuThe Vengeance of Fu Manchu\" followed in 1967,\" The Blood of Fu Manchu\" in 1968, and\" The Castle of Fu Manchu\" in 1969. It was produced by Harry Alan Towers for Hallam Productions. Like the first film, it was directed by Don Sharp, and starred Christopher Lee as Fu Manchu. Nigel Green was replaced by Douglas Wilmer as Scotland Yard detective Nayland Smith. The action takes place mainly in London, where much of the location filming took place.\n", - "The Castle of Fu Manchu: The Castle of Fu Manchu( also known as The Torture Chamber of Dr. Fu Manchu and also known by its German title Die Folterkammer des Dr. Fu Man Chu) is a 1969 film and the fifth and final Dr. Fu Manchu film with Christopher Lee portraying the title character.\n", - "The Blood of Fu Manchu: The Blood of Fu Manchu, also known as Fu Manchu and the Kiss of Death, Kiss of Death, Kiss and Kill( U.S. title) and Against All Odds( original U.S. video title), is a 1968 British adventure crime film directed by Jesús Franco, based on the fictional Asian villain Dr. Fu Manchu created by Sax Rohmer. It was the fourth film in a series, and was preceded by\" The Vengeance of Fu Manchu The Castle of Fu Manchu\" followed in 1969. It was produced by Harry Alan Towers for Udastex Films. It starred Christopher Lee as Dr. Fu Manchu, Richard Greene as Scotland Yard detective Nayland Smith, and Howard Marion- Crawford as Dr. Petrie. The movie was filmed in Spain and Brazil. Shirley Eaton appears in a scene that she claimed she was never paid for; apparently, the director Jesús Franco had inserted some stock footage of her from one of her films(\" The Girl from Rio\"( 1968)) into the film without telling her. 
She only found out years later that she had been in a Fu Manchu film.\n", - "Don Sharp: Donald Herman Sharp( 19 April 192114 December 2011) was an Australian- born British film director. His best known films were made for Hammer in the 1960s, and included\" The Kiss of the Vampire\"( 1962) and\" Rasputin, the Mad Monk\"( 1966). In 1965 he directed\" The Face of Fu Manchu\", based on the character created by Sax Rohmer, and starring Christopher Lee. Sharp also directed the sequel\" The Brides of Fu Manchu\"( 1966). In the 1980s he was also responsible for several hugely popular miniseries adapted from the novels of Barbara Taylor Bradford.\n", - "Blind Shaft: Blind Shaft is a 2003 film about a pair of brutal con artists operating in the illegal coal mines of present- day northern China. The film was written and directed by Li Yang( 李杨), and is based on Chinese writer Liu Qingbang's short novel\" Shen MuSacred Wood\").\n", - "\n", - "Q: Which film came out first, Blind Shaft or The Mask Of Fu Manchu?\n", - "A:\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[32mAdding doc_id doc_11 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_16 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_19 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_13116 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_14 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_13 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_18 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_977 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_10 to context.\u001b[0m\n", + "Found 57090 chunks.\n", + "query: Which film came out first, Blind Shaft or The Mask Of Fu Manchu?\n", + "doc_ids: [['12', '11', '16', '19', '13116', '14', '13', '18', '977', '10']]\n", + "\u001b[32mAdding doc_id 12 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 11 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 16 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 19 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 13116 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 14 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 13 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 18 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 977 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 10 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the context provided by the user. You must think step-by-step.\n", @@ -2930,7 +2290,7 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "Blind Shaft is a 2003 film while The Mask of Fu Manchu is a 1932 pre-Code adventure film. Thus, The Mask of Fu Manchu came out earlier than Blind Shaft. So the answer is: The Mask of Fu Manchu.\n", + "The film The Mask of Fu Manchu was released in the year 1932. Blind Shaft was released in the year 2003. Thus, of the two, the film to release earlier is The Mask of Fu Manchu. 
So the answer is: The Mask of Fu Manchu.\n", "\n", "--------------------------------------------------------------------------------\n", "\n", @@ -2938,134 +2298,24 @@ ">>>>>>>>>>>> Below are outputs of Case 2 <<<<<<<<<<<<\n", "\n", "\n", - "doc_ids: [['doc_74', 'doc_76', 'doc_68', 'doc_42890', 'doc_75', 'doc_19596', 'doc_45135', 'doc_995', 'doc_7274', 'doc_23187']]\n", - "\u001b[32mAdding doc_id doc_74 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_76 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_68 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_42890 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_75 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_19596 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_45135 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_995 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_7274 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_23187 to context.\u001b[0m\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the context provided by the user. You must think step-by-step.\n", - "First, please learn the following examples of context and question pairs and their corresponding answers.\n", - "\n", - "Context:\n", - "Kurram Garhi: Kurram Garhi is a small village located near the city of Bannu, which is the part of Khyber Pakhtunkhwa province of Pakistan. Its population is approximately 35000.\n", - "Trojkrsti: Trojkrsti is a village in Municipality of Prilep, Republic of Macedonia.\n", - "Q: Are both Kurram Garhi and Trojkrsti located in the same country?\n", - "A: Kurram Garhi is located in the country of Pakistan. Trojkrsti is located in the country of Republic of Macedonia. Thus, they are not in the same country. So the answer is: no.\n", - "\n", - "\n", - "Context:\n", - "Early Side of Later: Early Side of Later is the third studio album by English singer- songwriter Matt Goss. It was released on 21 June 2004 by Concept Music and reached No. 78 on the UK Albums Chart.\n", - "What's Inside: What's Inside is the fourteenth studio album by British singer- songwriter Joan Armatrading.\n", - "Q: Which album was released earlier, What'S Inside or Cassandra'S Dream (Album)?\n", - "A: What's Inside was released in the year 1995. Cassandra's Dream (album) was released in the year 2008. Thus, of the two, the album to release earlier is What's Inside. So the answer is: What's Inside.\n", - "\n", - "\n", - "Context:\n", - "Maria Alexandrovna (Marie of Hesse): Maria Alexandrovna , born Princess Marie of Hesse and by Rhine (8 August 1824 – 3 June 1880) was Empress of Russia as the first wife of Emperor Alexander II.\n", - "Grand Duke Alexei Alexandrovich of Russia: Grand Duke Alexei Alexandrovich of Russia,(Russian: Алексей Александрович; 14 January 1850 (2 January O.S.) in St. Petersburg – 14 November 1908 in Paris) was the fifth child and the fourth son of Alexander II of Russia and his first wife Maria Alexandrovna (Marie of Hesse).\n", - "Q: What is the cause of death of Grand Duke Alexei Alexandrovich Of Russia's mother?\n", - "A: The mother of Grand Duke Alexei Alexandrovich of Russia is Maria Alexandrovna. Maria Alexandrovna died from tuberculosis. So the answer is: tuberculosis.\n", - "\n", - "\n", - "Context:\n", - "Laughter in Hell: Laughter in Hell is a 1933 American Pre-Code drama film directed by Edward L. Cahn and starring Pat O'Brien. 
The film's title was typical of the sensationalistic titles of many Pre-Code films.\n", - "Edward L. Cahn: Edward L. Cahn (February 12, 1899 – August 25, 1963) was an American film director.\n", - "Q: When did the director of film Laughter In Hell die?\n", - "A: The film Laughter In Hell was directed by Edward L. Cahn. Edward L. Cahn died on August 25, 1963. So the answer is: August 25, 1963.\n", - "\n", - "Second, please complete the answer by thinking step-by-step.\n", - "\n", - "Context:\n", - "Seoul High School: Seoul High School( Hangul: 서울고등학교) is a public high school located in the heart of Seoul, South Korea.\n", - "North Marion High School (Oregon): North Marion High School is a public high school in Aurora, Oregon, United States. The school is part of the North Marion School District with all four schools being located on the same campus. The school draws students from the cities of Aurora, Hubbard, and Donald as well as the communities of Broadacres and Butteville.\n", - "Marion High School (Kansas): Marion High School is a public high school in Marion, Kansas, USA. It is one of three schools operated by Marion USD 408, and is the sole high school in the district.\n", - "Northwest High School: Northwest High School or North West High School may refer to:\n", - "Marion High School (Indiana): Marion High School is a high school in Marion, Indiana with more than 1,000 students.\n", - "Macon County High School: Macon County High School is located in Montezuma, Georgia, United States, which is a part of Macon County. Enrollment as of the 2017- 2018 school year is 491.\n", - "Canyon High School (Ogden, Utah): Canyon High School was a high school in Ogden, Utah.\n", - "Northside High School: Northside High School or North Side High School or Northside Christian School or similar can refer to:\n", - "Springs Boys' High School: Springs Boys' High School is a high school in Springs, Gauteng, South Africa.\n", - "International School of Koje: International School of Koje( ISK) is a privately funded international school located in Geoje, South Korea.\n", - "\n", - "Q: Are North Marion High School (Oregon) and Seoul High School both located in the same country?\n", - "A:\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", - "\n", - "No, North Marion High School (Oregon) is located in the United States, specifically in the state of Oregon, while Seoul High School is located in South Korea. So they are not in the same country.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n", - "doc_ids: [['doc_76', 'doc_68', 'doc_74', 'doc_75', 'doc_19596', 'doc_42890', 'doc_24819', 'doc_69', 'doc_995', 'doc_7274']]\n", - "\u001b[32mAdding doc_id doc_24819 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_69 to context.\u001b[0m\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the context provided by the user. You must think step-by-step.\n", - "First, please learn the following examples of context and question pairs and their corresponding answers.\n", - "\n", - "Context:\n", - "Kurram Garhi: Kurram Garhi is a small village located near the city of Bannu, which is the part of Khyber Pakhtunkhwa province of Pakistan. 
Its population is approximately 35000.\n", - "Trojkrsti: Trojkrsti is a village in Municipality of Prilep, Republic of Macedonia.\n", - "Q: Are both Kurram Garhi and Trojkrsti located in the same country?\n", - "A: Kurram Garhi is located in the country of Pakistan. Trojkrsti is located in the country of Republic of Macedonia. Thus, they are not in the same country. So the answer is: no.\n", - "\n", - "\n", - "Context:\n", - "Early Side of Later: Early Side of Later is the third studio album by English singer- songwriter Matt Goss. It was released on 21 June 2004 by Concept Music and reached No. 78 on the UK Albums Chart.\n", - "What's Inside: What's Inside is the fourteenth studio album by British singer- songwriter Joan Armatrading.\n", - "Q: Which album was released earlier, What'S Inside or Cassandra'S Dream (Album)?\n", - "A: What's Inside was released in the year 1995. Cassandra's Dream (album) was released in the year 2008. Thus, of the two, the album to release earlier is What's Inside. So the answer is: What's Inside.\n", - "\n", - "\n", - "Context:\n", - "Maria Alexandrovna (Marie of Hesse): Maria Alexandrovna , born Princess Marie of Hesse and by Rhine (8 August 1824 – 3 June 1880) was Empress of Russia as the first wife of Emperor Alexander II.\n", - "Grand Duke Alexei Alexandrovich of Russia: Grand Duke Alexei Alexandrovich of Russia,(Russian: Алексей Александрович; 14 January 1850 (2 January O.S.) in St. Petersburg – 14 November 1908 in Paris) was the fifth child and the fourth son of Alexander II of Russia and his first wife Maria Alexandrovna (Marie of Hesse).\n", - "Q: What is the cause of death of Grand Duke Alexei Alexandrovich Of Russia's mother?\n", - "A: The mother of Grand Duke Alexei Alexandrovich of Russia is Maria Alexandrovna. Maria Alexandrovna died from tuberculosis. So the answer is: tuberculosis.\n", - "\n", - "\n", - "Context:\n", - "Laughter in Hell: Laughter in Hell is a 1933 American Pre-Code drama film directed by Edward L. Cahn and starring Pat O'Brien. The film's title was typical of the sensationalistic titles of many Pre-Code films.\n", - "Edward L. Cahn: Edward L. Cahn (February 12, 1899 – August 25, 1963) was an American film director.\n", - "Q: When did the director of film Laughter In Hell die?\n", - "A: The film Laughter In Hell was directed by Edward L. Cahn. Edward L. Cahn died on August 25, 1963. So the answer is: August 25, 1963.\n", - "\n", - "Second, please complete the answer by thinking step-by-step.\n", - "\n", - "Context:\n", - "Seoul High School: Seoul High School( Hangul: 서울고등학교) is a public high school located in the heart of Seoul, South Korea.\n", - "North Marion High School (Oregon): North Marion High School is a public high school in Aurora, Oregon, United States. The school is part of the North Marion School District with all four schools being located on the same campus. The school draws students from the cities of Aurora, Hubbard, and Donald as well as the communities of Broadacres and Butteville.\n", - "Marion High School (Kansas): Marion High School is a public high school in Marion, Kansas, USA. 
It is one of three schools operated by Marion USD 408, and is the sole high school in the district.\n", - "Northwest High School: Northwest High School or North West High School may refer to:\n", - "Marion High School (Indiana): Marion High School is a high school in Marion, Indiana with more than 1,000 students.\n", - "Macon County High School: Macon County High School is located in Montezuma, Georgia, United States, which is a part of Macon County. Enrollment as of the 2017- 2018 school year is 491.\n", - "Canyon High School (Ogden, Utah): Canyon High School was a high school in Ogden, Utah.\n", - "Northside High School: Northside High School or North Side High School or Northside Christian School or similar can refer to:\n", - "Springs Boys' High School: Springs Boys' High School is a high school in Springs, Gauteng, South Africa.\n", - "International School of Koje: International School of Koje( ISK) is a privately funded international school located in Geoje, South Korea.\n", - "Anderson High School (Anderson, Indiana): Anderson High School is a public high school located in Anderson, Indiana.\n", - "North Marion High School (West Virginia): North Marion High School is a public Double A (\"AA\") high school in the U.S. state of West Virginia, with a current enrollment of 851 students. North Marion High School is located approximately 4 miles from Farmington, West Virginia on US Route 250 north. While it is closer to the city of Mannington, West Virginia, and is often considered to be located in Rachel, West Virginia, the school mailing address is Farmington. Rachel is a small coal mining community located adjacent to the school, and is an unincorporated municipality. North Marion High School is represented as \"Grantville High School\" in the popular alternative history novel \"1632\" by writer Eric Flint. The novel is set in the fictional town of Grantville, which is based on the real town and surroundings of Mannington.\n", - "Q: Are North Marion High School (Oregon) and Seoul High School both located in the same country?\n", - "A:\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", - "\n", - "North Marion High School (Oregon) is located in the country of United States. Seoul High School is located in the country of South Korea. Thus, they are not in the same country. So the answer is: no.\n", - "\n", - "--------------------------------------------------------------------------------\n" + "Trying to create index.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "max_tokens is too small to fit a single line of text. Breaking this line:\n", + "\tClyde Thompson: Clyde Thompson( 1910 – July 1, 1979) was an American prisoner turned chaplain. He is ...\n", + "max_tokens is too small to fit a single line of text. 
Breaking this line:\n", + "\tAustralian Historical Monographs: The Australian Historical Monographs are a series of Historical st ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 57090 chunks.\n" ] } ], @@ -3079,6 +2329,13 @@ " qa_problem = questions[i]\n", " ragproxyagent.initiate_chat(assistant, problem=qa_problem, n_results=10)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -3097,7 +2354,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/notebook/agentchat_groupchat_RAG.ipynb b/notebook/agentchat_groupchat_RAG.ipynb index c68b3181950..87905ff2dcd 100644 --- a/notebook/agentchat_groupchat_RAG.ipynb +++ b/notebook/agentchat_groupchat_RAG.ipynb @@ -55,7 +55,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "LLM models: ['gpt-35-turbo', 'gpt-35-turbo-0613']\n" + "LLM models: ['gpt-4']\n" ] } ], @@ -110,20 +110,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 19, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/torch/cuda/__init__.py:138: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11060). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n", - " return torch._C._cuda_getDeviceCount() > 0\n" - ] - } - ], + "outputs": [], "source": [ "from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n", "from autogen import AssistantAgent\n", @@ -159,9 +148,8 @@ " \"docs_path\": \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Examples/Integrate%20-%20Spark.md\",\n", " \"chunk_token_size\": 1000,\n", " \"model\": config_list[0][\"model\"],\n", - " \"client\": chromadb.PersistentClient(path=\"/tmp/chromadb\"),\n", " \"collection_name\": \"groupchat\",\n", - " \"get_or_create\": True,\n", + " \"db_mode\": \"recreate\",\n", " },\n", " code_execution_config=False, # we don't want to execute code in this case.\n", ")\n", @@ -317,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -331,88 +319,67 @@ "--------------------------------------------------------------------------------\n", "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", "\n", - "To use Spark for parallel training in FLAML, you can use the `SparkTrials` class provided by FLAML. Here is a sample code:\n", + "Sure, you can use PySpark to parallelize the training process in FLAML. 
Here's a simple example of how you can do it:\n", "\n", "```python\n", "from flaml import AutoML\n", - "from flaml.data import load_credit\n", - "from flaml.model import SparkTrials\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import col\n", + "from sklearn.datasets import load_boston\n", + "import pandas as pd\n", "\n", - "# Load data\n", - "X_train, y_train, X_test, y_test = load_credit()\n", + "# Initialize SparkSession\n", + "spark = SparkSession.builder \\\n", + " .appName(\"Parallel Training with FLAML\") \\\n", + " .getOrCreate()\n", "\n", - "# Define the search space\n", - "search_space = {\n", - " \"n_estimators\": {\"domain\": range(10, 100)},\n", - " \"max_depth\": {\"domain\": range(6, 10)},\n", - " \"learning_rate\": {\"domain\": (0.01, 0.1, 1)},\n", - "}\n", + "# Load the dataset\n", + "boston = load_boston()\n", + "df = pd.DataFrame(boston.data, columns=boston.feature_names)\n", + "df['target'] = pd.Series(boston.target)\n", "\n", - "# Create an AutoML instance with SparkTrials\n", - "automl = AutoML(\n", - " search_space=search_space,\n", - " task=\"classification\",\n", - " n_jobs=1,\n", - " ensemble_size=0,\n", - " max_trials=10,\n", - " trials=SparkTrials(parallelism=2),\n", - ")\n", + "# Convert the pandas dataframe to spark dataframe\n", + "sdf = spark.createDataFrame(df)\n", "\n", - "# Train the model\n", - "automl.fit(X_train=X_train, y_train=y_train)\n", + "# Split the data into training and test sets\n", + "train, test = sdf.randomSplit([0.8, 0.2])\n", "\n", - "# Evaluate the model\n", - "print(\"Best model:\", automl.best_model)\n", - "print(\"Best hyperparameters:\", automl.best_config)\n", - "print(\"Test accuracy:\", automl.score(X_test=X_test, y_test=y_test))\n", + "# Convert the spark dataframes back to pandas dataframes\n", + "train = train.select(\"*\").toPandas()\n", + "test = test.select(\"*\").toPandas()\n", "\n", - "# Terminate\n", - "TERMINATE\n", - "```\n", - "\n", - "In this code, we first load the credit dataset. Then, we define the search space for the hyperparameters. We create an `AutoML` instance with `SparkTrials` as the `trials` parameter. We set the `parallelism` parameter to 2, which means that FLAML will use 2 Spark workers to run the trials in parallel. Finally, we fit the model and evaluate it.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", + "# Initialize the AutoML instance\n", + "automl = AutoML()\n", "\n", - "Great! That's a clear and concise example. No further questions from my side.\n", + "# Specify the task as regression and the metric to optimize as rmse\n", + "automl_settings = {\n", + " \"time_budget\": 120, # in seconds\n", + " \"metric\": 'rmse',\n", + " \"task\": 'regression',\n", + " \"log_file_name\": \"boston.log\",\n", + "}\n", "\n", - "--------------------------------------------------------------------------------\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", + "# Train the model\n", + "automl.fit(train.drop('target', axis=1), train['target'], **automl_settings)\n", "\n", - "Thank you! 
Let me know if you have any other questions.\n", + "# Predict on the test data\n", + "preds = automl.predict(test.drop('target', axis=1))\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mBoss\u001b[0m (to chat_manager):\n", + "# Print the best model and metric\n", + "print('Best ML model:', automl.best_estimator)\n", + "print('Best metric:', automl.best_loss)\n", + "```\n", "\n", - "Reply `TERMINATE` if the task is done.\n", + "This script loads the Boston housing dataset, splits it into training and test sets, and then uses FLAML's AutoML to find the best model and hyperparameters. The training process is parallelized using PySpark.\n", "\n", - "--------------------------------------------------------------------------------\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GroupChat select_speaker failed to resolve the next speaker's name. Speaker selection will default to the next speaker in the list. This is because the speaker selection OAI call returned:\n", - "The next role to play is not specified in the conversation. Please provide more information.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "Please note that this is a simple example and might not fully utilize the capabilities of Spark. For larger datasets and more complex scenarios, you might need to customize this script to better suit your needs.\n", "\n", - "TERMINATE\n", + "Also, please ensure that you have the necessary packages installed in your environment. You can install them using pip:\n", "\n", - "--------------------------------------------------------------------------------\n", + "```bash\n", + "pip install flaml[forecast]\n", + "pip install pyspark\n", + "```\n", "TERMINATE\n", "\n", "--------------------------------------------------------------------------------\n" @@ -434,30 +401,25 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 20, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Trying to create collection.\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2\n" + "INFO:autogen:Trying to create index. If the index already exists, it will be recreated.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "doc_ids: [['doc_0', 'doc_1']]\n", - "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", - "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", + "Found 2 chunks.\n", + "query: How to use spark for parallel training in FLAML? Give me sample code.\n", + "doc_ids: [['1', '0']]\n", + "\u001b[32mAdding doc_id 1 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id 0 to context.\u001b[0m\n", "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", "\n", "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", @@ -472,29 +434,47 @@ "\n", "User's question is: How to use spark for parallel training in FLAML? Give me sample code.\n", "\n", - "Context is: # Integrate - Spark\n", + "Context is: \n", + "use_spark: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. 
GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable FLAML_MAX_CONCURRENT to override the detected num_executors. The final number of concurrent trials will be the minimum of n_concurrent_trials and num_executors.\n", + "n_concurrent_trials: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performes parallel tuning.\n", + "force_cancel: boolean, default=False | Whether to forcely cancel Spark jobs if the search time exceeded the time budget. Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", + "An example code snippet for using parallel Spark jobs:\n", "\n", - "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", - "- Use Spark ML estimators for AutoML.\n", - "- Use Spark to run training in parallel spark jobs.\n", + "import flaml\n", + "automl_experiment = flaml.AutoML()\n", + "automl_settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"task\": \"regression\",\n", + " \"n_concurrent_trials\": 2,\n", + " \"use_spark\": True,\n", + " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", + "}\n", "\n", - "## Spark ML Estimators\n", + "automl.fit(\n", + " dataframe=dataframe,\n", + " label=label,\n", + " **automl_settings,\n", + ")\n", + "Integrate - Spark\n", + "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", "\n", + "Use Spark ML estimators for AutoML.\n", + "Use Spark to run training in parallel spark jobs.\n", + "Spark ML Estimators\n", "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", "\n", - "### Data\n", + "Data\n", + "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function to_pandas_on_spark in the flaml.automl.spark.utils module to convert your data into a pandas-on-spark (pyspark.pandas) dataframe/series, which Spark estimators require.\n", "\n", - "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", + "This utility function takes data in the form of a pandas.Dataframe or pyspark.sql.Dataframe and converts it into a pandas-on-spark dataframe. It also takes pandas.Series or pyspark.sql.Dataframe and converts it into a pandas-on-spark series. If you pass in a pyspark.pandas.Dataframe, it will not make any changes.\n", "\n", - "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. 
If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", - "\n", - "This function also accepts optional arguments `index_col` and `default_index_type`.\n", - "- `index_col` is the column name to use as the index, default is None.\n", - "- `default_index_type` is the default index type, default is \"distributed-sequence\". More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", + "This function also accepts optional arguments index_col and default_index_type.\n", "\n", + "index_col is the column name to use as the index, default is None.\n", + "default_index_type is the default index type, default is \"distributed-sequence\". More info about default index type could be found on Spark official documentation\n", "Here is an example code snippet for Spark Data:\n", "\n", - "```python\n", "import pandas as pd\n", "from flaml.automl.spark.utils import to_pandas_on_spark\n", "# Creating a dictionary\n", @@ -508,33 +488,27 @@ "\n", "# Convert to pandas-on-spark dataframe\n", "psdf = to_pandas_on_spark(dataframe)\n", - "```\n", - "\n", - "To use Spark ML models you need to format your data appropriately. Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", + "To use Spark ML models you need to format your data appropriately. Specifically, use VectorAssembler to merge all feature columns into a single vector column.\n", "\n", "Here is an example of how to use it:\n", - "```python\n", + "\n", "from pyspark.ml.feature import VectorAssembler\n", "columns = psdf.columns\n", "feature_cols = [col for col in columns if col != label]\n", "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", - "```\n", - "\n", - "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", + "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using X_train, y_train or dataframe, label.\n", "\n", - "### Estimators\n", - "#### Model List\n", - "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", - "\n", - "#### Usage\n", + "Estimators\n", + "Model List\n", + "lgbm_spark: The class for fine-tuning Spark version LightGBM models, using SynapseML API.\n", + "Usage\n", "First, prepare your data in the required format as described in the previous section.\n", "\n", - "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", + "By including the models you intend to try in the estimators_list argument to flaml.automl, FLAML will start trying configurations for these models. 
If your input is Spark data, FLAML will also use estimators with the _spark postfix by default, even if you haven't specified them.\n", "\n", "Here is an example code snippet using SparkML models in AutoML:\n", "\n", - "```python\n", "import flaml\n", "# prepare your data in pandas-on-spark format as we previously mentioned\n", "\n", @@ -551,78 +525,105 @@ " label=label,\n", " **settings,\n", ")\n", - "```\n", - "\n", - "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", + "Link to notebook | Open in colab\n", "\n", - "## Parallel Spark Jobs\n", - "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", + "Parallel Spark Jobs\n", + "You can activate Spark as the parallel backend during parallel tuning in both AutoML and Hyperparameter Tuning, by setting the use_spark to true. FLAML will dispatch your job to the distributed Spark backend using joblib-spark.\n", "\n", - "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", + "Please note that you should not set use_spark to true when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with use_spark again.\n", "\n", "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n", "\n", "\n", - "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n", - "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performes parallel tuning.\n", - "- `force_cancel`: boolean, default=False | Whether to forcely cancel Spark jobs if the search time exceeded the time budget. 
Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", - "\n", - "An example code snippet for using parallel Spark jobs:\n", - "```python\n", - "import flaml\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"task\": \"regression\",\n", - " \"n_concurrent_trials\": 2,\n", - " \"use_spark\": True,\n", - " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", - "}\n", "\n", - "automl.fit(\n", - " dataframe=dataframe,\n", - " label=label,\n", - " **automl_settings,\n", - ")\n", - "```\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", "\n", + "Based on the context provided, here is a sample code on how to use Spark for parallel training in FLAML:\n", "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", + "```python\n", + "# Import necessary libraries\n", + "import pandas as pd\n", + "from flaml.automl import AutoML\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "from pyspark.ml.feature import VectorAssembler\n", "\n", + "# Creating a dictionary\n", + "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n", "\n", + "# Creating a pandas DataFrame\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"Price\"\n", "\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(dataframe)\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "# Use VectorAssembler to merge all feature columns into a single vector column\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", "\n", - "To use Spark for parallel training in FLAML, you can activate Spark as the parallel backend during parallel tuning in both AutoML and Hyperparameter Tuning, by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using `joblib-spark`. 
Here is an example code snippet for using parallel Spark jobs:\n", + "# Initialize AutoML\n", + "automl = AutoML()\n", "\n", - "```python\n", - "import flaml\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", + "# Define settings\n", + "settings = {\n", " \"time_budget\": 30,\n", " \"metric\": \"r2\",\n", " \"task\": \"regression\",\n", " \"n_concurrent_trials\": 2,\n", " \"use_spark\": True,\n", - " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", + " \"force_cancel\": True,\n", "}\n", "\n", + "# Fit the model\n", "automl.fit(\n", - " dataframe=dataframe,\n", + " dataframe=psdf,\n", " label=label,\n", - " **automl_settings,\n", + " **settings,\n", ")\n", "```\n", "\n", - "Note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", + "This code first creates a pandas DataFrame and converts it to a pandas-on-spark DataFrame. Then, it uses VectorAssembler to merge all feature columns into a single vector column. After that, it initializes AutoML and defines the settings for the model. Finally, it fits the model using the defined settings. \n", "\n", - "I hope this helps! Let me know if you have any further questions.\n", + "Please note that you need to have a running Spark session to use Spark for parallel training in FLAML.\n", "\n", - "--------------------------------------------------------------------------------\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", "\n", "TERMINATE\n", @@ -647,7 +648,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -658,136 +659,70 @@ "\n", "How to use spark for parallel training in FLAML? Give me sample code.\n", "\n", - "--------------------------------------------------------------------------------\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", - "\n", - "To use Spark for parallel training in FLAML, you can follow these steps:\n", - "\n", - "1. Install PySpark and FLAML on your machine.\n", - "2. Start a Spark cluster using the `pyspark` command.\n", - "3. Import the necessary libraries and initialize a SparkSession object.\n", - "4. Load your data into a Spark DataFrame.\n", - "5. Define your search space and search strategy using FLAML's API.\n", - "6. 
Create a SparkEstimator object and pass it to FLAML's `fit()` method.\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", "\n", - "Here's some sample code to get you started:\n", + "Sure, you can use PySpark to parallelize the training process in FLAML. Here's a simple example of how you can do it:\n", "\n", "```python\n", - "from pyspark.sql import SparkSession\n", "from flaml import AutoML\n", - "from flaml.data import get_output_from_log\n", - "\n", - "# Initialize a SparkSession object\n", - "spark = SparkSession.builder.appName(\"FLAML-Spark\").getOrCreate()\n", - "\n", - "# Load your data into a Spark DataFrame\n", - "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"path/to/data.csv\")\n", - "\n", - "# Define your search space and search strategy\n", - "search_space = {\n", - " \"n_estimators\": {\"domain\": range(10, 100)},\n", - " \"max_depth\": {\"domain\": range(1, 10)},\n", - " \"learning_rate\": {\"domain\": [0.001, 0.01, 0.1]},\n", - "}\n", - "search_strategy = \"skopt\"\n", - "\n", - "# Create a SparkEstimator object\n", - "from pyspark.ml.classification import GBTClassifier\n", - "estimator = GBTClassifier()\n", - "\n", - "# Pass the SparkEstimator object to FLAML's fit() method\n", - "automl = AutoML()\n", - "automl.fit(\n", - " X_train=data,\n", - " estimator=estimator,\n", - " task=\"classification\",\n", - " search_space=search_space,\n", - " search_alg=search_strategy,\n", - " n_jobs=-1,\n", - ")\n", - "\n", - "# Get the best model and its hyperparameters\n", - "best_model = automl.model\n", - "best_params = automl.best_config\n", - "\n", - "# Print the results\n", - "print(f\"Best model: {best_model}\")\n", - "print(f\"Best hyperparameters: {best_params}\")\n", - "\n", - "# Stop the SparkSession object\n", - "spark.stop()\n", - "```\n", - "\n", - "Note that the `n_jobs` parameter is set to `-1` to use all available cores on the Spark cluster. You can adjust this value to control the level of parallelism. Also, the `get_output_from_log()` function can be used to extract the results from the FLAML log file. \n", - "\n", - "TERMINATE\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import col\n", + "from sklearn.datasets import load_boston\n", + "import pandas as pd\n", "\n", - "--------------------------------------------------------------------------------\n", - "To use Spark for parallel training in FLAML, you can follow these steps:\n", + "# Initialize SparkSession\n", + "spark = SparkSession.builder \\\n", + " .appName(\"Parallel Training with FLAML\") \\\n", + " .getOrCreate()\n", "\n", - "1. Install PySpark and FLAML on your machine.\n", - "2. Start a Spark cluster using the `pyspark` command.\n", - "3. Import the necessary libraries and initialize a SparkSession object.\n", - "4. Load your data into a Spark DataFrame.\n", - "5. Define your search space and search strategy using FLAML's API.\n", - "6. 
Create a SparkEstimator object and pass it to FLAML's `fit()` method.\n", + "# Load the dataset\n", + "boston = load_boston()\n", + "df = pd.DataFrame(boston.data, columns=boston.feature_names)\n", + "df['target'] = pd.Series(boston.target)\n", "\n", - "Here's some sample code to get you started:\n", + "# Convert the pandas dataframe to spark dataframe\n", + "sdf = spark.createDataFrame(df)\n", "\n", - "```python\n", - "from pyspark.sql import SparkSession\n", - "from flaml import AutoML\n", - "from flaml.data import get_output_from_log\n", + "# Split the data into training and test sets\n", + "train, test = sdf.randomSplit([0.8, 0.2])\n", "\n", - "# Initialize a SparkSession object\n", - "spark = SparkSession.builder.appName(\"FLAML-Spark\").getOrCreate()\n", + "# Convert the spark dataframes back to pandas dataframes\n", + "train = train.select(\"*\").toPandas()\n", + "test = test.select(\"*\").toPandas()\n", "\n", - "# Load your data into a Spark DataFrame\n", - "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"path/to/data.csv\")\n", + "# Initialize the AutoML instance\n", + "automl = AutoML()\n", "\n", - "# Define your search space and search strategy\n", - "search_space = {\n", - " \"n_estimators\": {\"domain\": range(10, 100)},\n", - " \"max_depth\": {\"domain\": range(1, 10)},\n", - " \"learning_rate\": {\"domain\": [0.001, 0.01, 0.1]},\n", + "# Specify the task as regression and the metric to optimize as rmse\n", + "automl_settings = {\n", + " \"time_budget\": 120, # in seconds\n", + " \"metric\": 'rmse',\n", + " \"task\": 'regression',\n", + " \"log_file_name\": \"boston.log\",\n", "}\n", - "search_strategy = \"skopt\"\n", "\n", - "# Create a SparkEstimator object\n", - "from pyspark.ml.classification import GBTClassifier\n", - "estimator = GBTClassifier()\n", + "# Train the model\n", + "automl.fit(train.drop('target', axis=1), train['target'], **automl_settings)\n", "\n", - "# Pass the SparkEstimator object to FLAML's fit() method\n", - "automl = AutoML()\n", - "automl.fit(\n", - " X_train=data,\n", - " estimator=estimator,\n", - " task=\"classification\",\n", - " search_space=search_space,\n", - " search_alg=search_strategy,\n", - " n_jobs=-1,\n", - ")\n", + "# Predict on the test data\n", + "preds = automl.predict(test.drop('target', axis=1))\n", "\n", - "# Get the best model and its hyperparameters\n", - "best_model = automl.model\n", - "best_params = automl.best_config\n", + "# Print the best model and metric\n", + "print('Best ML model:', automl.best_estimator)\n", + "print('Best metric:', automl.best_loss)\n", + "```\n", "\n", - "# Print the results\n", - "print(f\"Best model: {best_model}\")\n", - "print(f\"Best hyperparameters: {best_params}\")\n", + "This script loads the Boston housing dataset, splits it into training and test sets, and then uses FLAML's AutoML to find the best model and hyperparameters. The training process is parallelized using PySpark.\n", "\n", - "# Stop the SparkSession object\n", - "spark.stop()\n", - "```\n", + "Please note that this is a simple example and might not fully utilize the capabilities of Spark. For larger datasets and more complex scenarios, you might need to customize this script to better suit your needs.\n", "\n", - "Note that the `n_jobs` parameter is set to `-1` to use all available cores on the Spark cluster. You can adjust this value to control the level of parallelism. Also, the `get_output_from_log()` function can be used to extract the results from the FLAML log file. 
\n", + "Also, please ensure that you have the necessary packages installed in your environment. You can install them using pip:\n", "\n", + "```bash\n", + "pip install flaml[forecast]\n", + "pip install pyspark\n", + "```\n", "TERMINATE\n", "\n", "--------------------------------------------------------------------------------\n" @@ -797,11 +732,18 @@ "source": [ "call_rag_chat()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "flaml", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -815,9 +757,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.6" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/setup.py b/setup.py index 21de92527a3..760da4f474b 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,7 @@ ], "blendsearch": ["flaml[blendsearch]"], "mathchat": ["sympy", "pydantic==1.10.9", "wolframalpha"], - "retrievechat": ["chromadb", "sentence_transformers", "pypdf", "ipython"], + "retrievechat": ["lancedb", "sentence_transformers", "pypdf", "ipython"], "teachable": ["chromadb"], "lmm": ["replicate", "pillow"], "graphs": ["networkx~=3.2.1", "matplotlib~=3.8.1"], diff --git a/test/agentchat/contrib/retrievers/test_chromadb.py b/test/agentchat/contrib/retrievers/test_chromadb.py new file mode 100644 index 00000000000..6e8b37a8c8d --- /dev/null +++ b/test/agentchat/contrib/retrievers/test_chromadb.py @@ -0,0 +1,46 @@ +import os +import pytest +from pathlib import Path +from autogen.agentchat.contrib.retriever.retrieve_utils import ( + split_text_to_chunks, + extract_text_from_pdf, + split_files_to_chunks, + get_files_from_dir, + is_url, +) + +try: + from autogen.agentchat.contrib.retriever.chromadb import ChromaDB + import chromadb +except ImportError: + skip = True +else: + skip = False + +test_dir = Path(__file__).parent.parent.parent.parent / "test_files" + + +@pytest.mark.skipif(skip, reason="chromadb is not installed") +def test_chromadb(tmpdir): + # Test index creation and querying + client = chromadb.PersistentClient(path=tmpdir) + vectorstore = ChromaDB(path=tmpdir) + + vectorstore.ingest_data(str(test_dir)) + + assert client.get_collection("vectorstore") + + results = vectorstore.query(["autogen"]) + assert isinstance(results, dict) and any("autogen" in res[0].lower() for res in results.get("documents", [])) + + # Test index_exists() + vectorstore = ChromaDB(path=tmpdir) + assert vectorstore.index_exists + + # Test use_existing_index() + assert vectorstore.collection is None + vectorstore.use_existing_index() + assert vectorstore.collection is not None + + vectorstore.ingest_data(str(test_dir), overwrite=True) + vectorstore.query(["hello"]) diff --git a/test/agentchat/contrib/retrievers/test_lancedb.py b/test/agentchat/contrib/retrievers/test_lancedb.py new file mode 100644 index 00000000000..c8450926b7d --- /dev/null +++ b/test/agentchat/contrib/retrievers/test_lancedb.py @@ -0,0 +1,45 @@ +import numpy as np +from pathlib import Path +import pytest + +try: + from autogen.agentchat.contrib.retriever.lancedb import LanceDB + import lancedb +except ImportError: + skip = True +else: + skip = False + + +test_dir = Path(__file__).parent.parent.parent.parent / "test_files" + + +def embedding_fcn(texts): + return [np.array([0, 0]) for _ in texts] + + +@pytest.mark.skipif(skip, reason="lancedb is not installed") +def test_lancedb(tmpdir): 
+ db = lancedb.connect(str(tmpdir)) + vectorstore = LanceDB(path=str(tmpdir)) + vectorstore.ingest_data(str(test_dir)) + + assert "vectorstore" in db.table_names() + + results = vectorstore.query(["autogen"]) + assert isinstance(results, dict) and any("autogen" in res[0].lower() for res in results.get("documents", [])) + + # Test index_exists() + vectorstore = LanceDB(path=str(tmpdir)) + assert vectorstore.index_exists + + # Test use_existing_index() + assert vectorstore.table is None + vectorstore.use_existing_index() + assert vectorstore.table is not None + + vectorstore.ingest_data(str(test_dir), overwrite=True) + vectorstore.query(["hello"]) + + vectorstore = LanceDB(path=str(tmpdir), embedding_function=embedding_fcn) + vectorstore.ingest_data(str(test_dir), overwrite=True) diff --git a/test/agentchat/contrib/test_qdrant_retrievechat.py b/test/agentchat/contrib/retrievers/test_qdrant_retrievechat.py similarity index 92% rename from test/agentchat/contrib/test_qdrant_retrievechat.py rename to test/agentchat/contrib/retrievers/test_qdrant_retrievechat.py index 1d3c5afd6af..e0dee0b21a4 100644 --- a/test/agentchat/contrib/test_qdrant_retrievechat.py +++ b/test/agentchat/contrib/retrievers/test_qdrant_retrievechat.py @@ -1,12 +1,10 @@ import os import sys +from pathlib import Path import pytest from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent from autogen import config_list_from_json -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST # noqa: E402 - try: from qdrant_client import QdrantClient from autogen.agentchat.contrib.qdrant_retrieve_user_proxy_agent import ( @@ -27,7 +25,10 @@ except ImportError: OPENAI_INSTALLED = False -test_dir = os.path.join(os.path.dirname(__file__), "../..", "test_files") + +KEY_LOC = "notebook" +OAI_CONFIG_LIST = "OAI_CONFIG_LIST" +test_dir = Path(__file__).parent.parent.parent.parent / "test_files" @pytest.mark.skipif( @@ -90,7 +91,7 @@ def test_qdrant_filter(): @pytest.mark.skipif(not QDRANT_INSTALLED, reason="qdrant_client is not installed") def test_qdrant_search(): client = QdrantClient(":memory:") - create_qdrant_from_dir(test_dir, client=client) + create_qdrant_from_dir(str(test_dir), client=client) assert client.get_collection("all-my-documents") diff --git a/test/test_retrieve_utils.py b/test/agentchat/contrib/retrievers/test_retrieve_utils.py similarity index 77% rename from test/test_retrieve_utils.py rename to test/agentchat/contrib/retrievers/test_retrieve_utils.py index 013228e47ce..8dc657cb986 100644 --- a/test/test_retrieve_utils.py +++ b/test/agentchat/contrib/retrievers/test_retrieve_utils.py @@ -1,25 +1,27 @@ """ Unit test for retrieve_utils.py """ +import os +import sys +from pathlib import Path +import pytest + try: - import chromadb - from autogen.retrieve_utils import ( + from autogen.agentchat.contrib.retriever.retrieve_utils import ( split_text_to_chunks, extract_text_from_pdf, split_files_to_chunks, get_files_from_dir, is_url, - create_vector_db_from_dir, - query_vector_db, ) + from autogen.agentchat.contrib.retriever import DEFAULT_RETRIEVER, get_retriever from autogen.token_count_utils import count_token + + Retriever = get_retriever(DEFAULT_RETRIEVER) except ImportError: skip = True else: skip = False -import os -import sys -import pytest try: from unstructured.partition.auto import partition @@ -28,7 +30,7 @@ except ImportError: HAS_UNSTRUCTURED = False -test_dir = os.path.join(os.path.dirname(__file__), "test_files") 
+test_dir = Path(__file__).parent.parent.parent.parent / "test_files" expected_text = """AutoGen is an advanced tool designed to assist developers in harnessing the capabilities of Large Language Models (LLMs) for various applications. The primary purpose of AutoGen is to automate and simplify the process of building applications that leverage the power of LLMs, allowing for seamless @@ -93,26 +95,46 @@ def test_is_url(self): assert is_url("https://www.example.com") assert not is_url("not_a_url") - def test_create_vector_db_from_dir(self): - db_path = "/tmp/test_retrieve_utils_chromadb.db" - if os.path.exists(db_path): - client = chromadb.PersistentClient(path=db_path) - else: - client = chromadb.PersistentClient(path=db_path) - create_vector_db_from_dir(test_dir, client=client) + def test_custom_text_split_function(self): + def custom_text_split_function(text): + return [text[: len(text) // 2], text[len(text) // 2 :]] + + db_path = "/tmp/test_retrieve_utils" + retriever = Retriever( + path=db_path, + name="mytestcollection", + custom_text_split_function=custom_text_split_function, + use_existing=False, + recursive=False, + ) + retriever.ingest_data(os.path.join(test_dir, "example.txt")) + results = retriever.query(["autogen"], top_k=1) + assert ( + "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities" + in results.get("documents")[0][0] + ) - assert client.get_collection("all-my-documents") + def test_retrieve_utils(self): + retriever = Retriever(path="/tmp/chromadb", name="autogen-docs", use_existing=False) + retriever.ingest_data("./website/docs") + results = retriever.query(["autogen"], top_k=4, filter="AutoGen") - def test_query_vector_db(self): - db_path = "/tmp/test_retrieve_utils_chromadb.db" - if os.path.exists(db_path): - client = chromadb.PersistentClient(path=db_path) - else: # If the database does not exist, create it first - client = chromadb.PersistentClient(path=db_path) - create_vector_db_from_dir(test_dir, client=client) + print(results["ids"][0]) + assert len(results["ids"][0]) == 4 - results = query_vector_db(["autogen"], client=client) - assert isinstance(results, dict) and any("autogen" in res[0].lower() for res in results.get("documents", [])) + @pytest.mark.skipif( + not HAS_UNSTRUCTURED, + reason="do not run if unstructured is not installed", + ) + def test_unstructured(self): + pdf_file_path = os.path.join(test_dir, "example.pdf") + txt_file_path = os.path.join(test_dir, "example.txt") + word_file_path = os.path.join(test_dir, "example.docx") + chunks = split_files_to_chunks([pdf_file_path, txt_file_path, word_file_path]) + assert all( + isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip() + for chunk in chunks + ) def test_custom_vector_db(self): try: @@ -178,61 +200,6 @@ def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str = ragragproxyagent.retrieve_docs("This is a test document spark", n_results=10, search_string="spark") assert ragragproxyagent._results["ids"] == [[3, 1, 5]] - def test_custom_text_split_function(self): - def custom_text_split_function(text): - return [text[: len(text) // 2], text[len(text) // 2 :]] - - db_path = "/tmp/test_retrieve_utils_chromadb.db" - client = chromadb.PersistentClient(path=db_path) - create_vector_db_from_dir( - os.path.join(test_dir, "example.txt"), - client=client, - collection_name="mytestcollection", - custom_text_split_function=custom_text_split_function, - get_or_create=True, - recursive=False, - ) - 
results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1) - assert ( - "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities" - in results.get("documents")[0][0] - ) - - def test_retrieve_utils(self): - client = chromadb.PersistentClient(path="/tmp/chromadb") - create_vector_db_from_dir( - dir_path="./website/docs", - client=client, - collection_name="autogen-docs", - custom_text_types=["txt", "md", "rtf", "rst"], - get_or_create=True, - ) - results = query_vector_db( - query_texts=[ - "How can I use AutoGen UserProxyAgent and AssistantAgent to do code generation?", - ], - n_results=4, - client=client, - collection_name="autogen-docs", - search_string="AutoGen", - ) - print(results["ids"][0]) - assert len(results["ids"][0]) == 4 - - @pytest.mark.skipif( - not HAS_UNSTRUCTURED, - reason="do not run if unstructured is not installed", - ) - def test_unstructured(self): - pdf_file_path = os.path.join(test_dir, "example.pdf") - txt_file_path = os.path.join(test_dir, "example.txt") - word_file_path = os.path.join(test_dir, "example.docx") - chunks = split_files_to_chunks([pdf_file_path, txt_file_path, word_file_path]) - assert all( - isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip() - for chunk in chunks - ) - if __name__ == "__main__": pytest.main() diff --git a/test/agentchat/contrib/test_retrievechat.py b/test/agentchat/contrib/retrievers/test_retrievechat.py similarity index 95% rename from test/agentchat/contrib/test_retrievechat.py rename to test/agentchat/contrib/retrievers/test_retrievechat.py index 574e3571b62..64941b25092 100644 --- a/test/agentchat/contrib/test_retrievechat.py +++ b/test/agentchat/contrib/retrievers/test_retrievechat.py @@ -1,11 +1,9 @@ import pytest import os import sys +from pathlib import Path import autogen -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST # noqa: E402 - try: import openai from autogen.agentchat.contrib.retrieve_assistant_agent import ( @@ -21,6 +19,9 @@ except ImportError: skip_test = True +KEY_LOC = "notebook" +OAI_CONFIG_LIST = "OAI_CONFIG_LIST" + @pytest.mark.skipif( sys.platform in ["darwin", "win32"] or skip_test, @@ -51,10 +52,10 @@ def test_retrievechat(): human_input_mode="NEVER", max_consecutive_auto_reply=2, retrieve_config={ + "client": chromadb.PersistentClient(path="/tmp/chromadb"), "docs_path": "./website/docs", "chunk_token_size": 2000, "model": config_list[0]["model"], - "client": chromadb.PersistentClient(path="/tmp/chromadb"), "embedding_function": sentence_transformer_ef, "get_or_create": True, },
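Taken together, the relocated tests above sketch the surface of the new retriever abstraction. For reference while reviewing, here is a minimal end-to-end usage sketch built only from names that appear in this diff (`get_retriever`, `DEFAULT_RETRIEVER`, `ingest_data`, `query`); the index path, collection name, and query text are illustrative placeholders, not part of the change:

```python
from autogen.agentchat.contrib.retriever import DEFAULT_RETRIEVER, get_retriever

# Resolve the retriever class registered for the default backend.
Retriever = get_retriever(DEFAULT_RETRIEVER)

# Build a fresh index and ingest a docs directory (path/name are placeholders).
retriever = Retriever(path="/tmp/demo_index", name="demo-docs", use_existing=False)
retriever.ingest_data("./website/docs")

# query() returns a chromadb-style dict keyed by "ids" and "documents".
results = retriever.query(["How can I use AutoGen agents?"], top_k=4)
print(results["ids"][0])
print(results["documents"][0][0][:100])
```

A second instantiation over the same `path` can then call `use_existing_index()` to reuse the index instead of re-ingesting, mirroring what `test_chromadb.py` and `test_lancedb.py` assert.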
| # | Event year | Season | Ceremony | Flag bearer | Sex | State / Country | Sport |
|---|------------|--------|----------|-------------|-----|-----------------|-------|
| 62 | 2018 | Winter | Closing | Jessica Diggins | | Minnesota | Cross-country skiing |
| 61 | 2018 | Winter | Opening | Erin Hamlin | | New York | Luge |
| 60 | 2016 | Summer | Closing | Simone Biles | | Texas | Gymnastics |
| 59 | 2016 | Summer | Opening | Michael Phelps | | Maryland | Swimming |
| 58 | 2014 | Winter | Closing | Julie Chu | | Connecticut | Hockey |
| 57 | 2014 | Winter | Opening | Todd Lodwick | | Colorado | Nordic combined |
| 56 | 2012 | Summer | Closing | Bryshon Nellum | | California | Athletics |
| 55 | 2012 | Summer | Opening | Mariel Zagunis | | Oregon | Fencing |
| 54 | | Winter | Closing | Bill Demong | | New York | Nordic combined |
| 53 | | Winter | Opening | Mark Grimmette | | Michigan | Luge |
| 52 | 2008 | Summer | Closing | Khatuna Lorig | | Georgia (country) | Archery |
| 51 | 2008 | Summer | Opening | Lopez Lomong | | Sudan (now South Sudan) | Athletics |
| 50 | 2006 | Winter | Closing | Joey Cheek | | North Carolina | Speed skating |
| 49 | 2006 | Winter | Opening | Chris Witty | | Wisconsin | Speed skating |
| 48 | | Summer | Closing | Mia Hamm | | Texas | Women's soccer |
| 47 | | Summer | Opening | Dawn Staley | | Pennsylvania | Basketball |
| 46 | 2002 | Winter | Closing | Brian Shimer | | Florida | Bobsleigh |
| 45 | 2002 | Winter | Opening | Amy Peterson | | Minnesota | Short track speed skating |
| 44 | 2000 | Summer | Closing | Rulon Gardner | | Wyoming | Wrestling |
| 43 | 2000 | Summer | Opening | Cliff Meidl | | California | Canoeing |
| 42 | 1998 | Winter | Closing | Cammi Granato | | Illinois | Hockey |
| 41 | 1998 | Winter | Opening | Eric Flaim | | Massachusetts | Speed skating |
| 40 | | Summer | Closing | Michael Matz | | Pennsylvania | Equestrian |
| 39 | | Summer | Opening | Bruce Baumgartner | | New Jersey | Wrestling |
| 38 | 1994 | Winter | Closing | Dan Jansen | | Wisconsin | Speed skating |
| 37 | 1994 | Winter | Opening | Cammy Myler | | New York | |