From 5cdc3ee1b318fabac410a2abd3d8e7f7f691433a Mon Sep 17 00:00:00 2001
From: Li Jiang
Date: Sun, 8 Oct 2023 07:33:44 +0000
Subject: [PATCH 1/4] Improve num_tokens_from_text

---
 .../contrib/retrieve_user_proxy_agent.py |  6 +-
 autogen/retrieve_utils.py                | 69 +++++++++----------
 test/test_retrieve_utils.py              |  5 ++
 3 files changed, 41 insertions(+), 39 deletions(-)

diff --git a/autogen/agentchat/contrib/retrieve_user_proxy_agent.py b/autogen/agentchat/contrib/retrieve_user_proxy_agent.py
index 11193a91e04..443755d2597 100644
--- a/autogen/agentchat/contrib/retrieve_user_proxy_agent.py
+++ b/autogen/agentchat/contrib/retrieve_user_proxy_agent.py
@@ -128,6 +128,9 @@ def __init__(
             - update_context (Optional, bool): if False, will not apply `Update Context` for interactive retrieval. Default is True.
             - get_or_create (Optional, bool): if True, will create/recreate a collection for the retrieve chat.
                 This is the same as that used in chromadb. Default is False.
+            - custom_token_count_function (Optional, Callable): a custom function to count the number of tokens in a string.
+                The function should take a string as input and return three integers (token_count, tokens_per_message, tokens_per_name).
+                Default is None, tiktoken will be used and works well for OpenAI models, but may not be accurate for other models.
             **kwargs (dict): other kwargs in [UserProxyAgent](../user_proxy_agent#__init__).
         """
         super().__init__(
@@ -152,6 +155,7 @@ def __init__(
         self.customized_answer_prefix = self._retrieve_config.get("customized_answer_prefix", "").upper()
         self.update_context = self._retrieve_config.get("update_context", True)
         self._get_or_create = self._retrieve_config.get("get_or_create", False)
+        self.custom_token_count_function = self._retrieve_config.get("custom_token_count_function", None)
         self._context_max_tokens = self._max_tokens * 0.8
         self._collection = False  # the collection is not created
         self._ipython = get_ipython()
@@ -191,7 +195,7 @@ def _get_context(self, results):
                 continue
             if results["ids"][0][idx] in self._doc_ids:
                 continue
-            _doc_tokens = num_tokens_from_text(doc)
+            _doc_tokens = num_tokens_from_text(doc, custom_token_count_function=self.custom_token_count_function)
             if _doc_tokens > self._context_max_tokens:
                 func_print = f"Skip doc_id {results['ids'][0][idx]} as it is too long to fit in the context."
                 print(colored(func_print, "green"), flush=True)
diff --git a/autogen/retrieve_utils.py b/autogen/retrieve_utils.py
index ecca2f2b0bf..9bc6e33470b 100644
--- a/autogen/retrieve_utils.py
+++ b/autogen/retrieve_utils.py
@@ -1,4 +1,4 @@
-from typing import List, Union, Dict, Tuple
+from typing import List, Union, Dict, Tuple, Callable
 import os
 import requests
 from urllib.parse import urlparse
@@ -33,59 +33,52 @@


 def num_tokens_from_text(
-    text: str, model: str = "gpt-3.5-turbo-0613", return_tokens_per_name_and_message: bool = False
+    text: str, model: str = "gpt-3.5-turbo-0613", return_tokens_per_name_and_message: bool = False, custom_token_count_function: Callable = None
 ) -> Union[int, Tuple[int, int, int]]:
     """Return the number of tokens used by a text."""
-    # https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
-    try:
-        encoding = tiktoken.encoding_for_model(model)
-    except KeyError:
-        logger.debug("Warning: model not found. Using cl100k_base encoding.")
-        encoding = tiktoken.get_encoding("cl100k_base")
-    if model in {
-        "gpt-3.5-turbo-0613",
-        "gpt-3.5-turbo-16k-0613",
-        "gpt-4-0314",
-        "gpt-4-32k-0314",
-        "gpt-4-0613",
-        "gpt-4-32k-0613",
-    }:
-        tokens_per_message = 3
-        tokens_per_name = 1
-    elif model == "gpt-3.5-turbo-0301":
-        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
-        tokens_per_name = -1  # if there's a name, the role is omitted
-    elif "gpt-3.5-turbo" in model or "gpt-35-turbo" in model:
-        logger.warning("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
-        return num_tokens_from_text(text, model="gpt-3.5-turbo-0613")
-    elif "gpt-4" in model:
-        logger.warning("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
-        return num_tokens_from_text(text, model="gpt-4-0613")
-    else:
-        raise NotImplementedError(
-            f"""num_tokens_from_text() is not implemented for model {model}. See """
-            f"""https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are """
-            f"""converted to tokens."""
-        )
+    if isinstance(custom_token_count_function, Callable):
+        token_count, tokens_per_message, tokens_per_name = custom_token_count_function(text)
+    else:
+        # https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
+        try:
+            encoding = tiktoken.encoding_for_model(model)
+        except KeyError:
+            logger.debug("Warning: model not found. Using cl100k_base encoding.")
+            encoding = tiktoken.get_encoding("cl100k_base")
+        known_models = {
+            "gpt-3.5-turbo":(3,1),
+            "gpt-35-turbo":(3,1),
+            "gpt-3.5-turbo-0613":(3,1),
+            "gpt-3.5-turbo-16k-0613":(3,1),
+            "gpt-3.5-turbo-0301":(4,-1),
+            "gpt-4":(3,1),
+            "gpt-4-0314":(3,1),
+            "gpt-4-32k-0314":(3,1),
+            "gpt-4-0613":(3,1),
+            "gpt-4-32k-0613":(3,1),
+        }
+        tokens_per_message, tokens_per_name = known_models.get(model, (3,1))
+        token_count = len(encoding.encode(text))
+
     if return_tokens_per_name_and_message:
-        return len(encoding.encode(text)), tokens_per_message, tokens_per_name
+        return token_count, tokens_per_message, tokens_per_name
     else:
-        return len(encoding.encode(text))
+        return token_count


-def num_tokens_from_messages(messages: dict, model: str = "gpt-3.5-turbo-0613"):
+def num_tokens_from_messages(messages: dict, model: str = "gpt-3.5-turbo-0613", custom_token_count_function: Callable = None, custom_prime_count=3):
     """Return the number of tokens used by a list of messages."""
     num_tokens = 0
     for message in messages:
         for key, value in message.items():
             _num_tokens, tokens_per_message, tokens_per_name = num_tokens_from_text(
-                value, model=model, return_tokens_per_name_and_message=True
+                value, model=model, return_tokens_per_name_and_message=True, custom_token_count_function=custom_token_count_function
             )
             num_tokens += _num_tokens
             if key == "name":
                 num_tokens += tokens_per_name
             num_tokens += tokens_per_message
-    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
+    num_tokens += custom_prime_count  # By default every reply is primed with <|start|>assistant<|message|>
     return num_tokens
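The rewrite above also changes behavior for unrecognized model names: instead of raising NotImplementedError, the function now falls back to the cl100k_base encoding and the default (3, 1) overheads from known_models.get(). A minimal sketch of that fallback path; the model id is hypothetical:

    import tiktoken

    # Hypothetical fine-tuned model id that tiktoken does not recognize.
    try:
        encoding = tiktoken.encoding_for_model("my-custom-model")
    except KeyError:
        # New behavior: fall back to cl100k_base instead of raising NotImplementedError.
        encoding = tiktoken.get_encoding("cl100k_base")

    tokens_per_message, tokens_per_name = (3, 1)  # known_models.get(model, (3, 1)) default
    token_count = len(encoding.encode("This is a sample text."))
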
diff --git a/test/test_retrieve_utils.py b/test/test_retrieve_utils.py
index b0743455fbc..9c30bdc6de7 100644
--- a/test/test_retrieve_utils.py
+++ b/test/test_retrieve_utils.py
@@ -31,6 +31,11 @@


 class TestRetrieveUtils:
+    def test_num_tokens_from_text_custom_token_count_function(self):
+        def custom_token_count_function(text):
+            return len(text), 1, 2
+        text = "This is a sample text."
+        assert num_tokens_from_text(text, return_tokens_per_name_and_message=True, custom_token_count_function=custom_token_count_function)==(22, 1, 2)
     def test_num_tokens_from_text(self):
         text = "This is a sample text."
         assert num_tokens_from_text(text) == len(tiktoken.get_encoding("cl100k_base").encode(text))
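The new test above exercises the contract with a plain character count (len(text) is 22 for the sample string). For a real non-OpenAI model, a custom_token_count_function would more plausibly wrap that model's own tokenizer; in the sketch below the whitespace split is a crude stand-in for one, and the (3, 1) overheads are assumptions:

    from autogen.retrieve_utils import num_tokens_from_text

    def approx_token_count(text: str):
        # Contract from this patch: return (token_count, tokens_per_message, tokens_per_name).
        return len(text.split()), 3, 1

    # Only token_count is returned unless return_tokens_per_name_and_message=True.
    assert num_tokens_from_text("This is a sample text.", custom_token_count_function=approx_token_count) == 5
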
From f72992f2d9be6f0fe40f84d00fc4938404d5e978 Mon Sep 17 00:00:00 2001
From: Li Jiang
Date: Sun, 8 Oct 2023 07:34:18 +0000
Subject: [PATCH 2/4] Format

---
 autogen/retrieve_utils.py   | 41 +++++++++++++++++++++++--------------
 test/test_retrieve_utils.py |  6 +++++-
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/autogen/retrieve_utils.py b/autogen/retrieve_utils.py
index 9bc6e33470b..fa5e9c6a88d 100644
--- a/autogen/retrieve_utils.py
+++ b/autogen/retrieve_utils.py
@@ -33,12 +33,15 @@


 def num_tokens_from_text(
-    text: str, model: str = "gpt-3.5-turbo-0613", return_tokens_per_name_and_message: bool = False, custom_token_count_function: Callable = None
+    text: str,
+    model: str = "gpt-3.5-turbo-0613",
+    return_tokens_per_name_and_message: bool = False,
+    custom_token_count_function: Callable = None,
 ) -> Union[int, Tuple[int, int, int]]:
     """Return the number of tokens used by a text."""
     if isinstance(custom_token_count_function, Callable):
         token_count, tokens_per_message, tokens_per_name = custom_token_count_function(text)
-    else: 
+    else:
         # https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
         try:
             encoding = tiktoken.encoding_for_model(model)
@@ -46,18 +49,18 @@ def num_tokens_from_text(
             logger.debug("Warning: model not found. Using cl100k_base encoding.")
             encoding = tiktoken.get_encoding("cl100k_base")
         known_models = {
-            "gpt-3.5-turbo":(3,1),
-            "gpt-35-turbo":(3,1),
-            "gpt-3.5-turbo-0613":(3,1),
-            "gpt-3.5-turbo-16k-0613":(3,1),
-            "gpt-3.5-turbo-0301":(4,-1),
-            "gpt-4":(3,1),
-            "gpt-4-0314":(3,1),
-            "gpt-4-32k-0314":(3,1),
-            "gpt-4-0613":(3,1),
-            "gpt-4-32k-0613":(3,1),
+            "gpt-3.5-turbo": (3, 1),
+            "gpt-35-turbo": (3, 1),
+            "gpt-3.5-turbo-0613": (3, 1),
+            "gpt-3.5-turbo-16k-0613": (3, 1),
+            "gpt-3.5-turbo-0301": (4, -1),
+            "gpt-4": (3, 1),
+            "gpt-4-0314": (3, 1),
+            "gpt-4-32k-0314": (3, 1),
+            "gpt-4-0613": (3, 1),
+            "gpt-4-32k-0613": (3, 1),
         }
-        tokens_per_message, tokens_per_name = known_models.get(model, (3,1))
+        tokens_per_message, tokens_per_name = known_models.get(model, (3, 1))
         token_count = len(encoding.encode(text))

     if return_tokens_per_name_and_message:
@@ -66,13 +69,21 @@ def num_tokens_from_text(
         return token_count


-def num_tokens_from_messages(messages: dict, model: str = "gpt-3.5-turbo-0613", custom_token_count_function: Callable = None, custom_prime_count=3):
+def num_tokens_from_messages(
+    messages: dict,
+    model: str = "gpt-3.5-turbo-0613",
+    custom_token_count_function: Callable = None,
+    custom_prime_count=3,
+):
     """Return the number of tokens used by a list of messages."""
     num_tokens = 0
     for message in messages:
         for key, value in message.items():
             _num_tokens, tokens_per_message, tokens_per_name = num_tokens_from_text(
-                value, model=model, return_tokens_per_name_and_message=True, custom_token_count_function=custom_token_count_function
+                value,
+                model=model,
+                return_tokens_per_name_and_message=True,
+                custom_token_count_function=custom_token_count_function,
             )
             num_tokens += _num_tokens
             if key == "name":
diff --git a/test/test_retrieve_utils.py b/test/test_retrieve_utils.py
index 9c30bdc6de7..fdb93d26ca8 100644
--- a/test/test_retrieve_utils.py
+++ b/test/test_retrieve_utils.py
@@ -34,8 +34,12 @@ class TestRetrieveUtils:
     def test_num_tokens_from_text_custom_token_count_function(self):
         def custom_token_count_function(text):
             return len(text), 1, 2
+
         text = "This is a sample text."
-        assert num_tokens_from_text(text, return_tokens_per_name_and_message=True, custom_token_count_function=custom_token_count_function)==(22, 1, 2)
+        assert num_tokens_from_text(
+            text, return_tokens_per_name_and_message=True, custom_token_count_function=custom_token_count_function
+        ) == (22, 1, 2)
+
     def test_num_tokens_from_text(self):
         text = "This is a sample text."
         assert num_tokens_from_text(text) == len(tiktoken.get_encoding("cl100k_base").encode(text))

From 7fc065bc96e07cc1987b01d5af9baa8dc5061b64 Mon Sep 17 00:00:00 2001
From: Li Jiang
Date: Sun, 8 Oct 2023 08:16:21 +0000
Subject: [PATCH 3/4] Update comments

---
 autogen/retrieve_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/autogen/retrieve_utils.py b/autogen/retrieve_utils.py
index fa5e9c6a88d..7810b1d88e9 100644
--- a/autogen/retrieve_utils.py
+++ b/autogen/retrieve_utils.py
@@ -73,7 +73,7 @@ def num_tokens_from_messages(
     messages: dict,
     model: str = "gpt-3.5-turbo-0613",
     custom_token_count_function: Callable = None,
-    custom_prime_count=3,
+    custom_prime_count: int = 3,
 ):
     """Return the number of tokens used by a list of messages."""
     num_tokens = 0
@@ -89,7 +89,7 @@ def num_tokens_from_messages(
         if key == "name":
             num_tokens += tokens_per_name
         num_tokens += tokens_per_message
-    num_tokens += custom_prime_count  # By default every reply is primed with <|start|>assistant<|message|>
+    num_tokens += custom_prime_count  # ChatGPT every reply is primed with <|start|>assistant<|message|>
     return num_tokens
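As committed above, tokens_per_message is accumulated inside the per-key loop, so it is added once for each key in a message (role and content count separately), and custom_prime_count (default 3) is added once at the end for the assistant priming. A small worked example under the default gpt-3.5-turbo-0613 overheads (tokens_per_message=3); it assumes autogen and tiktoken are installed:

    from autogen.retrieve_utils import num_tokens_from_messages, num_tokens_from_text

    messages = [{"role": "user", "content": "Hi"}]
    expected = (
        num_tokens_from_text("user")  # tokens in the "role" value
        + num_tokens_from_text("Hi")  # tokens in the "content" value
        + 2 * 3  # tokens_per_message, added once per key (two keys here)
        + 3  # custom_prime_count: every reply is primed with <|start|>assistant<|message|>
    )
    assert num_tokens_from_messages(messages) == expected
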
From 3f3d137b96b089d5bc4765c456e196009408035e Mon Sep 17 00:00:00 2001
From: Li Jiang
Date: Mon, 9 Oct 2023 08:46:31 +0800
Subject: [PATCH 4/4] Improve docstrings

---
 .../contrib/retrieve_user_proxy_agent.py |  2 +-
 autogen/retrieve_utils.py                | 17 +++++++++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/autogen/agentchat/contrib/retrieve_user_proxy_agent.py b/autogen/agentchat/contrib/retrieve_user_proxy_agent.py
index 443755d2597..37b8c5c33a7 100644
--- a/autogen/agentchat/contrib/retrieve_user_proxy_agent.py
+++ b/autogen/agentchat/contrib/retrieve_user_proxy_agent.py
@@ -130,7 +130,7 @@ def __init__(
                 This is the same as that used in chromadb. Default is False.
             - custom_token_count_function (Optional, Callable): a custom function to count the number of tokens in a string.
                 The function should take a string as input and return three integers (token_count, tokens_per_message, tokens_per_name).
-                Default is None, tiktoken will be used and works well for OpenAI models, but may not be accurate for other models.
+                Default is None, tiktoken will be used and may not be accurate for non-OpenAI models.
             **kwargs (dict): other kwargs in [UserProxyAgent](../user_proxy_agent#__init__).
         """
         super().__init__(
diff --git a/autogen/retrieve_utils.py b/autogen/retrieve_utils.py
index 7810b1d88e9..fbe7c28784a 100644
--- a/autogen/retrieve_utils.py
+++ b/autogen/retrieve_utils.py
@@ -38,7 +38,20 @@ def num_tokens_from_text(
     return_tokens_per_name_and_message: bool = False,
     custom_token_count_function: Callable = None,
 ) -> Union[int, Tuple[int, int, int]]:
-    """Return the number of tokens used by a text."""
+    """Return the number of tokens used by a text.
+
+    Args:
+        text (str): The text to count tokens for.
+        model (Optional, str): The model to use for tokenization. Default is "gpt-3.5-turbo-0613".
+        return_tokens_per_name_and_message (Optional, bool): Whether to return the number of tokens per name and per
+            message. Default is False.
+        custom_token_count_function (Optional, Callable): A custom function to count tokens. Default is None.
+
+    Returns:
+        int: The number of tokens used by the text.
+        int: The number of tokens per message. Only returned if return_tokens_per_name_and_message is True.
+        int: The number of tokens per name. Only returned if return_tokens_per_name_and_message is True.
+    """
     if isinstance(custom_token_count_function, Callable):
         token_count, tokens_per_message, tokens_per_name = custom_token_count_function(text)
     else:
@@ -89,7 +102,7 @@ def num_tokens_from_messages(
         if key == "name":
             num_tokens += tokens_per_name
         num_tokens += tokens_per_message
-    num_tokens += custom_prime_count  # ChatGPT every reply is primed with <|start|>assistant<|message|>
+    num_tokens += custom_prime_count  # With ChatGPT, every reply is primed with <|start|>assistant<|message|>
    return num_tokens
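End to end, the series lets a caller plug a model-specific counter into the retrieval agent through retrieve_config. A sketch of that wiring; approx_token_count is the same kind of stand-in counter as above, and the name, task, and docs_path values are illustrative rather than taken from these patches:

    from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

    def approx_token_count(text: str):
        # (token_count, tokens_per_message, tokens_per_name), per the documented contract.
        return len(text.split()), 3, 1

    ragproxyagent = RetrieveUserProxyAgent(
        name="ragproxyagent",  # illustrative agent name
        retrieve_config={
            "task": "qa",  # illustrative task type
            "docs_path": "./docs",  # illustrative corpus location
            "custom_token_count_function": approx_token_count,
        },
    )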