diff --git a/private_gpt/components/llm/llm_component.py b/private_gpt/components/llm/llm_component.py
index 88485f4e9..45c7f8186 100644
--- a/private_gpt/components/llm/llm_component.py
+++ b/private_gpt/components/llm/llm_component.py
@@ -1,11 +1,13 @@
 import logging
 
 from injector import inject, singleton
+from llama_index import set_global_tokenizer
 from llama_index.llms import MockLLM
 from llama_index.llms.base import LLM
+from transformers import AutoTokenizer  # type: ignore
 
 from private_gpt.components.llm.prompt_helper import get_prompt_style
-from private_gpt.paths import models_path
+from private_gpt.paths import models_cache_path, models_path
 from private_gpt.settings.settings import Settings
 
 logger = logging.getLogger(__name__)
@@ -18,6 +20,14 @@ class LLMComponent:
     @inject
     def __init__(self, settings: Settings) -> None:
         llm_mode = settings.llm.mode
+        if settings.llm.tokenizer:
+            set_global_tokenizer(
+                AutoTokenizer.from_pretrained(
+                    pretrained_model_name_or_path=settings.llm.tokenizer,
+                    cache_dir=str(models_cache_path),
+                )
+            )
+
         logger.info("Initializing the LLM in mode=%s", llm_mode)
         match settings.llm.mode:
             case "local":
@@ -29,9 +39,7 @@ def __init__(self, settings: Settings) -> None:
                     model_path=str(models_path / settings.local.llm_hf_model_file),
                     temperature=0.1,
                     max_new_tokens=settings.llm.max_new_tokens,
-                    # llama2 has a context window of 4096 tokens,
-                    # but we set it lower to allow for some wiggle room
-                    context_window=3900,
+                    context_window=settings.llm.context_window,
                     generate_kwargs={},
                     # All to GPU
                     model_kwargs={"n_gpu_layers": -1},
@@ -46,6 +54,8 @@ def __init__(self, settings: Settings) -> None:
 
                 self.llm = SagemakerLLM(
                     endpoint_name=settings.sagemaker.llm_endpoint_name,
+                    max_new_tokens=settings.llm.max_new_tokens,
+                    context_window=settings.llm.context_window,
                 )
             case "openai":
                 from llama_index.llms import OpenAI
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index 8b03f6111..06d8a70bd 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -86,6 +86,18 @@ class LLMSettings(BaseModel):
         256,
         description="The maximum number of token that the LLM is authorized to generate in one completion.",
     )
+    context_window: int = Field(
+        3900,
+        description="The maximum number of context tokens for the model.",
+    )
+    tokenizer: str = Field(
+        None,
+        description="The model id of a predefined tokenizer hosted inside a model repo on "
+        "huggingface.co. Valid model ids can be located at the root-level, like "
+        "`bert-base-uncased`, or namespaced under a user or organization name, "
+        "like `HuggingFaceH4/zephyr-7b-beta`. If not set, will load a tokenizer matching "
+        "gpt-3.5-turbo LLM.",
+    )
 
 
 class VectorstoreSettings(BaseModel):
diff --git a/scripts/setup b/scripts/setup
index 7c1f44464..e08516a2b 100755
--- a/scripts/setup
+++ b/scripts/setup
@@ -3,6 +3,7 @@ import os
 import argparse
 
 from huggingface_hub import hf_hub_download, snapshot_download
+from transformers import AutoTokenizer
 
 from private_gpt.paths import models_path, models_cache_path
 from private_gpt.settings.settings import settings
@@ -15,8 +16,9 @@ if __name__ == '__main__':
     resume_download = args.resume
 
 os.makedirs(models_path, exist_ok=True)
-embedding_path = models_path / "embedding"
+# Download Embedding model
+embedding_path = models_path / "embedding"
 
 print(f"Downloading embedding {settings().local.embedding_hf_model_name}")
 snapshot_download(
     repo_id=settings().local.embedding_hf_model_name,
@@ -24,9 +26,9 @@ snapshot_download(
     local_dir=embedding_path,
 )
 print("Embedding model downloaded!")
-print("Downloading models for local execution...")
 
 # Download LLM and create a symlink to the model file
+print(f"Downloading LLM {settings().local.llm_hf_model_file}")
 hf_hub_download(
     repo_id=settings().local.llm_hf_repo_id,
     filename=settings().local.llm_hf_model_file,
@@ -34,6 +36,14 @@ hf_hub_download(
     local_dir=models_path,
     resume_download=resume_download,
 )
-
 print("LLM model downloaded!")
+
+# Download Tokenizer
+print(f"Downloading tokenizer {settings().llm.tokenizer}")
+AutoTokenizer.from_pretrained(
+    pretrained_model_name_or_path=settings().llm.tokenizer,
+    cache_dir=models_cache_path,
+)
+print("Tokenizer downloaded!")
+
 print("Setup done")
diff --git a/settings.yaml b/settings.yaml
index c3929539a..7521515e6 100644
--- a/settings.yaml
+++ b/settings.yaml
@@ -34,6 +34,10 @@ ui:
 
 llm:
   mode: local
+  # Should be matching the selected model
+  max_new_tokens: 512
+  context_window: 32768
+  tokenizer: mistralai/Mistral-7B-Instruct-v0.2
 
 embedding:
   # Should be matching the value above in most cases
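
For reference, a minimal sketch of what the new `llm.tokenizer` setting is for (illustrative only, not part of the patch): llama-index budgets prompt and context tokens with its global tokenizer, and this change points that tokenizer at the Hugging Face tokenizer of the configured model instead of the default gpt-3.5-turbo one. The snippet below, assuming `transformers` and `tiktoken` are installed and the Mistral tokenizer can be downloaded, shows how the two tokenizers count the same prompt differently, which is why counting with the wrong one can over- or under-fill the model's context window.

# Illustrative sketch, not part of the patch: compare token counts between the
# tokenizer of the configured local model and the default gpt-3.5-turbo tokenizer.
from transformers import AutoTokenizer  # type: ignore
import tiktoken

prompt = "Summarize the attached document in three bullet points."

# Tokenizer matching the local model (the one `scripts/setup` now downloads).
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
# Tokenizer llama-index falls back to when `llm.tokenizer` is not set.
openai_tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

print("Mistral tokens:      ", len(mistral_tokenizer.encode(prompt)))
print("gpt-3.5-turbo tokens:", len(openai_tokenizer.encode(prompt)))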