Configurable context_window and tokenizer #1437

Merged · 2 commits · Dec 21, 2023
18 changes: 14 additions & 4 deletions private_gpt/components/llm/llm_component.py
@@ -1,11 +1,13 @@
 import logging
 
 from injector import inject, singleton
+from llama_index import set_global_tokenizer
 from llama_index.llms import MockLLM
 from llama_index.llms.base import LLM
+from transformers import AutoTokenizer  # type: ignore
 
 from private_gpt.components.llm.prompt_helper import get_prompt_style
-from private_gpt.paths import models_path
+from private_gpt.paths import models_cache_path, models_path
 from private_gpt.settings.settings import Settings
 
 logger = logging.getLogger(__name__)
@@ -18,6 +20,14 @@ class LLMComponent:
     @inject
     def __init__(self, settings: Settings) -> None:
         llm_mode = settings.llm.mode
+        if settings.llm.tokenizer:
+            set_global_tokenizer(
+                AutoTokenizer.from_pretrained(
+                    pretrained_model_name_or_path=settings.llm.tokenizer,
+                    cache_dir=str(models_cache_path),
+                )
+            )
+
         logger.info("Initializing the LLM in mode=%s", llm_mode)
         match settings.llm.mode:
             case "local":
@@ -29,9 +39,7 @@ def __init__(self, settings: Settings) -> None:
                     model_path=str(models_path / settings.local.llm_hf_model_file),
                     temperature=0.1,
                     max_new_tokens=settings.llm.max_new_tokens,
-                    # llama2 has a context window of 4096 tokens,
-                    # but we set it lower to allow for some wiggle room
-                    context_window=3900,
+                    context_window=settings.llm.context_window,
                     generate_kwargs={},
                     # All to GPU
                     model_kwargs={"n_gpu_layers": -1},
@@ -46,6 +54,8 @@ def __init__(self, settings: Settings) -> None:
 
                 self.llm = SagemakerLLM(
                     endpoint_name=settings.sagemaker.llm_endpoint_name,
+                    max_new_tokens=settings.llm.max_new_tokens,
+                    context_window=settings.llm.context_window,
                 )
             case "openai":
                 from llama_index.llms import OpenAI
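llama_index keeps a process-global tokenizer that it uses for prompt and context-window accounting, and the hunk above switches it from the default gpt-3.5-turbo tokenizer to the one named in settings. A minimal sketch of the same call pattern in isolation (hypothetical prompt text; assumes `llama-index` and `transformers` are installed and the tokenizer repo is reachable):

    from llama_index import set_global_tokenizer
    from transformers import AutoTokenizer

    # Mirrors the hunk above: an AutoTokenizer instance satisfies the
    # "has .encode" protocol that set_global_tokenizer accepts.
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
    set_global_tokenizer(tokenizer)

    # Token counts now reflect the model's own vocabulary, which is what
    # makes context_window budgeting accurate for non-OpenAI models.
    print(len(tokenizer.encode("How many tokens is this sentence?")))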
12 changes: 12 additions & 0 deletions private_gpt/settings/settings.py
@@ -86,6 +86,18 @@ class LLMSettings(BaseModel):
         256,
         description="The maximum number of token that the LLM is authorized to generate in one completion.",
     )
+    context_window: int = Field(
+        3900,
+        description="The maximum number of context tokens for the model.",
+    )
+    tokenizer: str = Field(
+        None,
+        description="The model id of a predefined tokenizer hosted inside a model repo on "
+        "huggingface.co. Valid model ids can be located at the root-level, like "
+        "`bert-base-uncased`, or namespaced under a user or organization name, "
+        "like `HuggingFaceH4/zephyr-7b-beta`. If not set, will load a tokenizer matching "
+        "gpt-3.5-turbo LLM.",
+    )
 
 
 class VectorstoreSettings(BaseModel):
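Both new fields default to the previous behavior: `context_window` keeps the old hard-coded 3900, and an unset `tokenizer` leaves llama_index on its gpt-3.5-turbo default, so existing deployments are unaffected. A hedged sketch (the `tokenizer_exists` helper is hypothetical, not part of this PR) for checking a tokenizer id before writing it into settings:

    from transformers import AutoTokenizer

    def tokenizer_exists(repo_id: str) -> bool:
        # Returns True if a tokenizer can be resolved for repo_id on huggingface.co.
        try:
            AutoTokenizer.from_pretrained(repo_id)
            return True
        except (OSError, ValueError):  # repo missing, gated, or lacking tokenizer files
            return False

    print(tokenizer_exists("HuggingFaceH4/zephyr-7b-beta"))  # expected: True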
16 changes: 13 additions & 3 deletions scripts/setup
@@ -3,6 +3,7 @@ import os
 import argparse
 
 from huggingface_hub import hf_hub_download, snapshot_download
+from transformers import AutoTokenizer
 
 from private_gpt.paths import models_path, models_cache_path
 from private_gpt.settings.settings import settings
@@ -15,25 +16,34 @@ if __name__ == '__main__':
     resume_download = args.resume
 
     os.makedirs(models_path, exist_ok=True)
-    embedding_path = models_path / "embedding"
 
+    # Download Embedding model
+    embedding_path = models_path / "embedding"
     print(f"Downloading embedding {settings().local.embedding_hf_model_name}")
     snapshot_download(
         repo_id=settings().local.embedding_hf_model_name,
         cache_dir=models_cache_path,
         local_dir=embedding_path,
     )
     print("Embedding model downloaded!")
-    print("Downloading models for local execution...")
 
+    # Download LLM and create a symlink to the model file
+    print(f"Downloading LLM {settings().local.llm_hf_model_file}")
     hf_hub_download(
         repo_id=settings().local.llm_hf_repo_id,
         filename=settings().local.llm_hf_model_file,
         cache_dir=models_cache_path,
         local_dir=models_path,
         resume_download=resume_download,
     )
-
     print("LLM model downloaded!")
+
+    # Download Tokenizer
+    print(f"Downloading tokenizer {settings().llm.tokenizer}")
+    AutoTokenizer.from_pretrained(
+        pretrained_model_name_or_path=settings().llm.tokenizer,
+        cache_dir=models_cache_path,
+    )
+    print("Tokenizer downloaded!")
 
     print("Setup done")
4 changes: 4 additions & 0 deletions settings.yaml
@@ -34,6 +34,10 @@ ui:
 
 llm:
   mode: local
+  # Should be matching the selected model
+  max_new_tokens: 512
+  context_window: 32768
+  tokenizer: mistralai/Mistral-7B-Instruct-v0.2
 
 embedding:
   # Should be matching the value above in most cases
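The `context_window: 32768` matches the full context length that Mistral-7B-Instruct-v0.2 advertises, a large jump from the old 3900 default. A quick sanity check of that number against the model's published config (assumes network access to huggingface.co; the repo may require authentication):

    from transformers import AutoConfig

    cfg = AutoConfig.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
    print(cfg.max_position_embeddings)  # expected: 32768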