diff --git a/private_gpt/components/llm/llm_component.py b/private_gpt/components/llm/llm_component.py
index 351513e46..500ed02e2 100644
--- a/private_gpt/components/llm/llm_component.py
+++ b/private_gpt/components/llm/llm_component.py
@@ -39,16 +39,24 @@ def __init__(self, settings: Settings) -> None:
                     ) from e

                 prompt_style = get_prompt_style(settings.llamacpp.prompt_style)
+                settings_kwargs = {
+                    "tfs_z": settings.llamacpp.tfs_z,  # ollama and llama-cpp
+                    "top_k": settings.llamacpp.top_k,  # ollama and llama-cpp
+                    "top_p": settings.llamacpp.top_p,  # ollama and llama-cpp
+                    "repeat_penalty": settings.llamacpp.repeat_penalty,  # ollama and llama-cpp
+                    "n_gpu_layers": -1,
+                    "offload_kqv": True,
+                }

                 self.llm = LlamaCPP(
                     model_path=str(models_path / settings.llamacpp.llm_hf_model_file),
-                    temperature=0.1,
+                    temperature=settings.llm.temperature,
                     max_new_tokens=settings.llm.max_new_tokens,
                     context_window=settings.llm.context_window,
                     generate_kwargs={},
                     callback_manager=LlamaIndexSettings.callback_manager,
                     # All to GPU
-                    model_kwargs={"n_gpu_layers": -1, "offload_kqv": True},
+                    model_kwargs=settings_kwargs,
                     # transform inputs into Llama2 format
                     messages_to_prompt=prompt_style.messages_to_prompt,
                     completion_to_prompt=prompt_style.completion_to_prompt,
@@ -81,6 +89,7 @@ def __init__(self, settings: Settings) -> None:
                     api_base=openai_settings.api_base,
                     api_key=openai_settings.api_key,
                     model=openai_settings.model,
+                    temperature=settings.llm.temperature,
                 )
             case "openailike":
                 try:
@@ -108,8 +117,22 @@ def __init__(self, settings: Settings) -> None:
                     ) from e

                 ollama_settings = settings.ollama
+
+                settings_kwargs = {
+                    "tfs_z": ollama_settings.tfs_z,  # ollama and llama-cpp
+                    "num_predict": ollama_settings.num_predict,  # ollama only
+                    "top_k": ollama_settings.top_k,  # ollama and llama-cpp
+                    "top_p": ollama_settings.top_p,  # ollama and llama-cpp
+                    "repeat_last_n": ollama_settings.repeat_last_n,  # ollama only
+                    "repeat_penalty": ollama_settings.repeat_penalty,  # ollama and llama-cpp
+                }
+
                 self.llm = Ollama(
-                    model=ollama_settings.llm_model, base_url=ollama_settings.api_base
+                    model=ollama_settings.llm_model,
+                    base_url=ollama_settings.api_base,
+                    temperature=settings.llm.temperature,
+                    context_window=settings.llm.context_window,
+                    additional_kwargs=settings_kwargs,
                 )
             case "mock":
                 self.llm = MockLLM()
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index cbb890237..40afb6b1e 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -98,6 +98,10 @@ class LLMSettings(BaseModel):
         "like `HuggingFaceH4/zephyr-7b-beta`. If not set, will load a tokenizer matching "
         "gpt-3.5-turbo LLM.",
     )
+    temperature: float = Field(
+        0.1,
+        description="The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual.",
+    )


 class VectorstoreSettings(BaseModel):
@@ -118,6 +122,22 @@ class LlamaCPPSettings(BaseModel):
             "`llama2` is the historic behaviour. `default` might work better with your custom models."
         ),
     )
+    tfs_z: float = Field(
+        1.0,
+        description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
+    )
+    top_k: int = Field(
+        40,
+        description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
+    )
+    top_p: float = Field(
+        0.9,
+        description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
+    )
+    repeat_penalty: float = Field(
+        1.1,
+        description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
+    )


 class HuggingFaceSettings(BaseModel):
@@ -184,6 +204,30 @@ class OllamaSettings(BaseModel):
         None,
         description="Model to use. Example: 'nomic-embed-text'.",
     )
+    tfs_z: float = Field(
+        1.0,
+        description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
+    )
+    num_predict: int = Field(
+        128,
+        description="Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)",
+    )
+    top_k: int = Field(
+        40,
+        description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
+    )
+    top_p: float = Field(
+        0.9,
+        description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
+    )
+    repeat_last_n: int = Field(
+        64,
+        description="Sets how far back the model looks to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)",
+    )
+    repeat_penalty: float = Field(
+        1.1,
+        description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
+    )


 class UISettings(BaseModel):
diff --git a/settings-ollama.yaml b/settings-ollama.yaml
index 4f2cab4d8..e494a803d 100644
--- a/settings-ollama.yaml
+++ b/settings-ollama.yaml
@@ -5,6 +5,7 @@ llm:
   mode: ollama
   max_new_tokens: 512
   context_window: 3900
+  temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

 embedding:
   mode: ollama
@@ -13,6 +14,12 @@ ollama:
   llm_model: mistral
   embedding_model: nomic-embed-text
   api_base: http://localhost:11434
+  tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
+  num_predict: 128 # Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)
+  top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
+  top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
+  repeat_last_n: 64 # Sets how far back the model looks to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
+  repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)

 vectorstore:
   database: qdrant
diff --git a/settings.yaml b/settings.yaml
index 9d3cd0737..a9a676bdb 100644
--- a/settings.yaml
+++ b/settings.yaml
@@ -39,11 +39,16 @@ llm:
   # Should be matching the selected model
   max_new_tokens: 512
   context_window: 3900
+  temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

 llamacpp:
   prompt_style: "mistral"
   llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
   llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
+  tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
+  top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
+  top_p: 1.0 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
+  repeat_penalty: 1.1 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)

 embedding:
   # Should be matching the value above in most cases
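
For anyone who wants to try the new knobs in isolation before reviewing, here is a minimal sketch of the equivalent direct llama-index call. It mirrors the `Ollama(...)` constructor arguments this patch introduces; the model name, URL, and option values are illustrative defaults taken from `settings-ollama.yaml`, and it assumes a running local Ollama server plus the `llama-index-llms-ollama` package.

```python
# Minimal sketch (not part of the patch): exercising the same Ollama
# constructor arguments the change above wires up from settings.
from llama_index.llms.ollama import Ollama

settings_kwargs = {
    "tfs_z": 1.0,          # 1.0 disables tail free sampling
    "num_predict": 128,    # max tokens to generate (-1 = infinite, -2 = fill context)
    "top_k": 40,
    "top_p": 0.9,
    "repeat_last_n": 64,   # lookback window for the repetition penalty
    "repeat_penalty": 1.1,
}

llm = Ollama(
    model="mistral",                    # illustrative; ollama.llm_model in settings
    base_url="http://localhost:11434",  # illustrative; ollama.api_base in settings
    temperature=0.1,                    # now sourced from settings.llm.temperature
    context_window=3900,                # now sourced from settings.llm.context_window
    additional_kwargs=settings_kwargs,  # forwarded to Ollama as model options
)

print(llm.complete("Why is the sky blue?"))
```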