fix: fix the prompt templates

umbertogriffo committed Jul 29, 2024
commit 3c46741 · 1 parent cbd9a33
Showing 10 changed files with 82 additions and 158 deletions.
8 changes: 3 additions & 5 deletions README.md
@@ -138,15 +138,13 @@ format.
| 🤖 Model | Supported | Model Size | Notes and link to the model |
|--------------------------------------------|-----------|------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `llama-3.1` Meta Llama 3.1 Instruct | ✅ | 8B | [link](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) |
| `llama-3` Meta Llama 3 Instruct | ✅ | 8B | [link](https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF) |
| `openchat-3.6` - OpenChat 3.6 | ✅ | 8B | **Recommended model** [link](https://huggingface.co/bartowski/openchat-3.6-8b-20240522-GGUF) |
| `openchat-3.5` - OpenChat 3.5 | ✅ | 7B | [link](https://huggingface.co/TheBloke/openchat-3.5-0106-GGUF) |
| `openchat-3.6` - OpenChat 3.6 | ✅ | 8B | [link](https://huggingface.co/bartowski/openchat-3.6-8b-20240522-GGUF) |
| `openchat-3.5` - OpenChat 3.5 | ✅ | 7B | **Recommended model** [link](https://huggingface.co/TheBloke/openchat-3.5-0106-GGUF) |
| `starling` Starling Beta | ✅ | 7B | Is trained from `Openchat-3.5-0106`. It's recommended if you prefer more verbosity over OpenChat - [link](https://huggingface.co/bartowski/Starling-LM-7B-beta-GGUF) |
| `neural-beagle` NeuralBeagle14 | ✅ | 7B | [link](https://huggingface.co/TheBloke/NeuralBeagle14-7B-GGUF) |
| `dolphin` Dolphin 2.6 Mistral DPO Laser | ✅ | 7B | [link](https://huggingface.co/TheBloke/dolphin-2.6-mistral-7B-dpo-laser-GGUF) |
| `zephyr` Zephyr Beta | ✅ | 7B | [link](https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF) |
| `mistral` Mistral OpenOrca | ✅ | 7B | [link](https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF) |
| `phi-3` Phi-3 Mini 4K Instruct | ✅ | 3.8B | [link](https://huggingface.co/bartowski/Phi-3.1-mini-4k-instruct-GGUF) |
| `phi-3` Phi-3.1 Mini 4K Instruct | ✅ | 3.8B | Set `max-new-tokens` up to `1024`. Not recommended for RAG. [link](https://huggingface.co/bartowski/Phi-3.1-mini-4k-instruct-GGUF) |
| `stablelm-zephyr` StableLM Zephyr OpenOrca | ✅ | 3B | [link](https://huggingface.co/TheBloke/stablelm-zephyr-3b-GGUF) |
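
Every model in the table ships as a GGUF file, so any of them can be loaded with llama-cpp-python. A minimal sketch, assuming a locally downloaded file and reusing the `n_ctx`/`n_threads`/`n_gpu_layers` values from the settings classes later in this diff:

```python
from llama_cpp import Llama

# Illustrative values mirroring the config dict in Llama31Settings below;
# the model path is an assumption about where the file was downloaded.
llm = Llama(
    model_path="models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
    n_ctx=4096,       # max sequence length
    n_threads=8,      # CPU threads to use
    n_gpu_layers=50,  # layers offloaded to GPU, if acceleration is available
)
```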

## Supported Response Synthesis strategies
7 changes: 7 additions & 0 deletions chatbot/bot/conversation/conversation_retrieval.py
@@ -95,6 +95,9 @@ def refine_question(self, question: str, max_new_tokens: int = 128) -> str:
conversation_awareness_prompt = self.llm.generate_refined_question_conversation_awareness_prompt(
question, chat_history
)

logger.info(f"--- Prompt:\n {conversation_awareness_prompt} \n---")

refined_question = self.llm.generate_answer(conversation_awareness_prompt, max_new_tokens=max_new_tokens)

logger.info(f"--- Refined Question: {refined_question} ---")
@@ -139,13 +142,17 @@ def answer(self, question: str, max_new_tokens: int = 512) -> Any:
conversation_awareness_prompt = self.llm.generate_refined_answer_conversation_awareness_prompt(
question, chat_history
)

logger.debug(f"--- Prompt:\n {conversation_awareness_prompt} \n---")

streamer = self.llm.start_answer_iterator_streamer(
conversation_awareness_prompt, max_new_tokens=max_new_tokens
)

return streamer
else:
prompt = self.llm.generate_qa_prompt(question=question)
logger.debug(f"--- Prompt:\n {prompt} \n---")
streamer = self.llm.start_answer_iterator_streamer(prompt, max_new_tokens=max_new_tokens)
return streamer

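For context, a hypothetical driver for the two methods patched above; the class name, constructor shape, and the streamer's iteration protocol are assumptions, only `refine_question`/`answer` and their signatures come from this diff:

```python
from bot.conversation.conversation_retrieval import ConversationRetrieval  # assumed module path


def chat_once(llm, question: str) -> None:
    # Constructor shape is an assumption; the diff only shows instance methods.
    session = ConversationRetrieval(llm=llm)
    # refine_question now logs the conversation-awareness prompt it builds.
    standalone = session.refine_question(question)
    # answer returns a token streamer, assumed here to be iterable.
    for token in session.answer(standalone, max_new_tokens=512):
        print(token, end="", flush=True)
```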
7 changes: 1 addition & 6 deletions chatbot/bot/model/model_settings.py
@@ -1,9 +1,8 @@
from enum import Enum

from bot.model.settings.dolphin import DolphinSettings
from bot.model.settings.llama_3 import Llama3Settings, Llama31Settings
from bot.model.settings.llama_3 import Llama31Settings
from bot.model.settings.mistral import MistralSettings
from bot.model.settings.neural_beagle import NeuralBeagleSettings
from bot.model.settings.openchat import OpenChat35Settings, OpenChat36Settings
from bot.model.settings.phi_3 import PhiThreeSettings
from bot.model.settings.stablelm_zephyr import StableLMZephyrSettings
@@ -19,9 +18,7 @@ class ModelType(Enum):
OPENCHAT_3_5 = "openchat-3.5"
OPENCHAT_3_6 = "openchat-3.6"
STARLING = "starling"
NEURAL_BEAGLE = "neural-beagle"
PHI_3 = "phi-3"
LLAMA_3 = "llama-3"
LLAMA_3_1 = "llama-3.1"


@@ -33,9 +30,7 @@ class ModelType(Enum):
ModelType.OPENCHAT_3_5.value: OpenChat35Settings,
ModelType.OPENCHAT_3_6.value: OpenChat36Settings,
ModelType.STARLING.value: StarlingSettings,
ModelType.NEURAL_BEAGLE.value: NeuralBeagleSettings,
ModelType.PHI_3.value: PhiThreeSettings,
ModelType.LLAMA_3.value: Llama3Settings,
ModelType.LLAMA_3_1.value: Llama31Settings,
}

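A short sketch of how the trimmed registry might be consumed, assuming the mapping shown above is exposed as `SUPPORTED_MODELS` (its name is not visible in this hunk); the attribute reads rely on the class-level fields defined in the settings classes below:

```python
from bot.model.model_settings import SUPPORTED_MODELS, ModelType  # mapping name is an assumption

# Resolve a model identifier (e.g. from a CLI flag) to its settings class.
settings_cls = SUPPORTED_MODELS[ModelType.LLAMA_3_1.value]
print(settings_cls.file_name)  # Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
print(settings_cls.url)        # direct GGUF download URL
```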
35 changes: 15 additions & 20 deletions chatbot/bot/model/settings/llama_3.py
@@ -2,21 +2,19 @@
from bot.model.model import Model


class Llama3Settings(Model):
url = "https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
file_name = "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
class Llama31Settings(Model):
url = "https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
file_name = "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
clients = [LlmClientType.LAMA_CPP]
config = {
"n_ctx": 4096, # The max sequence length to use - note that longer sequence lengths require much more resources
"n_threads": 8, # The number of CPU threads to use, tailor to your system and the resulting performance
"n_gpu_layers": 50, # The number of layers to offload to GPU, if you have GPU acceleration available
}
config_answer = {"temperature": 0.7, "stop": []}
system_template = (
"<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are a helpful, respectful and "
"honest assistant. <|eot_id|><|start_header_id|>user<|end_header_id|>"
)
qa_prompt_template = """{system}\n
system_template = "You are a helpful, respectful and honest assistant."
qa_prompt_template = """<|start_header_id|>System<|end_header_id|>\n
{system}<|eot_id|><|start_header_id|>user<|end_header_id|>\n
Answer the question below:
{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
@@ -28,7 +26,8 @@ class Llama3Settings(Model):
Given the context information and not prior knowledge, answer the question below:
{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
refined_ctx_prompt_template = """{system}\n
refined_ctx_prompt_template = """<|start_header_id|>System<|end_header_id|>\n
{system}<|eot_id|><|start_header_id|>user<|end_header_id|>\n
{question}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer
@@ -40,7 +39,8 @@ class Llama3Settings(Model):
If the context isn't useful, return the original answer.
Refined Answer:<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
refined_question_conversation_awareness_prompt_template = """{system}\n
refined_question_conversation_awareness_prompt_template = """<|start_header_id|>System<|end_header_id|>\n
{system}<|eot_id|><|start_header_id|>user<|end_header_id|>\n
Chat History:
---------------------
{chat_history}
@@ -50,11 +50,12 @@ class Llama3Settings(Model):
Standalone question:<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

refined_answer_conversation_awareness_prompt_template = """
refined_answer_conversation_awareness_prompt_template = """<|start_header_id|>System<|end_header_id|>
{system}<|eot_id|><|start_header_id|>user<|end_header_id|>
You are engaging in a conversation with a human participant who is unaware that they might be
interacting with a machine. \n
Your goal is to respond in a way that convincingly simulates human-like intelligence and behavior. \n
The conversation should be natural, coherent, and contextually relevant. \n
interacting with a machine.
Your goal is to respond in a way that convincingly simulates human-like intelligence and behavior.
The conversation should be natural, coherent, and contextually relevant.
Chat History:
---------------------
{chat_history}
@@ -66,9 +67,3 @@ class Llama3Settings(Model):
Please also don't reformulate the follow up question, and write just a concise answer.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""


class Llama31Settings(Llama3Settings):
url = "https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
file_name = "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
clients = [LlmClientType.LAMA_CPP]
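
To see what the reworked Llama 3.1 template expands to, a quick sketch; the question text is illustrative, and the real client may add further special tokens around the result:

```python
from bot.model.settings.llama_3 import Llama31Settings

# Fill the new QA template: {system} and {question} are its only placeholders.
prompt = Llama31Settings.qa_prompt_template.format(
    system=Llama31Settings.system_template,
    question="What does Llama 3.1 change over Llama 3?",
)
print(prompt)
```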
71 changes: 0 additions & 71 deletions chatbot/bot/model/settings/neural_beagle.py

This file was deleted.

37 changes: 17 additions & 20 deletions chatbot/bot/model/settings/openchat.py
@@ -12,20 +12,20 @@ class OpenChat35Settings(Model):
"n_gpu_layers": 50, # The number of layers to offload to GPU, if you have GPU acceleration available
}
config_answer = {"temperature": 0.7, "stop": []}
system_template = "You are a helpful, respectful and honest assistant. "
qa_prompt_template = """{system}\n
system_template = ""
qa_prompt_template = """
GPT4 Correct User: Answer the question below:
{question}<|end_of_turn|>GPT4 Correct Assistant:
"""
ctx_prompt_template = """{system}\n
ctx_prompt_template = """
GPT4 Correct User: Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the question below:
{question}<|end_of_turn|>GPT4 Correct Assistant:
"""
refined_ctx_prompt_template = """{system}\n
refined_ctx_prompt_template = """
GPT4 Correct User: The original query is as follows: {question}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer
@@ -37,7 +37,7 @@ class OpenChat35Settings(Model):
If the context isn't useful, return the original answer.
Refined Answer:<|end_of_turn|>GPT4 Correct Assistant:
"""
refined_question_conversation_awareness_prompt_template = """{system}\n
refined_question_conversation_awareness_prompt_template = """
GPT4 Correct User: Chat History:
---------------------
{chat_history}
@@ -49,9 +49,9 @@

refined_answer_conversation_awareness_prompt_template = """
GPT4 Correct User: You are engaging in a conversation with a human participant who is unaware that they might be
interacting with a machine. \n
Your goal is to respond in a way that convincingly simulates human-like intelligence and behavior. \n
The conversation should be natural, coherent, and contextually relevant. \n
interacting with a machine.
Your goal is to respond in a way that convincingly simulates human-like intelligence and behavior.
The conversation should be natural, coherent, and contextually relevant.
Chat History:
---------------------
{chat_history}
@@ -76,23 +76,20 @@ class OpenChat36Settings(Model):
"flash_attn": False, # Use flash attention.
}
config_answer = {"temperature": 0.7, "stop": []}
system_template = (
"<|start_header_id|>system<|end_header_id|>You are a helpful, respectful and "
"honest assistant. <|eot_id|><|start_header_id|>GPT4 Correct User<|end_header_id|>"
)
qa_prompt_template = """{system}\n
system_template = ""
qa_prompt_template = """<|start_header_id|>GPT4 Correct User<|end_header_id|>\n
Answer the question below:
{question}<|eot_id|><|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n
"""
ctx_prompt_template = """{system}\n
ctx_prompt_template = """<|begin_of_text|><|start_header_id|>GPT4 Correct User<|end_header_id|>\n
Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the question below:
{question}<|eot_id|><|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n
"""
refined_ctx_prompt_template = """{system}\n
refined_ctx_prompt_template = """<|start_header_id|>GPT4 Correct User<|end_header_id|>\n
The original query is as follows: {question}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer
@@ -104,7 +101,7 @@ class OpenChat36Settings(Model):
If the context isn't useful, return the original answer.
Refined Answer:<|eot_id|><|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n
"""
refined_question_conversation_awareness_prompt_template = """{system}\n
refined_question_conversation_awareness_prompt_template = """<|start_header_id|>GPT4 Correct User<|end_header_id|>\n
Chat History:
---------------------
{chat_history}
@@ -114,11 +111,11 @@
Standalone question:<|eot_id|><|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n
"""

refined_answer_conversation_awareness_prompt_template = """
refined_answer_conversation_awareness_prompt_template = """<|start_header_id|>GPT4 Correct User<|end_header_id|>\n
You are engaging in a conversation with a human participant who is unaware that they might be
interacting with a machine. \n
Your goal is to respond in a way that convincingly simulates human-like intelligence and behavior. \n
The conversation should be natural, coherent, and contextually relevant. \n
interacting with a machine.
Your goal is to respond in a way that convincingly simulates human-like intelligence and behavior.
The conversation should be natural, coherent, and contextually relevant.
Chat History:
---------------------
{chat_history}
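The OpenChat 3.5 QA template, by contrast, no longer carries a `{system}` slot after this commit, so only the question is substituted; a minimal sketch with an illustrative question:

```python
from bot.model.settings.openchat import OpenChat35Settings

# Only {question} remains in the template; system_template is now empty.
prompt = OpenChat35Settings.qa_prompt_template.format(
    question="Which response synthesis strategies are supported?",
)
print(prompt)  # GPT4 Correct User: ... <|end_of_turn|>GPT4 Correct Assistant:
```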
