From c99f888f3e7f4abd940677dc7e877d40bd0f9c69 Mon Sep 17 00:00:00 2001 From: Ritchie Ng Date: Sun, 10 Mar 2024 16:48:27 +0800 Subject: [PATCH] Completed LLM introduction tutorial --- .../llm/llm_intro_hyperparameter_tuning.ipynb | 542 +++++++++--------- .../llm/llm_intro_hyperparameter_tuning.md | 536 ++++++++++------- docs/language_model/rag/rag_intro.ipynb | 4 +- 3 files changed, 599 insertions(+), 483 deletions(-) diff --git a/docs/language_model/llm/llm_intro_hyperparameter_tuning.ipynb b/docs/language_model/llm/llm_intro_hyperparameter_tuning.ipynb index 9e46d9ff3..b9111c33b 100644 --- a/docs/language_model/llm/llm_intro_hyperparameter_tuning.ipynb +++ b/docs/language_model/llm/llm_intro_hyperparameter_tuning.ipynb @@ -29,7 +29,7 @@ "id": "fe13995d-0b3a-468d-8a7c-c6f36969e78c", "metadata": {}, "source": [ - "Follow our [tutorial on Apptainer](https://www.deeplearningwizard.com/language_model/containers/hpc_containers_apptainer/) to get started. Once you have followed the tutorial and you completed the [Ollama, LlamaIndex and Gemma:7b](https://www.deeplearningwizard.com/language_model/containers/hpc_containers_apptainer/#ollama-gemma-workload section), you will be able to run `jupyter lab` in a new window to access and run this notebook.\n", + "Follow our [tutorial on Apptainer](https://www.deeplearningwizard.com/language_model/containers/hpc_containers_apptainer/) to get started. Once you have followed the tutorial till the [Ollama section](https://www.deeplearningwizard.com/language_model/containers/hpc_containers_apptainer/#ollama-gemma-workloads) where you successfully ran `ollama serve` and `ollama run gemma:7b`, you can run the `apptainer shell --nv --nvccli apptainer_container_0.1.sif` command followed by `jupyter lab` to access and run this notebook.\n", "\n", "!!! info \"Directory Guide\"\n", "\n", @@ -52,6 +52,16 @@ "In this section, we will leverage on the `Gemma:7b` LLM model to ask basic questions to get responses." ] }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d13f90c6-d6b3-429c-b1d6-2e5e07c409fe", + "metadata": {}, + "outputs": [], + "source": [ + "import ollama" + ] + }, { "cell_type": "markdown", "id": "e5f2e169-c225-406a-8eb2-33f001704587", @@ -62,39 +72,34 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "dbdc4de1-24be-4e3d-a4fa-483a4f3cd901", + "execution_count": 2, + "id": "9654b6da-f290-41f6-a1da-aa70422261ef", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/conda/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "Singapore is a city-state located on the island of Singapore. It is a Southeast Asian country and is known for its high standard of living, cleanliness, and efficiency.\n" + "Singapore is a city-state located on the island of Singapore, a Southeast Asian island. It is a highly developed city known for its modern architecture, efficient transportation system, and vibrant cultural diversity.\n" ] } ], "source": [ - "# Import the Ollama class from the llama_index.llms.ollama module.\n", - "from llama_index.llms.ollama import Ollama\n", - "\n", - "# Create an instance of the Ollama class. 
The \"gemma:7b\" argument specifies the model to be used.\n", - "llm = Ollama(model=\"gemma:7b\")\n", + "# The 'chat' function is called with two parameters: 'model' and 'messages'.\n", + "response = ollama.chat(\n", + " model='gemma:7b', # The 'model' parameter specifies the model to be used. Here, 'gemma:7b' is the model.\n", + " messages=[ # The 'messages' parameter is a list of message objects.\n", + " {\n", + " 'role': 'user', # Each message object has a 'role' key. It can be 'user' or 'assistant'.\n", + " 'content': 'What is Singapore?', # The 'content' key contains the actual content of the message.\n", + " },\n", + " ]\n", + ")\n", "\n", - "# Call the complete method on the Ollama instance. \n", - "# The method generates a completion for the given prompt \"What is Singapore?\".\n", - "response = llm.complete(\"What is Singapore?\")\n", - "\n", - "# Print the generated response\n", - "print(response)" + "# The 'chat' function returns a response object. \n", + "# The content of the assistant's message is accessed using the keys 'message' and 'content'.\n", + "# The 'print' function is used to display this content.\n", + "print(response['message']['content'])" ] }, { @@ -107,7 +112,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "cee2341f-da02-4b10-86ea-fac4676ac4f2", "metadata": {}, "outputs": [ @@ -115,13 +120,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "A Large Language Model (LLM) is a type of language model that has been trained on a massive amount of text data, typically billions or trillions of words. LLMs are designed to be able to understand and generate human-like text, engage in natural language processing tasks, and provide information and knowledge across a wide range of topics. LLMs are typically deep learning models that are trained using transformer architectures, such as the GPT-3 model.\n" + "Sure, a Large Language Model (LLM) is a type of language model that has been trained on a massive amount of text data and has the ability to engage in a wide range of natural language processing tasks. LLMs are typically designed to have a large number of parameters, which allows them to learn complex relationships between words and sentences. LLMs are often used for tasks such as text summarization, translation, and code generation.\n" ] } ], "source": [ - "response = llm.complete(\"What is a Large Language Model?\")\n", - "print(response)" + "response = ollama.chat(\n", + " model='gemma:7b',\n", + " messages=[\n", + " {\n", + " 'role': 'user',\n", + " 'content': 'What is a Large Language Model?',\n", + " },\n", + " ]\n", + ")\n", + "\n", + "print(response['message']['content'])" ] }, { @@ -129,7 +143,7 @@ "id": "f6b3a912-7f5f-44c6-aa9d-361ef008880c", "metadata": {}, "source": [ - "In our second question, we change the question to \"What is a Large Language Model?\" and you can observe how the answer is substantially longer than the first question \"What is Singapore\". In the next section, you will discover that this relates to a few hyperparemeters in LLMs that can be tweaked." + "In our second question, we change the question to \"What is a Large Language Model?\" and you can observe how the answer is slightly longer than the first question \"What is Singapore\". In the next section, you will discover that this relates to a few hyperparemeters in LLMs that can be tweaked." 
]
  },
 {
@@ -165,69 +179,45 @@
  },
 {
   "cell_type": "code",
-   "execution_count": 42,
-   "id": "e42a33fd-aed7-4717-97f3-1f3a68014d0c",
+   "execution_count": 17,
+   "id": "e38a97de-7bfc-4a53-a514-6feaadaf98c6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Sure, here's a happy birthday message for your friend:\n",
-      "\n",
-      "**Happy Birthday, [Friend's Name]!**\n",
-      "\n",
-      "I hope your day is filled with joy, laughter, and happiness. May all your wishes come true.\n",
-      "\n",
-      "Have a wonderful day, and I'm looking forward to celebrating with you soon.\n",
-      "\n",
-      "**Best regards,**\n",
-      "\n",
-      "[Your Name]\n"
+      "Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. May all your wishes come true! 🎉🎂\n"
     ]
    }
   ],
   "source": [
-    "# Set the prompt\n",
-    "prompt = \"Write a happy birthday message, I would like to send to my friend.\"\n",
-    "\n",
+    "# Create new model\n",
     "# Set the temperature\n",
-    "# Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.01) make it more deterministic\n",
-    "temperature = 0.01\n",
-    "# Instantiate the Ollama class again\n",
-    "llm = Ollama(model=\"gemma:7b\", temperature=temperature)\n",
+    "# Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.1) make it more deterministic\n",
+    "modelfile='''\n",
+    "FROM gemma:7b\n",
+    "PARAMETER temperature 0.1\n",
+    "'''\n",
+    "ollama.create('gemma_low_temp', modelfile=modelfile)\n",
     "\n",
-    "# Generate the response\n",
-    "response = llm.complete(prompt)\n",
+    "# Now you can use the new model with adjusted temperature\n",
+    "response = ollama.chat(\n",
+    "    model='gemma_low_temp',\n",
+    "    messages=[\n",
+    "        {\n",
+    "            'role': 'user',\n",
+    "            'content': 'Write a happy birthday message, I would like to send to my friend.',\n",
+    "        },\n",
+    "    ]\n",
+    ")\n",
     "\n",
-    "print(response)"
+    "print(response['message']['content'])"
   ]
  },
 {
   "cell_type": "code",
-   "execution_count": 43,
-   "id": "d4650747-696f-448a-ab64-a44f8b1e3114",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Ollama(callback_manager=, system_prompt=None, messages_to_prompt=, completion_to_prompt=, output_parser=None, pydantic_program_mode=, query_wrapper_prompt=None, base_url='http://localhost:11434', model='gemma:7b', temperature=0.01, context_window=3900, request_timeout=30.0, prompt_key='prompt', additional_kwargs={})"
-      ]
-     },
-     "execution_count": 43,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Check model\n",
-    "llm"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 18,
   "id": "c9686601-2c7d-46c6-831c-59eaf8f03188",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "----------\n",
     "Response 0\n",
     "----------\n",
-      "Sure, here's a happy birthday message for your friend:\n",
-      "\n",
-      "**Happy Birthday, [Friend's Name]!**\n",
-      "\n",
-      "I hope your day is filled with joy, laughter, and happiness. May all your wishes come true.\n",
-      "\n",
-      "Have a wonderful day, and I'm looking forward to celebrating with you soon.\n",
-      "\n",
-      "**Best regards,**\n",
-      "\n",
-      "[Your Name]\n",
+      "Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. May all your wishes come true! 
🎉🎂\n",
     "----------\n",
     "Response 1\n",
     "----------\n",
-      "Sure, here's a happy birthday message for your friend:\n",
-      "\n",
-      "**Happy Birthday, [Friend's Name]!**\n",
-      "\n",
-      "I hope your day is filled with joy, laughter, and happiness. May all your wishes come true.\n",
-      "\n",
-      "Have a wonderful day, and I'm looking forward to celebrating with you soon.\n",
-      "\n",
-      "**Best regards,**\n",
-      "\n",
-      "[Your Name]\n",
+      "Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. May all your wishes come true! 🎉🎂\n",
     "----------\n",
     "Response 2\n",
     "----------\n",
-      "Sure, here's a happy birthday message for your friend:\n",
-      "\n",
-      "**Happy Birthday, [Friend's Name]!**\n",
-      "\n",
-      "I hope your day is filled with joy, laughter, and happiness. May all your wishes come true.\n",
-      "\n",
-      "Have a wonderful day, and I'm looking forward to celebrating with you soon.\n",
-      "\n",
-      "**Best regards,**\n",
-      "\n",
-      "[Your Name]\n"
+      "Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. May all your wishes come true! 🎉🎂\n"
     ]
    }
   ],
   "source": [
    "# Run multiple times\n",
    "for i in range(3):\n",
-    "    # Set the prompt\n",
-    "    prompt = \"Write a happy birthday message, I would like to send to my friend.\"\n",
+    "    response = ollama.chat(\n",
+    "        model='gemma_low_temp',\n",
+    "        messages=[\n",
+    "            {\n",
+    "                'role': 'user',\n",
+    "                'content': 'Write a happy birthday message, I would like to send to my friend.',\n",
+    "            },\n",
+    "        ],\n",
+    "    )\n",
    "    \n",
-    "    # Set the temperature\n",
-    "    # Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.01) make it more deterministic\n",
-    "    temperature = 0.01\n",
-    "    # Instantiate the Ollama class again\n",
-    "    llm = Ollama(model=\"gemma:7b\", temperature=temperature)\n",
-    "    \n",
-    "    # Generate the response\n",
-    "    response = llm.complete(prompt)\n",
-    "\n",
    "    # Print \n",
    "    print('-'*10)\n",
    "    print(f'Response {i}') \n",
    "    print('-'*10)\n",
-    "    print(response)\n"
+    "    print(response['message']['content'])"
   ]
  },
 {
   "cell_type": "markdown",
   "id": "813b3cba-f8b6-415f-8991-255e4df454d7",
   "metadata": {},
   "source": [
-    "**We can see above it is almost the exact same answer calling the LLM 3 times**."
+    "**We can see above it is the exact same answer calling the LLM 3 times**."
   ]
  },
 {
   "cell_type": "markdown",
   "id": "a4f6da61-17c4-469f-a455-b610d0b4db33",
   "metadata": {},
   "source": [
    "#### High Temperature"
   ]
 },
 {
   "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 19,
   "id": "7417e317-6374-4551-9072-bd6c02a8c137",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Here are a few happy birthday messages you can send to your friend:\n",
-      "\n",
-      "**Short and sweet:**\n",
-      "\n",
-      "* \"Happy Birthday, [friend's name]! Wishing you a day filled with joy!\"\n",
-      "* \"Have a very happy birthday, [friend's name]! Can't wait to see you!\"\n",
-      "* \"Happy Birthday, [friend's name]! May your day be filled with happiness!\"\n",
-      "\n",
-      "**A little more personal:**\n",
-      "\n",
-      "* \"Happy Birthday, [friend's name]! I hope your day is as special as you are.\"\n",
-      "* \"Have a wonderful birthday, [friend's name]! I'm so glad I have you in my life.\"\n",
-      "* \"Wishing you a very happy birthday, [friend's name]. Let's celebrate this special day together!\"\n",
-      "\n",
-      "**Fun and cheeky:**\n",
-      "\n",
-      "* \"Happy Birthday, [friend's name]! I hope your day is filled with cake and laughter.\"\n",
-      "* \"Have a great birthday, [friend's name]! I'm not going to tell you how old you are... 
for now, at least.\"\n",
-      "* \"Happy Birthday, [friend's name]! May your day be filled with all your favorite things... even if it's me.\"\n"
+      "Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness! 🎉🎂 🎉\n"
     ]
    }
   ],
   "source": [
-    "# Set the prompt\n",
-    "prompt = \"Write a happy birthday message, I would like to send to my friend.\"\n",
-    "\n",
+    "# Create new model\n",
     "# Set the temperature\n",
-    "# Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.01) make it more deterministic\n",
-    "temperature = 1.0\n",
-    "# Instantiate the Ollama class again\n",
-    "llm = Ollama(model=\"gemma:7b\", temperature=temperature)\n",
+    "# Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.1) make it more deterministic\n",
+    "modelfile='''\n",
+    "FROM gemma:7b\n",
+    "PARAMETER temperature 1.0\n",
+    "'''\n",
+    "ollama.create('gemma_high_temp', modelfile=modelfile)\n",
     "\n",
-    "# Generate the response\n",
-    "response = llm.complete(prompt)\n",
+    "# Now you can use the new model with adjusted temperature\n",
+    "response = ollama.chat(\n",
+    "    model='gemma_high_temp',\n",
+    "    messages=[\n",
+    "        {\n",
+    "            'role': 'user',\n",
+    "            'content': 'Write a happy birthday message, I would like to send to my friend.',\n",
+    "        },\n",
+    "    ]\n",
+    ")\n",
     "\n",
-    "print(response)\n"
+    "print(response['message']['content'])"
   ]
  },
 {
   "cell_type": "code",
-   "execution_count": 47,
-   "id": "da0d7fd9-f562-40e0-a629-20323ebe08f0",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Ollama(callback_manager=, system_prompt=None, messages_to_prompt=, completion_to_prompt=, output_parser=None, pydantic_program_mode=, query_wrapper_prompt=None, base_url='http://localhost:11434', model='gemma:7b', temperature=1.0, context_window=3900, request_timeout=30.0, prompt_key='prompt', additional_kwargs={})"
-      ]
-     },
-     "execution_count": 47,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Check model\n",
-    "llm"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 49,
-   "id": "93888408-0818-43fb-b015-c60e8579f043",
+   "execution_count": 20,
+   "id": "0b5ca364-0c85-4026-ac72-de9392aed188",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
     "----------\n",
     "Response 0\n",
     "----------\n",
-      "Here are a few happy birthday messages you can send to your friend:\n",
-      "\n",
-      "**Classic Wishes:**\n",
-      "\n",
-      "* \"Happy Birthday, [Friend's Name]! Wishing you a day filled with joy, happiness, and laughter.\"\n",
-      "* \"Have a very happy birthday, [Friend's Name]! May your day be filled with sunshine and good times.\"\n",
-      "* \"Happy Birthday, my dear [Friend's Name]! I hope your day is as awesome as you are.\"\n",
-      "\n",
-      "**Personalized Wishes:**\n",
-      "\n",
-      "* \"Happy Birthday, [Friend's Name]! I hope your day is filled with [specific things you know your friend enjoys].\"\n",
-      "* \"I'm so glad it's your birthday, [Friend's Name]! I'm sending you a virtual hug and a bunch of birthday wishes.\"\n",
-      "* \"Wishing you a very happy birthday, [Friend's Name]! I can't wait to see what you have planned for this special day.\"\n",
-      "\n",
-      "**Fun and Quirky Wishes:**\n",
-      "\n",
-      "* \"Happy Birthday, [Friend's Name]! May your day be filled with cake and laughter... and maybe a sprinkle of unicorn magic.\"\n",
-      "* \"Have a very happy birthday, [Friend's Name]! I'm hoping your day is as memorable as a trip to the moon.\"\n",
-      "* \"Happy Birthday, [Friend's Name]! 
I'm sending you virtual balloons and a party hat big enough for the both of us.\"\n", + "Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. Let's celebrate your special day together!\n", "----------\n", "Response 1\n", "----------\n", - "Here are some happy birthday messages you can send to your friend:\n", - "\n", - "**Classic wishes:**\n", - "\n", - "* \"Happy Birthday, [friend's name]! May your day be filled with joy, laughter, and good times.\"\n", - "* \"Have a very happy birthday, [friend's name]! I hope all your wishes come true.\"\n", - "* \"Wishing you a very happy birthday, [friend's name]! I'm sending you warmest wishes for a day filled with happiness.\"\n", - "\n", - "**Personalized wishes:**\n", - "\n", - "* \"Happy Birthday, [friend's name]! I hope your day is as special as you are.\"\n", - "* \"Have a wonderful birthday, [friend's name]! I'm so glad to have you in my life.\"\n", - "* \"Sending you big birthday wishes, [friend's name]! I can't wait to see what you have planned.\"\n", - "\n", - "**Fun and cheesy:**\n", - "\n", - "* \"Happy Birthday, [friend's name]! I'm hoping you have a day as awesome as you are.\"\n", - "* \"Have a blast on your birthday, [friend's name]! I'm planning on eating a cake in your honor.\"\n", - "* \"I'm not a party pooper, but I'm definitely not attending your party, [friend's name]. Have a great day!\"\n", - "\n", - "**Remember:**\n", - "\n", - "* You can personalize the message with your friend's name and preferred gender-neutral pronouns.\n", - "* You can add a specific wish or goal you have for your friend.\n", - "* You can include a funny joke or a reference to a shared inside joke.\n", - "* You can keep the message short and sweet, or you can write a longer, more heartfelt message.\n", + "Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. Wishing you a very special day filled with memorable moments and sweet treats!\n", "----------\n", "Response 2\n", "----------\n", - "Sure, here's a happy birthday message you can send to your friend:\n", + "Sure, here is a happy birthday message you can send to your friend:\n", "\n", "**Happy Birthday, [Friend's Name]!**\n", "\n", - "May your day be filled with joy, laughter, and happiness. I hope your special day is filled with all your favorite things, and I'm wishing you a very, very happy birthday!\n" + "May your day be filled with joy, laughter, and happiness. I hope your special day is filled with everything you wish for. 
I'm sending you positive vibes and can't wait to see you soon.\n"
     ]
    }
   ],
   "source": [
    "# Run multiple times\n",
    "for i in range(3):\n",
+    "    response = ollama.chat(\n",
+    "        model='gemma_high_temp',\n",
+    "        messages=[\n",
+    "            {\n",
+    "                'role': 'user',\n",
+    "                'content': 'Write a happy birthday message, I would like to send to my friend.',\n",
+    "            },\n",
+    "        ],\n",
+    "    )\n",
    "    \n",
    "    # Print \n",
    "    print('-'*10)\n",
    "    print(f'Response {i}') \n",
    "    print('-'*10)\n",
+    "    print(response['message']['content'])"
   ]
  },
 {
   "cell_type": "markdown",
   "id": "8b684188-2490-4f29-ae2a-428810c61f28",
   "metadata": {},
   "source": [
-    "**Here, you can see very varied answer in each of the 3 calls to the LLM commpared to lower temperature.**"
+    "**Here, you can see very varied answers in each of the 3 calls to the LLM compared to lower temperature.**"
   ]
  },
 {
   "cell_type": "markdown",
   "id": "2a95e807-28e2-4307-8eb2-52e085188918",
   "metadata": {},
   "source": [
    "#### Mathematical Interpretation\n",
    "In LLMs, the `temperature` parameter is used to control the randomness of predictions by scaling the logits before applying soft(arg)max.\n",
    "\n",
    "- The model computes a score (also known as logits) for each possible next token based on the current context. These scores are then transformed into probabilities using the soft(arg)max function.\n",
    "\n",
    "- The soft(arg)max function is defined as follows:\n",
    "\n",
    "  $$\\text{softargmax}(x_i) = \\frac{e^{x_i}}{\\sum_j e^{x_j}}$$\n",
    "\n",
    "- Before applying soft(arg)max, the logits are divided by the `temperature` value. This process is called temperature scaling and the equation becomes:\n",
    "\n",
    "  $$\\text{softargmax}(x_i) = \\frac{e^{x_i/T}}{\\sum_j e^{x_j/T}}$$\n",
    "\n",
    "- When `T` > 1, it makes the distribution more uniform (increases randomness). When `T` < 1, it makes the distribution more peaky (reduces randomness).\n",
    "\n",
    "So, in simple terms, a higher temperature value makes the model's output more random, and a lower temperature makes it more deterministic."
   ]
  },
 {
   "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 21,
   "id": "9d458879-dad7-4cf6-b396-6fbded6f52a8",
   "metadata": {},
   "outputs": [
@@ -601,59 +482,94 @@
  },
 {
   "cell_type": "code",
-   "execution_count": 64,
-   "id": "17c769a2-7fc0-422a-968e-0a865a066c75",
+   "execution_count": 26,
+   "id": "e78503dc-b17b-4bd5-93cb-138c553179dc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Happy Birthday, [Friend's Name]! I hope your day is filled with joy, happiness, and laughter. May all your wishes come true. 🎉🎂\n"
+      "Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. 
May all your wishes come true!\n"
     ]
    }
   ],
   "source": [
-    "# Set the prompt\n",
-    "prompt = \"Write a happy birthday message, I would like to send to my friend.\"\n",
+    "# Create new model\n",
     "\n",
     "# Set the temperature\n",
-    "# Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.2) make it more deterministic\n",
-    "temperature = 0.8\n",
+    "# Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.1) make it more deterministic\n",
     "\n",
     "# Set the top_k\n",
     "# This parameter controls the number of tokens considered for each step of the generation process\n",
-    "top_k = 50\n",
-    "\n",
-    "# Instantiate the Ollama class again\n",
-    "llm = Ollama(model=\"gemma:7b\", temperature=temperature, top_k=top_k)\n",
+    "modelfile='''\n",
+    "FROM gemma:7b\n",
+    "PARAMETER temperature 0.5\n",
+    "PARAMETER top_k 3\n",
+    "'''\n",
+    "ollama.create('gemma_topk_3', modelfile=modelfile)\n",
     "\n",
-    "# Generate the response\n",
-    "response = llm.complete(prompt)\n",
+    "# Now you can use the new model with adjusted temperature and top_k\n",
+    "response = ollama.chat(\n",
+    "    model='gemma_topk_3',\n",
+    "    messages=[\n",
+    "        {\n",
+    "            'role': 'user',\n",
+    "            'content': 'Write a happy birthday message, I would like to send to my friend.',\n",
+    "        },\n",
+    "    ]\n",
+    ")\n",
     "\n",
-    "print(response)\n"
+    "print(response['message']['content'])"
   ]
  },
 {
   "cell_type": "code",
-   "execution_count": 65,
-   "id": "476c6919-a41f-455e-a17b-4aef2f43940f",
+   "execution_count": 27,
+   "id": "60c232a4-d6c0-43c4-bdd1-3ad0d8c108da",
   "metadata": {},
   "outputs": [
    {
-     "data": {
-      "text/plain": [
-       "Ollama(callback_manager=, system_prompt=None, messages_to_prompt=, completion_to_prompt=, output_parser=None, pydantic_program_mode=, query_wrapper_prompt=None, base_url='http://localhost:11434', model='gemma:7b', temperature=0.8, context_window=3900, request_timeout=30.0, prompt_key='prompt', additional_kwargs={})"
-      ]
-     },
-     "execution_count": 65,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "----------\n",
+      "Response 0\n",
+      "----------\n",
+      "Sure, here's a happy birthday message for your friend:\n",
+      "\n",
+      "**Happy Birthday, [Friend's Name]!**\n",
+      "\n",
+      "I hope your day is filled with joy, laughter, and happiness. May all your wishes come true. Have a blast! 🎉🎂🎉\n",
+      "----------\n",
+      "Response 1\n",
+      "----------\n",
+      "Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. May all your wishes come true. 🎉🎂\n",
+      "----------\n",
+      "Response 2\n",
+      "----------\n",
+      "Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. May all your wishes come true. 
🎉🎂\n"
     ]
    }
   ],
   "source": [
    "# Run multiple times\n",
    "for i in range(3):\n",
    "    response = ollama.chat(\n",
    "        model='gemma_topk_3',\n",
    "        messages=[\n",
    "            {\n",
    "                'role': 'user',\n",
    "                'content': 'Write a happy birthday message, I would like to send to my friend.',\n",
    "            },\n",
    "        ],\n",
    "    )\n",
    "    \n",
    "    # Print \n",
    "    print('-'*10)\n",
    "    print(f'Response {i}') \n",
    "    print('-'*10)\n",
    "    print(response['message']['content'])"
   ]
  },
 {
   "cell_type": "markdown",
   "id": "2e5d4e87-9b93-4a44-8a5f-c974bbcd7a93",
   "metadata": {},
   "source": [
    "#### High K"
   ]
 },
 {
   "cell_type": "code",
-   "execution_count": 7,
-   "id": "7953aaa8-c950-4086-97f4-6f326ff5eabb",
+   "execution_count": 24,
+   "id": "da33fecc-3218-4b65-a43c-0ac0f0ca7cd7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. 🎉🎂\n"
+      "Happy Birthday, [Friend's name]! I hope your day is filled with joy, laughter, and happiness! 🎉🎂\n"
     ]
    }
   ],
   "source": [
-    "# Set the prompt\n",
-    "prompt = \"Write a happy birthday message, I would like to send to my friend.\"\n",
+    "# Create new model\n",
     "\n",
     "# Set the temperature\n",
-    "# Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.2) make it more deterministic\n",
-    "temperature = 0.8\n",
+    "# Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.1) make it more deterministic\n",
     "\n",
     "# Set the top_k\n",
     "# This parameter controls the number of tokens considered for each step of the generation process\n",
-    "top_k = 100\n",
+    "modelfile='''\n",
+    "FROM gemma:7b\n",
+    "PARAMETER temperature 0.5\n",
+    "PARAMETER top_k 200\n",
+    "'''\n",
+    "ollama.create('gemma_topk_200', modelfile=modelfile)\n",
     "\n",
-    "# Generate the response\n",
-    "response = llm.complete(prompt, temperature=temperature, top_k=top_k)\n",
+    "# Now you can use the new model with adjusted temperature and top_k\n",
+    "response = ollama.chat(\n",
+    "    model='gemma_topk_200',\n",
+    "    messages=[\n",
+    "        {\n",
+    "            'role': 'user',\n",
+    "            'content': 'Write a happy birthday message, I would like to send to my friend.',\n",
+    "        },\n",
+    "    ]\n",
+    ")\n",
     "\n",
-    "print(response)\n"
+    "print(response['message']['content'])"
   ]
  },
 {
   "cell_type": "code",
+   "execution_count": 25,
+   "id": "43ebb158-6cfc-4933-af4d-20b417a43b51",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "----------\n",
      "Response 0\n",
      "----------\n",
      "Sure, here is a happy birthday message for your friend:\n",
      "\n",
      "**Happy Birthday, [Friend's Name]!**\n",
      "\n",
      "I hope your day is filled with joy, laughter, and happiness. May all your wishes come true today.\n",
      "\n",
      "Have a wonderful birthday, and I look forward to seeing you soon.\n",
      "\n",
      "**Best regards,**\n",
      "\n",
      "[Your Name]\n",
      "----------\n",
      "Response 1\n",
      "----------\n",
      "Sure, here's a happy birthday message for your friend:\n",
      "\n",
      "**Happy Birthday, [Friend's Name]!**\n",
      "\n",
      "May your day be filled with joy, laughter, and happiness. I hope your special day is filled with all your favorite things and that your wishes come true.\n",
      "\n",
      "**Have a wonderful birthday, my dear friend!**\n",
      "----------\n",
      "Response 2\n",
      "----------\n",
      "Sure, here's a happy birthday message for your friend:\n",
      "\n",
      "**Happy Birthday, [Friend's Name]!**\n",
      "\n",
      "May your day be filled with joy, laughter, and happiness. 
I hope you have a wonderful time celebrating your special day!\n" + ] + } + ], + "source": [ + "# Run multiple times\n", + "for i in range(3):\n", + " response = ollama.chat(\n", + " model='gemma_topk_200',\n", + " messages=[\n", + " {\n", + " 'role': 'user',\n", + " 'content': 'Write a happy birthday message, I would like to send to my friend.',\n", + " },\n", + " ],\n", + " )\n", + " \n", + " # Print \n", + " print('-'*10)\n", + " print(f'Response {i}') \n", + " print('-'*10)\n", + " print(response['message']['content'])" ] }, { @@ -730,7 +722,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 28, "id": "461b29c9-2dbc-4e3d-8256-befc38789af4", "metadata": {}, "outputs": [ diff --git a/docs/language_model/llm/llm_intro_hyperparameter_tuning.md b/docs/language_model/llm/llm_intro_hyperparameter_tuning.md index 5e3e78558..8ae53421f 100644 --- a/docs/language_model/llm/llm_intro_hyperparameter_tuning.md +++ b/docs/language_model/llm/llm_intro_hyperparameter_tuning.md @@ -4,7 +4,7 @@ In this tutorial, we will be covering LLMs leveraging on Ollama and LlamaIndex u ## Environment Setup -Follow our [tutorial on Apptainer](https://www.deeplearningwizard.com/language_model/containers/hpc_containers_apptainer/) to get started. Once you have followed the tutorial and you completed the [Ollama, LlamaIndex and Gemma:7b](https://www.deeplearningwizard.com/language_model/containers/hpc_containers_apptainer/#ollama-gemma-workload section), you will be able to run `jupyter lab` in a new window to access and run this notebook. +Follow our [tutorial on Apptainer](https://www.deeplearningwizard.com/language_model/containers/hpc_containers_apptainer/) to get started. Once you have followed the tutorial till the [Ollama section](https://www.deeplearningwizard.com/language_model/containers/hpc_containers_apptainer/#ollama-gemma-workloads) where you successfully ran `ollama serve` and `ollama run gemma:7b`, you can run the `apptainer shell --nv --nvccli apptainer_container_0.1.sif` command followed by `jupyter lab` to access and run this notebook. !!! info "Directory Guide" @@ -14,43 +14,56 @@ Follow our [tutorial on Apptainer](https://www.deeplearningwizard.com/language_m In this section, we will leverage on the `Gemma:7b` LLM model to ask basic questions to get responses. -### Question 1 - ```python -# Import the Ollama class from the llama_index.llms.ollama module. -from llama_index.llms.ollama import Ollama +import ollama +``` -# Create an instance of the Ollama class. The "gemma:7b" argument specifies the model to be used. -llm = Ollama(model="gemma:7b") +### Question 1 -# Call the complete method on the Ollama instance. -# The method generates a completion for the given prompt "What is Singapore?". -response = llm.complete("What is Singapore?") -# Print the generated response -print(response) +```python +# The 'chat' function is called with two parameters: 'model' and 'messages'. +response = ollama.chat( + model='gemma:7b', # The 'model' parameter specifies the model to be used. Here, 'gemma:7b' is the model. + messages=[ # The 'messages' parameter is a list of message objects. + { + 'role': 'user', # Each message object has a 'role' key. It can be 'user' or 'assistant'. + 'content': 'What is Singapore?', # The 'content' key contains the actual content of the message. + }, + ] +) + +# The 'chat' function returns a response object. +# The content of the assistant's message is accessed using the keys 'message' and 'content'. +# The 'print' function is used to display this content. 
+print(response['message']['content'])
```

-    /opt/conda/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
-      from .autonotebook import tqdm as notebook_tqdm
-
-
-    Singapore is a city-state located on the island of Singapore. It is a Southeast Asian country and is known for its high standard of living, cleanliness, and efficiency.
+    Singapore is a city-state located on the island of Singapore, a Southeast Asian island. It is a highly developed city known for its modern architecture, efficient transportation system, and vibrant cultural diversity.


### Question 2


```python
-response = llm.complete("What is a Large Language Model?")
-print(response)
+response = ollama.chat(
+    model='gemma:7b',
+    messages=[
+        {
+            'role': 'user',
+            'content': 'What is a Large Language Model?',
+        },
+    ]
+)
+
+print(response['message']['content'])
```

-    A Large Language Model (LLM) is a type of language model that has been trained on a massive amount of text data, typically billions or trillions of words. LLMs are designed to be able to understand and generate human-like text, engage in natural language processing tasks, and provide information and knowledge across a wide range of topics. LLMs are typically deep learning models that are trained using transformer architectures, such as the GPT-3 model.
+    Sure, a Large Language Model (LLM) is a type of language model that has been trained on a massive amount of text data and has the ability to engage in a wide range of natural language processing tasks. LLMs are typically designed to have a large number of parameters, which allows them to learn complex relationships between words and sentences. LLMs are often used for tasks such as text summarization, translation, and code generation.

-In our second question, we change the question to "What is a Large Language Model?" and you can observe how the answer is substantially longer than the first question "What is Singapore". In the next section, you will discover that this relates to a few hyperparemeters in LLMs that can be tweaked.
+In our second question, we change the question to "What is a Large Language Model?" and you can observe how the answer is slightly longer than the first question "What is Singapore?". In the next section, you will discover that this relates to a few hyperparameters in LLMs that can be tweaked.
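
Before moving on to hyperparameters, one small aside: everything in this tutorial uses the blocking `ollama.chat(...)` call, which returns only after the full answer has been generated. The Python client can also stream tokens as they are produced. A minimal sketch, assuming the `ollama` package version you installed supports the `stream=True` flag:

```python
# Stream the answer incrementally instead of waiting for the full response.
# With stream=True, ollama.chat returns an iterator of partial message chunks.
stream = ollama.chat(
    model='gemma:7b',
    messages=[{'role': 'user', 'content': 'What is Singapore?'}],
    stream=True,
)

for chunk in stream:
    # Each chunk carries an incremental piece of the assistant's message.
    print(chunk['message']['content'], end='', flush=True)
```

Streaming does not change what the model generates, only how it is delivered, so the hyperparameters discussed next behave identically in both modes.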

## Question and Answer | Hyperparameter Tuning

@@ -67,311 +80,422 @@ The choice of temperature value is a trade-off between consistency and variety,


```python
-# Set the prompt
-prompt = "Write a happy birthday message, I would like to send to my friend."
-
+# Create new model
 # Set the temperature
-# Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.01) make it more deterministic
-temperature = 0.01
-# Instantiate the Ollama class again
-llm = Ollama(model="gemma:7b", temperature=temperature)
-
-# Generate the response
-response = llm.complete(prompt)
-
-print(response)
+# Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.1) make it more deterministic
+modelfile='''
+FROM gemma:7b
+PARAMETER temperature 0.1
+'''
+ollama.create('gemma_low_temp', modelfile=modelfile)
+
+# Now you can use the new model with adjusted temperature
+response = ollama.chat(
+    model='gemma_low_temp',
+    messages=[
+        {
+            'role': 'user',
+            'content': 'Write a happy birthday message, I would like to send to my friend.',
+        },
+    ]
+)
+
+print(response['message']['content'])
```

-    Sure, here's a happy birthday message for your friend:
-    
-    **Happy Birthday, [Friend's Name]!**
-    
-    I hope your day is filled with joy, laughter, and happiness. May all your wishes come true.
-    
-    Have a wonderful day, and I'm looking forward to celebrating with you soon.
-    
-    **Best regards,**
-    
-    [Your Name]
+    Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. May all your wishes come true! 🎉🎂


```python
-# Check model
-llm
+# Run multiple times
+for i in range(3):
+    response = ollama.chat(
+        model='gemma_low_temp',
+        messages=[
+            {
+                'role': 'user',
+                'content': 'Write a happy birthday message, I would like to send to my friend.',
+            },
+        ],
+    )
+    
+    # Print
+    print('-'*10)
+    print(f'Response {i}')
+    print('-'*10)
+    print(response['message']['content'])
```

-    Ollama(callback_manager=, system_prompt=None, messages_to_prompt=, completion_to_prompt=, output_parser=None, pydantic_program_mode=, query_wrapper_prompt=None, base_url='http://localhost:11434', model='gemma:7b', temperature=0.01, context_window=3900, request_timeout=30.0, prompt_key='prompt', additional_kwargs={})

+    ----------
+    Response 0
+    ----------
+    Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. May all your wishes come true! 🎉🎂
+    ----------
+    Response 1
+    ----------
+    Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. May all your wishes come true! 🎉🎂
+    ----------
+    Response 2
+    ----------
+    Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. May all your wishes come true! 🎉🎂

-**We can see above it is almost the exact same answer calling the LLM 3 times**.
+**We can see above it is the exact same answer calling the LLM 3 times**.
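
One caveat on the determinism above: a low temperature makes sampling close to greedy decoding, but it is not a hard guarantee of byte-identical outputs across runs. If you need stronger reproducibility, the Modelfile also accepts a `seed` parameter that pins the sampler's random number generator. A hedged sketch along the same lines as the models above (`gemma_low_temp_seeded` is just an illustrative name, and `seed` support depends on your Ollama version):

```python
# Pin the sampler's RNG in addition to lowering the temperature.
modelfile = '''
FROM gemma:7b
PARAMETER temperature 0.1
PARAMETER seed 42
'''
ollama.create('gemma_low_temp_seeded', modelfile=modelfile)
```

With a fixed seed, repeated calls with the same prompt should reproduce the same sampling decisions.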
+ response = ollama.chat( + model='gemma_high_temp', + messages=[ + { + 'role': 'user', + 'content': 'Write a happy birthday message, I would like to send to my friend.', + }, + ], + ) - # Set the temperature - # Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.01) make it more deterministic - temperature = 0.01 - # Instantiate the Ollama class again - llm = Ollama(model="gemma:7b", temperature=temperature) - - # Generate the response - response = llm.complete(prompt) - # Print print('-'*10) print(f'Response {i}') print('-'*10) - print(response) - + print(response['message']['content']) ``` ---------- Response 0 ---------- - Sure, here's a happy birthday message for your friend: - - **Happy Birthday, [Friend's Name]!** - - I hope your day is filled with joy, laughter, and happiness. May all your wishes come true. - - Have a wonderful day, and I'm looking forward to celebrating with you soon. - - **Best regards,** - - [Your Name] + Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. Let's celebrate your special day together! ---------- Response 1 ---------- - Sure, here's a happy birthday message for your friend: - - **Happy Birthday, [Friend's Name]!** - - I hope your day is filled with joy, laughter, and happiness. May all your wishes come true. - - Have a wonderful day, and I'm looking forward to celebrating with you soon. - - **Best regards,** - - [Your Name] + Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. Wishing you a very special day filled with memorable moments and sweet treats! ---------- Response 2 ---------- - Sure, here's a happy birthday message for your friend: + Sure, here is a happy birthday message you can send to your friend: **Happy Birthday, [Friend's Name]!** - I hope your day is filled with joy, laughter, and happiness. May all your wishes come true. - - Have a wonderful day, and I'm looking forward to celebrating with you soon. - - **Best regards,** - - [Your Name] + May your day be filled with joy, laughter, and happiness. I hope your special day is filled with everything you wish for. I'm sending you positive vibes and can't wait to see you soon. -**We can see above it is almost the exact same answer calling the LLM 3 times**. +**Here, you can see very varied answer in each of the 3 calls to the LLM compared to lower temperature.** -#### High Temperature +#### Mathematical Interpretation +In LLMs, the `temperature` parameter is used to control the randomness of predictions by scaling the logits before applying soft(arg)max. -```python -# Set the prompt -prompt = "Write a happy birthday message, I would like to send to my friend." +- The model computes a score (also known as logits) for each possible next token based on the current context. These scores are then transformed into probabilities using the soft(arg)max function. -# Set the temperature -# Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.01) make it more deterministic -temperature = 1.0 -# Instantiate the Ollama class again -llm = Ollama(model="gemma:7b", temperature=temperature) +- The soft(arg)max function is defined as follows: -# Generate the response -response = llm.complete(prompt) + $$\text{softargmax}(x_i) = \frac{e^{x_i}}{\sum_i^j e^{x_i}}$$ -print(response) +- Before applying soft(arg)max, the logits are divided by the `temperature` value. 
This process is called temperature scaling and the equation becomes:

  $$\text{softargmax}(x_i) = \frac{e^{x_i/T}}{\sum_j e^{x_j/T}}$$

- When `T` > 1, it makes the distribution more uniform (increases randomness). When `T` < 1, it makes the distribution more peaky (reduces randomness).

So, in simple terms, a higher temperature value makes the model's output more random, and a lower temperature makes it more deterministic.


```python
import numpy as np

def softargmax(x, T=1.0):
    e_x = np.exp(x / T)
    return e_x / e_x.sum()

# Define logits
logits = np.array([0.2, 0.3, 0.1, 0.4])
# Compute soft(arg)max for different temperatures
for T in [0.1, 1.0]:
    print(f"Temperature: {T}")
    print(softargmax(logits, T=T))
    print()
```

    Temperature: 0.1
    [0.08714432 0.23688282 0.0320586  0.64391426]
    
    Temperature: 1.0
    [0.23632778 0.26118259 0.21383822 0.28865141]


In the Python code above, leveraging the `numpy` library, you can see that

- `softargmax` is a function that computes the soft(arg)max of an array of logits `x` for a given temperature `T`.
- We define an array of logits and compute the soft(arg)max for different temperatures.
- When you run this code, you'll see that as the temperature increases, the soft(arg)max output becomes more uniform (i.e., the probabilities are more evenly distributed), and as the temperature decreases, the soft(arg)max output becomes more peaky (i.e., one probability dominates the others). This illustrates how temperature can control the randomness of the model's output.
- To close this off, taking the max of the soft(arg)max output, you will observe how it gets more random in the max value as the soft(arg)max output becomes more uniform. This links to the concept of how the next word gets more random because of the max of the uniformity of the soft(arg)max output.
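
To make that last point concrete, here is a small follow-on sketch that actually samples from the two distributions (reusing `softargmax` and `logits` from the code above; the token names are made up purely for illustration):

```python
# Sample a "next token" 10 times from each temperature-scaled distribution.
tokens = ['token_a', 'token_b', 'token_c', 'token_d']  # hypothetical vocabulary
rng = np.random.default_rng(0)

for T in [0.1, 1.0]:
    probs = softargmax(logits, T=T)
    samples = rng.choice(tokens, size=10, p=probs)
    print(f"T={T}: {list(samples)}")
```

At `T=0.1` the draws are dominated by the highest-probability token, while at `T=1.0` the draws spread across the vocabulary — the same effect you see in the birthday messages above.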

### Top-K Tuning

In LLMs, the `top_k` hyperparameter is a key factor that influences the unpredictability of the generated output.

- **For smaller `top_k` values**: The model behaves in a more predictable manner. It only takes into account a limited set of the most probable next tokens at each step of the generation process. This can result in responses that are more concise and consistent, but there's a possibility that the output may be too restricted or repetitive.

- **For larger `top_k` values**: The model takes into consideration a broader set of potential next tokens. This infuses more variety and randomness into the generated output. However, the responses can become less consistent and may occasionally be less coherent or pertinent.

Therefore, the selection of the `top_k` value can be viewed as a balance between consistency and variety in the model's responses. It's crucial to adjust this parameter based on the specific needs of your task.

#### Low K


```python
# Create new model

# Set the temperature
# Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.1) make it more deterministic

# Set the top_k
# This parameter controls the number of tokens considered for each step of the generation process
modelfile='''
FROM gemma:7b
PARAMETER temperature 0.5
PARAMETER top_k 3
'''
ollama.create('gemma_topk_3', modelfile=modelfile)

# Now you can use the new model with adjusted temperature and top_k
response = ollama.chat(
    model='gemma_topk_3',
    messages=[
        {
            'role': 'user',
            'content': 'Write a happy birthday message, I would like to send to my friend.',
        },
    ]
)

print(response['message']['content'])
```

    Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. May all your wishes come true!


```python
# Run multiple times
for i in range(3):
    response = ollama.chat(
        model='gemma_topk_3',
        messages=[
            {
                'role': 'user',
                'content': 'Write a happy birthday message, I would like to send to my friend.',
            },
        ],
    )
    
    # Print
    print('-'*10)
    print(f'Response {i}')
    print('-'*10)
    print(response['message']['content'])
```

    ----------
    Response 0
    ----------
-    Here are a few happy birthday messages you can send to your friend:
-    
-    **Classic Wishes:**
-    
-    * "Happy Birthday, [Friend's Name]! Wishing you a day filled with joy, happiness, and laughter."
-    * "Have a very happy birthday, [Friend's Name]! May your day be filled with sunshine and good times."
-    * "Happy Birthday, my dear [Friend's Name]! I hope your day is as awesome as you are."
-    
-    **Personalized Wishes:**
-    
-    * "Happy Birthday, [Friend's Name]! I hope your day is filled with [specific things you know your friend enjoys]."
-    * "I'm so glad it's your birthday, [Friend's Name]! I'm sending you a virtual hug and a bunch of birthday wishes."
-    * "Wishing you a very happy birthday, [Friend's Name]! I can't wait to see what you have planned for this special day."
-    
-    **Fun and Quirky Wishes:**
-    
-    * "Happy Birthday, [Friend's Name]! May your day be filled with cake and laughter... and maybe a sprinkle of unicorn magic."
-    * "Have a very happy birthday, [Friend's Name]! I'm hoping your day is as memorable as a trip to the moon."
-    * "Happy Birthday, [Friend's Name]! 
I'm sending you virtual balloons and a party hat big enough for the both of us." + I hope your day is filled with joy, laughter, and happiness. May all your wishes come true. Have a blast! πŸŽ‰πŸŽ‚πŸŽ‰ ---------- Response 1 ---------- - Here are some happy birthday messages you can send to your friend: + Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. May all your wishes come true. πŸŽ‰πŸŽ‚ + ---------- + Response 2 + ---------- + Happy Birthday, [Friend's Name]! I hope your day is filled with joy, laughter, and happiness. May all your wishes come true. πŸŽ‰πŸŽ‚ + + +#### High K + + +```python +# Create new model + +# Set the temperature +# Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.1) make it more deterministic + +# Set the top_k +# This parameter controls the number of tokens considered for each step of the generation process +modelfile=''' +FROM gemma:7b +PARAMETER temperature 0.5 +PARAMETER top_k 200 +''' +ollama.create('gemma_topk_200', modelfile=modelfile) + +# Now you can use the new model with adjusted temperature +response = ollama.chat( + model='gemma_topk_200', + messages=[ + { + 'role': 'user', + 'content': 'Write a happy birthday message, I would like to send to my friend.', + }, + ] +) + +print(response['message']['content']) +``` + + Happy Birthday, [Friend's name]! I hope your day is filled with joy, laughter, and happiness! πŸŽ‰πŸŽ‚ + + + +```python +# Run multiple times +for i in range(3): + response = ollama.chat( + model='gemma_topk_200', + messages=[ + { + 'role': 'user', + 'content': 'Write a happy birthday message, I would like to send to my friend.', + }, + ], + ) + + # Print + print('-'*10) + print(f'Response {i}') + print('-'*10) + print(response['message']['content']) +``` + + ---------- + Response 0 + ---------- + Sure, here is a happy birthday message for your friend: - **Classic wishes:** + **Happy Birthday, [Friend's Name]!** - * "Happy Birthday, [friend's name]! May your day be filled with joy, laughter, and good times." - * "Have a very happy birthday, [friend's name]! I hope all your wishes come true." - * "Wishing you a very happy birthday, [friend's name]! I'm sending you warmest wishes for a day filled with happiness." + I hope your day is filled with joy, laughter, and happiness. May all your wishes come true today. - **Personalized wishes:** + Have a wonderful birthday, and I look forward to seeing you soon. - * "Happy Birthday, [friend's name]! I hope your day is as special as you are." - * "Have a wonderful birthday, [friend's name]! I'm so glad to have you in my life." - * "Sending you big birthday wishes, [friend's name]! I can't wait to see what you have planned." + **Best regards,** - **Fun and cheesy:** + [Your Name] + ---------- + Response 1 + ---------- + Sure, here's a happy birthday message for your friend: - * "Happy Birthday, [friend's name]! I'm hoping you have a day as awesome as you are." - * "Have a blast on your birthday, [friend's name]! I'm planning on eating a cake in your honor." - * "I'm not a party pooper, but I'm definitely not attending your party, [friend's name]. Have a great day!" + **Happy Birthday, [Friend's Name]!** - **Remember:** + May your day be filled with joy, laughter, and happiness. I hope your special day is filled with all your favorite things and that your wishes come true. - * You can personalize the message with your friend's name and preferred gender-neutral pronouns. 
- * You can add a specific wish or goal you have for your friend. - * You can include a funny joke or a reference to a shared inside joke. - * You can keep the message short and sweet, or you can write a longer, more heartfelt message. + **Have a wonderful birthday, my dear friend!** ---------- Response 2 ---------- - Sure, here's a happy birthday message you can send to your friend: + Sure, here's a happy birthday message for your friend: **Happy Birthday, [Friend's Name]!** - May your day be filled with joy, laughter, and happiness. I hope your special day is filled with all your favorite things, and I'm wishing you a very, very happy birthday! + May your day be filled with joy, laughter, and happiness. I hope you have a wonderful time celebrating your special day! -**Here, you can see very varied answer in each of the 3 calls to the LLM commpared to lower temperature.** +You can observe that the reply is more diverse with a high `top_k` hyperparameter. #### Mathematical Interpretation -In LLMs, the `temperature` parameter is used to control the randomness of predictions by scaling the logits before applying soft(arg)max. - -- The model computes a score (also known as logits) for each possible next token based on the current context. These scores are then transformed into probabilities using the soft(arg)max function. +In LLMs, the `top_k` parameter is used to limit the number of next tokens considered for generation. -- The soft(arg)max function is defined as follows: +- After computing the soft(arg)max probabilities for all possible next tokens, the model sorts these probabilities in descending order. - $$\text{softargmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}}$$ +- The model then only considers the `top_k` tokens with the highest probabilities for the next step of the generation process. -- Before applying soft(arg)max, the logits are divided by the `temperature` value. This process is called temperature scaling and the equation becomes: +- This process is called `top_k` sampling. - $$\text{softargmax}(x_i) = \frac{e^{x_i/T}}{\sum_j e^{x_j/T}}$$ - -- When `T` > 1, it makes the distribution more uniform (increases randomness). When `T` < 1, it makes the distribution more peaky (reduces randomness). - -So, in simple terms, a higher temperature value makes the model's output more random, and a lower temperature makes it more deterministic. +Here's a simple Python code snippet that illustrates how `top_k` works. 

## Summary

We covered the functionality of a basic LLM without any hyperparameter tuning. We then covered Temperature and Top-K hyperparameter tuning. It is important to note that there are many hyperparameters that can be tuned, and we will update this tutorial to gradually include as many as we can.

diff --git a/docs/language_model/rag/rag_intro.ipynb b/docs/language_model/rag/rag_intro.ipynb
index aeeb534f0..50e29ed4a 100644
--- a/docs/language_model/rag/rag_intro.ipynb
+++ b/docs/language_model/rag/rag_intro.ipynb
@@ -5,7 +5,7 @@
   "id": "711dca43-1c6d-4c0c-8888-b7cdcbcbdf3f",
   "metadata": {},
   "source": [
-    "# RAG Introduction"
+    "# RAG Introduction (PROTOTYPE)"
   ]
 },
@@ -29,7 +29,7 @@
   "id": "6906cb0d-e59f-4594-b1d7-cbd4cb8e301f",
   "metadata": {},
   "source": [
-    "Follow our [tutorial on Apptainer](https://www.deeplearningwizard.com/language_model/containers/hpc_containers_apptainer/) to get started. 
Once you have followed the tutorial and you completed the [Ollama, LlamaIndex and Gemma:7b](https://www.deeplearningwizard.com/language_model/containers/hpc_containers_apptainer/#ollama-gemma-workload section), you will be able to run `jupyter lab` in a new window to access and run this notebook.\n", + "Follow our [tutorial on Apptainer](https://www.deeplearningwizard.com/language_model/containers/hpc_containers_apptainer/) to get started. Once you have followed the tutorial till the [Ollama section](https://www.deeplearningwizard.com/language_model/containers/hpc_containers_apptainer/#ollama-gemma-workloads) where you successfully ran `ollama serve` and `ollama run gemma:7b`, you can run the `apptainer shell --nv --nvccli apptainer_container_0.1.sif` command followed by `jupyter lab` to access and run this notebook.\n", "\n", "!!! info \"Directory Guide\"\n", "\n",