From 64a7d27a9145034bd24890256aa822efe11ccd54 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 20 Apr 2024 04:04:24 +1000 Subject: [PATCH 01/69] Fix prompt --- unsloth/chat_templates.py | 22 ++++++++++++++++++++++ unsloth/models/mapper.py | 3 +++ unsloth/tokenizer_utils.py | 5 +++++ 3 files changed, 30 insertions(+) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 6a0be3862..56749d6c2 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -255,6 +255,20 @@ CHAT_TEMPLATES["gemma_chatml"] = (gemma_chatml_template, gemma_chatml_eos_token,) +# Llama-3 +# Weirdly \n\n is needed? +llama3_template = \ + "{{ bos_token }}"\ + "{% for message in messages %}"\ + "{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\ + "{% endfor %}"\ + "{% if add_generation_prompt %}"\ + "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"\ + "{% endif %}" +llama3_template_eos_token = "eos_token" +CHAT_TEMPLATES["llama-3"] = (llama3_template, gemma_chatml_eos_token,) + + def get_chat_template( tokenizer, chat_template = "chatml", @@ -540,4 +554,12 @@ def test_chat_templates(): correct_tokenizer.chat_template = gemma_template our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True) assert(our_prompt == correct_prompt) + + # Llama-3 + template = llama3_template + correct_tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct") + correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + correct_tokenizer.chat_template = template + our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + assert(correct_prompt == our_prompt) pass diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index bad43190b..769cbff53 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -137,6 +137,9 @@ "unsloth/llama-3-70b-bnb-4bit" : ( "meta-llama/Meta-Llama-3-70B", ), + "unsloth/llama-3-70b-Instruct-bnb-4bit" : ( + "meta-llama/Meta-Llama-3-70B-Instruct", + ), } INT_TO_FLOAT_MAPPER = {} diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index fa536ef29..76d9372e2 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -215,6 +215,11 @@ def fix_sentencepiece_tokenizer( os.makedirs(temporary_location) pass + # Check if tokenizer.model exists + if not os.path.isfile(f"{temporary_location}/tokenizer.model"): + return new_tokenizer + pass + # First save the old tokenizer old_tokenizer.save_pretrained(temporary_location) From 656ab2288c2c5a32d218405ed8e2914db7228b6c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 20 Apr 2024 19:50:27 +1000 Subject: [PATCH 02/69] Update chat_templates.py --- unsloth/chat_templates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 56749d6c2..93104b961 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -266,7 +266,7 @@ "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"\ "{% endif %}" llama3_template_eos_token = "eos_token" -CHAT_TEMPLATES["llama-3"] = (llama3_template, gemma_chatml_eos_token,) +CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token,) def get_chat_template( From c4f2f54d7e6dbb20d255be193832946130c4aeac Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 04:33:23 +1000 Subject: [PATCH 
03/69] fix_untrained_tokens --- unsloth/models/_utils.py | 33 +++++++++++++++++++++++++++++++++ unsloth/models/llama.py | 5 +++++ 2 files changed, 38 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 32da0a734..a7e392c25 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -71,6 +71,7 @@ "patch_tokenizer", "get_statistics", "Unsloth_Offloaded_Gradient_Checkpointer", + "fix_untrained_tokens", ] @@ -349,3 +350,35 @@ def backward(ctx, dY): return (None, hidden_states.grad,) + (None,)*len(ctx.args) pass pass + + +@torch.inference_mode +def fix_untrained_tokens(model, eps = 1e-16): + """ + Llama-3 for eg has untrained vectors in the base model. + These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> + We reset them to the mean of the rest of the tokens + """ + embedding_matrix = model.get_input_embeddings ().weight.data + lm_head_matrix = model.get_output_embeddings().weight.data + where_untrained = torch.where(torch.amax(embedding_matrix, axis = 1) <= eps)[0] + n_untrained = where_untrained.shape[0] + n_trained = embedding_matrix.shape[0] - n_untrained + if n_untrained != 0: + logger.warning_once( + f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ + "We shall set them to the mean of the other trained tokens." + ) + pass + + # Fix embed_tokens + sum_columns = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) + mean_columns = (sum_columns / n_trained).to(embedding_matrix.dtype) + embedding_matrix[where_untrained] = mean_columns + + # Fix lm_head + sum_columns = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) + mean_columns = (sum_columns / n_trained).to(lm_head_matrix.dtype) + lm_head_matrix[where_untrained] = mean_columns + return +pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6f70bc510..74500a3d7 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1481,6 +1481,11 @@ def get_peft_model( train_embed_tokens = True pass pass + + # First fix untrained tokens + if train_embed_tokens or train_lm_head: + fix_untrained_tokens(model, eps = 1e-16) + pass # Get LoRA arguments = dict( From 87b4bb961f234b5cc387472a443d34868900c7ec Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 12:52:50 +1000 Subject: [PATCH 04/69] Update llama.py --- unsloth/models/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 74500a3d7..6b6f887ec 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1483,9 +1483,9 @@ def get_peft_model( pass # First fix untrained tokens - if train_embed_tokens or train_lm_head: - fix_untrained_tokens(model, eps = 1e-16) - pass + # if train_embed_tokens or train_lm_head: + # fix_untrained_tokens(model, eps = 1e-16) + # pass # Get LoRA arguments = dict( From abd192fd2d58298369227a68c42305aa8513939c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 16:18:28 +1000 Subject: [PATCH 05/69] add tokens --- unsloth/chat_templates.py | 5 +- unsloth/models/_utils.py | 31 ---------- unsloth/models/llama.py | 22 +++++-- unsloth/tokenizer_utils.py | 124 +++++++++++++++++++++++++++++++++++++ 4 files changed, 142 insertions(+), 40 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 93104b961..2d15470a0 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -23,10 +23,7 @@ from .save import patch_saving_functions import os import shutil -from .tokenizer_utils 
import ( - load_correct_tokenizer, - fix_sentencepiece_tokenizer, -) +from .tokenizer_utils import * from .models._utils import patch_tokenizer CHAT_TEMPLATES = {} diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index a7e392c25..9b9ba9ac0 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -351,34 +351,3 @@ def backward(ctx, dY): pass pass - -@torch.inference_mode -def fix_untrained_tokens(model, eps = 1e-16): - """ - Llama-3 for eg has untrained vectors in the base model. - These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> - We reset them to the mean of the rest of the tokens - """ - embedding_matrix = model.get_input_embeddings ().weight.data - lm_head_matrix = model.get_output_embeddings().weight.data - where_untrained = torch.where(torch.amax(embedding_matrix, axis = 1) <= eps)[0] - n_untrained = where_untrained.shape[0] - n_trained = embedding_matrix.shape[0] - n_untrained - if n_untrained != 0: - logger.warning_once( - f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ - "We shall set them to the mean of the other trained tokens." - ) - pass - - # Fix embed_tokens - sum_columns = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) - mean_columns = (sum_columns / n_trained).to(embedding_matrix.dtype) - embedding_matrix[where_untrained] = mean_columns - - # Fix lm_head - sum_columns = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) - mean_columns = (sum_columns / n_trained).to(lm_head_matrix.dtype) - lm_head_matrix[where_untrained] = mean_columns - return -pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6b6f887ec..425d068fd 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1481,12 +1481,24 @@ def get_peft_model( train_embed_tokens = True pass pass - - # First fix untrained tokens - # if train_embed_tokens or train_lm_head: - # fix_untrained_tokens(model, eps = 1e-16) - # pass + # Check if we added new tokens! + if hasattr(model, "_need_to_train_embeddings"): + if not train_lm_head or not train_embed_tokens: + print( + "Unsloth: You added new tokens but did not specify if you wanted to "\ + "train the lm_head and embed_tokens. We must turn it on for you." + ) + train_lm_head = True + train_embed_tokens = True + pass + pass + + # First fix untrained tokens + if train_embed_tokens or train_lm_head: + fix_untrained_tokens(model, eps = 1e-16) + pass + # Get LoRA arguments = dict( r = r, diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 76d9372e2..c0f6c7fc2 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -18,11 +18,14 @@ import re import os from transformers.models.llama.modeling_llama import logger +from peft import PeftModelForCausalLM __all__ = [ "load_correct_tokenizer", "fix_sentencepiece_tokenizer", "check_tokenizer", + "fix_untrained_tokens", + "add_new_tokens", ] @@ -466,3 +469,124 @@ def check_tokenizer( pass return convert_to_fast_tokenizer(tokenizer) pass + + +@torch.inference_mode +def fix_untrained_tokens(model, eps = 1e-16): + """ + Llama-3 for eg has untrained vectors in the base model. 
+ These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> + We reset them to the mean of the rest of the tokens + """ + embedding_matrix = model.get_input_embeddings ().weight.data + lm_head_matrix = model.get_output_embeddings().weight.data + + # Get untrained tokens + indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps + where_untrained = torch.where(indicator_untrained)[0] + n_untrained = where_untrained.shape[0] + n_trained = embedding_matrix.shape[0] - n_untrained + if n_untrained != 0: + print( + f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ + "We shall set them to the mean of the other trained tokens." + ) + pass + + # First set untrained to all 0s - sometimes it's not! 1e-23 for bfloat16 + embedding_matrix[where_untrained] = 0 + lm_head_matrix [where_untrained] = 0 + + # Find sum + sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) + sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) + + # Find correct average by dividing by sum of trained tokens + mean_embedding = (sum_embedding / n_trained).to(embedding_matrix.dtype) + mean_lm_head = (sum_lm_head / n_trained).to(lm_head_matrix .dtype) + + # Set them to the mean + embedding_matrix[where_untrained] = mean_embedding + lm_head_matrix [where_untrained] = mean_lm_head + + return mean_embedding, mean_lm_head +pass + + +@torch.inference_mode +def add_new_tokens( + model, + tokenizer, + new_tokens = [], + method = "mean", + interpolation = 0.05, +): + """ + Smartly resizes the tokenizer and adds new tokens to the model. + We also disregard untrained tokens by removing them from the mean calculation. + """ + assert(isinstance(new_tokens, (list, tuple))) + assert(len(new_tokens) > 0) + assert(method == "mean" or method == "interpolation") + assert(interpolation >= 0 and interpolation <= 1) + + # Check if tokens already exist + overlapping_tokens = set(new_tokens) & set(tokenizer.vocab.keys()) + if len(overlapping_tokens) != 0: + print( + f"Unsloth: You're adding new_tokens = {new_tokens}\n"\ + f"There are tokens which are overlapping = {list(overlapping_tokens)}\n"\ + f"We shall safely ignore these overlapping tokens." + ) + new_tokens = [x for x in new_tokens if x not in overlapping_tokens] + pass + + # Get mean of trained tokens + mean_embedding, mean_lm_head = fix_untrained_tokens(model) + mean_embedding = mean_embedding.to(torch.float32) + mean_lm_head = mean_lm_head .to(torch.float32) + + # Add tokens! 
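One detail worth spelling out: dividing the column sums by `n_trained` rather than by the full vocabulary size only works because the untrained rows were zeroed first. A minimal standalone sketch of that mean-initialization step, using a toy matrix instead of a real model (the sizes are illustrative assumptions):

```python
import torch

# Toy embedding matrix: 8 "tokens" of dimension 4. Rows 5..7 stand in for
# untrained tokens (all zeros), the way <|eot_id|> etc. are in base Llama-3.
embedding_matrix = torch.rand(8, 4) + 0.1   # strictly positive "trained" rows
embedding_matrix[5:] = 0.0

eps = 1e-16
where_untrained = torch.where(torch.amax(embedding_matrix, dim = 1) <= eps)[0]
n_trained = embedding_matrix.shape[0] - where_untrained.shape[0]

# The untrained rows are zero, so summing every row and dividing by n_trained
# gives exactly the mean of the trained rows.
mean_row = torch.sum(embedding_matrix, dim = 0, dtype = torch.float32) / n_trained
embedding_matrix[where_untrained] = mean_row.to(embedding_matrix.dtype)
print(embedding_matrix[5:])   # three identical rows equal to the trained-token mean
```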
+ old_length = len(tokenizer) + tokenizer.add_tokens(new_tokens) + model.resize_token_embeddings(len(tokenizer)) + + # If we use interpolation, we interpolate between the mean embeddings and + # the Word2Vec sum of the other vectors + embedding_matrix = model.get_input_embeddings ().weight.data + lm_head_matrix = model.get_output_embeddings().weight.data + + if method == "interpolation": + print( + "Unsloth: You are using interpolation to add new tokens.\n"\ + f"We shall set new tokens = mean(embeddings)*{1-interpolation} + mean(new_tokens)*{interpolation}" + ) + for j, token in enumerate(new_tokens): + input_ids = tokenizer(token, add_special_tokens = False).input_ids + mean_embedding_token = embedding_matrix[input_ids].mean(axis = 0, dtype = torch.float32) + mean_lm_head_token = lm_head_matrix [input_ids].mean(axis = 0, dtype = torch.float32) + + # Interpolate + mean_embedding_token = mean_embedding*(1-interpolation) + mean_embedding_token*interpolation + mean_lm_head_token = mean_lm_head *(1-interpolation) + mean_lm_head_token *interpolation + + # Set the new vector + embedding_matrix[old_length+j] = mean_embedding_token + lm_head_matrix [old_length+j] = mean_lm_head_token + pass + else: + # Now set the new tokens to the mean! + embedding_matrix[old_length:] = mean_embedding + lm_head_matrix [old_length:] = mean_lm_head + pass + + # We set a flag to say we need to train embeddings + internal_model = model + while hasattr(internal_model, "model"): + internal_model._need_to_train_embeddings = True + internal_model = internal_model.model + pass + internal_model._need_to_train_embeddings = True + + return +pass From 868351ba5ee6d914dc0afc45b1e685396ba545f8 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 16:26:35 +1000 Subject: [PATCH 06/69] Update _utils.py --- unsloth/models/_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 9b9ba9ac0..9c4ae8fc6 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -71,7 +71,6 @@ "patch_tokenizer", "get_statistics", "Unsloth_Offloaded_Gradient_Checkpointer", - "fix_untrained_tokens", ] From f29a3e758610dcb8430e3c078f63e178d42c1056 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 16:28:29 +1000 Subject: [PATCH 07/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index c0f6c7fc2..15606a7c9 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -19,6 +19,7 @@ import os from transformers.models.llama.modeling_llama import logger from peft import PeftModelForCausalLM +import torch __all__ = [ "load_correct_tokenizer", From 2573474bd597823a467606c748440c0cbad2c574 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 16:36:03 +1000 Subject: [PATCH 08/69] Update llama.py --- unsloth/models/llama.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 425d068fd..11618eee3 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1472,25 +1472,17 @@ def get_peft_model( final_modules.append(module) pass - # Check modules_to_save - if modules_to_save is not None: - for module in modules_to_save: - if module == "lm_head": - train_lm_head = True - elif module == "embed_tokens": - train_embed_tokens = True - pass - pass - # Check if we added new tokens! 
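Taken together, the intended calling pattern for `add_new_tokens` plus `get_peft_model` looks roughly like the sketch below. The model name, the new token strings and the LoRA settings are illustrative assumptions rather than values fixed by these patches:

```python
from unsloth import FastLanguageModel
from unsloth.tokenizer_utils import add_new_tokens

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",   # assumed base model
    max_seq_length = 2048,
    load_in_4bit = True,
)

# Resize the vocab; the new rows start from the trained-token mean (method = "mean").
add_new_tokens(model, tokenizer, new_tokens = ["<|tool_call|>", "<|tool_result|>"])

# get_peft_model sees the _need_to_train_embeddings flag set by add_new_tokens and
# forces embed_tokens / lm_head into modules_to_save if they were left out.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    modules_to_save = ["embed_tokens", "lm_head"],
)
```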
if hasattr(model, "_need_to_train_embeddings"): if not train_lm_head or not train_embed_tokens: print( "Unsloth: You added new tokens but did not specify if you wanted to "\ - "train the lm_head and embed_tokens. We must turn it on for you." + "train the lm_head and embed_tokens.\nWe must turn it on for you." ) train_lm_head = True train_embed_tokens = True + if "lm_head" not in modules_to_save: modules_to_save.append("lm_head") + if "embed_tokens" not in modules_to_save: modules_to_save.append("embed_tokens") pass pass @@ -1498,7 +1490,17 @@ def get_peft_model( if train_embed_tokens or train_lm_head: fix_untrained_tokens(model, eps = 1e-16) pass - + + # Check modules_to_save + if modules_to_save is not None: + for module in modules_to_save: + if module == "lm_head": + train_lm_head = True + elif module == "embed_tokens": + train_embed_tokens = True + pass + pass + # Get LoRA arguments = dict( r = r, From bfb32a35817db5f79c3929ac25a2792d9b976025 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 16:39:26 +1000 Subject: [PATCH 09/69] Update llama.py --- unsloth/models/llama.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 11618eee3..75e9888d6 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1481,8 +1481,12 @@ def get_peft_model( ) train_lm_head = True train_embed_tokens = True - if "lm_head" not in modules_to_save: modules_to_save.append("lm_head") - if "embed_tokens" not in modules_to_save: modules_to_save.append("embed_tokens") + + if modules_to_save is None: modules_to_save = ["embed_tokens"] + else: modules_to_save.append("embed_tokens") + + if modules_to_save is None: modules_to_save = ["lm_head"] + else: modules_to_save.append("lm_head") pass pass @@ -1498,8 +1502,13 @@ def get_peft_model( train_lm_head = True elif module == "embed_tokens": train_embed_tokens = True + else: + raise TypeError( + f"Unsloth: Module = {module} is not allowed. Only 'lm_head' and 'embed_tokens' is allowed." 
+ ) pass pass + modules_to_save = list(set(modules_to_save)) # Get LoRA arguments = dict( From 40a6d009ef381e72f2f6793701040c4f01c942b5 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 16:57:59 +1000 Subject: [PATCH 10/69] Update llama.py --- unsloth/models/llama.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 75e9888d6..45c75010b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1445,6 +1445,10 @@ def get_peft_model( "gate_proj", "up_proj", "down_proj",),) model.config.update({"unsloth_version" : __version__}) + if type(modules_to_save) is tuple: + modules_to_save = list(modules_to_save) + pass + train_lm_head = False train_embed_tokens = False final_modules = [] @@ -1508,7 +1512,9 @@ def get_peft_model( ) pass pass - modules_to_save = list(set(modules_to_save)) + if isinstance(modules_to_save, (tuple, list)): + modules_to_save = list(set(modules_to_save)) + pass # Get LoRA arguments = dict( From 140a0b0a407d6151c27c0802e4a30380ae4df042 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 17:56:03 +1000 Subject: [PATCH 11/69] Update llama.py --- unsloth/models/llama.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 45c75010b..ea01d9080 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -534,8 +534,11 @@ def LlamaModel_fast_forward( pass # Embed positions + print(input_ids) + print(input_ids.min(), input_ids.max()) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + print(inputs_embeds) inputs_embeds = inputs_embeds.to(self.config.torch_dtype) From 88435a80de9644703b5206398e76f71671ef9190 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 18:12:45 +1000 Subject: [PATCH 12/69] pad_token --- unsloth/chat_templates.py | 11 +++++++++-- unsloth/tokenizer_utils.py | 6 +++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 2d15470a0..a5d39df27 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -340,9 +340,16 @@ def get_chat_template( new_tokenizer = tokenizer._tokenizer.from_str(string_vocab) if map_eos_token: - new_tokenizer = tokenizer.__class__(tokenizer_object = new_tokenizer, eos_token = stop_word) + new_tokenizer = tokenizer.__class__( + tokenizer_object = new_tokenizer, + eos_token = stop_word, + pad_token = tokenizer.pad_token, + ) else: - new_tokenizer = tokenizer.__class__(tokenizer_object = new_tokenizer) + new_tokenizer = tokenizer.__class__( + tokenizer_object = new_tokenizer, + pad_token = tokenizer.pad_token, + ) pass # Must fix the sentence piece tokenizer since there's no tokenizer.model file! diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 15606a7c9..f1a9daa99 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -259,7 +259,11 @@ def fix_sentencepiece_tokenizer( # And load it! 
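For context, the pad-token care taken here matters because the rebuilt tokenizer is what users feed straight into `apply_chat_template`. A hedged sketch of that calling pattern, with an assumed checkpoint and toy messages:

```python
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct",   # assumed checkpoint
    max_seq_length = 2048,
    load_in_4bit = True,
)

# Attach the llama-3 template registered in PATCH 01; the pad token is carried over.
tokenizer = get_chat_template(tokenizer, chat_template = "llama-3")

messages = [
    {"role": "user",      "content": "Hello!"},
    {"role": "assistant", "content": "Hi there, how can I help?"},
    {"role": "user",      "content": "Describe Unsloth in one sentence."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
print(prompt)
```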
from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(temporary_location, eos_token = new_tokenizer.eos_token) + tokenizer = AutoTokenizer.from_pretrained( + temporary_location, + eos_token = new_tokenizer.eos_token, + pad_token = new_tokenizer.pad_token, + ) return tokenizer pass From 24790e270906fb4bbf46ef0ab96f693509e16d9e Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 22 Apr 2024 04:51:23 +1000 Subject: [PATCH 13/69] Update chat_templates.py --- unsloth/chat_templates.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index a5d39df27..a7b98aaf9 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -339,16 +339,22 @@ def get_chat_template( if skipped != len(token_mapping): new_tokenizer = tokenizer._tokenizer.from_str(string_vocab) + # Careful on pad_token + old_pad_token = tokenizer.pad_token + if old_pad_token == tokenizer.eos_token: + old_pad_token = stop_word + pass + if map_eos_token: new_tokenizer = tokenizer.__class__( tokenizer_object = new_tokenizer, eos_token = stop_word, - pad_token = tokenizer.pad_token, + pad_token = old_pad_token, ) else: new_tokenizer = tokenizer.__class__( tokenizer_object = new_tokenizer, - pad_token = tokenizer.pad_token, + pad_token = old_pad_token, ) pass @@ -384,6 +390,12 @@ def get_chat_template( string_vocab = string_vocab.replace(old_eos_token, stop_word) pass new_tokenizer = tokenizer._tokenizer.from_str(string_vocab) + + # Careful on pad_token + if old_pad_token == old_eos_token: + old_pad_token = stop_word + pass + new_tokenizer = tokenizer.__class__( tokenizer_object = new_tokenizer, bos_token = old_bos_token, From 1464f7da271c8398c7a8db41ac361212088068ab Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 22 Apr 2024 04:54:51 +1000 Subject: [PATCH 14/69] Update chat_templates.py --- unsloth/chat_templates.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index a7b98aaf9..5d8a15e68 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -285,6 +285,8 @@ def get_chat_template( is_fast_tokenizer = getattr(tokenizer, "is_fast", False) old_padding_side = tokenizer.padding_side + same_padding_token = False + if type(chat_template) in (list, tuple,): chat_template, stop_word = chat_template assert(type(chat_template) is str) @@ -343,6 +345,7 @@ def get_chat_template( old_pad_token = tokenizer.pad_token if old_pad_token == tokenizer.eos_token: old_pad_token = stop_word + same_padding_token = True pass if map_eos_token: @@ -394,6 +397,7 @@ def get_chat_template( # Careful on pad_token if old_pad_token == old_eos_token: old_pad_token = stop_word + same_padding_token = True pass new_tokenizer = tokenizer.__class__( @@ -440,9 +444,11 @@ def get_chat_template( new_pad_token = getattr(tokenizer, "pad_token", None) new_bos_token = getattr(tokenizer, "bos_token", None) new_unk_token = getattr(tokenizer, "unk_token", None) - if old_pad_token != new_pad_token: tokenizer.pad_token = old_pad_token if old_bos_token != new_bos_token: tokenizer.bos_token = old_bos_token if old_unk_token != new_unk_token: tokenizer.unk_token = old_unk_token + if same_padding_token: + if old_pad_token != new_pad_token: tokenizer.pad_token = old_pad_token + pass # stopping_criteria = create_stopping_criteria(tokenizer, stop_word) From df069c51f3fa40eef863a60c9cad2e9f4c855dfb Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 22 
Apr 2024 04:57:30 +1000 Subject: [PATCH 15/69] tokenizer --- unsloth/chat_templates.py | 1 + unsloth/models/llama.py | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 5d8a15e68..6686c60f7 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -436,6 +436,7 @@ def get_chat_template( _, tokenizer = patch_tokenizer(model = None, tokenizer = tokenizer) tokenizer.padding_side = old_padding_side tokenizer.chat_template = chat_template + print(tokenizer) # Also fix up other tokens old_pad_token = getattr(old_tokenizer, "pad_token", None) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ea01d9080..45c75010b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -534,11 +534,8 @@ def LlamaModel_fast_forward( pass # Embed positions - print(input_ids) - print(input_ids.min(), input_ids.max()) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - print(inputs_embeds) inputs_embeds = inputs_embeds.to(self.config.torch_dtype) From eb00fb7e77a7076f5cafee7761f0e2575ad7cedd Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 22 Apr 2024 05:01:13 +1000 Subject: [PATCH 16/69] Update save.py --- unsloth/save.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index 655d1c510..6e9d82c88 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -922,9 +922,16 @@ def save_to_gguf( f"The output location will be {final_location}\n"\ "This will take 3 minutes...") + # We first check if tokenizer.model exists in the model_directory + if os.path.exists(f"{model_directory}/tokenizer.model"): + vocab_type = "hfft" + else: + vocab_type = "bpe" + pass + if use_fast_convert: command = f"python llama.cpp/convert.py {model_directory} "\ - f"--outfile {final_location} --vocab-type hfft "\ + f"--outfile {final_location} --vocab-type {vocab_type} "\ f"--outtype {first_conversion} --concurrency {n_cpus}" else: # Need to fix convert-hf-to-gguf.py for some models! 
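For reference, the conversion command built above ends up looking roughly like the sketch below; the directory, output path and f16 output type are assumptions for illustration, and PATCH 23 later simplifies the flag to `--vocab-type spm,hfft,bpe` so convert.py picks the right vocabulary itself.

```python
import os

model_directory = "merged_model"                      # assumed: a merged 16-bit save
final_location  = f"{model_directory}/unsloth.F16.gguf"

# tokenizer.model present -> keep the old "hfft" path; otherwise assume a BPE
# tokenizer such as Llama-3's and pass "bpe" instead.
vocab_type = "hfft" if os.path.exists(f"{model_directory}/tokenizer.model") else "bpe"

command = (
    f"python llama.cpp/convert.py {model_directory} "
    f"--outfile {final_location} --vocab-type {vocab_type} "
    f"--outtype f16"
)
print(command)   # e.g. run via subprocess once llama.cpp has been cloned and built
```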
From 805f890e30cc97f9dd324b12b45bfe129f86af9d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 22 Apr 2024 05:02:08 +1000 Subject: [PATCH 17/69] Update chat_templates.py --- unsloth/chat_templates.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 6686c60f7..5d8a15e68 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -436,7 +436,6 @@ def get_chat_template( _, tokenizer = patch_tokenizer(model = None, tokenizer = tokenizer) tokenizer.padding_side = old_padding_side tokenizer.chat_template = chat_template - print(tokenizer) # Also fix up other tokens old_pad_token = getattr(old_tokenizer, "pad_token", None) From 80be6ff8f2fd6aa3c580628fc5f33eedc9cf7ca7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 22 Apr 2024 05:04:38 +1000 Subject: [PATCH 18/69] Update chat_templates.py --- unsloth/chat_templates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 5d8a15e68..d31b6cf7a 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -446,7 +446,7 @@ def get_chat_template( new_unk_token = getattr(tokenizer, "unk_token", None) if old_bos_token != new_bos_token: tokenizer.bos_token = old_bos_token if old_unk_token != new_unk_token: tokenizer.unk_token = old_unk_token - if same_padding_token: + if not same_padding_token: if old_pad_token != new_pad_token: tokenizer.pad_token = old_pad_token pass From 2e62a6908b5250984dc107eaca3329707733dacc Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 23 Apr 2024 01:51:10 +1000 Subject: [PATCH 19/69] patch tokenizer padding --- unsloth/models/llama.py | 47 +++++++++++++++++++++++++++++++++++++-- unsloth/models/mistral.py | 9 ++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 45c75010b..c6b733e12 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1284,6 +1284,15 @@ def from_pretrained( # Add save modules patch_saving_functions(model) + # Save tokenizer for inference purposes + tokenizer.padding_side = "left" # Force inference + internal_model = model + while hasattr(internal_model, "model"): + internal_model._saved_temp_tokenizer = tokenizer + internal_model = internal_model.model + pass + internal_model._saved_temp_tokenizer = tokenizer + return model, tokenizer pass @@ -1554,6 +1563,18 @@ def get_peft_model( model.model.lm_head.modules_to_save.default.requires_grad_(True) pass + # Patch tokenizer to pad to the right + internal_model = model + while hasattr(internal_model, "model"): + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "right" + pass + internal_model = internal_model.model + pass + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "right" + pass + return model pass @@ -1751,6 +1772,18 @@ def for_inference(model): # Wrap model.generate model._unwrapped_old_generate = model.generate model.generate = _wrap_fast_inference(model.generate, device_type, dtype) + + # Patch tokenizer to pad to the left + internal_model = model + while hasattr(internal_model, "model"): + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "left" + pass + internal_model = internal_model.model + pass + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "left" 
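The left/right switching above follows the usual decoder-only convention: generation wants left padding so the final position of every row is a real token, while training normally pads on the right. A small illustration with a plain Hugging Face tokenizer (the checkpoint name is an assumption):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-bnb-4bit")
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

batch = ["Hi", "A noticeably longer prompt"]

tok.padding_side = "left"    # inference: pads go in front, last token is real
print(tok(batch, padding = True).input_ids)

tok.padding_side = "right"   # training: pads sit after the sequence
print(tok(batch, padding = True).input_ids)
```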
+ pass pass @@ -1777,8 +1810,18 @@ def for_training(model, use_gradient_checkpointing = True): model.generate = model._unwrapped_old_generate del model._unwrapped_old_generate pass + + # Patch tokenizer to pad to the right + internal_model = model + while hasattr(internal_model, "model"): + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "right" + pass + internal_model = internal_model.model + pass + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "right" + pass pass pass - - diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 56108939b..80d0ffdf7 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -559,6 +559,15 @@ def from_pretrained( # Add save modules patch_saving_functions(model) + + # Save tokenizer for inference purposes + tokenizer.padding_side = "left" # Force inference + internal_model = model + while hasattr(internal_model, "model"): + internal_model._saved_temp_tokenizer = tokenizer + internal_model = internal_model.model + pass + internal_model._saved_temp_tokenizer = tokenizer return model, tokenizer pass From b0678d6b8a7a04107967d0264bc2c3989ffb5a75 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 23 Apr 2024 04:41:55 +1000 Subject: [PATCH 20/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index f1a9daa99..5dc5856c2 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -524,7 +524,7 @@ def add_new_tokens( tokenizer, new_tokens = [], method = "mean", - interpolation = 0.05, + interpolation = 0.5, ): """ Smartly resizes the tokenizer and adds new tokens to the model. From f85ef9c0494a277ae4c7fef71444966f732e7586 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 24 Apr 2024 00:03:36 +1000 Subject: [PATCH 21/69] Update save.py --- unsloth/save.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 6e9d82c88..493b8acaa 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -689,7 +689,7 @@ def unsloth_save_model( def install_llama_cpp_clone_non_blocking(): - full_command = ["git", "clone", "https://github.com/ggerganov/llama.cpp"] + full_command = ["git", "clone", "--recursive", "https://github.com/ggerganov/llama.cpp"] run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT) return run_installer pass @@ -742,7 +742,7 @@ def install_llama_cpp_old(version = -10): # Clone a specific commit # Also don't use the GPU! commands = [ - "git clone https://github.com/ggerganov/llama.cpp", + "git clone --recursive https://github.com/ggerganov/llama.cpp", f"cd llama.cpp && git reset --hard {version} && git clean -df", "make clean -C llama.cpp", f"make all -j{psutil.cpu_count()*2} -C llama.cpp", @@ -767,7 +767,7 @@ def install_llama_cpp_blocking(use_cuda = True): use_cuda = "LLAMA_CUDA=1" if use_cuda else "" commands = [ - "git clone https://github.com/ggerganov/llama.cpp", + "git clone --recursive https://github.com/ggerganov/llama.cpp", "make clean -C llama.cpp", f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp", "pip install gguf protobuf", @@ -966,7 +966,7 @@ def save_to_gguf( "You might have to compile llama.cpp yourself, then run this again.\n"\ "You do not need to close this Python program. 
Run the following commands in a new terminal:\n"\ "You must run this in the same folder as you're saving your model.\n"\ - "git clone https://github.com/ggerganov/llama.cpp\n"\ + "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ "cd llama.cpp && make clean && LLAMA_CUDA=1 make all -j\n"\ "Once that's done, redo the quantization." ) @@ -1006,7 +1006,7 @@ def save_to_gguf( "Unsloth: Quantization failed! You might have to compile llama.cpp yourself, then run this again.\n"\ "You do not need to close this Python program. Run the following commands in a new terminal:\n"\ "You must run this in the same folder as you're saving your model.\n"\ - "git clone https://github.com/ggerganov/llama.cpp\n"\ + "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ "cd llama.cpp && make clean && LLAMA_CUDA=1 make all -j\n"\ "Once that's done, redo the quantization." ) From d2f10a0f488e036ea4f66caea77bf41ed7746329 Mon Sep 17 00:00:00 2001 From: Igor Kilbas Date: Wed, 24 Apr 2024 16:57:24 +0400 Subject: [PATCH 22/69] Fix: loading models with resized vocabulary (#377) * new: vocab resize on load * new: gitignore --- .gitignore | 160 +++++++++++++++++++++++++++++++++++++++ unsloth/models/loader.py | 4 + 2 files changed, 164 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..68bc17f9f --- /dev/null +++ b/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
+#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index fa864a9a8..a107200ea 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -76,6 +76,7 @@ def from_pretrained( fix_tokenizer = True, trust_remote_code = False, use_gradient_checkpointing = True, + resize_model_vocab = None, *args, **kwargs, ): if token is None and "HF_TOKEN" in os.environ: @@ -149,6 +150,9 @@ def from_pretrained( trust_remote_code = trust_remote_code, *args, **kwargs, ) + + if resize_model_vocab is not None: + model.resize_token_embeddings(resize_model_vocab) # In case the model supports tagging, add the unsloth tag. if hasattr(model, "add_model_tags"): From f5fa6548c6e694b2f688a74cdb9da04ed7cf7603 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 28 Apr 2024 20:08:22 +1000 Subject: [PATCH 23/69] GGUF fix --- unsloth/chat_templates.py | 11 +++++++++++ unsloth/models/llama.py | 4 ++++ unsloth/save.py | 9 +-------- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index d31b6cf7a..4e7a71aee 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -281,6 +281,17 @@ def get_chat_template( IS_GEMMA = True pass + # We add a check for Llama-3 + # if chat_template == "llama-3": + # tokenizer._using_llama3_template = True + # else: + # llama3_tokens = set(["<|end_header_id|>", "<|eot_id|>", "<|start_header_id|>"]) + # check_llama3_tokens = llama3_tokens & set(str(x) for x in tokenizer.added_tokens_decoder.values()) + # if len(check_llama3_tokens) == len(llama3_tokens): + # tokenizer._using_llama3_template = True + # pass + # pass + # We first check if the tokenizer is a fast one. If not, we cannot convert this! 
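A hedged usage sketch for the `resize_model_vocab` argument added in PATCH 22: it resizes the freshly loaded model's token embeddings, for example to match an adapter that was trained after new tokens were added. The model name and target size below are illustrative assumptions:

```python
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = 2048,
    load_in_4bit = True,
    resize_model_vocab = 128260,   # assumed target: Llama-3's 128256 tokens plus 4 new ones
)
```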
is_fast_tokenizer = getattr(tokenizer, "is_fast", False) old_padding_side = tokenizer.padding_side diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index c6b733e12..a7cacea9b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1543,8 +1543,12 @@ def get_peft_model( if not SUPPORTS_LOFTQ: del arguments["loftq_config"] if not SUPPORTS_RSLORA: del arguments["use_rslora"] + _saved_temp_tokenizer = model._saved_temp_tokenizer + lora_config = LoraConfig(**arguments) model = _get_peft_model(model, lora_config) + + model._saved_temp_tokenizer = _saved_temp_tokenizer model = FastLlamaModel.patch_peft_model(model, use_gradient_checkpointing) diff --git a/unsloth/save.py b/unsloth/save.py index 493b8acaa..a2c55bb53 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -922,16 +922,9 @@ def save_to_gguf( f"The output location will be {final_location}\n"\ "This will take 3 minutes...") - # We first check if tokenizer.model exists in the model_directory - if os.path.exists(f"{model_directory}/tokenizer.model"): - vocab_type = "hfft" - else: - vocab_type = "bpe" - pass - if use_fast_convert: command = f"python llama.cpp/convert.py {model_directory} "\ - f"--outfile {final_location} --vocab-type {vocab_type} "\ + f"--outfile {final_location} --vocab-type spm,hfft,bpe "\ f"--outtype {first_conversion} --concurrency {n_cpus}" else: # Need to fix convert-hf-to-gguf.py for some models! From 8325e05dc401e1fed81bedb783b47795a273486d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 29 Apr 2024 03:52:04 +1000 Subject: [PATCH 24/69] Readme (#390) * Update README.md * Update README.md --------- Co-authored-by: Michael Han <107991372+shimmyshimmer@users.noreply.github.com> --- README.md | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 555e08089..2a9499c22 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ -### Finetune Mistral, Gemma, Llama 2-5x faster with 80% less memory! +### Finetune Llama 3, Mistral & Gemma 2-5x faster with 80% less memory! ![](https://i.ibb.co/sJ7RhGG/image-41.png) @@ -22,12 +22,11 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------| -| **Llama-3 8b** | [▶️ Start on Colab](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | -| **Gemma 7b** | [▶️ Start on Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | -| **Mistral 7b** | [▶️ Start on Colab](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | -| **TinyLlama** | [▶️ Start on Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 82% less | -| **CodeLlama 34b** A100 | [▶️ Start on Colab](https://colab.research.google.com/drive/1y7A0AxE3y8gdj4AVkl2aZX47Xu3P1wJT?usp=sharing) | 1.9x faster | 49% less | -| **Mistral 7b** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) | 5x faster\* | 73% less | +| **Llama 3 (8B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Mistral (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | +| **Gemma (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | +| **Llama 3 (8B)** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook) | 5x faster\* | 73% less | +| **ORPO** | [▶️ Start on Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **DPO - Zephyr** | [▶️ Start on Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | - Benchmarking compared to FA2 + Hugging Face combined. @@ -36,7 +35,8 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - \* Kaggle has 2x T4s, but we use 1. Due to overhead, 1x T4 is 5x faster. ## 🦥 Unsloth.ai News -- 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (just change the model name in the notebook). +- 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). +- 📣 NEW! [ORPO support](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) is here! - 📣 NEW! We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support fine-tuning of LLMs with [4x longer context windows](https://unsloth.ai/blog/long-context)! No change required if you're using our notebooks. 
To enable, simply change 1 line: ```python model = FastLanguageModel.get_peft_model( @@ -46,8 +46,6 @@ model = FastLanguageModel.get_peft_model( ``` - 📣 [CodeGemma](https://colab.research.google.com/drive/19lwcRk_ZQ_ZtX-qzFP3qZBBHZNcMD1hh?usp=sharing) now works along with [Gemma 7b](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) and [Gemma 2b](https://colab.research.google.com/drive/15gGm7x_jTm017_Ic8e317tdIpDG53Mtu?usp=sharing) - 📣 [2x faster inference](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) added for all our models -- 📣 [DPO support](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) is now included. [More info](#DPO) on DPO -- 📣 We did a [blog](https://huggingface.co/blog/unsloth-trl) with 🤗Hugging Face and are in their official docs! Check out the [SFT docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth) and [DPO docs](https://huggingface.co/docs/trl/main/en/dpo_trainer#accelerate-dpo-fine-tuning-using-unsloth) ## 🔗 Links and Resources | Type | Links | From 13b1ae6b93b53bf38c0ebb2acf9673d382fc2d17 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 29 Apr 2024 04:21:45 +1000 Subject: [PATCH 25/69] Update README.md --- README.md | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 2a9499c22..6df661622 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **Mistral (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | | **Gemma (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | | **Llama 3 (8B)** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook) | 5x faster\* | 73% less | -| **ORPO** | [▶️ Start on Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | +| **ORPO** | [▶️ Start on Colab](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | | **DPO - Zephyr** | [▶️ Start on Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | - Benchmarking compared to FA2 + Hugging Face combined. @@ -36,7 +36,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and ## 🦥 Unsloth.ai News - 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). -- 📣 NEW! [ORPO support](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) is here! +- 📣 NEW! [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! - 📣 NEW! We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support fine-tuning of LLMs with [4x longer context windows](https://unsloth.ai/blog/long-context)! No change required if you're using our notebooks. To enable, simply change 1 line: ```python model = FastLanguageModel.get_peft_model( @@ -180,18 +180,20 @@ max_seq_length = 2048 # Supports RoPE Scaling interally, so choose any! 
url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl" dataset = load_dataset("json", data_files = {"train" : url}, split = "train") -# 4bit pre quantized models we support - 4x faster downloading! +# 4bit pre quantized models we support for 4x faster downloading + no OOMs. fourbit_models = [ "unsloth/mistral-7b-bnb-4bit", + "unsloth/mistral-7b-instruct-v0.2-bnb-4bit", "unsloth/llama-2-7b-bnb-4bit", - "unsloth/llama-2-13b-bnb-4bit", - "unsloth/codellama-34b-bnb-4bit", - "unsloth/tinyllama-bnb-4bit", -] # Go to https://huggingface.co/unsloth for more 4-bit models! + "unsloth/gemma-7b-bnb-4bit", + "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b + "unsloth/gemma-2b-bnb-4bit", + "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b + "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3 +] # More models at https://huggingface.co/unsloth -# Load Llama model model, tokenizer = FastLanguageModel.from_pretrained( - model_name = "unsloth/mistral-7b-bnb-4bit", # Supports Llama, Mistral - replace this! + model_name = "unsloth/llama-3-8b-bnb-4bit", max_seq_length = max_seq_length, dtype = None, load_in_4bit = True, @@ -206,7 +208,8 @@ model = FastLanguageModel.get_peft_model( lora_alpha = 16, lora_dropout = 0, # Supports any, but = 0 is optimized bias = "none", # Supports any, but = "none" is optimized - use_gradient_checkpointing = True, + # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes! + use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context random_state = 3407, max_seq_length = max_seq_length, use_rslora = False, # We support rank stabilized LoRA @@ -270,7 +273,8 @@ model = FastLanguageModel.get_peft_model( lora_alpha = 64, lora_dropout = 0, # Supports any, but = 0 is optimized bias = "none", # Supports any, but = "none" is optimized - use_gradient_checkpointing = True, + # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes! + use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context random_state = 3407, max_seq_length = max_seq_length, ) From 5069a7da39a51498154b740e7faa591a4343700c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 29 Apr 2024 04:39:33 +1000 Subject: [PATCH 26/69] Delete .gitignore --- .gitignore | 160 ----------------------------------------------------- 1 file changed, 160 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 68bc17f9f..000000000 --- a/.gitignore +++ /dev/null @@ -1,160 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ From 7c9c3f5fec572d63f72b2ba290c8043d74efd486 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:40:53 +1000 Subject: [PATCH 27/69] Phi-3 --- README.md | 8 ++++++-- unsloth/models/mapper.py | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6df661622..ca5419a20 100644 --- a/README.md +++ b/README.md @@ -25,18 +25,21 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and | **Llama 3 (8B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | | **Gemma (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | -| **Llama 3 (8B)** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook) | 5x faster\* | 73% less | +| **Llama 3 (8B)** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook) | 2x faster | 60% less | | **ORPO** | [▶️ Start on Colab](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | | **DPO - Zephyr** | [▶️ Start on Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | +| **Phi-3 (3.8B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | 3.9x faster | 74% less | +| **TinyLlama (1.1B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 1.9x faster | 43% less | - Benchmarking compared to FA2 + Hugging Face combined. - This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for raw text. This [DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. -- \* Kaggle has 2x T4s, but we use 1. Due to overhead, 1x T4 is 5x faster. +- Other Kaggle Notebooks for [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) ## 🦥 Unsloth.ai News - 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). - 📣 NEW! [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! +- 📣 NEW! [Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! - 📣 NEW! We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support fine-tuning of LLMs with [4x longer context windows](https://unsloth.ai/blog/long-context)! No change required if you're using our notebooks. 
To enable, simply change 1 line: ```python model = FastLanguageModel.get_peft_model( @@ -190,6 +193,7 @@ fourbit_models = [ "unsloth/gemma-2b-bnb-4bit", "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3 + "unsloth/Phi-3-mini-4k-instruct-bnb-4bit", ] # More models at https://huggingface.co/unsloth model, tokenizer = FastLanguageModel.from_pretrained( diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 769cbff53..b4fbe5738 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -140,6 +140,10 @@ "unsloth/llama-3-70b-Instruct-bnb-4bit" : ( "meta-llama/Meta-Llama-3-70B-Instruct", ), + "unsloth/Phi-3-mini-4k-instruct-bnb-4bit" : ( + "unsloth/Phi-3-mini-4k-instruct", + "microsoft/Phi-3-mini-4k-instruct", + ), } INT_TO_FLOAT_MAPPER = {} From 7b696ee6a930f1d65f7e108b636dcee0d4058265 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:44:15 +1000 Subject: [PATCH 28/69] Update README.md --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index ca5419a20..8147d44c4 100644 --- a/README.md +++ b/README.md @@ -20,16 +20,16 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. -| Unsloth supports | Free Notebooks | Performance | Memory use | +| Unsloth supports | Notebooks | Performance | Memory use | |-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------| -| **Llama 3 (8B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | -| **Mistral (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | -| **Gemma (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | +| **Llama 3 (8B)** | [▶️ Start Free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Mistral (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | +| **Gemma (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | | **Llama 3 (8B)** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook) | 2x faster | 60% less | -| **ORPO** | [▶️ Start on Colab](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | -| **DPO - Zephyr** | [▶️ Start on Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | -| **Phi-3 (3.8B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | 3.9x faster | 74% less | -| **TinyLlama (1.1B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 1.9x faster | 43% less | +| **ORPO** | [▶️ Start Free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | +| **DPO - 
Zephyr** | [▶️ Start Free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | +| **Phi-3 (3.8B)** | [▶️ Start Free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | +| **TinyLlama (1.1B)** | [▶️ Start Free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. - This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. From 48334f7d99a0ece72a61d7e2278dae8c3eb736cb Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:46:17 +1000 Subject: [PATCH 29/69] Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 8147d44c4..f46c826d6 100644 --- a/README.md +++ b/README.md @@ -20,12 +20,11 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. -| Unsloth supports | Notebooks | Performance | Memory use | +| Unsloth supports | Colab | Performance | Memory use | |-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------| | **Llama 3 (8B)** | [▶️ Start Free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | | **Gemma (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | -| **Llama 3 (8B)** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook) | 2x faster | 60% less | | **ORPO** | [▶️ Start Free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | | **DPO - Zephyr** | [▶️ Start Free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **Phi-3 (3.8B)** | [▶️ Start Free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | From 3665c0bb370e2dc23c73c176fd000d49d87bafaf Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:48:22 +1000 Subject: [PATCH 30/69] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f46c826d6..3cbd63bee 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. 
-| Unsloth supports | Colab | Performance | Memory use | +| Unsloth for | Colab | Speed | Memory use | |-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------| | **Llama 3 (8B)** | [▶️ Start Free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | @@ -28,7 +28,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **ORPO** | [▶️ Start Free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | | **DPO - Zephyr** | [▶️ Start Free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **Phi-3 (3.8B)** | [▶️ Start Free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | -| **TinyLlama (1.1B)** | [▶️ Start Free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | +| **TinyLlama** | [▶️ Start Free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. - This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. From 0f9e073c8cf2a067868fd957e8a24572e0e1c802 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:48:57 +1000 Subject: [PATCH 31/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3cbd63bee..85ba5fd99 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **Mistral (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | | **Gemma (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | | **ORPO** | [▶️ Start Free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | -| **DPO - Zephyr** | [▶️ Start Free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | +| **DPO Zephyr** | [▶️ Start Free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **Phi-3 (3.8B)** | [▶️ Start Free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | | **TinyLlama** | [▶️ Start Free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | From eb135d8831ce1bc5a6ce463962d963c011b6c8fd Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:50:31 +1000 Subject: [PATCH 32/69] Update README.md --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 85ba5fd99..d6a72e4fd 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ unsloth logo - + @@ -20,15 +20,15 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. -| Unsloth for | Colab | Speed | Memory use | +| Unsloth for | Colab | Speed | Memory | |-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------| -| **Llama 3 (8B)** | [▶️ Start Free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | -| **Mistral (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | -| **Gemma (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | -| **ORPO** | [▶️ Start Free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | -| **DPO Zephyr** | [▶️ Start Free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | -| **Phi-3 (3.8B)** | [▶️ Start Free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | -| **TinyLlama** | [▶️ Start Free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | +| **Llama 3 (8B)** | [▶️ Start free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | - 60% | +| **Mistral (7B)** | [▶️ Start free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | - 73% | +| **Gemma (7B)** | [▶️ Start free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | - 71% | +| **ORPO** | [▶️ Start free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | - 43% | +| **DPO Zephyr** | [▶️ Start free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | - 43% | +| **Phi-3 (3.8B)** | [▶️ Start free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | +| **TinyLlama** | [▶️ Start free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | - 74% | - Benchmarking compared to FA2 + Hugging Face combined. - This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. From 56e2674e1ebfbf512972517967407b5ea57002e1 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:53:19 +1000 Subject: [PATCH 33/69] Update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index d6a72e4fd..45119920b 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ unsloth logo - + @@ -20,15 +20,15 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. 
-| Unsloth for | Colab | Speed | Memory | -|-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------| -| **Llama 3 (8B)** | [▶️ Start free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | - 60% | -| **Mistral (7B)** | [▶️ Start free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | - 73% | -| **Gemma (7B)** | [▶️ Start free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | - 71% | -| **ORPO** | [▶️ Start free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | - 43% | -| **DPO Zephyr** | [▶️ Start free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | - 43% | -| **Phi-3 (3.8B)** | [▶️ Start free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | -| **TinyLlama** | [▶️ Start free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | - 74% | +| Unsloth for | Free Notebooks | Performance | Memory use | +|-----------|---------|--------|----------| +| **Llama 3 (8B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Mistral (7B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | +| **Gemma (7B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | +| **ORPO** | [▶️ Start free finetune](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | +| **DPO Zephyr** | [▶️ Start free finetune](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | +| **Phi-3 (3.8B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | +| **TinyLlama** | [▶️ Start free finetune](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. - This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. From b091a0b6bc9ffd7827c3db8e58e1b0febaf05ef9 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:54:41 +1000 Subject: [PATCH 34/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 45119920b..190b30ce5 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,9 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **TinyLlama** | [▶️ Start free finetune](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. 
+- **Kaggle Notebooks** for [Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) - This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for raw text. This [DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. -- Other Kaggle Notebooks for [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) ## 🦥 Unsloth.ai News - 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). From 18533ab8b909d6066c76a3b073f286260e1c5fff Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:55:48 +1000 Subject: [PATCH 35/69] Update README.md --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 190b30ce5..6a866f05e 100644 --- a/README.md +++ b/README.md @@ -31,14 +31,14 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **TinyLlama** | [▶️ Start free finetune](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. -- **Kaggle Notebooks** for [Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. -- This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for raw text. This [DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. +- **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) +- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. +- This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. ## 🦥 Unsloth.ai News -- 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). -- 📣 NEW! 
[ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! -- 📣 NEW! [Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! +- 📣 NEW! [▶️ Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). +- 📣 NEW! [▶️ ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! +- 📣 NEW! [▶️ Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! - 📣 NEW! We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support fine-tuning of LLMs with [4x longer context windows](https://unsloth.ai/blog/long-context)! No change required if you're using our notebooks. To enable, simply change 1 line: ```python model = FastLanguageModel.get_peft_model( @@ -46,8 +46,8 @@ model = FastLanguageModel.get_peft_model( use_gradient_checkpointing = "unsloth", # <<<<<<< ) ``` -- 📣 [CodeGemma](https://colab.research.google.com/drive/19lwcRk_ZQ_ZtX-qzFP3qZBBHZNcMD1hh?usp=sharing) now works along with [Gemma 7b](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) and [Gemma 2b](https://colab.research.google.com/drive/15gGm7x_jTm017_Ic8e317tdIpDG53Mtu?usp=sharing) -- 📣 [2x faster inference](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) added for all our models +- 📣 [▶️ CodeGemma](https://colab.research.google.com/drive/19lwcRk_ZQ_ZtX-qzFP3qZBBHZNcMD1hh?usp=sharing) now works along with [▶️ Gemma 7b](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) and [▶️ Gemma 2b](https://colab.research.google.com/drive/15gGm7x_jTm017_Ic8e317tdIpDG53Mtu?usp=sharing) +- 📣 [▶️ 2x faster inference](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) added for all our models ## 🔗 Links and Resources | Type | Links | From 3e84338c693acbf1e698a5f7b4f21f29c34a48d7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:57:08 +1000 Subject: [PATCH 36/69] Update README.md --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 6a866f05e..2a893bcbb 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ unsloth logo - + @@ -22,13 +22,13 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and | Unsloth for | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| -| **Llama 3 (8B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | -| **Mistral (7B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | -| **Gemma (7B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | -| **ORPO** | [▶️ Start free finetune](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | -| **DPO Zephyr** | [▶️ Start free finetune](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | -| **Phi-3 (3.8B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | -| **TinyLlama** | [▶️ Start free finetune](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | +| **Llama 3 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Mistral (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | +| **Gemma (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | +| **ORPO** | [▶️ Start for free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | +| **DPO Zephyr** | [▶️ Start for free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | +| **Phi-3 (3.8B)** | [▶️ Start for free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | +| **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. - **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) From d8feef5824699925dba90ca40553aac17d23e2ae Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:58:28 +1000 Subject: [PATCH 37/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2a893bcbb..523f4a55e 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Benchmarking compared to FA2 + Hugging Face combined. 
- **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) - This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. -- This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. +- This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. ## 🦥 Unsloth.ai News - 📣 NEW! [▶️ Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). From 392c034c7835e5e266bcc795ee89f45e06a430b1 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:01:51 +1000 Subject: [PATCH 38/69] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 523f4a55e..3893be43e 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Benchmarking compared to FA2 + Hugging Face combined. - **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. +- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML templates for Llama-3. For [▶️ conversational Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). - This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. 
## 🦥 Unsloth.ai News @@ -47,7 +47,7 @@ model = FastLanguageModel.get_peft_model( ) ``` - 📣 [▶️ CodeGemma](https://colab.research.google.com/drive/19lwcRk_ZQ_ZtX-qzFP3qZBBHZNcMD1hh?usp=sharing) now works along with [▶️ Gemma 7b](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) and [▶️ Gemma 2b](https://colab.research.google.com/drive/15gGm7x_jTm017_Ic8e317tdIpDG53Mtu?usp=sharing) -- 📣 [▶️ 2x faster inference](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) added for all our models +- 📣 [▶️ 2x faster inference](https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing) added for all our models ## 🔗 Links and Resources | Type | Links | From df6fb5291602687fb3da387411ef6bf797f0fdad Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:02:54 +1000 Subject: [PATCH 39/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3893be43e..62cf4fccc 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Benchmarking compared to FA2 + Hugging Face combined. - **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML templates for Llama-3. For [▶️ conversational Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). +- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. For [▶️ ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). - This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. ## 🦥 Unsloth.ai News From 99ed47a6fbdbb031eabeb5efaba3afab7e017459 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:03:08 +1000 Subject: [PATCH 40/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 62cf4fccc..a72dc313e 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Benchmarking compared to FA2 + Hugging Face combined. - **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. For [▶️ ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). 
+- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. And [▶️ ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). - This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. ## 🦥 Unsloth.ai News From 7fae556c3ee78fcf8ee3bf950941e4966c95e16a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:07:15 +1000 Subject: [PATCH 41/69] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a72dc313e..9f54df092 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. -| Unsloth for | Free Notebooks | Performance | Memory use | +| Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| | **Llama 3 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | @@ -33,7 +33,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Benchmarking compared to FA2 + Hugging Face combined. - **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) - This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. And [▶️ ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). -- This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. +- This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. ## 🦥 Unsloth.ai News - 📣 NEW! [▶️ Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). From 000d050c1da66587cf5166964455dabba54d3ed0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:08:07 +1000 Subject: [PATCH 42/69] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9f54df092..c13e69aa5 100644 --- a/README.md +++ b/README.md @@ -36,9 +36,9 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and - This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. ## 🦥 Unsloth.ai News -- 📣 NEW! [▶️ Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). -- 📣 NEW! [▶️ ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! -- 📣 NEW! [▶️ Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! +- 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). +- 📣 NEW! [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! +- 📣 NEW! [Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! - 📣 NEW! We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support fine-tuning of LLMs with [4x longer context windows](https://unsloth.ai/blog/long-context)! No change required if you're using our notebooks. To enable, simply change 1 line: ```python model = FastLanguageModel.get_peft_model( @@ -46,8 +46,8 @@ model = FastLanguageModel.get_peft_model( use_gradient_checkpointing = "unsloth", # <<<<<<< ) ``` -- 📣 [▶️ CodeGemma](https://colab.research.google.com/drive/19lwcRk_ZQ_ZtX-qzFP3qZBBHZNcMD1hh?usp=sharing) now works along with [▶️ Gemma 7b](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) and [▶️ Gemma 2b](https://colab.research.google.com/drive/15gGm7x_jTm017_Ic8e317tdIpDG53Mtu?usp=sharing) -- 📣 [▶️ 2x faster inference](https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing) added for all our models +- 📣 [CodeGemma](https://colab.research.google.com/drive/19lwcRk_ZQ_ZtX-qzFP3qZBBHZNcMD1hh?usp=sharing) now works along with [Gemma 7b](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) and [Gemma 2b](https://colab.research.google.com/drive/15gGm7x_jTm017_Ic8e317tdIpDG53Mtu?usp=sharing) +- 📣 [2x faster inference](https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing) added for all our models ## 🔗 Links and Resources | Type | Links | From 27f88f0ab040b66c06294da4f0c2e262601bcdbf Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:08:41 +1000 Subject: [PATCH 43/69] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c13e69aa5..f207d69ee 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,9 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. 
-- **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. And [▶️ ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). -- This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. +- **Kaggle Notebooks** for [Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) +- This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. And [ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). +- This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. ## 🦥 Unsloth.ai News - 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). From affbba181aefd2506d60354cf1d1182871fd3b79 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:09:42 +1000 Subject: [PATCH 44/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f207d69ee..ae3ad69d9 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Benchmarking compared to FA2 + Hugging Face combined. - **Kaggle Notebooks** for [Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. And [ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). +- This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. And ChatML for [Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. ## 🦥 Unsloth.ai News From 14f104ad558070f3f8068d21515d63df57bedadf Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 05:58:12 +1000 Subject: [PATCH 45/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ae3ad69d9..07bcf2cf5 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and | **Gemma (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | | **ORPO** | [▶️ Start for free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | | **DPO Zephyr** | [▶️ Start for free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | -| **Phi-3 (3.8B)** | [▶️ Start for free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | +| **Phi-3 (3.8B)** | [▶️ Start for free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | 2x faster | 50% less | | **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. From e040d18691af0f1852fca21ff7c42d36829d5456 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 04:07:11 +1000 Subject: [PATCH 46/69] Fix reserved tokens --- unsloth/models/llama.py | 14 +++++++++---- unsloth/save.py | 31 ++++++++++++++++++--------- unsloth/tokenizer_utils.py | 43 +++++++++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 15 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index a7cacea9b..136ceb2c7 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1503,10 +1503,16 @@ def get_peft_model( pass pass + # Check for Llama-3 + # if hasattr(model._saved_temp_tokenizer, "_using_llama3_template"): + # if not train_embed_tokens and not train_lm_head: + # raise RuntimeError("") + # First fix untrained tokens - if train_embed_tokens or train_lm_head: - fix_untrained_tokens(model, eps = 1e-16) - pass + # Wrong - can cause reserved tokens to pop out!! + # if train_embed_tokens or train_lm_head: + # fix_untrained_tokens(model, eps = 1e-16) + # pass # Check modules_to_save if modules_to_save is not None: @@ -1547,7 +1553,7 @@ def get_peft_model( lora_config = LoraConfig(**arguments) model = _get_peft_model(model, lora_config) - + model._saved_temp_tokenizer = _saved_temp_tokenizer model = FastLlamaModel.patch_peft_model(model, use_gradient_checkpointing) diff --git a/unsloth/save.py b/unsloth/save.py index 0e131fe30..a5ceb1299 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -118,14 +118,14 @@ def _merge_lora(layer, name): W = fast_dequantize(W, quant_state) else: dtype = W.dtype - # W = W.to(torch.float32).t() - W = W.t() + W = W.to(torch.float32).t() + # W = W.t() if A is not None: # sAB = (A.t().to(torch.float32) @ (s * B.t().to(torch.float32))) # W += sAB - # W.addmm_(A.t().to(torch.float32), B.t().to(torch.float32), alpha = s) - W.addmm_(A.t().to(W.dtype), B.t().to(W.dtype), alpha = s) + W.addmm_(A.t().to(torch.float32), B.t().to(torch.float32), alpha = s) + # W.addmm_(A.t().to(W.dtype), B.t().to(W.dtype), alpha = s) # if not torch.isfinite(W).all(): maximum_element = torch.max(W.min().abs(), W.max()) if not torch.isfinite(maximum_element).item(): @@ -696,12 +696,18 @@ def install_llama_cpp_clone_non_blocking(): def install_llama_cpp_make_non_blocking(): - env = { **os.environ, "LLAMA_CUDA": "1", } + # https://github.com/ggerganov/llama.cpp/issues/7062 + # Weirdly GPU conversion for GGUF breaks?? 
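As a side note on the `_merge_lora` change above (moving the merge back to float32): a minimal stand-alone sketch of that arithmetic, with toy shapes and names chosen only for illustration, is shown below. The base weight is upcast, the scaled `A^T @ B^T` update is accumulated in float32, and the result is cast back to the original dtype with an overflow check.

```python
import torch

def merge_lora_fp32(W, A, B, s):
    # Sketch only: W is the (out, in) base weight, A is lora_A (r, in),
    # B is lora_B (out, r), s is the LoRA scaling. Accumulate in float32,
    # then cast back, checking for overflow much like the patch does.
    dtype = W.dtype
    W32 = W.to(torch.float32).t()                                        # (in, out)
    W32.addmm_(A.t().to(torch.float32), B.t().to(torch.float32), alpha = s)
    maximum_element = torch.max(W32.min().abs(), W32.max())
    if not torch.isfinite(maximum_element).item():
        raise ValueError("Merged weight overflowed; check the scaling s.")
    return W32.t().to(dtype)                                             # back to (out, in)

# Toy usage with bfloat16 tensors
W = torch.randn(8, 4, dtype = torch.bfloat16)
A = torch.randn(2, 4, dtype = torch.bfloat16)
B = torch.randn(8, 2, dtype = torch.bfloat16)
merged = merge_lora_fp32(W, A, B, s = 0.5)
```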
+ # env = { **os.environ, "LLAMA_CUDA": "1", } n_jobs = max(int(psutil.cpu_count()*1.5), 1) # Force make clean os.system("make clean -C llama.cpp") full_command = ["make", "all", "-j"+str(n_jobs), "-C", "llama.cpp"] - run_installer = subprocess.Popen(full_command, env = env, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT) + + # https://github.com/ggerganov/llama.cpp/issues/7062 + # Weirdly GPU conversion for GGUF breaks?? + # run_installer = subprocess.Popen(full_command, env = env, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT) + run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT) return run_installer pass @@ -764,12 +770,17 @@ def install_llama_cpp_old(version = -10): def install_llama_cpp_blocking(use_cuda = True): - use_cuda = "LLAMA_CUDA=1" if use_cuda else "" + # https://github.com/ggerganov/llama.cpp/issues/7062 + # Weirdly GPU conversion for GGUF breaks?? + # use_cuda = "LLAMA_CUDA=1" if use_cuda else "" commands = [ "git clone --recursive https://github.com/ggerganov/llama.cpp", "make clean -C llama.cpp", - f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp", + # https://github.com/ggerganov/llama.cpp/issues/7062 + # Weirdly GPU conversion for GGUF breaks?? + # f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp", + f"make all -j{psutil.cpu_count()*2} -C llama.cpp", "pip install gguf protobuf", ] if os.path.exists("llama.cpp"): return @@ -967,7 +978,7 @@ def save_to_gguf( "You do not need to close this Python program. Run the following commands in a new terminal:\n"\ "You must run this in the same folder as you're saving your model.\n"\ "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ - "cd llama.cpp && make clean && LLAMA_CUDA=1 make all -j\n"\ + "cd llama.cpp && make clean && make all -j\n"\ "Once that's done, redo the quantization." ) pass @@ -1007,7 +1018,7 @@ def save_to_gguf( "You do not need to close this Python program. Run the following commands in a new terminal:\n"\ "You must run this in the same folder as you're saving your model.\n"\ "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ - "cd llama.cpp && make clean && LLAMA_CUDA=1 make all -j\n"\ + "cd llama.cpp && make clean && make all -j\n"\ "Once that's done, redo the quantization." ) pass diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 5dc5856c2..fe2b2d837 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -518,6 +518,44 @@ def fix_untrained_tokens(model, eps = 1e-16): pass +@torch.inference_mode +def mean_of_trained_tokens(model, eps = 1e-16): + """ + Llama-3 for eg has untrained vectors in the base model. + These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> + We reset them to the mean of the rest of the tokens + """ + embedding_matrix = model.get_input_embeddings ().weight.data.clone() + lm_head_matrix = model.get_output_embeddings().weight.data.clone() + + # Get untrained tokens + indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps + where_untrained = torch.where(indicator_untrained)[0] + n_untrained = where_untrained.shape[0] + n_trained = embedding_matrix.shape[0] - n_untrained + if n_untrained != 0: + print( + f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ + "We shall set them to the mean of the other trained tokens." + ) + pass + + # First set untrained to all 0s - sometimes it's not! 
1e-23 for bfloat16 + embedding_matrix[where_untrained] = 0 + lm_head_matrix [where_untrained] = 0 + + # Find sum + sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) + sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) + + # Find correct average by dividing by sum of trained tokens + mean_embedding = (sum_embedding / n_trained).to(embedding_matrix.dtype) + mean_lm_head = (sum_lm_head / n_trained).to(lm_head_matrix .dtype) + + return mean_embedding, mean_lm_head +pass + + @torch.inference_mode def add_new_tokens( model, @@ -547,7 +585,10 @@ def add_new_tokens( pass # Get mean of trained tokens - mean_embedding, mean_lm_head = fix_untrained_tokens(model) + # mean_embedding, mean_lm_head = fix_untrained_tokens(model) + + # Weirdly be careful reserved tokens can pop out + mean_embedding, mean_lm_head = mean_of_trained_tokens(model) mean_embedding = mean_embedding.to(torch.float32) mean_lm_head = mean_lm_head .to(torch.float32) From f53944a44ceb21acd7efe40034160bfd6db814ff Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 04:21:40 +1000 Subject: [PATCH 47/69] Update save.py --- unsloth/save.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unsloth/save.py b/unsloth/save.py index a5ceb1299..e50f0d34d 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -844,6 +844,12 @@ def save_to_gguf( first_conversion : str = "f16", _run_installer = None, # Non blocking install of llama.cpp ): + logger.warning( + "WARNING: llama.cpp GGUF conversion is currently unstable, since llama.cpp is\n"\ + "undergoing some major bug fixes as at 5th of May 2024. This is not an Unsloth issue.\n"\ + "Please be patient - GGUF saving should still work, but might not work as well." + ) + from transformers.models.llama.modeling_llama import logger if quantization_method.startswith("iq2"): From 70b41d1fc1f504df90be04f889bf2f1ac7613b47 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 04:49:13 +1000 Subject: [PATCH 48/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 42 +++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index fe2b2d837..21450ff73 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -25,7 +25,6 @@ "load_correct_tokenizer", "fix_sentencepiece_tokenizer", "check_tokenizer", - "fix_untrained_tokens", "add_new_tokens", ] @@ -636,3 +635,44 @@ def add_new_tokens( return pass + + +def fix_sft_trainer_tokenizer(): + """ + Fixes double adding BOS tokens like in llama-3 + """ + from inspect import getsource + import trl.trainer.sft_trainer + from trl.trainer.sft_trainer import * + + for function_name, replacer in ( + ("_prepare_non_packed_dataloader", "def tokenize(element):",), + ("_prepare_packed_dataloader", "if dataset_text_field is not None",), + ): + function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}")) + where = function.find("def") + function = function.split("\n") + function = "\n".join(x[where:] for x in function) + + check_text = \ + "\n"\ + "print(1)\n"\ + "test_text = dataset[0][dataset_text_field] if not use_formatting_func else formatting_func(dataset[0])\n"\ + "chat_template = getattr(tokenizer, 'chat_template', None)\n"\ + "chat_template = '' if chat_template is None else chat_template\n"\ + "has_bos_token_already = tokenizer.bos_token in test_text or tokenizer.bos_token in chat_template\n"\ + "add_special_tokens = False if has_bos_token_already 
else add_special_tokens\n\n" + + check_text = check_text.split("\n") + check_text = "\n".join(" "*where + x for x in check_text) + + function = function.replace(replacer, check_text + replacer) + exec(function, globals()) + + # Replace TRL's SFTTrainer + exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals()) + pass +pass + +# Fixes double adding BOS tokens like in llama-3 +fix_sft_trainer_tokenizer() From 1b1b931260efcbb5fda84b073d7e6dfecb9245ac Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 04:51:48 +1000 Subject: [PATCH 49/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 21450ff73..58e36c674 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -637,14 +637,14 @@ def add_new_tokens( pass +from inspect import getsource +import trl.trainer.sft_trainer +from trl.trainer.sft_trainer import * + def fix_sft_trainer_tokenizer(): """ Fixes double adding BOS tokens like in llama-3 """ - from inspect import getsource - import trl.trainer.sft_trainer - from trl.trainer.sft_trainer import * - for function_name, replacer in ( ("_prepare_non_packed_dataloader", "def tokenize(element):",), ("_prepare_packed_dataloader", "if dataset_text_field is not None",), From 61edc3cfbf048c3b73f6abb2d28bc62845a04acf Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 04:56:07 +1000 Subject: [PATCH 50/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 58e36c674..43014df8f 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -656,8 +656,7 @@ def fix_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print(1)\n"\ - "test_text = dataset[0][dataset_text_field] if not use_formatting_func else formatting_func(dataset[0])\n"\ + "test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])\n"\ "chat_template = getattr(tokenizer, 'chat_template', None)\n"\ "chat_template = '' if chat_template is None else chat_template\n"\ "has_bos_token_already = tokenizer.bos_token in test_text or tokenizer.bos_token in chat_template\n"\ From 73df3ee5325dd4544c78441bdc3f093b583c70a6 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 05:00:12 +1000 Subject: [PATCH 51/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 43014df8f..3e78b6b62 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -647,7 +647,7 @@ def fix_sft_trainer_tokenizer(): """ for function_name, replacer in ( ("_prepare_non_packed_dataloader", "def tokenize(element):",), - ("_prepare_packed_dataloader", "if dataset_text_field is not None",), + # ("_prepare_packed_dataloader", "if dataset_text_field is not None",), ): function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}")) where = function.find("def") From 15d78981bba72dbc6b8146b2a4cdb9220259b10a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 05:14:38 +1000 Subject: [PATCH 52/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py 
b/unsloth/tokenizer_utils.py index 3e78b6b62..0d6dadf7d 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -659,7 +659,7 @@ def fix_sft_trainer_tokenizer(): "test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])\n"\ "chat_template = getattr(tokenizer, 'chat_template', None)\n"\ "chat_template = '' if chat_template is None else chat_template\n"\ - "has_bos_token_already = tokenizer.bos_token in test_text or tokenizer.bos_token in chat_template\n"\ + "has_bos_token_already = test_text.startswith(tokenizer.bos_token) or tokenizer.bos_token in chat_template\n"\ "add_special_tokens = False if has_bos_token_already else add_special_tokens\n\n" check_text = check_text.split("\n") From 76ed0a49ff8f532d94e85d6d3305f47b020c6d6f Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 6 May 2024 20:21:48 +1000 Subject: [PATCH 53/69] Update chat_templates.py --- unsloth/chat_templates.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 4e7a71aee..c086c7e87 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -266,6 +266,20 @@ CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token,) +# Phi-3 +phi3_template = \ + "{{ bos_token }}"\ + "{% for message in messages %}"\ + "{% if (message['role'] == 'user') %}"\ + "{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}"\ + "{% elif (message['role'] == 'assistant') %}"\ + "{{message['content'] + '<|end|>' + '\n'}}"\ + "{% endif %}"\ + "{% endfor %}" +phi3_template_eos_token = "<|end|>" +CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token,) + + def get_chat_template( tokenizer, chat_template = "chatml", @@ -595,4 +609,12 @@ def test_chat_templates(): correct_tokenizer.chat_template = template our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) assert(correct_prompt == our_prompt) + + # Phi-3 + template = phi3_templatetemplate + correct_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + correct_tokenizer.chat_template = template + our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + assert(correct_prompt == our_prompt) pass From dfec8dd72842b5c7cac09062e01942c5a9f4062e Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 8 May 2024 06:31:02 +1000 Subject: [PATCH 54/69] Update save.py --- unsloth/save.py | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 868d25de4..b825b10fb 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -18,6 +18,7 @@ from typing import Optional, Callable, Union, List import torch import os +import shutil import pickle import gc from transformers.models.llama.modeling_llama import logger @@ -87,6 +88,24 @@ def print_quantization_methods(): pass +def check_if_sentencepiece_model(model, temporary_location = "_unsloth_sentencepiece_temp"): + if not hasattr(model, "_saved_temp_tokenizer"): return False + + temp_tokenizer = model._saved_temp_tokenizer + sentencepiece_model = False + file_location = f"{temporary_location}/{temp_tokenizer.name_or_path}" + if not os.path.exists(file_location): + os.makedirs(file_location) + pass 
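To make the `has_bos_token_already` guard above concrete, here is a small reproduction of the double-BOS problem it prevents (a sketch: the model name is illustrative and the printed ids are only what one would expect for a Llama-3 style tokenizer).

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct")
text = tokenizer.bos_token + "Hello"  # what a template that emits {{ bos_token }} produces

with_specials    = tokenizer(text, add_special_tokens = True)["input_ids"]
without_specials = tokenizer(text, add_special_tokens = False)["input_ids"]

print(with_specials[:2])     # e.g. [128000, 128000] -> BOS appears twice
print(without_specials[:2])  # e.g. [128000, ...]    -> single BOS, which is what we want
```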
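For the Phi-3 template added above, this is roughly how it renders a short exchange (a sketch: it re-declares the template string from the patch for self-containment, and the tokenizer name is only an example with a `<s>` BOS token).

```python
from transformers import AutoTokenizer

phi3_template = (
    "{{ bos_token }}"
    "{% for message in messages %}"
    "{% if (message['role'] == 'user') %}"
    "{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}"
    "{% elif (message['role'] == 'assistant') %}"
    "{{message['content'] + '<|end|>' + '\n'}}"
    "{% endif %}"
    "{% endfor %}"
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
tokenizer.chat_template = phi3_template

messages = [
    {"role": "user",      "content": "What is 2+2?"},
    {"role": "assistant", "content": "It is 4."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = False)
# Expected roughly: "<s><|user|>\nWhat is 2+2?<|end|>\n<|assistant|>\nIt is 4.<|end|>\n"
print(prompt)
```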
+ temp_tokenizer.save_pretrained(file_location) + if os.path.isfile(f"{file_location}/tokenizer.model"): + sentencepiece_model = True + pass + shutil.rmtree(file_location) + return sentencepiece_model +pass + + def _free_cached_model(model): from huggingface_hub import scan_cache_dir cached_repos = list(scan_cache_dir().repos) @@ -840,6 +859,7 @@ def _fix_gemma_gguf(): def save_to_gguf( model_type : str, + is_sentencepiece : bool = False, model_directory : str = "unsloth_finetuned_model", quantization_method : str = "fast_quantized", first_conversion : str = "f16", @@ -856,7 +876,8 @@ def save_to_gguf( # Careful convert.py is only for Llama / Mistral based archs use_fast_convert = False - if model_type == "llama": use_fast_convert = True + if not is_sentencepiece: use_fast_convert = False # Llama-3 + elif model_type == "llama": use_fast_convert = True elif model_type == "mistral": use_fast_convert = True pass logger.warning_once(f"Unsloth: Converting {model_type} model. Can use fast conversion = {use_fast_convert}.") @@ -951,7 +972,7 @@ def save_to_gguf( f"--outtype {first_conversion} --concurrency {n_cpus}" else: # Need to fix convert-hf-to-gguf.py for some models! - _fix_gemma_gguf() + # _fix_gemma_gguf() command = f"python llama.cpp/convert-hf-to-gguf.py {model_directory} "\ f"--outfile {final_location} "\ @@ -1353,7 +1374,10 @@ def unsloth_save_pretrained_gguf( gc.collect() model_type = self.config.model_type - file_location = save_to_gguf(model_type, new_save_directory, quantization_method, first_conversion, makefile) + is_sentencepiece_model = check_if_sentencepiece_model(self) + file_location = save_to_gguf(model_type, is_sentencepiece_model, + new_save_directory, quantization_method, first_conversion, makefile, + ) if push_to_hub: print("Unsloth: Uploading GGUF to Huggingface Hub...") @@ -1473,7 +1497,10 @@ def unsloth_push_to_hub_gguf( gc.collect() model_type = self.config.model_type - file_location = save_to_gguf(model_type, new_save_directory, quantization_method, first_conversion, makefile) + is_sentencepiece_model = check_if_sentencepiece_model(self) + file_location = save_to_gguf(model_type, is_sentencepiece_model, + new_save_directory, quantization_method, first_conversion, makefile, + ) print("Unsloth: Uploading GGUF to Huggingface Hub...") username = upload_to_huggingface( From 73af5d11f1d7c052d42ed47b62e0c73389a26f82 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 8 May 2024 07:35:24 +1000 Subject: [PATCH 55/69] Update _utils.py --- unsloth/models/_utils.py | 64 +++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 9c4ae8fc6..49f054e43 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -144,24 +144,60 @@ def make_inputs_require_grad(module, input, output): def patch_tokenizer(model, tokenizer): + """ + Phi3's pad_token isn't set. We set it to <|placeholder... + Llama-3 is <|reserved... + Llama-2 is + Check if pad_token is not the same as eos_token otherwise the loss will ignore it!! 
+ Fixes https://github.com/unslothai/unsloth/issues/5 + """ + possible_reserved_tokens = ("<|reserved", "<|placeholder",) + if model is not None: model.config.update({"unsloth_version" : __version__}) - if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: - # Fixes https://github.com/unslothai/unsloth/issues/5 - if hasattr(tokenizer, "unk_token") and tokenizer.unk_token is not None: - tokenizer.add_special_tokens({"pad_token" : tokenizer.unk_token}) - tokenizer.pad_token = tokenizer.unk_token - else: - name = model.config._name_or_path if model is not None else "Model" - logger.warning_once( - f"{name} does not have a padding or unknown token!\n"\ - f"Will use the EOS token of id {tokenizer.eos_token_id} as padding." + + bad_pad_token = False + if hasattr(tokenizer, "pad_token") and tokenizer.pad_token is not None: + # Check if pad_token is not the same as eos_token otherwise the loss will ignore it!! + bad_pad_token = tokenizer.eos_token == tokenizer.pad_token + elif hasattr(tokenizer, "pad_token") and tokenizer.pad_token is None: + bad_pad_token = True + else: + bad_pad_token = False + pass + + if bad_pad_token: + # Find a better pad token + added_tokens = [str(x) for x in tokenizer.added_tokens_decoder.values()] + possible_pad_token = None + for added_token in added_tokens[::-1]: + if added_token.startswith(possible_reserved_tokens): + possible_pad_token = added_token + break + pass + pass + if possible_pad_token is None: + # Try unk_token + possible_pad_token = tokenizer.unk_token + pass + if possible_pad_token is None: + # Failure!! + raise RuntimeError( + "Unsloth: Tokenizer's pad_token cannot be = eos_token, and we couldn't find a\n"\ + "replacement of either <|reserved... or <|placeholder..." ) - assert(hasattr(tokenizer, "eos_token")) - tokenizer.add_special_tokens({"pad_token" : tokenizer.eos_token}) - tokenizer.pad_token = tokenizer.eos_token + pass + + name = model.config._name_or_path if model is not None else "Model" + logger.warning_once( + f"{name} does not have a padding token! Will use pad_token = {possible_pad_token}." 
+ ) + + # Edit pad_token + tokenizer.add_special_tokens({"pad_token" : possible_pad_token}) + tokenizer.pad_token = possible_pad_token if model is not None: - config = model.config.update({"pad_token_id" : tokenizer.eos_token_id}) + config = model.config.update({"pad_token_id" : tokenizer.pad_token_id}) pass return model, tokenizer pass From 9c7d9a7dec537870099a0be822def59f3f7c3db0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 8 May 2024 07:37:30 +1000 Subject: [PATCH 56/69] Update chat_templates.py --- unsloth/chat_templates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index c086c7e87..07999ea0d 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -611,7 +611,7 @@ def test_chat_templates(): assert(correct_prompt == our_prompt) # Phi-3 - template = phi3_templatetemplate + template = phi3_template correct_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) correct_tokenizer.chat_template = template From 7c536521addf628c0d7c412e8108dee1e651d768 Mon Sep 17 00:00:00 2001 From: Nathan Azrak <42650258+nathan-az@users.noreply.github.com> Date: Sat, 11 May 2024 02:53:19 +1000 Subject: [PATCH 57/69] Adds dependencies and extras for torch 2.3.0 with new xformers versions (#415) * Adds dependencies and extras for torch 2.3.0 with new xformers versions * Add 2.3.0 section to readme --- README.md | 9 ++++++++- pyproject.toml | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c6e7d6c50..c06dd9796 100644 --- a/README.md +++ b/README.md @@ -159,7 +159,14 @@ pip install --no-deps packaging ninja einops flash-attn xformers trl peft accele pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" pip install --no-deps xformers trl peft accelerate bitsandbytes ``` -7. To troubleshoot installs try the below (all must succeed). Xformers should mostly all be available. +7. For Pytorch 2.3.0: Use the `"ampere"` path for newer RTX 30xx GPUs or higher. +```bash +pip install "unsloth[cu118-torch230] @ git+https://github.com/unslothai/unsloth.git" +pip install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git" +pip install "unsloth[cu118-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git" +pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git" +``` +8. To troubleshoot installs try the below (all must succeed). Xformers should mostly all be available. 
```bash nvcc python -m xformers.info diff --git a/pyproject.toml b/pyproject.toml index e6f663a96..0398d0df4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,17 @@ cu121onlytorch220 = [ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] +cu118onlytorch230 = [ + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", +] +cu121onlytorch230 = [ + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", +] + cu118 = [ "unsloth[huggingface]", "bitsandbytes", @@ -126,6 +137,16 @@ cu121-torch220 = [ "bitsandbytes", "unsloth[cu121onlytorch220]", ] +cu118-torch230 = [ + "unsloth[huggingface]", + "bitsandbytes", + "unsloth[cu118onlytorch230]", +] +cu121-torch230 = [ + "unsloth[huggingface]", + "bitsandbytes", + "unsloth[cu121onlytorch230]", +] kaggle = [ "unsloth[huggingface]", ] @@ -238,6 +259,22 @@ cu121-ampere-torch220 = [ "ninja", "flash-attn", ] +cu118-ampere-torch230 = [ + "unsloth[huggingface]", + "bitsandbytes", + "unsloth[cu118onlytorch230]", + "packaging", + "ninja", + "flash-attn", +] +cu121-ampere-torch230 = [ + "unsloth[huggingface]", + "bitsandbytes", + "unsloth[cu121onlytorch230]", + "packaging", + "ninja", + "flash-attn", +] [project.urls] homepage = "http://www.unsloth.ai" From cf83fe331a159fb6d98f84330a470f708aa1aa74 Mon Sep 17 00:00:00 2001 From: Yang JianXin <995462226@qq.com> Date: Sat, 11 May 2024 01:23:55 +0800 Subject: [PATCH 58/69] Support Qwen2 (#428) * support Qwen2 * support Qwen2 * Delete README.md * Revert "Delete README.md" This reverts commit 026b05f859410ddd04e1a2b4b54e950b89b4a58a. * Update README.md * Qwen2 == Mistral * Update llama.py * Update __init__.py * Update README.md --------- Co-authored-by: Daniel Han --- README.md | 1 + unsloth/models/__init__.py | 7 +-- unsloth/models/llama.py | 1 + unsloth/models/loader.py | 3 ++ unsloth/models/mistral.py | 2 +- unsloth/models/qwen2.py | 91 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 101 insertions(+), 4 deletions(-) create mode 100644 unsloth/models/qwen2.py diff --git a/README.md b/README.md index c06dd9796..ca5b6533b 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. ## 🦥 Unsloth.ai News +- 📣 NEW! Qwen1.5-7B, Qwen1.5-14B, Qwen1.5-32B, Qwen1.5-72B now work, courtesy of Firefly's PR [#428](https://github.com/unslothai/unsloth/pull/428) - 📣 NEW! 
[Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). - 📣 NEW! [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! - 📣 NEW! [Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! diff --git a/unsloth/models/__init__.py b/unsloth/models/__init__.py index 891947d69..ff7129e06 100644 --- a/unsloth/models/__init__.py +++ b/unsloth/models/__init__.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .loader import FastLanguageModel -from .llama import FastLlamaModel +from .loader import FastLanguageModel +from .llama import FastLlamaModel from .mistral import FastMistralModel -from .dpo import PatchDPOTrainer +from .qwen2 import FastQwen2Model +from .dpo import PatchDPOTrainer diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 136ceb2c7..44998b4cf 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1605,6 +1605,7 @@ def patch_peft_model( if model_type == "llama": apply_lora_mlp = apply_lora_mlp_swiglu elif model_type == "mistral": apply_lora_mlp = apply_lora_mlp_swiglu + elif model_type == "qwen2": apply_lora_mlp = apply_lora_mlp_swiglu elif model_type == "gemma": apply_lora_mlp = apply_lora_mlp_geglu_approx else: raise NotImplementedError(f"Unsloth: {model_type} is not yet implemented!") diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a107200ea..2b3bf4794 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -14,6 +14,7 @@ from .llama import FastLlamaModel, logger from .mistral import FastMistralModel +from .qwen2 import FastQwen2Model from transformers import AutoConfig from transformers import __version__ as transformers_version from peft import PeftConfig, PeftModel @@ -119,6 +120,8 @@ def from_pretrained( f"to obtain the latest transformers build, then restart this session."\ ) dispatch_model = FastGemmaModel + elif model_type == "qwen2": + dispatch_model = FastQwen2Model else: raise NotImplementedError( f"Unsloth: {model_name} not supported yet!\n"\ diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 80d0ffdf7..902177cb1 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -343,7 +343,7 @@ def from_pretrained( # Mistral does NOT support RoPE Scaling sadly so we have to error out. if max_seq_length > model_max_seq_length: raise RuntimeError( - "Unsloth: Unfortunately Mistral type models do not support RoPE scaling!\n"\ + f"Unsloth: Unfortunately {model_patcher.__name__[4:-5]} type models do not support RoPE scaling!\n"\ f"The maximum sequence length supported is {model_max_seq_length}.", ) pass diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py new file mode 100644 index 000000000..76fe31a6d --- /dev/null +++ b/unsloth/models/qwen2.py @@ -0,0 +1,91 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .llama import * +from .mistral import FastMistralModel +import os +from ._utils import __version__ + +from transformers.models.qwen2.modeling_qwen2 import ( + Qwen2Attention, + Qwen2DecoderLayer, + Qwen2Model, + Qwen2ForCausalLM, +) +# For Pytorch 2.1.1 +try: + from transformers.models.qwen2.modeling_qwen2 import ( + Qwen2SdpaAttention, + Qwen2FlashAttention2, + ) +except: + Qwen2SdpaAttention = Qwen2Attention + Qwen2FlashAttention2 = Qwen2Attention +pass + + +class FastQwen2Model(FastLlamaModel): + + @staticmethod + def pre_patch(): + Qwen2Attention .forward = LlamaAttention_fast_forward + Qwen2SdpaAttention .forward = LlamaAttention_fast_forward + Qwen2FlashAttention2.forward = LlamaAttention_fast_forward + Qwen2DecoderLayer .forward = LlamaDecoderLayer_fast_forward + Qwen2Model .forward = LlamaModel_fast_forward + Qwen2ForCausalLM .forward = CausalLM_fast_forward(LlamaModel_fast_forward_inference) + PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward + + # Solves https://github.com/unslothai/unsloth/issues/168 + # Static KV Cache was introduced in 4.38.0, causing training to be much slower. + # Inferene can now be CUDAGraphed, but we shall retain the old rotary embeddings. + # https://github.com/huggingface/transformers/pull/27931 + # https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py + import transformers.models.qwen2.modeling_qwen2 + transformers.models.qwen2.modeling_qwen2.Qwen2RotaryEmbedding = LlamaRotaryEmbedding + return + pass + + + @staticmethod + def from_pretrained( + model_name = "Qwen/Qwen1.5-7B", + max_seq_length = 4096, + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, # Qwen2 does not support RoPE scaling + fix_tokenizer = True, + model_patcher = None, + tokenizer_name = None, + trust_remote_code = False, + **kwargs, + ): + return FastMistralModel.from_pretrained( + model_name = model_name, + max_seq_length = max_seq_length, + dtype = dtype, + load_in_4bit = load_in_4bit, + token = token, + device_map = device_map, + rope_scaling = rope_scaling, + fix_tokenizer = fix_tokenizer, + model_patcher = FastQwen2Model, + tokenizer_name = tokenizer_name, + trust_remote_code = trust_remote_code, + **kwargs, + ) + pass +pass From f7dab306196af4a68236e04399853abad4d67454 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 11 May 2024 03:24:10 +1000 Subject: [PATCH 59/69] Update save.py --- unsloth/save.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index b825b10fb..e247cd1f0 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -865,11 +865,11 @@ def save_to_gguf( first_conversion : str = "f16", _run_installer = None, # Non blocking install of llama.cpp ): - logger.warning( - "NOTICE: llama.cpp GGUF conversion is currently unstable, since llama.cpp is\n"\ - "undergoing some major bug fixes as at 5th of May 2024. This is not an Unsloth issue.\n"\ - "Please be patient - GGUF saving should still work, but might not work as well." 
- ) + # logger.warning( + # "NOTICE: llama.cpp GGUF conversion is currently unstable, since llama.cpp is\n"\ + # "undergoing some major bug fixes as at 5th of May 2024. This is not an Unsloth issue.\n"\ + # "Please be patient - GGUF saving should still work, but might not work as well." + # ) if quantization_method.startswith("iq2"): raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!") From 6c9fcc9aefa4394681fda8ba2c86ad99c45de0f4 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 11 May 2024 03:34:15 +1000 Subject: [PATCH 60/69] Update save.py --- unsloth/save.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/save.py b/unsloth/save.py index e247cd1f0..bebffd9e3 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -980,6 +980,8 @@ def save_to_gguf( pass with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE, bufsize = 1) as sp: + for line in sp.stderr: + print(line.decode("utf-8", errors = "replace"), flush = True, end = "") for line in sp.stdout: print(line.decode("utf-8", errors = "replace"), flush = True, end = "") if sp.returncode is not None and sp.returncode != 0: From f1350ca9bfd253e8295c8c8e4ce1124551543870 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 11 May 2024 03:36:30 +1000 Subject: [PATCH 61/69] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 49f054e43..80cb19517 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -30,7 +30,7 @@ import os import psutil -__version__ = "2024.4" +__version__ = "2024.5" # Get Flash Attention v2 if Ampere (RTX 30xx, A100) major_version, minor_version = torch.cuda.get_device_capability() From 73b941da766a8a9d7c310649562de52cff6dd58f Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 11 May 2024 19:14:40 +1000 Subject: [PATCH 62/69] Update save.py --- unsloth/save.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/save.py b/unsloth/save.py index bebffd9e3..e92caf53a 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1025,6 +1025,8 @@ def save_to_gguf( with subprocess.Popen(command, shell = True, stderr = subprocess.PIPE, bufsize = 1) as sp: for line in sp.stderr: print(line.decode("utf-8", errors = "replace"), flush = True, end = "") + for line in sp.stdout: + print(line.decode("utf-8", errors = "replace"), flush = True, end = "") if sp.returncode is not None and sp.returncode != 0: raise subprocess.CalledProcessError(sp.returncode, sp.args) pass From 7d502d77483a681ddea41c46cdbd47730b1e33a6 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 11 May 2024 19:18:41 +1000 Subject: [PATCH 63/69] Update save.py --- unsloth/save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index e92caf53a..d3421bdfa 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1077,7 +1077,7 @@ def unsloth_save_pretrained_merged( save_peft_format : bool = True, tags : List[str] = None, temporary_location : str = "_unsloth_temporary_saved_buffers", - maximum_memory_usage : float = 0.85, + maximum_memory_usage : float = 0.75, ): """ Same as .save_pretrained(...) 
except 4bit weights are auto From d1d47b3b47b8a3c648dc8a1b6ca16d301eeee57c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 11 May 2024 19:18:53 +1000 Subject: [PATCH 64/69] Update save.py --- unsloth/save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index d3421bdfa..92fcb2347 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1120,7 +1120,7 @@ def unsloth_push_to_hub_merged( commit_description : str = "Upload model trained with Unsloth 2x faster", tags : Optional[List[str]] = None, temporary_location : str = "_unsloth_temporary_saved_buffers", - maximum_memory_usage : float = 0.85, + maximum_memory_usage : float = 0.75, ): """ Same as .push_to_hub(...) except 4bit weights are auto From f16d7d7ae253bac7b8642b5a8796e3059966e206 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 12 May 2024 04:47:45 +1000 Subject: [PATCH 65/69] test_hf_gguf_equivalence --- unsloth/chat_templates.py | 71 +++++++++++++++++++++++++++++++++++++++ unsloth/save.py | 12 +++---- 2 files changed, 75 insertions(+), 8 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 07999ea0d..7d6777bae 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -15,6 +15,7 @@ __all__ = [ "get_chat_template", "test_chat_templates", + "test_hf_gguf_equivalence", ] from transformers import StoppingCriteria, StoppingCriteriaList @@ -618,3 +619,73 @@ def test_chat_templates(): our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) assert(correct_prompt == our_prompt) pass + + +def test_hf_gguf_equivalence(tokenizer): + """ + Carefully checks the output of GGUF's tokenization and HF. + Can catch all tokenization bugs. + """ + import subprocess + import re + messages = [ + {"role": "user", "content": "What is 2+2?"}, + {"role": "assistant", "content": "It's 4."}, + {"role": "user", "content": " But 2+2 is equal to 5. "}, + {"role": "assistant", "content": "No I'm sure its 4."}, + {"role": "user", "content": " No it's 100% 5! "}, + ] + + prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. + + ### Instruction: + {} + + ### Input: + {} + + ### Response: + {}""".format( + "Describe the city given eloquently.", # instruction + "The lost city of Atlantis.", # input + "", # output - leave this blank for generation! 
+ ) + prompts = [ prompt, ] + + if tokenizer.chat_template is not None: + prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + prompt = prompt.replace("'", "") # Subprocess does not like '' + prompts.append(prompts) + pass + + for prompt in prompts: + command = "./llama.cpp/main -m ./model-unsloth.F16.gguf -n 0 --temp 0.0 --verbose-prompt "\ + f"--check-tensors -p '{prompt}'" + + datas = [] + with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: + for line in sp.stdout: + datas.append(line.decode("utf-8", errors = "replace")) + pass + gguf_tokens = "".join(datas) + + # Now extract GGUF tokenization attempt + gguf_tokenized = re.findall("([\d]{1,}) \-\> \'([^\']{1,})\'", gguf_tokens, flags = re.MULTILINE) + gguf_tokenized = [(int(x[0]), x[1],) for x in gguf_tokenized] + input_ids = tokenizer(prompt).input_ids + tokens = tokenizer.batch_decode(input_ids) + hf_tokenized = list(zip(input_ids, tokens)) + + # Compare to Huggingface + for j, (hf_token, gguf_token) in enumerate(zip(hf_tokenized, gguf_tokenized)): + if (hf_token != gguf_token): + print("Failed GGUF != HF at", j) + print("HF =", hf_token) + print("GGUF =", gguf_token) + print(hf_tokenized[:j+1]) + print(gguf_tokenized[:j+1]) + raise RuntimeError("Failed comparing GGUF to HF.") + pass + pass + return True +pass diff --git a/unsloth/save.py b/unsloth/save.py index 92fcb2347..d3db1f906 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -774,7 +774,7 @@ def install_llama_cpp_old(version = -10): f"make all -j{psutil.cpu_count()*2} -C llama.cpp", ] for command in commands: - with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, bufsize = 1) as sp: + with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: for line in sp.stdout: print(line.decode("utf-8", errors = "replace"), flush = True, end = "") pass @@ -806,7 +806,7 @@ def install_llama_cpp_blocking(use_cuda = True): if os.path.exists("llama.cpp"): return for command in commands: - with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, bufsize = 1) as sp: + with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: for line in sp.stdout: print(line.decode("utf-8", errors = "replace"), flush = True, end = "") pass @@ -979,9 +979,7 @@ def save_to_gguf( f"--outtype {first_conversion}" pass - with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE, bufsize = 1) as sp: - for line in sp.stderr: - print(line.decode("utf-8", errors = "replace"), flush = True, end = "") + with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: for line in sp.stdout: print(line.decode("utf-8", errors = "replace"), flush = True, end = "") if sp.returncode is not None and sp.returncode != 0: @@ -1022,9 +1020,7 @@ def save_to_gguf( f"{final_location} {quantization_method} {n_cpus}" # quantize uses stderr - with subprocess.Popen(command, shell = True, stderr = subprocess.PIPE, bufsize = 1) as sp: - for line in sp.stderr: - print(line.decode("utf-8", errors = "replace"), flush = True, end = "") + with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: for line in sp.stdout: print(line.decode("utf-8", errors = "replace"), flush = True, end = "") if sp.returncode is not None and 
sp.returncode != 0: From 01284f473067265adce48ef8490cd6127c40c717 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 12 May 2024 17:56:32 +1000 Subject: [PATCH 66/69] Update chat_templates.py --- unsloth/chat_templates.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 7d6777bae..9101df67d 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -621,7 +621,7 @@ def test_chat_templates(): pass -def test_hf_gguf_equivalence(tokenizer): +def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf"): """ Carefully checks the output of GGUF's tokenization and HF. Can catch all tokenization bugs. @@ -659,7 +659,7 @@ def test_hf_gguf_equivalence(tokenizer): pass for prompt in prompts: - command = "./llama.cpp/main -m ./model-unsloth.F16.gguf -n 0 --temp 0.0 --verbose-prompt "\ + command = f"./llama.cpp/main -m {gguf_model} -n 0 --temp 0.0 --verbose-prompt "\ f"--check-tensors -p '{prompt}'" datas = [] @@ -684,6 +684,7 @@ def test_hf_gguf_equivalence(tokenizer): print("GGUF =", gguf_token) print(hf_tokenized[:j+1]) print(gguf_tokenized[:j+1]) + print(gguf_tokens) raise RuntimeError("Failed comparing GGUF to HF.") pass pass From 4f1e6fbd10d5561f7e2cd5de98128680b2a3a808 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 12 May 2024 18:27:17 +1000 Subject: [PATCH 67/69] Update chat_templates.py --- unsloth/chat_templates.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 9101df67d..5033c1db9 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -675,10 +675,11 @@ def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf") input_ids = tokenizer(prompt).input_ids tokens = tokenizer.batch_decode(input_ids) hf_tokenized = list(zip(input_ids, tokens)) + print(gguf_tokenized[:5]) # Compare to Huggingface for j, (hf_token, gguf_token) in enumerate(zip(hf_tokenized, gguf_tokenized)): - if (hf_token != gguf_token): + if (hf_token[0] != gguf_token[0]): print("Failed GGUF != HF at", j) print("HF =", hf_token) print("GGUF =", gguf_token) From 36cfcf4165d85d8dd0c7fccdb58ec7ee270d05d4 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 13 May 2024 04:58:36 +1000 Subject: [PATCH 68/69] --pad-vocab --- unsloth/chat_templates.py | 15 ++++----- unsloth/save.py | 5 ++- unsloth/tokenizer_utils.py | 66 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 9 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 5033c1db9..3af4c4e9a 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -271,12 +271,11 @@ phi3_template = \ "{{ bos_token }}"\ "{% for message in messages %}"\ - "{% if (message['role'] == 'user') %}"\ - "{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}"\ - "{% elif (message['role'] == 'assistant') %}"\ - "{{message['content'] + '<|end|>' + '\n'}}"\ - "{% endif %}"\ - "{% endfor %}" + "{{'<|' + message['role'] + '|>\n' + message['content'] + '<|end|>\n'}}"\ + "{% endfor %}"\ + "{% if add_generation_prompt %}"\ + "{{ '<|assistant|>\n' }}"\ + "{% endif %}" phi3_template_eos_token = "<|end|>" CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token,) @@ -614,9 +613,9 @@ def test_chat_templates(): # Phi-3 template = phi3_template correct_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") - correct_prompt = 
correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + correct_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True) correct_tokenizer.chat_template = template - our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True) assert(correct_prompt == our_prompt) pass diff --git a/unsloth/save.py b/unsloth/save.py index d3db1f906..39b18d0dd 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -27,6 +27,7 @@ import psutil import re from transformers.models.llama.modeling_llama import logger +from .tokenizer_utils import fix_sentencepiece_gguf __all__ = [ "print_quantization_methods", @@ -962,6 +963,8 @@ def save_to_gguf( # We first check if tokenizer.model exists in the model_directory if os.path.exists(f"{model_directory}/tokenizer.model"): vocab_type = "spm,hfft,bpe" + # Fix Sentencepiece model as well! + fix_sentencepiece_gguf(model_directory) else: vocab_type = "bpe" pass @@ -969,7 +972,7 @@ def save_to_gguf( if use_fast_convert: command = f"python llama.cpp/convert.py {model_directory} "\ f"--outfile {final_location} --vocab-type {vocab_type} "\ - f"--outtype {first_conversion} --concurrency {n_cpus}" + f"--outtype {first_conversion} --concurrency {n_cpus} --pad-vocab" else: # Need to fix convert-hf-to-gguf.py for some models! # _fix_gemma_gguf() diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 0d6dadf7d..1ca844b98 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -26,6 +26,7 @@ "fix_sentencepiece_tokenizer", "check_tokenizer", "add_new_tokens", + "fix_sentencepiece_gguf", ] @@ -267,6 +268,71 @@ def fix_sentencepiece_tokenizer( pass +def fix_sentencepiece_gguf(saved_location): + """ + Fixes sentencepiece tokenizers which did not extend the vocabulary with + user defined tokens. 
+ Inspiration from https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py + """ + import numpy as np + from copy import deepcopy + from transformers.utils import sentencepiece_model_pb2 + import json + from enum import IntEnum + import os + + class SentencePieceTokenTypes(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + pass + + # Load tokenizer.model + tokenizer_file = sentencepiece_model_pb2.ModelProto() + if not os.path.isfile(f"{saved_location}/tokenizer.model"): return + tokenizer_file.ParseFromString(open(f"{saved_location}/tokenizer.model", "rb").read()) + sentence_piece_size = len(tokenizer_file.pieces) + + # Load added_tokens_json + if not os.path.isfile(f"{saved_location}/added_tokens.json"): return + with open(f"{tokenizer_path}/added_tokens.json", "r", encoding = "utf-8") as file: + added_tokens_json = json.load(file) + pass + if len(added_tokens_json) == 0: return + + added_tokens_json = dict(sorted(added_tokens_json.items(), key = lambda item: item[1])) + + # Confirm added_tokens_json is correct + added_tokens_ids = np.array(list(added_tokens_json.values())) + diff = np.diff(added_tokens_ids) + if (diff.min() != 1 or diff.max() != 1): return + if (added_tokens_ids.min() != sentence_piece_size): return + + # Edit sentence piece tokens with added_tokens_json + logger.warning("Unsloth: Extending tokenizer.model with added_tokens.json!") + new_tokens = deepcopy(tokenizer_file.pieces[-len(added_tokens_ids):]) + for new_token, added_token in zip(new_tokens, added_tokens_json.keys()): + new_token.piece = added_token.encode("utf-8") + new_token.score = -1000.0 + new_token.type = SentencePieceTokenTypes.USER_DEFINED + pass + + tokenizer_file.pieces.extend(new_tokens) + + with open(f"{saved_location}/tokenizer.model", "wb") as file: + file.write(tokenizer_file.SerializeToString()) + pass + + # Add padding tokens + # actual_vocab_size = model.config.vocab_size + # padding = actual_vocab_size - len(tokenizer_file.pieces) + return +pass + + def load_correct_tokenizer( tokenizer_name, model_max_length = None, From 6b2ee164e018fe90fdb07a6a01c86ce2aedb8ad9 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 13 May 2024 05:13:28 +1000 Subject: [PATCH 69/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 1ca844b98..87cba843d 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -298,7 +298,7 @@ class SentencePieceTokenTypes(IntEnum): # Load added_tokens_json if not os.path.isfile(f"{saved_location}/added_tokens.json"): return - with open(f"{tokenizer_path}/added_tokens.json", "r", encoding = "utf-8") as file: + with open(f"{saved_location}/added_tokens.json", "r", encoding = "utf-8") as file: added_tokens_json = json.load(file) pass if len(added_tokens_json) == 0: return
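
The tokenizer checks introduced in patches 65-69 can be exercised end-to-end once a GGUF export exists. The sketch below is illustrative only and not part of the patch series: it assumes `./llama.cpp/main` has already been built and `./model-unsloth.F16.gguf` was produced by a prior `save_to_gguf` call (both are the defaults used above), that the snippet runs in the same environment used for fine-tuning, and that the Phi-3 model name is just an example tokenizer to compare against.

```python
# Minimal sketch: confirm llama.cpp and the Hugging Face tokenizer agree on
# token ids after a GGUF export. Assumes ./llama.cpp/main exists and
# ./model-unsloth.F16.gguf was written by save_to_gguf; model name is an example.
from transformers import AutoTokenizer
from unsloth.chat_templates import test_hf_gguf_equivalence

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# Runs llama.cpp with --verbose-prompt on a plain prompt and (if a chat template
# is set) a chat-formatted prompt, then compares token ids one by one.
# Raises RuntimeError at the first mismatching token id; returns True otherwise.
assert test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf")
```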