From 64a7d27a9145034bd24890256aa822efe11ccd54 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 20 Apr 2024 04:04:24 +1000 Subject: [PATCH 01/69] Fix prompt --- unsloth/chat_templates.py | 22 ++++++++++++++++++++++ unsloth/models/mapper.py | 3 +++ unsloth/tokenizer_utils.py | 5 +++++ 3 files changed, 30 insertions(+) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 6a0be3862..56749d6c2 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -255,6 +255,20 @@ CHAT_TEMPLATES["gemma_chatml"] = (gemma_chatml_template, gemma_chatml_eos_token,) +# Llama-3 +# Weirdly \n\n is needed? +llama3_template = \ + "{{ bos_token }}"\ + "{% for message in messages %}"\ + "{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}"\ + "{% endfor %}"\ + "{% if add_generation_prompt %}"\ + "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"\ + "{% endif %}" +llama3_template_eos_token = "eos_token" +CHAT_TEMPLATES["llama-3"] = (llama3_template, gemma_chatml_eos_token,) + + def get_chat_template( tokenizer, chat_template = "chatml", @@ -540,4 +554,12 @@ def test_chat_templates(): correct_tokenizer.chat_template = gemma_template our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True) assert(our_prompt == correct_prompt) + + # Llama-3 + template = llama3_template + correct_tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct") + correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + correct_tokenizer.chat_template = template + our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + assert(correct_prompt == our_prompt) pass diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index bad43190b..769cbff53 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -137,6 +137,9 @@ "unsloth/llama-3-70b-bnb-4bit" : ( "meta-llama/Meta-Llama-3-70B", ), + "unsloth/llama-3-70b-Instruct-bnb-4bit" : ( + "meta-llama/Meta-Llama-3-70B-Instruct", + ), } INT_TO_FLOAT_MAPPER = {} diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index fa536ef29..76d9372e2 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -215,6 +215,11 @@ def fix_sentencepiece_tokenizer( os.makedirs(temporary_location) pass + # Check if tokenizer.model exists + if not os.path.isfile(f"{temporary_location}/tokenizer.model"): + return new_tokenizer + pass + # First save the old tokenizer old_tokenizer.save_pretrained(temporary_location) From 656ab2288c2c5a32d218405ed8e2914db7228b6c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 20 Apr 2024 19:50:27 +1000 Subject: [PATCH 02/69] Update chat_templates.py --- unsloth/chat_templates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 56749d6c2..93104b961 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -266,7 +266,7 @@ "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"\ "{% endif %}" llama3_template_eos_token = "eos_token" -CHAT_TEMPLATES["llama-3"] = (llama3_template, gemma_chatml_eos_token,) +CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token,) def get_chat_template( From c4f2f54d7e6dbb20d255be193832946130c4aeac Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 04:33:23 +1000 Subject: [PATCH 
03/69] fix_untrained_tokens --- unsloth/models/_utils.py | 33 +++++++++++++++++++++++++++++++++ unsloth/models/llama.py | 5 +++++ 2 files changed, 38 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 32da0a734..a7e392c25 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -71,6 +71,7 @@ "patch_tokenizer", "get_statistics", "Unsloth_Offloaded_Gradient_Checkpointer", + "fix_untrained_tokens", ] @@ -349,3 +350,35 @@ def backward(ctx, dY): return (None, hidden_states.grad,) + (None,)*len(ctx.args) pass pass + + +@torch.inference_mode +def fix_untrained_tokens(model, eps = 1e-16): + """ + Llama-3 for eg has untrained vectors in the base model. + These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> + We reset them to the mean of the rest of the tokens + """ + embedding_matrix = model.get_input_embeddings ().weight.data + lm_head_matrix = model.get_output_embeddings().weight.data + where_untrained = torch.where(torch.amax(embedding_matrix, axis = 1) <= eps)[0] + n_untrained = where_untrained.shape[0] + n_trained = embedding_matrix.shape[0] - n_untrained + if n_untrained != 0: + logger.warning_once( + f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ + "We shall set them to the mean of the other trained tokens." + ) + pass + + # Fix embed_tokens + sum_columns = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) + mean_columns = (sum_columns / n_trained).to(embedding_matrix.dtype) + embedding_matrix[where_untrained] = mean_columns + + # Fix lm_head + sum_columns = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) + mean_columns = (sum_columns / n_trained).to(lm_head_matrix.dtype) + lm_head_matrix[where_untrained] = mean_columns + return +pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6f70bc510..74500a3d7 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1481,6 +1481,11 @@ def get_peft_model( train_embed_tokens = True pass pass + + # First fix untrained tokens + if train_embed_tokens or train_lm_head: + fix_untrained_tokens(model, eps = 1e-16) + pass # Get LoRA arguments = dict( From 87b4bb961f234b5cc387472a443d34868900c7ec Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 12:52:50 +1000 Subject: [PATCH 04/69] Update llama.py --- unsloth/models/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 74500a3d7..6b6f887ec 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1483,9 +1483,9 @@ def get_peft_model( pass # First fix untrained tokens - if train_embed_tokens or train_lm_head: - fix_untrained_tokens(model, eps = 1e-16) - pass + # if train_embed_tokens or train_lm_head: + # fix_untrained_tokens(model, eps = 1e-16) + # pass # Get LoRA arguments = dict( From abd192fd2d58298369227a68c42305aa8513939c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 16:18:28 +1000 Subject: [PATCH 05/69] add tokens --- unsloth/chat_templates.py | 5 +- unsloth/models/_utils.py | 31 ---------- unsloth/models/llama.py | 22 +++++-- unsloth/tokenizer_utils.py | 124 +++++++++++++++++++++++++++++++++++++ 4 files changed, 142 insertions(+), 40 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 93104b961..2d15470a0 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -23,10 +23,7 @@ from .save import patch_saving_functions import os import shutil -from .tokenizer_utils 
import ( - load_correct_tokenizer, - fix_sentencepiece_tokenizer, -) +from .tokenizer_utils import * from .models._utils import patch_tokenizer CHAT_TEMPLATES = {} diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index a7e392c25..9b9ba9ac0 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -351,34 +351,3 @@ def backward(ctx, dY): pass pass - -@torch.inference_mode -def fix_untrained_tokens(model, eps = 1e-16): - """ - Llama-3 for eg has untrained vectors in the base model. - These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> - We reset them to the mean of the rest of the tokens - """ - embedding_matrix = model.get_input_embeddings ().weight.data - lm_head_matrix = model.get_output_embeddings().weight.data - where_untrained = torch.where(torch.amax(embedding_matrix, axis = 1) <= eps)[0] - n_untrained = where_untrained.shape[0] - n_trained = embedding_matrix.shape[0] - n_untrained - if n_untrained != 0: - logger.warning_once( - f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ - "We shall set them to the mean of the other trained tokens." - ) - pass - - # Fix embed_tokens - sum_columns = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) - mean_columns = (sum_columns / n_trained).to(embedding_matrix.dtype) - embedding_matrix[where_untrained] = mean_columns - - # Fix lm_head - sum_columns = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) - mean_columns = (sum_columns / n_trained).to(lm_head_matrix.dtype) - lm_head_matrix[where_untrained] = mean_columns - return -pass diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 6b6f887ec..425d068fd 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1481,12 +1481,24 @@ def get_peft_model( train_embed_tokens = True pass pass - - # First fix untrained tokens - # if train_embed_tokens or train_lm_head: - # fix_untrained_tokens(model, eps = 1e-16) - # pass + # Check if we added new tokens! + if hasattr(model, "_need_to_train_embeddings"): + if not train_lm_head or not train_embed_tokens: + print( + "Unsloth: You added new tokens but did not specify if you wanted to "\ + "train the lm_head and embed_tokens. We must turn it on for you." + ) + train_lm_head = True + train_embed_tokens = True + pass + pass + + # First fix untrained tokens + if train_embed_tokens or train_lm_head: + fix_untrained_tokens(model, eps = 1e-16) + pass + # Get LoRA arguments = dict( r = r, diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 76d9372e2..c0f6c7fc2 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -18,11 +18,14 @@ import re import os from transformers.models.llama.modeling_llama import logger +from peft import PeftModelForCausalLM __all__ = [ "load_correct_tokenizer", "fix_sentencepiece_tokenizer", "check_tokenizer", + "fix_untrained_tokens", + "add_new_tokens", ] @@ -466,3 +469,124 @@ def check_tokenizer( pass return convert_to_fast_tokenizer(tokenizer) pass + + +@torch.inference_mode +def fix_untrained_tokens(model, eps = 1e-16): + """ + Llama-3 for eg has untrained vectors in the base model. 
+ These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> + We reset them to the mean of the rest of the tokens + """ + embedding_matrix = model.get_input_embeddings ().weight.data + lm_head_matrix = model.get_output_embeddings().weight.data + + # Get untrained tokens + indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps + where_untrained = torch.where(indicator_untrained)[0] + n_untrained = where_untrained.shape[0] + n_trained = embedding_matrix.shape[0] - n_untrained + if n_untrained != 0: + print( + f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ + "We shall set them to the mean of the other trained tokens." + ) + pass + + # First set untrained to all 0s - sometimes it's not! 1e-23 for bfloat16 + embedding_matrix[where_untrained] = 0 + lm_head_matrix [where_untrained] = 0 + + # Find sum + sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) + sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) + + # Find correct average by dividing by sum of trained tokens + mean_embedding = (sum_embedding / n_trained).to(embedding_matrix.dtype) + mean_lm_head = (sum_lm_head / n_trained).to(lm_head_matrix .dtype) + + # Set them to the mean + embedding_matrix[where_untrained] = mean_embedding + lm_head_matrix [where_untrained] = mean_lm_head + + return mean_embedding, mean_lm_head +pass + + +@torch.inference_mode +def add_new_tokens( + model, + tokenizer, + new_tokens = [], + method = "mean", + interpolation = 0.05, +): + """ + Smartly resizes the tokenizer and adds new tokens to the model. + We also disregard untrained tokens by removing them from the mean calculation. + """ + assert(isinstance(new_tokens, (list, tuple))) + assert(len(new_tokens) > 0) + assert(method == "mean" or method == "interpolation") + assert(interpolation >= 0 and interpolation <= 1) + + # Check if tokens already exist + overlapping_tokens = set(new_tokens) & set(tokenizer.vocab.keys()) + if len(overlapping_tokens) != 0: + print( + f"Unsloth: You're adding new_tokens = {new_tokens}\n"\ + f"There are tokens which are overlapping = {list(overlapping_tokens)}\n"\ + f"We shall safely ignore these overlapping tokens." + ) + new_tokens = [x for x in new_tokens if x not in overlapping_tokens] + pass + + # Get mean of trained tokens + mean_embedding, mean_lm_head = fix_untrained_tokens(model) + mean_embedding = mean_embedding.to(torch.float32) + mean_lm_head = mean_lm_head .to(torch.float32) + + # Add tokens! 
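One detail worth spelling out: dividing the column sums by `n_trained` rather than by the full vocabulary size only works because the untrained rows were zeroed first. A minimal standalone sketch of that mean-initialization step, using a toy matrix instead of a real model (the sizes are illustrative assumptions):

```python
import torch

# Toy embedding matrix: 8 "tokens" of dimension 4. Rows 5..7 stand in for
# untrained tokens (all zeros), the way <|eot_id|> etc. are in base Llama-3.
embedding_matrix = torch.rand(8, 4) + 0.1   # strictly positive "trained" rows
embedding_matrix[5:] = 0.0

eps = 1e-16
where_untrained = torch.where(torch.amax(embedding_matrix, dim = 1) <= eps)[0]
n_trained = embedding_matrix.shape[0] - where_untrained.shape[0]

# The untrained rows are zero, so summing every row and dividing by n_trained
# gives exactly the mean of the trained rows.
mean_row = torch.sum(embedding_matrix, dim = 0, dtype = torch.float32) / n_trained
embedding_matrix[where_untrained] = mean_row.to(embedding_matrix.dtype)
print(embedding_matrix[5:])   # three identical rows equal to the trained-token mean
```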
+ old_length = len(tokenizer) + tokenizer.add_tokens(new_tokens) + model.resize_token_embeddings(len(tokenizer)) + + # If we use interpolation, we interpolate between the mean embeddings and + # the Word2Vec sum of the other vectors + embedding_matrix = model.get_input_embeddings ().weight.data + lm_head_matrix = model.get_output_embeddings().weight.data + + if method == "interpolation": + print( + "Unsloth: You are using interpolation to add new tokens.\n"\ + f"We shall set new tokens = mean(embeddings)*{1-interpolation} + mean(new_tokens)*{interpolation}" + ) + for j, token in enumerate(new_tokens): + input_ids = tokenizer(token, add_special_tokens = False).input_ids + mean_embedding_token = embedding_matrix[input_ids].mean(axis = 0, dtype = torch.float32) + mean_lm_head_token = lm_head_matrix [input_ids].mean(axis = 0, dtype = torch.float32) + + # Interpolate + mean_embedding_token = mean_embedding*(1-interpolation) + mean_embedding_token*interpolation + mean_lm_head_token = mean_lm_head *(1-interpolation) + mean_lm_head_token *interpolation + + # Set the new vector + embedding_matrix[old_length+j] = mean_embedding_token + lm_head_matrix [old_length+j] = mean_lm_head_token + pass + else: + # Now set the new tokens to the mean! + embedding_matrix[old_length:] = mean_embedding + lm_head_matrix [old_length:] = mean_lm_head + pass + + # We set a flag to say we need to train embeddings + internal_model = model + while hasattr(internal_model, "model"): + internal_model._need_to_train_embeddings = True + internal_model = internal_model.model + pass + internal_model._need_to_train_embeddings = True + + return +pass From 868351ba5ee6d914dc0afc45b1e685396ba545f8 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 16:26:35 +1000 Subject: [PATCH 06/69] Update _utils.py --- unsloth/models/_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 9b9ba9ac0..9c4ae8fc6 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -71,7 +71,6 @@ "patch_tokenizer", "get_statistics", "Unsloth_Offloaded_Gradient_Checkpointer", - "fix_untrained_tokens", ] From f29a3e758610dcb8430e3c078f63e178d42c1056 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 16:28:29 +1000 Subject: [PATCH 07/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index c0f6c7fc2..15606a7c9 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -19,6 +19,7 @@ import os from transformers.models.llama.modeling_llama import logger from peft import PeftModelForCausalLM +import torch __all__ = [ "load_correct_tokenizer", From 2573474bd597823a467606c748440c0cbad2c574 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 16:36:03 +1000 Subject: [PATCH 08/69] Update llama.py --- unsloth/models/llama.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 425d068fd..11618eee3 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1472,25 +1472,17 @@ def get_peft_model( final_modules.append(module) pass - # Check modules_to_save - if modules_to_save is not None: - for module in modules_to_save: - if module == "lm_head": - train_lm_head = True - elif module == "embed_tokens": - train_embed_tokens = True - pass - pass - # Check if we added new tokens! 
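Taken together, the intended calling pattern for `add_new_tokens` plus `get_peft_model` looks roughly like the sketch below. The model name, the new token strings and the LoRA settings are illustrative assumptions rather than values fixed by these patches:

```python
from unsloth import FastLanguageModel
from unsloth.tokenizer_utils import add_new_tokens

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",   # assumed base model
    max_seq_length = 2048,
    load_in_4bit = True,
)

# Resize the vocab; the new rows start from the trained-token mean (method = "mean").
add_new_tokens(model, tokenizer, new_tokens = ["<|tool_call|>", "<|tool_result|>"])

# get_peft_model sees the _need_to_train_embeddings flag set by add_new_tokens and
# forces embed_tokens / lm_head into modules_to_save if they were left out.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    modules_to_save = ["embed_tokens", "lm_head"],
)
```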
if hasattr(model, "_need_to_train_embeddings"): if not train_lm_head or not train_embed_tokens: print( "Unsloth: You added new tokens but did not specify if you wanted to "\ - "train the lm_head and embed_tokens. We must turn it on for you." + "train the lm_head and embed_tokens.\nWe must turn it on for you." ) train_lm_head = True train_embed_tokens = True + if "lm_head" not in modules_to_save: modules_to_save.append("lm_head") + if "embed_tokens" not in modules_to_save: modules_to_save.append("embed_tokens") pass pass @@ -1498,7 +1490,17 @@ def get_peft_model( if train_embed_tokens or train_lm_head: fix_untrained_tokens(model, eps = 1e-16) pass - + + # Check modules_to_save + if modules_to_save is not None: + for module in modules_to_save: + if module == "lm_head": + train_lm_head = True + elif module == "embed_tokens": + train_embed_tokens = True + pass + pass + # Get LoRA arguments = dict( r = r, From bfb32a35817db5f79c3929ac25a2792d9b976025 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 16:39:26 +1000 Subject: [PATCH 09/69] Update llama.py --- unsloth/models/llama.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 11618eee3..75e9888d6 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1481,8 +1481,12 @@ def get_peft_model( ) train_lm_head = True train_embed_tokens = True - if "lm_head" not in modules_to_save: modules_to_save.append("lm_head") - if "embed_tokens" not in modules_to_save: modules_to_save.append("embed_tokens") + + if modules_to_save is None: modules_to_save = ["embed_tokens"] + else: modules_to_save.append("embed_tokens") + + if modules_to_save is None: modules_to_save = ["lm_head"] + else: modules_to_save.append("lm_head") pass pass @@ -1498,8 +1502,13 @@ def get_peft_model( train_lm_head = True elif module == "embed_tokens": train_embed_tokens = True + else: + raise TypeError( + f"Unsloth: Module = {module} is not allowed. Only 'lm_head' and 'embed_tokens' is allowed." 
+ ) pass pass + modules_to_save = list(set(modules_to_save)) # Get LoRA arguments = dict( From 40a6d009ef381e72f2f6793701040c4f01c942b5 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 16:57:59 +1000 Subject: [PATCH 10/69] Update llama.py --- unsloth/models/llama.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 75e9888d6..45c75010b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1445,6 +1445,10 @@ def get_peft_model( "gate_proj", "up_proj", "down_proj",),) model.config.update({"unsloth_version" : __version__}) + if type(modules_to_save) is tuple: + modules_to_save = list(modules_to_save) + pass + train_lm_head = False train_embed_tokens = False final_modules = [] @@ -1508,7 +1512,9 @@ def get_peft_model( ) pass pass - modules_to_save = list(set(modules_to_save)) + if isinstance(modules_to_save, (tuple, list)): + modules_to_save = list(set(modules_to_save)) + pass # Get LoRA arguments = dict( From 140a0b0a407d6151c27c0802e4a30380ae4df042 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 17:56:03 +1000 Subject: [PATCH 11/69] Update llama.py --- unsloth/models/llama.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 45c75010b..ea01d9080 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -534,8 +534,11 @@ def LlamaModel_fast_forward( pass # Embed positions + print(input_ids) + print(input_ids.min(), input_ids.max()) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + print(inputs_embeds) inputs_embeds = inputs_embeds.to(self.config.torch_dtype) From 88435a80de9644703b5206398e76f71671ef9190 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 21 Apr 2024 18:12:45 +1000 Subject: [PATCH 12/69] pad_token --- unsloth/chat_templates.py | 11 +++++++++-- unsloth/tokenizer_utils.py | 6 +++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 2d15470a0..a5d39df27 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -340,9 +340,16 @@ def get_chat_template( new_tokenizer = tokenizer._tokenizer.from_str(string_vocab) if map_eos_token: - new_tokenizer = tokenizer.__class__(tokenizer_object = new_tokenizer, eos_token = stop_word) + new_tokenizer = tokenizer.__class__( + tokenizer_object = new_tokenizer, + eos_token = stop_word, + pad_token = tokenizer.pad_token, + ) else: - new_tokenizer = tokenizer.__class__(tokenizer_object = new_tokenizer) + new_tokenizer = tokenizer.__class__( + tokenizer_object = new_tokenizer, + pad_token = tokenizer.pad_token, + ) pass # Must fix the sentence piece tokenizer since there's no tokenizer.model file! diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 15606a7c9..f1a9daa99 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -259,7 +259,11 @@ def fix_sentencepiece_tokenizer( # And load it! 
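For context, the pad-token care taken here matters because the rebuilt tokenizer is what users feed straight into `apply_chat_template`. A hedged sketch of that calling pattern, with an assumed checkpoint and toy messages:

```python
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct",   # assumed checkpoint
    max_seq_length = 2048,
    load_in_4bit = True,
)

# Attach the llama-3 template registered in PATCH 01; the pad token is carried over.
tokenizer = get_chat_template(tokenizer, chat_template = "llama-3")

messages = [
    {"role": "user",      "content": "Hello!"},
    {"role": "assistant", "content": "Hi there, how can I help?"},
    {"role": "user",      "content": "Describe Unsloth in one sentence."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
print(prompt)
```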
from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(temporary_location, eos_token = new_tokenizer.eos_token) + tokenizer = AutoTokenizer.from_pretrained( + temporary_location, + eos_token = new_tokenizer.eos_token, + pad_token = new_tokenizer.pad_token, + ) return tokenizer pass From 24790e270906fb4bbf46ef0ab96f693509e16d9e Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 22 Apr 2024 04:51:23 +1000 Subject: [PATCH 13/69] Update chat_templates.py --- unsloth/chat_templates.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index a5d39df27..a7b98aaf9 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -339,16 +339,22 @@ def get_chat_template( if skipped != len(token_mapping): new_tokenizer = tokenizer._tokenizer.from_str(string_vocab) + # Careful on pad_token + old_pad_token = tokenizer.pad_token + if old_pad_token == tokenizer.eos_token: + old_pad_token = stop_word + pass + if map_eos_token: new_tokenizer = tokenizer.__class__( tokenizer_object = new_tokenizer, eos_token = stop_word, - pad_token = tokenizer.pad_token, + pad_token = old_pad_token, ) else: new_tokenizer = tokenizer.__class__( tokenizer_object = new_tokenizer, - pad_token = tokenizer.pad_token, + pad_token = old_pad_token, ) pass @@ -384,6 +390,12 @@ def get_chat_template( string_vocab = string_vocab.replace(old_eos_token, stop_word) pass new_tokenizer = tokenizer._tokenizer.from_str(string_vocab) + + # Careful on pad_token + if old_pad_token == old_eos_token: + old_pad_token = stop_word + pass + new_tokenizer = tokenizer.__class__( tokenizer_object = new_tokenizer, bos_token = old_bos_token, From 1464f7da271c8398c7a8db41ac361212088068ab Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 22 Apr 2024 04:54:51 +1000 Subject: [PATCH 14/69] Update chat_templates.py --- unsloth/chat_templates.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index a7b98aaf9..5d8a15e68 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -285,6 +285,8 @@ def get_chat_template( is_fast_tokenizer = getattr(tokenizer, "is_fast", False) old_padding_side = tokenizer.padding_side + same_padding_token = False + if type(chat_template) in (list, tuple,): chat_template, stop_word = chat_template assert(type(chat_template) is str) @@ -343,6 +345,7 @@ def get_chat_template( old_pad_token = tokenizer.pad_token if old_pad_token == tokenizer.eos_token: old_pad_token = stop_word + same_padding_token = True pass if map_eos_token: @@ -394,6 +397,7 @@ def get_chat_template( # Careful on pad_token if old_pad_token == old_eos_token: old_pad_token = stop_word + same_padding_token = True pass new_tokenizer = tokenizer.__class__( @@ -440,9 +444,11 @@ def get_chat_template( new_pad_token = getattr(tokenizer, "pad_token", None) new_bos_token = getattr(tokenizer, "bos_token", None) new_unk_token = getattr(tokenizer, "unk_token", None) - if old_pad_token != new_pad_token: tokenizer.pad_token = old_pad_token if old_bos_token != new_bos_token: tokenizer.bos_token = old_bos_token if old_unk_token != new_unk_token: tokenizer.unk_token = old_unk_token + if same_padding_token: + if old_pad_token != new_pad_token: tokenizer.pad_token = old_pad_token + pass # stopping_criteria = create_stopping_criteria(tokenizer, stop_word) From df069c51f3fa40eef863a60c9cad2e9f4c855dfb Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 22 
Apr 2024 04:57:30 +1000 Subject: [PATCH 15/69] tokenizer --- unsloth/chat_templates.py | 1 + unsloth/models/llama.py | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 5d8a15e68..6686c60f7 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -436,6 +436,7 @@ def get_chat_template( _, tokenizer = patch_tokenizer(model = None, tokenizer = tokenizer) tokenizer.padding_side = old_padding_side tokenizer.chat_template = chat_template + print(tokenizer) # Also fix up other tokens old_pad_token = getattr(old_tokenizer, "pad_token", None) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index ea01d9080..45c75010b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -534,11 +534,8 @@ def LlamaModel_fast_forward( pass # Embed positions - print(input_ids) - print(input_ids.min(), input_ids.max()) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - print(inputs_embeds) inputs_embeds = inputs_embeds.to(self.config.torch_dtype) From eb00fb7e77a7076f5cafee7761f0e2575ad7cedd Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 22 Apr 2024 05:01:13 +1000 Subject: [PATCH 16/69] Update save.py --- unsloth/save.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index 655d1c510..6e9d82c88 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -922,9 +922,16 @@ def save_to_gguf( f"The output location will be {final_location}\n"\ "This will take 3 minutes...") + # We first check if tokenizer.model exists in the model_directory + if os.path.exists(f"{model_directory}/tokenizer.model"): + vocab_type = "hfft" + else: + vocab_type = "bpe" + pass + if use_fast_convert: command = f"python llama.cpp/convert.py {model_directory} "\ - f"--outfile {final_location} --vocab-type hfft "\ + f"--outfile {final_location} --vocab-type {vocab_type} "\ f"--outtype {first_conversion} --concurrency {n_cpus}" else: # Need to fix convert-hf-to-gguf.py for some models! 
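For reference, the conversion command built above ends up looking roughly like the sketch below; the directory, output path and f16 output type are assumptions for illustration, and PATCH 23 later simplifies the flag to `--vocab-type spm,hfft,bpe` so convert.py picks the right vocabulary itself.

```python
import os

model_directory = "merged_model"                      # assumed: a merged 16-bit save
final_location  = f"{model_directory}/unsloth.F16.gguf"

# tokenizer.model present -> keep the old "hfft" path; otherwise assume a BPE
# tokenizer such as Llama-3's and pass "bpe" instead.
vocab_type = "hfft" if os.path.exists(f"{model_directory}/tokenizer.model") else "bpe"

command = (
    f"python llama.cpp/convert.py {model_directory} "
    f"--outfile {final_location} --vocab-type {vocab_type} "
    f"--outtype f16"
)
print(command)   # e.g. run via subprocess once llama.cpp has been cloned and built
```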
From 805f890e30cc97f9dd324b12b45bfe129f86af9d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 22 Apr 2024 05:02:08 +1000 Subject: [PATCH 17/69] Update chat_templates.py --- unsloth/chat_templates.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 6686c60f7..5d8a15e68 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -436,7 +436,6 @@ def get_chat_template( _, tokenizer = patch_tokenizer(model = None, tokenizer = tokenizer) tokenizer.padding_side = old_padding_side tokenizer.chat_template = chat_template - print(tokenizer) # Also fix up other tokens old_pad_token = getattr(old_tokenizer, "pad_token", None) From 80be6ff8f2fd6aa3c580628fc5f33eedc9cf7ca7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 22 Apr 2024 05:04:38 +1000 Subject: [PATCH 18/69] Update chat_templates.py --- unsloth/chat_templates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 5d8a15e68..d31b6cf7a 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -446,7 +446,7 @@ def get_chat_template( new_unk_token = getattr(tokenizer, "unk_token", None) if old_bos_token != new_bos_token: tokenizer.bos_token = old_bos_token if old_unk_token != new_unk_token: tokenizer.unk_token = old_unk_token - if same_padding_token: + if not same_padding_token: if old_pad_token != new_pad_token: tokenizer.pad_token = old_pad_token pass From 2e62a6908b5250984dc107eaca3329707733dacc Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 23 Apr 2024 01:51:10 +1000 Subject: [PATCH 19/69] patch tokenizer padding --- unsloth/models/llama.py | 47 +++++++++++++++++++++++++++++++++++++-- unsloth/models/mistral.py | 9 ++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 45c75010b..c6b733e12 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1284,6 +1284,15 @@ def from_pretrained( # Add save modules patch_saving_functions(model) + # Save tokenizer for inference purposes + tokenizer.padding_side = "left" # Force inference + internal_model = model + while hasattr(internal_model, "model"): + internal_model._saved_temp_tokenizer = tokenizer + internal_model = internal_model.model + pass + internal_model._saved_temp_tokenizer = tokenizer + return model, tokenizer pass @@ -1554,6 +1563,18 @@ def get_peft_model( model.model.lm_head.modules_to_save.default.requires_grad_(True) pass + # Patch tokenizer to pad to the right + internal_model = model + while hasattr(internal_model, "model"): + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "right" + pass + internal_model = internal_model.model + pass + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "right" + pass + return model pass @@ -1751,6 +1772,18 @@ def for_inference(model): # Wrap model.generate model._unwrapped_old_generate = model.generate model.generate = _wrap_fast_inference(model.generate, device_type, dtype) + + # Patch tokenizer to pad to the left + internal_model = model + while hasattr(internal_model, "model"): + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "left" + pass + internal_model = internal_model.model + pass + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "left" 
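The left/right switching above follows the usual decoder-only convention: generation wants left padding so the final position of every row is a real token, while training normally pads on the right. A small illustration with a plain Hugging Face tokenizer (the checkpoint name is an assumption):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-bnb-4bit")
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

batch = ["Hi", "A noticeably longer prompt"]

tok.padding_side = "left"    # inference: pads go in front, last token is real
print(tok(batch, padding = True).input_ids)

tok.padding_side = "right"   # training: pads sit after the sequence
print(tok(batch, padding = True).input_ids)
```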
+ pass pass @@ -1777,8 +1810,18 @@ def for_training(model, use_gradient_checkpointing = True): model.generate = model._unwrapped_old_generate del model._unwrapped_old_generate pass + + # Patch tokenizer to pad to the right + internal_model = model + while hasattr(internal_model, "model"): + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "right" + pass + internal_model = internal_model.model + pass + if hasattr(internal_model, "_saved_temp_tokenizer"): + internal_model._saved_temp_tokenizer.padding_side = "right" + pass pass pass - - diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 56108939b..80d0ffdf7 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -559,6 +559,15 @@ def from_pretrained( # Add save modules patch_saving_functions(model) + + # Save tokenizer for inference purposes + tokenizer.padding_side = "left" # Force inference + internal_model = model + while hasattr(internal_model, "model"): + internal_model._saved_temp_tokenizer = tokenizer + internal_model = internal_model.model + pass + internal_model._saved_temp_tokenizer = tokenizer return model, tokenizer pass From b0678d6b8a7a04107967d0264bc2c3989ffb5a75 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 23 Apr 2024 04:41:55 +1000 Subject: [PATCH 20/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index f1a9daa99..5dc5856c2 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -524,7 +524,7 @@ def add_new_tokens( tokenizer, new_tokens = [], method = "mean", - interpolation = 0.05, + interpolation = 0.5, ): """ Smartly resizes the tokenizer and adds new tokens to the model. From f85ef9c0494a277ae4c7fef71444966f732e7586 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 24 Apr 2024 00:03:36 +1000 Subject: [PATCH 21/69] Update save.py --- unsloth/save.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 6e9d82c88..493b8acaa 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -689,7 +689,7 @@ def unsloth_save_model( def install_llama_cpp_clone_non_blocking(): - full_command = ["git", "clone", "https://github.com/ggerganov/llama.cpp"] + full_command = ["git", "clone", "--recursive", "https://github.com/ggerganov/llama.cpp"] run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT) return run_installer pass @@ -742,7 +742,7 @@ def install_llama_cpp_old(version = -10): # Clone a specific commit # Also don't use the GPU! commands = [ - "git clone https://github.com/ggerganov/llama.cpp", + "git clone --recursive https://github.com/ggerganov/llama.cpp", f"cd llama.cpp && git reset --hard {version} && git clean -df", "make clean -C llama.cpp", f"make all -j{psutil.cpu_count()*2} -C llama.cpp", @@ -767,7 +767,7 @@ def install_llama_cpp_blocking(use_cuda = True): use_cuda = "LLAMA_CUDA=1" if use_cuda else "" commands = [ - "git clone https://github.com/ggerganov/llama.cpp", + "git clone --recursive https://github.com/ggerganov/llama.cpp", "make clean -C llama.cpp", f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp", "pip install gguf protobuf", @@ -966,7 +966,7 @@ def save_to_gguf( "You might have to compile llama.cpp yourself, then run this again.\n"\ "You do not need to close this Python program. 
Run the following commands in a new terminal:\n"\ "You must run this in the same folder as you're saving your model.\n"\ - "git clone https://github.com/ggerganov/llama.cpp\n"\ + "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ "cd llama.cpp && make clean && LLAMA_CUDA=1 make all -j\n"\ "Once that's done, redo the quantization." ) @@ -1006,7 +1006,7 @@ def save_to_gguf( "Unsloth: Quantization failed! You might have to compile llama.cpp yourself, then run this again.\n"\ "You do not need to close this Python program. Run the following commands in a new terminal:\n"\ "You must run this in the same folder as you're saving your model.\n"\ - "git clone https://github.com/ggerganov/llama.cpp\n"\ + "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ "cd llama.cpp && make clean && LLAMA_CUDA=1 make all -j\n"\ "Once that's done, redo the quantization." ) From d2f10a0f488e036ea4f66caea77bf41ed7746329 Mon Sep 17 00:00:00 2001 From: Igor Kilbas Date: Wed, 24 Apr 2024 16:57:24 +0400 Subject: [PATCH 22/69] Fix: loading models with resized vocabulary (#377) * new: vocab resize on load * new: gitignore --- .gitignore | 160 +++++++++++++++++++++++++++++++++++++++ unsloth/models/loader.py | 4 + 2 files changed, 164 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..68bc17f9f --- /dev/null +++ b/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
+#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index fa864a9a8..a107200ea 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -76,6 +76,7 @@ def from_pretrained( fix_tokenizer = True, trust_remote_code = False, use_gradient_checkpointing = True, + resize_model_vocab = None, *args, **kwargs, ): if token is None and "HF_TOKEN" in os.environ: @@ -149,6 +150,9 @@ def from_pretrained( trust_remote_code = trust_remote_code, *args, **kwargs, ) + + if resize_model_vocab is not None: + model.resize_token_embeddings(resize_model_vocab) # In case the model supports tagging, add the unsloth tag. if hasattr(model, "add_model_tags"): From f5fa6548c6e694b2f688a74cdb9da04ed7cf7603 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 28 Apr 2024 20:08:22 +1000 Subject: [PATCH 23/69] GGUF fix --- unsloth/chat_templates.py | 11 +++++++++++ unsloth/models/llama.py | 4 ++++ unsloth/save.py | 9 +-------- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index d31b6cf7a..4e7a71aee 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -281,6 +281,17 @@ def get_chat_template( IS_GEMMA = True pass + # We add a check for Llama-3 + # if chat_template == "llama-3": + # tokenizer._using_llama3_template = True + # else: + # llama3_tokens = set(["<|end_header_id|>", "<|eot_id|>", "<|start_header_id|>"]) + # check_llama3_tokens = llama3_tokens & set(str(x) for x in tokenizer.added_tokens_decoder.values()) + # if len(check_llama3_tokens) == len(llama3_tokens): + # tokenizer._using_llama3_template = True + # pass + # pass + # We first check if the tokenizer is a fast one. If not, we cannot convert this! 
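A hedged usage sketch for the `resize_model_vocab` argument added in PATCH 22: it resizes the freshly loaded model's token embeddings, for example to match an adapter that was trained after new tokens were added. The model name and target size below are illustrative assumptions:

```python
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = 2048,
    load_in_4bit = True,
    resize_model_vocab = 128260,   # assumed target: Llama-3's 128256 tokens plus 4 new ones
)
```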
is_fast_tokenizer = getattr(tokenizer, "is_fast", False) old_padding_side = tokenizer.padding_side diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index c6b733e12..a7cacea9b 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1543,8 +1543,12 @@ def get_peft_model( if not SUPPORTS_LOFTQ: del arguments["loftq_config"] if not SUPPORTS_RSLORA: del arguments["use_rslora"] + _saved_temp_tokenizer = model._saved_temp_tokenizer + lora_config = LoraConfig(**arguments) model = _get_peft_model(model, lora_config) + + model._saved_temp_tokenizer = _saved_temp_tokenizer model = FastLlamaModel.patch_peft_model(model, use_gradient_checkpointing) diff --git a/unsloth/save.py b/unsloth/save.py index 493b8acaa..a2c55bb53 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -922,16 +922,9 @@ def save_to_gguf( f"The output location will be {final_location}\n"\ "This will take 3 minutes...") - # We first check if tokenizer.model exists in the model_directory - if os.path.exists(f"{model_directory}/tokenizer.model"): - vocab_type = "hfft" - else: - vocab_type = "bpe" - pass - if use_fast_convert: command = f"python llama.cpp/convert.py {model_directory} "\ - f"--outfile {final_location} --vocab-type {vocab_type} "\ + f"--outfile {final_location} --vocab-type spm,hfft,bpe "\ f"--outtype {first_conversion} --concurrency {n_cpus}" else: # Need to fix convert-hf-to-gguf.py for some models! From 8325e05dc401e1fed81bedb783b47795a273486d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 29 Apr 2024 03:52:04 +1000 Subject: [PATCH 24/69] Readme (#390) * Update README.md * Update README.md --------- Co-authored-by: Michael Han <107991372+shimmyshimmer@users.noreply.github.com> --- README.md | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 555e08089..2a9499c22 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ -### Finetune Mistral, Gemma, Llama 2-5x faster with 80% less memory! +### Finetune Llama 3, Mistral & Gemma 2-5x faster with 80% less memory! ![](https://i.ibb.co/sJ7RhGG/image-41.png) @@ -22,12 +22,11 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and | Unsloth supports | Free Notebooks | Performance | Memory use | |-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------| -| **Llama-3 8b** | [▶️ Start on Colab](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | -| **Gemma 7b** | [▶️ Start on Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | -| **Mistral 7b** | [▶️ Start on Colab](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | -| **TinyLlama** | [▶️ Start on Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 82% less | -| **CodeLlama 34b** A100 | [▶️ Start on Colab](https://colab.research.google.com/drive/1y7A0AxE3y8gdj4AVkl2aZX47Xu3P1wJT?usp=sharing) | 1.9x faster | 49% less | -| **Mistral 7b** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) | 5x faster\* | 73% less | +| **Llama 3 (8B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Mistral (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | +| **Gemma (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | +| **Llama 3 (8B)** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook) | 5x faster\* | 73% less | +| **ORPO** | [▶️ Start on Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **DPO - Zephyr** | [▶️ Start on Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | - Benchmarking compared to FA2 + Hugging Face combined. @@ -36,7 +35,8 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - \* Kaggle has 2x T4s, but we use 1. Due to overhead, 1x T4 is 5x faster. ## 🦥 Unsloth.ai News -- 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (just change the model name in the notebook). +- 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). +- 📣 NEW! [ORPO support](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) is here! - 📣 NEW! We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support fine-tuning of LLMs with [4x longer context windows](https://unsloth.ai/blog/long-context)! No change required if you're using our notebooks. 
To enable, simply change 1 line: ```python model = FastLanguageModel.get_peft_model( @@ -46,8 +46,6 @@ model = FastLanguageModel.get_peft_model( ``` - 📣 [CodeGemma](https://colab.research.google.com/drive/19lwcRk_ZQ_ZtX-qzFP3qZBBHZNcMD1hh?usp=sharing) now works along with [Gemma 7b](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) and [Gemma 2b](https://colab.research.google.com/drive/15gGm7x_jTm017_Ic8e317tdIpDG53Mtu?usp=sharing) - 📣 [2x faster inference](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) added for all our models -- 📣 [DPO support](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) is now included. [More info](#DPO) on DPO -- 📣 We did a [blog](https://huggingface.co/blog/unsloth-trl) with 🤗Hugging Face and are in their official docs! Check out the [SFT docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth) and [DPO docs](https://huggingface.co/docs/trl/main/en/dpo_trainer#accelerate-dpo-fine-tuning-using-unsloth) ## 🔗 Links and Resources | Type | Links | From 13b1ae6b93b53bf38c0ebb2acf9673d382fc2d17 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 29 Apr 2024 04:21:45 +1000 Subject: [PATCH 25/69] Update README.md --- README.md | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 2a9499c22..6df661622 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **Mistral (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | | **Gemma (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | | **Llama 3 (8B)** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook) | 5x faster\* | 73% less | -| **ORPO** | [▶️ Start on Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | +| **ORPO** | [▶️ Start on Colab](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | | **DPO - Zephyr** | [▶️ Start on Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | - Benchmarking compared to FA2 + Hugging Face combined. @@ -36,7 +36,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and ## 🦥 Unsloth.ai News - 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). -- 📣 NEW! [ORPO support](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) is here! +- 📣 NEW! [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! - 📣 NEW! We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support fine-tuning of LLMs with [4x longer context windows](https://unsloth.ai/blog/long-context)! No change required if you're using our notebooks. To enable, simply change 1 line: ```python model = FastLanguageModel.get_peft_model( @@ -180,18 +180,20 @@ max_seq_length = 2048 # Supports RoPE Scaling interally, so choose any! 
url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl" dataset = load_dataset("json", data_files = {"train" : url}, split = "train") -# 4bit pre quantized models we support - 4x faster downloading! +# 4bit pre quantized models we support for 4x faster downloading + no OOMs. fourbit_models = [ "unsloth/mistral-7b-bnb-4bit", + "unsloth/mistral-7b-instruct-v0.2-bnb-4bit", "unsloth/llama-2-7b-bnb-4bit", - "unsloth/llama-2-13b-bnb-4bit", - "unsloth/codellama-34b-bnb-4bit", - "unsloth/tinyllama-bnb-4bit", -] # Go to https://huggingface.co/unsloth for more 4-bit models! + "unsloth/gemma-7b-bnb-4bit", + "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b + "unsloth/gemma-2b-bnb-4bit", + "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b + "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3 +] # More models at https://huggingface.co/unsloth -# Load Llama model model, tokenizer = FastLanguageModel.from_pretrained( - model_name = "unsloth/mistral-7b-bnb-4bit", # Supports Llama, Mistral - replace this! + model_name = "unsloth/llama-3-8b-bnb-4bit", max_seq_length = max_seq_length, dtype = None, load_in_4bit = True, @@ -206,7 +208,8 @@ model = FastLanguageModel.get_peft_model( lora_alpha = 16, lora_dropout = 0, # Supports any, but = 0 is optimized bias = "none", # Supports any, but = "none" is optimized - use_gradient_checkpointing = True, + # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes! + use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context random_state = 3407, max_seq_length = max_seq_length, use_rslora = False, # We support rank stabilized LoRA @@ -270,7 +273,8 @@ model = FastLanguageModel.get_peft_model( lora_alpha = 64, lora_dropout = 0, # Supports any, but = 0 is optimized bias = "none", # Supports any, but = "none" is optimized - use_gradient_checkpointing = True, + # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes! + use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context random_state = 3407, max_seq_length = max_seq_length, ) From 5069a7da39a51498154b740e7faa591a4343700c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 29 Apr 2024 04:39:33 +1000 Subject: [PATCH 26/69] Delete .gitignore --- .gitignore | 160 ----------------------------------------------------- 1 file changed, 160 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 68bc17f9f..000000000 --- a/.gitignore +++ /dev/null @@ -1,160 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ From 7c9c3f5fec572d63f72b2ba290c8043d74efd486 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:40:53 +1000 Subject: [PATCH 27/69] Phi-3 --- README.md | 8 ++++++-- unsloth/models/mapper.py | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6df661622..ca5419a20 100644 --- a/README.md +++ b/README.md @@ -25,18 +25,21 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and | **Llama 3 (8B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | | **Gemma (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | -| **Llama 3 (8B)** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook) | 5x faster\* | 73% less | +| **Llama 3 (8B)** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook) | 2x faster | 60% less | | **ORPO** | [▶️ Start on Colab](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | | **DPO - Zephyr** | [▶️ Start on Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | +| **Phi-3 (3.8B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | 3.9x faster | 74% less | +| **TinyLlama (1.1B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 1.9x faster | 43% less | - Benchmarking compared to FA2 + Hugging Face combined. - This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for raw text. This [DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. -- \* Kaggle has 2x T4s, but we use 1. Due to overhead, 1x T4 is 5x faster. +- Other Kaggle Notebooks for [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) ## 🦥 Unsloth.ai News - 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). - 📣 NEW! [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! +- 📣 NEW! [Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! - 📣 NEW! We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support fine-tuning of LLMs with [4x longer context windows](https://unsloth.ai/blog/long-context)! No change required if you're using our notebooks. 
To enable, simply change 1 line: ```python model = FastLanguageModel.get_peft_model( @@ -190,6 +193,7 @@ fourbit_models = [ "unsloth/gemma-2b-bnb-4bit", "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3 + "unsloth/Phi-3-mini-4k-instruct-bnb-4bit", ] # More models at https://huggingface.co/unsloth model, tokenizer = FastLanguageModel.from_pretrained( diff --git a/unsloth/models/mapper.py b/unsloth/models/mapper.py index 769cbff53..b4fbe5738 100644 --- a/unsloth/models/mapper.py +++ b/unsloth/models/mapper.py @@ -140,6 +140,10 @@ "unsloth/llama-3-70b-Instruct-bnb-4bit" : ( "meta-llama/Meta-Llama-3-70B-Instruct", ), + "unsloth/Phi-3-mini-4k-instruct-bnb-4bit" : ( + "unsloth/Phi-3-mini-4k-instruct", + "microsoft/Phi-3-mini-4k-instruct", + ), } INT_TO_FLOAT_MAPPER = {} From 7b696ee6a930f1d65f7e108b636dcee0d4058265 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:44:15 +1000 Subject: [PATCH 28/69] Update README.md --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index ca5419a20..8147d44c4 100644 --- a/README.md +++ b/README.md @@ -20,16 +20,16 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. -| Unsloth supports | Free Notebooks | Performance | Memory use | +| Unsloth supports | Notebooks | Performance | Memory use | |-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------| -| **Llama 3 (8B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | -| **Mistral (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | -| **Gemma (7B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | +| **Llama 3 (8B)** | [▶️ Start Free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Mistral (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | +| **Gemma (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | | **Llama 3 (8B)** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook) | 2x faster | 60% less | -| **ORPO** | [▶️ Start on Colab](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | -| **DPO - Zephyr** | [▶️ Start on Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | -| **Phi-3 (3.8B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | 3.9x faster | 74% less | -| **TinyLlama (1.1B)** | [▶️ Start on Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 1.9x faster | 43% less | +| **ORPO** | [▶️ Start Free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | +| **DPO - 
Zephyr** | [▶️ Start Free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | +| **Phi-3 (3.8B)** | [▶️ Start Free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | +| **TinyLlama (1.1B)** | [▶️ Start Free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. - This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. From 48334f7d99a0ece72a61d7e2278dae8c3eb736cb Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:46:17 +1000 Subject: [PATCH 29/69] Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 8147d44c4..f46c826d6 100644 --- a/README.md +++ b/README.md @@ -20,12 +20,11 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. -| Unsloth supports | Notebooks | Performance | Memory use | +| Unsloth supports | Colab | Performance | Memory use | |-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------| | **Llama 3 (8B)** | [▶️ Start Free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | | **Gemma (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | -| **Llama 3 (8B)** 1xT4 | [▶️ Start on Kaggle](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook) | 2x faster | 60% less | | **ORPO** | [▶️ Start Free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | | **DPO - Zephyr** | [▶️ Start Free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **Phi-3 (3.8B)** | [▶️ Start Free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | From 3665c0bb370e2dc23c73c176fd000d49d87bafaf Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:48:22 +1000 Subject: [PATCH 30/69] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f46c826d6..3cbd63bee 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. 
-| Unsloth supports | Colab | Performance | Memory use | +| Unsloth for | Colab | Speed | Memory use | |-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------| | **Llama 3 (8B)** | [▶️ Start Free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | @@ -28,7 +28,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **ORPO** | [▶️ Start Free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | | **DPO - Zephyr** | [▶️ Start Free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **Phi-3 (3.8B)** | [▶️ Start Free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | -| **TinyLlama (1.1B)** | [▶️ Start Free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | +| **TinyLlama** | [▶️ Start Free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. - This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. From 0f9e073c8cf2a067868fd957e8a24572e0e1c802 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:48:57 +1000 Subject: [PATCH 31/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3cbd63bee..85ba5fd99 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **Mistral (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | | **Gemma (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | | **ORPO** | [▶️ Start Free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | -| **DPO - Zephyr** | [▶️ Start Free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | +| **DPO Zephyr** | [▶️ Start Free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | | **Phi-3 (3.8B)** | [▶️ Start Free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | | **TinyLlama** | [▶️ Start Free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | From eb135d8831ce1bc5a6ce463962d963c011b6c8fd Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:50:31 +1000 Subject: [PATCH 32/69] Update README.md --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 85ba5fd99..d6a72e4fd 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ unsloth logo - + @@ -20,15 +20,15 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. -| Unsloth for | Colab | Speed | Memory use | +| Unsloth for | Colab | Speed | Memory | |-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------| -| **Llama 3 (8B)** | [▶️ Start Free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | -| **Mistral (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | -| **Gemma (7B)** | [▶️ Start Free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | -| **ORPO** | [▶️ Start Free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | -| **DPO Zephyr** | [▶️ Start Free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | -| **Phi-3 (3.8B)** | [▶️ Start Free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | -| **TinyLlama** | [▶️ Start Free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | +| **Llama 3 (8B)** | [▶️ Start free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | - 60% | +| **Mistral (7B)** | [▶️ Start free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | - 73% | +| **Gemma (7B)** | [▶️ Start free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | - 71% | +| **ORPO** | [▶️ Start free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | - 43% | +| **DPO Zephyr** | [▶️ Start free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | - 43% | +| **Phi-3 (3.8B)** | [▶️ Start free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | +| **TinyLlama** | [▶️ Start free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | - 74% | - Benchmarking compared to FA2 + Hugging Face combined. - This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. From 56e2674e1ebfbf512972517967407b5ea57002e1 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:53:19 +1000 Subject: [PATCH 33/69] Update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index d6a72e4fd..45119920b 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ unsloth logo - + @@ -20,15 +20,15 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. 
-| Unsloth for | Colab | Speed | Memory | -|-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------| -| **Llama 3 (8B)** | [▶️ Start free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | - 60% | -| **Mistral (7B)** | [▶️ Start free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | - 73% | -| **Gemma (7B)** | [▶️ Start free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | - 71% | -| **ORPO** | [▶️ Start free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | - 43% | -| **DPO Zephyr** | [▶️ Start free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | - 43% | -| **Phi-3 (3.8B)** | [▶️ Start free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | -| **TinyLlama** | [▶️ Start free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | - 74% | +| Unsloth for | Free Notebooks | Performance | Memory use | +|-----------|---------|--------|----------| +| **Llama 3 (8B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Mistral (7B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | +| **Gemma (7B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | +| **ORPO** | [▶️ Start free finetune](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | +| **DPO Zephyr** | [▶️ Start free finetune](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | +| **Phi-3 (3.8B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | +| **TinyLlama** | [▶️ Start free finetune](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. - This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. From b091a0b6bc9ffd7827c3db8e58e1b0febaf05ef9 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:54:41 +1000 Subject: [PATCH 34/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 45119920b..190b30ce5 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,9 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **TinyLlama** | [▶️ Start free finetune](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. 
+- **Kaggle Notebooks** for [Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) - This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for raw text. This [DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. -- Other Kaggle Notebooks for [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) ## 🦥 Unsloth.ai News - 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). From 18533ab8b909d6066c76a3b073f286260e1c5fff Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:55:48 +1000 Subject: [PATCH 35/69] Update README.md --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 190b30ce5..6a866f05e 100644 --- a/README.md +++ b/README.md @@ -31,14 +31,14 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **TinyLlama** | [▶️ Start free finetune](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. -- **Kaggle Notebooks** for [Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. -- This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for raw text. This [DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. +- **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) +- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. +- This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. ## 🦥 Unsloth.ai News -- 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). -- 📣 NEW! 
[ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! -- 📣 NEW! [Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! +- 📣 NEW! [▶️ Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). +- 📣 NEW! [▶️ ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! +- 📣 NEW! [▶️ Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! - 📣 NEW! We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support fine-tuning of LLMs with [4x longer context windows](https://unsloth.ai/blog/long-context)! No change required if you're using our notebooks. To enable, simply change 1 line: ```python model = FastLanguageModel.get_peft_model( @@ -46,8 +46,8 @@ model = FastLanguageModel.get_peft_model( use_gradient_checkpointing = "unsloth", # <<<<<<< ) ``` -- 📣 [CodeGemma](https://colab.research.google.com/drive/19lwcRk_ZQ_ZtX-qzFP3qZBBHZNcMD1hh?usp=sharing) now works along with [Gemma 7b](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) and [Gemma 2b](https://colab.research.google.com/drive/15gGm7x_jTm017_Ic8e317tdIpDG53Mtu?usp=sharing) -- 📣 [2x faster inference](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) added for all our models +- 📣 [▶️ CodeGemma](https://colab.research.google.com/drive/19lwcRk_ZQ_ZtX-qzFP3qZBBHZNcMD1hh?usp=sharing) now works along with [▶️ Gemma 7b](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) and [▶️ Gemma 2b](https://colab.research.google.com/drive/15gGm7x_jTm017_Ic8e317tdIpDG53Mtu?usp=sharing) +- 📣 [▶️ 2x faster inference](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) added for all our models ## 🔗 Links and Resources | Type | Links | From 3e84338c693acbf1e698a5f7b4f21f29c34a48d7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:57:08 +1000 Subject: [PATCH 36/69] Update README.md --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 6a866f05e..2a893bcbb 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ unsloth logo - + @@ -22,13 +22,13 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and | Unsloth for | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| -| **Llama 3 (8B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | -| **Mistral (7B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | -| **Gemma (7B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | -| **ORPO** | [▶️ Start free finetune](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | -| **DPO Zephyr** | [▶️ Start free finetune](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | -| **Phi-3 (3.8B)** | [▶️ Start free finetune](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | -| **TinyLlama** | [▶️ Start free finetune](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | +| **Llama 3 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | +| **Mistral (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | +| **Gemma (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | +| **ORPO** | [▶️ Start for free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | +| **DPO Zephyr** | [▶️ Start for free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | +| **Phi-3 (3.8B)** | [▶️ Start for free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | +| **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. - **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) From d8feef5824699925dba90ca40553aac17d23e2ae Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 03:58:28 +1000 Subject: [PATCH 37/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2a893bcbb..523f4a55e 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Benchmarking compared to FA2 + Hugging Face combined. 
- **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) - This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. -- This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. +- This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. ## 🦥 Unsloth.ai News - 📣 NEW! [▶️ Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). From 392c034c7835e5e266bcc795ee89f45e06a430b1 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:01:51 +1000 Subject: [PATCH 38/69] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 523f4a55e..3893be43e 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Benchmarking compared to FA2 + Hugging Face combined. - **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML / Vicuna templates. +- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML templates for Llama-3. For [▶️ conversational Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). - This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. 
## 🦥 Unsloth.ai News @@ -47,7 +47,7 @@ model = FastLanguageModel.get_peft_model( ) ``` - 📣 [▶️ CodeGemma](https://colab.research.google.com/drive/19lwcRk_ZQ_ZtX-qzFP3qZBBHZNcMD1hh?usp=sharing) now works along with [▶️ Gemma 7b](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) and [▶️ Gemma 2b](https://colab.research.google.com/drive/15gGm7x_jTm017_Ic8e317tdIpDG53Mtu?usp=sharing) -- 📣 [▶️ 2x faster inference](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) added for all our models +- 📣 [▶️ 2x faster inference](https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing) added for all our models ## 🔗 Links and Resources | Type | Links | From df6fb5291602687fb3da387411ef6bf797f0fdad Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:02:54 +1000 Subject: [PATCH 39/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3893be43e..62cf4fccc 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Benchmarking compared to FA2 + Hugging Face combined. - **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for ShareGPT ChatML templates for Llama-3. For [▶️ conversational Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). +- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. For [▶️ ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). - This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. ## 🦥 Unsloth.ai News From 99ed47a6fbdbb031eabeb5efaba3afab7e017459 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:03:08 +1000 Subject: [PATCH 40/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 62cf4fccc..a72dc313e 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Benchmarking compared to FA2 + Hugging Face combined. - **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. For [▶️ ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). 
+- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. And [▶️ ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). - This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. ## 🦥 Unsloth.ai News From 7fae556c3ee78fcf8ee3bf950941e4966c95e16a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:07:15 +1000 Subject: [PATCH 41/69] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a72dc313e..9f54df092 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and you'll get a 2x faster finetuned model which can be exported to GGUF, vLLM or uploaded to Hugging Face. -| Unsloth for | Free Notebooks | Performance | Memory use | +| Unsloth supports | Free Notebooks | Performance | Memory use | |-----------|---------|--------|----------| | **Llama 3 (8B)** | [▶️ Start for free](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) | 2x faster | 60% less | | **Mistral (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing) | 2.2x faster | 73% less | @@ -33,7 +33,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Benchmarking compared to FA2 + Hugging Face combined. - **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) - This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. And [▶️ ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). -- This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. This [▶️ DPO notebook](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) replicates Zephyr. +- This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. ## 🦥 Unsloth.ai News - 📣 NEW! [▶️ Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). From 000d050c1da66587cf5166964455dabba54d3ed0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:08:07 +1000 Subject: [PATCH 42/69] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9f54df092..c13e69aa5 100644 --- a/README.md +++ b/README.md @@ -36,9 +36,9 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and - This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. ## 🦥 Unsloth.ai News -- 📣 NEW! [▶️ Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). -- 📣 NEW! [▶️ ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! -- 📣 NEW! [▶️ Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! +- 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). +- 📣 NEW! [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! +- 📣 NEW! [Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! - 📣 NEW! We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support fine-tuning of LLMs with [4x longer context windows](https://unsloth.ai/blog/long-context)! No change required if you're using our notebooks. To enable, simply change 1 line: ```python model = FastLanguageModel.get_peft_model( @@ -46,8 +46,8 @@ model = FastLanguageModel.get_peft_model( use_gradient_checkpointing = "unsloth", # <<<<<<< ) ``` -- 📣 [▶️ CodeGemma](https://colab.research.google.com/drive/19lwcRk_ZQ_ZtX-qzFP3qZBBHZNcMD1hh?usp=sharing) now works along with [▶️ Gemma 7b](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) and [▶️ Gemma 2b](https://colab.research.google.com/drive/15gGm7x_jTm017_Ic8e317tdIpDG53Mtu?usp=sharing) -- 📣 [▶️ 2x faster inference](https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing) added for all our models +- 📣 [CodeGemma](https://colab.research.google.com/drive/19lwcRk_ZQ_ZtX-qzFP3qZBBHZNcMD1hh?usp=sharing) now works along with [Gemma 7b](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) and [Gemma 2b](https://colab.research.google.com/drive/15gGm7x_jTm017_Ic8e317tdIpDG53Mtu?usp=sharing) +- 📣 [2x faster inference](https://colab.research.google.com/drive/1aqlNQi7MMJbynFDyOQteD2t0yVfjb9Zh?usp=sharing) added for all our models ## 🔗 Links and Resources | Type | Links | From 27f88f0ab040b66c06294da4f0c2e262601bcdbf Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:08:41 +1000 Subject: [PATCH 43/69] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c13e69aa5..f207d69ee 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,9 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and | **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. 
-- **Kaggle Notebooks** for [▶️ Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [▶️ Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [▶️ Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- This [▶️ conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. And [▶️ ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). -- This [▶️ text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. +- **Kaggle Notebooks** for [Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) +- This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. And [ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). +- This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. ## 🦥 Unsloth.ai News - 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). From affbba181aefd2506d60354cf1d1182871fd3b79 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 04:09:42 +1000 Subject: [PATCH 44/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f207d69ee..ae3ad69d9 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - Benchmarking compared to FA2 + Hugging Face combined. - **Kaggle Notebooks** for [Llama-3 8b](https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-8b-unsloth-notebook), [Gemma 7b](https://www.kaggle.com/code/danielhanchen/kaggle-gemma-7b-unsloth-notebook/), [Mistral 7b](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook) -- This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. And [ChatML Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). +- This [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing) is useful for Llama-3. And ChatML for [Mistral 7b](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing). - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. ## 🦥 Unsloth.ai News From 14f104ad558070f3f8068d21515d63df57bedadf Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 30 Apr 2024 05:58:12 +1000 Subject: [PATCH 45/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ae3ad69d9..07bcf2cf5 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ All notebooks are **beginner friendly**! 
Add your dataset, click "Run All", and | **Gemma (7B)** | [▶️ Start for free](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing) | 2.4x faster | 71% less | | **ORPO** | [▶️ Start for free](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) | 1.9x faster | 43% less | | **DPO Zephyr** | [▶️ Start for free](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing) | 1.9x faster | 43% less | -| **Phi-3 (3.8B)** | [▶️ Start for free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | | | +| **Phi-3 (3.8B)** | [▶️ Start for free](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) | 2x faster | 50% less | | **TinyLlama** | [▶️ Start for free](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) | 3.9x faster | 74% less | - Benchmarking compared to FA2 + Hugging Face combined. From e040d18691af0f1852fca21ff7c42d36829d5456 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 04:07:11 +1000 Subject: [PATCH 46/69] Fix reserved tokens --- unsloth/models/llama.py | 14 +++++++++---- unsloth/save.py | 31 ++++++++++++++++++--------- unsloth/tokenizer_utils.py | 43 +++++++++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 15 deletions(-) diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index a7cacea9b..136ceb2c7 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1503,10 +1503,16 @@ def get_peft_model( pass pass + # Check for Llama-3 + # if hasattr(model._saved_temp_tokenizer, "_using_llama3_template"): + # if not train_embed_tokens and not train_lm_head: + # raise RuntimeError("") + # First fix untrained tokens - if train_embed_tokens or train_lm_head: - fix_untrained_tokens(model, eps = 1e-16) - pass + # Wrong - can cause reserved tokens to pop out!! + # if train_embed_tokens or train_lm_head: + # fix_untrained_tokens(model, eps = 1e-16) + # pass # Check modules_to_save if modules_to_save is not None: @@ -1547,7 +1553,7 @@ def get_peft_model( lora_config = LoraConfig(**arguments) model = _get_peft_model(model, lora_config) - + model._saved_temp_tokenizer = _saved_temp_tokenizer model = FastLlamaModel.patch_peft_model(model, use_gradient_checkpointing) diff --git a/unsloth/save.py b/unsloth/save.py index 0e131fe30..a5ceb1299 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -118,14 +118,14 @@ def _merge_lora(layer, name): W = fast_dequantize(W, quant_state) else: dtype = W.dtype - # W = W.to(torch.float32).t() - W = W.t() + W = W.to(torch.float32).t() + # W = W.t() if A is not None: # sAB = (A.t().to(torch.float32) @ (s * B.t().to(torch.float32))) # W += sAB - # W.addmm_(A.t().to(torch.float32), B.t().to(torch.float32), alpha = s) - W.addmm_(A.t().to(W.dtype), B.t().to(W.dtype), alpha = s) + W.addmm_(A.t().to(torch.float32), B.t().to(torch.float32), alpha = s) + # W.addmm_(A.t().to(W.dtype), B.t().to(W.dtype), alpha = s) # if not torch.isfinite(W).all(): maximum_element = torch.max(W.min().abs(), W.max()) if not torch.isfinite(maximum_element).item(): @@ -696,12 +696,18 @@ def install_llama_cpp_clone_non_blocking(): def install_llama_cpp_make_non_blocking(): - env = { **os.environ, "LLAMA_CUDA": "1", } + # https://github.com/ggerganov/llama.cpp/issues/7062 + # Weirdly GPU conversion for GGUF breaks?? 
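As a side note on the `_merge_lora` change above (moving the merge back to float32): a minimal stand-alone sketch of that arithmetic, with toy shapes and names chosen only for illustration, is shown below. The base weight is upcast, the scaled `A^T @ B^T` update is accumulated in float32, and the result is cast back to the original dtype with an overflow check.

```python
import torch

def merge_lora_fp32(W, A, B, s):
    # Sketch only: W is the (out, in) base weight, A is lora_A (r, in),
    # B is lora_B (out, r), s is the LoRA scaling. Accumulate in float32,
    # then cast back, checking for overflow much like the patch does.
    dtype = W.dtype
    W32 = W.to(torch.float32).t()                                        # (in, out)
    W32.addmm_(A.t().to(torch.float32), B.t().to(torch.float32), alpha = s)
    maximum_element = torch.max(W32.min().abs(), W32.max())
    if not torch.isfinite(maximum_element).item():
        raise ValueError("Merged weight overflowed; check the scaling s.")
    return W32.t().to(dtype)                                             # back to (out, in)

# Toy usage with bfloat16 tensors
W = torch.randn(8, 4, dtype = torch.bfloat16)
A = torch.randn(2, 4, dtype = torch.bfloat16)
B = torch.randn(8, 2, dtype = torch.bfloat16)
merged = merge_lora_fp32(W, A, B, s = 0.5)
```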
+ # env = { **os.environ, "LLAMA_CUDA": "1", } n_jobs = max(int(psutil.cpu_count()*1.5), 1) # Force make clean os.system("make clean -C llama.cpp") full_command = ["make", "all", "-j"+str(n_jobs), "-C", "llama.cpp"] - run_installer = subprocess.Popen(full_command, env = env, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT) + + # https://github.com/ggerganov/llama.cpp/issues/7062 + # Weirdly GPU conversion for GGUF breaks?? + # run_installer = subprocess.Popen(full_command, env = env, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT) + run_installer = subprocess.Popen(full_command, stdout = subprocess.DEVNULL, stderr = subprocess.STDOUT) return run_installer pass @@ -764,12 +770,17 @@ def install_llama_cpp_old(version = -10): def install_llama_cpp_blocking(use_cuda = True): - use_cuda = "LLAMA_CUDA=1" if use_cuda else "" + # https://github.com/ggerganov/llama.cpp/issues/7062 + # Weirdly GPU conversion for GGUF breaks?? + # use_cuda = "LLAMA_CUDA=1" if use_cuda else "" commands = [ "git clone --recursive https://github.com/ggerganov/llama.cpp", "make clean -C llama.cpp", - f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp", + # https://github.com/ggerganov/llama.cpp/issues/7062 + # Weirdly GPU conversion for GGUF breaks?? + # f"{use_cuda} make all -j{psutil.cpu_count()*2} -C llama.cpp", + f"make all -j{psutil.cpu_count()*2} -C llama.cpp", "pip install gguf protobuf", ] if os.path.exists("llama.cpp"): return @@ -967,7 +978,7 @@ def save_to_gguf( "You do not need to close this Python program. Run the following commands in a new terminal:\n"\ "You must run this in the same folder as you're saving your model.\n"\ "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ - "cd llama.cpp && make clean && LLAMA_CUDA=1 make all -j\n"\ + "cd llama.cpp && make clean && make all -j\n"\ "Once that's done, redo the quantization." ) pass @@ -1007,7 +1018,7 @@ def save_to_gguf( "You do not need to close this Python program. Run the following commands in a new terminal:\n"\ "You must run this in the same folder as you're saving your model.\n"\ "git clone --recursive https://github.com/ggerganov/llama.cpp\n"\ - "cd llama.cpp && make clean && LLAMA_CUDA=1 make all -j\n"\ + "cd llama.cpp && make clean && make all -j\n"\ "Once that's done, redo the quantization." ) pass diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 5dc5856c2..fe2b2d837 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -518,6 +518,44 @@ def fix_untrained_tokens(model, eps = 1e-16): pass +@torch.inference_mode +def mean_of_trained_tokens(model, eps = 1e-16): + """ + Llama-3 for eg has untrained vectors in the base model. + These include <|eot_id|>, <|start_header_id|>, <|end_header_id|> + We reset them to the mean of the rest of the tokens + """ + embedding_matrix = model.get_input_embeddings ().weight.data.clone() + lm_head_matrix = model.get_output_embeddings().weight.data.clone() + + # Get untrained tokens + indicator_untrained = torch.amax(embedding_matrix, axis = 1) <= eps + where_untrained = torch.where(indicator_untrained)[0] + n_untrained = where_untrained.shape[0] + n_trained = embedding_matrix.shape[0] - n_untrained + if n_untrained != 0: + print( + f"Unsloth: Not an error, but your model has {n_untrained} untrained tokens.\n"\ + "We shall set them to the mean of the other trained tokens." + ) + pass + + # First set untrained to all 0s - sometimes it's not! 
1e-23 for bfloat16 + embedding_matrix[where_untrained] = 0 + lm_head_matrix [where_untrained] = 0 + + # Find sum + sum_embedding = torch.sum(embedding_matrix, dtype = torch.float32, axis = 0) + sum_lm_head = torch.sum(lm_head_matrix, dtype = torch.float32, axis = 0) + + # Find correct average by dividing by sum of trained tokens + mean_embedding = (sum_embedding / n_trained).to(embedding_matrix.dtype) + mean_lm_head = (sum_lm_head / n_trained).to(lm_head_matrix .dtype) + + return mean_embedding, mean_lm_head +pass + + @torch.inference_mode def add_new_tokens( model, @@ -547,7 +585,10 @@ def add_new_tokens( pass # Get mean of trained tokens - mean_embedding, mean_lm_head = fix_untrained_tokens(model) + # mean_embedding, mean_lm_head = fix_untrained_tokens(model) + + # Weirdly be careful reserved tokens can pop out + mean_embedding, mean_lm_head = mean_of_trained_tokens(model) mean_embedding = mean_embedding.to(torch.float32) mean_lm_head = mean_lm_head .to(torch.float32) From f53944a44ceb21acd7efe40034160bfd6db814ff Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 04:21:40 +1000 Subject: [PATCH 47/69] Update save.py --- unsloth/save.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unsloth/save.py b/unsloth/save.py index a5ceb1299..e50f0d34d 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -844,6 +844,12 @@ def save_to_gguf( first_conversion : str = "f16", _run_installer = None, # Non blocking install of llama.cpp ): + logger.warning( + "WARNING: llama.cpp GGUF conversion is currently unstable, since llama.cpp is\n"\ + "undergoing some major bug fixes as at 5th of May 2024. This is not an Unsloth issue.\n"\ + "Please be patient - GGUF saving should still work, but might not work as well." + ) + from transformers.models.llama.modeling_llama import logger if quantization_method.startswith("iq2"): From 70b41d1fc1f504df90be04f889bf2f1ac7613b47 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 04:49:13 +1000 Subject: [PATCH 48/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 42 +++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index fe2b2d837..21450ff73 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -25,7 +25,6 @@ "load_correct_tokenizer", "fix_sentencepiece_tokenizer", "check_tokenizer", - "fix_untrained_tokens", "add_new_tokens", ] @@ -636,3 +635,44 @@ def add_new_tokens( return pass + + +def fix_sft_trainer_tokenizer(): + """ + Fixes double adding BOS tokens like in llama-3 + """ + from inspect import getsource + import trl.trainer.sft_trainer + from trl.trainer.sft_trainer import * + + for function_name, replacer in ( + ("_prepare_non_packed_dataloader", "def tokenize(element):",), + ("_prepare_packed_dataloader", "if dataset_text_field is not None",), + ): + function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}")) + where = function.find("def") + function = function.split("\n") + function = "\n".join(x[where:] for x in function) + + check_text = \ + "\n"\ + "print(1)\n"\ + "test_text = dataset[0][dataset_text_field] if not use_formatting_func else formatting_func(dataset[0])\n"\ + "chat_template = getattr(tokenizer, 'chat_template', None)\n"\ + "chat_template = '' if chat_template is None else chat_template\n"\ + "has_bos_token_already = tokenizer.bos_token in test_text or tokenizer.bos_token in chat_template\n"\ + "add_special_tokens = False if has_bos_token_already 
else add_special_tokens\n\n" + + check_text = check_text.split("\n") + check_text = "\n".join(" "*where + x for x in check_text) + + function = function.replace(replacer, check_text + replacer) + exec(function, globals()) + + # Replace TRL's SFTTrainer + exec(f"trl.trainer.sft_trainer.SFTTrainer.{function_name} = {function_name}", globals()) + pass +pass + +# Fixes double adding BOS tokens like in llama-3 +fix_sft_trainer_tokenizer() From 1b1b931260efcbb5fda84b073d7e6dfecb9245ac Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 04:51:48 +1000 Subject: [PATCH 49/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 21450ff73..58e36c674 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -637,14 +637,14 @@ def add_new_tokens( pass +from inspect import getsource +import trl.trainer.sft_trainer +from trl.trainer.sft_trainer import * + def fix_sft_trainer_tokenizer(): """ Fixes double adding BOS tokens like in llama-3 """ - from inspect import getsource - import trl.trainer.sft_trainer - from trl.trainer.sft_trainer import * - for function_name, replacer in ( ("_prepare_non_packed_dataloader", "def tokenize(element):",), ("_prepare_packed_dataloader", "if dataset_text_field is not None",), From 61edc3cfbf048c3b73f6abb2d28bc62845a04acf Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 04:56:07 +1000 Subject: [PATCH 50/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 58e36c674..43014df8f 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -656,8 +656,7 @@ def fix_sft_trainer_tokenizer(): check_text = \ "\n"\ - "print(1)\n"\ - "test_text = dataset[0][dataset_text_field] if not use_formatting_func else formatting_func(dataset[0])\n"\ + "test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])\n"\ "chat_template = getattr(tokenizer, 'chat_template', None)\n"\ "chat_template = '' if chat_template is None else chat_template\n"\ "has_bos_token_already = tokenizer.bos_token in test_text or tokenizer.bos_token in chat_template\n"\ From 73df3ee5325dd4544c78441bdc3f093b583c70a6 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 05:00:12 +1000 Subject: [PATCH 51/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 43014df8f..3e78b6b62 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -647,7 +647,7 @@ def fix_sft_trainer_tokenizer(): """ for function_name, replacer in ( ("_prepare_non_packed_dataloader", "def tokenize(element):",), - ("_prepare_packed_dataloader", "if dataset_text_field is not None",), + # ("_prepare_packed_dataloader", "if dataset_text_field is not None",), ): function = getsource(eval(f"trl.trainer.sft_trainer.SFTTrainer.{function_name}")) where = function.find("def") From 15d78981bba72dbc6b8146b2a4cdb9220259b10a Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 5 May 2024 05:14:38 +1000 Subject: [PATCH 52/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py 
b/unsloth/tokenizer_utils.py index 3e78b6b62..0d6dadf7d 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -659,7 +659,7 @@ def fix_sft_trainer_tokenizer(): "test_text = dataset[0][dataset_text_field] if (formatting_func is None or not use_formatting_func) else formatting_func(dataset[0])\n"\ "chat_template = getattr(tokenizer, 'chat_template', None)\n"\ "chat_template = '' if chat_template is None else chat_template\n"\ - "has_bos_token_already = tokenizer.bos_token in test_text or tokenizer.bos_token in chat_template\n"\ + "has_bos_token_already = test_text.startswith(tokenizer.bos_token) or tokenizer.bos_token in chat_template\n"\ "add_special_tokens = False if has_bos_token_already else add_special_tokens\n\n" check_text = check_text.split("\n") From 76ed0a49ff8f532d94e85d6d3305f47b020c6d6f Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 6 May 2024 20:21:48 +1000 Subject: [PATCH 53/69] Update chat_templates.py --- unsloth/chat_templates.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 4e7a71aee..c086c7e87 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -266,6 +266,20 @@ CHAT_TEMPLATES["llama-3"] = (llama3_template, llama3_template_eos_token,) +# Phi-3 +phi3_template = \ + "{{ bos_token }}"\ + "{% for message in messages %}"\ + "{% if (message['role'] == 'user') %}"\ + "{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}"\ + "{% elif (message['role'] == 'assistant') %}"\ + "{{message['content'] + '<|end|>' + '\n'}}"\ + "{% endif %}"\ + "{% endfor %}" +phi3_template_eos_token = "<|end|>" +CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token,) + + def get_chat_template( tokenizer, chat_template = "chatml", @@ -595,4 +609,12 @@ def test_chat_templates(): correct_tokenizer.chat_template = template our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) assert(correct_prompt == our_prompt) + + # Phi-3 + template = phi3_templatetemplate + correct_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + correct_tokenizer.chat_template = template + our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + assert(correct_prompt == our_prompt) pass From dfec8dd72842b5c7cac09062e01942c5a9f4062e Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 8 May 2024 06:31:02 +1000 Subject: [PATCH 54/69] Update save.py --- unsloth/save.py | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index 868d25de4..b825b10fb 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -18,6 +18,7 @@ from typing import Optional, Callable, Union, List import torch import os +import shutil import pickle import gc from transformers.models.llama.modeling_llama import logger @@ -87,6 +88,24 @@ def print_quantization_methods(): pass +def check_if_sentencepiece_model(model, temporary_location = "_unsloth_sentencepiece_temp"): + if not hasattr(model, "_saved_temp_tokenizer"): return False + + temp_tokenizer = model._saved_temp_tokenizer + sentencepiece_model = False + file_location = f"{temporary_location}/{temp_tokenizer.name_or_path}" + if not os.path.exists(file_location): + os.makedirs(file_location) + pass 
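To make the `has_bos_token_already` guard above concrete, here is a small reproduction of the double-BOS problem it prevents (a sketch: the model name is illustrative and the printed ids are only what one would expect for a Llama-3 style tokenizer).

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct")
text = tokenizer.bos_token + "Hello"  # what a template that emits {{ bos_token }} produces

with_specials    = tokenizer(text, add_special_tokens = True)["input_ids"]
without_specials = tokenizer(text, add_special_tokens = False)["input_ids"]

print(with_specials[:2])     # e.g. [128000, 128000] -> BOS appears twice
print(without_specials[:2])  # e.g. [128000, ...]    -> single BOS, which is what we want
```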
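For the Phi-3 template added above, this is roughly how it renders a short exchange (a sketch: it re-declares the template string from the patch for self-containment, and the tokenizer name is only an example with a `<s>` BOS token).

```python
from transformers import AutoTokenizer

phi3_template = (
    "{{ bos_token }}"
    "{% for message in messages %}"
    "{% if (message['role'] == 'user') %}"
    "{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}"
    "{% elif (message['role'] == 'assistant') %}"
    "{{message['content'] + '<|end|>' + '\n'}}"
    "{% endif %}"
    "{% endfor %}"
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
tokenizer.chat_template = phi3_template

messages = [
    {"role": "user",      "content": "What is 2+2?"},
    {"role": "assistant", "content": "It is 4."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = False)
# Expected roughly: "<s><|user|>\nWhat is 2+2?<|end|>\n<|assistant|>\nIt is 4.<|end|>\n"
print(prompt)
```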
+ temp_tokenizer.save_pretrained(file_location) + if os.path.isfile(f"{file_location}/tokenizer.model"): + sentencepiece_model = True + pass + shutil.rmtree(file_location) + return sentencepiece_model +pass + + def _free_cached_model(model): from huggingface_hub import scan_cache_dir cached_repos = list(scan_cache_dir().repos) @@ -840,6 +859,7 @@ def _fix_gemma_gguf(): def save_to_gguf( model_type : str, + is_sentencepiece : bool = False, model_directory : str = "unsloth_finetuned_model", quantization_method : str = "fast_quantized", first_conversion : str = "f16", @@ -856,7 +876,8 @@ def save_to_gguf( # Careful convert.py is only for Llama / Mistral based archs use_fast_convert = False - if model_type == "llama": use_fast_convert = True + if not is_sentencepiece: use_fast_convert = False # Llama-3 + elif model_type == "llama": use_fast_convert = True elif model_type == "mistral": use_fast_convert = True pass logger.warning_once(f"Unsloth: Converting {model_type} model. Can use fast conversion = {use_fast_convert}.") @@ -951,7 +972,7 @@ def save_to_gguf( f"--outtype {first_conversion} --concurrency {n_cpus}" else: # Need to fix convert-hf-to-gguf.py for some models! - _fix_gemma_gguf() + # _fix_gemma_gguf() command = f"python llama.cpp/convert-hf-to-gguf.py {model_directory} "\ f"--outfile {final_location} "\ @@ -1353,7 +1374,10 @@ def unsloth_save_pretrained_gguf( gc.collect() model_type = self.config.model_type - file_location = save_to_gguf(model_type, new_save_directory, quantization_method, first_conversion, makefile) + is_sentencepiece_model = check_if_sentencepiece_model(self) + file_location = save_to_gguf(model_type, is_sentencepiece_model, + new_save_directory, quantization_method, first_conversion, makefile, + ) if push_to_hub: print("Unsloth: Uploading GGUF to Huggingface Hub...") @@ -1473,7 +1497,10 @@ def unsloth_push_to_hub_gguf( gc.collect() model_type = self.config.model_type - file_location = save_to_gguf(model_type, new_save_directory, quantization_method, first_conversion, makefile) + is_sentencepiece_model = check_if_sentencepiece_model(self) + file_location = save_to_gguf(model_type, is_sentencepiece_model, + new_save_directory, quantization_method, first_conversion, makefile, + ) print("Unsloth: Uploading GGUF to Huggingface Hub...") username = upload_to_huggingface( From 73af5d11f1d7c052d42ed47b62e0c73389a26f82 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 8 May 2024 07:35:24 +1000 Subject: [PATCH 55/69] Update _utils.py --- unsloth/models/_utils.py | 64 +++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 9c4ae8fc6..49f054e43 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -144,24 +144,60 @@ def make_inputs_require_grad(module, input, output): def patch_tokenizer(model, tokenizer): + """ + Phi3's pad_token isn't set. We set it to <|placeholder... + Llama-3 is <|reserved... + Llama-2 is + Check if pad_token is not the same as eos_token otherwise the loss will ignore it!! 
+ Fixes https://github.com/unslothai/unsloth/issues/5 + """ + possible_reserved_tokens = ("<|reserved", "<|placeholder",) + if model is not None: model.config.update({"unsloth_version" : __version__}) - if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: - # Fixes https://github.com/unslothai/unsloth/issues/5 - if hasattr(tokenizer, "unk_token") and tokenizer.unk_token is not None: - tokenizer.add_special_tokens({"pad_token" : tokenizer.unk_token}) - tokenizer.pad_token = tokenizer.unk_token - else: - name = model.config._name_or_path if model is not None else "Model" - logger.warning_once( - f"{name} does not have a padding or unknown token!\n"\ - f"Will use the EOS token of id {tokenizer.eos_token_id} as padding." + + bad_pad_token = False + if hasattr(tokenizer, "pad_token") and tokenizer.pad_token is not None: + # Check if pad_token is not the same as eos_token otherwise the loss will ignore it!! + bad_pad_token = tokenizer.eos_token == tokenizer.pad_token + elif hasattr(tokenizer, "pad_token") and tokenizer.pad_token is None: + bad_pad_token = True + else: + bad_pad_token = False + pass + + if bad_pad_token: + # Find a better pad token + added_tokens = [str(x) for x in tokenizer.added_tokens_decoder.values()] + possible_pad_token = None + for added_token in added_tokens[::-1]: + if added_token.startswith(possible_reserved_tokens): + possible_pad_token = added_token + break + pass + pass + if possible_pad_token is None: + # Try unk_token + possible_pad_token = tokenizer.unk_token + pass + if possible_pad_token is None: + # Failure!! + raise RuntimeError( + "Unsloth: Tokenizer's pad_token cannot be = eos_token, and we couldn't find a\n"\ + "replacement of either <|reserved... or <|placeholder..." ) - assert(hasattr(tokenizer, "eos_token")) - tokenizer.add_special_tokens({"pad_token" : tokenizer.eos_token}) - tokenizer.pad_token = tokenizer.eos_token + pass + + name = model.config._name_or_path if model is not None else "Model" + logger.warning_once( + f"{name} does not have a padding token! Will use pad_token = {possible_pad_token}." 
+ ) + + # Edit pad_token + tokenizer.add_special_tokens({"pad_token" : possible_pad_token}) + tokenizer.pad_token = possible_pad_token if model is not None: - config = model.config.update({"pad_token_id" : tokenizer.eos_token_id}) + config = model.config.update({"pad_token_id" : tokenizer.pad_token_id}) pass return model, tokenizer pass From 9c7d9a7dec537870099a0be822def59f3f7c3db0 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Wed, 8 May 2024 07:37:30 +1000 Subject: [PATCH 56/69] Update chat_templates.py --- unsloth/chat_templates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index c086c7e87..07999ea0d 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -611,7 +611,7 @@ def test_chat_templates(): assert(correct_prompt == our_prompt) # Phi-3 - template = phi3_templatetemplate + template = phi3_template correct_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) correct_tokenizer.chat_template = template From 7c536521addf628c0d7c412e8108dee1e651d768 Mon Sep 17 00:00:00 2001 From: Nathan Azrak <42650258+nathan-az@users.noreply.github.com> Date: Sat, 11 May 2024 02:53:19 +1000 Subject: [PATCH 57/69] Adds dependencies and extras for torch 2.3.0 with new xformers versions (#415) * Adds dependencies and extras for torch 2.3.0 with new xformers versions * Add 2.3.0 section to readme --- README.md | 9 ++++++++- pyproject.toml | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c6e7d6c50..c06dd9796 100644 --- a/README.md +++ b/README.md @@ -159,7 +159,14 @@ pip install --no-deps packaging ninja einops flash-attn xformers trl peft accele pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" pip install --no-deps xformers trl peft accelerate bitsandbytes ``` -7. To troubleshoot installs try the below (all must succeed). Xformers should mostly all be available. +7. For Pytorch 2.3.0: Use the `"ampere"` path for newer RTX 30xx GPUs or higher. +```bash +pip install "unsloth[cu118-torch230] @ git+https://github.com/unslothai/unsloth.git" +pip install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git" +pip install "unsloth[cu118-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git" +pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git" +``` +8. To troubleshoot installs try the below (all must succeed). Xformers should mostly all be available. 
```bash nvcc python -m xformers.info diff --git a/pyproject.toml b/pyproject.toml index e6f663a96..0398d0df4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,17 @@ cu121onlytorch220 = [ "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", ] +cu118onlytorch230 = [ + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", +] +cu121onlytorch230 = [ + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'", + "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'", +] + cu118 = [ "unsloth[huggingface]", "bitsandbytes", @@ -126,6 +137,16 @@ cu121-torch220 = [ "bitsandbytes", "unsloth[cu121onlytorch220]", ] +cu118-torch230 = [ + "unsloth[huggingface]", + "bitsandbytes", + "unsloth[cu118onlytorch230]", +] +cu121-torch230 = [ + "unsloth[huggingface]", + "bitsandbytes", + "unsloth[cu121onlytorch230]", +] kaggle = [ "unsloth[huggingface]", ] @@ -238,6 +259,22 @@ cu121-ampere-torch220 = [ "ninja", "flash-attn", ] +cu118-ampere-torch230 = [ + "unsloth[huggingface]", + "bitsandbytes", + "unsloth[cu118onlytorch230]", + "packaging", + "ninja", + "flash-attn", +] +cu121-ampere-torch230 = [ + "unsloth[huggingface]", + "bitsandbytes", + "unsloth[cu121onlytorch230]", + "packaging", + "ninja", + "flash-attn", +] [project.urls] homepage = "http://www.unsloth.ai" From cf83fe331a159fb6d98f84330a470f708aa1aa74 Mon Sep 17 00:00:00 2001 From: Yang JianXin <995462226@qq.com> Date: Sat, 11 May 2024 01:23:55 +0800 Subject: [PATCH 58/69] Support Qwen2 (#428) * support Qwen2 * support Qwen2 * Delete README.md * Revert "Delete README.md" This reverts commit 026b05f859410ddd04e1a2b4b54e950b89b4a58a. * Update README.md * Qwen2 == Mistral * Update llama.py * Update __init__.py * Update README.md --------- Co-authored-by: Daniel Han --- README.md | 1 + unsloth/models/__init__.py | 7 +-- unsloth/models/llama.py | 1 + unsloth/models/loader.py | 3 ++ unsloth/models/mistral.py | 2 +- unsloth/models/qwen2.py | 91 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 101 insertions(+), 4 deletions(-) create mode 100644 unsloth/models/qwen2.py diff --git a/README.md b/README.md index c06dd9796..ca5b6533b 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and - This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text. ## 🦥 Unsloth.ai News +- 📣 NEW! Qwen1.5-7B, Qwen1.5-14B, Qwen1.5-32B, Qwen1.5-72B now work, courtesy of Firefly's PR [#428](https://github.com/unslothai/unsloth/pull/428) - 📣 NEW! 
[Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook). - 📣 NEW! [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here! - 📣 NEW! [Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here! diff --git a/unsloth/models/__init__.py b/unsloth/models/__init__.py index 891947d69..ff7129e06 100644 --- a/unsloth/models/__init__.py +++ b/unsloth/models/__init__.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .loader import FastLanguageModel -from .llama import FastLlamaModel +from .loader import FastLanguageModel +from .llama import FastLlamaModel from .mistral import FastMistralModel -from .dpo import PatchDPOTrainer +from .qwen2 import FastQwen2Model +from .dpo import PatchDPOTrainer diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py index 136ceb2c7..44998b4cf 100644 --- a/unsloth/models/llama.py +++ b/unsloth/models/llama.py @@ -1605,6 +1605,7 @@ def patch_peft_model( if model_type == "llama": apply_lora_mlp = apply_lora_mlp_swiglu elif model_type == "mistral": apply_lora_mlp = apply_lora_mlp_swiglu + elif model_type == "qwen2": apply_lora_mlp = apply_lora_mlp_swiglu elif model_type == "gemma": apply_lora_mlp = apply_lora_mlp_geglu_approx else: raise NotImplementedError(f"Unsloth: {model_type} is not yet implemented!") diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index a107200ea..2b3bf4794 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -14,6 +14,7 @@ from .llama import FastLlamaModel, logger from .mistral import FastMistralModel +from .qwen2 import FastQwen2Model from transformers import AutoConfig from transformers import __version__ as transformers_version from peft import PeftConfig, PeftModel @@ -119,6 +120,8 @@ def from_pretrained( f"to obtain the latest transformers build, then restart this session."\ ) dispatch_model = FastGemmaModel + elif model_type == "qwen2": + dispatch_model = FastQwen2Model else: raise NotImplementedError( f"Unsloth: {model_name} not supported yet!\n"\ diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py index 80d0ffdf7..902177cb1 100644 --- a/unsloth/models/mistral.py +++ b/unsloth/models/mistral.py @@ -343,7 +343,7 @@ def from_pretrained( # Mistral does NOT support RoPE Scaling sadly so we have to error out. if max_seq_length > model_max_seq_length: raise RuntimeError( - "Unsloth: Unfortunately Mistral type models do not support RoPE scaling!\n"\ + f"Unsloth: Unfortunately {model_patcher.__name__[4:-5]} type models do not support RoPE scaling!\n"\ f"The maximum sequence length supported is {model_max_seq_length}.", ) pass diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py new file mode 100644 index 000000000..76fe31a6d --- /dev/null +++ b/unsloth/models/qwen2.py @@ -0,0 +1,91 @@ +# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .llama import * +from .mistral import FastMistralModel +import os +from ._utils import __version__ + +from transformers.models.qwen2.modeling_qwen2 import ( + Qwen2Attention, + Qwen2DecoderLayer, + Qwen2Model, + Qwen2ForCausalLM, +) +# For Pytorch 2.1.1 +try: + from transformers.models.qwen2.modeling_qwen2 import ( + Qwen2SdpaAttention, + Qwen2FlashAttention2, + ) +except: + Qwen2SdpaAttention = Qwen2Attention + Qwen2FlashAttention2 = Qwen2Attention +pass + + +class FastQwen2Model(FastLlamaModel): + + @staticmethod + def pre_patch(): + Qwen2Attention .forward = LlamaAttention_fast_forward + Qwen2SdpaAttention .forward = LlamaAttention_fast_forward + Qwen2FlashAttention2.forward = LlamaAttention_fast_forward + Qwen2DecoderLayer .forward = LlamaDecoderLayer_fast_forward + Qwen2Model .forward = LlamaModel_fast_forward + Qwen2ForCausalLM .forward = CausalLM_fast_forward(LlamaModel_fast_forward_inference) + PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward + + # Solves https://github.com/unslothai/unsloth/issues/168 + # Static KV Cache was introduced in 4.38.0, causing training to be much slower. + # Inferene can now be CUDAGraphed, but we shall retain the old rotary embeddings. + # https://github.com/huggingface/transformers/pull/27931 + # https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py + import transformers.models.qwen2.modeling_qwen2 + transformers.models.qwen2.modeling_qwen2.Qwen2RotaryEmbedding = LlamaRotaryEmbedding + return + pass + + + @staticmethod + def from_pretrained( + model_name = "Qwen/Qwen1.5-7B", + max_seq_length = 4096, + dtype = None, + load_in_4bit = True, + token = None, + device_map = "sequential", + rope_scaling = None, # Qwen2 does not support RoPE scaling + fix_tokenizer = True, + model_patcher = None, + tokenizer_name = None, + trust_remote_code = False, + **kwargs, + ): + return FastMistralModel.from_pretrained( + model_name = model_name, + max_seq_length = max_seq_length, + dtype = dtype, + load_in_4bit = load_in_4bit, + token = token, + device_map = device_map, + rope_scaling = rope_scaling, + fix_tokenizer = fix_tokenizer, + model_patcher = FastQwen2Model, + tokenizer_name = tokenizer_name, + trust_remote_code = trust_remote_code, + **kwargs, + ) + pass +pass From f7dab306196af4a68236e04399853abad4d67454 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 11 May 2024 03:24:10 +1000 Subject: [PATCH 59/69] Update save.py --- unsloth/save.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/unsloth/save.py b/unsloth/save.py index b825b10fb..e247cd1f0 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -865,11 +865,11 @@ def save_to_gguf( first_conversion : str = "f16", _run_installer = None, # Non blocking install of llama.cpp ): - logger.warning( - "NOTICE: llama.cpp GGUF conversion is currently unstable, since llama.cpp is\n"\ - "undergoing some major bug fixes as at 5th of May 2024. This is not an Unsloth issue.\n"\ - "Please be patient - GGUF saving should still work, but might not work as well." 
- ) + # logger.warning( + # "NOTICE: llama.cpp GGUF conversion is currently unstable, since llama.cpp is\n"\ + # "undergoing some major bug fixes as at 5th of May 2024. This is not an Unsloth issue.\n"\ + # "Please be patient - GGUF saving should still work, but might not work as well." + # ) if quantization_method.startswith("iq2"): raise RuntimeError("Unsloth: Currently iq2 type quantizations aren't supported yet - sorry!") From 6c9fcc9aefa4394681fda8ba2c86ad99c45de0f4 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 11 May 2024 03:34:15 +1000 Subject: [PATCH 60/69] Update save.py --- unsloth/save.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/save.py b/unsloth/save.py index e247cd1f0..bebffd9e3 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -980,6 +980,8 @@ def save_to_gguf( pass with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE, bufsize = 1) as sp: + for line in sp.stderr: + print(line.decode("utf-8", errors = "replace"), flush = True, end = "") for line in sp.stdout: print(line.decode("utf-8", errors = "replace"), flush = True, end = "") if sp.returncode is not None and sp.returncode != 0: From f1350ca9bfd253e8295c8c8e4ce1124551543870 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 11 May 2024 03:36:30 +1000 Subject: [PATCH 61/69] Update _utils.py --- unsloth/models/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 49f054e43..80cb19517 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -30,7 +30,7 @@ import os import psutil -__version__ = "2024.4" +__version__ = "2024.5" # Get Flash Attention v2 if Ampere (RTX 30xx, A100) major_version, minor_version = torch.cuda.get_device_capability() From 73b941da766a8a9d7c310649562de52cff6dd58f Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 11 May 2024 19:14:40 +1000 Subject: [PATCH 62/69] Update save.py --- unsloth/save.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsloth/save.py b/unsloth/save.py index bebffd9e3..e92caf53a 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1025,6 +1025,8 @@ def save_to_gguf( with subprocess.Popen(command, shell = True, stderr = subprocess.PIPE, bufsize = 1) as sp: for line in sp.stderr: print(line.decode("utf-8", errors = "replace"), flush = True, end = "") + for line in sp.stdout: + print(line.decode("utf-8", errors = "replace"), flush = True, end = "") if sp.returncode is not None and sp.returncode != 0: raise subprocess.CalledProcessError(sp.returncode, sp.args) pass From 7d502d77483a681ddea41c46cdbd47730b1e33a6 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 11 May 2024 19:18:41 +1000 Subject: [PATCH 63/69] Update save.py --- unsloth/save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index e92caf53a..d3421bdfa 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1077,7 +1077,7 @@ def unsloth_save_pretrained_merged( save_peft_format : bool = True, tags : List[str] = None, temporary_location : str = "_unsloth_temporary_saved_buffers", - maximum_memory_usage : float = 0.85, + maximum_memory_usage : float = 0.75, ): """ Same as .save_pretrained(...) 
except 4bit weights are auto From d1d47b3b47b8a3c648dc8a1b6ca16d301eeee57c Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sat, 11 May 2024 19:18:53 +1000 Subject: [PATCH 64/69] Update save.py --- unsloth/save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/save.py b/unsloth/save.py index d3421bdfa..92fcb2347 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -1120,7 +1120,7 @@ def unsloth_push_to_hub_merged( commit_description : str = "Upload model trained with Unsloth 2x faster", tags : Optional[List[str]] = None, temporary_location : str = "_unsloth_temporary_saved_buffers", - maximum_memory_usage : float = 0.85, + maximum_memory_usage : float = 0.75, ): """ Same as .push_to_hub(...) except 4bit weights are auto From f16d7d7ae253bac7b8642b5a8796e3059966e206 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 12 May 2024 04:47:45 +1000 Subject: [PATCH 65/69] test_hf_gguf_equivalence --- unsloth/chat_templates.py | 71 +++++++++++++++++++++++++++++++++++++++ unsloth/save.py | 12 +++---- 2 files changed, 75 insertions(+), 8 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 07999ea0d..7d6777bae 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -15,6 +15,7 @@ __all__ = [ "get_chat_template", "test_chat_templates", + "test_hf_gguf_equivalence", ] from transformers import StoppingCriteria, StoppingCriteriaList @@ -618,3 +619,73 @@ def test_chat_templates(): our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) assert(correct_prompt == our_prompt) pass + + +def test_hf_gguf_equivalence(tokenizer): + """ + Carefully checks the output of GGUF's tokenization and HF. + Can catch all tokenization bugs. + """ + import subprocess + import re + messages = [ + {"role": "user", "content": "What is 2+2?"}, + {"role": "assistant", "content": "It's 4."}, + {"role": "user", "content": " But 2+2 is equal to 5. "}, + {"role": "assistant", "content": "No I'm sure its 4."}, + {"role": "user", "content": " No it's 100% 5! "}, + ] + + prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. + + ### Instruction: + {} + + ### Input: + {} + + ### Response: + {}""".format( + "Describe the city given eloquently.", # instruction + "The lost city of Atlantis.", # input + "", # output - leave this blank for generation! 
+ ) + prompts = [ prompt, ] + + if tokenizer.chat_template is not None: + prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + prompt = prompt.replace("'", "") # Subprocess does not like '' + prompts.append(prompts) + pass + + for prompt in prompts: + command = "./llama.cpp/main -m ./model-unsloth.F16.gguf -n 0 --temp 0.0 --verbose-prompt "\ + f"--check-tensors -p '{prompt}'" + + datas = [] + with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: + for line in sp.stdout: + datas.append(line.decode("utf-8", errors = "replace")) + pass + gguf_tokens = "".join(datas) + + # Now extract GGUF tokenization attempt + gguf_tokenized = re.findall("([\d]{1,}) \-\> \'([^\']{1,})\'", gguf_tokens, flags = re.MULTILINE) + gguf_tokenized = [(int(x[0]), x[1],) for x in gguf_tokenized] + input_ids = tokenizer(prompt).input_ids + tokens = tokenizer.batch_decode(input_ids) + hf_tokenized = list(zip(input_ids, tokens)) + + # Compare to Huggingface + for j, (hf_token, gguf_token) in enumerate(zip(hf_tokenized, gguf_tokenized)): + if (hf_token != gguf_token): + print("Failed GGUF != HF at", j) + print("HF =", hf_token) + print("GGUF =", gguf_token) + print(hf_tokenized[:j+1]) + print(gguf_tokenized[:j+1]) + raise RuntimeError("Failed comparing GGUF to HF.") + pass + pass + return True +pass diff --git a/unsloth/save.py b/unsloth/save.py index 92fcb2347..d3db1f906 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -774,7 +774,7 @@ def install_llama_cpp_old(version = -10): f"make all -j{psutil.cpu_count()*2} -C llama.cpp", ] for command in commands: - with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, bufsize = 1) as sp: + with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: for line in sp.stdout: print(line.decode("utf-8", errors = "replace"), flush = True, end = "") pass @@ -806,7 +806,7 @@ def install_llama_cpp_blocking(use_cuda = True): if os.path.exists("llama.cpp"): return for command in commands: - with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, bufsize = 1) as sp: + with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: for line in sp.stdout: print(line.decode("utf-8", errors = "replace"), flush = True, end = "") pass @@ -979,9 +979,7 @@ def save_to_gguf( f"--outtype {first_conversion}" pass - with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE, bufsize = 1) as sp: - for line in sp.stderr: - print(line.decode("utf-8", errors = "replace"), flush = True, end = "") + with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: for line in sp.stdout: print(line.decode("utf-8", errors = "replace"), flush = True, end = "") if sp.returncode is not None and sp.returncode != 0: @@ -1022,9 +1020,7 @@ def save_to_gguf( f"{final_location} {quantization_method} {n_cpus}" # quantize uses stderr - with subprocess.Popen(command, shell = True, stderr = subprocess.PIPE, bufsize = 1) as sp: - for line in sp.stderr: - print(line.decode("utf-8", errors = "replace"), flush = True, end = "") + with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp: for line in sp.stdout: print(line.decode("utf-8", errors = "replace"), flush = True, end = "") if sp.returncode is not None and 
sp.returncode != 0: From 01284f473067265adce48ef8490cd6127c40c717 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 12 May 2024 17:56:32 +1000 Subject: [PATCH 66/69] Update chat_templates.py --- unsloth/chat_templates.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 7d6777bae..9101df67d 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -621,7 +621,7 @@ def test_chat_templates(): pass -def test_hf_gguf_equivalence(tokenizer): +def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf"): """ Carefully checks the output of GGUF's tokenization and HF. Can catch all tokenization bugs. @@ -659,7 +659,7 @@ def test_hf_gguf_equivalence(tokenizer): pass for prompt in prompts: - command = "./llama.cpp/main -m ./model-unsloth.F16.gguf -n 0 --temp 0.0 --verbose-prompt "\ + command = f"./llama.cpp/main -m {gguf_model} -n 0 --temp 0.0 --verbose-prompt "\ f"--check-tensors -p '{prompt}'" datas = [] @@ -684,6 +684,7 @@ def test_hf_gguf_equivalence(tokenizer): print("GGUF =", gguf_token) print(hf_tokenized[:j+1]) print(gguf_tokenized[:j+1]) + print(gguf_tokens) raise RuntimeError("Failed comparing GGUF to HF.") pass pass From 4f1e6fbd10d5561f7e2cd5de98128680b2a3a808 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Sun, 12 May 2024 18:27:17 +1000 Subject: [PATCH 67/69] Update chat_templates.py --- unsloth/chat_templates.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 9101df67d..5033c1db9 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -675,10 +675,11 @@ def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf") input_ids = tokenizer(prompt).input_ids tokens = tokenizer.batch_decode(input_ids) hf_tokenized = list(zip(input_ids, tokens)) + print(gguf_tokenized[:5]) # Compare to Huggingface for j, (hf_token, gguf_token) in enumerate(zip(hf_tokenized, gguf_tokenized)): - if (hf_token != gguf_token): + if (hf_token[0] != gguf_token[0]): print("Failed GGUF != HF at", j) print("HF =", hf_token) print("GGUF =", gguf_token) From 36cfcf4165d85d8dd0c7fccdb58ec7ee270d05d4 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 13 May 2024 04:58:36 +1000 Subject: [PATCH 68/69] --pad-vocab --- unsloth/chat_templates.py | 15 ++++----- unsloth/save.py | 5 ++- unsloth/tokenizer_utils.py | 66 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 9 deletions(-) diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py index 5033c1db9..3af4c4e9a 100644 --- a/unsloth/chat_templates.py +++ b/unsloth/chat_templates.py @@ -271,12 +271,11 @@ phi3_template = \ "{{ bos_token }}"\ "{% for message in messages %}"\ - "{% if (message['role'] == 'user') %}"\ - "{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}"\ - "{% elif (message['role'] == 'assistant') %}"\ - "{{message['content'] + '<|end|>' + '\n'}}"\ - "{% endif %}"\ - "{% endfor %}" + "{{'<|' + message['role'] + '|>\n' + message['content'] + '<|end|>\n'}}"\ + "{% endfor %}"\ + "{% if add_generation_prompt %}"\ + "{{ '<|assistant|>\n' }}"\ + "{% endif %}" phi3_template_eos_token = "<|end|>" CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token,) @@ -614,9 +613,9 @@ def test_chat_templates(): # Phi-3 template = phi3_template correct_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") - correct_prompt = 
correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + correct_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True) correct_tokenizer.chat_template = template - our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True) + our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True) assert(correct_prompt == our_prompt) pass diff --git a/unsloth/save.py b/unsloth/save.py index d3db1f906..39b18d0dd 100644 --- a/unsloth/save.py +++ b/unsloth/save.py @@ -27,6 +27,7 @@ import psutil import re from transformers.models.llama.modeling_llama import logger +from .tokenizer_utils import fix_sentencepiece_gguf __all__ = [ "print_quantization_methods", @@ -962,6 +963,8 @@ def save_to_gguf( # We first check if tokenizer.model exists in the model_directory if os.path.exists(f"{model_directory}/tokenizer.model"): vocab_type = "spm,hfft,bpe" + # Fix Sentencepiece model as well! + fix_sentencepiece_gguf(model_directory) else: vocab_type = "bpe" pass @@ -969,7 +972,7 @@ def save_to_gguf( if use_fast_convert: command = f"python llama.cpp/convert.py {model_directory} "\ f"--outfile {final_location} --vocab-type {vocab_type} "\ - f"--outtype {first_conversion} --concurrency {n_cpus}" + f"--outtype {first_conversion} --concurrency {n_cpus} --pad-vocab" else: # Need to fix convert-hf-to-gguf.py for some models! # _fix_gemma_gguf() diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 0d6dadf7d..1ca844b98 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -26,6 +26,7 @@ "fix_sentencepiece_tokenizer", "check_tokenizer", "add_new_tokens", + "fix_sentencepiece_gguf", ] @@ -267,6 +268,71 @@ def fix_sentencepiece_tokenizer( pass +def fix_sentencepiece_gguf(saved_location): + """ + Fixes sentencepiece tokenizers which did not extend the vocabulary with + user defined tokens. 
+ Inspiration from https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py + """ + import numpy as np + from copy import deepcopy + from transformers.utils import sentencepiece_model_pb2 + import json + from enum import IntEnum + import os + + class SentencePieceTokenTypes(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + pass + + # Load tokenizer.model + tokenizer_file = sentencepiece_model_pb2.ModelProto() + if not os.path.isfile(f"{saved_location}/tokenizer.model"): return + tokenizer_file.ParseFromString(open(f"{saved_location}/tokenizer.model", "rb").read()) + sentence_piece_size = len(tokenizer_file.pieces) + + # Load added_tokens_json + if not os.path.isfile(f"{saved_location}/added_tokens.json"): return + with open(f"{tokenizer_path}/added_tokens.json", "r", encoding = "utf-8") as file: + added_tokens_json = json.load(file) + pass + if len(added_tokens_json) == 0: return + + added_tokens_json = dict(sorted(added_tokens_json.items(), key = lambda item: item[1])) + + # Confirm added_tokens_json is correct + added_tokens_ids = np.array(list(added_tokens_json.values())) + diff = np.diff(added_tokens_ids) + if (diff.min() != 1 or diff.max() != 1): return + if (added_tokens_ids.min() != sentence_piece_size): return + + # Edit sentence piece tokens with added_tokens_json + logger.warning("Unsloth: Extending tokenizer.model with added_tokens.json!") + new_tokens = deepcopy(tokenizer_file.pieces[-len(added_tokens_ids):]) + for new_token, added_token in zip(new_tokens, added_tokens_json.keys()): + new_token.piece = added_token.encode("utf-8") + new_token.score = -1000.0 + new_token.type = SentencePieceTokenTypes.USER_DEFINED + pass + + tokenizer_file.pieces.extend(new_tokens) + + with open(f"{saved_location}/tokenizer.model", "wb") as file: + file.write(tokenizer_file.SerializeToString()) + pass + + # Add padding tokens + # actual_vocab_size = model.config.vocab_size + # padding = actual_vocab_size - len(tokenizer_file.pieces) + return +pass + + def load_correct_tokenizer( tokenizer_name, model_max_length = None, From 6b2ee164e018fe90fdb07a6a01c86ce2aedb8ad9 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Mon, 13 May 2024 05:13:28 +1000 Subject: [PATCH 69/69] Update tokenizer_utils.py --- unsloth/tokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 1ca844b98..87cba843d 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -298,7 +298,7 @@ class SentencePieceTokenTypes(IntEnum): # Load added_tokens_json if not os.path.isfile(f"{saved_location}/added_tokens.json"): return - with open(f"{tokenizer_path}/added_tokens.json", "r", encoding = "utf-8") as file: + with open(f"{saved_location}/added_tokens.json", "r", encoding = "utf-8") as file: added_tokens_json = json.load(file) pass if len(added_tokens_json) == 0: return
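
The tokenizer checks introduced in patches 65-69 can be exercised end-to-end once a GGUF export exists. The sketch below is illustrative only and not part of the patch series: it assumes `./llama.cpp/main` has already been built and `./model-unsloth.F16.gguf` was produced by a prior `save_to_gguf` call (both are the defaults used above), that the snippet runs in the same environment used for fine-tuning, and that the Phi-3 model name is just an example tokenizer to compare against.

```python
# Minimal sketch: confirm llama.cpp and the Hugging Face tokenizer agree on
# token ids after a GGUF export. Assumes ./llama.cpp/main exists and
# ./model-unsloth.F16.gguf was written by save_to_gguf; model name is an example.
from transformers import AutoTokenizer
from unsloth.chat_templates import test_hf_gguf_equivalence

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# Runs llama.cpp with --verbose-prompt on a plain prompt and (if a chat template
# is set) a chat-formatted prompt, then compares token ids one by one.
# Raises RuntimeError at the first mismatching token id; returns True otherwise.
assert test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf")
```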