
May 2024 Prelim #447

Merged: 76 commits, May 12, 2024
Commits
64a7d27
Fix prompt
danielhanchen Apr 19, 2024
495f1da
Merge branch 'main' into nightly
danielhanchen Apr 20, 2024
656ab22
Update chat_templates.py
danielhanchen Apr 20, 2024
c4f2f54
fix_untrained_tokens
danielhanchen Apr 20, 2024
87b4bb9
Update llama.py
danielhanchen Apr 21, 2024
abd192f
add tokens
danielhanchen Apr 21, 2024
868351b
Update _utils.py
danielhanchen Apr 21, 2024
f29a3e7
Update tokenizer_utils.py
danielhanchen Apr 21, 2024
2573474
Update llama.py
danielhanchen Apr 21, 2024
bfb32a3
Update llama.py
danielhanchen Apr 21, 2024
40a6d00
Update llama.py
danielhanchen Apr 21, 2024
140a0b0
Update llama.py
danielhanchen Apr 21, 2024
88435a8
pad_token
danielhanchen Apr 21, 2024
24790e2
Update chat_templates.py
danielhanchen Apr 21, 2024
1464f7d
Update chat_templates.py
danielhanchen Apr 21, 2024
df069c5
tokenizer
danielhanchen Apr 21, 2024
eb00fb7
Update save.py
danielhanchen Apr 21, 2024
805f890
Update chat_templates.py
danielhanchen Apr 21, 2024
80be6ff
Update chat_templates.py
danielhanchen Apr 21, 2024
92723ba
Merge branch 'main' into nightly
danielhanchen Apr 22, 2024
2e62a69
patch tokenizer padding
danielhanchen Apr 22, 2024
b0678d6
Update tokenizer_utils.py
danielhanchen Apr 22, 2024
f85ef9c
Update save.py
danielhanchen Apr 23, 2024
d2f10a0
Fix: loading models with resized vocabulary (#377)
oKatanaaa Apr 24, 2024
f5fa654
GGUF fix
danielhanchen Apr 28, 2024
8325e05
Readme (#390)
danielhanchen Apr 28, 2024
13b1ae6
Update README.md
danielhanchen Apr 28, 2024
5069a7d
Delete .gitignore
danielhanchen Apr 28, 2024
1ba3379
Merge branch 'main' into nightly
danielhanchen Apr 29, 2024
7c9c3f5
Phi-3
danielhanchen Apr 29, 2024
7b696ee
Update README.md
danielhanchen Apr 29, 2024
48334f7
Update README.md
danielhanchen Apr 29, 2024
3665c0b
Update README.md
danielhanchen Apr 29, 2024
0f9e073
Update README.md
danielhanchen Apr 29, 2024
eb135d8
Update README.md
danielhanchen Apr 29, 2024
56e2674
Update README.md
danielhanchen Apr 29, 2024
b091a0b
Update README.md
danielhanchen Apr 29, 2024
18533ab
Update README.md
danielhanchen Apr 29, 2024
3e84338
Update README.md
danielhanchen Apr 29, 2024
d8feef5
Update README.md
danielhanchen Apr 29, 2024
392c034
Update README.md
danielhanchen Apr 29, 2024
df6fb52
Update README.md
danielhanchen Apr 29, 2024
99ed47a
Update README.md
danielhanchen Apr 29, 2024
7fae556
Update README.md
danielhanchen Apr 29, 2024
000d050
Update README.md
danielhanchen Apr 29, 2024
27f88f0
Update README.md
danielhanchen Apr 29, 2024
affbba1
Update README.md
danielhanchen Apr 29, 2024
14f104a
Update README.md
danielhanchen Apr 29, 2024
e040d18
Fix reserved tokens
danielhanchen May 4, 2024
fb10081
Merge branch 'main' into nightly
danielhanchen May 4, 2024
f53944a
Update save.py
danielhanchen May 4, 2024
70b41d1
Update tokenizer_utils.py
danielhanchen May 4, 2024
1b1b931
Update tokenizer_utils.py
danielhanchen May 4, 2024
61edc3c
Update tokenizer_utils.py
danielhanchen May 4, 2024
73df3ee
Update tokenizer_utils.py
danielhanchen May 4, 2024
15d7898
Update tokenizer_utils.py
danielhanchen May 4, 2024
84418a9
Merge branch 'main' into nightly
danielhanchen May 5, 2024
76ed0a4
Update chat_templates.py
danielhanchen May 6, 2024
dfec8dd
Update save.py
danielhanchen May 7, 2024
73af5d1
Update _utils.py
danielhanchen May 7, 2024
9c7d9a7
Update chat_templates.py
danielhanchen May 7, 2024
f1f8db3
Merge branch 'main' into nightly
danielhanchen May 8, 2024
7c53652
Adds dependencies and extras for torch 2.3.0 with new xformers versio…
nathan-az May 10, 2024
cf83fe3
Support Qwen2 (#428)
yangjianxin1 May 10, 2024
f7dab30
Update save.py
danielhanchen May 10, 2024
10e01f4
Merge branch 'nightly' of https://github.com/unslothai/unsloth into n…
danielhanchen May 10, 2024
6c9fcc9
Update save.py
danielhanchen May 10, 2024
f1350ca
Update _utils.py
danielhanchen May 10, 2024
73b941d
Update save.py
danielhanchen May 11, 2024
7d502d7
Update save.py
danielhanchen May 11, 2024
d1d47b3
Update save.py
danielhanchen May 11, 2024
f16d7d7
test_hf_gguf_equivalence
danielhanchen May 11, 2024
01284f4
Update chat_templates.py
danielhanchen May 12, 2024
4f1e6fb
Update chat_templates.py
danielhanchen May 12, 2024
36cfcf4
--pad-vocab
danielhanchen May 12, 2024
6b2ee16
Update tokenizer_utils.py
danielhanchen May 12, 2024
10 changes: 9 additions & 1 deletion README.md
@@ -36,6 +36,7 @@ All notebooks are **beginner friendly**! Add your dataset, click "Run All", and
- This [text completion notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing) is for continued pretraining / raw text.

## 🦥 Unsloth.ai News
- 📣 NEW! Qwen1.5-7B, Qwen1.5-14B, Qwen1.5-32B, Qwen1.5-72B now work, courtesy of Firefly's PR [#428](https://github.com/unslothai/unsloth/pull/428)
- 📣 NEW! [Llama-3 8b](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing) now works! Llama-3 70b also works (change the model name in the notebook).
- 📣 NEW! [ORPO support](https://colab.research.google.com/drive/11t4njE3c4Lxl-07OD8lJSMKkfyJml3Tn?usp=sharing) is here!
- 📣 NEW! [Phi-3 3.8b support](https://colab.research.google.com/drive/1NvkBmkHfucGO3Ve9s1NKZvMNlw5p83ym?usp=sharing) is here!
@@ -159,7 +160,14 @@ pip install --no-deps packaging ninja einops flash-attn xformers trl peft accele
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
pip install --no-deps xformers trl peft accelerate bitsandbytes
```
7. To troubleshoot installs try the below (all must succeed). Xformers should mostly all be available.
7. For PyTorch 2.3.0, use the `"ampere"` path for newer RTX 30xx GPUs or higher.
```bash
pip install "unsloth[cu118-torch230] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu118-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git"
```
8. To troubleshoot installs, try the below (all must succeed). Xformers should mostly all be available.
```bash
nvcc
python -m xformers.info
37 changes: 37 additions & 0 deletions pyproject.toml
@@ -86,6 +86,17 @@ cu121onlytorch220 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu118onlytorch230 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.26.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
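# Note: %2B in the cu118 wheel URLs above is the URL-encoded "+" of "0.0.26.post1+cu118".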
cu121onlytorch230 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]

cu118 = [
"unsloth[huggingface]",
"bitsandbytes",
@@ -126,6 +137,16 @@ cu121-torch220 = [
"bitsandbytes",
"unsloth[cu121onlytorch220]",
]
cu118-torch230 = [
"unsloth[huggingface]",
"bitsandbytes",
"unsloth[cu118onlytorch230]",
]
cu121-torch230 = [
"unsloth[huggingface]",
"bitsandbytes",
"unsloth[cu121onlytorch230]",
]
kaggle = [
"unsloth[huggingface]",
]
@@ -238,6 +259,22 @@ cu121-ampere-torch220 = [
"ninja",
"flash-attn",
]
cu118-ampere-torch230 = [
"unsloth[huggingface]",
"bitsandbytes",
"unsloth[cu118onlytorch230]",
"packaging",
"ninja",
"flash-attn",
]
cu121-ampere-torch230 = [
"unsloth[huggingface]",
"bitsandbytes",
"unsloth[cu121onlytorch230]",
"packaging",
"ninja",
"flash-attn",
]

[project.urls]
homepage = "http://www.unsloth.ai"
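The new torch 2.3.0 extras above mirror the install commands added to the README. As a rough, hypothetical sketch (not part of this PR), the snippet below prints a matching pip command from the local torch build; it assumes a CUDA build of torch and that the extra names follow the `cu11x[-ampere][-torch230]` pattern defined in pyproject.toml.

```python
# Hypothetical helper: print the pip command for the extra matching this machine.
# Assumes a CUDA build of torch; the extra-name pattern is assumed from pyproject.toml.
import torch

cuda = "cu121" if torch.version.cuda.startswith("12") else "cu118"
major, _ = torch.cuda.get_device_capability()
ampere = "-ampere" if major >= 8 else ""                      # RTX 30xx / A100 or newer
torch_tag = "-torch230" if torch.__version__.startswith("2.3") else ""
print(f'pip install "unsloth[{cuda}{ampere}{torch_tag}] @ git+https://github.com/unslothai/unsloth.git"')
```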
88 changes: 80 additions & 8 deletions unsloth/chat_templates.py
@@ -15,6 +15,7 @@
__all__ = [
"get_chat_template",
"test_chat_templates",
"test_hf_gguf_equivalence",
]

from transformers import StoppingCriteria, StoppingCriteriaList
@@ -270,12 +271,11 @@
phi3_template = \
"{{ bos_token }}"\
"{% for message in messages %}"\
"{% if (message['role'] == 'user') %}"\
"{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}"\
"{% elif (message['role'] == 'assistant') %}"\
"{{message['content'] + '<|end|>' + '\n'}}"\
"{% endif %}"\
"{% endfor %}"
"{{'<|' + message['role'] + '|>\n' + message['content'] + '<|end|>\n'}}"\
"{% endfor %}"\
"{% if add_generation_prompt %}"\
"{{ '<|assistant|>\n' }}"\
"{% endif %}"
phi3_template_eos_token = "<|end|>"
CHAT_TEMPLATES["phi-3"] = (phi3_template, phi3_template_eos_token,)

@@ -613,8 +613,80 @@ def test_chat_templates():
# Phi-3
template = phi3_template
correct_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
correct_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
correct_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
correct_tokenizer.chat_template = template
our_prompt = correct_tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
our_prompt = correct_tokenizer.apply_chat_template(messages[1:], tokenize = False, add_generation_prompt = True)
assert(correct_prompt == our_prompt)
pass


def test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf"):
"""
Carefully checks the output of GGUF's tokenization and HF.
Can catch all tokenization bugs.
"""
import subprocess
import re
messages = [
{"role": "user", "content": "What is 2+2?"},
{"role": "assistant", "content": "It's 4."},
{"role": "user", "content": " But 2+2 is equal to 5. "},
{"role": "assistant", "content": "No I'm sure its 4."},
{"role": "user", "content": " No it's 100% 5! "},
]

prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}""".format(
"Describe the city given eloquently.", # instruction
"The lost city of Atlantis.", # input
"", # output - leave this blank for generation!
)
prompts = [ prompt, ]

if tokenizer.chat_template is not None:
prompt = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
prompt = prompt.replace("'", "") # Subprocess does not like ''
prompts.append(prompt)
pass

for prompt in prompts:
command = f"./llama.cpp/main -m {gguf_model} -n 0 --temp 0.0 --verbose-prompt "\
f"--check-tensors -p '{prompt}'"

datas = []
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT, bufsize = 1) as sp:
for line in sp.stdout:
datas.append(line.decode("utf-8", errors = "replace"))
pass
gguf_tokens = "".join(datas)

# Now extract GGUF tokenization attempt
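# --verbose-prompt prints each prompt token roughly as "   450 -> ' What'"; capture (id, piece) pairs.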
gguf_tokenized = re.findall("([\d]{1,}) \-\> \'([^\']{1,})\'", gguf_tokens, flags = re.MULTILINE)
gguf_tokenized = [(int(x[0]), x[1],) for x in gguf_tokenized]
input_ids = tokenizer(prompt).input_ids
tokens = tokenizer.batch_decode(input_ids)
hf_tokenized = list(zip(input_ids, tokens))
print(gguf_tokenized[:5])

# Compare to Huggingface
for j, (hf_token, gguf_token) in enumerate(zip(hf_tokenized, gguf_tokenized)):
if (hf_token[0] != gguf_token[0]):
print("Failed GGUF != HF at", j)
print("HF =", hf_token)
print("GGUF =", gguf_token)
print(hf_tokenized[:j+1])
print(gguf_tokenized[:j+1])
print(gguf_tokens)
raise RuntimeError("Failed comparing GGUF to HF.")
pass
pass
return True
pass
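A hedged usage sketch for the new check (not part of the diff): it assumes save_pretrained_gguf already produced ./model-unsloth.F16.gguf for the same model as the tokenizer, and that llama.cpp's main binary has been built at ./llama.cpp/main, as the command string above expects.

```python
# Sketch: verify llama.cpp (GGUF) and Hugging Face tokenize the same prompts identically.
from transformers import AutoTokenizer
from unsloth.chat_templates import test_hf_gguf_equivalence

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")  # must match the exported model
assert test_hf_gguf_equivalence(tokenizer, gguf_model = "./model-unsloth.F16.gguf")
```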
7 changes: 4 additions & 3 deletions unsloth/models/__init__.py
@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .loader import FastLanguageModel
from .llama import FastLlamaModel
from .loader import FastLanguageModel
from .llama import FastLlamaModel
from .mistral import FastMistralModel
from .dpo import PatchDPOTrainer
from .qwen2 import FastQwen2Model
from .dpo import PatchDPOTrainer
2 changes: 1 addition & 1 deletion unsloth/models/_utils.py
@@ -30,7 +30,7 @@
import os
import psutil

__version__ = "2024.4"
__version__ = "2024.5"

# Get Flash Attention v2 if Ampere (RTX 30xx, A100)
major_version, minor_version = torch.cuda.get_device_capability()
1 change: 1 addition & 0 deletions unsloth/models/llama.py
@@ -1605,6 +1605,7 @@ def patch_peft_model(

if model_type == "llama": apply_lora_mlp = apply_lora_mlp_swiglu
elif model_type == "mistral": apply_lora_mlp = apply_lora_mlp_swiglu
elif model_type == "qwen2": apply_lora_mlp = apply_lora_mlp_swiglu
elif model_type == "gemma": apply_lora_mlp = apply_lora_mlp_geglu_approx
else:
raise NotImplementedError(f"Unsloth: {model_type} is not yet implemented!")
3 changes: 3 additions & 0 deletions unsloth/models/loader.py
@@ -14,6 +14,7 @@

from .llama import FastLlamaModel, logger
from .mistral import FastMistralModel
from .qwen2 import FastQwen2Model
from transformers import AutoConfig
from transformers import __version__ as transformers_version
from peft import PeftConfig, PeftModel
@@ -119,6 +120,8 @@ def from_pretrained(
f"to obtain the latest transformers build, then restart this session."\
)
dispatch_model = FastGemmaModel
elif model_type == "qwen2":
dispatch_model = FastQwen2Model
else:
raise NotImplementedError(
f"Unsloth: {model_name} not supported yet!\n"\
2 changes: 1 addition & 1 deletion unsloth/models/mistral.py
@@ -343,7 +343,7 @@ def from_pretrained(
# Mistral does NOT support RoPE Scaling sadly so we have to error out.
if max_seq_length > model_max_seq_length:
raise RuntimeError(
"Unsloth: Unfortunately Mistral type models do not support RoPE scaling!\n"\
f"Unsloth: Unfortunately {model_patcher.__name__[4:-5]} type models do not support RoPE scaling!\n"\
f"The maximum sequence length supported is {model_max_seq_length}.",
)
pass
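A quick illustration of the slicing used in the reworded error above: `model_patcher.__name__[4:-5]` strips the "Fast" prefix and "Model" suffix to recover the architecture name.

```python
# Name slicing used by the generic RoPE-scaling error message.
assert "FastMistralModel"[4:-5] == "Mistral"
assert "FastQwen2Model"[4:-5]   == "Qwen2"
```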
91 changes: 91 additions & 0 deletions unsloth/models/qwen2.py
@@ -0,0 +1,91 @@
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .llama import *
from .mistral import FastMistralModel
import os
from ._utils import __version__

from transformers.models.qwen2.modeling_qwen2 import (
Qwen2Attention,
Qwen2DecoderLayer,
Qwen2Model,
Qwen2ForCausalLM,
)
# For Pytorch 2.1.1
try:
from transformers.models.qwen2.modeling_qwen2 import (
Qwen2SdpaAttention,
Qwen2FlashAttention2,
)
except:
Qwen2SdpaAttention = Qwen2Attention
Qwen2FlashAttention2 = Qwen2Attention
pass


class FastQwen2Model(FastLlamaModel):

@staticmethod
def pre_patch():
Qwen2Attention .forward = LlamaAttention_fast_forward
Qwen2SdpaAttention .forward = LlamaAttention_fast_forward
Qwen2FlashAttention2.forward = LlamaAttention_fast_forward
Qwen2DecoderLayer .forward = LlamaDecoderLayer_fast_forward
Qwen2Model .forward = LlamaModel_fast_forward
Qwen2ForCausalLM .forward = CausalLM_fast_forward(LlamaModel_fast_forward_inference)
PeftModelForCausalLM.forward = PeftModelForCausalLM_fast_forward

# Solves https://github.com/unslothai/unsloth/issues/168
# Static KV Cache was introduced in 4.38.0, causing training to be much slower.
# Inference can now be CUDAGraphed, but we shall retain the old rotary embeddings.
# https://github.com/huggingface/transformers/pull/27931
# https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llama/modeling_llama.py
import transformers.models.qwen2.modeling_qwen2
transformers.models.qwen2.modeling_qwen2.Qwen2RotaryEmbedding = LlamaRotaryEmbedding
return
pass


@staticmethod
def from_pretrained(
model_name = "Qwen/Qwen1.5-7B",
max_seq_length = 4096,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None, # Qwen2 does not support RoPE scaling
fix_tokenizer = True,
model_patcher = None,
tokenizer_name = None,
trust_remote_code = False,
**kwargs,
):
return FastMistralModel.from_pretrained(
model_name = model_name,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
model_patcher = FastQwen2Model,
tokenizer_name = tokenizer_name,
trust_remote_code = trust_remote_code,
**kwargs,
)
pass
pass
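As a usage sketch (not part of the diff), loading any Qwen1.5 checkpoint now routes through FastQwen2Model via the loader.py dispatch above; this assumes the Hub weights are accessible:

```python
# Minimal sketch: model_type == "qwen2" dispatches to FastQwen2Model.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "Qwen/Qwen1.5-7B",   # any Qwen1.5 size from the README news entry
    max_seq_length = 4096,
    load_in_4bit   = True,
)
```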