From 4448918b96bcd8b9d145f02d4e36e509717df7b9 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Tue, 19 Nov 2024 21:44:33 -0500 Subject: [PATCH 01/18] qwen2.5: added config + special attn_bias --- litgpt/config.py | 160 +++++++++++++++++++++++++++++++++++++++++++++++ litgpt/model.py | 2 +- 2 files changed, 161 insertions(+), 1 deletion(-) diff --git a/litgpt/config.py b/litgpt/config.py index b218df849c..332d25733b 100644 --- a/litgpt/config.py +++ b/litgpt/config.py @@ -31,6 +31,7 @@ class Config: parallel_residual: bool = True bias: bool = True lm_head_bias: bool = False + attn_bias: bool = False # to use multi-head attention (MHA), set this to `n_head` (default) # to use multi-query attention (MQA), set this to 1 # to use grouped-query attention (GQA), set this to a value in between @@ -1704,4 +1705,163 @@ def norm_class(self) -> Type: configs.extend(llama_2_function_calling) +########## +# Qwen2.5 +########## +qwen_2_5 = [ + # https://huggingface.co/Qwen/Qwen2.5-0.5B/blob/main/config.json + dict( + name="Qwen2.5-0.5B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-0.5B{}"), + block_size=32768, + vocab_size=151936, + padded_vocab_size=151936, + n_layer=24, + n_head=14, + n_embd=896, + n_query_groups=2, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=4864, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-1.5B/blob/main/config.json + dict( + name="Qwen2.5-1.5B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-1.5B{}"), + block_size=131072, + vocab_size=151936, + padded_vocab_size=151936, + n_layer=28, + n_head=12, + n_embd=1536, + n_query_groups=2, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=8960, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-3B/blob/main/config.json + dict( + name="Qwen2.5-3B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-3B{}"), + block_size=32768, + vocab_size=151936, + padded_vocab_size=151936, + n_layer=36, + n_head=16, + n_embd=2048, + n_query_groups=2, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=11008, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-7B/blob/main/config.json + dict( + name="Qwen2.5-7B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-7B{}"), + block_size=131072, + vocab_size=152064, + padded_vocab_size=152064, + n_layer=28, + n_head=28, + n_embd=3584, + n_query_groups=4, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=18944, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-14B/blob/main/config.json + dict( + name="Qwen2.5-14B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-14B{}"), + block_size=131072, + vocab_size=152064, + padded_vocab_size=152064, + n_layer=48, + n_head=40, + n_embd=5120, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=13824, + norm_eps=1e-5, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-32B/blob/main/config.json + dict( + name="Qwen2.5-32B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-32B{}"), + 
block_size=131072, + vocab_size=152064, + padded_vocab_size=152064, + n_layer=64, + n_head=40, + n_embd=5120, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=27648, + norm_eps=1e-5, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-72B/blob/main/config.json + dict( + name="Qwen2.5-72B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-72B{}"), + block_size=131072, + vocab_size=152064, + padded_vocab_size=152064, + n_layer=80, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=29568, + norm_eps=1e-5, + rope_base=1000000 + ), +] +for c in qwen_2_5: + for kind in ("", "-Instruct"): + copy = deepcopy(c) + copy["name"] = c["name"].format(kind) + copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + configs.append(copy) + name_to_config = {config["name"]: config for config in configs} diff --git a/litgpt/model.py b/litgpt/model.py index b60b0506b6..17b3b4ab04 100644 --- a/litgpt/model.py +++ b/litgpt/model.py @@ -244,7 +244,7 @@ def __init__(self, config: Config, block_idx: int) -> None: super().__init__() shape = (config.n_head + 2 * config.n_query_groups) * config.head_size # key, query, value projections for all heads, but in a batch - self.attn = nn.Linear(config.n_embd, shape, bias=config.bias) + self.attn = nn.Linear(config.n_embd, shape, bias=config.bias or config.attn_bias) # output projection # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head` self.proj = nn.Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias) From f16944f912c06e6820a19422ad5a8adbe7d6b613 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Wed, 20 Nov 2024 13:09:48 -0500 Subject: [PATCH 02/18] Qwen2.5: convert checkpoints scripts --- litgpt/scripts/convert_hf_checkpoint.py | 79 ++++++++++++++++++++++++ litgpt/scripts/convert_lit_checkpoint.py | 45 ++++++++++++++ 2 files changed, 124 insertions(+) diff --git a/litgpt/scripts/convert_hf_checkpoint.py b/litgpt/scripts/convert_hf_checkpoint.py index 0fbb99bd49..a38ac174a9 100644 --- a/litgpt/scripts/convert_hf_checkpoint.py +++ b/litgpt/scripts/convert_hf_checkpoint.py @@ -407,6 +407,81 @@ def copy_weights_phi( if progress_per_file is not None: pbar.update(progress_per_file) +def copy_weights_qwen_2_5( + config: Config, + qkv_weights: Dict[int, List[Optional[NotYetLoadedTensor]]], + state_dict: Dict[str, torch.Tensor], + hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + saver: Optional[incremental_save] = None, + dtype: Optional[torch.dtype] = None, + pbar: Optional[tqdm] = None, + progress_per_file: Optional[float] = None, + debug_mode: Optional[bool] = False +) -> None: + weight_map = { + "model.embed_tokens.weight": "transformer.wte.weight", + "model.layers.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight", + "model.layers.{}.self_attn.q_proj.weight": None, + "model.layers.{}.self_attn.k_proj.weight": None, + "model.layers.{}.self_attn.v_proj.weight": None, + "model.layers.{}.self_attn.q_proj.bias": None, + "model.layers.{}.self_attn.k_proj.bias": None, + "model.layers.{}.self_attn.v_proj.bias": None, + "model.layers.{}.self_attn.o_proj.weight": "transformer.h.{}.attn.proj.weight", + "model.layers.{}.post_attention_layernorm.weight": 
"transformer.h.{}.norm_2.weight", + "model.layers.{}.mlp.gate_proj.weight": "transformer.h.{}.mlp.fc_1.weight", + "model.layers.{}.mlp.up_proj.weight": "transformer.h.{}.mlp.fc_2.weight", + "model.layers.{}.mlp.down_proj.weight": "transformer.h.{}.mlp.proj.weight", + "model.norm.weight": "transformer.ln_f.weight", + "lm_head.weight": "lm_head.weight", + } + + if progress_per_file is not None: + progress_per_file = progress_per_file / max(1, len(hf_weights) + len(qkv_weights)) + + for name, param in hf_weights.items(): + if "model.layers" in name: + from_name, l = layer_template(name, 2) + qkv = qkv_weights.setdefault(l, defaultdict(dict)) + if any(w in from_name for w in ("q_proj", "k_proj", "v_proj")): + weight_name, weight_type = from_name.split(".")[-2:] + qkv[weight_type][weight_name] = param + to_name = weight_map[from_name] + if to_name is None: + continue + to_name = to_name.format(l) + else: + to_name = weight_map[name] + param = load_param(param, name, dtype, verbose=debug_mode) + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + + if progress_per_file is not None: + pbar.update(progress_per_file) + + if "lm_head.weight" not in state_dict: + state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"] + + for i in list(qkv_weights): + for weight_type in list(qkv_weights[i]): + qkv = qkv_weights[i][weight_type] + if len(qkv) != 3: + # split across different .bin files + continue + q = load_param(qkv["q_proj"], f"layer {i} q {weight_type}", dtype, verbose=debug_mode) + k = load_param(qkv["k_proj"], f"layer {i} k {weight_type}", dtype, verbose=debug_mode) + v = load_param(qkv["v_proj"], f"layer {i} v {weight_type}", dtype, verbose=debug_mode) + q_per_kv = config.n_head // config.n_query_groups + qs = torch.split(q, config.head_size * q_per_kv) + ks = torch.split(k, config.head_size) + vs = torch.split(v, config.head_size) + cycled = [t for group in zip(qs, ks, vs) for t in group] + qkv = torch.cat(cycled) + state_dict[f"transformer.h.{i}.attn.attn.{weight_type}"] = qkv + del qkv_weights[i][weight_type] + if progress_per_file is not None: + pbar.update(progress_per_file) def qkv_reassemble(param: Union[torch.Tensor, NotYetLoadedTensor], config: Config) -> torch.Tensor: """Reassemble from a normal to an interleaved placement in a QKV matrix. 
@@ -491,6 +566,10 @@ def convert_hf_checkpoint( # holder to reconstitute the split q, k, v qkv_weights = {} copy_fn = partial(copy_weights_hf_llama, config, qkv_weights) + elif model_name.lower().startswith("qwen2.5"): + # holder to reconstitute the split q, k, v + qkv_weights = {} + copy_fn = partial(copy_weights_qwen_2_5, config, qkv_weights) else: copy_fn = copy_weights_gpt_neox diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py index 3fce585538..adc9d85240 100644 --- a/litgpt/scripts/convert_lit_checkpoint.py +++ b/litgpt/scripts/convert_lit_checkpoint.py @@ -298,6 +298,49 @@ def copy_weights_phi( state_dict[layer_name] = weight del gate_up_proj_weights[i] +def copy_weights_qwen_2_5( + config: Config, + state_dict: Dict[str, torch.Tensor], + lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + saver: Optional[incremental_save] = None, +) -> None: + weight_map = { + "transformer.wte.weight": "model.embed_tokens.weight", + "transformer.h.{}.norm_1.weight": "model.layers.{}.input_layernorm.weight", + "transformer.h.{}.norm_2.weight": "model.layers.{}.post_attention_layernorm.weight", + "transformer.h.{}.attn.proj.weight": "model.layers.{}.self_attn.o_proj.weight", + "transformer.h.{}.mlp.fc_1.weight": "model.layers.{}.mlp.gate_proj.weight", + "transformer.h.{}.mlp.fc_2.weight": "model.layers.{}.mlp.up_proj.weight", + "transformer.h.{}.mlp.proj.weight": "model.layers.{}.mlp.down_proj.weight", + "transformer.ln_f.weight": "model.norm.weight", + "lm_head.weight": "lm_head.weight", + } + + for name, param in lit_weights.items(): + if name.endswith((".attn.attn.weight", ".attn.attn.bias")): + from_name, l_idx = layer_template(name, 2) + qkv = load_param(param, name, None) + qp, kp, vp = qkv_split(qkv, config) + + weight_type = name.split(".")[-1] # weight or bias + q = f"model.layers.{l_idx}.self_attn.q_proj.{weight_type}" + k = f"model.layers.{l_idx}.self_attn.k_proj.{weight_type}" + v = f"model.layers.{l_idx}.self_attn.v_proj.{weight_type}" + for to_name, param in zip((q, k, v), (qp, kp, vp)): + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + else: + if "transformer.h" in name: + from_name, l_idx = layer_template(name, 2) + to_name = weight_map[from_name] + to_name = to_name.format(l_idx) + else: + to_name = weight_map[name] + param = load_param(param, name, None) + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param def qkv_split( param: Union[torch.Tensor, NotYetLoadedTensor], config: Config @@ -344,6 +387,8 @@ def convert_lit_checkpoint(checkpoint_dir: Path, output_dir: Path) -> None: elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): untie_weights = "Gemma" in config.name copy_fn = partial(copy_weights_llama, config, untie_weights=untie_weights) + elif config.name.lower().startswith("qwen2.5"): + copy_fn = partial(copy_weights_qwen_2_5, config) else: copy_fn = copy_weights_gpt_neox From 770350700302fcd4f2791a52d18ec4d6198a34ea Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Wed, 20 Nov 2024 13:47:28 -0500 Subject: [PATCH 03/18] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6092db7ad8..be3b222a03 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,7 @@ Every model is written from scratch to maximize performance and remove layers of | Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) | | Platypus | 7B, 13B, 70B | Lee et al. 
| [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) | | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | +| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) | | StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | | StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) | | StableLM Zephyr | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | From 97f6315386751e42735fcd4eb71fbaa2d39fc5b8 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Wed, 20 Nov 2024 13:54:36 -0500 Subject: [PATCH 04/18] Update download_model_weights.md --- tutorials/download_model_weights.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md index 9ab0041357..8b90b9400b 100644 --- a/tutorials/download_model_weights.md +++ b/tutorials/download_model_weights.md @@ -33,6 +33,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights. | Phi 3 & 3.5 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) | | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | +| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) | | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) | | StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | | StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) | @@ -161,6 +162,20 @@ nvidia/Llama-3.1-Nemotron-70B-Instruct-HF openlm-research/open_llama_13b openlm-research/open_llama_3b openlm-research/open_llama_7b +Qwen/Qwen2.5-0.5B +Qwen/Qwen2.5-0.5B-Instruct +Qwen/Qwen2.5-1B +Qwen/Qwen2.5-1B-Instruct +Qwen/Qwen2.5-3B +Qwen/Qwen2.5-3B-Instruct +Qwen/Qwen2.5-7B +Qwen/Qwen2.5-7B-Instruct +Qwen/Qwen2.5-14B +Qwen/Qwen2.5-14B-Instruct +Qwen/Qwen2.5-32B +Qwen/Qwen2.5-32B-Instruct +Qwen/Qwen2.5-72B +Qwen/Qwen2.5-72B-Instruct stabilityai/FreeWilly2 stabilityai/stable-code-3b stabilityai/stablecode-completion-alpha-3b From 1e793701601ad7b7d114e429d0a12595e22a9ff0 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Wed, 20 Nov 2024 14:14:11 -0500 Subject: [PATCH 05/18] Qwen2.5: added prompt template --- litgpt/prompts.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/litgpt/prompts.py b/litgpt/prompts.py index 09fb86676c..cb26d35e6b 100644 --- a/litgpt/prompts.py +++ b/litgpt/prompts.py @@ -279,6 +279,12 @@ def apply(self, prompt: str, **kwargs: str) -> str: class OLMo(PromptStyle): def apply(self, prompt: str, **kwargs: str) -> str: return f"<|endoftext|><|user|>\n{prompt}\n<|assistant|>\n" + + +class Qwen2_5(PromptStyle): + def apply(self, prompt: str, **kwargs: str) -> str: + system_message = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant." 
+ return f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" # Maps prompt style names to PromptStyle classes @@ -304,6 +310,7 @@ def apply(self, prompt: str, **kwargs: str) -> str: "gemma": Gemma, "llama3": Llama3, "olmo": OLMo, + "qwen2.5": Qwen2_5, } @@ -342,6 +349,8 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle: return Gemma() if re.search(r"OLMo.*-hf", model_name): return OLMo() + if re.search(r"Qwen2\.5-(?!Coder)", model_name): + return Qwen2_5() return Default() From 43990411e1bf1cb68c2bd088b6ed21ced879b6f0 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Wed, 20 Nov 2024 14:56:44 -0500 Subject: [PATCH 06/18] Qwen2.5: fix tokenizer vocab size --- litgpt/config.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/litgpt/config.py b/litgpt/config.py index 332d25733b..84511a663a 100644 --- a/litgpt/config.py +++ b/litgpt/config.py @@ -1714,7 +1714,7 @@ def norm_class(self) -> Type: name="Qwen2.5-0.5B{}", hf_config=dict(org="Qwen", name="Qwen2.5-0.5B{}"), block_size=32768, - vocab_size=151936, + vocab_size=151643, padded_vocab_size=151936, n_layer=24, n_head=14, @@ -1735,7 +1735,7 @@ def norm_class(self) -> Type: name="Qwen2.5-1.5B{}", hf_config=dict(org="Qwen", name="Qwen2.5-1.5B{}"), block_size=131072, - vocab_size=151936, + vocab_size=151643, padded_vocab_size=151936, n_layer=28, n_head=12, @@ -1756,7 +1756,7 @@ def norm_class(self) -> Type: name="Qwen2.5-3B{}", hf_config=dict(org="Qwen", name="Qwen2.5-3B{}"), block_size=32768, - vocab_size=151936, + vocab_size=151643, padded_vocab_size=151936, n_layer=36, n_head=16, @@ -1777,7 +1777,7 @@ def norm_class(self) -> Type: name="Qwen2.5-7B{}", hf_config=dict(org="Qwen", name="Qwen2.5-7B{}"), block_size=131072, - vocab_size=152064, + vocab_size=151643, padded_vocab_size=152064, n_layer=28, n_head=28, @@ -1798,7 +1798,7 @@ def norm_class(self) -> Type: name="Qwen2.5-14B{}", hf_config=dict(org="Qwen", name="Qwen2.5-14B{}"), block_size=131072, - vocab_size=152064, + vocab_size=151643, padded_vocab_size=152064, n_layer=48, n_head=40, @@ -1819,7 +1819,7 @@ def norm_class(self) -> Type: name="Qwen2.5-32B{}", hf_config=dict(org="Qwen", name="Qwen2.5-32B{}"), block_size=131072, - vocab_size=152064, + vocab_size=151643, padded_vocab_size=152064, n_layer=64, n_head=40, @@ -1840,7 +1840,7 @@ def norm_class(self) -> Type: name="Qwen2.5-72B{}", hf_config=dict(org="Qwen", name="Qwen2.5-72B{}"), block_size=131072, - vocab_size=152064, + vocab_size=151643, padded_vocab_size=152064, n_layer=80, n_head=64, From bac1a27211e6a7b37e20870a368a4d823871b304 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Wed, 20 Nov 2024 15:25:28 -0500 Subject: [PATCH 07/18] Qwen2.5: fix test_tokenizer bos exception --- tests/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index a823eb71cd..60f6eae34a 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -56,7 +56,7 @@ def test_tokenizer_against_hf(config): else: assert ours.vocab_size == config.vocab_size - if config.name.startswith("falcon") or config.name.startswith("stablecode"): + if config.name.startswith("falcon") or config.name.startswith("stablecode") or config.name.startswith("Qwen2.5"): # even though their config defines it, it's set as None in HF assert isinstance(ours.bos_id, int) assert theirs.bos_token_id is None From d3576d2ad4ef9d415edcf34a22586ee80e1580b9 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: 
Thu, 21 Nov 2024 13:37:19 -0500 Subject: [PATCH 08/18] Qwen2.5: fixed adding config.attn_bias to lora --- litgpt/adapter_v2.py | 2 +- litgpt/lora.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/litgpt/adapter_v2.py b/litgpt/adapter_v2.py index f5e6069343..1ad3d40b9d 100644 --- a/litgpt/adapter_v2.py +++ b/litgpt/adapter_v2.py @@ -163,7 +163,7 @@ def __init__(self, config: Config, block_idx: int) -> None: nn.Module.__init__(self) shape = (config.n_head + 2 * config.n_query_groups) * config.head_size # key, query, value projections for all heads, but in a batch - self.attn = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias) + self.attn = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias or config.attn_bias) # output projection # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head` self.proj = AdapterV2Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias) diff --git a/litgpt/lora.py b/litgpt/lora.py index a540961837..18a472337b 100644 --- a/litgpt/lora.py +++ b/litgpt/lora.py @@ -609,7 +609,7 @@ def __init__(self, config: Config, block_idx: int) -> None: lora_alpha=config.lora_alpha, lora_dropout=config.lora_dropout, enable_lora=(config.lora_query, config.lora_key, config.lora_value), - bias=config.bias, + bias=config.bias or config.attn_bias, # for MQA/GQA support head_size=config.head_size, n_head=config.n_head, From 0c22a8e5bc0eb931757f995b62532d21976eb6db Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Fri, 22 Nov 2024 13:38:55 -0500 Subject: [PATCH 09/18] Update tests/test_tokenizer.py Co-authored-by: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com> --- tests/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 60f6eae34a..2b30c916e9 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -56,7 +56,7 @@ def test_tokenizer_against_hf(config): else: assert ours.vocab_size == config.vocab_size - if config.name.startswith("falcon") or config.name.startswith("stablecode") or config.name.startswith("Qwen2.5"): + if config.name.startswith(("falcon", "stablecode", "Qwen2.5")): # even though their config defines it, it's set as None in HF assert isinstance(ours.bos_id, int) assert theirs.bos_token_id is None From e441de39d53709204fe83f6f610a0f74a5b199df Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Fri, 22 Nov 2024 13:53:48 -0500 Subject: [PATCH 10/18] Qwen2.5: fix adding Qwen2.5-Coder in prompts.py --- litgpt/prompts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litgpt/prompts.py b/litgpt/prompts.py index cb26d35e6b..e11c0e9583 100644 --- a/litgpt/prompts.py +++ b/litgpt/prompts.py @@ -349,7 +349,7 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle: return Gemma() if re.search(r"OLMo.*-hf", model_name): return OLMo() - if re.search(r"Qwen2\.5-(?!Coder)", model_name): + if re.search(r"Qwen2\.5-.*", model_name): return Qwen2_5() return Default() From 982d0c94291bc092a59781c035039f4b1cc79761 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Fri, 22 Nov 2024 14:11:22 -0500 Subject: [PATCH 11/18] Qwen2.5: added coder variant --- litgpt/config.py | 132 ++++++++++++++++++++++++++++ tutorials/download_model_weights.md | 16 +++- 2 files changed, 146 insertions(+), 2 deletions(-) diff --git a/litgpt/config.py b/litgpt/config.py index 84511a663a..5884433372 100644 --- a/litgpt/config.py +++ 
b/litgpt/config.py @@ -1857,6 +1857,138 @@ def norm_class(self) -> Type: rope_base=1000000 ), ] + +qwen_2_5_coder = [ + # https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B/blob/main/config.json + dict( + name="Qwen2.5-Coder-0.5B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-Coder-0.5B{}"), + block_size=32768, + vocab_size=151643, + padded_vocab_size=151936, + n_layer=24, + n_head=14, + n_embd=896, + n_query_groups=2, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=4864, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B/blob/main/config.json + dict( + name="Qwen2.5-Coder-1.5B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-Coder-1.5B{}"), + block_size=131072, + vocab_size=151643, + padded_vocab_size=151936, + n_layer=28, + n_head=12, + n_embd=1536, + n_query_groups=2, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=8960, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-Coder-3B/blob/main/config.json + dict( + name="Qwen2.5-Coder-3B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-Coder-3B{}"), + block_size=32768, + vocab_size=151643, + padded_vocab_size=151936, + n_layer=36, + n_head=16, + n_embd=2048, + n_query_groups=2, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=11008, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-Coder-7B/blob/main/config.json + dict( + name="Qwen2.5-Coder-7B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-Coder-7B{}"), + block_size=131072, + vocab_size=151643, + padded_vocab_size=152064, + n_layer=28, + n_head=28, + n_embd=3584, + n_query_groups=4, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=18944, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-Coder-14B/blob/main/config.json + dict( + name="Qwen2.5-Coder-14B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-Coder-14B{}"), + block_size=131072, + vocab_size=151643, + padded_vocab_size=152064, + n_layer=48, + n_head=40, + n_embd=5120, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=13824, + norm_eps=1e-5, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-Coder-32B/blob/main/config.json + dict( + name="Qwen2.5-Coder-32B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-Coder-32B{}"), + block_size=131072, + vocab_size=151643, + padded_vocab_size=152064, + n_layer=64, + n_head=40, + n_embd=5120, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=27648, + norm_eps=1e-5, + rope_base=1000000 + ), +] + +qwen_2_5.extend(qwen_2_5_coder) + for c in qwen_2_5: for kind in ("", "-Instruct"): copy = deepcopy(c) diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md index 8b90b9400b..b14e149d80 100644 --- a/tutorials/download_model_weights.md +++ b/tutorials/download_model_weights.md @@ -164,8 +164,8 @@ openlm-research/open_llama_3b 
openlm-research/open_llama_7b Qwen/Qwen2.5-0.5B Qwen/Qwen2.5-0.5B-Instruct -Qwen/Qwen2.5-1B -Qwen/Qwen2.5-1B-Instruct +Qwen/Qwen2.5-1.5B +Qwen/Qwen2.5-1.5B-Instruct Qwen/Qwen2.5-3B Qwen/Qwen2.5-3B-Instruct Qwen/Qwen2.5-7B @@ -176,6 +176,18 @@ Qwen/Qwen2.5-32B Qwen/Qwen2.5-32B-Instruct Qwen/Qwen2.5-72B Qwen/Qwen2.5-72B-Instruct +Qwen/Qwen2.5-Coder-0.5B +Qwen/Qwen2.5-Coder-0.5B-Instruct +Qwen/Qwen2.5-Coder-1.5B +Qwen/Qwen2.5-Coder-1.5B-Instruct +Qwen/Qwen2.5-Coder-3B +Qwen/Qwen2.5-Coder-3B-Instruct +Qwen/Qwen2.5-Coder-7B +Qwen/Qwen2.5-Coder-7B-Instruct +Qwen/Qwen2.5-Coder-14B +Qwen/Qwen2.5-Coder-14B-Instruct +Qwen/Qwen2.5-Coder-32B +Qwen/Qwen2.5-Coder-32B-Instruct stabilityai/FreeWilly2 stabilityai/stable-code-3b stabilityai/stablecode-completion-alpha-3b From 0b9d17209209db31c2872c4c799a9b7770a79760 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Fri, 22 Nov 2024 14:16:16 -0500 Subject: [PATCH 12/18] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index be3b222a03..87389e10fb 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,7 @@ Every model is written from scratch to maximize performance and remove layers of | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) | | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) | +| Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186) | | StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | | StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) | | StableLM Zephyr | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | From f68808d69df1d8bf9f725a655b4e6268f37fcd47 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Fri, 22 Nov 2024 14:17:00 -0500 Subject: [PATCH 13/18] Update download_model_weights.md --- tutorials/download_model_weights.md | 1 + 1 file changed, 1 insertion(+) diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md index b14e149d80..50c6924f63 100644 --- a/tutorials/download_model_weights.md +++ b/tutorials/download_model_weights.md @@ -34,6 +34,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights. | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) | | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) | +| Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 
2024](https://arxiv.org/abs/2409.12186) | | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) | | StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | | StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) | From d58eb95941fbd6d38bb7ff49e841275273363564 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Sun, 24 Nov 2024 12:07:50 -0500 Subject: [PATCH 14/18] Qwen2.5: fix convert lit/hf checkpoint scripts --- litgpt/scripts/convert_hf_checkpoint.py | 8 ++++---- litgpt/scripts/convert_lit_checkpoint.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/litgpt/scripts/convert_hf_checkpoint.py b/litgpt/scripts/convert_hf_checkpoint.py index a38ac174a9..7ac7254d60 100644 --- a/litgpt/scripts/convert_hf_checkpoint.py +++ b/litgpt/scripts/convert_hf_checkpoint.py @@ -562,14 +562,14 @@ def convert_hf_checkpoint( # holder to reconstitute the split q, k, v qkv_weights = {} copy_fn = partial(copy_weights_phi, config, qkv_weights) - elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): - # holder to reconstitute the split q, k, v - qkv_weights = {} - copy_fn = partial(copy_weights_hf_llama, config, qkv_weights) elif model_name.lower().startswith("qwen2.5"): # holder to reconstitute the split q, k, v qkv_weights = {} copy_fn = partial(copy_weights_qwen_2_5, config, qkv_weights) + elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): + # holder to reconstitute the split q, k, v + qkv_weights = {} + copy_fn = partial(copy_weights_hf_llama, config, qkv_weights) else: copy_fn = copy_weights_gpt_neox diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py index adc9d85240..26eda5e147 100644 --- a/litgpt/scripts/convert_lit_checkpoint.py +++ b/litgpt/scripts/convert_lit_checkpoint.py @@ -384,11 +384,11 @@ def convert_lit_checkpoint(checkpoint_dir: Path, output_dir: Path) -> None: copy_fn = partial(copy_weights_gemma_2, config) elif config.name.lower().startswith("phi"): copy_fn = partial(copy_weights_phi, config) + elif config.name.lower().startswith("qwen2.5"): + copy_fn = partial(copy_weights_qwen_2_5, config) elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): untie_weights = "Gemma" in config.name copy_fn = partial(copy_weights_llama, config, untie_weights=untie_weights) - elif config.name.lower().startswith("qwen2.5"): - copy_fn = partial(copy_weights_qwen_2_5, config) else: copy_fn = copy_weights_gpt_neox From abfecd81c5f02ad96c6de74c0d6403c9e8c38cba Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Mon, 25 Nov 2024 09:55:21 -0500 Subject: [PATCH 15/18] Qwen2.5: added test script components for qwen2.5 --- tests/test_convert_lit_checkpoint.py | 65 ++++++++++++++++++++++++++++ tests/test_model.py | 63 +++++++++++++++++++++++++++ 2 files changed, 128 insertions(+) diff --git a/tests/test_convert_lit_checkpoint.py b/tests/test_convert_lit_checkpoint.py index f2e0b48459..9f27b80d21 100644 --- a/tests/test_convert_lit_checkpoint.py +++ b/tests/test_convert_lit_checkpoint.py @@ -15,6 +15,7 @@ from transformers.models.llama import LlamaConfig, LlamaForCausalLM from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM from transformers.models.olmo import OlmoConfig, OlmoForCausalLM +from transformers.models.qwen2 import Qwen2Config, Qwen2ForCausalLM from litgpt import GPT, Config from litgpt.scripts.convert_lit_checkpoint import ( @@ -25,6 
+26,7 @@ copy_weights_gpt_neox, copy_weights_llama, copy_weights_phi, + copy_weights_qwen_2_5, qkv_split, ) from tests.conftest import RunIf @@ -520,6 +522,69 @@ def test_check_conversion_supported_lora(): with pytest.raises(ValueError, match=r"LoRA.*cannot be converted"): check_conversion_supported(lit_weights=lit_weights) +@torch.inference_mode() +@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B")) +@pytest.mark.parametrize( + ("device", "dtype"), + [ + (torch.device("cpu"), torch.float32), + pytest.param( + torch.device("cuda"), + torch.float16, + marks=[ + # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input + # is slightly different + pytest.mark.xfail(raises=AssertionError, strict=False), + RunIf(min_cuda_gpus=1), + ], + ), + ], +) +def test_against_original_qwen_2_5(model_name, device, dtype): + torch.set_default_dtype(dtype) + + T = 20 + ours_config = Config.from_name( + model_name, + block_size=T, + n_layer=2, + n_head=16, + n_embd=32, + intermediate_size=86, + ) + theirs_config = Qwen2Config( + vocab_size=ours_config.padded_vocab_size, + hidden_size=ours_config.n_embd, + head_dim=ours_config.head_size, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=ours_config.block_size, + rms_norm_eps=ours_config.norm_eps, + num_key_value_heads=ours_config.n_query_groups, + rope_theta=ours_config.rope_base, + attention_bias=ours_config.attn_bias, + tie_word_embeddings=True, + ) + + assert ours_config.intermediate_size == theirs_config.intermediate_size + + ours_model = GPT(ours_config).to(device) + # tie weights + ours_model.lm_head.weight = ours_model.transformer.wte.weight + ours_state_dict = ours_model.state_dict() + theirs_state_dict = {} + copy_weights_qwen_2_5(ours_config, theirs_state_dict, ours_state_dict, untie_weights=True) + theirs_model = Qwen2ForCausalLM(theirs_config).to(device) + keys = theirs_model.load_state_dict(theirs_state_dict, strict=False) + assert not keys.unexpected_keys + + # test end to end + x = torch.randint(low=0, high=ours_config.padded_vocab_size, size=(T,), device=device).unsqueeze(0) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float + torch.testing.assert_close(ours_y, theirs_y) def test_qkv_split(): # MHA diff --git a/tests/test_model.py b/tests/test_model.py index f2ec330f14..1a9a94efd5 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -28,6 +28,7 @@ from transformers.models.mistral import MistralConfig, MistralForCausalLM from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM from transformers.models.olmo import OlmoConfig, OlmoForCausalLM +from transformers.models.qwen2 import Qwen2Config, Qwen2ForCausalLM import litgpt.config as config_module from litgpt.model import batched_index_copy_ @@ -38,6 +39,7 @@ copy_weights_gpt_neox, copy_weights_hf_llama, copy_weights_phi, + copy_weights_qwen_2_5, ) from tests.conftest import RunIf @@ -787,6 +789,67 @@ def test_against_original_gemma_2(model_name, device, dtype): torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5) +@torch.inference_mode() +@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B")) +@pytest.mark.parametrize( + ("device", "dtype"), + [ + (torch.device("cpu"), torch.float32), + pytest.param( + torch.device("cuda"), + torch.float16, + marks=[ + # the reference does 
softmax upscaled to fp32 during attention. additionally, the final layernorm input + # is slightly different + pytest.mark.xfail(raises=AssertionError, strict=False), + RunIf(min_cuda_gpus=1), + ], + ), + ], +) +def test_against_original_qwen_2_5(model_name, device, dtype): + torch.set_default_dtype(dtype) + + T = 20 + ours_config = Config.from_name( + model_name, + block_size=T, + n_layer=2, + n_head=16, + n_embd=32, + intermediate_size=86, + ) + theirs_config = Qwen2Config( + vocab_size=ours_config.padded_vocab_size, + hidden_size=ours_config.n_embd, + head_dim=ours_config.head_size, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=ours_config.block_size, + rms_norm_eps=ours_config.norm_eps, + num_key_value_heads=ours_config.n_query_groups, + rope_theta=ours_config.rope_base, + attention_bias=ours_config.attn_bias, + tie_word_embeddings=True, + ) + + theirs_model = Qwen2ForCausalLM(theirs_config).to(device) + theirs_state_dict = theirs_model.state_dict() + # Gemma weights are shipped without `lm_head.weight` + theirs_state_dict.pop("lm_head.weight") + state_dict = {} + copy_weights_qwen_2_5(ours_config, {}, state_dict, theirs_state_dict) + ours_model = GPT(ours_config).to(device) + ours_model.load_state_dict(state_dict) + + # test end to end + x = torch.randint(low=0, high=ours_config.padded_vocab_size, size=(T,), device=device).unsqueeze(0) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float + torch.testing.assert_close(ours_y, theirs_y) + @RunIf(dynamo=True) @torch.inference_mode() def test_model_compile(): From f7cb6d375f0f3a3e4261e35e8d3e0fb9abcf8b1c Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Mon, 25 Nov 2024 11:35:51 -0500 Subject: [PATCH 16/18] Update litgpt/scripts/convert_lit_checkpoint.py Co-authored-by: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com> --- litgpt/scripts/convert_lit_checkpoint.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py index 26eda5e147..c0d4eacfa7 100644 --- a/litgpt/scripts/convert_lit_checkpoint.py +++ b/litgpt/scripts/convert_lit_checkpoint.py @@ -302,6 +302,7 @@ def copy_weights_qwen_2_5( config: Config, state_dict: Dict[str, torch.Tensor], lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + untie_weights: bool = False, saver: Optional[incremental_save] = None, ) -> None: weight_map = { From 425dfdd28fdf0d6e4f66ac7d8a487871044a7e90 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Mon, 25 Nov 2024 11:35:57 -0500 Subject: [PATCH 17/18] Update litgpt/scripts/convert_lit_checkpoint.py Co-authored-by: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com> --- litgpt/scripts/convert_lit_checkpoint.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py index c0d4eacfa7..e6bdf78abc 100644 --- a/litgpt/scripts/convert_lit_checkpoint.py +++ b/litgpt/scripts/convert_lit_checkpoint.py @@ -318,6 +318,8 @@ def copy_weights_qwen_2_5( } for name, param in lit_weights.items(): + if name == "lm_head.weight" and untie_weights: + continue if name.endswith((".attn.attn.weight", ".attn.attn.bias")): from_name, l_idx = layer_template(name, 2) qkv = load_param(param, name, None) From deecd07b9238cba24bbe232bac85d977056836d2 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: 
Tue, 26 Nov 2024 13:21:14 -0500 Subject: [PATCH 18/18] Qwen2.5: relaxed assert_close for test_lora.py gemma-2 --- tests/test_lora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lora.py b/tests/test_lora.py index f14323b38a..079d900d0b 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -674,7 +674,7 @@ def test_against_original_gemma_2(model_name): assert x.size(1) == T ours_y = ours_model(x) theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float - torch.testing.assert_close(ours_y, theirs_y) + torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5) @RunIf(min_cuda_gpus=1)
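
A note on the QKV packing used throughout this series: `copy_weights_qwen_2_5` in convert_hf_checkpoint.py folds the separate `q_proj`/`k_proj`/`v_proj` tensors into litgpt's single fused `attn.attn` parameter, interleaving one block of query rows with the matching key and value rows for each kv group. A minimal standalone sketch of that split/zip/cat scheme with toy sizes (the variable names and dimensions below are illustrative, not taken from the patch):

import torch

# Toy dimensions standing in for config.n_head, config.n_query_groups, etc.
n_head, n_query_groups, head_size, n_embd = 4, 2, 8, 32
q_per_kv = n_head // n_query_groups  # query heads served by each kv group

q = torch.randn(n_head * head_size, n_embd)
k = torch.randn(n_query_groups * head_size, n_embd)
v = torch.randn(n_query_groups * head_size, n_embd)

# Same scheme as copy_weights_qwen_2_5: one query chunk per kv group, then
# interleave [q_group0, k0, v0, q_group1, k1, v1, ...] along dim 0.
qs = torch.split(q, head_size * q_per_kv)
ks = torch.split(k, head_size)
vs = torch.split(v, head_size)
fused = torch.cat([t for group in zip(qs, ks, vs) for t in group])
assert fused.shape == ((n_head + 2 * n_query_groups) * head_size, n_embd)

The existing `qkv_split` helper in convert_lit_checkpoint.py undoes this packing when exporting back to the Hugging Face layout, which is why `copy_weights_qwen_2_5` there can emit separate q/k/v projections again.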
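
End to end, the series adds the Qwen2.5 and Qwen2.5 Coder configs, the `attn_bias` flag (bias on the fused QKV projection only, matching Qwen2's bias on q/k/v but not on `o_proj`), checkpoint conversion in both directions, the ChatML-style prompt template, and tests. A quick usage sketch, assuming a litgpt checkout with all of these patches applied (the model name and prompt below are arbitrary examples):

from litgpt import Config
from litgpt.prompts import model_name_to_prompt_style

# attn_bias puts a bias only on the fused QKV projection; everything else stays bias-free
cfg = Config.from_name("Qwen2.5-7B-Instruct")
assert cfg.attn_bias and not cfg.bias
assert cfg.n_head // cfg.n_query_groups == 7  # GQA: 28 query heads share 4 kv groups

# The Qwen2_5 prompt style wraps requests in the <|im_start|>/<|im_end|> template
# with the default Qwen system message.
style = model_name_to_prompt_style("Qwen2.5-7B-Instruct")
print(style.apply("Explain grouped-query attention in one sentence."))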