From 4448918b96bcd8b9d145f02d4e36e509717df7b9 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Tue, 19 Nov 2024 21:44:33 -0500 Subject: [PATCH 01/18] qwen2.5: added config + special attn_bias --- litgpt/config.py | 160 +++++++++++++++++++++++++++++++++++++++++++++++ litgpt/model.py | 2 +- 2 files changed, 161 insertions(+), 1 deletion(-) diff --git a/litgpt/config.py b/litgpt/config.py index b218df849c..332d25733b 100644 --- a/litgpt/config.py +++ b/litgpt/config.py @@ -31,6 +31,7 @@ class Config: parallel_residual: bool = True bias: bool = True lm_head_bias: bool = False + attn_bias: bool = False # to use multi-head attention (MHA), set this to `n_head` (default) # to use multi-query attention (MQA), set this to 1 # to use grouped-query attention (GQA), set this to a value in between @@ -1704,4 +1705,163 @@ def norm_class(self) -> Type: configs.extend(llama_2_function_calling) +########## +# Qwen2.5 +########## +qwen_2_5 = [ + # https://huggingface.co/Qwen/Qwen2.5-0.5B/blob/main/config.json + dict( + name="Qwen2.5-0.5B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-0.5B{}"), + block_size=32768, + vocab_size=151936, + padded_vocab_size=151936, + n_layer=24, + n_head=14, + n_embd=896, + n_query_groups=2, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=4864, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-1.5B/blob/main/config.json + dict( + name="Qwen2.5-1.5B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-1.5B{}"), + block_size=131072, + vocab_size=151936, + padded_vocab_size=151936, + n_layer=28, + n_head=12, + n_embd=1536, + n_query_groups=2, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=8960, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-3B/blob/main/config.json + dict( + name="Qwen2.5-3B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-3B{}"), + block_size=32768, + vocab_size=151936, + padded_vocab_size=151936, + n_layer=36, + n_head=16, + n_embd=2048, + n_query_groups=2, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=11008, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-7B/blob/main/config.json + dict( + name="Qwen2.5-7B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-7B{}"), + block_size=131072, + vocab_size=152064, + padded_vocab_size=152064, + n_layer=28, + n_head=28, + n_embd=3584, + n_query_groups=4, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=18944, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-14B/blob/main/config.json + dict( + name="Qwen2.5-14B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-14B{}"), + block_size=131072, + vocab_size=152064, + padded_vocab_size=152064, + n_layer=48, + n_head=40, + n_embd=5120, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=13824, + norm_eps=1e-5, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-32B/blob/main/config.json + dict( + name="Qwen2.5-32B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-32B{}"), + 
block_size=131072, + vocab_size=152064, + padded_vocab_size=152064, + n_layer=64, + n_head=40, + n_embd=5120, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=27648, + norm_eps=1e-5, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-72B/blob/main/config.json + dict( + name="Qwen2.5-72B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-72B{}"), + block_size=131072, + vocab_size=152064, + padded_vocab_size=152064, + n_layer=80, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=29568, + norm_eps=1e-5, + rope_base=1000000 + ), +] +for c in qwen_2_5: + for kind in ("", "-Instruct"): + copy = deepcopy(c) + copy["name"] = c["name"].format(kind) + copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + configs.append(copy) + name_to_config = {config["name"]: config for config in configs} diff --git a/litgpt/model.py b/litgpt/model.py index b60b0506b6..17b3b4ab04 100644 --- a/litgpt/model.py +++ b/litgpt/model.py @@ -244,7 +244,7 @@ def __init__(self, config: Config, block_idx: int) -> None: super().__init__() shape = (config.n_head + 2 * config.n_query_groups) * config.head_size # key, query, value projections for all heads, but in a batch - self.attn = nn.Linear(config.n_embd, shape, bias=config.bias) + self.attn = nn.Linear(config.n_embd, shape, bias=config.bias or config.attn_bias) # output projection # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head` self.proj = nn.Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias) From f16944f912c06e6820a19422ad5a8adbe7d6b613 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Wed, 20 Nov 2024 13:09:48 -0500 Subject: [PATCH 02/18] Qwen2.5: convert checkpoints scripts --- litgpt/scripts/convert_hf_checkpoint.py | 79 ++++++++++++++++++++++++ litgpt/scripts/convert_lit_checkpoint.py | 45 ++++++++++++++ 2 files changed, 124 insertions(+) diff --git a/litgpt/scripts/convert_hf_checkpoint.py b/litgpt/scripts/convert_hf_checkpoint.py index 0fbb99bd49..a38ac174a9 100644 --- a/litgpt/scripts/convert_hf_checkpoint.py +++ b/litgpt/scripts/convert_hf_checkpoint.py @@ -407,6 +407,81 @@ def copy_weights_phi( if progress_per_file is not None: pbar.update(progress_per_file) +def copy_weights_qwen_2_5( + config: Config, + qkv_weights: Dict[int, List[Optional[NotYetLoadedTensor]]], + state_dict: Dict[str, torch.Tensor], + hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + saver: Optional[incremental_save] = None, + dtype: Optional[torch.dtype] = None, + pbar: Optional[tqdm] = None, + progress_per_file: Optional[float] = None, + debug_mode: Optional[bool] = False +) -> None: + weight_map = { + "model.embed_tokens.weight": "transformer.wte.weight", + "model.layers.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight", + "model.layers.{}.self_attn.q_proj.weight": None, + "model.layers.{}.self_attn.k_proj.weight": None, + "model.layers.{}.self_attn.v_proj.weight": None, + "model.layers.{}.self_attn.q_proj.bias": None, + "model.layers.{}.self_attn.k_proj.bias": None, + "model.layers.{}.self_attn.v_proj.bias": None, + "model.layers.{}.self_attn.o_proj.weight": "transformer.h.{}.attn.proj.weight", + "model.layers.{}.post_attention_layernorm.weight": 
"transformer.h.{}.norm_2.weight", + "model.layers.{}.mlp.gate_proj.weight": "transformer.h.{}.mlp.fc_1.weight", + "model.layers.{}.mlp.up_proj.weight": "transformer.h.{}.mlp.fc_2.weight", + "model.layers.{}.mlp.down_proj.weight": "transformer.h.{}.mlp.proj.weight", + "model.norm.weight": "transformer.ln_f.weight", + "lm_head.weight": "lm_head.weight", + } + + if progress_per_file is not None: + progress_per_file = progress_per_file / max(1, len(hf_weights) + len(qkv_weights)) + + for name, param in hf_weights.items(): + if "model.layers" in name: + from_name, l = layer_template(name, 2) + qkv = qkv_weights.setdefault(l, defaultdict(dict)) + if any(w in from_name for w in ("q_proj", "k_proj", "v_proj")): + weight_name, weight_type = from_name.split(".")[-2:] + qkv[weight_type][weight_name] = param + to_name = weight_map[from_name] + if to_name is None: + continue + to_name = to_name.format(l) + else: + to_name = weight_map[name] + param = load_param(param, name, dtype, verbose=debug_mode) + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + + if progress_per_file is not None: + pbar.update(progress_per_file) + + if "lm_head.weight" not in state_dict: + state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"] + + for i in list(qkv_weights): + for weight_type in list(qkv_weights[i]): + qkv = qkv_weights[i][weight_type] + if len(qkv) != 3: + # split across different .bin files + continue + q = load_param(qkv["q_proj"], f"layer {i} q {weight_type}", dtype, verbose=debug_mode) + k = load_param(qkv["k_proj"], f"layer {i} k {weight_type}", dtype, verbose=debug_mode) + v = load_param(qkv["v_proj"], f"layer {i} v {weight_type}", dtype, verbose=debug_mode) + q_per_kv = config.n_head // config.n_query_groups + qs = torch.split(q, config.head_size * q_per_kv) + ks = torch.split(k, config.head_size) + vs = torch.split(v, config.head_size) + cycled = [t for group in zip(qs, ks, vs) for t in group] + qkv = torch.cat(cycled) + state_dict[f"transformer.h.{i}.attn.attn.{weight_type}"] = qkv + del qkv_weights[i][weight_type] + if progress_per_file is not None: + pbar.update(progress_per_file) def qkv_reassemble(param: Union[torch.Tensor, NotYetLoadedTensor], config: Config) -> torch.Tensor: """Reassemble from a normal to an interleaved placement in a QKV matrix. 
@@ -491,6 +566,10 @@ def convert_hf_checkpoint( # holder to reconstitute the split q, k, v qkv_weights = {} copy_fn = partial(copy_weights_hf_llama, config, qkv_weights) + elif model_name.lower().startswith("qwen2.5"): + # holder to reconstitute the split q, k, v + qkv_weights = {} + copy_fn = partial(copy_weights_qwen_2_5, config, qkv_weights) else: copy_fn = copy_weights_gpt_neox diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py index 3fce585538..adc9d85240 100644 --- a/litgpt/scripts/convert_lit_checkpoint.py +++ b/litgpt/scripts/convert_lit_checkpoint.py @@ -298,6 +298,49 @@ def copy_weights_phi( state_dict[layer_name] = weight del gate_up_proj_weights[i] +def copy_weights_qwen_2_5( + config: Config, + state_dict: Dict[str, torch.Tensor], + lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + saver: Optional[incremental_save] = None, +) -> None: + weight_map = { + "transformer.wte.weight": "model.embed_tokens.weight", + "transformer.h.{}.norm_1.weight": "model.layers.{}.input_layernorm.weight", + "transformer.h.{}.norm_2.weight": "model.layers.{}.post_attention_layernorm.weight", + "transformer.h.{}.attn.proj.weight": "model.layers.{}.self_attn.o_proj.weight", + "transformer.h.{}.mlp.fc_1.weight": "model.layers.{}.mlp.gate_proj.weight", + "transformer.h.{}.mlp.fc_2.weight": "model.layers.{}.mlp.up_proj.weight", + "transformer.h.{}.mlp.proj.weight": "model.layers.{}.mlp.down_proj.weight", + "transformer.ln_f.weight": "model.norm.weight", + "lm_head.weight": "lm_head.weight", + } + + for name, param in lit_weights.items(): + if name.endswith((".attn.attn.weight", ".attn.attn.bias")): + from_name, l_idx = layer_template(name, 2) + qkv = load_param(param, name, None) + qp, kp, vp = qkv_split(qkv, config) + + weight_type = name.split(".")[-1] # weight or bias + q = f"model.layers.{l_idx}.self_attn.q_proj.{weight_type}" + k = f"model.layers.{l_idx}.self_attn.k_proj.{weight_type}" + v = f"model.layers.{l_idx}.self_attn.v_proj.{weight_type}" + for to_name, param in zip((q, k, v), (qp, kp, vp)): + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param + else: + if "transformer.h" in name: + from_name, l_idx = layer_template(name, 2) + to_name = weight_map[from_name] + to_name = to_name.format(l_idx) + else: + to_name = weight_map[name] + param = load_param(param, name, None) + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param def qkv_split( param: Union[torch.Tensor, NotYetLoadedTensor], config: Config @@ -344,6 +387,8 @@ def convert_lit_checkpoint(checkpoint_dir: Path, output_dir: Path) -> None: elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): untie_weights = "Gemma" in config.name copy_fn = partial(copy_weights_llama, config, untie_weights=untie_weights) + elif config.name.lower().startswith("qwen2.5"): + copy_fn = partial(copy_weights_qwen_2_5, config) else: copy_fn = copy_weights_gpt_neox From 770350700302fcd4f2791a52d18ec4d6198a34ea Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Wed, 20 Nov 2024 13:47:28 -0500 Subject: [PATCH 03/18] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6092db7ad8..be3b222a03 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,7 @@ Every model is written from scratch to maximize performance and remove layers of | Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) | | Platypus | 7B, 13B, 70B | Lee et al. 
| [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) | | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | +| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) | | StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | | StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) | | StableLM Zephyr | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | From 97f6315386751e42735fcd4eb71fbaa2d39fc5b8 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Wed, 20 Nov 2024 13:54:36 -0500 Subject: [PATCH 04/18] Update download_model_weights.md --- tutorials/download_model_weights.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md index 9ab0041357..8b90b9400b 100644 --- a/tutorials/download_model_weights.md +++ b/tutorials/download_model_weights.md @@ -33,6 +33,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights. | Phi 3 & 3.5 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) | | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | +| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) | | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) | | StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | | StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) | @@ -161,6 +162,20 @@ nvidia/Llama-3.1-Nemotron-70B-Instruct-HF openlm-research/open_llama_13b openlm-research/open_llama_3b openlm-research/open_llama_7b +Qwen/Qwen2.5-0.5B +Qwen/Qwen2.5-0.5B-Instruct +Qwen/Qwen2.5-1B +Qwen/Qwen2.5-1B-Instruct +Qwen/Qwen2.5-3B +Qwen/Qwen2.5-3B-Instruct +Qwen/Qwen2.5-7B +Qwen/Qwen2.5-7B-Instruct +Qwen/Qwen2.5-14B +Qwen/Qwen2.5-14B-Instruct +Qwen/Qwen2.5-32B +Qwen/Qwen2.5-32B-Instruct +Qwen/Qwen2.5-72B +Qwen/Qwen2.5-72B-Instruct stabilityai/FreeWilly2 stabilityai/stable-code-3b stabilityai/stablecode-completion-alpha-3b From 1e793701601ad7b7d114e429d0a12595e22a9ff0 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Wed, 20 Nov 2024 14:14:11 -0500 Subject: [PATCH 05/18] Qwen2.5: added prompt template --- litgpt/prompts.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/litgpt/prompts.py b/litgpt/prompts.py index 09fb86676c..cb26d35e6b 100644 --- a/litgpt/prompts.py +++ b/litgpt/prompts.py @@ -279,6 +279,12 @@ def apply(self, prompt: str, **kwargs: str) -> str: class OLMo(PromptStyle): def apply(self, prompt: str, **kwargs: str) -> str: return f"<|endoftext|><|user|>\n{prompt}\n<|assistant|>\n" + + +class Qwen2_5(PromptStyle): + def apply(self, prompt: str, **kwargs: str) -> str: + system_message = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant." 
+ return f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" # Maps prompt style names to PromptStyle classes @@ -304,6 +310,7 @@ def apply(self, prompt: str, **kwargs: str) -> str: "gemma": Gemma, "llama3": Llama3, "olmo": OLMo, + "qwen2.5": Qwen2_5, } @@ -342,6 +349,8 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle: return Gemma() if re.search(r"OLMo.*-hf", model_name): return OLMo() + if re.search(r"Qwen2\.5-(?!Coder)", model_name): + return Qwen2_5() return Default() From 43990411e1bf1cb68c2bd088b6ed21ced879b6f0 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Wed, 20 Nov 2024 14:56:44 -0500 Subject: [PATCH 06/18] Qwen2.5: fix tokenizer vocab size --- litgpt/config.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/litgpt/config.py b/litgpt/config.py index 332d25733b..84511a663a 100644 --- a/litgpt/config.py +++ b/litgpt/config.py @@ -1714,7 +1714,7 @@ def norm_class(self) -> Type: name="Qwen2.5-0.5B{}", hf_config=dict(org="Qwen", name="Qwen2.5-0.5B{}"), block_size=32768, - vocab_size=151936, + vocab_size=151643, padded_vocab_size=151936, n_layer=24, n_head=14, @@ -1735,7 +1735,7 @@ def norm_class(self) -> Type: name="Qwen2.5-1.5B{}", hf_config=dict(org="Qwen", name="Qwen2.5-1.5B{}"), block_size=131072, - vocab_size=151936, + vocab_size=151643, padded_vocab_size=151936, n_layer=28, n_head=12, @@ -1756,7 +1756,7 @@ def norm_class(self) -> Type: name="Qwen2.5-3B{}", hf_config=dict(org="Qwen", name="Qwen2.5-3B{}"), block_size=32768, - vocab_size=151936, + vocab_size=151643, padded_vocab_size=151936, n_layer=36, n_head=16, @@ -1777,7 +1777,7 @@ def norm_class(self) -> Type: name="Qwen2.5-7B{}", hf_config=dict(org="Qwen", name="Qwen2.5-7B{}"), block_size=131072, - vocab_size=152064, + vocab_size=151643, padded_vocab_size=152064, n_layer=28, n_head=28, @@ -1798,7 +1798,7 @@ def norm_class(self) -> Type: name="Qwen2.5-14B{}", hf_config=dict(org="Qwen", name="Qwen2.5-14B{}"), block_size=131072, - vocab_size=152064, + vocab_size=151643, padded_vocab_size=152064, n_layer=48, n_head=40, @@ -1819,7 +1819,7 @@ def norm_class(self) -> Type: name="Qwen2.5-32B{}", hf_config=dict(org="Qwen", name="Qwen2.5-32B{}"), block_size=131072, - vocab_size=152064, + vocab_size=151643, padded_vocab_size=152064, n_layer=64, n_head=40, @@ -1840,7 +1840,7 @@ def norm_class(self) -> Type: name="Qwen2.5-72B{}", hf_config=dict(org="Qwen", name="Qwen2.5-72B{}"), block_size=131072, - vocab_size=152064, + vocab_size=151643, padded_vocab_size=152064, n_layer=80, n_head=64, From bac1a27211e6a7b37e20870a368a4d823871b304 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Wed, 20 Nov 2024 15:25:28 -0500 Subject: [PATCH 07/18] Qwen2.5: fix test_tokenizer bos exception --- tests/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index a823eb71cd..60f6eae34a 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -56,7 +56,7 @@ def test_tokenizer_against_hf(config): else: assert ours.vocab_size == config.vocab_size - if config.name.startswith("falcon") or config.name.startswith("stablecode"): + if config.name.startswith("falcon") or config.name.startswith("stablecode") or config.name.startswith("Qwen2.5"): # even though their config defines it, it's set as None in HF assert isinstance(ours.bos_id, int) assert theirs.bos_token_id is None From d3576d2ad4ef9d415edcf34a22586ee80e1580b9 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: 
Thu, 21 Nov 2024 13:37:19 -0500 Subject: [PATCH 08/18] Qwen2.5: fixed adding config.attn_bias to lora --- litgpt/adapter_v2.py | 2 +- litgpt/lora.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/litgpt/adapter_v2.py b/litgpt/adapter_v2.py index f5e6069343..1ad3d40b9d 100644 --- a/litgpt/adapter_v2.py +++ b/litgpt/adapter_v2.py @@ -163,7 +163,7 @@ def __init__(self, config: Config, block_idx: int) -> None: nn.Module.__init__(self) shape = (config.n_head + 2 * config.n_query_groups) * config.head_size # key, query, value projections for all heads, but in a batch - self.attn = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias) + self.attn = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias or config.attn_bias) # output projection # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head` self.proj = AdapterV2Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias) diff --git a/litgpt/lora.py b/litgpt/lora.py index a540961837..18a472337b 100644 --- a/litgpt/lora.py +++ b/litgpt/lora.py @@ -609,7 +609,7 @@ def __init__(self, config: Config, block_idx: int) -> None: lora_alpha=config.lora_alpha, lora_dropout=config.lora_dropout, enable_lora=(config.lora_query, config.lora_key, config.lora_value), - bias=config.bias, + bias=config.bias or config.attn_bias, # for MQA/GQA support head_size=config.head_size, n_head=config.n_head, From 0c22a8e5bc0eb931757f995b62532d21976eb6db Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Fri, 22 Nov 2024 13:38:55 -0500 Subject: [PATCH 09/18] Update tests/test_tokenizer.py Co-authored-by: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com> --- tests/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 60f6eae34a..2b30c916e9 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -56,7 +56,7 @@ def test_tokenizer_against_hf(config): else: assert ours.vocab_size == config.vocab_size - if config.name.startswith("falcon") or config.name.startswith("stablecode") or config.name.startswith("Qwen2.5"): + if config.name.startswith(("falcon", "stablecode", "Qwen2.5")): # even though their config defines it, it's set as None in HF assert isinstance(ours.bos_id, int) assert theirs.bos_token_id is None From e441de39d53709204fe83f6f610a0f74a5b199df Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Fri, 22 Nov 2024 13:53:48 -0500 Subject: [PATCH 10/18] Qwen2.5: fix adding Qwen2.5-Coder in prompts.py --- litgpt/prompts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litgpt/prompts.py b/litgpt/prompts.py index cb26d35e6b..e11c0e9583 100644 --- a/litgpt/prompts.py +++ b/litgpt/prompts.py @@ -349,7 +349,7 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle: return Gemma() if re.search(r"OLMo.*-hf", model_name): return OLMo() - if re.search(r"Qwen2\.5-(?!Coder)", model_name): + if re.search(r"Qwen2\.5-.*", model_name): return Qwen2_5() return Default() From 982d0c94291bc092a59781c035039f4b1cc79761 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Fri, 22 Nov 2024 14:11:22 -0500 Subject: [PATCH 11/18] Qwen2.5: added coder variant --- litgpt/config.py | 132 ++++++++++++++++++++++++++++ tutorials/download_model_weights.md | 16 +++- 2 files changed, 146 insertions(+), 2 deletions(-) diff --git a/litgpt/config.py b/litgpt/config.py index 84511a663a..5884433372 100644 --- a/litgpt/config.py +++ 
b/litgpt/config.py @@ -1857,6 +1857,138 @@ def norm_class(self) -> Type: rope_base=1000000 ), ] + +qwen_2_5_coder = [ + # https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B/blob/main/config.json + dict( + name="Qwen2.5-Coder-0.5B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-Coder-0.5B{}"), + block_size=32768, + vocab_size=151643, + padded_vocab_size=151936, + n_layer=24, + n_head=14, + n_embd=896, + n_query_groups=2, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=4864, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B/blob/main/config.json + dict( + name="Qwen2.5-Coder-1.5B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-Coder-1.5B{}"), + block_size=131072, + vocab_size=151643, + padded_vocab_size=151936, + n_layer=28, + n_head=12, + n_embd=1536, + n_query_groups=2, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=8960, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-Coder-3B/blob/main/config.json + dict( + name="Qwen2.5-Coder-3B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-Coder-3B{}"), + block_size=32768, + vocab_size=151643, + padded_vocab_size=151936, + n_layer=36, + n_head=16, + n_embd=2048, + n_query_groups=2, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=11008, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-Coder-7B/blob/main/config.json + dict( + name="Qwen2.5-Coder-7B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-Coder-7B{}"), + block_size=131072, + vocab_size=151643, + padded_vocab_size=152064, + n_layer=28, + n_head=28, + n_embd=3584, + n_query_groups=4, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=18944, + norm_eps=1e-6, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-Coder-14B/blob/main/config.json + dict( + name="Qwen2.5-Coder-14B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-Coder-14B{}"), + block_size=131072, + vocab_size=151643, + padded_vocab_size=152064, + n_layer=48, + n_head=40, + n_embd=5120, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=13824, + norm_eps=1e-5, + rope_base=1000000 + ), + # https://huggingface.co/Qwen/Qwen2.5-Coder-32B/blob/main/config.json + dict( + name="Qwen2.5-Coder-32B{}", + hf_config=dict(org="Qwen", name="Qwen2.5-Coder-32B{}"), + block_size=131072, + vocab_size=151643, + padded_vocab_size=152064, + n_layer=64, + n_head=40, + n_embd=5120, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + attn_bias=True, + norm_class_name="RMSNorm", + mlp_class_name="LLaMAMLP", + intermediate_size=27648, + norm_eps=1e-5, + rope_base=1000000 + ), +] + +qwen_2_5.extend(qwen_2_5_coder) + for c in qwen_2_5: for kind in ("", "-Instruct"): copy = deepcopy(c) diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md index 8b90b9400b..b14e149d80 100644 --- a/tutorials/download_model_weights.md +++ b/tutorials/download_model_weights.md @@ -164,8 +164,8 @@ openlm-research/open_llama_3b 
openlm-research/open_llama_7b Qwen/Qwen2.5-0.5B Qwen/Qwen2.5-0.5B-Instruct -Qwen/Qwen2.5-1B -Qwen/Qwen2.5-1B-Instruct +Qwen/Qwen2.5-1.5B +Qwen/Qwen2.5-1.5B-Instruct Qwen/Qwen2.5-3B Qwen/Qwen2.5-3B-Instruct Qwen/Qwen2.5-7B @@ -176,6 +176,18 @@ Qwen/Qwen2.5-32B Qwen/Qwen2.5-32B-Instruct Qwen/Qwen2.5-72B Qwen/Qwen2.5-72B-Instruct +Qwen/Qwen2.5-Coder-0.5B +Qwen/Qwen2.5-Coder-0.5B-Instruct +Qwen/Qwen2.5-Coder-1.5B +Qwen/Qwen2.5-Coder-1.5B-Instruct +Qwen/Qwen2.5-Coder-3B +Qwen/Qwen2.5-Coder-3B-Instruct +Qwen/Qwen2.5-Coder-7B +Qwen/Qwen2.5-Coder-7B-Instruct +Qwen/Qwen2.5-Coder-14B +Qwen/Qwen2.5-Coder-14B-Instruct +Qwen/Qwen2.5-Coder-32B +Qwen/Qwen2.5-Coder-32B-Instruct stabilityai/FreeWilly2 stabilityai/stable-code-3b stabilityai/stablecode-completion-alpha-3b From 0b9d17209209db31c2872c4c799a9b7770a79760 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Fri, 22 Nov 2024 14:16:16 -0500 Subject: [PATCH 12/18] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index be3b222a03..87389e10fb 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,7 @@ Every model is written from scratch to maximize performance and remove layers of | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) | | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) | +| Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186) | | StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | | StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) | | StableLM Zephyr | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | From f68808d69df1d8bf9f725a655b4e6268f37fcd47 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Fri, 22 Nov 2024 14:17:00 -0500 Subject: [PATCH 13/18] Update download_model_weights.md --- tutorials/download_model_weights.md | 1 + 1 file changed, 1 insertion(+) diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md index b14e149d80..50c6924f63 100644 --- a/tutorials/download_model_weights.md +++ b/tutorials/download_model_weights.md @@ -34,6 +34,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights. | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) | | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) | +| Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 
2024](https://arxiv.org/abs/2409.12186) | | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) | | StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | | StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) | From d58eb95941fbd6d38bb7ff49e841275273363564 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Sun, 24 Nov 2024 12:07:50 -0500 Subject: [PATCH 14/18] Qwen2.5: fix convert lit/hf checkpoint scripts --- litgpt/scripts/convert_hf_checkpoint.py | 8 ++++---- litgpt/scripts/convert_lit_checkpoint.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/litgpt/scripts/convert_hf_checkpoint.py b/litgpt/scripts/convert_hf_checkpoint.py index a38ac174a9..7ac7254d60 100644 --- a/litgpt/scripts/convert_hf_checkpoint.py +++ b/litgpt/scripts/convert_hf_checkpoint.py @@ -562,14 +562,14 @@ def convert_hf_checkpoint( # holder to reconstitute the split q, k, v qkv_weights = {} copy_fn = partial(copy_weights_phi, config, qkv_weights) - elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): - # holder to reconstitute the split q, k, v - qkv_weights = {} - copy_fn = partial(copy_weights_hf_llama, config, qkv_weights) elif model_name.lower().startswith("qwen2.5"): # holder to reconstitute the split q, k, v qkv_weights = {} copy_fn = partial(copy_weights_qwen_2_5, config, qkv_weights) + elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): + # holder to reconstitute the split q, k, v + qkv_weights = {} + copy_fn = partial(copy_weights_hf_llama, config, qkv_weights) else: copy_fn = copy_weights_gpt_neox diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py index adc9d85240..26eda5e147 100644 --- a/litgpt/scripts/convert_lit_checkpoint.py +++ b/litgpt/scripts/convert_lit_checkpoint.py @@ -384,11 +384,11 @@ def convert_lit_checkpoint(checkpoint_dir: Path, output_dir: Path) -> None: copy_fn = partial(copy_weights_gemma_2, config) elif config.name.lower().startswith("phi"): copy_fn = partial(copy_weights_phi, config) + elif config.name.lower().startswith("qwen2.5"): + copy_fn = partial(copy_weights_qwen_2_5, config) elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): untie_weights = "Gemma" in config.name copy_fn = partial(copy_weights_llama, config, untie_weights=untie_weights) - elif config.name.lower().startswith("qwen2.5"): - copy_fn = partial(copy_weights_qwen_2_5, config) else: copy_fn = copy_weights_gpt_neox From abfecd81c5f02ad96c6de74c0d6403c9e8c38cba Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Mon, 25 Nov 2024 09:55:21 -0500 Subject: [PATCH 15/18] Qwen2.5: added test script components for qwen2.5 --- tests/test_convert_lit_checkpoint.py | 65 ++++++++++++++++++++++++++++ tests/test_model.py | 63 +++++++++++++++++++++++++++ 2 files changed, 128 insertions(+) diff --git a/tests/test_convert_lit_checkpoint.py b/tests/test_convert_lit_checkpoint.py index f2e0b48459..9f27b80d21 100644 --- a/tests/test_convert_lit_checkpoint.py +++ b/tests/test_convert_lit_checkpoint.py @@ -15,6 +15,7 @@ from transformers.models.llama import LlamaConfig, LlamaForCausalLM from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM from transformers.models.olmo import OlmoConfig, OlmoForCausalLM +from transformers.models.qwen2 import Qwen2Config, Qwen2ForCausalLM from litgpt import GPT, Config from litgpt.scripts.convert_lit_checkpoint import ( @@ -25,6 
+26,7 @@ copy_weights_gpt_neox, copy_weights_llama, copy_weights_phi, + copy_weights_qwen_2_5, qkv_split, ) from tests.conftest import RunIf @@ -520,6 +522,69 @@ def test_check_conversion_supported_lora(): with pytest.raises(ValueError, match=r"LoRA.*cannot be converted"): check_conversion_supported(lit_weights=lit_weights) +@torch.inference_mode() +@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B")) +@pytest.mark.parametrize( + ("device", "dtype"), + [ + (torch.device("cpu"), torch.float32), + pytest.param( + torch.device("cuda"), + torch.float16, + marks=[ + # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input + # is slightly different + pytest.mark.xfail(raises=AssertionError, strict=False), + RunIf(min_cuda_gpus=1), + ], + ), + ], +) +def test_against_original_qwen_2_5(model_name, device, dtype): + torch.set_default_dtype(dtype) + + T = 20 + ours_config = Config.from_name( + model_name, + block_size=T, + n_layer=2, + n_head=16, + n_embd=32, + intermediate_size=86, + ) + theirs_config = Qwen2Config( + vocab_size=ours_config.padded_vocab_size, + hidden_size=ours_config.n_embd, + head_dim=ours_config.head_size, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=ours_config.block_size, + rms_norm_eps=ours_config.norm_eps, + num_key_value_heads=ours_config.n_query_groups, + rope_theta=ours_config.rope_base, + attention_bias=ours_config.attn_bias, + tie_word_embeddings=True, + ) + + assert ours_config.intermediate_size == theirs_config.intermediate_size + + ours_model = GPT(ours_config).to(device) + # tie weights + ours_model.lm_head.weight = ours_model.transformer.wte.weight + ours_state_dict = ours_model.state_dict() + theirs_state_dict = {} + copy_weights_qwen_2_5(ours_config, theirs_state_dict, ours_state_dict, untie_weights=True) + theirs_model = Qwen2ForCausalLM(theirs_config).to(device) + keys = theirs_model.load_state_dict(theirs_state_dict, strict=False) + assert not keys.unexpected_keys + + # test end to end + x = torch.randint(low=0, high=ours_config.padded_vocab_size, size=(T,), device=device).unsqueeze(0) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float + torch.testing.assert_close(ours_y, theirs_y) def test_qkv_split(): # MHA diff --git a/tests/test_model.py b/tests/test_model.py index f2ec330f14..1a9a94efd5 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -28,6 +28,7 @@ from transformers.models.mistral import MistralConfig, MistralForCausalLM from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM from transformers.models.olmo import OlmoConfig, OlmoForCausalLM +from transformers.models.qwen2 import Qwen2Config, Qwen2ForCausalLM import litgpt.config as config_module from litgpt.model import batched_index_copy_ @@ -38,6 +39,7 @@ copy_weights_gpt_neox, copy_weights_hf_llama, copy_weights_phi, + copy_weights_qwen_2_5, ) from tests.conftest import RunIf @@ -787,6 +789,67 @@ def test_against_original_gemma_2(model_name, device, dtype): torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5) +@torch.inference_mode() +@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B")) +@pytest.mark.parametrize( + ("device", "dtype"), + [ + (torch.device("cpu"), torch.float32), + pytest.param( + torch.device("cuda"), + torch.float16, + marks=[ + # the reference does 
softmax upscaled to fp32 during attention. additionally, the final layernorm input + # is slightly different + pytest.mark.xfail(raises=AssertionError, strict=False), + RunIf(min_cuda_gpus=1), + ], + ), + ], +) +def test_against_original_qwen_2_5(model_name, device, dtype): + torch.set_default_dtype(dtype) + + T = 20 + ours_config = Config.from_name( + model_name, + block_size=T, + n_layer=2, + n_head=16, + n_embd=32, + intermediate_size=86, + ) + theirs_config = Qwen2Config( + vocab_size=ours_config.padded_vocab_size, + hidden_size=ours_config.n_embd, + head_dim=ours_config.head_size, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=ours_config.block_size, + rms_norm_eps=ours_config.norm_eps, + num_key_value_heads=ours_config.n_query_groups, + rope_theta=ours_config.rope_base, + attention_bias=ours_config.attn_bias, + tie_word_embeddings=True, + ) + + theirs_model = Qwen2ForCausalLM(theirs_config).to(device) + theirs_state_dict = theirs_model.state_dict() + # Gemma weights are shipped without `lm_head.weight` + theirs_state_dict.pop("lm_head.weight") + state_dict = {} + copy_weights_qwen_2_5(ours_config, {}, state_dict, theirs_state_dict) + ours_model = GPT(ours_config).to(device) + ours_model.load_state_dict(state_dict) + + # test end to end + x = torch.randint(low=0, high=ours_config.padded_vocab_size, size=(T,), device=device).unsqueeze(0) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float + torch.testing.assert_close(ours_y, theirs_y) + @RunIf(dynamo=True) @torch.inference_mode() def test_model_compile(): From f7cb6d375f0f3a3e4261e35e8d3e0fb9abcf8b1c Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Mon, 25 Nov 2024 11:35:51 -0500 Subject: [PATCH 16/18] Update litgpt/scripts/convert_lit_checkpoint.py Co-authored-by: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com> --- litgpt/scripts/convert_lit_checkpoint.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py index 26eda5e147..c0d4eacfa7 100644 --- a/litgpt/scripts/convert_lit_checkpoint.py +++ b/litgpt/scripts/convert_lit_checkpoint.py @@ -302,6 +302,7 @@ def copy_weights_qwen_2_5( config: Config, state_dict: Dict[str, torch.Tensor], lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + untie_weights: bool = False, saver: Optional[incremental_save] = None, ) -> None: weight_map = { From 425dfdd28fdf0d6e4f66ac7d8a487871044a7e90 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: Mon, 25 Nov 2024 11:35:57 -0500 Subject: [PATCH 17/18] Update litgpt/scripts/convert_lit_checkpoint.py Co-authored-by: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com> --- litgpt/scripts/convert_lit_checkpoint.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py index c0d4eacfa7..e6bdf78abc 100644 --- a/litgpt/scripts/convert_lit_checkpoint.py +++ b/litgpt/scripts/convert_lit_checkpoint.py @@ -318,6 +318,8 @@ def copy_weights_qwen_2_5( } for name, param in lit_weights.items(): + if name == "lm_head.weight" and untie_weights: + continue if name.endswith((".attn.attn.weight", ".attn.attn.bias")): from_name, l_idx = layer_template(name, 2) qkv = load_param(param, name, None) From deecd07b9238cba24bbe232bac85d977056836d2 Mon Sep 17 00:00:00 2001 From: Yu Shi Jie Date: 
Tue, 26 Nov 2024 13:21:14 -0500 Subject: [PATCH 18/18] Qwen2.5: relaxed assert_close for test_lora.py gemma-2 --- tests/test_lora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lora.py b/tests/test_lora.py index f14323b38a..079d900d0b 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -674,7 +674,7 @@ def test_against_original_gemma_2(model_name): assert x.size(1) == T ours_y = ours_model(x) theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float - torch.testing.assert_close(ours_y, theirs_y) + torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5) @RunIf(min_cuda_gpus=1)
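
A note on the QKV packing used throughout this series: `copy_weights_qwen_2_5` in convert_hf_checkpoint.py folds the separate `q_proj`/`k_proj`/`v_proj` tensors into litgpt's single fused `attn.attn` parameter, interleaving one block of query rows with the matching key and value rows for each kv group. A minimal standalone sketch of that split/zip/cat scheme with toy sizes (the variable names and dimensions below are illustrative, not taken from the patch):

import torch

# Toy dimensions standing in for config.n_head, config.n_query_groups, etc.
n_head, n_query_groups, head_size, n_embd = 4, 2, 8, 32
q_per_kv = n_head // n_query_groups  # query heads served by each kv group

q = torch.randn(n_head * head_size, n_embd)
k = torch.randn(n_query_groups * head_size, n_embd)
v = torch.randn(n_query_groups * head_size, n_embd)

# Same scheme as copy_weights_qwen_2_5: one query chunk per kv group, then
# interleave [q_group0, k0, v0, q_group1, k1, v1, ...] along dim 0.
qs = torch.split(q, head_size * q_per_kv)
ks = torch.split(k, head_size)
vs = torch.split(v, head_size)
fused = torch.cat([t for group in zip(qs, ks, vs) for t in group])
assert fused.shape == ((n_head + 2 * n_query_groups) * head_size, n_embd)

The existing `qkv_split` helper in convert_lit_checkpoint.py undoes this packing when exporting back to the Hugging Face layout, which is why `copy_weights_qwen_2_5` there can emit separate q/k/v projections again.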
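
End to end, the series adds the Qwen2.5 and Qwen2.5 Coder configs, the `attn_bias` flag (bias on the fused QKV projection only, matching Qwen2's bias on q/k/v but not on `o_proj`), checkpoint conversion in both directions, the ChatML-style prompt template, and tests. A quick usage sketch, assuming a litgpt checkout with all of these patches applied (the model name and prompt below are arbitrary examples):

from litgpt import Config
from litgpt.prompts import model_name_to_prompt_style

# attn_bias puts a bias only on the fused QKV projection; everything else stays bias-free
cfg = Config.from_name("Qwen2.5-7B-Instruct")
assert cfg.attn_bias and not cfg.bias
assert cfg.n_head // cfg.n_query_groups == 7  # GQA: 28 query heads share 4 kv groups

# The Qwen2_5 prompt style wraps requests in the <|im_start|>/<|im_end|> template
# with the default Qwen system message.
style = model_name_to_prompt_style("Qwen2.5-7B-Instruct")
print(style.apply("Explain grouped-query attention in one sentence."))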