diff --git a/README.md b/README.md
index 6092db7ad8..87389e10fb 100644
--- a/README.md
+++ b/README.md
@@ -134,6 +134,8 @@ Every model is written from scratch to maximize performance and remove layers of
 | Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
 | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
 | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
+| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
+| Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui et al. 2024](https://arxiv.org/abs/2409.12186) |
 | StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) |
 | StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) |
 | StableLM Zephyr | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) |
diff --git a/litgpt/adapter_v2.py b/litgpt/adapter_v2.py
index f5e6069343..1ad3d40b9d 100644
--- a/litgpt/adapter_v2.py
+++ b/litgpt/adapter_v2.py
@@ -163,7 +163,7 @@ def __init__(self, config: Config, block_idx: int) -> None:
         nn.Module.__init__(self)
         shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
         # key, query, value projections for all heads, but in a batch
-        self.attn = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias)
+        self.attn = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias or config.attn_bias)
         # output projection
         # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head`
         self.proj = AdapterV2Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias)
diff --git a/litgpt/config.py b/litgpt/config.py
index b218df849c..5884433372 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -31,6 +31,7 @@ class Config:
     parallel_residual: bool = True
     bias: bool = True
     lm_head_bias: bool = False
+    attn_bias: bool = False
     # to use multi-head attention (MHA), set this to `n_head` (default)
     # to use multi-query attention (MQA), set this to 1
     # to use grouped-query attention (GQA), set this to a value in between
@@ -1704,4 +1705,295 @@ def norm_class(self) -> Type:
 configs.extend(llama_2_function_calling)
 
 
+##########
+# Qwen2.5
+##########
+qwen_2_5 = [
+    # https://huggingface.co/Qwen/Qwen2.5-0.5B/blob/main/config.json
+    dict(
+        name="Qwen2.5-0.5B{}",
+        hf_config=dict(org="Qwen", name="Qwen2.5-0.5B{}"),
+        block_size=32768,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=24,
+        n_head=14,
+        n_embd=896,
+        n_query_groups=2,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=4864,
+        norm_eps=1e-6,
+        rope_base=1000000
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-1.5B/blob/main/config.json
+    dict(
+        name="Qwen2.5-1.5B{}",
+        hf_config=dict(org="Qwen", name="Qwen2.5-1.5B{}"),
+        block_size=131072,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=28,
+        n_head=12,
+        n_embd=1536,
+        n_query_groups=2,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=8960,
+        norm_eps=1e-6,
+        rope_base=1000000
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-3B/blob/main/config.json
+    dict(
+        name="Qwen2.5-3B{}",
+        hf_config=dict(org="Qwen", name="Qwen2.5-3B{}"),
+        block_size=32768,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=36,
+        n_head=16,
+        n_embd=2048,
+        n_query_groups=2,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=11008,
+        norm_eps=1e-6,
+        rope_base=1000000
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-7B/blob/main/config.json
+    dict(
+        name="Qwen2.5-7B{}",
+        hf_config=dict(org="Qwen", name="Qwen2.5-7B{}"),
+        block_size=131072,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=28,
+        n_head=28,
+        n_embd=3584,
+        n_query_groups=4,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=18944,
+        norm_eps=1e-6,
+        rope_base=1000000
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-14B/blob/main/config.json
+    dict(
+        name="Qwen2.5-14B{}",
+        hf_config=dict(org="Qwen", name="Qwen2.5-14B{}"),
+        block_size=131072,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=48,
+        n_head=40,
+        n_embd=5120,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=13824,
+        norm_eps=1e-5,
+        rope_base=1000000
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-32B/blob/main/config.json
+    dict(
+        name="Qwen2.5-32B{}",
+        hf_config=dict(org="Qwen", name="Qwen2.5-32B{}"),
+        block_size=131072,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=64,
+        n_head=40,
+        n_embd=5120,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=27648,
+        norm_eps=1e-5,
+        rope_base=1000000
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-72B/blob/main/config.json
+    dict(
+        name="Qwen2.5-72B{}",
+        hf_config=dict(org="Qwen", name="Qwen2.5-72B{}"),
+        block_size=131072,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=80,
+        n_head=64,
+        n_embd=8192,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=29568,
+        norm_eps=1e-5,
+        rope_base=1000000
+    ),
+]
+
+qwen_2_5_coder = [
+    # https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B/blob/main/config.json
+    dict(
+        name="Qwen2.5-Coder-0.5B{}",
+        hf_config=dict(org="Qwen", name="Qwen2.5-Coder-0.5B{}"),
+        block_size=32768,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=24,
+        n_head=14,
+        n_embd=896,
+        n_query_groups=2,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=4864,
+        norm_eps=1e-6,
+        rope_base=1000000
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B/blob/main/config.json
+    dict(
+        name="Qwen2.5-Coder-1.5B{}",
+        hf_config=dict(org="Qwen", name="Qwen2.5-Coder-1.5B{}"),
+        block_size=131072,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=28,
+        n_head=12,
+        n_embd=1536,
+        n_query_groups=2,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=8960,
+        norm_eps=1e-6,
+        rope_base=1000000
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-Coder-3B/blob/main/config.json
+    dict(
+        name="Qwen2.5-Coder-3B{}",
+        hf_config=dict(org="Qwen", name="Qwen2.5-Coder-3B{}"),
+        block_size=32768,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=36,
+        n_head=16,
+        n_embd=2048,
+        n_query_groups=2,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=11008,
+        norm_eps=1e-6,
+        rope_base=1000000
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-Coder-7B/blob/main/config.json
+    dict(
+        name="Qwen2.5-Coder-7B{}",
+        hf_config=dict(org="Qwen", name="Qwen2.5-Coder-7B{}"),
+        block_size=131072,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=28,
+        n_head=28,
+        n_embd=3584,
+        n_query_groups=4,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=18944,
+        norm_eps=1e-6,
+        rope_base=1000000
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-Coder-14B/blob/main/config.json
+    dict(
+        name="Qwen2.5-Coder-14B{}",
+        hf_config=dict(org="Qwen", name="Qwen2.5-Coder-14B{}"),
+        block_size=131072,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=48,
+        n_head=40,
+        n_embd=5120,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=13824,
+        norm_eps=1e-5,
+        rope_base=1000000
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-Coder-32B/blob/main/config.json
+    dict(
+        name="Qwen2.5-Coder-32B{}",
+        hf_config=dict(org="Qwen", name="Qwen2.5-Coder-32B{}"),
+        block_size=131072,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=64,
+        n_head=40,
+        n_embd=5120,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=27648,
+        norm_eps=1e-5,
+        rope_base=1000000
+    ),
+]
+
+qwen_2_5.extend(qwen_2_5_coder)
+
+for c in qwen_2_5:
+    for kind in ("", "-Instruct"):
+        copy = deepcopy(c)
+        copy["name"] = c["name"].format(kind)
+        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+        configs.append(copy)
+
+
 name_to_config = {config["name"]: config for config in configs}
diff --git a/litgpt/lora.py b/litgpt/lora.py
index a540961837..18a472337b 100644
--- a/litgpt/lora.py
+++ b/litgpt/lora.py
@@ -609,7 +609,7 @@ def __init__(self, config: Config, block_idx: int) -> None:
             lora_alpha=config.lora_alpha,
             lora_dropout=config.lora_dropout,
             enable_lora=(config.lora_query, config.lora_key, config.lora_value),
-            bias=config.bias,
+            bias=config.bias or config.attn_bias,
             # for MQA/GQA support
             head_size=config.head_size,
             n_head=config.n_head,
diff --git a/litgpt/model.py b/litgpt/model.py
index b60b0506b6..17b3b4ab04 100644
--- a/litgpt/model.py
+++ b/litgpt/model.py
@@ -244,7 +244,7 @@ def __init__(self, config: Config, block_idx: int) -> None:
         super().__init__()
         shape = (config.n_head + 2 * config.n_query_groups) * config.head_size
         # key, query, value projections for all heads, but in a batch
-        self.attn = nn.Linear(config.n_embd, shape, bias=config.bias)
+        self.attn = nn.Linear(config.n_embd, shape, bias=config.bias or config.attn_bias)
         # output projection
         # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head`
         self.proj = nn.Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias)
diff --git a/litgpt/prompts.py b/litgpt/prompts.py
index 09fb86676c..e11c0e9583 100644
--- a/litgpt/prompts.py
+++ b/litgpt/prompts.py
@@ -279,6 +279,12 @@ def apply(self, prompt: str, **kwargs: str) -> str:
 class OLMo(PromptStyle):
     def apply(self, prompt: str, **kwargs: str) -> str:
         return f"<|endoftext|><|user|>\n{prompt}\n<|assistant|>\n"
+
+
+class Qwen2_5(PromptStyle):
+    def apply(self, prompt: str, **kwargs: str) -> str:
+        system_message = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
+        return f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
 
 
 # Maps prompt style names to PromptStyle classes
@@ -304,6 +310,7 @@ def apply(self, prompt: str, **kwargs: str) -> str:
     "gemma": Gemma,
     "llama3": Llama3,
     "olmo": OLMo,
+    "qwen2.5": Qwen2_5,
 }
@@ -342,6 +349,8 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle:
         return Gemma()
     if re.search(r"OLMo.*-hf", model_name):
         return OLMo()
+    if re.search(r"Qwen2\.5-.*", model_name):
+        return Qwen2_5()
     return Default()
diff --git a/litgpt/scripts/convert_hf_checkpoint.py b/litgpt/scripts/convert_hf_checkpoint.py
index 0fbb99bd49..7ac7254d60 100644
--- a/litgpt/scripts/convert_hf_checkpoint.py
+++ b/litgpt/scripts/convert_hf_checkpoint.py
@@ -407,6 +407,81 @@ def copy_weights_phi(
         if progress_per_file is not None:
             pbar.update(progress_per_file)
 
+
+def copy_weights_qwen_2_5(
+    config: Config,
+    qkv_weights: Dict[int, List[Optional[NotYetLoadedTensor]]],
+    state_dict: Dict[str, torch.Tensor],
+    hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
+    saver: Optional[incremental_save] = None,
+    dtype: Optional[torch.dtype] = None,
+    pbar: Optional[tqdm] = None,
+    progress_per_file: Optional[float] = None,
+    debug_mode: Optional[bool] = False
+) -> None:
+    weight_map = {
+        "model.embed_tokens.weight": "transformer.wte.weight",
+        "model.layers.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight",
+        "model.layers.{}.self_attn.q_proj.weight": None,
+        "model.layers.{}.self_attn.k_proj.weight": None,
+        "model.layers.{}.self_attn.v_proj.weight": None,
+        "model.layers.{}.self_attn.q_proj.bias": None,
+        "model.layers.{}.self_attn.k_proj.bias": None,
+        "model.layers.{}.self_attn.v_proj.bias": None,
+        "model.layers.{}.self_attn.o_proj.weight": "transformer.h.{}.attn.proj.weight",
+        "model.layers.{}.post_attention_layernorm.weight": "transformer.h.{}.norm_2.weight",
+        "model.layers.{}.mlp.gate_proj.weight": "transformer.h.{}.mlp.fc_1.weight",
+        "model.layers.{}.mlp.up_proj.weight": "transformer.h.{}.mlp.fc_2.weight",
+        "model.layers.{}.mlp.down_proj.weight": "transformer.h.{}.mlp.proj.weight",
+        "model.norm.weight": "transformer.ln_f.weight",
+        "lm_head.weight": "lm_head.weight",
+    }
+
+    if progress_per_file is not None:
+        progress_per_file = progress_per_file / max(1, len(hf_weights) + len(qkv_weights))
+
+    for name, param in hf_weights.items():
+        if "model.layers" in name:
+            from_name, l = layer_template(name, 2)
+            qkv = qkv_weights.setdefault(l, defaultdict(dict))
+            if any(w in from_name for w in ("q_proj", "k_proj", "v_proj")):
+                weight_name, weight_type = from_name.split(".")[-2:]
+                qkv[weight_type][weight_name] = param
+            to_name = weight_map[from_name]
+            if to_name is None:
+                continue
+            to_name = to_name.format(l)
+        else:
+            to_name = weight_map[name]
+        param = load_param(param, name, dtype, verbose=debug_mode)
+        if saver is not None:
+            param = saver.store_early(param)
+        state_dict[to_name] = param
+
+        if progress_per_file is not None:
+            pbar.update(progress_per_file)
+
+    if "lm_head.weight" not in state_dict:
+        state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]
+
+    for i in list(qkv_weights):
+        for weight_type in list(qkv_weights[i]):
+            qkv = qkv_weights[i][weight_type]
+            if len(qkv) != 3:
+                # split across different .bin files
+                continue
+            q = load_param(qkv["q_proj"], f"layer {i} q {weight_type}", dtype, verbose=debug_mode)
+            k = load_param(qkv["k_proj"], f"layer {i} k {weight_type}", dtype, verbose=debug_mode)
+            v = load_param(qkv["v_proj"], f"layer {i} v {weight_type}", dtype, verbose=debug_mode)
+            q_per_kv = config.n_head // config.n_query_groups
+            qs = torch.split(q, config.head_size * q_per_kv)
+            ks = torch.split(k, config.head_size)
+            vs = torch.split(v, config.head_size)
+            cycled = [t for group in zip(qs, ks, vs) for t in group]
+            qkv = torch.cat(cycled)
+            state_dict[f"transformer.h.{i}.attn.attn.{weight_type}"] = qkv
+            del qkv_weights[i][weight_type]
+            if progress_per_file is not None:
+                pbar.update(progress_per_file)
+
 
 def qkv_reassemble(param: Union[torch.Tensor, NotYetLoadedTensor], config: Config) -> torch.Tensor:
     """Reassemble from a normal to an interleaved placement in a QKV matrix.
@@ -487,6 +562,10 @@ def convert_hf_checkpoint(
         # holder to reconstitute the split q, k, v
         qkv_weights = {}
         copy_fn = partial(copy_weights_phi, config, qkv_weights)
+    elif model_name.lower().startswith("qwen2.5"):
+        # holder to reconstitute the split q, k, v
+        qkv_weights = {}
+        copy_fn = partial(copy_weights_qwen_2_5, config, qkv_weights)
     elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"):
         # holder to reconstitute the split q, k, v
         qkv_weights = {}
diff --git a/litgpt/scripts/convert_lit_checkpoint.py b/litgpt/scripts/convert_lit_checkpoint.py
index 3fce585538..e6bdf78abc 100644
--- a/litgpt/scripts/convert_lit_checkpoint.py
+++ b/litgpt/scripts/convert_lit_checkpoint.py
@@ -298,6 +298,52 @@ def copy_weights_phi(
         state_dict[layer_name] = weight
         del gate_up_proj_weights[i]
 
+
+def copy_weights_qwen_2_5(
+    config: Config,
+    state_dict: Dict[str, torch.Tensor],
+    lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
+    untie_weights: bool = False,
+    saver: Optional[incremental_save] = None,
+) -> None:
+    weight_map = {
+        "transformer.wte.weight": "model.embed_tokens.weight",
+        "transformer.h.{}.norm_1.weight": "model.layers.{}.input_layernorm.weight",
+        "transformer.h.{}.norm_2.weight": "model.layers.{}.post_attention_layernorm.weight",
+        "transformer.h.{}.attn.proj.weight": "model.layers.{}.self_attn.o_proj.weight",
+        "transformer.h.{}.mlp.fc_1.weight": "model.layers.{}.mlp.gate_proj.weight",
+        "transformer.h.{}.mlp.fc_2.weight": "model.layers.{}.mlp.up_proj.weight",
+        "transformer.h.{}.mlp.proj.weight": "model.layers.{}.mlp.down_proj.weight",
+        "transformer.ln_f.weight": "model.norm.weight",
+        "lm_head.weight": "lm_head.weight",
+    }
+
+    for name, param in lit_weights.items():
+        if name == "lm_head.weight" and untie_weights:
+            continue
+        if name.endswith((".attn.attn.weight", ".attn.attn.bias")):
+            from_name, l_idx = layer_template(name, 2)
+            qkv = load_param(param, name, None)
+            qp, kp, vp = qkv_split(qkv, config)
+
+            weight_type = name.split(".")[-1]  # weight or bias
+            q = f"model.layers.{l_idx}.self_attn.q_proj.{weight_type}"
+            k = f"model.layers.{l_idx}.self_attn.k_proj.{weight_type}"
+            v = f"model.layers.{l_idx}.self_attn.v_proj.{weight_type}"
+            for to_name, param in zip((q, k, v), (qp, kp, vp)):
+                if saver is not None:
+                    param = saver.store_early(param)
+                state_dict[to_name] = param
+        else:
+            if "transformer.h" in name:
+                from_name, l_idx = layer_template(name, 2)
+                to_name = weight_map[from_name]
+                to_name = to_name.format(l_idx)
+            else:
+                to_name = weight_map[name]
+            param = load_param(param, name, None)
+            if saver is not None:
+                param = saver.store_early(param)
+            state_dict[to_name] = param
+
 
 def qkv_split(
     param: Union[torch.Tensor, NotYetLoadedTensor], config: Config
@@ -341,6 +387,8 @@ def convert_lit_checkpoint(checkpoint_dir: Path, output_dir: Path) -> None:
         copy_fn = partial(copy_weights_gemma_2, config)
     elif config.name.lower().startswith("phi"):
         copy_fn = partial(copy_weights_phi, config)
+    elif config.name.lower().startswith("qwen2.5"):
+        copy_fn = partial(copy_weights_qwen_2_5, config)
     elif config.mlp_class_name in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"):
         untie_weights = "Gemma" in config.name
         copy_fn = partial(copy_weights_llama, config, untie_weights=untie_weights)
diff --git a/tests/test_convert_lit_checkpoint.py b/tests/test_convert_lit_checkpoint.py
index f2e0b48459..9f27b80d21 100644
--- a/tests/test_convert_lit_checkpoint.py
+++ b/tests/test_convert_lit_checkpoint.py
@@ -15,6 +15,7 @@
 from transformers.models.llama import LlamaConfig, LlamaForCausalLM
 from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM
 from transformers.models.olmo import OlmoConfig, OlmoForCausalLM
+from transformers.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
 
 from litgpt import GPT, Config
 from litgpt.scripts.convert_lit_checkpoint import (
@@ -25,6 +26,7 @@
     copy_weights_gpt_neox,
     copy_weights_llama,
     copy_weights_phi,
+    copy_weights_qwen_2_5,
     qkv_split,
 )
 from tests.conftest import RunIf
@@ -520,6 +522,69 @@ def test_check_conversion_supported_lora():
     with pytest.raises(ValueError, match=r"LoRA.*cannot be converted"):
         check_conversion_supported(lit_weights=lit_weights)
 
+
+@torch.inference_mode()
+@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B"))
+@pytest.mark.parametrize(
+    ("device", "dtype"),
+    [
+        (torch.device("cpu"), torch.float32),
+        pytest.param(
+            torch.device("cuda"),
+            torch.float16,
+            marks=[
+                # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input
+                # is slightly different
+                pytest.mark.xfail(raises=AssertionError, strict=False),
+                RunIf(min_cuda_gpus=1),
+            ],
+        ),
+    ],
+)
+def test_against_original_qwen_2_5(model_name, device, dtype):
+    torch.set_default_dtype(dtype)
+
+    T = 20
+    ours_config = Config.from_name(
+        model_name,
+        block_size=T,
+        n_layer=2,
+        n_head=16,
+        n_embd=32,
+        intermediate_size=86,
+    )
+    theirs_config = Qwen2Config(
+        vocab_size=ours_config.padded_vocab_size,
+        hidden_size=ours_config.n_embd,
+        head_dim=ours_config.head_size,
+        num_attention_heads=ours_config.n_head,
+        num_hidden_layers=ours_config.n_layer,
+        intermediate_size=ours_config.intermediate_size,
+        max_position_embeddings=ours_config.block_size,
+        rms_norm_eps=ours_config.norm_eps,
+        num_key_value_heads=ours_config.n_query_groups,
+        rope_theta=ours_config.rope_base,
+        attention_bias=ours_config.attn_bias,
+        tie_word_embeddings=True,
+    )
+
+    assert ours_config.intermediate_size == theirs_config.intermediate_size
+
+    ours_model = GPT(ours_config).to(device)
+    # tie weights
+    ours_model.lm_head.weight = ours_model.transformer.wte.weight
+    ours_state_dict = ours_model.state_dict()
+    theirs_state_dict = {}
+    copy_weights_qwen_2_5(ours_config, theirs_state_dict, ours_state_dict, untie_weights=True)
+    theirs_model = Qwen2ForCausalLM(theirs_config).to(device)
+    keys = theirs_model.load_state_dict(theirs_state_dict, strict=False)
+    assert not keys.unexpected_keys
+
+    # test end to end
+    x = torch.randint(low=0, high=ours_config.padded_vocab_size, size=(T,), device=device).unsqueeze(0)
+    assert x.size(1) == T
+    ours_y = ours_model(x)
+    theirs_y = theirs_model(x)["logits"].to(dtype)  # HF converts logits to float
+    torch.testing.assert_close(ours_y, theirs_y)
+
 
 def test_qkv_split():
     # MHA
diff --git a/tests/test_lora.py b/tests/test_lora.py
index f14323b38a..079d900d0b 100644
--- a/tests/test_lora.py
+++ b/tests/test_lora.py
@@ -674,7 +674,7 @@ def test_against_original_gemma_2(model_name):
     assert x.size(1) == T
     ours_y = ours_model(x)
     theirs_y = theirs_model(x)["logits"].to(dtype)  # HF converts logits to float
-    torch.testing.assert_close(ours_y, theirs_y)
+    torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5)
 
 
 @RunIf(min_cuda_gpus=1)
diff --git a/tests/test_model.py b/tests/test_model.py
index f2ec330f14..1a9a94efd5 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -28,6 +28,7 @@
 from transformers.models.mistral import MistralConfig, MistralForCausalLM
 from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM
 from transformers.models.olmo import OlmoConfig, OlmoForCausalLM
+from transformers.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
 
 import litgpt.config as config_module
 from litgpt.model import batched_index_copy_
@@ -38,6 +39,7 @@
     copy_weights_gpt_neox,
     copy_weights_hf_llama,
     copy_weights_phi,
+    copy_weights_qwen_2_5,
 )
 from tests.conftest import RunIf
 
@@ -787,6 +789,67 @@ def test_against_original_gemma_2(model_name, device, dtype):
     torch.testing.assert_close(ours_y, theirs_y, rtol=3e-5, atol=3e-5)
 
 
+@torch.inference_mode()
+@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B"))
+@pytest.mark.parametrize(
+    ("device", "dtype"),
+    [
+        (torch.device("cpu"), torch.float32),
+        pytest.param(
+            torch.device("cuda"),
+            torch.float16,
+            marks=[
+                # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input
+                # is slightly different
+                pytest.mark.xfail(raises=AssertionError, strict=False),
+                RunIf(min_cuda_gpus=1),
+            ],
+        ),
+    ],
+)
+def test_against_original_qwen_2_5(model_name, device, dtype):
+    torch.set_default_dtype(dtype)
+
+    T = 20
+    ours_config = Config.from_name(
+        model_name,
+        block_size=T,
+        n_layer=2,
+        n_head=16,
+        n_embd=32,
+        intermediate_size=86,
+    )
+    theirs_config = Qwen2Config(
+        vocab_size=ours_config.padded_vocab_size,
+        hidden_size=ours_config.n_embd,
+        head_dim=ours_config.head_size,
+        num_attention_heads=ours_config.n_head,
+        num_hidden_layers=ours_config.n_layer,
+        intermediate_size=ours_config.intermediate_size,
+        max_position_embeddings=ours_config.block_size,
+        rms_norm_eps=ours_config.norm_eps,
+        num_key_value_heads=ours_config.n_query_groups,
+        rope_theta=ours_config.rope_base,
+        attention_bias=ours_config.attn_bias,
+        tie_word_embeddings=True,
+    )
+
+    theirs_model = Qwen2ForCausalLM(theirs_config).to(device)
+    theirs_state_dict = theirs_model.state_dict()
+    # Qwen2.5 ties the embeddings, so `lm_head.weight` is not shipped separately
+    theirs_state_dict.pop("lm_head.weight")
+    state_dict = {}
+    copy_weights_qwen_2_5(ours_config, {}, state_dict, theirs_state_dict)
+    ours_model = GPT(ours_config).to(device)
+    ours_model.load_state_dict(state_dict)
+
+    # test end to end
+    x = torch.randint(low=0, high=ours_config.padded_vocab_size, size=(T,), device=device).unsqueeze(0)
+    assert x.size(1) == T
+    ours_y = ours_model(x)
+    theirs_y = theirs_model(x)["logits"].to(dtype)  # HF converts logits to float
+    torch.testing.assert_close(ours_y, theirs_y)
+
+
 @RunIf(dynamo=True)
 @torch.inference_mode()
 def test_model_compile():
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index a823eb71cd..2b30c916e9 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -56,7 +56,7 @@ def test_tokenizer_against_hf(config):
     else:
         assert ours.vocab_size == config.vocab_size
 
-    if config.name.startswith("falcon") or config.name.startswith("stablecode"):
+    if config.name.startswith(("falcon", "stablecode", "Qwen2.5")):
         # even though their config defines it, it's set as None in HF
         assert isinstance(ours.bos_id, int)
         assert theirs.bos_token_id is None
diff --git a/tutorials/download_model_weights.md b/tutorials/download_model_weights.md
index 9ab0041357..50c6924f63 100644
--- a/tutorials/download_model_weights.md
+++ b/tutorials/download_model_weights.md
@@ -33,6 +33,8 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Phi 3 & 3.5 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
 Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
 | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
+| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
+| Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui et al. 2024](https://arxiv.org/abs/2409.12186) |
 | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
 | StableCode | 3B | Stability AI | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) |
 | StableLM | 3B, 7B | Stability AI | [Stability AI 2023](https://github.com/Stability-AI/StableLM) |
@@ -161,6 +163,32 @@ nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
 openlm-research/open_llama_13b
 openlm-research/open_llama_3b
 openlm-research/open_llama_7b
+Qwen/Qwen2.5-0.5B
+Qwen/Qwen2.5-0.5B-Instruct
+Qwen/Qwen2.5-1.5B
+Qwen/Qwen2.5-1.5B-Instruct
+Qwen/Qwen2.5-3B
+Qwen/Qwen2.5-3B-Instruct
+Qwen/Qwen2.5-7B
+Qwen/Qwen2.5-7B-Instruct
+Qwen/Qwen2.5-14B
+Qwen/Qwen2.5-14B-Instruct
+Qwen/Qwen2.5-32B
+Qwen/Qwen2.5-32B-Instruct
+Qwen/Qwen2.5-72B
+Qwen/Qwen2.5-72B-Instruct
+Qwen/Qwen2.5-Coder-0.5B
+Qwen/Qwen2.5-Coder-0.5B-Instruct
+Qwen/Qwen2.5-Coder-1.5B
+Qwen/Qwen2.5-Coder-1.5B-Instruct
+Qwen/Qwen2.5-Coder-3B
+Qwen/Qwen2.5-Coder-3B-Instruct
+Qwen/Qwen2.5-Coder-7B
+Qwen/Qwen2.5-Coder-7B-Instruct
+Qwen/Qwen2.5-Coder-14B
+Qwen/Qwen2.5-Coder-14B-Instruct
+Qwen/Qwen2.5-Coder-32B
+Qwen/Qwen2.5-Coder-32B-Instruct
 stabilityai/FreeWilly2
 stabilityai/stable-code-3b
 stabilityai/stablecode-completion-alpha-3b
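
The fused QKV layout that `copy_weights_qwen_2_5` builds in the conversion script above is easiest to see on a toy example. The snippet below is a minimal, standalone sketch (not part of the diff; all dimensions are made-up toy values) of how the separate Hugging Face `q_proj`/`k_proj`/`v_proj` tensors are split per key-value group and interleaved into the single `transformer.h.{i}.attn.attn` weight:

```python
# Toy illustration of the q/k/v interleaving used for the fused attention weight.
# Assumed values for illustration only: 4 query heads, 2 KV groups (GQA), head_size 3, n_embd 6.
import torch

n_head, n_query_groups, head_size, n_embd = 4, 2, 3, 6
q_per_kv = n_head // n_query_groups

# Per-projection weights as they would appear in a Hugging Face checkpoint (random here).
q = torch.randn(n_head * head_size, n_embd)           # q_proj.weight
k = torch.randn(n_query_groups * head_size, n_embd)   # k_proj.weight
v = torch.randn(n_query_groups * head_size, n_embd)   # v_proj.weight

# Split along dim 0 into one chunk per KV group, then interleave (q-block, k, v) group by group.
qs = torch.split(q, head_size * q_per_kv)
ks = torch.split(k, head_size)
vs = torch.split(v, head_size)
fused = torch.cat([t for group in zip(qs, ks, vs) for t in group])

# Matches the `shape` computed in litgpt/model.py: (n_head + 2 * n_query_groups) * head_size
assert fused.shape == ((n_head + 2 * n_query_groups) * head_size, n_embd)
```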