[Model] Added GLM-4 series hf format model support vllm==0.6.4 (#10561)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
1 parent 3ed5e73, commit 5fc5ce0
Showing 5 changed files with 30 additions and 1 deletion (only the diff for the new GLM-4 model definition is reproduced below).
New file (the GLM-4 model definition; in the vLLM tree this is vllm/model_executor/models/glm.py):

@@ -0,0 +1,21 @@
+"""Inference-only HF format GLM-4 model compatible with THUDM weights."""
+from vllm.config import VllmConfig
+from vllm.model_executor.models.llama import LlamaForCausalLM
+
+from .utils import PPMissingLayer
+
+
+class GlmForCausalLM(LlamaForCausalLM):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        # Hack Llama model to fit HF format GLM implementation
+        # Attention difference between GLM and Llama:
+        # 1. Half partial rotary_dim and no Neox style.
+        # 2. There is no bias for o_proj in attention
+        for layer in self.model.layers:
+            if not isinstance(layer, PPMissingLayer):
+                layer.self_attn.rotary_emb.rotary_dim //= 2
+                layer.self_attn.rotary_emb.is_neox_style = False
+                layer.self_attn.o_proj.bias = None
+                layer.self_attn.o_proj.skip_bias_add = True
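
For readers puzzled by "no Neox style" in the comments above, here is a minimal pure-PyTorch sketch (an illustration, not vLLM code) of the two rotate-half layouts that the `is_neox_style` flag toggles between:

import torch

def rotate_neox(x: torch.Tensor) -> torch.Tensor:
    # Neox layout: element i is paired with element i + d/2.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def rotate_interleaved(x: torch.Tensor) -> torch.Tensor:
    # Non-Neox (GPT-J / GLM) layout: adjacent elements (2i, 2i+1) are paired.
    x1, x2 = x[..., ::2], x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)

# Either helper is used the same way when applying RoPE:
#   x_rotated = x * cos + rotate(x) * sin
# GLM additionally rotates only half of each head's dimensions (hence
# `rotary_dim //= 2` in the patch above); the rest passes through unrotated.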
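
With this commit in place (vllm==0.6.4 per the title), an HF-format GLM-4 checkpoint should load through vLLM's usual offline API. A minimal usage sketch; the model ID below is an assumption, substitute whichever HF-format GLM-4 checkpoint you actually serve:

from vllm import LLM, SamplingParams

# Assumed model ID: any HF-format GLM-4 checkpoint that maps to
# GlmForCausalLM should load the same way.
llm = LLM(model="THUDM/glm-4-9b-chat-hf")
params = SamplingParams(temperature=0.7, max_tokens=64)

for output in llm.generate(["Briefly explain rotary position embeddings."], params):
    print(output.outputs[0].text)

Design note: rather than duplicating a full model definition, the commit subclasses LlamaForCausalLM and patches the handful of attention details where GLM diverges, which keeps the new file to 21 lines.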