diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py
index a8aa51b14..0c702d658 100644
--- a/lmdeploy/turbomind/deploy/source_model/llama.py
+++ b/lmdeploy/turbomind/deploy/source_model/llama.py
@@ -153,6 +153,7 @@ def model_info(self):
             max_position_embeddings = int(
                 model_arg.get('max_position_embeddings', 0))
             rope_scaling = model_arg.get('rope_scaling', None)
+            head_dim = model_arg.get('head_dim', hidden_units // attn_head_num)
             scaling_factor = 0.0
             use_dynamic_ntk = 0
             scaling_type = ''
@@ -189,7 +190,7 @@ def model_info(self):
                     beta_slow = rope_scaling.get('beta_slow', 1.0)
 
         return dict(
-            size_per_head=hidden_units // attn_head_num,
+            size_per_head=head_dim,
             rotary_embedding=hidden_units // attn_head_num,
             num_layer=num_layer,
             norm_eps=norm_eps,