diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py index a8aa51b14..0c702d658 100644 --- a/lmdeploy/turbomind/deploy/source_model/llama.py +++ b/lmdeploy/turbomind/deploy/source_model/llama.py @@ -153,6 +153,7 @@ def model_info(self): max_position_embeddings = int( model_arg.get('max_position_embeddings', 0)) rope_scaling = model_arg.get('rope_scaling', None) + head_dim = model_arg.get('head_dim', hidden_units // attn_head_num) scaling_factor = 0.0 use_dynamic_ntk = 0 scaling_type = '' @@ -189,7 +190,7 @@ def model_info(self): beta_slow = rope_scaling.get('beta_slow', 1.0) return dict( - size_per_head=hidden_units // attn_head_num, + size_per_head=head_dim, rotary_embedding=hidden_units // attn_head_num, num_layer=num_layer, norm_eps=norm_eps,