From d78789ac16870809d64378105f200049cae95112 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Fri, 30 Aug 2024 03:54:49 +0800
Subject: [PATCH] [Bugfix] Fix incorrect vocab embedding shards for GGUF model in tensor parallelism (#7954)

---
 vllm/model_executor/layers/vocab_parallel_embedding.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 3ba15573c217b..b26a3227e6931 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -351,7 +351,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
             param.weight_type = loaded_weight.item()
             return
         elif isinstance(param, UninitializedParameter):
-            param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
+            shape = list(loaded_weight.shape)
+            if output_dim is not None:
+                shape[output_dim] = shape[output_dim] // self.tp_size
+            param.materialize(tuple(shape), dtype=loaded_weight.dtype)

         # If parameter does not have output dim, then it should
         # be copied onto all gpus (e.g. g_idx for act_order gptq).
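
A minimal sketch (not part of the patch) of the sharding arithmetic the fix restores: under tensor parallelism, each rank owns only vocab_size // tp_size rows of the embedding, so an UninitializedParameter must be materialized at the shard shape rather than at the full checkpoint shape. The tensor sizes and tp_size below are illustrative assumptions, not values taken from vLLM.

import torch

tp_size = 4                        # assumed number of tensor-parallel ranks
vocab_size, hidden_size = 32000, 4096
loaded_weight = torch.empty(vocab_size, hidden_size)  # stands in for the full GGUF tensor
output_dim = 0                     # the vocab dimension is the sharded one

# Before the fix: every rank materialized the full-vocab shape, even though
# the loader copies only a 1/tp_size slice of rows into the parameter.
wrong_shape = tuple(loaded_weight.shape)   # (32000, 4096) on every rank

# After the fix: each rank materializes only its shard along output_dim.
shape = list(loaded_weight.shape)
if output_dim is not None:
    shape[output_dim] = shape[output_dim] // tp_size
right_shape = tuple(shape)                 # (8000, 4096) per rank

print(wrong_shape, right_shape)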