diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 6e5c4826da3d3..76c04ce66fc2e 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1037,9 +1037,17 @@ def load_model(self) -> None:
             assert supports_lora(
                 self.model
             ), f"{self.model.__class__.__name__} does not support LoRA yet."
+
             if supports_multimodal(self.model):
                 logger.warning("Regarding multimodal models, vLLM currently "
                                "only supports adding LoRA to language model.")
+            # It's necessary to distinguish between the max_position_embeddings
+            # of VLMs and LLMs.
+            if hasattr(self.model.config, "max_position_embeddings"):
+                max_pos_embeddings = self.model.config.max_position_embeddings
+            else:
+                max_pos_embeddings = (
+                    self.model.config.text_config.max_position_embeddings)
 
             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
@@ -1049,8 +1057,7 @@ def load_model(self) -> None:
                 self.device,
                 self.model.embedding_modules,
                 self.model.embedding_padding_modules,
-                max_position_embeddings=self.model.config.
-                max_position_embeddings,
+                max_position_embeddings=max_pos_embeddings,
             )
             self.model = self.lora_manager.create_lora_manager(self.model)
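
For context, the fallback added in this patch covers multimodal configs where max_position_embeddings lives on the nested text_config (as in LLaVA-style HF configs) rather than on the top-level config. Below is a minimal standalone sketch of the same branch order, using SimpleNamespace stand-ins rather than vLLM's or Transformers' real config classes; the values 4096 and 32768 are illustrative only.

# Illustrative sketch, not part of the patch: reproduces the hasattr fallback
# with fake config objects.
from types import SimpleNamespace

# A plain LLM config exposes max_position_embeddings at the top level.
llm_config = SimpleNamespace(max_position_embeddings=4096)
# A VLM-style config nests it under text_config.
vlm_config = SimpleNamespace(
    text_config=SimpleNamespace(max_position_embeddings=32768))

def resolve_max_pos_embeddings(config):
    # Same branch order as the patch: prefer the top-level attribute,
    # otherwise fall back to the language model's text_config.
    if hasattr(config, "max_position_embeddings"):
        return config.max_position_embeddings
    return config.text_config.max_position_embeddings

assert resolve_max_pos_embeddings(llm_config) == 4096
assert resolve_max_pos_embeddings(vlm_config) == 32768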