From 151ef4efd2fb52554f4d30408aca619e181ea751 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 7 Oct 2024 19:55:12 +0800 Subject: [PATCH] [Model] Support NVLM-D and fix QK Norm in InternViT (#9045) Co-authored-by: Roger Wang Co-authored-by: Isotr0py --- docs/source/models/supported_models.rst | 9 + examples/offline_inference_vision_language.py | 55 +++- ...e_inference_vision_language_multi_image.py | 34 ++ vllm/entrypoints/chat_utils.py | 2 +- vllm/model_executor/layers/layernorm.py | 32 +- vllm/model_executor/models/intern_vit.py | 206 +++++++----- vllm/model_executor/models/internvl.py | 294 +++++++++++------- vllm/model_executor/models/nvlm_d.py | 64 ++++ vllm/model_executor/models/registry.py | 37 +-- vllm/transformers_utils/config.py | 7 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/nvlm_d.py | 12 + 12 files changed, 518 insertions(+), 236 deletions(-) create mode 100644 vllm/model_executor/models/nvlm_d.py create mode 100644 vllm/transformers_utils/configs/nvlm_d.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index dea109cb17f58..084607c155cb0 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -315,6 +315,9 @@ Multimodal Language Models .. _supported_vlms: +Text Generation +--------------- + .. list-table:: :widths: 25 25 25 25 5 5 :header-rows: 1 @@ -384,7 +387,13 @@ Multimodal Language Models - Image - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. - + - + * - :code:`NVLM_D_Model` + - NVLM-D 1.0 + - Image\ :sup:`E+` + - :code:`nvidia/NVLM-D-72B`, etc. - + - ✅︎ * - :code:`PaliGemmaForConditionalGeneration` - PaliGemma - Image\ :sup:`E` diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index b94ef537d783f..efad7e33793df 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -18,7 +18,7 @@ # LLaVA-1.5 -def run_llava(question, modality): +def run_llava(question: str, modality: str): assert modality == "image" prompt = f"USER: \n{question}\nASSISTANT:" @@ -29,7 +29,7 @@ def run_llava(question, modality): # LLaVA-1.6/LLaVA-NeXT -def run_llava_next(question, modality): +def run_llava_next(question: str, modality: str): assert modality == "image" prompt = f"[INST] \n{question} [/INST]" @@ -40,7 +40,7 @@ def run_llava_next(question, modality): # LlaVA-NeXT-Video # Currently only support for video input -def run_llava_next_video(question, modality): +def run_llava_next_video(question: str, modality: str): assert modality == "video" prompt = f"USER: