diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index dea109cb17f58..084607c155cb0 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -315,6 +315,9 @@ Multimodal Language Models
 
 .. _supported_vlms:
 
+Text Generation
+---------------
+
 .. list-table::
   :widths: 25 25 25 25 5 5
   :header-rows: 1
@@ -384,7 +387,13 @@ Multimodal Language Models
     - Image
     - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc.
     -
+    -
+  * - :code:`NVLM_D_Model`
+    - NVLM-D 1.0
+    - Image\ :sup:`E+`
+    - :code:`nvidia/NVLM-D-72B`, etc.
     -
+    - ✅︎
   * - :code:`PaliGemmaForConditionalGeneration`
     - PaliGemma
     - Image\ :sup:`E`
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index b94ef537d783f..efad7e33793df 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -18,7 +18,7 @@
 
 
 # LLaVA-1.5
-def run_llava(question, modality):
+def run_llava(question: str, modality: str):
     assert modality == "image"
 
     prompt = f"USER: <image>\n{question}\nASSISTANT:"
@@ -29,7 +29,7 @@ def run_llava(question, modality):
 
 
 # LLaVA-1.6/LLaVA-NeXT
-def run_llava_next(question, modality):
+def run_llava_next(question: str, modality: str):
     assert modality == "image"
 
     prompt = f"[INST] <image>\n{question} [/INST]"
@@ -40,7 +40,7 @@ def run_llava_next(question, modality):
 
 # LlaVA-NeXT-Video
 # Currently only support for video input
-def run_llava_next_video(question, modality):
+def run_llava_next_video(question: str, modality: str):
     assert modality == "video"
 
     prompt = f"USER: