From 342b52ec930789e01c21d4439a04647eec224aba Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 9 Oct 2024 14:17:55 +0200 Subject: [PATCH 01/53] Task specific OV diffusion pipelines with no mixins (#889) * added task specific pipelines * added task specific tests * style * new diffusers modeling * style * fix * fix decoder reshape * test * fix and extend tests to all possible input types * only translate generators * dummy objects * fix import * patch work * update tests * fixes for many tests * fix * test * fix * update * warning for missing config attr * added test_safety_checker and test_height_width_properties * fix static shapes test * fix quantization * remove the need for a base OVpipeline class * fix dispatch * fix import --- optimum/exporters/openvino/model_configs.py | 65 +- optimum/intel/__init__.py | 22 + optimum/intel/openvino/__init__.py | 6 + optimum/intel/openvino/modeling_diffusion.py | 1385 +++++++++-------- optimum/intel/openvino/quantization.py | 10 +- optimum/intel/openvino/utils.py | 34 + .../dummy_openvino_and_diffusers_objects.py | 66 + setup.py | 2 +- tests/openvino/test_diffusion.py | 1028 ++++++------ tests/openvino/test_modeling.py | 19 +- 10 files changed, 1469 insertions(+), 1168 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index bb036fdc1..07c284ec2 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -33,9 +33,6 @@ MistralOnnxConfig, MPTOnnxConfig, PhiOnnxConfig, - UNetOnnxConfig, - VaeDecoderOnnxConfig, - VaeEncoderOnnxConfig, VisionOnnxConfig, ) from optimum.exporters.tasks import TasksManager @@ -119,12 +116,12 @@ def init_model_configs(): if TYPE_CHECKING: - from transformers.modeling_utils import PreTrainedModel + from transformers.modeling_utils import PreTrainedModel # noqa: F811 - from optimum.exporters.onnx.model_patcher import ModelPatcher + from optimum.exporters.onnx.model_patcher import ModelPatcher # noqa: F811 if is_tf_available(): - from transformers.modeling_tf_utils import TFPreTrainedModel + from transformers.modeling_tf_utils import TFPreTrainedModel # noqa: F811 register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True) @@ -675,62 +672,6 @@ def patch_model_for_export( return FalconModelPatcher(self, model, model_kwargs=model_kwargs) -@register_in_tasks_manager("unet", *["semantic-segmentation"], library_name="diffusers") -class UNetOpenVINOConfig(UNetOnnxConfig): - @property - def inputs(self) -> Dict[str, Dict[int, str]]: - common_inputs = { - "sample": {0: "batch_size", 2: "height", 3: "width"}, - "timestep": {0: "steps"}, - "encoder_hidden_states": {0: "batch_size", 1: "sequence_length"}, - } - - # TODO : add text_image, image and image_embeds - if getattr(self._normalized_config, "addition_embed_type", None) == "text_time": - common_inputs["text_embeds"] = {0: "batch_size"} - common_inputs["time_ids"] = {0: "batch_size"} - - if getattr(self._normalized_config, "time_cond_proj_dim", None) is not None: - common_inputs["timestep_cond"] = {0: "batch_size"} - return common_inputs - - @property - def outputs(self) -> Dict[str, Dict[int, str]]: - return { - "out_sample": {0: "batch_size", 2: "height", 3: "width"}, - } - - -@register_in_tasks_manager("vae-encoder", *["semantic-segmentation"], library_name="diffusers") -class VaeEncoderOpenVINOConfig(VaeEncoderOnnxConfig): - @property - def inputs(self) -> Dict[str, 
Dict[int, str]]: - return { - "sample": {0: "batch_size", 2: "height", 3: "width"}, - } - - @property - def outputs(self) -> Dict[str, Dict[int, str]]: - return { - "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, - } - - -@register_in_tasks_manager("vae-decoder", *["semantic-segmentation"], library_name="diffusers") -class VaeDecoderOpenVINOConfig(VaeDecoderOnnxConfig): - @property - def inputs(self) -> Dict[str, Dict[int, str]]: - return { - "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, - } - - @property - def outputs(self) -> Dict[str, Dict[int, str]]: - return { - "sample": {0: "batch_size", 2: "height", 3: "width"}, - } - - @register_in_tasks_manager( "persimmon", *[ diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index fc6b0a775..5926f1869 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -99,7 +99,13 @@ "OVStableDiffusionInpaintPipeline", "OVStableDiffusionXLPipeline", "OVStableDiffusionXLImg2ImgPipeline", + "OVStableDiffusionXLInpaintPipeline", "OVLatentConsistencyModelPipeline", + "OVLatentConsistencyModelImg2ImgPipeline", + "OVPipelineForImage2Image", + "OVPipelineForText2Image", + "OVPipelineForInpainting", + "OVDiffusionPipeline", ] else: _import_structure["openvino"].extend( @@ -109,7 +115,13 @@ "OVStableDiffusionInpaintPipeline", "OVStableDiffusionXLPipeline", "OVStableDiffusionXLImg2ImgPipeline", + "OVStableDiffusionXLInpaintPipeline", "OVLatentConsistencyModelPipeline", + "OVLatentConsistencyModelImg2ImgPipeline", + "OVPipelineForImage2Image", + "OVPipelineForText2Image", + "OVPipelineForInpainting", + "OVDiffusionPipeline", ] ) @@ -250,7 +262,11 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_openvino_and_diffusers_objects import ( + OVDiffusionPipeline, OVLatentConsistencyModelPipeline, + OVPipelineForImage2Image, + OVPipelineForInpainting, + OVPipelineForText2Image, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, @@ -259,11 +275,17 @@ ) else: from .openvino import ( + OVDiffusionPipeline, + OVLatentConsistencyModelImg2ImgPipeline, OVLatentConsistencyModelPipeline, + OVPipelineForImage2Image, + OVPipelineForInpainting, + OVPipelineForText2Image, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, OVStableDiffusionXLImg2ImgPipeline, + OVStableDiffusionXLInpaintPipeline, OVStableDiffusionXLPipeline, ) diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 9f3e983ff..549bf8170 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -81,11 +81,17 @@ if is_diffusers_available(): from .modeling_diffusion import ( + OVDiffusionPipeline, + OVLatentConsistencyModelImg2ImgPipeline, OVLatentConsistencyModelPipeline, + OVPipelineForImage2Image, + OVPipelineForInpainting, + OVPipelineForText2Image, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, OVStableDiffusionXLImg2ImgPipeline, + OVStableDiffusionXLInpaintPipeline, OVStableDiffusionXLPipeline, ) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 5c80fe255..81dc085df 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -13,41 +13,47 @@ # limitations under the License. 
import importlib +import inspect import logging import os import shutil +from abc import abstractmethod +from collections import OrderedDict from copy import deepcopy from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Any, Dict, List, Optional, Union +from tempfile import TemporaryDirectory, gettempdir +from typing import Any, Dict, Optional, Union import numpy as np import openvino -import PIL -from diffusers import ( - DDIMScheduler, - LMSDiscreteScheduler, - PNDMScheduler, +import torch +from diffusers.configuration_utils import ConfigMixin +from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution +from diffusers.pipelines import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + DiffusionPipeline, + LatentConsistencyModelImg2ImgPipeline, + LatentConsistencyModelPipeline, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline, ) -from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from diffusers.schedulers import SchedulerMixin from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME -from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available +from diffusers.utils.constants import CONFIG_NAME from huggingface_hub import snapshot_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from huggingface_hub.utils import validate_hf_hub_args from openvino._offline_transformations import compress_model_transformation from openvino.runtime import Core from transformers import CLIPFeatureExtractor, CLIPTokenizer +from transformers.modeling_outputs import ModelOutput -from optimum.pipelines.diffusers.pipeline_latent_consistency import LatentConsistencyPipelineMixin -from optimum.pipelines.diffusers.pipeline_stable_diffusion import StableDiffusionPipelineMixin -from optimum.pipelines.diffusers.pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipelineMixin -from optimum.pipelines.diffusers.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipelineMixin -from optimum.pipelines.diffusers.pipeline_stable_diffusion_xl import StableDiffusionXLPipelineMixin -from optimum.pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipelineMixin -from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor from optimum.utils import ( DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, @@ -58,9 +64,15 @@ from ...exporters.openvino import main_export from .configuration import OVConfig, OVQuantizationMethod, OVWeightQuantizationConfig -from .loaders import OVTextualInversionLoaderMixin -from .modeling_base import OVBaseModel, OVModelPart -from .utils import ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME +from .modeling_base import OVBaseModel +from .utils import ( + ONNX_WEIGHTS_NAME, + OV_TO_PT_TYPE, + OV_XML_FILE_NAME, + _print_compiled_model_properties, + model_has_dynamic_inputs, + np_to_pt_generators, +) core = Core() @@ -68,96 +80,129 @@ logger = logging.getLogger(__name__) -class OVStableDiffusionPipelineBase(OVBaseModel, OVTextualInversionLoaderMixin): - auto_model_class = StableDiffusionPipeline +# TODO: support DiffusionPipeline.from_pipe() +# TODO: makes more sense to have a compositional OVMixin class +# TODO: instead of one bloated __init__, we should consider an __init__ per pipeline 
+class OVDiffusionPipeline(OVBaseModel, DiffusionPipeline): + auto_model_class = DiffusionPipeline config_name = "model_index.json" - export_feature = "text-to-image" _library_name = "diffusers" def __init__( self, + scheduler: SchedulerMixin, unet: openvino.runtime.Model, - config: Dict[str, Any], - scheduler: Union["DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler"], - vae_decoder: Optional[openvino.runtime.Model] = None, + vae_decoder: openvino.runtime.Model, + # optional pipeline models vae_encoder: Optional[openvino.runtime.Model] = None, text_encoder: Optional[openvino.runtime.Model] = None, text_encoder_2: Optional[openvino.runtime.Model] = None, - tokenizer: Optional["CLIPTokenizer"] = None, - tokenizer_2: Optional["CLIPTokenizer"] = None, - feature_extractor: Optional["CLIPFeatureExtractor"] = None, - safety_checker: Optional["StableDiffusionSafetyChecker"] = None, + # optional pipeline submodels + tokenizer: Optional[CLIPTokenizer] = None, + tokenizer_2: Optional[CLIPTokenizer] = None, + feature_extractor: Optional[CLIPFeatureExtractor] = None, + # stable diffusion xl specific arguments + force_zeros_for_empty_prompt: bool = True, + requires_aesthetics_score: bool = False, + add_watermarker: Optional[bool] = None, + # openvino specific arguments device: str = "CPU", - dynamic_shapes: bool = True, compile: bool = True, + compile_only: bool = False, + dynamic_shapes: bool = True, ov_config: Optional[Dict[str, str]] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, **kwargs, ): - self._internal_dict = config self._device = device.upper() self.is_dynamic = dynamic_shapes + self._compile_only = compile_only + self.model_save_dir = model_save_dir self.ov_config = {} if ov_config is None else {**ov_config} - self._compile_only = kwargs.get("compile_only", False) - - # This attribute is needed to keep one reference on the temporary directory, since garbage collecting - # would end-up removing the directory containing the underlying OpenVINO model - self._model_save_dir_tempdirectory_instance = None - if isinstance(model_save_dir, TemporaryDirectory): - self._model_save_dir_tempdirectory_instance = model_save_dir - self._model_save_dir = Path(model_save_dir.name) - elif isinstance(model_save_dir, str): - self._model_save_dir = Path(model_save_dir) - else: - self._model_save_dir = model_save_dir + self.preprocessors = kwargs.get("preprocessors", []) + + if self._compile_only: + if not compile: + raise ValueError( + "`compile_only` mode does not support disabling compilation." + "Please provide `compile=True` if you want to use `compile_only=True` or set `compile_only=False`" + ) + + if not isinstance(unet, openvino.runtime.CompiledModel): + raise ValueError("`compile_only` expect that already compiled model will be provided") - self.vae_decoder = OVModelVaeDecoder(vae_decoder, self) - self.unet = OVModelUnet(unet, self) - self.text_encoder = OVModelTextEncoder(text_encoder, self) if text_encoder is not None else None + model_is_dynamic = model_has_dynamic_inputs(unet) + if dynamic_shapes ^ model_is_dynamic: + requested_shapes = "dynamic" if dynamic_shapes else "static" + compiled_shapes = "dynamic" if model_is_dynamic else "static" + raise ValueError( + f"Provided compiled model with {compiled_shapes} shapes but requested to use {requested_shapes}. 
" + f"Please set `compile_only=False` or `dynamic_shapes={model_is_dynamic}`" + ) + + self.unet = OVModelUnet(unet, self, DIFFUSION_MODEL_UNET_SUBFOLDER) + self.vae_decoder = OVModelVaeDecoder(vae_decoder, self, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER) + self.vae_encoder = ( + OVModelVaeEncoder(vae_encoder, self, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER) + if vae_encoder is not None + else None + ) + self.text_encoder = ( + OVModelTextEncoder(text_encoder, self, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER) + if text_encoder is not None + else None + ) self.text_encoder_2 = ( - OVModelTextEncoder(text_encoder_2, self, model_name=DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER) + OVModelTextEncoder(text_encoder_2, self, DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER) if text_encoder_2 is not None else None ) - self.vae_encoder = OVModelVaeEncoder(vae_encoder, self) if vae_encoder is not None else None - - if "block_out_channels" in self.vae_decoder.config: - self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1) - else: - self.vae_scale_factor = 8 - - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API + self.vae = OVModelVae(decoder=self.vae_decoder, encoder=self.vae_encoder) + self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 - self.scheduler = scheduler self.feature_extractor = feature_extractor - self.safety_checker = safety_checker - self.preprocessors = [] - - if self.is_dynamic and not self._compile_only: - self.reshape(batch_size=-1, height=-1, width=-1, num_images_per_prompt=-1) - sub_models = { - DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER: self.text_encoder, - DIFFUSION_MODEL_UNET_SUBFOLDER: self.unet, - DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER: self.vae_decoder, - DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER: self.vae_encoder, - DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER: self.text_encoder_2, + # we allow passing these as torch models for now + self.image_encoder = kwargs.pop("image_encoder", None) # TODO: maybe mplement OVModelImageEncoder + self.safety_checker = kwargs.pop("safety_checker", None) # TODO: maybe mplement OVModelSafetyChecker + + all_pipeline_init_args = { + "vae": self.vae, + "unet": self.unet, + "text_encoder": self.text_encoder, + "text_encoder_2": self.text_encoder_2, + "safety_checker": self.safety_checker, + "image_encoder": self.image_encoder, + "scheduler": self.scheduler, + "tokenizer": self.tokenizer, + "tokenizer_2": self.tokenizer_2, + "feature_extractor": self.feature_extractor, + "requires_aesthetics_score": requires_aesthetics_score, + "force_zeros_for_empty_prompt": force_zeros_for_empty_prompt, + "add_watermarker": add_watermarker, } - for name in sub_models.keys(): - self._internal_dict[name] = ( - ("optimum", sub_models[name].__class__.__name__) if sub_models[name] is not None else (None, None) - ) - self._internal_dict.pop("vae", None) + diffusers_pipeline_args = {} + for key in inspect.signature(self.auto_model_class).parameters.keys(): + if key in all_pipeline_init_args: + diffusers_pipeline_args[key] = all_pipeline_init_args[key] + # inits diffusers pipeline specific attributes (registers modules and config) + self.auto_model_class.__init__(self, **diffusers_pipeline_args) + # we use auto_model_class.__init__ here because we can't call super().__init__ + # as OptimizedModel already defines an __init__ which is the first in the MRO self._openvino_config = None if quantization_config: self._openvino_config = 
OVConfig(quantization_config=quantization_config) self._set_ov_config_parameters() + if self.is_dynamic and not self._compile_only: + self.reshape(batch_size=-1, height=-1, width=-1, num_images_per_prompt=-1) + if compile and not self._compile_only: self.compile() @@ -177,31 +222,32 @@ def _save_pretrained(self, save_directory: Union[str, Path]): save_directory = Path(save_directory) - sub_models_to_save = { - self.unet: DIFFUSION_MODEL_UNET_SUBFOLDER, - self.vae_decoder: DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, - self.vae_encoder: DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, - self.text_encoder: DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, - self.text_encoder_2: DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, + models_to_save_paths = { + (self.unet, save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER), + (self.vae_decoder, save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER), + (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER), + (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER), + (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER), } - - for ov_model, dst_path in sub_models_to_save.items(): - if ov_model is not None: - dst_path = save_directory / dst_path / OV_XML_FILE_NAME + for model, save_path in models_to_save_paths: + if model is not None: + dst_path = save_path / OV_XML_FILE_NAME dst_path.parent.mkdir(parents=True, exist_ok=True) - openvino.save_model(ov_model.model, dst_path, compress_to_fp16=False) - model_dir = ov_model.config.get("_name_or_path", None) or ov_model._model_dir / ov_model._model_name - config_path = Path(model_dir) / ov_model.CONFIG_NAME + openvino.save_model(model.model, dst_path, compress_to_fp16=False) + model_dir = model.config.get("_name_or_path", None) or model.model_save_dir + config_path = Path(model_dir) / CONFIG_NAME if config_path.is_file(): - shutil.copyfile(config_path, dst_path.parent / ov_model.CONFIG_NAME) + config_save_path = save_path / CONFIG_NAME + shutil.copyfile(config_path, config_save_path) self.scheduler.save_pretrained(save_directory / "scheduler") - if self.feature_extractor is not None: - self.feature_extractor.save_pretrained(save_directory / "feature_extractor") + if self.tokenizer is not None: self.tokenizer.save_pretrained(save_directory / "tokenizer") if self.tokenizer_2 is not None: self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") + if self.feature_extractor is not None: + self.feature_extractor.save_pretrained(save_directory / "feature_extractor") self._save_openvino_config(save_directory) @@ -212,143 +258,163 @@ def _from_pretrained( config: Dict[str, Any], token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, + force_download: bool = False, + local_files_only: bool = False, cache_dir: str = HUGGINGFACE_HUB_CACHE, - vae_decoder_file_name: Optional[str] = None, - text_encoder_file_name: Optional[str] = None, unet_file_name: Optional[str] = None, + vae_decoder_file_name: Optional[str] = None, vae_encoder_file_name: Optional[str] = None, + text_encoder_file_name: Optional[str] = None, text_encoder_2_file_name: Optional[str] = None, - local_files_only: bool = False, from_onnx: bool = False, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, load_in_8bit: bool = False, quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): + # same as DiffusionPipeline.from_pretraoned, if called directly, it loads the class in the 
config + if cls.__name__ == "OVDiffusionPipeline": + class_name = config["_class_name"] + ov_pipeline_class = _get_ov_class(class_name) + else: + ov_pipeline_class = cls + default_file_name = ONNX_WEIGHTS_NAME if from_onnx else OV_XML_FILE_NAME + + unet_file_name = unet_file_name or default_file_name + vae_encoder_file_name = vae_encoder_file_name or default_file_name vae_decoder_file_name = vae_decoder_file_name or default_file_name text_encoder_file_name = text_encoder_file_name or default_file_name text_encoder_2_file_name = text_encoder_2_file_name or default_file_name - unet_file_name = unet_file_name or default_file_name - vae_encoder_file_name = vae_encoder_file_name or default_file_name - model_id = str(model_id) - patterns = set(config.keys()) - sub_models_names = patterns.intersection({"feature_extractor", "tokenizer", "tokenizer_2", "scheduler"}) - if not os.path.isdir(model_id): - patterns.update({"vae_encoder", "vae_decoder"}) - allow_patterns = {os.path.join(k, "*") for k in patterns if not k.startswith("_")} + + if not os.path.isdir(str(model_id)): + all_components = {key for key in config.keys() if not key.startswith("_")} | {"vae_encoder", "vae_decoder"} + allow_patterns = {os.path.join(component, "*") for component in all_components} allow_patterns.update( { + unet_file_name, + vae_encoder_file_name, vae_decoder_file_name, text_encoder_file_name, text_encoder_2_file_name, - unet_file_name, - vae_encoder_file_name, + unet_file_name.replace(".xml", ".bin"), + vae_encoder_file_name.replace(".xml", ".bin"), vae_decoder_file_name.replace(".xml", ".bin"), text_encoder_file_name.replace(".xml", ".bin"), text_encoder_2_file_name.replace(".xml", ".bin"), - unet_file_name.replace(".xml", ".bin"), - vae_encoder_file_name.replace(".xml", ".bin"), SCHEDULER_CONFIG_NAME, - CONFIG_NAME, cls.config_name, + CONFIG_NAME, } ) ignore_patterns = ["*.msgpack", "*.safetensors", "*pytorch_model.bin"] if not from_onnx: ignore_patterns.extend(["*.onnx", "*.onnx_data"]) - # Downloads all repo's files matching the allowed patterns - model_id = snapshot_download( + + model_save_folder = snapshot_download( model_id, cache_dir=cache_dir, + force_download=force_download, local_files_only=local_files_only, - token=token, revision=revision, + token=token, allow_patterns=allow_patterns, ignore_patterns=ignore_patterns, ) - new_model_save_dir = Path(model_id) + else: + model_save_folder = str(model_id) + + model_save_path = Path(model_save_folder) + + if model_save_dir is None: + model_save_dir = model_save_path - for name in sub_models_names: - # Check if the subcomponent needs to be loaded + submodels = {"scheduler": None, "tokenizer": None, "tokenizer_2": None, "feature_extractor": None} + for name in submodels.keys(): if kwargs.get(name, None) is not None: - continue - library_name, library_classes = config[name] - if library_classes is not None: + submodels[name] = kwargs.pop(name) + elif config.get(name, (None, None))[0] is not None: + library_name, library_classes = config.get(name) library = importlib.import_module(library_name) class_obj = getattr(library, library_classes) load_method = getattr(class_obj, "from_pretrained") # Check if the module is in a subdirectory - if (new_model_save_dir / name).is_dir(): - kwargs[name] = load_method(new_model_save_dir / name) + if (model_save_path / name).is_dir(): + submodels[name] = load_method(model_save_path / name) else: - kwargs[name] = load_method(new_model_save_dir) - - unet_path = new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name - 
components = { - "vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, - "vae_decoder": new_model_save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, - "text_encoder": new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, - "text_encoder_2": new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, + submodels[name] = load_method(model_save_path) + + models = { + "unet": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + "vae_decoder": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, + "vae_encoder": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, + "text_encoder": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, + "text_encoder_2": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, } compile_only = kwargs.get("compile_only", False) - - if model_save_dir is None: - model_save_dir = new_model_save_dir - quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) if (quantization_config is None or quantization_config.dataset is None) and not compile_only: - unet = cls.load_model(unet_path, quantization_config) - for key, value in components.items(): - components[key] = cls.load_model(value, quantization_config) if value.is_file() else None + for name, path in models.items(): + if kwargs.get(name, None) is not None: + models[name] = kwargs.pop(name) + else: + models[name] = cls.load_model(path, quantization_config) if path.is_file() else None elif compile_only: ov_config = kwargs.get("ov_config", {}) device = kwargs.get("device", "CPU") vae_ov_conifg = {**ov_config} if "GPU" in device.upper() and "INFERENCE_PRECISION_HINT" not in vae_ov_conifg: vae_ov_conifg["INFERENCE_PRECISION_HINT"] = "f32" - unet = cls._compile_model(unet_path, device, ov_config, Path(model_save_dir) / "unet") - for key, value in components.items(): - components[key] = ( - cls._compile_model( - value, device, ov_config if "vae" not in key else vae_ov_conifg, Path(model_save_dir) / key + for name, path in models.items(): + if kwargs.get(name, None) is not None: + models[name] = kwargs.pop(name) + else: + models[name] = ( + cls._compile_model( + path, + device, + ov_config if "vae" not in name else vae_ov_conifg, + Path(model_save_dir) / name, + ) + if path.is_file() + else None ) - if value.is_file() - else None - ) else: - # Load uncompressed models to apply hybrid quantization further - unet = cls.load_model(unet_path) - for key, value in components.items(): - components[key] = cls.load_model(value) if value.is_file() else None - sd_model = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs) - - supported_pipelines = ( - OVStableDiffusionPipeline, - OVStableDiffusionXLPipeline, - OVLatentConsistencyModelPipeline, - ) - if not isinstance(sd_model, supported_pipelines): + # why is this quantization not performed in __init__? 
+ if ov_pipeline_class.export_feature != "text-to-image": raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}") from optimum.intel import OVQuantizer + for name, path in models.items(): + if kwargs.get(name, None) is not None: + models[name] = kwargs.pop(name) + else: + models[name] = cls.load_model(path) if path.is_file() else None + + ov_pipeline = ov_pipeline_class(**models, **submodels, model_save_dir=model_save_dir, **kwargs) + # same as in DiffusionPipeline.from_pretrained, we save where the model was instantiated from + ov_pipeline.register_to_config(_name_or_path=config.get("_name_or_path", str(model_id))) + hybrid_quantization_config = deepcopy(quantization_config) hybrid_quantization_config.quant_method = OVQuantizationMethod.HYBRID - quantizer = OVQuantizer(sd_model) + quantizer = OVQuantizer(ov_pipeline) quantizer.quantize(ov_config=OVConfig(quantization_config=hybrid_quantization_config)) - return sd_model + return ov_pipeline - return cls( - unet=unet, - config=config, + ov_pipeline = ov_pipeline_class( + **models, + **submodels, model_save_dir=model_save_dir, quantization_config=quantization_config, - **components, **kwargs, ) + # same as in DiffusionPipeline.from_pretrained, we save where the model was instantiated from + ov_pipeline.register_to_config(_name_or_path=config.get("_name_or_path", str(model_id))) + + return ov_pipeline @classmethod def _from_transformers( @@ -360,25 +426,11 @@ def _from_transformers( force_download: bool = False, cache_dir: str = HUGGINGFACE_HUB_CACHE, local_files_only: bool = False, - tokenizer: Optional["CLIPTokenizer"] = None, - scheduler: Union["DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler"] = None, - feature_extractor: Optional["CLIPFeatureExtractor"] = None, - tokenizer_2: Optional["CLIPTokenizer"] = None, load_in_8bit: Optional[bool] = None, quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + compile_only: bool = False, **kwargs, ): - save_dir = TemporaryDirectory() - save_dir_path = Path(save_dir.name) - - # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size - if load_in_8bit is None and not quantization_config: - ov_config = None - else: - ov_config = OVConfig(dtype="fp32") - - compile_only = kwargs.pop("compile_only", False) - if compile_only: logger.warning( "`compile_only` mode will be disabled because it does not support model export." 
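A minimal usage sketch of the task-specific entry points this patch exposes (`OVPipelineForText2Image` is re-exported in optimum/intel/__init__.py above); the checkpoint name, the `export=True` flag and the prompt below are illustrative assumptions rather than values taken from the diff:

    from optimum.intel import OVPipelineForText2Image

    # export=True converts the diffusers checkpoint to OpenVINO IR before loading
    # (assumed optimum-intel convention; a pre-exported OpenVINO model id also works)
    pipeline = OVPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", export=True)

    # optionally pin static shapes before compiling, mirroring OVDiffusionPipeline.reshape() in this diff
    pipeline.reshape(batch_size=1, height=512, width=512, num_images_per_prompt=1)

    image = pipeline(prompt="a sailing ship in a storm").images[0]
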
@@ -386,10 +438,19 @@ def _from_transformers( ) compile_only = False + # If load_in_8bit and quantization_config not specified then ov_config is set + # to None and will be set by default in convert depending on the model size + if load_in_8bit is None and not quantization_config: + ov_config = None + else: + ov_config = OVConfig(dtype="fp32") + + model_save_dir = TemporaryDirectory() + model_save_path = Path(model_save_dir.name) + main_export( model_name_or_path=model_id, - output=save_dir_path, - task=cls.export_feature, + output=model_save_path, do_validation=False, no_post_process=True, revision=revision, @@ -402,25 +463,44 @@ def _from_transformers( ) return cls._from_pretrained( - model_id=save_dir_path, + model_id=model_save_path, config=config, from_onnx=False, token=token, revision=revision, - force_download=force_download, cache_dir=cache_dir, + force_download=force_download, local_files_only=local_files_only, - model_save_dir=save_dir, - tokenizer=tokenizer, - tokenizer_2=tokenizer_2, - scheduler=scheduler, - feature_extractor=feature_extractor, - load_in_8bit=load_in_8bit, + model_save_dir=model_save_dir, quantization_config=quantization_config, + load_in_8bit=load_in_8bit, compile_only=compile_only, **kwargs, ) + def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = None): + for arg in args: + if isinstance(arg, str): + device = arg + elif isinstance(arg, torch.dtype): + dtype = arg + + if isinstance(device, str): + self._device = device.upper() + self.request = None + elif device is not None: + raise ValueError( + "The `device` argument should be a string representing the device on which the model should be loaded." + ) + + if dtype is not None and dtype != self.dtype: + raise NotImplementedError( + f"Cannot change the dtype of the model from {self.dtype} to {dtype}. " + f"Please export the model with the desired dtype." + ) + + return self + @property def height(self) -> int: height = self.unet.model.inputs[0].get_partial_shape()[2] @@ -436,7 +516,7 @@ def width(self) -> int: return width.get_length() * self.vae_scale_factor @property - def _batch_size(self) -> int: + def batch_size(self) -> int: batch_size = self.unet.model.inputs[0].get_partial_shape()[0] if batch_size.is_dynamic: return -1 @@ -497,21 +577,6 @@ def _reshape_text_encoder( model.reshape(shapes) return model - def _reshape_vae_decoder(self, model: openvino.runtime.Model, height: int = -1, width: int = -1): - height = height // self.vae_scale_factor if height > -1 else height - width = width // self.vae_scale_factor if width > -1 else width - latent_channels = self.vae_decoder.config.get("latent_channels", None) - if latent_channels is None: - latent_channels = model.inputs[0].get_partial_shape()[1] - if latent_channels.is_dynamic: - logger.warning( - "Could not identify `latent_channels` from the VAE decoder configuration, to statically reshape the VAE decoder please provide a configuration." 
- ) - self.is_dynamic = True - shapes = {model.inputs[0]: [1, latent_channels, height, width]} - model.reshape(shapes) - return model - def _reshape_vae_encoder( self, model: openvino.runtime.Model, batch_size: int = -1, height: int = -1, width: int = -1 ): @@ -527,6 +592,23 @@ def _reshape_vae_encoder( model.reshape(shapes) return model + def _reshape_vae_decoder( + self, model: openvino.runtime.Model, height: int = -1, width: int = -1, num_images_per_prompt: int = -1 + ): + height = height // self.vae_scale_factor if height > -1 else height + width = width // self.vae_scale_factor if width > -1 else width + latent_channels = self.vae_decoder.config.get("latent_channels", None) + if latent_channels is None: + latent_channels = model.inputs[0].get_partial_shape()[1] + if latent_channels.is_dynamic: + logger.warning( + "Could not identify `latent_channels` from the VAE decoder configuration, to statically reshape the VAE decoder please provide a configuration." + ) + self.is_dynamic = True + shapes = {model.inputs[0]: [num_images_per_prompt, latent_channels, height, width]} + model.reshape(shapes) + return model + def reshape( self, batch_size: int, @@ -540,16 +622,23 @@ def reshape( ) self.is_dynamic = -1 in {batch_size, height, width, num_images_per_prompt} - self.vae_decoder.model = self._reshape_vae_decoder(self.vae_decoder.model, height, width) + if self.tokenizer is None and self.tokenizer_2 is None: tokenizer_max_len = -1 else: tokenizer_max_len = ( self.tokenizer.model_max_length if self.tokenizer is not None else self.tokenizer_2.model_max_length ) + self.unet.model = self._reshape_unet( self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len ) + self.vae_decoder.model = self._reshape_vae_decoder( + self.vae_decoder.model, height, width, num_images_per_prompt + ) + + if self.vae_encoder is not None: + self.vae_encoder.model = self._reshape_vae_encoder(self.vae_encoder.model, batch_size, height, width) if self.text_encoder is not None: self.text_encoder.model = self._reshape_text_encoder( @@ -561,9 +650,6 @@ def reshape( self.text_encoder_2.model, batch_size, self.tokenizer_2.model_max_length ) - if self.vae_encoder is not None: - self.vae_encoder.model = self._reshape_vae_encoder(self.vae_encoder.model, batch_size, height, width) - self.clear_requests() return self @@ -576,12 +662,12 @@ def half(self): "`half()` is not supported with `compile_only` mode, please intialize model without this option" ) - compress_model_transformation(self.vae_decoder.model) - compress_model_transformation(self.unet.model) - for component in {self.text_encoder, self.text_encoder_2, self.vae_encoder}: + for component in {self.unet, self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2}: if component is not None: compress_model_transformation(component.model) + self.clear_requests() + return self def clear_requests(self): @@ -590,16 +676,12 @@ def clear_requests(self): "`clear_requests()` is not supported with `compile_only` mode, please intialize model without this option" ) - self.vae_decoder.request = None - self.unet.request = None - for component in {self.text_encoder, self.text_encoder_2, self.vae_encoder}: + for component in {self.unet, self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2}: if component is not None: component.request = None def compile(self): - self.vae_decoder._compile() - self.unet._compile() - for component in {self.text_encoder, self.text_encoder_2, self.vae_encoder}: + for component in {self.unet, 
self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2}: if component is not None: component._compile() @@ -610,97 +692,249 @@ def _load_config(cls, config_name_or_path: Union[str, os.PathLike], **kwargs): def _save_config(self, save_directory): self.save_config(save_directory) + @property + def components(self) -> Dict[str, Any]: + components = { + "vae": self.vae, + "unet": self.unet, + "text_encoder": self.text_encoder, + "text_encoder_2": self.text_encoder_2, + "safety_checker": self.safety_checker, + "image_encoder": self.image_encoder, + } + components = {k: v for k, v in components.items() if v is not None} + return components + + def __call__(self, *args, **kwargs): + # we do this to keep numpy random states support for now + # TODO: deprecate and add warnings when a random state is passed -class OVDiffusersModelPart(OVModelPart): - CONFIG_NAME = "config.json" + args = list(args) + for i in range(len(args)): + args[i] = np_to_pt_generators(args[i], self.device) + + for k, v in kwargs.items(): + kwargs[k] = np_to_pt_generators(v, self.device) + + # we use auto_model_class.__call__ here because we can't call super().__call__ + # as OptimizedModel already defines a __call__ which is the first in the MRO + return self.auto_model_class.__call__(self, *args, **kwargs) + + +class OVPipelinePart(ConfigMixin): + config_name: str = CONFIG_NAME def __init__( self, model: openvino.runtime.Model, - parent_model: OVBaseModel, - ov_config: Optional[Dict[str, str]] = None, - model_name: str = "encoder", - model_dir: str = None, + parent_pipeline: OVDiffusionPipeline, + model_name: str = "", ): - super().__init__( - model=model, parent_model=parent_model, ov_config=ov_config, model_name=model_name, model_dir=model_dir - ) - config_path = self._model_dir / model_name / self.CONFIG_NAME - self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} - self.input_dtype = { - inputs.get_any_name(): OV_TO_NP_TYPE[inputs.get_element_type().get_type_name()] - for inputs in self.model.inputs - } + self.model = model + self.model_name = model_name + self.parent_pipeline = parent_pipeline + self.request = None if not parent_pipeline._compile_only else self.model + if isinstance(parent_pipeline.model_save_dir, TemporaryDirectory): + self.model_save_dir = Path(parent_pipeline.model_save_dir.name) / self.model_name + else: + self.model_save_dir = Path(parent_pipeline.model_save_dir) / self.model_name -class OVModelTextEncoder(OVDiffusersModelPart): - def __init__( + config_file_path = self.model_save_dir / self.config_name + + if not config_file_path.is_file(): + # config is mandatory for the model part to be used for inference + raise ValueError(f"Configuration file for {self.__class__.__name__} not found at {config_file_path}") + + config_dict = self._dict_from_json_file(config_file_path) + self.register_to_config(**config_dict) + + @property + def _device(self) -> str: + return self.parent_pipeline._device + + @property + def device(self) -> torch.device: + return self.parent_pipeline.device + + @property + def ov_config(self) -> OVConfig: + return self.parent_pipeline.ov_config + + @property + def dtype(self) -> torch.dtype: + return OV_TO_PT_TYPE[self.ov_config.get("dtype", "f32")] + + def _compile(self): + if self.request is None: + if ( + "CACHE_DIR" not in self.ov_config.keys() + and not str(self.model_save_dir).startswith(gettempdir()) + and "GPU" in self._device + ): + self.ov_config["CACHE_DIR"] = os.path.join(self.model_save_dir, 
"model_cache") + + logger.info(f"Compiling the {self.model_name} to {self._device} ...") + self.request = core.compile_model(self.model, self._device, self.ov_config) + # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html + if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: + logger.info(f"{self._device} SUPPORTED_PROPERTIES:") + _print_compiled_model_properties(self.request) + + def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = None): + for arg in args: + if isinstance(arg, str): + device = arg + elif isinstance(arg, torch.dtype): + dtype = arg + + if isinstance(device, str): + self._device = device.upper() + self.request = None + elif device is not None: + raise ValueError( + "The `device` argument should be a string representing the device on which the model should be loaded." + ) + + if dtype is not None and dtype != self.dtype: + raise NotImplementedError( + f"Cannot change the dtype of the model from {self.dtype} to {dtype}. " + f"Please export the model with the desired dtype." + ) + + return self + + @abstractmethod + def forward(self, *args, **kwargs): + raise NotImplementedError + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + +class OVModelTextEncoder(OVPipelinePart): + def forward( self, - model: openvino.runtime.Model, - parent_model: OVBaseModel, - ov_config: Optional[Dict[str, str]] = None, - model_name: str = "text_encoder", + input_ids: Union[np.ndarray, torch.Tensor], + attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None, + output_hidden_states: Optional[bool] = None, + return_dict: bool = False, ): - super().__init__(model, parent_model, ov_config, model_name) - - def forward(self, input_ids: np.ndarray): self._compile() - inputs = { - "input_ids": input_ids, - } - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + model_inputs = {"input_ids": input_ids} + ov_outputs = self.request(model_inputs, share_inputs=True).to_dict() -class OVModelUnet(OVDiffusersModelPart): - def __init__( - self, model: openvino.runtime.Model, parent_model: OVBaseModel, ov_config: Optional[Dict[str, str]] = None - ): - super().__init__(model, parent_model, ov_config, "unet") + model_outputs = {} + for key, value in ov_outputs.items(): + model_outputs[next(iter(key.names))] = torch.from_numpy(value) + + if output_hidden_states: + model_outputs["hidden_states"] = [] + for i in range(self.config.num_hidden_layers): + model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) + model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state")) + + if return_dict: + return model_outputs + + return ModelOutput(**model_outputs) + + +class OVModelUnet(OVPipelinePart): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if not hasattr(self.config, "time_cond_proj_dim"): + logger.warning( + "The `time_cond_proj_dim` attribute is missing from the UNet configuration. " + "Please re-export the model with newer version of optimum and diffusers." 
+ ) + self.register_to_config(time_cond_proj_dim=None) def forward( self, - sample: np.ndarray, - timestep: np.ndarray, - encoder_hidden_states: np.ndarray, - text_embeds: Optional[np.ndarray] = None, - time_ids: Optional[np.ndarray] = None, - timestep_cond: Optional[np.ndarray] = None, + sample: Union[np.ndarray, torch.Tensor], + timestep: Union[np.ndarray, torch.Tensor], + encoder_hidden_states: Union[np.ndarray, torch.Tensor], + text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None, + time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, + timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = False, ): self._compile() - inputs = { + model_inputs = { "sample": sample, "timestep": timestep, "encoder_hidden_states": encoder_hidden_states, } if text_embeds is not None: - inputs["text_embeds"] = text_embeds + model_inputs["text_embeds"] = text_embeds if time_ids is not None: - inputs["time_ids"] = time_ids + model_inputs["time_ids"] = time_ids if timestep_cond is not None: - inputs["timestep_cond"] = timestep_cond + model_inputs["timestep_cond"] = timestep_cond + if cross_attention_kwargs is not None: + model_inputs.update(cross_attention_kwargs) + if added_cond_kwargs is not None: + model_inputs.update(added_cond_kwargs) - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + ov_outputs = self.request(model_inputs, share_inputs=True).to_dict() + model_outputs = {} + for key, value in ov_outputs.items(): + model_outputs[next(iter(key.names))] = torch.from_numpy(value) -class OVModelVaeDecoder(OVDiffusersModelPart): - def __init__( - self, model: openvino.runtime.Model, parent_model: OVBaseModel, ov_config: Optional[Dict[str, str]] = None - ): - super().__init__(model, parent_model, ov_config, "vae_decoder") + if return_dict: + return model_outputs + + return ModelOutput(**model_outputs) + + +class OVModelVaeEncoder(OVPipelinePart): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) - def forward(self, latent_sample: np.ndarray): + if not hasattr(self.config, "scaling_factor"): + logger.warning( + "The `scaling_factor` attribute is missing from the VAE encoder configuration. " + "Please re-export the model with newer version of optimum and diffusers." 
+ ) + self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) + + def forward( + self, + sample: Union[np.ndarray, torch.Tensor], + generator: Optional[torch.Generator] = None, + return_dict: bool = False, + ): self._compile() - inputs = { - "latent_sample": latent_sample, - } - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + model_inputs = {"sample": sample} + + ov_outputs = self.request(model_inputs, share_inputs=True).to_dict() + + model_outputs = {} + for key, value in ov_outputs.items(): + model_outputs[next(iter(key.names))] = torch.from_numpy(value) + + if "latent_sample" in model_outputs: + model_outputs["latents"] = model_outputs.pop("latent_sample") + + if "latent_parameters" in model_outputs: + model_outputs["latent_dist"] = DiagonalGaussianDistribution( + parameters=model_outputs.pop("latent_parameters") + ) + + if return_dict: + return model_outputs + + return ModelOutput(**model_outputs) def _compile(self): if "GPU" in self._device and "INFERENCE_PRECISION_HINT" not in self.ov_config: @@ -708,20 +942,38 @@ def _compile(self): super()._compile() -class OVModelVaeEncoder(OVDiffusersModelPart): - def __init__( - self, model: openvino.runtime.Model, parent_model: OVBaseModel, ov_config: Optional[Dict[str, str]] = None - ): - super().__init__(model, parent_model, ov_config, "vae_encoder") +class OVModelVaeDecoder(OVPipelinePart): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # can be missing from models exported long ago + if not hasattr(self.config, "scaling_factor"): + logger.warning( + "The `scaling_factor` attribute is missing from the VAE decoder configuration. " + "Please re-export the model with newer version of optimum and diffusers." 
+ ) + self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) - def forward(self, sample: np.ndarray): + def forward( + self, + latent_sample: Union[np.ndarray, torch.Tensor], + generator: Optional[torch.Generator] = None, + return_dict: bool = False, + ): self._compile() - inputs = { - "sample": sample, - } - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + model_inputs = {"latent_sample": latent_sample} + + ov_outputs = self.request(model_inputs, share_inputs=True).to_dict() + + model_outputs = {} + for key, value in ov_outputs.items(): + model_outputs[next(iter(key.names))] = torch.from_numpy(value) + + if return_dict: + return model_outputs + + return ModelOutput(**model_outputs) def _compile(self): if "GPU" in self._device and "INFERENCE_PRECISION_HINT" not in self.ov_config: @@ -729,382 +981,291 @@ def _compile(self): super()._compile() -class OVStableDiffusionPipeline(OVStableDiffusionPipelineBase, StableDiffusionPipelineMixin): - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - **kwargs, - ): - height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - _height = self.height - _width = self.width - expected_batch_size = self._batch_size +class OVModelVae: + def __init__(self, decoder: OVModelVaeDecoder, encoder: OVModelVaeEncoder): + self.decoder = decoder + self.encoder = encoder - if _height != -1 and height != _height: - logger.warning( - f"`height` was set to {height} but the static model will output images of height {_height}." - "To fix the height, please reshape your model accordingly using the `.reshape()` method." - ) - height = _height + @property + def config(self): + return self.decoder.config - if _width != -1 and width != _width: - logger.warning( - f"`width` was set to {width} but the static model will output images of width {_width}." - "To fix the width, please reshape your model accordingly using the `.reshape()` method." 
- ) - width = _width + @property + def dtype(self): + return self.decoder.dtype - if expected_batch_size != -1: - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = kwargs.get("prompt_embeds").shape[0] - - _raise_invalid_batch_size(expected_batch_size, batch_size, num_images_per_prompt, guidance_scale) - - return StableDiffusionPipelineMixin.__call__( - self, - prompt=prompt, - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - **kwargs, - ) + @property + def device(self): + return self.decoder.device + + def decode(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + def encode(self, *args, **kwargs): + return self.encoder(*args, **kwargs) + + def to(self, *args, **kwargs): + self.decoder.to(*args, **kwargs) + if self.encoder is not None: + self.encoder.to(*args, **kwargs) -class OVStableDiffusionImg2ImgPipeline(OVStableDiffusionPipelineBase, StableDiffusionImg2ImgPipelineMixin): +class OVStableDiffusionPipeline(OVDiffusionPipeline, StableDiffusionPipeline): + """ + OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion#diffusers.StableDiffusionPipeline). + """ + + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = StableDiffusionPipeline + + +class OVStableDiffusionImg2ImgPipeline(OVDiffusionPipeline, StableDiffusionImg2ImgPipeline): + """ + OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_img2img#diffusers.StableDiffusionImg2ImgPipeline). + """ + + main_input_name = "image" export_feature = "image-to-image" + auto_model_class = StableDiffusionImg2ImgPipeline - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - image: Union[np.ndarray, PIL.Image.Image] = None, - strength: float = 0.8, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - **kwargs, - ): - _height = self.height - _width = self.width - expected_batch_size = self._batch_size - - if _height != -1 and _width != -1: - image = self.image_processor.preprocess(image, height=_height, width=_width).transpose(0, 2, 3, 1) - - if expected_batch_size != -1: - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = kwargs.get("prompt_embeds").shape[0] - - _raise_invalid_batch_size(expected_batch_size, batch_size, num_images_per_prompt, guidance_scale) - - return StableDiffusionImg2ImgPipelineMixin.__call__( - self, - prompt=prompt, - image=image, - strength=strength, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - **kwargs, - ) +class OVStableDiffusionInpaintPipeline(OVDiffusionPipeline, StableDiffusionInpaintPipeline): + """ + OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_inpaint#diffusers.StableDiffusionInpaintPipeline). 
+ """ -class OVStableDiffusionInpaintPipeline(OVStableDiffusionPipelineBase, StableDiffusionInpaintPipelineMixin): + main_input_name = "image" export_feature = "inpainting" + auto_model_class = StableDiffusionInpaintPipeline + + +class OVStableDiffusionXLPipeline(OVDiffusionPipeline, StableDiffusionXLPipeline): + """ + OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). + """ + + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = StableDiffusionXLPipeline - def __call__( + def _get_add_time_ids( self, - prompt: Optional[Union[str, List[str]]], - image: PIL.Image.Image, - mask_image: PIL.Image.Image, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - **kwargs, + original_size, + crops_coords_top_left, + target_size, + dtype, + text_encoder_projection_dim=None, ): - height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - _height = self.height - _width = self.width - expected_batch_size = self._batch_size - - if _height != -1 and _width != -1: - if height != _height: - logger.warning( - f"`height` was set to {height} but the static model will output images of height {_height}." - "To fix the height, please reshape your model accordingly using the `.reshape()` method." - ) - height = _height - - if width != _width: - logger.warning( - f"`width` was set to {width} but the static model will output images of width {_width}." - "To fix the width, please reshape your model accordingly using the `.reshape()` method." - ) - width = _width + add_time_ids = list(original_size + crops_coords_top_left + target_size) - if isinstance(image, list): - image = [self.image_processor.resize(i, _height, _width) for i in image] - else: - image = self.image_processor.resize(image, _height, _width) + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids - if isinstance(mask_image, list): - mask_image = [self.image_processor.resize(i, _height, _width) for i in mask_image] - else: - mask_image = self.image_processor.resize(mask_image, _height, _width) - if expected_batch_size != -1: - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = kwargs.get("prompt_embeds").shape[0] - - _raise_invalid_batch_size(expected_batch_size, batch_size, num_images_per_prompt, guidance_scale) - - return StableDiffusionInpaintPipelineMixin.__call__( - self, - prompt=prompt, - image=image, - mask_image=mask_image, - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - **kwargs, - ) +class OVStableDiffusionXLImg2ImgPipeline(OVDiffusionPipeline, StableDiffusionXLImg2ImgPipeline): + """ + OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). 
+ """ + main_input_name = "image" + export_feature = "image-to-image" + auto_model_class = StableDiffusionXLImg2ImgPipeline -class OVStableDiffusionXLPipelineBase(OVStableDiffusionPipelineBase): - auto_model_class = StableDiffusionXLPipeline + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) - def __init__(self, *args, add_watermarker: Optional[bool] = None, **kwargs): - super().__init__(*args, **kwargs) + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) - add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + return add_time_ids, add_neg_time_ids - if add_watermarker: - if not is_invisible_watermark_available(): - raise ImportError( - "`add_watermarker` requires invisible-watermark to be installed, which can be installed with `pip install invisible-watermark`." - ) - from optimum.pipelines.diffusers.watermark import StableDiffusionXLWatermarker - self.watermark = StableDiffusionXLWatermarker() - else: - self.watermark = None +class OVStableDiffusionXLInpaintPipeline(OVDiffusionPipeline, StableDiffusionXLInpaintPipeline): + """ + OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline). + """ + main_input_name = "image" + export_feature = "inpainting" + auto_model_class = StableDiffusionXLInpaintPipeline -class OVStableDiffusionXLPipeline(OVStableDiffusionXLPipelineBase, StableDiffusionXLPipelineMixin): - def __call__( + def _get_add_time_ids( self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - **kwargs, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, ): - height = height or self.unet.config["sample_size"] * self.vae_scale_factor - width = width or self.unet.config["sample_size"] * self.vae_scale_factor - _height = self.height - _width = self.width - expected_batch_size = self._batch_size - - if _height != -1 and height != _height: - logger.warning( - f"`height` was set to {height} but the static model will output images of height {_height}." - "To fix the height, please reshape your model accordingly using the `.reshape()` method." 
+ if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) ) - height = _height + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) - if _width != -1 and width != _width: - logger.warning( - f"`width` was set to {width} but the static model will output images of width {_width}." - "To fix the width, please reshape your model accordingly using the `.reshape()` method." - ) - width = _width + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) - if expected_batch_size != -1: - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = kwargs.get("prompt_embeds").shape[0] - - _raise_invalid_batch_size(expected_batch_size, batch_size, num_images_per_prompt, guidance_scale) - - return StableDiffusionXLPipelineMixin.__call__( - self, - prompt=prompt, - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - **kwargs, - ) + return add_time_ids, add_neg_time_ids -class OVStableDiffusionXLImg2ImgPipeline(OVStableDiffusionXLPipelineBase, StableDiffusionXLImg2ImgPipelineMixin): - auto_model_class = StableDiffusionXLImg2ImgPipeline +class OVLatentConsistencyModelPipeline(OVDiffusionPipeline, LatentConsistencyModelPipeline): + """ + OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). + """ + + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = LatentConsistencyModelPipeline + + +class OVLatentConsistencyModelImg2ImgPipeline(OVDiffusionPipeline, LatentConsistencyModelImg2ImgPipeline): + """ + OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency_img2img#diffusers.LatentConsistencyModelImg2ImgPipeline). 
+ """ + + main_input_name = "image" export_feature = "image-to-image" + auto_model_class = LatentConsistencyModelImg2ImgPipeline + + +SUPPORTED_OV_PIPELINES = [ + OVStableDiffusionPipeline, + OVStableDiffusionImg2ImgPipeline, + OVStableDiffusionInpaintPipeline, + OVStableDiffusionXLPipeline, + OVStableDiffusionXLImg2ImgPipeline, + OVStableDiffusionXLInpaintPipeline, + OVLatentConsistencyModelPipeline, + OVLatentConsistencyModelImg2ImgPipeline, +] + + +def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): + for ov_pipeline_class in SUPPORTED_OV_PIPELINES: + if ( + ov_pipeline_class.__name__ == pipeline_class_name + or ov_pipeline_class.auto_model_class.__name__ == pipeline_class_name + ): + return ov_pipeline_class + + if throw_error_if_not_exist: + raise ValueError(f"OVDiffusionPipeline can't find a pipeline linked to {pipeline_class_name}") + + +OV_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", OVStableDiffusionPipeline), + ("stable-diffusion-xl", OVStableDiffusionXLPipeline), + ("latent-consistency", OVLatentConsistencyModelPipeline), + ] +) - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - image: Union[np.ndarray, PIL.Image.Image] = None, - strength: float = 0.3, - num_inference_steps: int = 50, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - **kwargs, - ): - _height = self.height - _width = self.width - expected_batch_size = self._batch_size - - if _height != -1 and _width != -1: - image = self.image_processor.preprocess(image, height=_height, width=_width).transpose(0, 2, 3, 1) - - if expected_batch_size != -1: - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = kwargs.get("prompt_embeds").shape[0] - - _raise_invalid_batch_size(expected_batch_size, batch_size, num_images_per_prompt, guidance_scale) - - return StableDiffusionXLImg2ImgPipelineMixin.__call__( - self, - prompt=prompt, - image=image, - strength=strength, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - **kwargs, - ) +OV_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", OVStableDiffusionImg2ImgPipeline), + ("stable-diffusion-xl", OVStableDiffusionXLImg2ImgPipeline), + ("latent-consistency", OVLatentConsistencyModelImg2ImgPipeline), + ] +) +OV_INPAINT_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", OVStableDiffusionInpaintPipeline), + ("stable-diffusion-xl", OVStableDiffusionXLInpaintPipeline), + ] +) -class OVLatentConsistencyModelPipeline(OVStableDiffusionPipelineBase, LatentConsistencyPipelineMixin): - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 4, - original_inference_steps: int = None, - guidance_scale: float = 8.5, - num_images_per_prompt: int = 1, - **kwargs, - ): - height = height or self.unet.config["sample_size"] * self.vae_scale_factor - width = width or self.unet.config["sample_size"] * self.vae_scale_factor - _height = self.height - _width = self.width - expected_batch_size = self._batch_size +SUPPORTED_OV_PIPELINES_MAPPINGS = [ + OV_TEXT2IMAGE_PIPELINES_MAPPING, + OV_IMAGE2IMAGE_PIPELINES_MAPPING, + OV_INPAINT_PIPELINES_MAPPING, +] - if _height != -1 and height != _height: - logger.warning( - f"`height` was set to 
{height} but the static model will output images of height {_height}." - "To fix the height, please reshape your model accordingly using the `.reshape()` method." - ) - height = _height - if _width != -1 and width != _width: - logger.warning( - f"`width` was set to {width} but the static model will output images of width {_width}." - "To fix the width, please reshape your model accordingly using the `.reshape()` method." - ) - width = _width +def _get_task_ov_class(mapping, pipeline_class_name): + def _get_model_name(pipeline_class_name): + for ov_pipelines_mapping in SUPPORTED_OV_PIPELINES_MAPPINGS: + for model_name, ov_pipeline_class in ov_pipelines_mapping.items(): + if ( + ov_pipeline_class.__name__ == pipeline_class_name + or ov_pipeline_class.auto_model_class.__name__ == pipeline_class_name + ): + return model_name - if expected_batch_size != -1: - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = kwargs.get("prompt_embeds").shape[0] - - _raise_invalid_batch_size(expected_batch_size, batch_size, num_images_per_prompt, guidance_scale=0.0) - - return LatentConsistencyPipelineMixin.__call__( - self, - prompt=prompt, - height=height, - width=width, - num_inference_steps=num_inference_steps, - original_inference_steps=original_inference_steps, - guidance_scale=guidance_scale, - num_images_per_prompt=num_images_per_prompt, - **kwargs, - ) + model_name = _get_model_name(pipeline_class_name) - def run_safety_checker(self, image: np.ndarray): - if self.safety_checker is None: - has_nsfw_concept = None - else: - # Transpose the image to NHWC - image = image.transpose(0, 2, 3, 1) + if model_name is not None: + task_class = mapping.get(model_name, None) + if task_class is not None: + return task_class - feature_extractor_input = self.image_processor.numpy_to_pil(image) - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt") - image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_checker_input.pixel_values) + raise ValueError(f"OVPipelineForTask can't find a pipeline linked to {pipeline_class_name} for {model_name}") - # Transpose the image back to NCHW - image = image.transpose(0, 3, 1, 2) - return image, has_nsfw_concept +class OVPipelineForTask(ConfigMixin): + auto_model_class = DiffusionPipeline + config_name = "model_index.json" + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_or_path, **kwargs): + load_config_kwargs = { + "force_download": kwargs.get("force_download", False), + "resume_download": kwargs.get("resume_download", None), + "local_files_only": kwargs.get("local_files_only", False), + "cache_dir": kwargs.get("cache_dir", None), + "revision": kwargs.get("revision", None), + "proxies": kwargs.get("proxies", None), + "token": kwargs.get("token", None), + } + config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + config = config[0] if isinstance(config, tuple) else config + class_name = config["_class_name"] -def _raise_invalid_batch_size( - expected_batch_size: int, batch_size: int, num_images_per_prompt: int, guidance_scale: float -): - current_batch_size = batch_size * num_images_per_prompt * (1 if guidance_scale <= 1 else 2) + ov_pipeline_class = _get_task_ov_class(cls.ov_pipelines_mapping, class_name) - if expected_batch_size != current_batch_size: - msg = "" - if guidance_scale is not None and guidance_scale <= 1: - msg = f"`guidance_scale` was set to {guidance_scale}, static shapes are 
currently only supported for `guidance_scale` > 1 " + return ov_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) - raise ValueError( - "The model was statically reshaped and the pipeline inputs do not match the expected shapes. " - f"The `batch_size`, `num_images_per_prompt` and `guidance_scale` were respectively set to {batch_size}, {num_images_per_prompt} and {guidance_scale}. " - f"The static model expects an input of size equal to {expected_batch_size} and got the following value instead : {current_batch_size}. " - f"To fix this, please either provide a different inputs to your model so that `batch_size` * `num_images_per_prompt` * 2 is equal to {expected_batch_size} " - "or reshape it again accordingly using the `.reshape()` method by setting `batch_size` to -1. " + msg - ) + +class OVPipelineForText2Image(OVPipelineForTask): + auto_model_class = AutoPipelineForText2Image + ov_pipelines_mapping = OV_TEXT2IMAGE_PIPELINES_MAPPING + + +class OVPipelineForImage2Image(OVPipelineForTask): + auto_model_class = AutoPipelineForImage2Image + ov_pipelines_mapping = OV_IMAGE2IMAGE_PIPELINES_MAPPING + + +class OVPipelineForInpainting(OVPipelineForTask): + auto_model_class = AutoPipelineForInpainting + ov_pipelines_mapping = OV_INPAINT_PIPELINES_MAPPING diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 6395880e4..1ad75477c 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -314,7 +314,7 @@ def _quantize_ovbasemodel( **kwargs, ): if is_diffusers_available(): - from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipelineBase + from optimum.intel.openvino.modeling_diffusion import OVDiffusionPipeline if save_directory is not None: save_directory = Path(save_directory) @@ -324,7 +324,7 @@ def _quantize_ovbasemodel( if calibration_dataset is not None: # Process custom calibration dataset - if is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): + if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline): calibration_dataset = self._prepare_unet_dataset( quantization_config.num_samples, dataset=calibration_dataset ) @@ -361,7 +361,7 @@ def _quantize_ovbasemodel( if isinstance(self.model, OVModelForCausalLM): calibration_dataset = self._prepare_causal_lm_dataset(quantization_config) - elif is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): + elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline): if not isinstance(quantization_config.dataset, str): raise ValueError("Please provide dataset as one of the accepted dataset labels.") calibration_dataset = self._prepare_unet_dataset( @@ -375,7 +375,7 @@ def _quantize_ovbasemodel( if quantization_config.quant_method == OVQuantizationMethod.HYBRID: if calibration_dataset is None: raise ValueError("Calibration dataset is required to run hybrid quantization.") - if is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): + if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline): # Apply weight-only quantization to all SD submodels except UNet quantization_config_copy = copy.deepcopy(quantization_config) quantization_config_copy.dataset = None @@ -395,7 +395,7 @@ def _quantize_ovbasemodel( self.model.model = _hybrid_quantization(self.model.model, quantization_config, calibration_dataset) self.model.request = None else: - if is_diffusers_available() and isinstance(self.model, 
OVStableDiffusionPipelineBase): + if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline): sub_model_names = ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2", "unet"] sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) for sub_model in sub_models: diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 3426abd5f..4e8033880 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -207,6 +207,40 @@ def _print_compiled_model_properties(compiled_model): logger.error("[error] Get FULL_DEVICE_NAME failed") +def np_to_pt_generators(np_object, device): + if isinstance(np_object, np.random.RandomState): + return torch.Generator(device=device).manual_seed(int(np_object.get_state()[1][0])) + elif isinstance(np_object, np.random.Generator): + return torch.Generator(device=device).manual_seed(int(np_object.bit_generator.state[1][0])) + elif isinstance(np_object, list) and isinstance(np_object[0], (np.random.RandomState, np.random.Generator)): + return [np_to_pt_generators(a, device) for a in np_object] + elif isinstance(np_object, dict) and isinstance( + next(iter(np_object.values())), (np.random.RandomState, np.random.Generator) + ): + return {k: np_to_pt_generators(v, device) for k, v in np_object.items()} + else: + return np_object + + +def _raise_invalid_batch_size( + expected_batch_size: int, batch_size: int, num_images_per_prompt: int, guidance_scale: float +): + current_batch_size = batch_size * num_images_per_prompt * (1 if guidance_scale <= 1 else 2) + + if expected_batch_size != current_batch_size: + msg = "" + if guidance_scale is not None and guidance_scale <= 1: + msg = f"`guidance_scale` was set to {guidance_scale}, static shapes are currently only supported for `guidance_scale` > 1 " + + raise ValueError( + "The model was statically reshaped and the pipeline inputs do not match the expected shapes. " + f"The `batch_size`, `num_images_per_prompt` and `guidance_scale` were respectively set to {batch_size}, {num_images_per_prompt} and {guidance_scale}. " + f"The static model expects an input of size equal to {expected_batch_size} and got the following value instead : {current_batch_size}. " + f"To fix this, please either provide a different inputs to your model so that `batch_size` * `num_images_per_prompt` * 2 is equal to {expected_batch_size} " + "or reshape it again accordingly using the `.reshape()` method by setting `batch_size` to -1. 
" + msg + ) + + def get_export_transformers_version(model, config): version_str = None diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py index 78016ea71..6ded4fd5d 100644 --- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py @@ -70,6 +70,17 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) +class OVStableDiffusionXLInpaintPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) + + class OVLatentConsistencyModelPipeline(metaclass=DummyObject): _backends = ["openvino", "diffusers"] @@ -79,3 +90,58 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) + + +class OVLatentConsistencyModelImg2ImgPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) + + +class OVDiffusionPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) + + +class OVPipelineForText2Image(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) + + +class OVPipelineForImage2Image(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) + + +class OVPipelineForInpainting(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) diff --git a/setup.py b/setup.py index 24a75b5f6..61eac1d79 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=1.11", "transformers>=4.36,<4.46", - "optimum @ git+https://github.com/huggingface/optimum.git", + "optimum@git+https://github.com/huggingface/optimum.git@update-diffusers-mixins", "datasets>=1.4.0", "sentencepiece", "setuptools", diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 6248d3eda..6271ff3e4 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -12,52 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import random -import tempfile import unittest -from typing import Dict import numpy as np -import PIL -import pytest import torch from diffusers import ( - StableDiffusionPipeline, - StableDiffusionXLImg2ImgPipeline, - StableDiffusionXLPipeline, + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + DiffusionPipeline, ) +from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from diffusers.utils import load_image -from diffusers.utils.testing_utils import floats_tensor -from openvino.runtime.ie_api import CompiledModel from parameterized import parameterized -from transformers.testing_utils import slow -from utils_tests import MODEL_NAMES - -from optimum.intel import ( - OVLatentConsistencyModelPipeline, - OVStableDiffusionImg2ImgPipeline, - OVStableDiffusionInpaintPipeline, - OVStableDiffusionPipeline, - OVStableDiffusionXLImg2ImgPipeline, - OVStableDiffusionXLPipeline, -) -from optimum.intel.openvino.modeling_diffusion import ( - OVModelTextEncoder, - OVModelUnet, - OVModelVaeDecoder, - OVModelVaeEncoder, -) -from optimum.intel.utils.import_utils import is_diffusers_version -from optimum.utils.import_utils import is_onnxruntime_available +from utils_tests import MODEL_NAMES, SEED - -F32_CONFIG = {"INFERENCE_PRECISION_HINT": "f32"} +from optimum.intel.openvino import ( + OVDiffusionPipeline, + OVPipelineForImage2Image, + OVPipelineForInpainting, + OVPipelineForText2Image, +) +from optimum.utils.testing_utils import require_diffusers -SEED = 0 +def get_generator(framework, seed): + if framework == "np": + return np.random.RandomState(seed) + elif framework == "pt": + return torch.Generator().manual_seed(seed) + else: + raise ValueError(f"Unknown framework: {framework}") -def _generate_inputs(batch_size=1): +def _generate_prompts(batch_size=1): inputs = { "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, "num_inference_steps": 3, @@ -67,7 +55,7 @@ def _generate_inputs(batch_size=1): return inputs -def _create_image(height=128, width=128, batch_size=1, channel=3, input_type="pil"): +def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type="pil"): if input_type == "pil": image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" @@ -81,537 +69,611 @@ def _create_image(height=128, width=128, batch_size=1, channel=3, input_type="pi return [image] * batch_size -def to_np(image): - if isinstance(image[0], PIL.Image.Image): - return np.stack([np.array(i) for i in image], axis=0) - elif isinstance(image, torch.Tensor): - return image.cpu().numpy().transpose(0, 2, 3, 1) - return image +class OVPipelineForText2ImageTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + OVMODEL_CLASS = OVPipelineForText2Image + AUTOMODEL_CLASS = AutoPipelineForText2Image -class OVStableDiffusionPipelineBaseTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ("stable-diffusion",) - MODEL_CLASS = OVStableDiffusionPipeline TASK = "text-to-image" + def generate_inputs(self, height=128, width=128, batch_size=1): + inputs = _generate_prompts(batch_size=batch_size) + + inputs["height"] = height + inputs["width"] = width + + return inputs + + @require_diffusers + def test_load_vanilla_model_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn(f"does not appear to have a file named 
{self.OVMODEL_CLASS.config_name}", str(context.exception)) + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_ov_pipeline_class_dispatch(self, model_arch: str): + auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__) + + auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) + ov_pipeline = OVDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) + + self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers def test_num_images_per_prompt(self, model_arch: str): - model_id = MODEL_NAMES[model_arch] - pipeline = self.MODEL_CLASS.from_pretrained(model_id, compile=False) - pipeline.to("cpu") - pipeline.compile() - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - batch_size, height = 2, 128 - for width in [64, 128]: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_compare_to_diffusers_pipeline(self, model_arch: str): + height, width, batch_size = 128, 128, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type + + ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.run_slow - @slow + @require_diffusers def test_callback(self, model_arch: str): - MODEL_NAMES[model_arch] + height, width, batch_size = 64, 128, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + class Callback: + def __init__(self): + self.has_been_called = False + self.number_of_steps = 0 + + def __call__(self, *args, **kwargs) -> None: + self.has_been_called = True + self.number_of_steps += 1 + + ov_callback = Callback() + auto_callback = Callback() - def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: - callback_fn.has_been_called = True - callback_fn.number_of_steps += 1 + ov_pipe = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - pipeline = 
self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - callback_fn.has_been_called = False - callback_fn.number_of_steps = 0 - inputs = self.generate_inputs(height=64, width=64) - pipeline(**inputs, callback=callback_fn, callback_steps=1) - self.assertTrue(callback_fn.has_been_called) - self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"]) + # callback_steps=1 to trigger callback every step + ov_pipe(**inputs, callback=ov_callback, callback_steps=1) + auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + self.assertTrue(ov_callback.has_been_called) + self.assertTrue(auto_callback.has_been_called) + self.assertEqual(auto_callback.number_of_steps, ov_callback.number_of_steps) @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers def test_shape(self, model_arch: str): + pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + height, width, batch_size = 128, 64, 1 - pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + for output_type in ["pil", "np", "pt", "latent"]: + inputs["output_type"] = output_type + outputs = pipeline(**inputs).images + if output_type == "pil": + self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) + elif output_type == "np": + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) + else: + self.assertEqual( + outputs.shape, + (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_image_reproducibility(self, model_arch: str): + pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + for generator_framework in ["np", "pt"]: + ov_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ov_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ov_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - if self.TASK == "image-to-image": - input_types = ["np", "pil", "pt"] - elif self.TASK == "text-to-image": - input_types = ["np"] + self.assertFalse(np.array_equal(ov_outputs_1.images[0], ov_outputs_3.images[0])) + np.testing.assert_allclose(ov_outputs_1.images[0], ov_outputs_2.images[0], atol=1e-4, rtol=1e-2) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_negative_prompt(self, model_arch: str): + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + negative_prompt = ["This is a negative prompt"] + pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + images_1 = pipeline(**inputs, negative_prompt=negative_prompt, generator=get_generator("pt", SEED)).images + prompt = inputs.pop("prompt") + + if model_arch == "stable-diffusion-xl": + ( + inputs["prompt_embeds"], + inputs["negative_prompt_embeds"], + inputs["pooled_prompt_embeds"], + inputs["negative_pooled_prompt_embeds"], + ) = pipeline.encode_prompt( + prompt=prompt, + num_images_per_prompt=1, + device=torch.device("cpu"), + do_classifier_free_guidance=True, + negative_prompt=negative_prompt, + ) else: - input_types = ["pil"] 
+ inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt( + prompt=prompt, + num_images_per_prompt=1, + device=torch.device("cpu"), + do_classifier_free_guidance=True, + negative_prompt=negative_prompt, + ) - for input_type in input_types: - if self.TASK == "image-to-image": - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) - else: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for output_type in ["np", "pil", "latent"]: + images_2 = pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(images_1, images_2, atol=1e-4, rtol=1e-2) + + @parameterized.expand(["stable-diffusion", "latent-consistency"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ov_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ov_nsfw_content_detected = ov_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + + self.assertTrue(ov_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ov_nsfw_content_detected, diffusers_nsfw_content_detected) + + ov_images = ov_output.images + diffusers_images = diffusers_output.images + + np.testing.assert_allclose(ov_images, diffusers_images, atol=1e-4, rtol=1e-2) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_height_width_properties(self, model_arch: str): + batch_size, height, width, num_images_per_prompt = 2, 128, 64, 4 + ov_pipeline = self.OVMODEL_CLASS.from_pretrained( + MODEL_NAMES[model_arch], export=True, compile=False, dynamic_shapes=True + ) + + self.assertTrue(ov_pipeline.is_dynamic) + self.assertEqual(ov_pipeline.batch_size, -1) + self.assertEqual(ov_pipeline.height, -1) + self.assertEqual(ov_pipeline.width, -1) + + ov_pipeline.reshape( + batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images_per_prompt + ) + + self.assertFalse(ov_pipeline.is_dynamic) + self.assertEqual( + ov_pipeline.batch_size, + batch_size + * num_images_per_prompt + * (2 if "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs} else 1), + ) + self.assertEqual(ov_pipeline.height, height) + self.assertEqual(ov_pipeline.width, width) + + +class OVPipelineForImage2ImageTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + + AUTOMODEL_CLASS = AutoPipelineForImage2Image + OVMODEL_CLASS = OVPipelineForImage2Image + + TASK = "image-to-image" + + def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"): + inputs = _generate_prompts(batch_size=batch_size) + + inputs["image"] = _generate_images( + height=height, width=width, 
batch_size=batch_size, channel=channel, input_type=input_type + ) + + inputs["strength"] = 0.75 + + return inputs + + @require_diffusers + def test_load_vanilla_model_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn(f"does not appear to have a file named {self.OVMODEL_CLASS.config_name}", str(context.exception)) + + @parameterized.expand(list(SUPPORTED_ARCHITECTURES)) + @require_diffusers + def test_ov_pipeline_class_dispatch(self, model_arch: str): + auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_num_images_per_prompt(self, model_arch: str): + pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_callback(self, model_arch: str): + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + inputs["num_inference_steps"] = 3 + + class Callback: + def __init__(self): + self.has_been_called = False + self.number_of_steps = 0 + + def __call__(self, *args, **kwargs) -> None: + self.has_been_called = True + self.number_of_steps += 1 + + ov_pipe = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + ov_callback = Callback() + auto_callback = Callback() + # callback_steps=1 to trigger callback every step + ov_pipe(**inputs, callback=ov_callback, callback_steps=1) + auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + self.assertTrue(ov_callback.has_been_called) + self.assertEqual(ov_callback.number_of_steps, auto_callback.number_of_steps) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_shape(self, model_arch: str): + pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + height, width, batch_size = 128, 64, 1 + + for input_type in ["pil", "np", "pt"]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) + + for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type outputs = pipeline(**inputs).images if output_type == "pil": self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) elif output_type == "np": self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: self.assertEqual( outputs.shape, (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), ) - def generate_inputs(self, height=128, width=128, batch_size=1): - inputs = _generate_inputs(batch_size) - inputs["height"] = height - inputs["width"] = width - return inputs - - -class 
OVStableDiffusionImg2ImgPipelineTest(OVStableDiffusionPipelineBaseTest): - SUPPORTED_ARCHITECTURES = ("stable-diffusion",) - MODEL_CLASS = OVStableDiffusionImg2ImgPipeline - TASK = "image-to-image" - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_compare_diffusers_pipeline(self, model_arch: str): - model_id = MODEL_NAMES[model_arch] - pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + @require_diffusers + def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 128, 128, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - inputs["prompt"] = "A painting of a squirrel eating a burger" - inputs["image"] = floats_tensor((batch_size, 3, height, width), rng=random.Random(SEED)) - output = pipeline(**inputs, generator=np.random.RandomState(SEED)).images[0, -3:, -3:, -1].flatten() - # https://github.com/huggingface/diffusers/blob/v0.17.1/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py#L71 - expected_slice = np.array([0.66964, 0.61614, 0.48283, 0.57811, 0.55551, 0.55392, 0.53045, 0.41177, 0.46099]) - self.assertTrue( - np.allclose(output, expected_slice, atol=1e-1), - msg=f"Max difference: {np.abs(output - expected_slice).max()}. Actual value: {output}", - ) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.run_slow - @slow - def test_num_images_per_prompt_static_model(self, model_arch: str): - model_id = MODEL_NAMES[model_arch] - pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False, dynamic_shapes=False) - batch_size, num_images, height, width = 2, 3, 128, 64 - pipeline.half() - pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images) - for _height in [height, height + 16]: - inputs = self.generate_inputs(height=_height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - - def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): - inputs = _generate_inputs(batch_size) - inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type) - inputs["strength"] = 0.75 - return inputs + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) -class OVStableDiffusionPipelineTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ("stable-diffusion",) - MODEL_CLASS = OVStableDiffusionPipeline - TASK = "text-to-image" + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_compare_to_diffusers(self, model_arch: str): - model_id = MODEL_NAMES[model_arch] - ov_pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) - self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) - self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) - self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder) - self.assertIsInstance(ov_pipeline.unet, OVModelUnet) - self.assertIsInstance(ov_pipeline.config, Dict) - - pipeline = StableDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - pipeline.safety_checker = None - batch_size, num_images_per_prompt, height, width = 1, 2, 64, 64 - - latents = ov_pipeline.prepare_latents( - batch_size * 
num_images_per_prompt, - ov_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) + ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - kwargs = { - "prompt": "sailing ship in storm by Leonardo da Vinci", - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_rescale": 0.1, - } - - for output_type in ["latent", "np"]: - ov_outputs = ov_pipeline(latents=latents, output_type=output_type, **kwargs).images - self.assertIsInstance(ov_outputs, np.ndarray) - with torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - # Compare model outputs - self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) - - # Compare model devices - self.assertEqual(pipeline.device, ov_pipeline.device) + np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers def test_image_reproducibility(self, model_arch: str): - model_id = MODEL_NAMES[model_arch] - pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True) - inputs = _generate_inputs() - height, width = 64, 64 - ov_outputs_1 = pipeline(**inputs, height=height, width=width, generator=np.random.RandomState(SEED)) - ov_outputs_2 = pipeline(**inputs, height=height, width=width, generator=np.random.RandomState(SEED)) - ov_outputs_3 = pipeline(**inputs, height=height, width=width) - # Compare model outputs - self.assertTrue(np.array_equal(ov_outputs_1.images[0], ov_outputs_2.images[0])) - self.assertFalse(np.array_equal(ov_outputs_1.images[0], ov_outputs_3.images[0])) + pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.run_slow - @slow - def test_num_images_per_prompt_static_model(self, model_arch: str): - model_id = MODEL_NAMES[model_arch] - pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False) - batch_size, num_images, height, width = 3, 4, 128, 64 - pipeline.half() - pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images) - self.assertFalse(pipeline.is_dynamic) - pipeline.compile() - # Verify output shapes requirements not matching the static model doesn't impact the final outputs - for _height in [height, height + 16]: - inputs = _generate_inputs(batch_size) - outputs = pipeline(**inputs, num_images_per_prompt=num_images, height=_height, width=width).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_height_width_properties(self, model_arch: str): - model_id = MODEL_NAMES[model_arch] - batch_size, num_images, height, width = 2, 4, 128, 64 - pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False, dynamic_shapes=True) - self.assertTrue(pipeline.is_dynamic) - self.assertEqual(pipeline.height, -1) - self.assertEqual(pipeline.width, -1) - pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images) - self.assertFalse(pipeline.is_dynamic) - self.assertEqual(pipeline.height, height) - self.assertEqual(pipeline.width, width) 
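
The new `test_height_width_properties` added in this file asserts the batch size a statically reshaped UNet expects; a short worked sketch of that arithmetic, assuming classifier-free guidance is active (no `timestep_cond` input on the UNet) and using the same values as the test:

    # Expected static UNet batch size after pipeline.reshape(...), as enforced by
    # _raise_invalid_batch_size: prompt and negative-prompt latents are stacked
    # when guidance_scale > 1, doubling the batch; models exposing a timestep_cond
    # input (e.g. latent consistency) skip the doubling.
    batch_size = 2
    num_images_per_prompt = 4
    guidance_scale = 7.5  # > 1, so classifier-free guidance doubles the batch

    cfg_factor = 2 if guidance_scale > 1 else 1
    expected_unet_batch_size = batch_size * num_images_per_prompt * cfg_factor
    assert expected_unet_batch_size == 16
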
- - -class OVStableDiffusionInpaintPipelineTest(OVStableDiffusionPipelineBaseTest): - SUPPORTED_ARCHITECTURES = ("stable-diffusion",) - MODEL_CLASS = OVStableDiffusionInpaintPipeline - TASK = "inpaint" + for generator_framework in ["np", "pt"]: + ov_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ov_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ov_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @unittest.skipIf(not is_onnxruntime_available(), "this test requires onnxruntime") - def test_compare_diffusers_pipeline(self, model_arch: str): - from optimum.onnxruntime import ORTStableDiffusionInpaintPipeline - - model_id = MODEL_NAMES[model_arch] - pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) - batch_size, num_images, height, width = 1, 1, 64, 64 - latents = pipeline.prepare_latents( - batch_size * num_images, - pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) - inputs = self.generate_inputs(height=height, width=width) + self.assertFalse(np.array_equal(ov_outputs_1.images[0], ov_outputs_3.images[0])) + np.testing.assert_allclose(ov_outputs_1.images[0], ov_outputs_2.images[0], atol=1e-4, rtol=1e-2) - inputs["image"] = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ).resize((width, height)) + @parameterized.expand(["stable-diffusion", "latent-consistency"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") - inputs["mask_image"] = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" - ).resize((width, height)) + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) - outputs = pipeline(**inputs, latents=latents).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ov_pipeline.safety_checker, StableDiffusionSafetyChecker) - ort_pipeline = ORTStableDiffusionInpaintPipeline.from_pretrained(model_id, export=True) - ort_outputs = ort_pipeline(**inputs, latents=latents).images - self.assertTrue(np.allclose(outputs, ort_outputs, atol=1e-1)) + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - expected_slice = np.array([0.4692, 0.5260, 0.4005, 0.3609, 0.3259, 0.4676, 0.5593, 0.4728, 0.4411]) - self.assertTrue(np.allclose(outputs[0, -3:, -3:, -1].flatten(), expected_slice, atol=1e-1)) + ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.run_slow - @slow - def test_num_images_per_prompt_static_model(self, model_arch: str): - model_id = MODEL_NAMES[model_arch] - pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False, dynamic_shapes=False) - batch_size, num_images, 
height, width = 1, 3, 128, 64 - pipeline.half() - pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images) - for _height in [height, height + 16]: - inputs = self.generate_inputs(height=_height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + ov_nsfw_content_detected = ov_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected - def generate_inputs(self, height=128, width=128, batch_size=1): - inputs = super(OVStableDiffusionInpaintPipelineTest, self).generate_inputs(height, width, batch_size) - inputs["image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] - inputs["mask_image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] - return inputs + self.assertTrue(ov_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ov_nsfw_content_detected, diffusers_nsfw_content_detected) + ov_images = ov_output.images + diffusers_images = diffusers_output.images -class OVtableDiffusionXLPipelineTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ("stable-diffusion-xl",) - MODEL_CLASS = OVStableDiffusionXLPipeline - PT_MODEL_CLASS = StableDiffusionXLPipeline - TASK = "text-to-image" + np.testing.assert_allclose(ov_images, diffusers_images, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_compare_to_diffusers(self, model_arch: str): - ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True, ov_config=F32_CONFIG) - self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) - self.assertIsInstance(ov_pipeline.text_encoder_2, OVModelTextEncoder) - self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) - self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder) - self.assertIsInstance(ov_pipeline.unet, OVModelUnet) - self.assertIsInstance(ov_pipeline.config, Dict) - - pipeline = self.PT_MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - batch_size, num_images_per_prompt, height, width = 2, 3, 64, 128 - latents = ov_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ov_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), + def test_height_width_properties(self, model_arch: str): + batch_size, height, width, num_images_per_prompt = 2, 128, 64, 4 + ov_pipeline = self.OVMODEL_CLASS.from_pretrained( + MODEL_NAMES[model_arch], export=True, compile=False, dynamic_shapes=True ) - kwargs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_rescale": 0.1, - } + self.assertTrue(ov_pipeline.is_dynamic) + self.assertEqual(ov_pipeline.batch_size, -1) + self.assertEqual(ov_pipeline.height, -1) + self.assertEqual(ov_pipeline.width, -1) - for output_type in ["latent", "np"]: - ov_outputs = ov_pipeline(latents=latents, output_type=output_type, **kwargs).images + ov_pipeline.reshape( + batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images_per_prompt + ) - self.assertIsInstance(ov_outputs, np.ndarray) - with torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images + 
self.assertFalse(ov_pipeline.is_dynamic) + self.assertEqual( + ov_pipeline.batch_size, + batch_size + * num_images_per_prompt + * (2 if "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs} else 1), + ) + self.assertEqual(ov_pipeline.height, height) + self.assertEqual(ov_pipeline.width, width) - # Compare model outputs - self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) - # Compare model devices - self.assertEqual(pipeline.device, ov_pipeline.device) - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_image_reproducibility(self, model_arch: str): - pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - - # Verify every subcomponent is compiled by default - for component in {"unet", "vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"}: - self.assertIsInstance(getattr(pipeline, component).request, CompiledModel) - - batch_size, num_images_per_prompt, height, width = 2, 3, 64, 128 - inputs = _generate_inputs(batch_size) - ov_outputs_1 = pipeline( - **inputs, - height=height, - width=width, - num_images_per_prompt=num_images_per_prompt, - generator=np.random.RandomState(SEED), +class OVPipelineForInpaintingTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] + + AUTOMODEL_CLASS = AutoPipelineForInpainting + OVMODEL_CLASS = OVPipelineForInpainting + + TASK = "inpainting" + + def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"): + inputs = _generate_prompts(batch_size=batch_size) + + inputs["image"] = _generate_images( + height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type ) - with tempfile.TemporaryDirectory() as tmp_dir: - pipeline.save_pretrained(tmp_dir) - pipeline = self.MODEL_CLASS.from_pretrained(tmp_dir) - ov_outputs_2 = pipeline( - **inputs, - height=height, - width=width, - num_images_per_prompt=num_images_per_prompt, - generator=np.random.RandomState(SEED), + inputs["mask_image"] = _generate_images( + height=height, width=width, batch_size=batch_size, channel=1, input_type=input_type ) - ov_outputs_3 = pipeline(**inputs, height=height, width=width, num_images_per_prompt=num_images_per_prompt) - self.assertTrue(np.array_equal(ov_outputs_1.images[0], ov_outputs_2.images[0])) - self.assertFalse(np.array_equal(ov_outputs_1.images[0], ov_outputs_3.images[0])) + + inputs["strength"] = 0.75 + inputs["height"] = height + inputs["width"] = width + + return inputs + + @require_diffusers + def test_load_vanilla_model_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn(f"does not appear to have a file named {self.OVMODEL_CLASS.config_name}", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.run_slow - @slow - def test_num_images_per_prompt_static_model(self, model_arch: str): - model_id = MODEL_NAMES[model_arch] - pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False) - batch_size, num_images, height, width = 3, 4, 128, 64 - pipeline.half() - pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images) - self.assertFalse(pipeline.is_dynamic) - pipeline.compile() - - for _height in [height, height + 16]: - inputs = _generate_inputs(batch_size) - outputs = pipeline(**inputs, num_images_per_prompt=num_images, height=_height, width=width).images - 
self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - - -class OVStableDiffusionXLImg2ImgPipelineTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ("stable-diffusion-xl", "stable-diffusion-xl-refiner") - MODEL_CLASS = OVStableDiffusionXLImg2ImgPipeline - PT_MODEL_CLASS = StableDiffusionXLImg2ImgPipeline - TASK = "image-to-image" + @require_diffusers + def test_ov_pipeline_class_dispatch(self, model_arch: str): + auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - def test_inference(self): - model_id = "hf-internal-testing/tiny-stable-diffusion-xl-pipe" - pipeline = self.MODEL_CLASS.from_pretrained(model_id, ov_config=F32_CONFIG) + self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__) - with tempfile.TemporaryDirectory() as tmp_dir: - pipeline.save_pretrained(tmp_dir) - pipeline = self.MODEL_CLASS.from_pretrained(tmp_dir, ov_config=F32_CONFIG) + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_num_images_per_prompt(self, model_arch: str): + pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - batch_size, height, width = 1, 128, 128 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - inputs["image"] = floats_tensor((batch_size, 3, height, width), rng=random.Random(SEED)) - output = pipeline(**inputs, generator=np.random.RandomState(SEED)).images[0, -3:, -3:, -1] - expected_slice = np.array([0.5747, 0.5182, 0.4857, 0.5295, 0.5106, 0.5520, 0.4814, 0.4289, 0.4868]) - self.assertTrue(np.allclose(output.flatten(), expected_slice, atol=1e-3)) + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.run_slow - @slow - def test_num_images_per_prompt_static_model(self, model_arch: str): - model_id = MODEL_NAMES[model_arch] - pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False, dynamic_shapes=False) - batch_size, num_images, height, width = 2, 3, 128, 64 - pipeline.half() - pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images) - for _height in [height, height + 16]: - inputs = self.generate_inputs(height=_height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - - def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): - inputs = _generate_inputs(batch_size) - inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type) - inputs["strength"] = 0.75 - return inputs + @require_diffusers + def test_callback(self, model_arch: str): + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + inputs["num_inference_steps"] = 3 + class Callback: + def __init__(self): + self.has_been_called = False + self.number_of_steps = 0 -class OVLatentConsistencyModelPipelineTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ("latent-consistency",) - MODEL_CLASS = 
OVLatentConsistencyModelPipeline - TASK = "text-to-image" + def __call__(self, *args, **kwargs) -> None: + self.has_been_called = True + self.number_of_steps += 1 + + ov_pipe = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + ov_callback = Callback() + auto_callback = Callback() + # callback_steps=1 to trigger callback every step + ov_pipe(**inputs, callback=ov_callback, callback_steps=1) + auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + self.assertTrue(ov_callback.has_been_called) + self.assertEqual(ov_callback.number_of_steps, auto_callback.number_of_steps) @parameterized.expand(SUPPORTED_ARCHITECTURES) - @unittest.skipIf(is_diffusers_version("<=", "0.21.4"), "not supported with this diffusers version") - def test_compare_to_diffusers(self, model_arch: str): - ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True, ov_config=F32_CONFIG) - self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) - self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) - self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder) - self.assertIsInstance(ov_pipeline.unet, OVModelUnet) - self.assertIsInstance(ov_pipeline.config, Dict) - - from diffusers import LatentConsistencyModelPipeline - - pipeline = LatentConsistencyModelPipeline.from_pretrained(MODEL_NAMES[model_arch]) - batch_size, num_images_per_prompt, height, width = 2, 3, 64, 128 - latents = ov_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ov_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) + @require_diffusers + def test_shape(self, model_arch: str): + pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - kwargs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_scale": 8.5, - } - - for output_type in ["latent", "np"]: - ov_outputs = ov_pipeline(latents=latents, output_type=output_type, **kwargs).images - self.assertIsInstance(ov_outputs, np.ndarray) - with torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - # Compare model outputs - self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) - # Compare model devices - self.assertEqual(pipeline.device, ov_pipeline.device) + height, width, batch_size = 128, 64, 1 + + for input_type in ["pil", "np", "pt"]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) + + for output_type in ["pil", "np", "pt", "latent"]: + inputs["output_type"] = output_type + outputs = pipeline(**inputs).images + if output_type == "pil": + self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) + elif output_type == "np": + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) + else: + self.assertEqual( + outputs.shape, + (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ) @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.run_slow - @slow - @unittest.skipIf(is_diffusers_version("<=", "0.21.4"), "not supported with this diffusers version") - def 
test_num_images_per_prompt_static_model(self, model_arch: str): - model_id = MODEL_NAMES[model_arch] - pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False, dynamic_shapes=False) - batch_size, num_images, height, width = 3, 4, 128, 64 - pipeline.half() - pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images) - self.assertFalse(pipeline.is_dynamic) - pipeline.compile() - - for _height in [height, height + 16]: - inputs = _generate_inputs(batch_size) - outputs = pipeline(**inputs, num_images_per_prompt=num_images, height=_height, width=width).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + @require_diffusers + def test_compare_to_diffusers_pipeline(self, model_arch: str): + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type + + ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) - @unittest.skipIf(is_diffusers_version("<=", "0.21.4"), "not supported with this diffusers version") - def test_safety_checker(self, model_arch: str): - ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True, ov_config=F32_CONFIG) - self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) - self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) - self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder) - self.assertIsInstance(ov_pipeline.unet, OVModelUnet) - self.assertIsInstance(ov_pipeline.config, Dict) + @require_diffusers + def test_image_reproducibility(self, model_arch: str): + pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - from diffusers import LatentConsistencyModelPipeline - from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker + for generator_framework in ["np", "pt"]: + ov_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ov_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ov_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) + self.assertFalse(np.array_equal(ov_outputs_1.images[0], ov_outputs_3.images[0])) + np.testing.assert_allclose(ov_outputs_1.images[0], ov_outputs_2.images[0], atol=1e-4, rtol=1e-2) + + @parameterized.expand(["stable-diffusion"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") - pipeline = LatentConsistencyModelPipeline.from_pretrained( - MODEL_NAMES[model_arch], safety_checker=safety_checker + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + + self.assertIsInstance(pipeline.safety_checker, 
StableDiffusionSafetyChecker) + self.assertIsInstance(ov_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ov_nsfw_content_detected = ov_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + + self.assertTrue(ov_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ov_nsfw_content_detected, diffusers_nsfw_content_detected) + + ov_images = ov_output.images + diffusers_images = diffusers_output.images + + np.testing.assert_allclose(ov_images, diffusers_images, atol=1e-4, rtol=1e-2) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_height_width_properties(self, model_arch: str): + batch_size, height, width, num_images_per_prompt = 2, 128, 64, 4 + ov_pipeline = self.OVMODEL_CLASS.from_pretrained( + MODEL_NAMES[model_arch], export=True, compile=False, dynamic_shapes=True ) - batch_size, num_images_per_prompt, height, width = 2, 3, 64, 128 - latents = ov_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ov_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), + self.assertTrue(ov_pipeline.is_dynamic) + self.assertEqual(ov_pipeline.batch_size, -1) + self.assertEqual(ov_pipeline.height, -1) + self.assertEqual(ov_pipeline.width, -1) + + ov_pipeline.reshape( + batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images_per_prompt ) - kwargs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_scale": 8.5, - } - - for output_type in ["latent", "np"]: - ov_outputs = ov_pipeline(latents=latents, output_type=output_type, **kwargs).images - self.assertIsInstance(ov_outputs, np.ndarray) - with torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - # Compare model outputs - self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) - # Compare model devices - self.assertEqual(pipeline.device, ov_pipeline.device) + self.assertFalse(ov_pipeline.is_dynamic) + self.assertEqual( + ov_pipeline.batch_size, + batch_size + * num_images_per_prompt + * (2 if "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs} else 1), + ) + self.assertEqual(ov_pipeline.height, height) + self.assertEqual(ov_pipeline.width, width) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index ea7245344..33bfb55dc 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -272,8 +272,12 @@ def test_load_from_hub_and_save_stable_diffusion_model(self): "num_inference_steps": 2, "output_type": "np", } - pipeline_outputs = loaded_pipeline(**inputs, generator=np.random.RandomState(SEED)).images + + np.random.seed(0) + torch.manual_seed(0) + pipeline_outputs = loaded_pipeline(**inputs).images self.assertEqual(pipeline_outputs.shape, (batch_size, height, width, 3)) + with tempfile.TemporaryDirectory() as tmpdirname: loaded_pipeline.save_pretrained(tmpdirname) pipeline = OVStableDiffusionPipeline.from_pretrained(tmpdirname) @@ -294,12 
+298,17 @@ def test_load_from_hub_and_save_stable_diffusion_model(self): self.assertIsInstance(compile_only_pipeline.text_encoder.model, ov.runtime.CompiledModel) self.assertIsInstance(compile_only_pipeline.vae_encoder.model, ov.runtime.CompiledModel) self.assertIsInstance(compile_only_pipeline.vae_decoder.model, ov.runtime.CompiledModel) - outputs = compile_only_pipeline(**inputs, generator=np.random.RandomState(SEED)).images - self.assertTrue(np.array_equal(pipeline_outputs, outputs)) + + np.random.seed(0) + torch.manual_seed(0) + outputs = compile_only_pipeline(**inputs).images + np.testing.assert_allclose(pipeline_outputs, outputs, atol=1e-4, rtol=1e-4) del compile_only_pipeline - outputs = pipeline(**inputs, generator=np.random.RandomState(SEED)).images - self.assertTrue(np.array_equal(pipeline_outputs, outputs)) + np.random.seed(0) + torch.manual_seed(0) + outputs = pipeline(**inputs).images + np.testing.assert_allclose(pipeline_outputs, outputs, atol=1e-4, rtol=1e-4) del pipeline gc.collect() From 41d93a17ea4dfd0880f80ac25925e51e3622ac20 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 9 Oct 2024 14:38:28 +0200 Subject: [PATCH 02/53] fix (#929) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 61eac1d79..6b3ddc876 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=1.11", "transformers>=4.36,<4.46", - "optimum@git+https://github.com/huggingface/optimum.git@update-diffusers-mixins", + "optimum@git+https://github.com/huggingface/optimum.git", "datasets>=1.4.0", "sentencepiece", "setuptools", From cb5208c326188422d5a3a2538021ede2f02b27b8 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Wed, 9 Oct 2024 18:29:06 +0200 Subject: [PATCH 03/53] Add $ to the end of filename regex patterns (#931) And add an unrelated test for a model with modeling files in a subfolder and configuration files in the root. 
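For illustration only (not part of the original patch), a minimal sketch of what the trailing "$" changes, assuming the pattern is applied with a substring search such as re.search; the file names below are hypothetical:

    import re

    # Pattern used in modeling_base.py, before and after this change.
    unanchored = r"(.*)?openvino(.*)?\_model.xml"
    anchored = r"(.*)?openvino(.*)?\_model.xml$"

    # Hypothetical file names: the real model file plus a derived artifact.
    names = ["openvino_model.xml", "openvino_model.xml.bak"]

    # Without the anchor, both names match, since the suffix occurs as a substring.
    print([n for n in names if re.search(unanchored, n)])  # both names
    # With the anchor, only the actual model file matches.
    print([n for n in names if re.search(anchored, n)])    # exact file only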
This passes on main, but failed on 1.19.0 --- optimum/intel/openvino/modeling_base.py | 2 +- optimum/intel/openvino/modeling_open_clip.py | 2 +- tests/openvino/test_modeling.py | 21 ++++++++++++++++++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 6a1d0cea3..6a1fbbc76 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -439,7 +439,7 @@ def from_pretrained( ov_files = _find_files_matching_pattern( model_dir, - pattern=r"(.*)?openvino(.*)?\_model.xml", + pattern=r"(.*)?openvino(.*)?\_model.xml$", subfolder=subfolder, use_auth_token=token, revision=revision, diff --git a/optimum/intel/openvino/modeling_open_clip.py b/optimum/intel/openvino/modeling_open_clip.py index 967153a03..4a3cb0fca 100644 --- a/optimum/intel/openvino/modeling_open_clip.py +++ b/optimum/intel/openvino/modeling_open_clip.py @@ -152,7 +152,7 @@ def from_pretrained( ov_files = _find_files_matching_pattern( model_dir, - pattern=r"(.*)?openvino(.*)?\_model\_(.*)?.xml", + pattern=r"(.*)?openvino(.*)?\_model\_(.*)?.xml$", subfolder=subfolder, use_auth_token=token, revision=revision, diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 33bfb55dc..7ef57ead5 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -339,7 +339,7 @@ def test_infer_export_when_loading(self): def test_find_files_matching_pattern(self): model_id = "echarlaix/tiny-random-PhiForCausalLM" - pattern = r"(.*)?openvino(.*)?\_model.xml" + pattern = r"(.*)?openvino(.*)?\_model.xml$" # hub model for revision in ("main", "ov", "itrex"): ov_files = _find_files_matching_pattern( @@ -360,7 +360,7 @@ def test_find_files_matching_pattern(self): @parameterized.expand(("stable-diffusion", "stable-diffusion-openvino")) def test_find_files_matching_pattern_sd(self, model_arch): - pattern = r"(.*)?openvino(.*)?\_model.xml" + pattern = r"(.*)?openvino(.*)?\_model.xml$" model_id = MODEL_NAMES[model_arch] # hub model ov_files = _find_files_matching_pattern(model_id, pattern=pattern) @@ -374,6 +374,23 @@ def test_find_files_matching_pattern_sd(self, model_arch): ov_files = _find_files_matching_pattern(local_dir, pattern=pattern) self.assertTrue(len(ov_files) > 0 if "openvino" in model_id else len(ov_files) == 0) + @parameterized.expand(("", "openvino")) + def test_find_files_matching_pattern_with_config_in_root(self, subfolder): + # Notably, the model has a config.json file in the root directory and not in the subfolder + model_id = "sentence-transformers-testing/stsb-bert-tiny-openvino" + pattern = r"(.*)?openvino(.*)?\_model.xml$" + # hub model + ov_files = _find_files_matching_pattern(model_id, pattern=pattern, subfolder=subfolder) + self.assertTrue(len(ov_files) == 1 if subfolder == "openvino" else len(ov_files) == 0) + + # local model + api = HfApi() + with tempfile.TemporaryDirectory() as tmpdirname: + local_dir = Path(tmpdirname) / "model" + api.snapshot_download(repo_id=model_id, local_dir=local_dir) + ov_files = _find_files_matching_pattern(local_dir, pattern=pattern, subfolder=subfolder) + self.assertTrue(len(ov_files) == 1 if subfolder == "openvino" else len(ov_files) == 0) + class PipelineTest(unittest.TestCase): def test_load_model_from_hub(self): From 95a80f0f3b88a5903654f48fd86c6601b9d39793 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:19:04 +0200 Subject: [PATCH 04/53] Allow 
loading model models in a subfolder (with config in root) (#933) --- optimum/intel/openvino/modeling_base.py | 2 +- tests/openvino/test_modeling.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 6a1fbbc76..353416ff5 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -512,7 +512,7 @@ def _cached_file( # locates a file in a local folder and repo, downloads and cache it if necessary. model_path = Path(model_path) if model_path.is_dir(): - model_cache_path = model_path / file_name + model_cache_path = model_path / subfolder / file_name else: file_name = Path(file_name) if file_name.suffix != ".onnx": diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 7ef57ead5..a28d6a51f 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -324,6 +324,20 @@ def test_load_model_from_hub_private_with_token(self): self.assertIsInstance(model.config, PretrainedConfig) self.assertTrue(model.stateful) + @parameterized.expand(("", "openvino")) + def test_loading_with_config_in_root(self, subfolder): + # config.json file in the root directory and not in the subfolder + model_id = "sentence-transformers-testing/stsb-bert-tiny-openvino" + export = subfolder == "" + # hub model + OVModelForFeatureExtraction.from_pretrained(model_id, subfolder=subfolder, export=export) + # local model + api = HfApi() + with tempfile.TemporaryDirectory() as tmpdirname: + local_dir = Path(tmpdirname) / "model" + api.snapshot_download(repo_id=model_id, local_dir=local_dir) + OVModelForFeatureExtraction.from_pretrained(local_dir, subfolder=subfolder, export=export) + def test_infer_export_when_loading(self): model_id = MODEL_NAMES["phi"] model = AutoModelForCausalLM.from_pretrained(model_id) From b31524c2c35f5b6c98f61b8cfa0b59d72a653e70 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:45:17 +0200 Subject: [PATCH 05/53] Update setup optimum version (#936) * update setup * restore --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6b3ddc876..7eb40d6c3 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=1.11", "transformers>=4.36,<4.46", - "optimum@git+https://github.com/huggingface/optimum.git", + "optimum~=1.23", "datasets>=1.4.0", "sentencepiece", "setuptools", From 7702d352a2997d3d0ac675f2338d8e253c752895 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 10 Oct 2024 17:13:14 +0200 Subject: [PATCH 06/53] Add tests for latest transformers release (#939) * Add tests for latest transformers release * style --- .github/workflows/test_openvino.yml | 2 +- tests/openvino/test_modeling.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index bbdfdb32a..53c210707 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -21,7 +21,7 @@ jobs: fail-fast: false matrix: python-version: ["3.8", "3.12"] - transformers-version: ["4.36.0", "4.44.*"] + transformers-version: ["4.36.0", "4.45.*"] os: [ubuntu-latest] runs-on: ${{ matrix.os }} diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index a28d6a51f..2eb6c1e84 100644 --- a/tests/openvino/test_modeling.py +++ 
b/tests/openvino/test_modeling.py @@ -771,7 +771,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "bloom", "chatglm", "codegen", - "codegen2", "gpt2", "gpt_neo", "gpt_neox", @@ -820,6 +819,10 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "mistral-nemo", ) + # custom modeling defined in https://huggingface.co/katuni4ka/tiny-random-codegen2 differs from transformers after v4.45 resulting in unadapted patching + if is_transformers_version("<", "4.45.0"): + SUPPORTED_ARCHITECTURES += ("codegen2",) + GENERATION_LENGTH = 100 REMOTE_CODE_MODELS = ( "chatglm", From 9474efbe2ed72e4402588569d53334fb157c857d Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Thu, 10 Oct 2024 18:30:53 +0200 Subject: [PATCH 07/53] Update number of expected exported tokenizers (#940) * update number of expected exported tokenizers * tokenizer version * warning * fix last test --- optimum/exporters/openvino/convert.py | 7 +++++++ optimum/intel/utils/import_utils.py | 20 ++++++++++++++++++++ tests/openvino/test_exporters_cli.py | 11 ++++++----- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 4db452dbd..59cc2bb7c 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -41,6 +41,7 @@ _torch_version, _transformers_version, compare_versions, + is_tokenizers_version, is_transformers_version, ) from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available @@ -730,6 +731,12 @@ def export_tokenizer( except ModuleNotFoundError: return + if is_tokenizers_version(">", "0.19"): + logger.warning( + "Exporting tokenizers to OpenVINO is not supported for tokenizers version > 0.19. " + "Please downgrade to tokenizers version <= 0.19 to export tokenizers to OpenVINO." + ) + if not isinstance(output, Path): output = Path(output) diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index d231a6bef..60d20361e 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -43,6 +43,13 @@ except importlib_metadata.PackageNotFoundError: _transformers_available = False +_tokenizers_available = importlib.util.find_spec("tokenizers") is not None +_tokenizers_version = "N/A" +if _tokenizers_available: + try: + _tokenizers_version = importlib_metadata.version("tokenizers") + except importlib_metadata.PackageNotFoundError: + _tokenizers_available = False _torch_available = importlib.util.find_spec("torch") is not None _torch_version = "N/A" @@ -181,6 +188,10 @@ def is_transformers_available(): return _transformers_available +def is_tokenizers_available(): + return _tokenizers_available + + def is_neural_compressor_available(): return _neural_compressor_available @@ -340,6 +351,15 @@ def is_transformers_version(operation: str, version: str): return compare_versions(parse(_transformers_version), operation, version) +def is_tokenizers_version(operation: str, version: str): + """ + Compare the current Tokenizers version to a given reference with an operation. 
+ """ + if not _tokenizers_available: + return False + return compare_versions(parse(_tokenizers_version), operation, version) + + def is_optimum_version(operation: str, version: str): return compare_versions(parse(_optimum_version), operation, version) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 0cd19a2d4..04e6fc601 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -48,6 +48,7 @@ from optimum.intel.utils.import_utils import ( compare_versions, is_openvino_tokenizers_available, + is_tokenizers_version, ) @@ -73,17 +74,17 @@ class OVCLIExportTestCase(unittest.TestCase): ("image-to-image", "stable-diffusion-xl-refiner"), ) EXPECTED_NUMBER_OF_TOKENIZER_MODELS = { - "gpt2": 2, + "gpt2": 2 if is_tokenizers_version("<", "0.20") else 0, "t5": 0, # no .model file in the repository "albert": 0, # not supported yet "distilbert": 1, # no detokenizer - "roberta": 2, + "roberta": 2 if is_tokenizers_version("<", "0.20") else 0, "vit": 0, # no tokenizer for image model "wav2vec2": 0, # no tokenizer "bert": 1, # no detokenizer - "blenderbot": 2, - "stable-diffusion": 2, - "stable-diffusion-xl": 4, + "blenderbot": 2 if is_tokenizers_version("<", "0.20") else 0, + "stable-diffusion": 2 if is_tokenizers_version("<", "0.20") else 0, + "stable-diffusion-xl": 4 if is_tokenizers_version("<", "0.20") else 0, } SUPPORTED_SD_HYBRID_ARCHITECTURES = ( From fec00922343876cf03970fbda63cd280ff3db0de Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Thu, 10 Oct 2024 18:40:49 +0200 Subject: [PATCH 08/53] Include custom textual inversion to diffusers pipelines (#938) * added textual inversion * added tests * fix textual inversion loader and test it * fix * slow test * fix * mark as run slow to test with CI --- optimum/intel/openvino/loaders.py | 385 +++---------------- optimum/intel/openvino/modeling_diffusion.py | 29 +- optimum/intel/openvino/utils.py | 4 +- tests/openvino/test_diffusion.py | 76 +++- 4 files changed, 151 insertions(+), 343 deletions(-) diff --git a/optimum/intel/openvino/loaders.py b/optimum/intel/openvino/loaders.py index fc5ae9749..5da287700 100644 --- a/optimum/intel/openvino/loaders.py +++ b/optimum/intel/openvino/loaders.py @@ -13,26 +13,18 @@ # limitations under the License. 
import logging -import warnings from typing import Dict, List, Optional, Union -import torch -from diffusers.utils import _get_model_file - -from ..utils.import_utils import is_safetensors_available - - -if is_safetensors_available(): - import safetensors - import openvino -from huggingface_hub.constants import HF_HUB_OFFLINE, HUGGINGFACE_HUB_CACHE +import torch +from diffusers.loaders.textual_inversion import TextualInversionLoaderMixin, load_textual_inversion_state_dicts +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from openvino.runtime import Type from openvino.runtime import opset11 as ops from openvino.runtime.passes import Manager, Matcher, MatcherPass, WrapType from transformers import PreTrainedTokenizer -from .utils import TEXTUAL_INVERSION_EMBEDDING_KEY, TEXTUAL_INVERSION_NAME, TEXTUAL_INVERSION_NAME_SAFE +from .utils import TEXTUAL_INVERSION_EMBEDDING_KEY try: @@ -49,17 +41,17 @@ class InsertTextEmbedding(MatcherPass): OpenVINO ngraph transformation for inserting pre-trained texual inversion embedding to text encoder """ - def __init__(self, token_ids_and_embeddings): + def __init__(self, tokens_ids, embeddings): MatcherPass.__init__(self) - self.model_changed = False + param = WrapType("opset1.Constant") def callback(matcher: Matcher) -> bool: root = matcher.get_match_root() - if root.get_friendly_name() == TEXTUAL_INVERSION_EMBEDDING_KEY: + if root.get_friendly_name() == TEXTUAL_INVERSION_EMBEDDING_KEY: # there should be a better way to do this add_ti = root consumers = matcher.get_match_value().get_target_inputs() - for token_id, embedding in token_ids_and_embeddings: + for token_id, embedding in zip(tokens_ids, embeddings): ti_weights = ops.constant(embedding, Type.f32, name=str(token_id)) ti_weights_unsqueeze = ops.unsqueeze(ti_weights, axes=0) add_ti = ops.concat( @@ -81,341 +73,74 @@ def callback(matcher: Matcher) -> bool: # Adapted from diffusers.loaders.TextualInversionLoaderMixin -class OVTextualInversionLoaderMixin: - r""" - Load textual inversion tokens and embeddings to the tokenizer and text encoder. - """ - - def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "PreTrainedTokenizer"): - r""" - Processes prompts that include a special token corresponding to a multi-vector textual inversion embedding to - be replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual - inversion token or if the textual inversion token is a single vector, the input prompt is returned. - - Parameters: - prompt (`str` or list of `str`): - The prompt or prompts to guide the image generation. - tokenizer (`PreTrainedTokenizer`): - The tokenizer responsible for encoding the prompt into input tokens. - - Returns: - `str` or list of `str`: The converted prompt - """ - if not isinstance(prompt, List): - prompts = [prompt] - else: - prompts = prompt - - prompts = [self._maybe_convert_prompt(p, tokenizer) for p in prompts] - - if not isinstance(prompt, List): - return prompts[0] - - return prompts - - def _maybe_convert_prompt(self, prompt: str, tokenizer: "PreTrainedTokenizer"): - r""" - Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds - to a multi-vector textual inversion embedding, this function will process the prompt so that the special token - is replaced with multiple special tokens each corresponding to one of the vectors. 
If the prompt has no textual - inversion token or a textual inversion token that is a single vector, the input prompt is simply returned. - - Parameters: - prompt (`str`): - The prompt to guide the image generation. - tokenizer (`PreTrainedTokenizer`): - The tokenizer responsible for encoding the prompt into input tokens. - - Returns: - `str`: The converted prompt - """ - tokens = tokenizer.tokenize(prompt) - unique_tokens = set(tokens) - for token in unique_tokens: - if token in tokenizer.added_tokens_encoder: - replacement = token - i = 1 - while f"{token}_{i}" in tokenizer.added_tokens_encoder: - replacement += f" {token}_{i}" - i += 1 - - prompt = prompt.replace(token, replacement) - - return prompt - +class OVTextualInversionLoaderMixin(TextualInversionLoaderMixin): def load_textual_inversion( self, pretrained_model_name_or_path: Union[str, List[str], Dict[str, torch.Tensor], List[Dict[str, torch.Tensor]]], token: Optional[Union[str, List[str]]] = None, + tokenizer: Optional["PreTrainedTokenizer"] = None, # noqa: F821 + text_encoder: Optional["openvino.runtime.Model"] = None, # noqa: F821 **kwargs, ): - r""" - Load textual inversion embeddings into the text encoder of [`StableDiffusionPipeline`] (both 🤗 Diffusers and - Automatic1111 formats are supported). - - Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike` or `List[str or os.PathLike]` or `Dict` or `List[Dict]`): - Can be either one of the following or a list of them: - - - A string, the *model id* (for example `sd-concepts-library/low-poly-hd-logos-icons`) of a - pretrained model hosted on the Hub. - - A path to a *directory* (for example `./my_text_inversion_directory/`) containing the textual - inversion weights. - - A path to a *file* (for example `./my_text_inversions.pt`) containing textual inversion weights. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - token (`str` or `List[str]`, *optional*): - Override the token to use for the textual inversion weights. If `pretrained_model_name_or_path` is a - list, then `token` must also be a list of equal length. - weight_name (`str`, *optional*): - Name of a custom weight file. This should be used when: - - - The saved textual inversion file is in 🤗 Diffusers format, but was saved under a specific weight - name such as `text_inv.bin`. - - The saved textual inversion file is in the Automatic1111 format. - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - use_auth_token (Optional[Union[bool, str]], defaults to `None`): - Deprecated. Please use `token` instead. 
- token (Optional[Union[bool, str]], defaults to `None`): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - mirror (`str`, *optional*): - Mirror source to resolve accessibility issues if you're downloading a model in China. We do not - guarantee the timeliness or safety of the source, and you should refer to the mirror site for more - information. - - Example: - - To load a textual inversion embedding vector in 🤗 Diffusers format: - - ```py - from optimum.intel import OVStableDiffusionPipeline - - model_id = "runwayml/stable-diffusion-v1-5" - pipe = OVStableDiffusionPipeline.from_pretrained(model_id, compile=False) - - pipe.load_textual_inversion("sd-concepts-library/cat-toy") - pipe.compile() - - prompt = "A backpack" - - image = pipe(prompt, num_inference_steps=50).images[0] - image.save("cat-backpack.png") - ``` - - To load a textual inversion embedding vector in Automatic1111 format, make sure to download the vector first - (for example from [civitAI](https://civitai.com/models/3036?modelVersionId=9857)) and then load the vector - locally: - - ```py - from optimum.intel import OVStableDiffusionPipeline - - model_id = "runwayml/stable-diffusion-v1-5" - pipe = StableDiffusionPipeline.from_pretrained(model_id, compile=False) - - pipe.load_textual_inversion("./charturnerv2.pt", token="charturnerv2") - pipe.compile() - - prompt = "charturnerv2, multiple views of the same character in the same outfit, a character turnaround of a woman wearing a black jacket and red shirt, best quality, intricate details." - - image = pipe(prompt, num_inference_steps=50).images[0] - image.save("character.png") - ``` - """ - - if not hasattr(self, "tokenizer") or not isinstance(self.tokenizer, PreTrainedTokenizer): + if not hasattr(self, "tokenizer"): raise ValueError( - f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizer` for calling" - f" `{self.load_textual_inversion.__name__}`" + f"{self.__class__.__name__} requires `self.tokenizer` for calling `{self.load_textual_inversion.__name__}`" ) - - if not hasattr(self, "text_encoder") or not isinstance(self.text_encoder.model, openvino.runtime.Model): + elif not isinstance(self.tokenizer, PreTrainedTokenizer): raise ValueError( - f"{self.__class__.__name__} requires `self.text_encoder.model` of type `openvino.runtime.Model` for calling" - f" `{self.load_textual_inversion.__name__}`" - ) - - cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - token = kwargs.pop("token", None) - revision = kwargs.pop("revision", None) - subfolder = kwargs.pop("subfolder", None) - weight_name = kwargs.pop("weight_name", None) - use_safetensors = kwargs.pop("use_safetensors", None) - - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, + f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizer` for calling `{self.load_textual_inversion.__name__}`" ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - if use_safetensors and not is_safetensors_available(): + if not hasattr(self, "text_encoder"): raise ValueError( - "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors" + f"{self.__class__.__name__} requires `self.text_encoder` for calling `{self.load_textual_inversion.__name__}`" ) - - allow_pickle = False - if use_safetensors is None: - use_safetensors = is_safetensors_available() - allow_pickle = True - - user_agent = { - "file_type": "text_inversion", - "framework": "pytorch", - } - - if not isinstance(pretrained_model_name_or_path, list): - pretrained_model_name_or_paths = [pretrained_model_name_or_path] - else: - pretrained_model_name_or_paths = pretrained_model_name_or_path - - if isinstance(token, str): - tokens = [token] - elif token is None: - tokens = [None] * len(pretrained_model_name_or_paths) - else: - tokens = token - - if len(pretrained_model_name_or_paths) != len(tokens): + elif not isinstance(self.text_encoder.model, openvino.runtime.Model): raise ValueError( - f"You have passed a list of models of length {len(pretrained_model_name_or_paths)}, and list of tokens of length {len(tokens)}" - f"Make sure both lists have the same length." + f"{self.__class__.__name__} requires `self.text_encoder` of type `openvino.runtime.Model` for calling `{self.load_textual_inversion.__name__}`" ) - valid_tokens = [t for t in tokens if t is not None] - if len(set(valid_tokens)) < len(valid_tokens): - raise ValueError(f"You have passed a list of tokens that contains duplicates: {tokens}") - - token_ids_and_embeddings = [] - - for pretrained_model_name_or_path, token in zip(pretrained_model_name_or_paths, tokens): - if not isinstance(pretrained_model_name_or_path, dict): - # 1. Load textual inversion file - model_file = None - # Let's first try to load .safetensors weights - if (use_safetensors and weight_name is None) or ( - weight_name is not None and weight_name.endswith(".safetensors") - ): - try: - model_file = _get_model_file( - pretrained_model_name_or_path, - weights_name=weight_name or TEXTUAL_INVERSION_NAME_SAFE, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=token, # still uses use_auth_token - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - ) - state_dict = safetensors.torch.load_file(model_file, device="cpu") - except Exception as e: - if not allow_pickle: - raise e - - model_file = None - - if model_file is None: - model_file = _get_model_file( - pretrained_model_name_or_path, - weights_name=weight_name or TEXTUAL_INVERSION_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=token, # still uses use_auth_token - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - ) - state_dict = torch.load(model_file, map_location="cpu") - else: - state_dict = pretrained_model_name_or_path - - # 2. 
Load token and embedding correcly from file - loaded_token = None - if isinstance(state_dict, torch.Tensor): - if token is None: + # 1. Set correct tokenizer and text encoder + tokenizer = tokenizer or getattr(self, "tokenizer", None) + text_encoder = text_encoder or getattr(self, "text_encoder", None) + + # 2. Normalize inputs + pretrained_model_name_or_paths = ( + [pretrained_model_name_or_path] + if not isinstance(pretrained_model_name_or_path, list) + else pretrained_model_name_or_path + ) + tokens = [token] if not isinstance(token, list) else token + if tokens[0] is None: + tokens = tokens * len(pretrained_model_name_or_paths) + + # 3. Check inputs + self._check_text_inv_inputs(tokenizer, text_encoder, pretrained_model_name_or_paths, tokens) + + # 4. Load state dicts of textual embeddings + state_dicts = load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs) + + # 4.1 Handle the special case when state_dict is a tensor that contains n embeddings for n tokens + if len(tokens) > 1 and len(state_dicts) == 1: + if isinstance(state_dicts[0], torch.Tensor): + state_dicts = list(state_dicts[0]) + if len(tokens) != len(state_dicts): raise ValueError( - "You are trying to load a textual inversion embedding that has been saved as a PyTorch tensor. Make sure to pass the name of the corresponding token in this case: `token=...`." + f"You have passed a state_dict contains {len(state_dicts)} embeddings, and list of tokens of length {len(tokens)} " + f"Make sure both have the same length." ) - embedding = state_dict - elif len(state_dict) == 1: - # diffusers - loaded_token, embedding = next(iter(state_dict.items())) - elif "string_to_param" in state_dict: - # A1111 - loaded_token = state_dict["name"] - embedding = state_dict["string_to_param"]["*"] - - if token is not None and loaded_token != token: - logger.info(f"The loaded token: {loaded_token} is overwritten by the passed token {token}.") - else: - token = loaded_token - - embedding = embedding.detach().cpu().numpy() - # 3. Make sure we don't mess up the tokenizer or text encoder - vocab = self.tokenizer.get_vocab() - if token in vocab: - raise ValueError( - f"Token {token} already in tokenizer vocabulary. Please choose a different token name or remove {token} and embedding from the tokenizer and text encoder." - ) - elif f"{token}_1" in vocab: - multi_vector_tokens = [token] - i = 1 - while f"{token}_{i}" in self.tokenizer.added_tokens_encoder: - multi_vector_tokens.append(f"{token}_{i}") - i += 1 + # 4. Retrieve tokens and embeddings + tokens, embeddings = self._retrieve_tokens_and_embeddings(tokens, state_dicts, tokenizer) - raise ValueError( - f"Multi-vector Token {multi_vector_tokens} already in tokenizer vocabulary. Please choose a different token name or remove the {multi_vector_tokens} and embedding from the tokenizer and text encoder." - ) - is_multi_vector = len(embedding.shape) > 1 and embedding.shape[0] > 1 - if is_multi_vector: - tokens = [token] + [f"{token}_{i}" for i in range(1, embedding.shape[0])] - embeddings = [e for e in embedding] # noqa: C416 - else: - tokens = [token] - embeddings = [embedding[0]] if len(embedding.shape) > 1 else [embedding] - # add tokens and get ids - self.tokenizer.add_tokens(tokens) - token_ids = self.tokenizer.convert_tokens_to_ids(tokens) - token_ids_and_embeddings += zip(token_ids, embeddings) + # 5. 
Extend tokens and embeddings for multi vector + tokens, embeddings = self._extend_tokens_and_embeddings(tokens, embeddings, tokenizer) - logger.info(f"Loaded textual inversion embedding for {token}.") + # 7.4 add tokens to tokenizer (modified) + tokenizer.add_tokens(tokens) + token_ids = tokenizer.convert_tokens_to_ids(tokens) # Insert textual inversion embeddings to text encoder with OpenVINO ngraph transformation manager = Manager() - manager.register_pass(InsertTextEmbedding(token_ids_and_embeddings)) - manager.run_passes(self.text_encoder.model) + manager.register_pass(InsertTextEmbedding(token_ids, embeddings)) + manager.run_passes(text_encoder.model) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 81dc085df..d5ee6ee22 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -64,6 +64,7 @@ from ...exporters.openvino import main_export from .configuration import OVConfig, OVQuantizationMethod, OVWeightQuantizationConfig +from .loaders import OVTextualInversionLoaderMixin from .modeling_base import OVBaseModel from .utils import ( ONNX_WEIGHTS_NAME, @@ -1010,7 +1011,7 @@ def to(self, *args, **kwargs): self.encoder.to(*args, **kwargs) -class OVStableDiffusionPipeline(OVDiffusionPipeline, StableDiffusionPipeline): +class OVStableDiffusionPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionPipeline): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion#diffusers.StableDiffusionPipeline). """ @@ -1020,7 +1021,9 @@ class OVStableDiffusionPipeline(OVDiffusionPipeline, StableDiffusionPipeline): auto_model_class = StableDiffusionPipeline -class OVStableDiffusionImg2ImgPipeline(OVDiffusionPipeline, StableDiffusionImg2ImgPipeline): +class OVStableDiffusionImg2ImgPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionImg2ImgPipeline +): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_img2img#diffusers.StableDiffusionImg2ImgPipeline). """ @@ -1030,7 +1033,9 @@ class OVStableDiffusionImg2ImgPipeline(OVDiffusionPipeline, StableDiffusionImg2I auto_model_class = StableDiffusionImg2ImgPipeline -class OVStableDiffusionInpaintPipeline(OVDiffusionPipeline, StableDiffusionInpaintPipeline): +class OVStableDiffusionInpaintPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionInpaintPipeline +): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_inpaint#diffusers.StableDiffusionInpaintPipeline). """ @@ -1040,7 +1045,7 @@ class OVStableDiffusionInpaintPipeline(OVDiffusionPipeline, StableDiffusionInpai auto_model_class = StableDiffusionInpaintPipeline -class OVStableDiffusionXLPipeline(OVDiffusionPipeline, StableDiffusionXLPipeline): +class OVStableDiffusionXLPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLPipeline): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). 
""" @@ -1063,7 +1068,9 @@ def _get_add_time_ids( return add_time_ids -class OVStableDiffusionXLImg2ImgPipeline(OVDiffusionPipeline, StableDiffusionXLImg2ImgPipeline): +class OVStableDiffusionXLImg2ImgPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLImg2ImgPipeline +): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). """ @@ -1100,7 +1107,9 @@ def _get_add_time_ids( return add_time_ids, add_neg_time_ids -class OVStableDiffusionXLInpaintPipeline(OVDiffusionPipeline, StableDiffusionXLInpaintPipeline): +class OVStableDiffusionXLInpaintPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLInpaintPipeline +): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline). """ @@ -1137,7 +1146,9 @@ def _get_add_time_ids( return add_time_ids, add_neg_time_ids -class OVLatentConsistencyModelPipeline(OVDiffusionPipeline, LatentConsistencyModelPipeline): +class OVLatentConsistencyModelPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, LatentConsistencyModelPipeline +): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). """ @@ -1147,7 +1158,9 @@ class OVLatentConsistencyModelPipeline(OVDiffusionPipeline, LatentConsistencyMod auto_model_class = LatentConsistencyModelPipeline -class OVLatentConsistencyModelImg2ImgPipeline(OVDiffusionPipeline, LatentConsistencyModelImg2ImgPipeline): +class OVLatentConsistencyModelImg2ImgPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, LatentConsistencyModelImg2ImgPipeline +): """ OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency_img2img#diffusers.LatentConsistencyModelImg2ImgPipeline). 
""" diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 4e8033880..279a24818 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -53,9 +53,7 @@ EXTERNAL_DATA_FORMAT_SIZE_LIMIT = 2 * 1024 * 1024 * 1024 -TEXTUAL_INVERSION_NAME = "learned_embeds.bin" -TEXTUAL_INVERSION_NAME_SAFE = "learned_embeds.safetensors" -TEXTUAL_INVERSION_EMBEDDING_KEY = "text_model.embeddings.token_embedding.weight" +TEXTUAL_INVERSION_EMBEDDING_KEY = "self.text_model.embeddings.token_embedding.weight" OV_TO_NP_TYPE = { "boolean": np.bool_, diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 6271ff3e4..687c1f5c0 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -15,6 +15,7 @@ import unittest import numpy as np +import pytest import torch from diffusers import ( AutoPipelineForImage2Image, @@ -25,6 +26,7 @@ from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from diffusers.utils import load_image from parameterized import parameterized +from transformers.testing_utils import slow from utils_tests import MODEL_NAMES, SEED from optimum.intel.openvino import ( @@ -295,6 +297,30 @@ def test_height_width_properties(self, model_arch: str): self.assertEqual(ov_pipeline.height, height) self.assertEqual(ov_pipeline.width, width) + @pytest.mark.run_slow + @slow + @require_diffusers + def test_textual_inversion(self): + # for now we only test for stable-diffusion + # this is very slow and costly to run right now + + model_id = "runwayml/stable-diffusion-v1-5" + ti_id = "sd-concepts-library/cat-toy" + + inputs = self.generate_inputs() + inputs["prompt"] = "A backpack" + + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(model_id, safety_checker=None) + diffusers_pipeline.load_textual_inversion(ti_id) + + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(model_id, compile=False, safety_checker=None) + ov_pipeline.load_textual_inversion(ti_id) + + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) + class OVPipelineForImage2ImageTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] @@ -348,7 +374,6 @@ def test_num_images_per_prompt(self, model_arch: str): def test_callback(self, model_arch: str): height, width, batch_size = 32, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - inputs["num_inference_steps"] = 3 class Callback: def __init__(self): @@ -484,6 +509,30 @@ def test_height_width_properties(self, model_arch: str): self.assertEqual(ov_pipeline.height, height) self.assertEqual(ov_pipeline.width, width) + @pytest.mark.run_slow + @slow + @require_diffusers + def test_textual_inversion(self): + # for now we only test for stable-diffusion + # this is very slow and costly to run right now + + model_id = "runwayml/stable-diffusion-v1-5" + ti_id = "sd-concepts-library/cat-toy" + + inputs = self.generate_inputs() + inputs["prompt"] = "A backpack" + + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(model_id, safety_checker=None) + diffusers_pipeline.load_textual_inversion(ti_id) + + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(model_id, compile=False, safety_checker=None) + ov_pipeline.load_textual_inversion(ti_id) + + diffusers_output = 
diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) + class OVPipelineForInpaintingTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] @@ -542,7 +591,6 @@ def test_num_images_per_prompt(self, model_arch: str): def test_callback(self, model_arch: str): height, width, batch_size = 32, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - inputs["num_inference_steps"] = 3 class Callback: def __init__(self): @@ -677,3 +725,27 @@ def test_height_width_properties(self, model_arch: str): ) self.assertEqual(ov_pipeline.height, height) self.assertEqual(ov_pipeline.width, width) + + @pytest.mark.run_slow + @slow + @require_diffusers + def test_textual_inversion(self): + # for now we only test for stable-diffusion + # this is very slow and costly to run right now + + model_id = "runwayml/stable-diffusion-v1-5" + ti_id = "sd-concepts-library/cat-toy" + + inputs = self.generate_inputs() + inputs["prompt"] = "A backpack" + + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(model_id, safety_checker=None) + diffusers_pipeline.load_textual_inversion(ti_id) + + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(model_id, compile=False, safety_checker=None) + ov_pipeline.load_textual_inversion(ti_id) + + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) From 2559620339d0589c7b26c19b36a772a3ff96ed4a Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 11 Oct 2024 11:03:47 +0400 Subject: [PATCH 09/53] fix lora unscaling in diffusion pipelines (#937) --- optimum/intel/openvino/modeling_diffusion.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index d5ee6ee22..11698af70 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -812,6 +812,9 @@ def forward(self, *args, **kwargs): def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) + def modules(self): + return [] + class OVModelTextEncoder(OVPipelinePart): def forward( From d169fcee852a5eaacb126328ab5fb1d83ac0a809 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 14 Oct 2024 16:28:20 +0200 Subject: [PATCH 10/53] Fix PRs doc build (#950) --- .github/workflows/build_pr_documentation.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 982535623..cab800759 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -30,6 +30,8 @@ jobs: - name: Setup environment run: | + python -m venv venv-doc + source venv-doc/bin/activate pip uninstall -y doc-builder cd doc-builder git pull origin main From 93ca007bc71e8b52c7c776426ab5b221e687c968 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 14 Oct 2024 17:19:25 +0200 Subject: [PATCH 11/53] Add env variable needed for slow tests (#949) --- .github/workflows/test_openvino_basic.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml index 82c39da37..eefce73ab 100644 --- a/.github/workflows/test_openvino_basic.yml +++ b/.github/workflows/test_openvino_basic.yml @@ -77,3 +77,4 @@ jobs: pytest tests/openvino -s -m "run_slow" --durations=0 env: RUN_SLOW: 1 + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} From f7d379dcdca93296d3f3f77adc5af0bf79d1681d Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 15 Oct 2024 17:30:26 +0400 Subject: [PATCH 12/53] Fix compatibility with diffusers < 0.25.0 (#952) --- optimum/intel/openvino/modeling_diffusion.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 11698af70..7ca52215b 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -28,7 +28,6 @@ import openvino import torch from diffusers.configuration_utils import ConfigMixin -from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution from diffusers.pipelines import ( AutoPipelineForImage2Image, AutoPipelineForInpainting, @@ -63,6 +62,7 @@ ) from ...exporters.openvino import main_export +from ..utils.import_utils import is_diffusers_version from .configuration import OVConfig, OVQuantizationMethod, OVWeightQuantizationConfig from .loaders import OVTextualInversionLoaderMixin from .modeling_base import OVBaseModel @@ -76,6 +76,12 @@ ) +if is_diffusers_version(">=", "0.25.0"): + from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution +else: + from diffusers.models.vae import DiagonalGaussianDistribution + + core = Core() logger = logging.getLogger(__name__) From 981442f82d75698ab570a9f3ffb86eff60580370 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Thu, 17 Oct 2024 12:16:45 +0400 Subject: [PATCH 13/53] Allow to use SDPA in clip models (#941) --- optimum/exporters/openvino/model_configs.py | 49 +++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 07c284ec2..33190e6f1 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -24,6 +24,8 @@ from optimum.exporters.onnx.model_configs import ( CLIPOnnxConfig, CLIPTextOnnxConfig, + CLIPTextWithProjectionOnnxConfig, + CLIPVisionModelOnnxConfig, CodeGenOnnxConfig, FalconOnnxConfig, GemmaOnnxConfig, @@ -35,6 +37,7 @@ PhiOnnxConfig, VisionOnnxConfig, ) +from optimum.exporters.onnx.model_patcher import ModelPatcher from optimum.exporters.tasks import TasksManager from optimum.utils import DEFAULT_DUMMY_SHAPES from optimum.utils.input_generators import ( @@ -1079,6 +1082,11 @@ def generate_dummy_inputs_for_validation( reference_model_inputs["text"] = reference_model_inputs.pop("input_ids") return super().generate_dummy_inputs_for_validation(reference_model_inputs) + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> ModelPatcher: + return ModelPatcher(self, model, model_kwargs=model_kwargs) + @register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="open_clip") class OpenCLIPTextOpenVINOConfig(CLIPTextOnnxConfig): @@ -1109,6 +1117,11 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs) return dummy_inputs + def 
patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> ModelPatcher: + return ModelPatcher(self, model, model_kwargs=model_kwargs) + @register_in_tasks_manager("clip-vision-model", *["feature-extraction"], library_name="open_clip") class OpenCLIPVisualOpenVINOConfig(VisionOnnxConfig): @@ -1134,6 +1147,42 @@ def rename_ambiguous_inputs(self, inputs): return model_inputs +@register_in_tasks_manager( + "clip", *["feature-extraction", "zero-shot-image-classification"], library_name="transformers" +) +class CLIPOpenVINOConfig(CLIPOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> ModelPatcher: + return ModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="transformers") +@register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="diffusers") +class CLIPTextOpenVINOConfig(CLIPTextOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> ModelPatcher: + return ModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("clip-text-with-projection", *["feature-extraction"], library_name="transformers") +@register_in_tasks_manager("clip-text-with-projection", *["feature-extraction"], library_name="diffusers") +class CLIPTextWithProjectionOpenVINOConfig(CLIPTextWithProjectionOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> ModelPatcher: + return ModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("clip-vision-model", *["feature-extraction"], library_name="transformers") +class CLIPVisionModelOpenVINOConfig(CLIPVisionModelOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> ModelPatcher: + return ModelPatcher(self, model, model_kwargs=model_kwargs) + + @register_in_tasks_manager( "ibert", *[ From f7b5b547c167cb6a9f20fa77d493ee2dde3c3034 Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Thu, 17 Oct 2024 10:17:22 +0200 Subject: [PATCH 14/53] Install torchvision CPU in OpenVINO notebook tests (#953) --- .github/workflows/test_openvino_notebooks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino_notebooks.yml b/.github/workflows/test_openvino_notebooks.yml index ded091d5a..26a09012f 100644 --- a/.github/workflows/test_openvino_notebooks.yml +++ b/.github/workflows/test_openvino_notebooks.yml @@ -39,7 +39,7 @@ jobs: # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages # ffmpeg, torchaudio and pillow are required for image classification and audio classification pipelines sudo apt-get install ffmpeg - pip install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cpu + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu pip install -r notebooks/openvino/requirements.txt pip install .[tests,openvino] nbval From 03a59aa5685d60d49b4aa891bca11823eb84e92e Mon Sep 17 00:00:00 2001 From: Eddy Kim Date: Tue, 22 Oct 2024 00:12:41 +0900 Subject: [PATCH 15/53] updated OVPipelinePart to have separate ov_config 
(#957) --- optimum/intel/openvino/modeling_diffusion.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 7ca52215b..6c9063df7 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -741,6 +741,7 @@ def __init__( self.model_name = model_name self.parent_pipeline = parent_pipeline self.request = None if not parent_pipeline._compile_only else self.model + self.ov_config = parent_pipeline.ov_config if isinstance(parent_pipeline.model_save_dir, TemporaryDirectory): self.model_save_dir = Path(parent_pipeline.model_save_dir.name) / self.model_name @@ -764,10 +765,6 @@ def _device(self) -> str: def device(self) -> torch.device: return self.parent_pipeline.device - @property - def ov_config(self) -> OVConfig: - return self.parent_pipeline.ov_config - @property def dtype(self) -> torch.dtype: return OV_TO_PT_TYPE[self.ov_config.get("dtype", "f32")] From dcb49eaf932317edfd189826a37d8f4245469fc6 Mon Sep 17 00:00:00 2001 From: Evgenya Nugmanova Date: Tue, 22 Oct 2024 17:01:21 +0400 Subject: [PATCH 16/53] Symbol use in optimum: fix misprint (#948) * Symbol use in optimum: fix misprint * fix wrong filling chatglm position_ids input * fix cutting position ids --------- Co-authored-by: eaidova --- optimum/exporters/openvino/utils.py | 2 +- optimum/intel/openvino/modeling_decoder.py | 26 ++++++++++++++++++---- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 9b8747243..75106fc2b 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -103,7 +103,7 @@ def _get_input_info( symbol = name_to_symbol[dim_name] else: symbol = Symbol() - name_to_symbol[name] = symbol + name_to_symbol[dim_name] = symbol dim = Dimension(-1) dim.set_symbol(symbol) shape[idx] = dim diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 733f5a411..56b7a1c5a 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -16,7 +16,7 @@ import os from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import openvino @@ -31,7 +31,7 @@ from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.stopping_criteria import StoppingCriteriaList from transformers.generation.utils import GenerateOutput, GenerationMode -from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput from optimum.utils.normalized_config import NormalizedConfigManager @@ -504,8 +504,8 @@ def prepare_inputs( else: position_ids = np.cumsum(attention_mask, axis=1) - 1 position_ids[attention_mask == 0] = 1 - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] inputs["position_ids"] = position_ids @@ -604,6 +604,24 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg return model_inputs + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + 
**kwargs, + ) -> Dict[str, Any]: + model_kwargs = super()._update_model_kwargs_for_generation( + outputs=outputs, model_kwargs=model_kwargs, is_encoder_decoder=is_encoder_decoder, **kwargs + ) + + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id += 1 + model_kwargs["position_ids"] = torch.cat([position_ids, new_position_id], dim=-1) + return model_kwargs + def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_key_values: Tuple): batch_size = logits.shape[0] if indicies.shape[0] != 1: From 966c8c2df5eb0fee88151cfa58dfab31c1ab3fe9 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 22 Oct 2024 22:38:56 +0400 Subject: [PATCH 17/53] fix tmp dir saving (#959) --- optimum/exporters/openvino/convert.py | 3 + optimum/intel/openvino/modeling_base.py | 3 +- .../intel/openvino/modeling_base_seq2seq.py | 2 +- optimum/intel/openvino/modeling_decoder.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 3 +- optimum/intel/openvino/modeling_open_clip.py | 2 +- .../openvino/modeling_visual_language.py | 2 +- optimum/intel/openvino/utils.py | 276 ++++++++++++++++++ tests/openvino/test_export.py | 2 +- tests/openvino/test_exporters_cli.py | 3 +- tests/openvino/test_modeling.py | 28 +- .../test_modeling_sentence_transformers.py | 4 +- tests/openvino/test_quantization.py | 30 +- tests/openvino/test_training_examples.py | 9 +- 14 files changed, 324 insertions(+), 45 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 59cc2bb7c..4e6503b5b 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -206,6 +206,7 @@ def export_tensorflow( ov_config=ov_config, library_name=library_name, ) + del ov_model return input_names, output_names, True @@ -268,6 +269,7 @@ def export_pytorch_via_onnx( ov_config=ov_config, library_name=library_name, ) + del ov_model return input_names, output_names, True @@ -442,6 +444,7 @@ def ts_patched_forward(*args, **kwargs): library_name=library_name, ) clear_class_registry() + del ov_model del model gc.collect() return input_names, output_names, False diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 353416ff5..ed3cdadb5 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -16,7 +16,7 @@ import os import warnings from pathlib import Path -from tempfile import TemporaryDirectory, gettempdir +from tempfile import gettempdir from typing import Dict, Optional, Union import openvino @@ -41,6 +41,7 @@ ONNX_WEIGHTS_NAME, OV_TO_PT_TYPE, OV_XML_FILE_NAME, + TemporaryDirectory, _print_compiled_model_properties, model_has_dynamic_inputs, ) diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index 763dd2b50..06c601148 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -15,7 +15,6 @@ import logging import os from pathlib import Path -from tempfile import TemporaryDirectory from typing import Dict, Optional, Union import openvino @@ -36,6 +35,7 @@ OV_DECODER_NAME, OV_DECODER_WITH_PAST_NAME, OV_ENCODER_NAME, + TemporaryDirectory, ) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 56b7a1c5a..7c0bde8cd 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ 
b/optimum/intel/openvino/modeling_decoder.py @@ -15,7 +15,6 @@ import logging import os from pathlib import Path -from tempfile import TemporaryDirectory from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -50,6 +49,7 @@ ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE, + TemporaryDirectory, get_export_transformers_version, model_has_dynamic_inputs, ) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 6c9063df7..68dc31bc9 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -21,7 +21,7 @@ from collections import OrderedDict from copy import deepcopy from pathlib import Path -from tempfile import TemporaryDirectory, gettempdir +from tempfile import gettempdir from typing import Any, Dict, Optional, Union import numpy as np @@ -70,6 +70,7 @@ ONNX_WEIGHTS_NAME, OV_TO_PT_TYPE, OV_XML_FILE_NAME, + TemporaryDirectory, _print_compiled_model_properties, model_has_dynamic_inputs, np_to_pt_generators, diff --git a/optimum/intel/openvino/modeling_open_clip.py b/optimum/intel/openvino/modeling_open_clip.py index 4a3cb0fca..ef00c182e 100644 --- a/optimum/intel/openvino/modeling_open_clip.py +++ b/optimum/intel/openvino/modeling_open_clip.py @@ -16,7 +16,6 @@ import logging import os from pathlib import Path -from tempfile import TemporaryDirectory from typing import Dict, Optional, Union import numpy as np @@ -39,6 +38,7 @@ from ..utils.modeling_utils import _find_files_matching_pattern, _OpenClipForZeroShotImageClassification from .configuration import OVConfig, OVWeightQuantizationConfig from .modeling import MODEL_START_DOCSTRING, OVModel +from .utils import TemporaryDirectory logger = logging.getLogger(__name__) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index cf6aee7b1..141abeb87 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -2,7 +2,6 @@ import os import warnings from pathlib import Path -from tempfile import TemporaryDirectory from typing import Dict, Optional, Tuple, Union import numpy as np @@ -19,6 +18,7 @@ from .configuration import OVConfig, OVWeightQuantizationConfig from .modeling_base import OVBaseModel, OVModelPart from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM +from .utils import TemporaryDirectory logger = logging.getLogger(__name__) diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 279a24818..fcc6944e9 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -16,8 +16,12 @@ import json import logging import os +import stat +import warnings +import weakref from glob import glob from pathlib import Path +from tempfile import TemporaryDirectory as OrigTemporaryDirectory from typing import Tuple, Type, Union import numpy as np @@ -260,3 +264,275 @@ def model_has_dynamic_inputs(model): if is_dynamic: return is_dynamic return is_dynamic + + +# adopted from https://github.com/python/cpython/blob/3.12/Lib/shutil.py for compatibility with python<3.10 +def _rmtree(path, ignore_errors=False, onerror=None, *, onexc=None, dir_fd=None): + """Recursively delete a directory tree. + + If dir_fd is not None, it should be a file descriptor open to a directory; + path will then be relative to that directory. + dir_fd may not be implemented on your platform. 
+ If it is unavailable, using it will raise a NotImplementedError. + + If ignore_errors is set, errors are ignored; otherwise, if onexc or + onerror is set, it is called to handle the error with arguments (func, + path, exc_info) where func is platform and implementation dependent; + path is the argument to that function that caused it to fail; and + the value of exc_info describes the exception. For onexc it is the + exception instance, and for onerror it is a tuple as returned by + sys.exc_info(). If ignore_errors is false and both onexc and + onerror are None, the exception is reraised. + + onerror is deprecated and only remains for backwards compatibility. + If both onerror and onexc are set, onerror is ignored and onexc is used. + """ + _use_fd_functions = ( + {os.open, os.stat, os.unlink, os.rmdir} <= os.supports_dir_fd + and os.scandir in os.supports_fd + and os.stat in os.supports_follow_symlinks + ) + + if hasattr(os.stat_result, "st_file_attributes"): + + def _rmtree_islink(path): + try: + st = os.lstat(path) + return stat.S_ISLNK(st.st_mode) or ( + st.st_file_attributes & stat.FILE_ATTRIBUTE_REPARSE_POINT + and st.st_reparse_tag == stat.IO_REPARSE_TAG_MOUNT_POINT + ) + except OSError: + return False + + else: + + def _rmtree_islink(path): + return os.path.islink(path) + + def _rmtree_safe_fd(stack, onexc): + # Each stack item has four elements: + # * func: The first operation to perform: os.lstat, os.close or os.rmdir. + # Walking a directory starts with an os.lstat() to detect symlinks; in + # this case, func is updated before subsequent operations and passed to + # onexc() if an error occurs. + # * dirfd: Open file descriptor, or None if we're processing the top-level + # directory given to rmtree() and the user didn't supply dir_fd. + # * path: Path of file to operate upon. This is passed to onexc() if an + # error occurs. + # * orig_entry: os.DirEntry, or None if we're processing the top-level + # directory given to rmtree(). We used the cached stat() of the entry to + # save a call to os.lstat() when walking subdirectories. + func, dirfd, path, orig_entry = stack.pop() + name = path if orig_entry is None else orig_entry.name + try: + if func is os.close: + os.close(dirfd) + return + if func is os.rmdir: + os.rmdir(name, dir_fd=dirfd) + return + + # Note: To guard against symlink races, we use the standard + # lstat()/open()/fstat() trick. + assert func is os.lstat + if orig_entry is None: + orig_st = os.lstat(name, dir_fd=dirfd) + else: + orig_st = orig_entry.stat(follow_symlinks=False) + + func = os.open # For error reporting. + topfd = os.open(name, os.O_RDONLY | os.O_NONBLOCK, dir_fd=dirfd) + + func = os.path.islink # For error reporting. + try: + if not os.path.samestat(orig_st, os.fstat(topfd)): + # Symlinks to directories are forbidden, see GH-46010. + raise OSError("Cannot call rmtree on a symbolic link") + stack.append((os.rmdir, dirfd, path, orig_entry)) + finally: + stack.append((os.close, topfd, path, orig_entry)) + + func = os.scandir # For error reporting. + with os.scandir(topfd) as scandir_it: + entries = list(scandir_it) + for entry in entries: + fullname = os.path.join(path, entry.name) + try: + if entry.is_dir(follow_symlinks=False): + # Traverse into sub-directory. 
+ stack.append((os.lstat, topfd, fullname, entry)) + continue + except OSError: + pass + try: + os.unlink(entry.name, dir_fd=topfd) + except OSError as err: + onexc(os.unlink, fullname, err) + except OSError as err: + err.filename = path + onexc(func, path, err) + + def _rmtree_unsafe(path, onexc): + def onerror(err): + onexc(os.scandir, err.filename, err) + + results = os.walk(path, topdown=False, onerror=onerror, followlinks=hasattr(os, "_walk_symlinks_as_files")) + for dirpath, dirnames, filenames in results: + for name in dirnames: + fullname = os.path.join(dirpath, name) + try: + os.rmdir(fullname) + except OSError as err: + onexc(os.rmdir, fullname, err) + for name in filenames: + fullname = os.path.join(dirpath, name) + try: + os.unlink(fullname) + except OSError as err: + onexc(os.unlink, fullname, err) + try: + os.rmdir(path) + except OSError as err: + onexc(os.rmdir, path, err) + + if ignore_errors: + + def onexc(*args): + pass + + elif onerror is None and onexc is None: + + def onexc(*args): + raise + + elif onexc is None: + if onerror is None: + + def onexc(*args): + raise + + else: + # delegate to onerror + def onexc(*args): + func, path, exc = args + if exc is None: + exc_info = None, None, None + else: + exc_info = type(exc), exc, exc.__traceback__ + return onerror(func, path, exc_info) + + if _use_fd_functions: + # While the unsafe rmtree works fine on bytes, the fd based does not. + if isinstance(path, bytes): + path = os.fsdecode(path) + stack = [(os.lstat, dir_fd, path, None)] + try: + while stack: + _rmtree_safe_fd(stack, onexc) + finally: + # Close any file descriptors still on the stack. + while stack: + func, fd, path, entry = stack.pop() + if func is not os.close: + continue + try: + os.close(fd) + except OSError as err: + onexc(os.close, path, err) + else: + if dir_fd is not None: + raise NotImplementedError("dir_fd unavailable on this platform") + try: + if _rmtree_islink(path): + # symlinks to directories are forbidden, see bug #1669 + raise OSError("Cannot call rmtree on a symbolic link") + except OSError as err: + onexc(os.path.islink, path, err) + # can't continue even if onexc hook returns + return + return _rmtree_unsafe(path, onexc) + + +# copied https://github.com/python/cpython/blob/3.12/Lib/tempfile.py +# to add behaviour that available only for python3.10+ for older python version +class TemporaryDirectory(OrigTemporaryDirectory): + def __init__(self, suffix=None, prefix=None, dir=None, ignore_cleanup_errors=True, *, delete=True): + super().__init__(suffix=suffix, prefix=prefix, dir=dir) + self._ignore_cleanup_errors = ignore_cleanup_errors + self._delete = delete + self._finalizer = weakref.finalize( + self, + self._cleanup, + self.name, + warn_message="Implicitly cleaning up {!r}".format(self), + ignore_errors=self._ignore_cleanup_errors, + delete=self._delete, + ) + + @classmethod + def _cleanup(cls, name, warn_message, ignore_errors=False, delete=True): + if delete: + cls._rmtree(name, ignore_errors=ignore_errors) + warnings.warn(warn_message, ResourceWarning) + + @classmethod + def _rmtree(cls, name, ignore_errors=False, repeated=False): + def _dont_follow_symlinks(func, path, *args): + # Pass follow_symlinks=False, unless not supported on this platform. 
+ if func in os.supports_follow_symlinks: + func(path, *args, follow_symlinks=False) + elif os.name == "nt" or not os.path.islink(path): + func(path, *args) + + def _resetperms(path): + try: + chflags = os.chflags + except AttributeError: + pass + else: + _dont_follow_symlinks(chflags, path, 0) + _dont_follow_symlinks(os.chmod, path, 0o700) + + def onexc(func, path, exc): + if isinstance(exc, PermissionError): + if repeated and path == name: + if ignore_errors: + return + raise + + try: + if path != name: + _resetperms(os.path.dirname(path)) + _resetperms(path) + + try: + os.unlink(path) + except IsADirectoryError: + cls._rmtree(path, ignore_errors=ignore_errors) + except PermissionError: + # The PermissionError handler was originally added for + # FreeBSD in directories, but it seems that it is raised + # on Windows too. + # bpo-43153: Calling _rmtree again may + # raise NotADirectoryError and mask the PermissionError. + # So we must re-raise the current PermissionError if + # path is not a directory. + if not os.path.isdir(path) or os.path.isjunction(path): + if ignore_errors: + return + raise + cls._rmtree(path, ignore_errors=ignore_errors, repeated=(path == name)) + except FileNotFoundError: + pass + elif isinstance(exc, FileNotFoundError): + pass + else: + if not ignore_errors: + raise + + _rmtree(name, onexc=onexc) + + def cleanup(self): + if self._finalizer.detach() or os.path.exists(self.name): + self._rmtree(self.name, ignore_errors=self._ignore_cleanup_errors) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index d48e86fe2..43c535e67 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -15,7 +15,6 @@ import unittest from pathlib import Path -from tempfile import TemporaryDirectory import torch from parameterized import parameterized @@ -46,6 +45,7 @@ OVStableDiffusionXLPipeline, ) from optimum.intel.openvino.modeling_base import OVBaseModel +from optimum.intel.openvino.utils import TemporaryDirectory from optimum.intel.utils.import_utils import _transformers_version from optimum.utils.save_utils import maybe_load_preprocessors diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 04e6fc601..8443f95b3 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -14,7 +14,6 @@ import subprocess import unittest from pathlib import Path -from tempfile import TemporaryDirectory from parameterized import parameterized from transformers import AutoModelForCausalLM @@ -44,7 +43,7 @@ OVStableDiffusionXLPipeline, ) from optimum.intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS -from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS +from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS, TemporaryDirectory from optimum.intel.utils.import_utils import ( compare_versions, is_openvino_tokenizers_available, diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 2eb6c1e84..119e00403 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -94,7 +94,7 @@ OVModelWithEmbedForCausalLM, OVVisionEmbedding, ) -from optimum.intel.openvino.utils import _print_compiled_model_properties +from optimum.intel.openvino.utils import TemporaryDirectory, _print_compiled_model_properties from optimum.intel.pipelines import pipeline as optimum_pipeline from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version from optimum.intel.utils.modeling_utils import 
_find_files_matching_pattern @@ -171,7 +171,7 @@ def test_load_from_hub_and_save_model(self): self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits)) del compile_only_model - with tempfile.TemporaryDirectory() as tmpdirname: + with TemporaryDirectory() as tmpdirname: loaded_model.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) self.assertTrue(OV_XML_FILE_NAME in folder_contents) @@ -200,7 +200,7 @@ def test_load_from_hub_and_save_decoder_model(self, use_cache): self.assertEqual(loaded_model.request.get_compiled_model().get_property("PERFORMANCE_HINT"), "LATENCY") loaded_model_outputs = loaded_model(**tokens) - with tempfile.TemporaryDirectory() as tmpdirname: + with TemporaryDirectory() as tmpdirname: loaded_model.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) self.assertTrue(OV_XML_FILE_NAME in folder_contents) @@ -234,7 +234,7 @@ def test_load_from_hub_and_save_seq2seq_model(self): loaded_model_outputs = loaded_model.generate(**tokens) - with tempfile.TemporaryDirectory() as tmpdirname: + with TemporaryDirectory() as tmpdirname: loaded_model.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) self.assertTrue(OV_ENCODER_NAME in folder_contents) @@ -278,7 +278,7 @@ def test_load_from_hub_and_save_stable_diffusion_model(self): pipeline_outputs = loaded_pipeline(**inputs).images self.assertEqual(pipeline_outputs.shape, (batch_size, height, width, 3)) - with tempfile.TemporaryDirectory() as tmpdirname: + with TemporaryDirectory() as tmpdirname: loaded_pipeline.save_pretrained(tmpdirname) pipeline = OVStableDiffusionPipeline.from_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) @@ -333,7 +333,7 @@ def test_loading_with_config_in_root(self, subfolder): OVModelForFeatureExtraction.from_pretrained(model_id, subfolder=subfolder, export=export) # local model api = HfApi() - with tempfile.TemporaryDirectory() as tmpdirname: + with TemporaryDirectory() as tmpdirname: local_dir = Path(tmpdirname) / "model" api.snapshot_download(repo_id=model_id, local_dir=local_dir) OVModelForFeatureExtraction.from_pretrained(local_dir, subfolder=subfolder, export=export) @@ -341,7 +341,7 @@ def test_loading_with_config_in_root(self, subfolder): def test_infer_export_when_loading(self): model_id = MODEL_NAMES["phi"] model = AutoModelForCausalLM.from_pretrained(model_id) - with tempfile.TemporaryDirectory() as tmpdirname: + with TemporaryDirectory() as tmpdirname: model.save_pretrained(Path(tmpdirname) / "original") # Load original model and convert model = OVModelForCausalLM.from_pretrained(Path(tmpdirname) / "original") @@ -363,7 +363,7 @@ def test_find_files_matching_pattern(self): # local model api = HfApi() - with tempfile.TemporaryDirectory() as tmpdirname: + with TemporaryDirectory() as tmpdirname: for revision in ("main", "ov", "itrex"): local_dir = Path(tmpdirname) / revision api.snapshot_download(repo_id=model_id, local_dir=local_dir, revision=revision) @@ -382,7 +382,7 @@ def test_find_files_matching_pattern_sd(self, model_arch): # local model api = HfApi() - with tempfile.TemporaryDirectory() as tmpdirname: + with TemporaryDirectory() as tmpdirname: local_dir = Path(tmpdirname) / "model" api.snapshot_download(repo_id=model_id, local_dir=local_dir) ov_files = _find_files_matching_pattern(local_dir, pattern=pattern) @@ -416,7 +416,7 @@ def test_load_model_from_hub(self): self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) self.assertIsInstance(ov_pipe.model, OVBaseModel) - with tempfile.TemporaryDirectory() as 
tmpdirname: + with TemporaryDirectory() as tmpdirname: ov_exported_pipe.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) self.assertTrue(OV_XML_FILE_NAME in folder_contents) @@ -436,7 +436,7 @@ def test_seq2seq_load_from_hub(self): self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) self.assertIsInstance(ov_pipe.model, OVBaseModel) - with tempfile.TemporaryDirectory() as tmpdirname: + with TemporaryDirectory() as tmpdirname: ov_exported_pipe.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) self.assertTrue(OV_DECODER_WITH_PAST_NAME in folder_contents) @@ -752,7 +752,7 @@ def test_sentence_transformers_pipeline(self, model_arch): from Sentence Transformers then an appropriate exception raises. """ model_id = MODEL_NAMES[model_arch] - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: save_dir = str(tmp_dir) OVSentenceTransformer.from_pretrained(model_id, export=True).save_pretrained(save_dir) with self.assertRaises(Exception) as context: @@ -1389,7 +1389,7 @@ def test_compare_to_timm(self, model_id): @parameterized.expand(TIMM_MODELS) def test_timm_save_and_infer(self, model_id): ov_model = OVModelForImageClassification.from_pretrained(model_id, export=True) - with tempfile.TemporaryDirectory() as tmpdirname: + with TemporaryDirectory() as tmpdirname: model_save_path = os.path.join(tmpdirname, "timm_ov_model") ov_model.save_pretrained(model_save_path) model = OVModelForImageClassification.from_pretrained(model_save_path) @@ -2258,7 +2258,7 @@ def test_load_from_hub_and_save_model(self): loaded_model_outputs = loaded_model(tokens, processed_image) - with tempfile.TemporaryDirectory() as tmpdirname: + with TemporaryDirectory() as tmpdirname: loaded_model.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) self.assertTrue(loaded_model.text_model._xml_model_name in folder_contents) diff --git a/tests/openvino/test_modeling_sentence_transformers.py b/tests/openvino/test_modeling_sentence_transformers.py index acda04512..0ddd60ea0 100644 --- a/tests/openvino/test_modeling_sentence_transformers.py +++ b/tests/openvino/test_modeling_sentence_transformers.py @@ -14,7 +14,6 @@ import gc import os -import tempfile import unittest import numpy as np @@ -26,6 +25,7 @@ ) from optimum.intel import OVSentenceTransformer +from optimum.intel.openvino.utils import TemporaryDirectory SEED = 42 @@ -65,7 +65,7 @@ def test_compare_to_transformers(self, model_arch): def test_sentence_transformers_save_and_infer(self, model_arch): model_id = MODEL_NAMES[model_arch] ov_model = OVSentenceTransformer.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) - with tempfile.TemporaryDirectory() as tmpdirname: + with TemporaryDirectory() as tmpdirname: model_save_path = os.path.join(tmpdirname, "sentence_transformers_ov_model") ov_model.save_pretrained(model_save_path) model = OVSentenceTransformer.from_pretrained(model_save_path) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 719509738..b294e3e22 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -17,7 +17,6 @@ import itertools import logging -import tempfile import unittest from collections import defaultdict from enum import Enum @@ -70,6 +69,7 @@ _DEFAULT_4BIT_CONFIGS, _DEFAULT_4BIT_CONFIG, ) +from optimum.intel.openvino.utils import TemporaryDirectory from copy import deepcopy from optimum.intel.openvino.quantization import InferRequestWrapper @@ -102,7 +102,7 @@ def 
test_automodel_static_quantization(self, model_cls, model_name, expected_fak def preprocess_function(examples, tokenizer): return tokenizer(examples[column_name], padding="max_length", max_length=128, truncation=True) - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: transformers_model = model_cls.auto_model_class.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: @@ -146,7 +146,7 @@ def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_ def preprocess_function(examples, tokenizer): return tokenizer(examples[column_name], padding="max_length", max_length=128, truncation=True) - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: ov_model = model_cls.from_pretrained(model_id, export=True) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: @@ -315,7 +315,7 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i task = model_cls.export_feature model_id = MODEL_NAMES[model_name] - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: transformers_model = model_cls.auto_model_class.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: @@ -346,7 +346,7 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p task = model_cls.export_feature model_id = MODEL_NAMES[model_name] - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: transformers_model = model_cls.from_pretrained(model_id, export=True) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: @@ -371,7 +371,7 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_int8, expected_int4): task = model_cls.export_feature model_id = MODEL_NAMES[model_name] - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: transformers_model = model_cls.from_pretrained(model_id, export=True, stateful=False) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: @@ -398,7 +398,7 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, expected_pt_int8, expected_ov_int8): task = model_cls.export_feature model_id = MODEL_NAMES[model_name] - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: transformers_model = model_cls.from_pretrained(model_id, export=True, stateful=True) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: @@ -451,7 +451,7 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8): model_id = MODEL_NAMES[model_type] quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2) - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model.unet) @@ -497,7 +497,7 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset( 
"optimum.intel.openvino.configuration._DEFAULT_4BIT_CONFIGS", {"facebook/opt-125m": DEFAULT_INT4_CONFIG} ) def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_int8, expected_ov_int4): - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: model_id = MODEL_NAMES[model_type] model = model_cls.from_pretrained(model_id, export=True, quantization_config={"bits": 4}) tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -521,7 +521,7 @@ def test_ovmodel_4bit_auto_compression_with_config( self, model_cls, model_name, quantization_config, expected_num_weight_nodes ): model_id = MODEL_NAMES[model_name] - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config) model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) if quantization_config.quant_method.lower() == "awq": @@ -658,7 +658,7 @@ def test_ovmodel_4bit_dynamic_with_config( self, model_cls, model_name, quantization_config, expected_num_weight_nodes ): model_id = MODEL_NAMES[model_name] - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: group_size = quantization_config.pop("group_size", 32) quantization_config = OVDynamicQuantizationConfig( weights_group_size=group_size, activations_group_size=group_size, **quantization_config @@ -693,7 +693,7 @@ def preprocess_function(examples, tokenizer): examples["question"], examples["context"], padding="max_length", max_length=64, truncation=True ) - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: transformers_model = AutoModelForQuestionAnswering.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name) quantizer = OVQuantizer.from_pretrained(transformers_model) @@ -734,7 +734,7 @@ def preprocess_function(examples, tokenizer): examples["question"], examples["context"], padding="max_length", max_length=64, truncation=True ) - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: transformers_model = OVModelForQuestionAnswering.from_pretrained(model_name, export=True) tokenizer = AutoTokenizer.from_pretrained(model_name) quantizer = OVQuantizer.from_pretrained(transformers_model) @@ -787,7 +787,7 @@ def test_aware_training_quantization(self, model_name, expected_fake_quantize, e def compute_metrics(p): return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids) - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: trainer = OVTrainer( model=model, ov_config=ov_config, @@ -916,7 +916,7 @@ def get_default_configurations() -> dict: @parameterized.expand(QUANTIZATION_CONFIGS) def test_config_serialization(self, quantization_config: OVQuantizationConfigBase): ov_config = OVConfig(quantization_config=quantization_config) - with tempfile.TemporaryDirectory() as tmp_dir: + with TemporaryDirectory() as tmp_dir: ov_config.save_pretrained(tmp_dir) loaded_ov_config = OVConfig.from_pretrained(tmp_dir) diff --git a/tests/openvino/test_training_examples.py b/tests/openvino/test_training_examples.py index 8a33ba42e..023f9df7b 100644 --- a/tests/openvino/test_training_examples.py +++ b/tests/openvino/test_training_examples.py @@ -15,7 +15,6 @@ import os import subprocess import sys -import tempfile import unittest from dataclasses import dataclass from pathlib import Path @@ -25,7 +24,7 @@ import torch.cuda from 
parameterized import parameterized -from optimum.intel.openvino.utils import OV_XML_FILE_NAME +from optimum.intel.openvino.utils import OV_XML_FILE_NAME, TemporaryDirectory PROJECT_ROOT = Path(__file__).parents[2] @@ -148,7 +147,7 @@ def test_single_card_training(self, _, desc: TrainingExampleDescriptor): self.skipTest("No enough cuda devices.") self.env[CUDA_VISIBLE_DEVICES] = str(self.available_cuda_device_ids[0]) - with tempfile.TemporaryDirectory() as output_dir: + with TemporaryDirectory() as output_dir: args = ["torchrun", "--nproc_per_node=1", desc.filename, *desc.get_args_with_output_dir(output_dir)] proc = subprocess.Popen( args=args, @@ -165,7 +164,7 @@ def test_data_parallel_training(self, _, desc: TrainingExampleDescriptor): self.skipTest("No enough cuda devices.") self.env[CUDA_VISIBLE_DEVICES] = ",".join(map(str, self.available_cuda_device_ids[:2])) - with tempfile.TemporaryDirectory() as output_dir: + with TemporaryDirectory() as output_dir: args = [sys.executable, desc.filename, *desc.get_args_with_output_dir(output_dir)] proc = subprocess.Popen( args=args, @@ -182,7 +181,7 @@ def test_distributed_data_parallel_training(self, _, desc: TrainingExampleDescri self.skipTest("No enough cuda devices.") self.env[CUDA_VISIBLE_DEVICES] = ",".join(map(str, self.available_cuda_device_ids[:2])) - with tempfile.TemporaryDirectory() as output_dir: + with TemporaryDirectory() as output_dir: args = [ "torchrun", "--rdzv_backend=c10d", From 227defe0dfb160b61bee88092072a183fa5aeaa3 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 23 Oct 2024 11:25:35 +0200 Subject: [PATCH 18/53] Skip training tests on windows (#930) * Skip training tests on windows * Trigger Tests * Trigger Tests --- tests/openvino/test_training.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py index 014b80268..4147a632b 100644 --- a/tests/openvino/test_training.py +++ b/tests/openvino/test_training.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os import random import re import shutil @@ -89,6 +90,10 @@ def initialize_movement_sparsifier_parameters_by_sparsity( operand.bias_importance.copy_(bias_init_tensor) +def is_windows(): + return os.name == "nt" + + def is_avx_vnni_supported() -> bool: return any(re.search("avx.*vnni", flag.lower()) is not None for flag in cpuinfo.get_cpu_info()["flags"]) @@ -614,6 +619,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): # TODO : can be moved to MODEL_NAMES["swin-window"] after transformers v4.42.3 +@unittest.skipIf(is_windows(), reason="Fails on windows") class OVTrainerImageClassificationTrainingTest(OVTrainerBaseTrainingTest): ovmodel_cls = OVModelForImageClassification task = "image-classification" @@ -794,6 +800,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): } +@unittest.skipIf(is_windows(), reason="Fails on windows") class OVTrainerAudioClassificationTrainingTest(OVTrainerBaseTrainingTest): ovmodel_cls = OVModelForAudioClassification task = "audio-classification" From b1f8d1d6957c12b403a04db1ebd58f6668a4f951 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Thu, 24 Oct 2024 11:50:11 +0400 Subject: [PATCH 19/53] disable warning about tokenizers version for ov tokenizers >= 2024.5 (#962) * disable warning about tokenizers version for ov tokenizers >= 2024.5 * increase verbosity for tokenizers test --- optimum/exporters/openvino/__main__.py | 4 +++- optimum/exporters/openvino/convert.py | 5 +++-- optimum/intel/utils/import_utils.py | 18 ++++++++++++++++++ tests/openvino/test_exporters_cli.py | 2 +- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 69cfec1d9..412ed21f6 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -55,6 +55,9 @@ logger = logging.getLogger(__name__) +# init core before import openvino tokenizers to prevent failed attempt loading extension +core = Core() + def infer_task( task, @@ -413,7 +416,6 @@ class StoreAttr(object): del model gc.collect() - core = Core() for submodel_path in submodel_paths: submodel_path = Path(output) / submodel_path submodel = core.read_model(submodel_path) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 4e6503b5b..e731cd180 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -41,6 +41,7 @@ _torch_version, _transformers_version, compare_versions, + is_openvino_tokenizers_version, is_tokenizers_version, is_transformers_version, ) @@ -734,9 +735,9 @@ def export_tokenizer( except ModuleNotFoundError: return - if is_tokenizers_version(">", "0.19"): + if is_tokenizers_version(">", "0.19") and is_openvino_tokenizers_version("<", "2024.5.0.0"): logger.warning( - "Exporting tokenizers to OpenVINO is not supported for tokenizers version > 0.19. " + "Exporting tokenizers to OpenVINO is not supported for tokenizers version > 0.19 and openvino version <= 2024.4. " "Please downgrade to tokenizers version <= 0.19 to export tokenizers to OpenVINO." 
) diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 60d20361e..6fa6c590b 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -382,6 +382,24 @@ def is_openvino_version(operation: str, version: str): return compare_versions(parse(_openvino_version), operation, version) +def is_openvino_tokenizers_version(operation: str, version: str): + if not is_openvino_available(): + return False + if not is_openvino_tokenizers_available(): + return False + import openvino_tokenizers + + tokenizers_version = openvino_tokenizers.__version__ + + if tokenizers_version == "0.0.0.0": + try: + tokenizers_version = importlib_metadata.version("openvino_tokenizers") or tokenizers_version + except importlib_metadata.PackageNotFoundError: + pass + + return compare_versions(parse(tokenizers_version), operation, version) + + def is_diffusers_version(operation: str, version: str): """ Compare the current diffusers version to a given reference with an operation. diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 8443f95b3..cea6c94fc 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -153,7 +153,7 @@ def test_exporters_cli(self, task: str, model_type: str): def test_exporters_cli_tokenizers(self, task: str, model_type: str): with TemporaryDirectory() as tmpdir: output = subprocess.check_output( - f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} {tmpdir}", + f"TRANSFORMERS_VERBOSITY=debug optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} {tmpdir}", shell=True, stderr=subprocess.STDOUT, ).decode() From a432102fc2354d991b72053d82e30ab7c6d790fa Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Thu, 24 Oct 2024 12:01:53 +0400 Subject: [PATCH 20/53] restore original model_index.json after save_pretrained call (#961) --- optimum/intel/openvino/modeling_diffusion.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 68dc31bc9..22e8bf314 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -259,6 +259,26 @@ def _save_pretrained(self, save_directory: Union[str, Path]): self._save_openvino_config(save_directory) + def _save_config(self, save_directory): + """ + Saves a model configuration into a directory, so that it can be re-loaded using the + [`from_pretrained`] class method. 
+ """ + model_dir = ( + self.model_save_dir + if not isinstance(self.model_save_dir, TemporaryDirectory) + else self.model_save_dir.name + ) + save_dir = Path(save_directory) + original_config = Path(model_dir) / self.config_name + if original_config.exists(): + if not save_dir.exists(): + save_dir.mkdir(parents=True) + + shutil.copy(original_config, save_dir) + else: + self.config.save_pretrained(save_dir) + @classmethod def _from_pretrained( cls, From 86598a6c6e4338678f50f0ef54eda0fe95f95930 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Thu, 24 Oct 2024 13:43:37 +0400 Subject: [PATCH 21/53] sd3 pipeline support (#916) * WIP: conversion and pipeline base * Support SD3 * img2img pipeline * fix model export * update after migration on new pipeline style * fix inference issues * fix missed tokenizer export * add support in quantization * Update optimum/intel/openvino/modeling_diffusion.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * add tests * fix tests * update tests * Update tests/openvino/utils_tests.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * fix tests * add export tests * fix cli tests * use fp32 timesteps * add flux * fix after black update * apply review comments * compatibility with diffusers 0.31.0 * apply review comments * Update tests/openvino/test_diffusion.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update tests/openvino/test_diffusion.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- optimum/commands/export/openvino.py | 4 + optimum/exporters/openvino/__main__.py | 2 +- optimum/exporters/openvino/convert.py | 265 ++++++++++++++- optimum/exporters/openvino/model_configs.py | 172 +++++++++- optimum/exporters/openvino/model_patcher.py | 38 +++ optimum/intel/__init__.py | 16 + optimum/intel/openvino/__init__.py | 4 + optimum/intel/openvino/modeling_diffusion.py | 314 ++++++++++++++++-- optimum/intel/openvino/quantization.py | 40 ++- optimum/intel/openvino/utils.py | 2 + .../dummy_openvino_and_diffusers_objects.py | 44 +++ optimum/intel/utils/modeling_utils.py | 15 +- tests/openvino/test_diffusion.py | 132 ++++++-- tests/openvino/test_export.py | 7 +- tests/openvino/test_exporters_cli.py | 27 +- tests/openvino/test_quantization.py | 25 +- tests/openvino/utils_tests.py | 4 + 17 files changed, 1001 insertions(+), 110 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 93528e008..70d2e4885 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -318,6 +318,10 @@ def run(self): from optimum.intel import OVStableDiffusionPipeline model_cls = OVStableDiffusionPipeline + elif class_name == "StableDiffusion3Pipeline": + from optimum.intel import OVStableDiffusion3Pipeline + + model_cls = OVStableDiffusion3Pipeline else: raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.") diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 412ed21f6..ee61563c9 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -493,7 +493,7 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro f"models won't be generated. 
Exception: {exception}" ) elif model: - for tokenizer_name in ("tokenizer", "tokenizer_2"): + for tokenizer_name in ("tokenizer", "tokenizer_2", "tokenizer_3"): tokenizer = getattr(model, tokenizer_name, None) if tokenizer: export_tokenizer(tokenizer, output / tokenizer_name, task=task) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index e731cd180..2c076827d 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import functools import gc import logging @@ -31,7 +32,12 @@ from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx -from optimum.exporters.utils import _get_submodels_and_export_configs as _default_get_submodels_and_export_configs +from optimum.exporters.utils import ( + _get_submodels_and_export_configs as _default_get_submodels_and_export_configs, +) +from optimum.exporters.utils import ( + get_diffusion_models_for_export, +) from optimum.intel.utils.import_utils import ( _nncf_version, _open_clip_version, @@ -619,23 +625,27 @@ def export_from_model( model, library_name, task, preprocessors, custom_export_configs, fn_get_submodels ) - logging.disable(logging.INFO) - export_config, models_and_export_configs, stateful_submodels = _get_submodels_and_export_configs( - model=model, - task=task, - monolith=False, - custom_export_configs=custom_export_configs if custom_export_configs is not None else {}, - custom_architecture=custom_architecture, - fn_get_submodels=fn_get_submodels, - preprocessors=preprocessors, - library_name=library_name, - model_kwargs=model_kwargs, - _variant="default", - legacy=False, - exporter="openvino", - stateful=stateful, - ) - logging.disable(logging.NOTSET) + if library_name == "diffusers": + export_config, models_and_export_configs = get_diffusion_models_for_export_ext(model, exporter="openvino") + stateful_submodels = False + else: + logging.disable(logging.INFO) + export_config, models_and_export_configs, stateful_submodels = _get_submodels_and_export_configs( + model=model, + task=task, + monolith=False, + custom_export_configs=custom_export_configs if custom_export_configs is not None else {}, + custom_architecture=custom_architecture, + fn_get_submodels=fn_get_submodels, + preprocessors=preprocessors, + library_name=library_name, + model_kwargs=model_kwargs, + _variant="default", + legacy=False, + exporter="openvino", + stateful=stateful, + ) + logging.disable(logging.NOTSET) if library_name == "open_clip": if hasattr(model.config, "save_pretrained"): @@ -701,6 +711,10 @@ def export_from_model( if tokenizer_2 is not None: tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) + tokenizer_3 = getattr(model, "tokenizer_3", None) + if tokenizer_3 is not None: + tokenizer_3.save_pretrained(output.joinpath("tokenizer_3")) + model.save_config(output) export_models( @@ -889,3 +903,218 @@ def _get_submodels_and_export_configs( ) stateful_per_model = [stateful] * len(models_for_export) return export_config, models_for_export, stateful_per_model + + +def get_diffusion_models_for_export_ext( + pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", exporter: str = "openvino" +): + try: + from diffusers import ( + 
StableDiffusion3Img2ImgPipeline, + StableDiffusion3InpaintPipeline, + StableDiffusion3Pipeline, + ) + + is_sd3 = isinstance( + pipeline, (StableDiffusion3Pipeline, StableDiffusion3InpaintPipeline, StableDiffusion3Img2ImgPipeline) + ) + except ImportError: + is_sd3 = False + + try: + from diffusers import FluxPipeline + + is_flux = isinstance(pipeline, FluxPipeline) + except ImportError: + is_flux = False + + if not is_sd3 and not is_flux: + return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter) + if is_sd3: + models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) + else: + models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype) + + return None, models_for_export + + +def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype): + models_for_export = {} + + # Text encoder + text_encoder = getattr(pipeline, "text_encoder", None) + if text_encoder is not None: + text_encoder.config.output_hidden_states = True + text_encoder.text_model.config.output_hidden_states = True + text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + model_type="clip-text-with-projection", + ) + text_encoder_export_config = text_encoder_config_constructor( + pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["text_encoder"] = (text_encoder, text_encoder_export_config) + + transformer = pipeline.transformer + transformer.config.text_encoder_projection_dim = transformer.config.joint_attention_dim + transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + transformer.config.time_cond_proj_dim = None + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=transformer, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="sd3-transformer", + ) + transformer_export_config = export_config_constructor( + pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["transformer"] = (transformer, transformer_export_config) + + # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + vae_encoder = copy.deepcopy(pipeline.vae) + vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} + vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_encoder, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-encoder", + ) + vae_encoder_export_config = vae_config_constructor( + vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config) + + # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + vae_decoder = copy.deepcopy(pipeline.vae) + vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) + vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_decoder, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-decoder", + ) + vae_decoder_export_config = vae_config_constructor( + vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_decoder"] = 
(vae_decoder, vae_decoder_export_config) + + text_encoder_2 = getattr(pipeline, "text_encoder_2", None) + if text_encoder_2 is not None: + text_encoder_2.config.output_hidden_states = True + text_encoder_2.text_model.config.output_hidden_states = True + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_2, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + model_type="clip-text-with-projection", + ) + export_config = export_config_constructor(text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["text_encoder_2"] = (text_encoder_2, export_config) + + text_encoder_3 = getattr(pipeline, "text_encoder_3", None) + if text_encoder_3 is not None: + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_3, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + model_type="t5-encoder-model", + ) + export_config = export_config_constructor( + text_encoder_3.config, + int_dtype=int_dtype, + float_dtype=float_dtype, + ) + models_for_export["text_encoder_3"] = (text_encoder_3, export_config) + + return models_for_export + + +def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype): + models_for_export = {} + + # Text encoder + text_encoder = getattr(pipeline, "text_encoder", None) + if text_encoder is not None: + text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + model_type="clip-text-model", + ) + text_encoder_export_config = text_encoder_config_constructor( + pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["text_encoder"] = (text_encoder, text_encoder_export_config) + + transformer = pipeline.transformer + transformer.config.text_encoder_projection_dim = transformer.config.joint_attention_dim + transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + transformer.config.time_cond_proj_dim = None + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=transformer, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="flux-transformer", + ) + transformer_export_config = export_config_constructor( + pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["transformer"] = (transformer, transformer_export_config) + + # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + vae_encoder = copy.deepcopy(pipeline.vae) + vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} + vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_encoder, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-encoder", + ) + vae_encoder_export_config = vae_config_constructor( + vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config) + + # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + vae_decoder = copy.deepcopy(pipeline.vae) + vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) + vae_config_constructor = TasksManager.get_exporter_config_constructor( + 
model=vae_decoder, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-decoder", + ) + vae_decoder_export_config = vae_config_constructor( + vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config) + + text_encoder_2 = getattr(pipeline, "text_encoder_2", None) + if text_encoder_2 is not None: + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_2, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + model_type="t5-encoder-model", + ) + export_config = export_config_constructor( + text_encoder_2.config, + int_dtype=int_dtype, + float_dtype=float_dtype, + ) + models_for_export["text_encoder_2"] = (text_encoder_2, export_config) + + return models_for_export diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 33190e6f1..ace5c150d 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -35,22 +35,26 @@ MistralOnnxConfig, MPTOnnxConfig, PhiOnnxConfig, + UNetOnnxConfig, VisionOnnxConfig, ) from optimum.exporters.onnx.model_patcher import ModelPatcher from optimum.exporters.tasks import TasksManager from optimum.utils import DEFAULT_DUMMY_SHAPES from optimum.utils.input_generators import ( + DTYPE_MAPPER, DummyInputGenerator, DummyPastKeyValuesGenerator, + DummySeq2SeqDecoderTextInputGenerator, DummyTextInputGenerator, + DummyTimestepInputGenerator, DummyVisionInputGenerator, FalconDummyPastKeyValuesGenerator, MistralDummyPastKeyValuesGenerator, ) -from optimum.utils.normalized_config import NormalizedTextConfig, NormalizedVisionConfig +from optimum.utils.normalized_config import NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig -from ...intel.utils.import_utils import _transformers_version, is_transformers_version +from ...intel.utils.import_utils import _transformers_version, is_diffusers_version, is_transformers_version from .model_patcher import ( AquilaModelPatcher, ArcticModelPatcher, @@ -60,6 +64,7 @@ DBRXModelPatcher, DeciLMModelPatcher, FalconModelPatcher, + FluxTransfromerModelPatcher, Gemma2ModelPatcher, GptNeoxJapaneseModelPatcher, GptNeoxModelPatcher, @@ -1570,3 +1575,166 @@ def patch_model_for_export( if self._behavior != InternVLChatConfigBehavior.VISION_EMBEDDINGS: return super().patch_model_for_export(model, model_kwargs) return InternVLChatImageEmbeddingModelPatcher(self, model, model_kwargs) + + +class PooledProjectionsDummyInputGenerator(DummyInputGenerator): + SUPPORTED_INPUT_NAMES = ["pooled_projections"] + + def __init__( + self, + task: str, + normalized_config: NormalizedConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + self.task = task + self.batch_size = batch_size + self.pooled_projection_dim = normalized_config.config.pooled_projection_dim + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + shape = [self.batch_size, self.pooled_projection_dim] + return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) + + +class DummyTransformerTimestpsInputGenerator(DummyTimestepInputGenerator): + SUPPORTED_INPUT_NAMES = ("timestep", "text_embeds", "time_ids", "timestep_cond", "guidance") + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", 
float_dtype: str = "fp32"): + if input_name in ["timestep", "guidance"]: + shape = [self.batch_size] + return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) + return super().generate(input_name, framework, int_dtype, float_dtype) + + +@register_in_tasks_manager("sd3-transformer", *["semantic-segmentation"], library_name="diffusers") +class SD3TransformerOpenVINOConfig(UNetOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + (DummyTransformerTimestpsInputGenerator,) + + UNetOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + + (PooledProjectionsDummyInputGenerator,) + ) + NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( + image_size="sample_size", + num_channels="in_channels", + hidden_size="joint_attention_dim", + vocab_size="attention_head_dim", + allow_new=True, + ) + + @property + def inputs(self): + common_inputs = super().inputs + common_inputs["pooled_projections"] = {0: "batch_size"} + return common_inputs + + def rename_ambiguous_inputs(self, inputs): + # The input name in the model signature is `x, hence the export input name is updated. + hidden_states = inputs.pop("sample", None) + if hidden_states is not None: + inputs["hidden_states"] = hidden_states + return inputs + + +@register_in_tasks_manager("t5-encoder-model", *["feature-extraction"], library_name="diffusers") +class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig): + pass + + +class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "pixel_values", + "pixel_mask", + "sample", + "latent_sample", + "hidden_states", + "img_ids", + ) + + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: int = DEFAULT_DUMMY_SHAPES["width"], + height: int = DEFAULT_DUMMY_SHAPES["height"], + **kwargs, + ): + super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs) + if getattr(normalized_config, "in_channels", None): + self.num_channels = normalized_config.in_channels // 4 + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name in ["hidden_states", "sample"]: + shape = [self.batch_size, (self.height // 2) * (self.width // 2), self.num_channels * 4] + return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) + if input_name == "img_ids": + img_ids_height = self.height // 2 + img_ids_width = self.width // 2 + return self.random_int_tensor( + [self.batch_size, img_ids_height * img_ids_width, 3] + if is_diffusers_version("<", "0.31.0") + else [img_ids_height * img_ids_width, 3], + min_value=0, + max_value=min(img_ids_height, img_ids_width), + framework=framework, + dtype=float_dtype, + ) + + return super().generate(input_name, framework, int_dtype, float_dtype) + + +class DummyFluxTextInputGenerator(DummySeq2SeqDecoderTextInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "decoder_input_ids", + "decoder_attention_mask", + "encoder_outputs", + "encoder_hidden_states", + "txt_ids", + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "txt_ids": + import torch + + shape = ( + [self.batch_size, self.sequence_length, 3] + if is_diffusers_version("<", "0.31.0") + else [self.sequence_length, 3] + ) + dtype = DTYPE_MAPPER.pt(float_dtype) + return torch.full(shape, 0, dtype=dtype) + return 
super().generate(input_name, framework, int_dtype, float_dtype) + + +@register_in_tasks_manager("flux-transformer", *["semantic-segmentation"], library_name="diffusers") +class FluxTransformerOpenVINOConfig(SD3TransformerOpenVINOConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTransformerTimestpsInputGenerator, + DummyFluxTransformerInputGenerator, + DummyFluxTextInputGenerator, + PooledProjectionsDummyInputGenerator, + ) + + @property + def inputs(self): + common_inputs = super().inputs + common_inputs.pop("sample", None) + common_inputs["hidden_states"] = {0: "batch_size", 1: "packed_height_width"} + common_inputs["txt_ids"] = ( + {0: "batch_size", 1: "sequence_length"} if is_diffusers_version("<", "0.31.0") else {0: "sequence_length"} + ) + common_inputs["img_ids"] = ( + {0: "batch_size", 1: "packed_height_width"} + if is_diffusers_version("<", "0.31.0") + else {0: "packed_height_width"} + ) + if getattr(self._normalized_config, "guidance_embeds", False): + common_inputs["guidance"] = {0: "batch_size"} + return common_inputs + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> ModelPatcher: + return FluxTransfromerModelPatcher(self, model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index eadce6d38..3bc9452ff 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -29,6 +29,7 @@ _openvino_version, _torch_version, _transformers_version, + is_diffusers_version, is_openvino_version, is_torch_version, is_transformers_version, @@ -2705,3 +2706,40 @@ def __init__( def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) self._model.forward = self._model.__orig_forward + + +def _embednb_forward(self, ids: torch.Tensor) -> torch.Tensor: + def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: + assert dim % 2 == 0, "The dimension must be even." 
+ + scale = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim + omega = 1.0 / (theta**scale) + + batch_size, seq_length = pos.shape + out = pos.unsqueeze(-1) * omega.unsqueeze(0).unsqueeze(0) + cos_out = torch.cos(out) + sin_out = torch.sin(out) + + stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1) + out = stacked_out.view(batch_size, -1, dim // 2, 2, 2) + return out.float() + + n_axes = ids.shape[-1] + emb = torch.cat( + [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], + dim=-3, + ) + return emb.unsqueeze(1) + + +class FluxTransfromerModelPatcher(ModelPatcher): + def __enter__(self): + super().__enter__() + if is_diffusers_version("<", "0.31.0"): + self._model.pos_embed._orig_forward = self._model.pos_embed.forward + self._model.pos_embed.forward = types.MethodType(_embednb_forward, self._model.pos_embed) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + if hasattr(self._model.pos_embed, "_orig_forward"): + self._model.pos_embed.forward = self._model.pos_embed._orig_forward diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 5926f1869..67a01011a 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -100,8 +100,12 @@ "OVStableDiffusionXLPipeline", "OVStableDiffusionXLImg2ImgPipeline", "OVStableDiffusionXLInpaintPipeline", + "OVStableDiffusion3Pipeline", + "OVStableDiffusion3Image2ImagePipeline", + "OVStableDiffusion3InpaintPipeline", "OVLatentConsistencyModelPipeline", "OVLatentConsistencyModelImg2ImgPipeline", + "OVFluxPipeline", "OVPipelineForImage2Image", "OVPipelineForText2Image", "OVPipelineForInpainting", @@ -116,8 +120,12 @@ "OVStableDiffusionXLPipeline", "OVStableDiffusionXLImg2ImgPipeline", "OVStableDiffusionXLInpaintPipeline", + "OVStableDiffusion3Pipeline", + "OVStableDiffusion3Image2ImagePipeline", + "OVStableDiffusion3InpaintPipeline", "OVLatentConsistencyModelPipeline", "OVLatentConsistencyModelImg2ImgPipeline", + "OVFluxPipeline", "OVPipelineForImage2Image", "OVPipelineForText2Image", "OVPipelineForInpainting", @@ -263,10 +271,14 @@ except OptionalDependencyNotAvailable: from .utils.dummy_openvino_and_diffusers_objects import ( OVDiffusionPipeline, + OVFluxPipeline, OVLatentConsistencyModelPipeline, OVPipelineForImage2Image, OVPipelineForInpainting, OVPipelineForText2Image, + OVStableDiffusion3Img2ImgPipeline, + OVStableDiffusion3InpaintPipeline, + OVStableDiffusion3Pipeline, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, @@ -276,11 +288,15 @@ else: from .openvino import ( OVDiffusionPipeline, + OVFluxPipeline, OVLatentConsistencyModelImg2ImgPipeline, OVLatentConsistencyModelPipeline, OVPipelineForImage2Image, OVPipelineForInpainting, OVPipelineForText2Image, + OVStableDiffusion3Img2ImgPipeline, + OVStableDiffusion3InpaintPipeline, + OVStableDiffusion3Pipeline, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 549bf8170..589a0938e 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -82,11 +82,15 @@ if is_diffusers_available(): from .modeling_diffusion import ( OVDiffusionPipeline, + OVFluxPipeline, OVLatentConsistencyModelImg2ImgPipeline, OVLatentConsistencyModelPipeline, OVPipelineForImage2Image, OVPipelineForInpainting, OVPipelineForText2Image, + OVStableDiffusion3Img2ImgPipeline, + 
OVStableDiffusion3InpaintPipeline, + OVStableDiffusion3Pipeline, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 22e8bf314..8bca8cc9a 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -22,7 +22,7 @@ from copy import deepcopy from pathlib import Path from tempfile import gettempdir -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, List, Optional, Union import numpy as np import openvino @@ -82,6 +82,20 @@ else: from diffusers.models.vae import DiagonalGaussianDistribution +if is_diffusers_version(">=", "0.29.0"): + from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline +else: + StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline = StableDiffusionPipeline, StableDiffusionImg2ImgPipeline + +if is_diffusers_version(">=", "0.30.0"): + from diffusers import FluxPipeline, StableDiffusion3InpaintPipeline +else: + StableDiffusion3InpaintPipeline = StableDiffusionInpaintPipeline + FluxPipeline = StableDiffusionPipeline + + +DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" +DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER = "text_encoder_3" core = Core() @@ -99,15 +113,18 @@ class OVDiffusionPipeline(OVBaseModel, DiffusionPipeline): def __init__( self, scheduler: SchedulerMixin, - unet: openvino.runtime.Model, - vae_decoder: openvino.runtime.Model, + unet: Optional[openvino.runtime.Model] = None, + vae_decoder: Optional[openvino.runtime.Model] = None, # optional pipeline models vae_encoder: Optional[openvino.runtime.Model] = None, text_encoder: Optional[openvino.runtime.Model] = None, text_encoder_2: Optional[openvino.runtime.Model] = None, + text_encoder_3: Optional[openvino.runtime.Model] = None, + transformer: Optional[openvino.runtime.Model] = None, # optional pipeline submodels tokenizer: Optional[CLIPTokenizer] = None, tokenizer_2: Optional[CLIPTokenizer] = None, + tokenizer_3: Optional[CLIPTokenizer] = None, feature_extractor: Optional[CLIPFeatureExtractor] = None, # stable diffusion xl specific arguments force_zeros_for_empty_prompt: bool = True, @@ -149,7 +166,15 @@ def __init__( f"Please set `compile_only=False` or `dynamic_shapes={model_is_dynamic}`" ) - self.unet = OVModelUnet(unet, self, DIFFUSION_MODEL_UNET_SUBFOLDER) + self.unet = OVModelUnet(unet, self, DIFFUSION_MODEL_UNET_SUBFOLDER) if unet is not None else None + self.transformer = ( + OVModelTransformer(transformer, self, DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER) + if transformer is not None + else None + ) + + if unet is None and transformer is None: + raise ValueError("`unet` or `transformer` model should be provided for pipeline work") self.vae_decoder = OVModelVaeDecoder(vae_decoder, self, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER) self.vae_encoder = ( OVModelVaeEncoder(vae_encoder, self, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER) @@ -166,12 +191,18 @@ def __init__( if text_encoder_2 is not None else None ) + self.text_encoder_3 = ( + OVModelTextEncoder(text_encoder_3, self, DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER) + if text_encoder_3 is not None + else None + ) # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API self.vae = OVModelVae(decoder=self.vae_decoder, encoder=self.vae_encoder) self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 + self.tokenizer_3 = tokenizer_3 self.feature_extractor = 
feature_extractor # we allow passing these as torch models for now @@ -181,13 +212,16 @@ def __init__( all_pipeline_init_args = { "vae": self.vae, "unet": self.unet, + "transformer": self.transformer, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, + "text_encoder_3": self.text_encoder_3, "safety_checker": self.safety_checker, "image_encoder": self.image_encoder, "scheduler": self.scheduler, "tokenizer": self.tokenizer, "tokenizer_2": self.tokenizer_2, + "tokenizer_3": self.tokenizer_3, "feature_extractor": self.feature_extractor, "requires_aesthetics_score": requires_aesthetics_score, "force_zeros_for_empty_prompt": force_zeros_for_empty_prompt, @@ -236,6 +270,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER), (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER), (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER), + (self.text_encoder_3, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER), + (self.transformer, save_directory / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER), } for model, save_path in models_to_save_paths: if model is not None: @@ -254,6 +290,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): self.tokenizer.save_pretrained(save_directory / "tokenizer") if self.tokenizer_2 is not None: self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") + if self.tokenizer_3 is not None: + self.tokenizer_3.save_pretrained(save_directory / "tokenizer_3") if self.feature_extractor is not None: self.feature_extractor.save_pretrained(save_directory / "feature_extractor") @@ -294,6 +332,8 @@ def _from_pretrained( vae_encoder_file_name: Optional[str] = None, text_encoder_file_name: Optional[str] = None, text_encoder_2_file_name: Optional[str] = None, + text_encoder_3_file_name: Optional[str] = None, + transformer_file_name: Optional[str] = None, from_onnx: bool = False, load_in_8bit: bool = False, quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, @@ -314,6 +354,8 @@ def _from_pretrained( vae_decoder_file_name = vae_decoder_file_name or default_file_name text_encoder_file_name = text_encoder_file_name or default_file_name text_encoder_2_file_name = text_encoder_2_file_name or default_file_name + text_encoder_3_file_name = text_encoder_3_file_name or default_file_name + transformer_file_name = transformer_file_name or default_file_name if not os.path.isdir(str(model_id)): all_components = {key for key in config.keys() if not key.startswith("_")} | {"vae_encoder", "vae_decoder"} @@ -321,15 +363,19 @@ def _from_pretrained( allow_patterns.update( { unet_file_name, + transformer_file_name, vae_encoder_file_name, vae_decoder_file_name, text_encoder_file_name, text_encoder_2_file_name, + text_encoder_3_file_name, unet_file_name.replace(".xml", ".bin"), + transformer_file_name.replace(".xml", ".bin"), vae_encoder_file_name.replace(".xml", ".bin"), vae_decoder_file_name.replace(".xml", ".bin"), text_encoder_file_name.replace(".xml", ".bin"), text_encoder_2_file_name.replace(".xml", ".bin"), + text_encoder_3_file_name.replace(".xml", ".bin"), SCHEDULER_CONFIG_NAME, cls.config_name, CONFIG_NAME, @@ -357,9 +403,15 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = model_save_path - submodels = {"scheduler": None, "tokenizer": None, "tokenizer_2": None, "feature_extractor": None} + submodels = { + "scheduler": None, + "tokenizer": None, + "tokenizer_2": None, + "tokenizer_3": 
None, + "feature_extractor": None, + } for name in submodels.keys(): - if kwargs.get(name, None) is not None: + if kwargs.get(name) is not None: submodels[name] = kwargs.pop(name) elif config.get(name, (None, None))[0] is not None: library_name, library_classes = config.get(name) @@ -374,17 +426,19 @@ def _from_pretrained( models = { "unet": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + "transformer": model_save_path / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER / transformer_file_name, "vae_decoder": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, "vae_encoder": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, "text_encoder": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, "text_encoder_2": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, + "text_encoder_3": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER / text_encoder_3_file_name, } compile_only = kwargs.get("compile_only", False) quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) if (quantization_config is None or quantization_config.dataset is None) and not compile_only: for name, path in models.items(): - if kwargs.get(name, None) is not None: + if name in kwargs: models[name] = kwargs.pop(name) else: models[name] = cls.load_model(path, quantization_config) if path.is_file() else None @@ -395,7 +449,7 @@ def _from_pretrained( if "GPU" in device.upper() and "INFERENCE_PRECISION_HINT" not in vae_ov_conifg: vae_ov_conifg["INFERENCE_PRECISION_HINT"] = "f32" for name, path in models.items(): - if kwargs.get(name, None) is not None: + if name in kwargs: models[name] = kwargs.pop(name) else: models[name] = ( @@ -416,7 +470,7 @@ def _from_pretrained( from optimum.intel import OVQuantizer for name, path in models.items(): - if kwargs.get(name, None) is not None: + if name in kwargs: models[name] = kwargs.pop(name) else: models[name] = cls.load_model(path) if path.is_file() else None @@ -431,7 +485,6 @@ def _from_pretrained( quantizer.quantize(ov_config=OVConfig(quantization_config=hybrid_quantization_config)) return ov_pipeline - ov_pipeline = ov_pipeline_class( **models, **submodels, @@ -483,6 +536,7 @@ def _from_transformers( no_post_process=True, revision=revision, cache_dir=cache_dir, + task=cls.export_feature, token=token, local_files_only=local_files_only, force_download=force_download, @@ -515,7 +569,7 @@ def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = if isinstance(device, str): self._device = device.upper() - self.request = None + self.clear_requests() elif device is not None: raise ValueError( "The `device` argument should be a string representing the device on which the model should be loaded." 
@@ -531,21 +585,24 @@ def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] =
 
     @property
     def height(self) -> int:
-        height = self.unet.model.inputs[0].get_partial_shape()[2]
+        model = self.vae.decoder.model
+        height = model.inputs[0].get_partial_shape()[2]
         if height.is_dynamic:
             return -1
         return height.get_length() * self.vae_scale_factor
 
     @property
     def width(self) -> int:
-        width = self.unet.model.inputs[0].get_partial_shape()[3]
+        model = self.vae.decoder.model
+        width = model.inputs[0].get_partial_shape()[3]
         if width.is_dynamic:
             return -1
         return width.get_length() * self.vae_scale_factor
 
     @property
     def batch_size(self) -> int:
-        batch_size = self.unet.model.inputs[0].get_partial_shape()[0]
+        model = self.unet.model if self.unet is not None else self.transformer.model
+        batch_size = model.inputs[0].get_partial_shape()[0]
         if batch_size.is_dynamic:
             return -1
         return batch_size.get_length()
@@ -597,6 +654,65 @@ def _reshape_unet(
         model.reshape(shapes)
         return model
 
+    def _reshape_transformer(
+        self,
+        model: openvino.runtime.Model,
+        batch_size: int = -1,
+        height: int = -1,
+        width: int = -1,
+        num_images_per_prompt: int = -1,
+        tokenizer_max_length: int = -1,
+    ):
+        if batch_size == -1 or num_images_per_prompt == -1:
+            batch_size = -1
+        else:
+            # The factor of 2 comes from the guidance scale > 1
+            batch_size *= num_images_per_prompt
+            if "img_ids" not in {inputs.get_any_name() for inputs in model.inputs}:
+                batch_size *= 2
+
+        height = height // self.vae_scale_factor if height > 0 else height
+        width = width // self.vae_scale_factor if width > 0 else width
+        packed_height = height // 2 if height > 0 else height
+        packed_width = width // 2 if width > 0 else width
+        packed_height_width = packed_width * packed_height if height > 0 and width > 0 else -1
+        shapes = {}
+        for inputs in model.inputs:
+            shapes[inputs] = inputs.get_partial_shape()
+            if inputs.get_any_name() in ["timestep", "guidance"]:
+                shapes[inputs][0] = batch_size
+            elif inputs.get_any_name() == "hidden_states":
+                in_channels = self.transformer.config.get("in_channels", None)
+                if in_channels is None:
+                    in_channels = (
+                        shapes[inputs][1] if inputs.get_partial_shape().rank.get_length() == 4 else shapes[inputs][2]
+                    )
+                    if in_channels.is_dynamic:
+                        logger.warning(
+                            "Could not identify `in_channels` from the transformer configuration, to statically reshape the transformer please provide a configuration."
+ ) + self.is_dynamic = True + if inputs.get_partial_shape().rank.get_length() == 4: + shapes[inputs] = [batch_size, in_channels, height, width] + else: + shapes[inputs] = [batch_size, packed_height_width, in_channels] + + elif inputs.get_any_name() == "pooled_projections": + shapes[inputs] = [batch_size, self.transformer.config["pooled_projection_dim"]] + elif inputs.get_any_name() == "img_ids": + shapes[inputs] = ( + [batch_size, packed_height_width, 3] + if is_diffusers_version("<", "0.31.0") + else [packed_height_width, 3] + ) + elif inputs.get_any_name() == "txt_ids": + shapes[inputs] = [batch_size, -1, 3] if is_diffusers_version("<", "0.31.0") else [-1, 3] + else: + shapes[inputs][0] = batch_size + shapes[inputs][1] = -1 # text_encoder_3 may have vary input length + model.reshape(shapes) + return model + def _reshape_text_encoder( self, model: openvino.runtime.Model, batch_size: int = -1, tokenizer_max_length: int = -1 ): @@ -658,9 +774,14 @@ def reshape( self.tokenizer.model_max_length if self.tokenizer is not None else self.tokenizer_2.model_max_length ) - self.unet.model = self._reshape_unet( - self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len - ) + if self.unet is not None: + self.unet.model = self._reshape_unet( + self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len + ) + if self.transformer is not None: + self.transformer.model = self._reshape_transformer( + self.transformer.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len + ) self.vae_decoder.model = self._reshape_vae_decoder( self.vae_decoder.model, height, width, num_images_per_prompt ) @@ -678,6 +799,11 @@ def reshape( self.text_encoder_2.model, batch_size, self.tokenizer_2.model_max_length ) + if self.text_encoder_3 is not None: + self.text_encoder_3.model = self._reshape_text_encoder( + self.text_encoder_3.model, batch_size, self.tokenizer_3.model_max_length + ) + self.clear_requests() return self @@ -690,7 +816,15 @@ def half(self): "`half()` is not supported with `compile_only` mode, please intialize model without this option" ) - for component in {self.unet, self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2}: + for component in { + self.unet, + self.transformer, + self.vae_encoder, + self.vae_decoder, + self.text_encoder, + self.text_encoder_2, + self.text_encoder_3, + }: if component is not None: compress_model_transformation(component.model) @@ -704,12 +838,28 @@ def clear_requests(self): "`clear_requests()` is not supported with `compile_only` mode, please intialize model without this option" ) - for component in {self.unet, self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2}: + for component in [ + self.unet, + self.transformer, + self.vae_encoder, + self.vae_decoder, + self.text_encoder, + self.text_encoder_2, + self.text_encoder_3, + ]: if component is not None: component.request = None def compile(self): - for component in {self.unet, self.vae_encoder, self.vae_decoder, self.text_encoder, self.text_encoder_2}: + for component in [ + self.unet, + self.transformer, + self.vae_encoder, + self.vae_decoder, + self.text_encoder, + self.text_encoder_2, + self.text_encoder_3, + ]: if component is not None: component._compile() @@ -725,8 +875,10 @@ def components(self) -> Dict[str, Any]: components = { "vae": self.vae, "unet": self.unet, + "transformer": self.transformer, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, + "text_encoder_3": 
self.text_encoder_3,
         "safety_checker": self.safety_checker,
         "image_encoder": self.image_encoder,
     }
@@ -841,6 +993,12 @@ def modules(self):
 
 
 class OVModelTextEncoder(OVPipelinePart):
+    def __init__(self, model: openvino.runtime.Model, parent_pipeline: OVDiffusionPipeline, model_name: str = ""):
+        super().__init__(model, parent_pipeline, model_name)
+        self.hidden_states_output_names = sorted(
+            {name for out in self.model.outputs for name in out.names if name.startswith("hidden_states")}
+        )
+
     def forward(
         self,
         input_ids: Union[np.ndarray, torch.Tensor],
@@ -849,24 +1007,26 @@ def forward(
         return_dict: bool = False,
     ):
         self._compile()
-
         model_inputs = {"input_ids": input_ids}
-        ov_outputs = self.request(model_inputs, share_inputs=True).to_dict()
-
+        ov_outputs = self.request(model_inputs, share_inputs=True)
+        main_out = ov_outputs[0]
         model_outputs = {}
-        for key, value in ov_outputs.items():
-            model_outputs[next(iter(key.names))] = torch.from_numpy(value)
-
-        if output_hidden_states:
-            model_outputs["hidden_states"] = []
-            for i in range(self.config.num_hidden_layers):
-                model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}"))
-            model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state"))
+        model_outputs[self.model.outputs[0].get_any_name()] = torch.from_numpy(main_out)
+        if len(self.model.outputs) > 1 and "pooler_output" in self.model.outputs[1].get_any_name():
+            model_outputs["pooler_output"] = torch.from_numpy(ov_outputs[1])
+        if self.hidden_states_output_names and "last_hidden_state" not in model_outputs:
+            model_outputs["last_hidden_state"] = torch.from_numpy(ov_outputs[self.hidden_states_output_names[-1]])
+        if (
+            self.hidden_states_output_names
+            and output_hidden_states
+            or getattr(self.config, "output_hidden_states", False)
+        ):
+            hidden_states = [torch.from_numpy(ov_outputs[out_name]) for out_name in self.hidden_states_output_names]
+            model_outputs["hidden_states"] = hidden_states
 
         if return_dict:
             return model_outputs
-
         return ModelOutput(**model_outputs)
@@ -924,6 +1084,48 @@ def forward(
         return ModelOutput(**model_outputs)
 
 
+class OVModelTransformer(OVPipelinePart):
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        pooled_projections: torch.FloatTensor = None,
+        timestep: torch.LongTensor = None,
+        img_ids: torch.Tensor = None,
+        txt_ids: torch.Tensor = None,
+        guidance: torch.Tensor = None,
+        block_controlnet_hidden_states: List = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ):
+        self._compile()
+
+        model_inputs = {
+            "hidden_states": hidden_states,
+            "timestep": timestep,
+            "encoder_hidden_states": encoder_hidden_states,
+            "pooled_projections": pooled_projections,
+        }
+
+        if img_ids is not None:
+            model_inputs["img_ids"] = img_ids
+        if txt_ids is not None:
+            model_inputs["txt_ids"] = txt_ids
+        if guidance is not None:
+            model_inputs["guidance"] = guidance
+
+        ov_outputs = self.request(model_inputs, share_inputs=True).to_dict()
+
+        model_outputs = {}
+        for key, value in ov_outputs.items():
+            model_outputs[next(iter(key.names))] = torch.from_numpy(value)
+
+        if return_dict:
+            return model_outputs
+
+        return ModelOutput(**model_outputs)
+
+
 class OVModelVaeEncoder(OVPipelinePart):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -1197,6 +1399,34 @@ class OVLatentConsistencyModelImg2ImgPipeline(
     auto_model_class = LatentConsistencyModelImg2ImgPipeline
 
 
+class OVStableDiffusion3Pipeline(OVDiffusionPipeline,
OVTextualInversionLoaderMixin, StableDiffusion3Pipeline): + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = StableDiffusion3Pipeline + + +class OVStableDiffusion3Img2ImgPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3Img2ImgPipeline +): + main_input_name = "image" + export_feature = "image-to-image" + auto_model_class = StableDiffusion3Img2ImgPipeline + + +class OVStableDiffusion3InpaintPipeline( + OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3InpaintPipeline +): + main_input_name = "image" + export_feature = "inpainting" + auto_model_class = StableDiffusion3InpaintPipeline + + +class OVFluxPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, FluxPipeline): + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = FluxPipeline + + SUPPORTED_OV_PIPELINES = [ OVStableDiffusionPipeline, OVStableDiffusionImg2ImgPipeline, @@ -1244,6 +1474,23 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru ] ) +if is_diffusers_version(">=", "0.29.0"): + SUPPORTED_OV_PIPELINES.extend( + [ + OVStableDiffusion3Pipeline, + OVStableDiffusion3Img2ImgPipeline, + ] + ) + + OV_TEXT2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3Pipeline + OV_IMAGE2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3Img2ImgPipeline + +if is_diffusers_version(">=", "0.30.0"): + SUPPORTED_OV_PIPELINES.extend([OVStableDiffusion3InpaintPipeline, OVFluxPipeline]) + OV_INPAINT_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3InpaintPipeline + OV_TEXT2IMAGE_PIPELINES_MAPPING["flux"] = OVFluxPipeline + + SUPPORTED_OV_PIPELINES_MAPPINGS = [ OV_TEXT2IMAGE_PIPELINES_MAPPING, OV_IMAGE2IMAGE_PIPELINES_MAPPING, @@ -1299,13 +1546,16 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): class OVPipelineForText2Image(OVPipelineForTask): auto_model_class = AutoPipelineForText2Image ov_pipelines_mapping = OV_TEXT2IMAGE_PIPELINES_MAPPING + export_feature = "text-to-image" class OVPipelineForImage2Image(OVPipelineForTask): auto_model_class = AutoPipelineForImage2Image ov_pipelines_mapping = OV_IMAGE2IMAGE_PIPELINES_MAPPING + export_feature = "image-to-image" class OVPipelineForInpainting(OVPipelineForTask): auto_model_class = AutoPipelineForInpainting ov_pipelines_mapping = OV_INPAINT_PIPELINES_MAPPING + export_feature = "inpainting" diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 1ad75477c..c2e880e62 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -380,15 +380,27 @@ def _quantize_ovbasemodel( quantization_config_copy = copy.deepcopy(quantization_config) quantization_config_copy.dataset = None quantization_config_copy.quant_method = OVQuantizationMethod.DEFAULT - sub_model_names = ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"] + sub_model_names = [ + "vae_encoder", + "vae_decoder", + "text_encoder", + "text_encoder_2", + "text_encoder_3", + ] sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) for sub_model in sub_models: _weight_only_quantization(sub_model.model, quantization_config_copy) - # Apply hybrid quantization to UNet - self.model.unet.model = _hybrid_quantization( - self.model.unet.model, quantization_config, calibration_dataset - ) + if self.model.unet is not None: + # Apply hybrid quantization to UNet + self.model.unet.model = _hybrid_quantization( + self.model.unet.model, 
quantization_config, calibration_dataset + ) + else: + self.model.transformer.model = _hybrid_quantization( + self.model.transformer.model, quantization_config, calibration_dataset + ) + self.model.clear_requests() else: # The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. @@ -396,7 +408,15 @@ def _quantize_ovbasemodel( self.model.request = None else: if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline): - sub_model_names = ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2", "unet"] + sub_model_names = [ + "vae_encoder", + "vae_decoder", + "text_encoder", + "text_encoder_2", + "unet", + "transformer", + "text_encoder_3", + ] sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) for sub_model in sub_models: _weight_only_quantization(sub_model.model, quantization_config) @@ -743,7 +763,9 @@ def _prepare_unet_dataset( ) -> nncf.Dataset: self.model.compile() - size = self.model.unet.config.get("sample_size", 64) * self.model.vae_scale_factor + diffuser = self.model.unet if self.model.unet is not None else self.model.transformer + + size = diffuser.config.get("sample_size", 64) * self.model.vae_scale_factor height, width = 2 * (min(size, 512),) num_samples = num_samples or 200 @@ -784,7 +806,7 @@ def transform_fn(data_item): calibration_data = [] try: - self.model.unet.request = InferRequestWrapper(self.model.unet.request, calibration_data) + diffuser.request = InferRequestWrapper(diffuser.request, calibration_data) for inputs in dataset: inputs = transform_fn(inputs) @@ -795,7 +817,7 @@ def transform_fn(data_item): if len(calibration_data) >= num_samples: break finally: - self.model.unet.request = self.model.unet.request.request + diffuser.request = diffuser.request.request calibration_dataset = nncf.Dataset(calibration_data[:num_samples]) return calibration_dataset diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index fcc6944e9..ca7d17720 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -119,6 +119,8 @@ "audio-classification": "OVModelForAudioClassification", "stable-diffusion": "OVStableDiffusionPipeline", "stable-diffusion-xl": "OVStableDiffusionXLPipeline", + "stable-diffusion-3": "OVStableDiffusion3Pipeline", + "flux": "OVFluxPipeline", "pix2struct": "OVModelForPix2Struct", "latent-consistency": "OVLatentConsistencyModelPipeline", "open_clip_text": "OVModelOpenCLIPText", diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py index 6ded4fd5d..38aea6c1f 100644 --- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py @@ -145,3 +145,47 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) + + +class OVStableDiffusion3Img2ImgPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) + + +class OVStableDiffusion3Pipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + 
requires_backends(cls, ["openvino", "diffusers"]) + + +class OVStableDiffusion3InpaintPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) + + +class OVFluxPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index a05efc46c..a39957bbf 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -123,17 +123,20 @@ def _find_files_matching_pattern( str(model_name_or_path), subfolder=subfolder, revision=revision, token=token ) if library_name == "diffusers": - subfolder = os.path.join(subfolder, "unet") + subfolders = [os.path.join(subfolder, "unet"), os.path.join(subfolder, "transformer")] else: - subfolder = subfolder or "." + subfolders = [subfolder or "."] if model_path.is_dir(): - glob_pattern = subfolder + "/*" - files = model_path.glob(glob_pattern) - files = [p for p in files if re.search(pattern, str(p))] + files = [] + for subfolder in subfolders: + glob_pattern = subfolder + "/*" + files_ = model_path.glob(glob_pattern) + files_ = [p for p in files_ if re.search(pattern, str(p))] + files.extend(files_) else: repo_files = map(Path, HfApi().list_repo_files(model_name_or_path, revision=revision, token=token)) - files = [Path(p) for p in repo_files if re.match(pattern, str(p)) and str(p.parent) == subfolder] + files = [Path(p) for p in repo_files if re.match(pattern, str(p)) and str(p.parent) in subfolders] return files diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 687c1f5c0..1467e5ed1 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -35,6 +35,7 @@ OVPipelineForInpainting, OVPipelineForText2Image, ) +from optimum.intel.utils.import_utils import is_transformers_version from optimum.utils.testing_utils import require_diffusers @@ -73,6 +74,11 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= class OVPipelineForText2ImageTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + if is_transformers_version(">=", "4.40.0"): + SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux"]) + NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.append("stable-diffusion-3") + CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image AUTOMODEL_CLASS = AutoPipelineForText2Image @@ -126,8 +132,8 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 128, 128, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None) + diffusers_pipeline = 
self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None) for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type @@ -135,9 +141,9 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(CALLBACK_SUPPORT_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): height, width, batch_size = 64, 128, 1 @@ -184,10 +190,26 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: - self.assertEqual( - outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), - ) + if model_arch != "flux": + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) + self.assertEqual( + outputs.shape, + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), + ) + else: + packed_height = height // pipeline.vae_scale_factor + packed_width = width // pipeline.vae_scale_factor + channels = pipeline.transformer.config.in_channels + self.assertEqual(outputs.shape, (batch_size, packed_height * packed_width, channels)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -205,7 +227,7 @@ def test_image_reproducibility(self, model_arch: str): self.assertFalse(np.array_equal(ov_outputs_1.images[0], ov_outputs_3.images[0])) np.testing.assert_allclose(ov_outputs_1.images[0], ov_outputs_2.images[0], atol=1e-4, rtol=1e-2) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES) def test_negative_prompt(self, model_arch: str): height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) @@ -229,6 +251,22 @@ def test_negative_prompt(self, model_arch: str): do_classifier_free_guidance=True, negative_prompt=negative_prompt, ) + elif model_arch == "stable-diffusion-3": + ( + inputs["prompt_embeds"], + inputs["negative_prompt_embeds"], + inputs["pooled_prompt_embeds"], + inputs["negative_pooled_prompt_embeds"], + ) = pipeline.encode_prompt( + prompt=prompt, + prompt_2=None, + prompt_3=None, + num_images_per_prompt=1, + device=torch.device("cpu"), + do_classifier_free_guidance=True, + negative_prompt=negative_prompt, + ) + else: inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt( prompt=prompt, @@ -288,11 +326,18 @@ def test_height_width_properties(self, model_arch: str): ) self.assertFalse(ov_pipeline.is_dynamic) + expected_batch = batch_size * num_images_per_prompt + if ( + ov_pipeline.unet is not None + and "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs} + ) or ( + ov_pipeline.transformer is not None + and "txt_ids" not in {inputs.get_any_name() for inputs in ov_pipeline.transformer.model.inputs} + ): + expected_batch *= 2 self.assertEqual( ov_pipeline.batch_size, - batch_size - * num_images_per_prompt - * (2 if "timestep_cond" not in {inputs.get_any_name() for inputs in 
ov_pipeline.unet.model.inputs} else 1), + expected_batch, ) self.assertEqual(ov_pipeline.height, height) self.assertEqual(ov_pipeline.width, width) @@ -324,6 +369,8 @@ def test_textual_inversion(self): class OVPipelineForImage2ImageTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + if is_transformers_version(">=", "4.40.0"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") AUTOMODEL_CLASS = AutoPipelineForImage2Image OVMODEL_CLASS = OVPipelineForImage2Image @@ -369,7 +416,7 @@ def test_num_images_per_prompt(self, model_arch: str): outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]) @require_diffusers def test_callback(self, model_arch: str): height, width, batch_size = 32, 64, 1 @@ -416,9 +463,19 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) self.assertEqual( outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -427,16 +484,17 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 128, 128, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None) + ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None) for output_type in ["latent", "np", "pt"]: + print(output_type) inputs["output_type"] = output_type ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -500,12 +558,12 @@ def test_height_width_properties(self, model_arch: str): ) self.assertFalse(ov_pipeline.is_dynamic) - self.assertEqual( - ov_pipeline.batch_size, - batch_size - * num_images_per_prompt - * (2 if "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs} else 1), - ) + expected_batch = batch_size * num_images_per_prompt + if ov_pipeline.unet is None or "timestep_cond" not in { + inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs + }: + expected_batch *= 2 + self.assertEqual(ov_pipeline.batch_size, expected_batch) self.assertEqual(ov_pipeline.height, height) self.assertEqual(ov_pipeline.width, width) @@ -537,6 +595,9 @@ def test_textual_inversion(self): class OVPipelineForInpaintingTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] + if 
is_transformers_version(">=", "4.40.0"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + AUTOMODEL_CLASS = AutoPipelineForInpainting OVMODEL_CLASS = OVPipelineForInpainting @@ -586,7 +647,7 @@ def test_num_images_per_prompt(self, model_arch: str): outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(["stable-diffusion", "stable-diffusion-xl"]) @require_diffusers def test_callback(self, model_arch: str): height, width, batch_size = 32, 64, 1 @@ -633,9 +694,19 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) self.assertEqual( outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -653,7 +724,7 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -717,11 +788,14 @@ def test_height_width_properties(self, model_arch: str): ) self.assertFalse(ov_pipeline.is_dynamic) + expected_batch = batch_size * num_images_per_prompt + if ov_pipeline.unet is None or "timestep_cond" not in { + inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs + }: + expected_batch *= 2 self.assertEqual( ov_pipeline.batch_size, - batch_size - * num_images_per_prompt - * (2 if "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs} else 1), + expected_batch, ) self.assertEqual(ov_pipeline.height, height) self.assertEqual(ov_pipeline.width, width) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 43c535e67..6a42c4a09 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -27,6 +27,7 @@ from optimum.exporters.openvino import export_from_model, main_export from optimum.exporters.tasks import TasksManager from optimum.intel import ( + OVFluxPipeline, OVLatentConsistencyModelPipeline, OVModelForAudioClassification, OVModelForCausalLM, @@ -40,13 +41,14 @@ OVModelForSequenceClassification, OVModelForSpeechSeq2Seq, OVModelForTokenClassification, + OVStableDiffusion3Pipeline, OVStableDiffusionPipeline, OVStableDiffusionXLImg2ImgPipeline, OVStableDiffusionXLPipeline, ) from optimum.intel.openvino.modeling_base import OVBaseModel from optimum.intel.openvino.utils import TemporaryDirectory -from optimum.intel.utils.import_utils import _transformers_version +from optimum.intel.utils.import_utils import _transformers_version, is_transformers_version from optimum.utils.save_utils import maybe_load_preprocessors @@ -70,6 +72,9 @@ class ExportModelTest(unittest.TestCase): "latent-consistency": OVLatentConsistencyModelPipeline, } + if is_transformers_version(">=", "4.45"): + 
SUPPORTED_ARCHITECTURES.update({"stable-diffusion-3": OVStableDiffusion3Pipeline, "flux": OVFluxPipeline}) + GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper") def _openvino_export( diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index cea6c94fc..7542a347d 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -25,6 +25,7 @@ from optimum.exporters.openvino.__main__ import main_export from optimum.intel import ( # noqa + OVFluxPipeline, OVLatentConsistencyModelPipeline, OVModelForAudioClassification, OVModelForCausalLM, @@ -39,6 +40,7 @@ OVModelOpenCLIPText, OVModelOpenCLIPVisual, OVSentenceTransformer, + OVStableDiffusion3Pipeline, OVStableDiffusionPipeline, OVStableDiffusionXLPipeline, ) @@ -48,6 +50,7 @@ compare_versions, is_openvino_tokenizers_available, is_tokenizers_version, + is_transformers_version, ) @@ -56,7 +59,7 @@ class OVCLIExportTestCase(unittest.TestCase): Integration tests ensuring supported models are correctly exported. """ - SUPPORTED_ARCHITECTURES = ( + SUPPORTED_ARCHITECTURES = [ ("text-generation", "gpt2"), ("text-generation-with-past", "gpt2"), ("text2text-generation", "t5"), @@ -71,7 +74,10 @@ class OVCLIExportTestCase(unittest.TestCase): ("text-to-image", "stable-diffusion"), ("text-to-image", "stable-diffusion-xl"), ("image-to-image", "stable-diffusion-xl-refiner"), - ) + ] + + if is_transformers_version(">=", "4.45"): + SUPPORTED_ARCHITECTURES.extend([("text-to-image", "stable-diffusion-3"), ("text-to-image", "flux")]) EXPECTED_NUMBER_OF_TOKENIZER_MODELS = { "gpt2": 2 if is_tokenizers_version("<", "0.20") else 0, "t5": 0, # no .model file in the repository @@ -84,13 +90,18 @@ class OVCLIExportTestCase(unittest.TestCase): "blenderbot": 2 if is_tokenizers_version("<", "0.20") else 0, "stable-diffusion": 2 if is_tokenizers_version("<", "0.20") else 0, "stable-diffusion-xl": 4 if is_tokenizers_version("<", "0.20") else 0, + "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") else 2, + "flux": 4 if is_tokenizers_version("<", "0.20") else 0, } - SUPPORTED_SD_HYBRID_ARCHITECTURES = ( + SUPPORTED_SD_HYBRID_ARCHITECTURES = [ ("stable-diffusion", 72, 195), ("stable-diffusion-xl", 84, 331), ("latent-consistency", 50, 135), - ) + ] + + if is_transformers_version(">=", "4.45"): + SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("stable-diffusion-3", 9, 65)) TEST_4BIT_CONFIGURATONS = [ ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", {"int8": 4, "int4": 72}), @@ -208,8 +219,8 @@ def test_exporters_cli_int8(self, task: str, model_type: str): models = [model.encoder, model.decoder] if task.endswith("with-past"): models.append(model.decoder_with_past) - elif model_type.startswith("stable-diffusion"): - models = [model.unet, model.vae_encoder, model.vae_decoder] + elif model_type.startswith("stable-diffusion") or model_type.startswith("flux"): + models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder] models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2) else: models = [model] @@ -228,7 +239,9 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in check=True, ) model = eval(_HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]).from_pretrained(tmpdir) - num_fq, num_weight_nodes = get_num_quantized_nodes(model.unet) + num_fq, num_weight_nodes = get_num_quantized_nodes( + model.unet if model.unet is not None else model.transformer + ) self.assertEqual(exp_num_int8, 
num_weight_nodes["int8"]) self.assertEqual(exp_num_fq, num_fq) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index b294e3e22..f2a4dc723 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -56,6 +56,8 @@ OVModelForSpeechSeq2Seq, OVStableDiffusionPipeline, OVStableDiffusionXLPipeline, + OVStableDiffusion3Pipeline, + OVFluxPipeline, OVQuantizer, OVTrainer, OVQuantizationConfig, @@ -300,11 +302,18 @@ class OVWeightCompressionTest(unittest.TestCase): (OVModelOpenCLIPForZeroShotImageClassification, "open-clip"), ) - SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = ( + SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [ (OVStableDiffusionPipeline, "stable-diffusion", 72, 195), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331), (OVLatentConsistencyModelPipeline, "latent-consistency", 50, 135), - ) + ] + + if is_transformers_version(">=", "4.45.0"): + SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION.extend( + [ + (OVStableDiffusion3Pipeline, "stable-diffusion-3", 9, 65), + ] + ) IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") @@ -454,7 +463,9 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_f with TemporaryDirectory() as tmp_dir: model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) - num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model.unet) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes( + model.unet if model.unet is not None else model.transformer + ) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) self.assertEqual(0, num_weight_nodes["int4"]) @@ -468,7 +479,9 @@ def test_stable_diffusion_with_weight_compression(self): quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) - num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(int8_pipe.unet) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes( + int8_pipe.unet if int8_pipe.unet is not None else int8_pipe.transformer + ) self.assertEqual(0, num_fake_quantize) self.assertEqual(242, num_weight_nodes["int8"]) self.assertEqual(0, num_weight_nodes["int4"]) @@ -487,7 +500,9 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset( self.assertEqual(quantization_config.quant_method, OVQuantizationMethod.HYBRID) quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config), calibration_dataset=dataset) - num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model.unet) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes( + model.unet if model.unet is not None else model.transformer + ) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) self.assertEqual(0, num_weight_nodes["int4"]) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index d7eea01db..e5a9f73a6 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -59,6 +59,7 @@ "falcon": "fxmarty/really-tiny-falcon-testing", "falcon-40b": "katuni4ka/tiny-random-falcon-40b", "flaubert": "hf-internal-testing/tiny-random-flaubert", + "flux": "katuni4ka/tiny-random-flux", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", @@ -118,6 +119,7 @@ "stable-diffusion-openvino": 
"hf-internal-testing/tiny-stable-diffusion-openvino", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "stable-diffusion-xl-refiner": "echarlaix/tiny-random-stable-diffusion-xl-refiner", + "stable-diffusion-3": "yujiepan/stable-diffusion-3-tiny-random", "stablelm": "hf-internal-testing/tiny-random-StableLmForCausalLM", "starcoder2": "hf-internal-testing/tiny-random-Starcoder2ForCausalLM", "latent-consistency": "echarlaix/tiny-random-latent-consistency", @@ -170,6 +172,8 @@ "stable-diffusion-xl": (366, 34, 42, 66), "stable-diffusion-xl-refiner": (366, 34, 42, 66), "open-clip": (20, 28), + "stable-diffusion-3": (66, 42, 58, 30), + "flux": (56, 24, 28, 64), } From fe82729b7e04bdcb898cd0927cc6192569f6d499 Mon Sep 17 00:00:00 2001 From: Alexander Kozlov Date: Fri, 25 Oct 2024 12:32:53 +0400 Subject: [PATCH 22/53] Added notebook to showcase quantization of Sentence Transformers model (#955) * Added notebook to showcase quantization of Sentence Transformers model * Update optimum/intel/openvino/modeling_diffusion.py * Update optimum/intel/openvino/modeling_diffusion.py * Style * Fixed small issue. Results are the same. * Added description to the sections of the notebook * Update notebooks/openvino/sentence_transformer_quantization.ipynb Co-authored-by: Helena Kloosterman * Update notebooks/openvino/sentence_transformer_quantization.ipynb Co-authored-by: Helena Kloosterman * Update notebooks/openvino/sentence_transformer_quantization.ipynb Co-authored-by: Helena Kloosterman * Fixed issue. Added info about benchmarking * Fixed paths to models * Removed unused code --------- Co-authored-by: Helena Kloosterman --- notebooks/openvino/requirements.txt | 1 - .../sentence_transformer_quantization.ipynb | 625 ++++++++++++++++++ 2 files changed, 625 insertions(+), 1 deletion(-) create mode 100644 notebooks/openvino/sentence_transformer_quantization.ipynb diff --git a/notebooks/openvino/requirements.txt b/notebooks/openvino/requirements.txt index bb7a517cf..64ccd6d8c 100644 --- a/notebooks/openvino/requirements.txt +++ b/notebooks/openvino/requirements.txt @@ -4,4 +4,3 @@ evaluate[evaluator] ipywidgets pillow torchaudio - diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb new file mode 100644 index 000000000..714544aa9 --- /dev/null +++ b/notebooks/openvino/sentence_transformer_quantization.ipynb @@ -0,0 +1,625 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Quantization of Text Embedding model from Sentence Transformers library" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install optimum[openvino]\n", + "%pip install evaluate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quantize staticly model to 8-bit with NNCF via Optimum-Intel API" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The code snippet below shows how to use Optimum-Intel [Model Optimization API](https://huggingface.co/docs/optimum/en/intel/openvino/optimization#static-quantization) to quantize the model staticly. 
It leverages [NNCF](https://github.com/openvinotoolkit/nncf) capabilities for static quantization of Transformer models, where a combination of the special quantization scheme + SmoothQuant method + Bias Correction method is used to provide state-of-the-art accuracy.\n",
+    "\n",
+    "Static quantization requires some data to estimate the quantization parameters of activations, which means a calibration dataset should be provided. The `OVQuantizer` class used for quantization provides an API to build such a dataset with the `.get_calibration_dataset()` method."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No OpenVINO files were found for sentence-transformers/all-MiniLM-L6-v2, setting `export=True` to convert the model to the OpenVINO IR. Don't forget to save the resulting model with `.save_pretrained()`\n",
+      "Framework not specified. Using pt to export the model.\n",
+      "Using framework PyTorch: 2.4.1+cpu\n",
+      "Overriding 1 configuration item(s)\n",
+      "\t- use_cache -> False\n",
+      "Compiling the model to CPU ...\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a9bd847756fd467e905a7ad7a243640c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9d8ad91623d642f48e85b60ac823aca4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a2a7d09a573c4092a830bbaadc39f756",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b67c493aab36426090f8fafd25a17a00",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Configuration saved in all-MiniLM-L6-v2_int8/openvino_config.json\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "('all-MiniLM-L6-v2_int8/tokenizer_config.json',\n",
+       " 'all-MiniLM-L6-v2_int8/special_tokens_map.json',\n",
+       " 'all-MiniLM-L6-v2_int8/vocab.txt',\n",
+       " 'all-MiniLM-L6-v2_int8/added_tokens.json',\n",
+       " 'all-MiniLM-L6-v2_int8/tokenizer.json')"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from functools import partial\n",
+    "import datasets\n",
+    "from transformers import AutoTokenizer\n",
+    "from optimum.intel import OVModelForFeatureExtraction, OVQuantizer, OVQuantizationConfig, OVConfig\n",
+    "\n",
+    "MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
+    "base_model_path = \"all-MiniLM-L6-v2\"\n",
+    "int8_ptq_model_path = \"all-MiniLM-L6-v2_int8\"\n",
+    "\n",
+    "model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID)\n",
+    "model.save_pretrained(base_model_path)\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
+    "tokenizer.save_pretrained(base_model_path)\n",
+    "\n",
+    "\n",
+    "quantizer = OVQuantizer.from_pretrained(model)\n",
+    "\n",
+    "def preprocess_function(examples, tokenizer):\n",
+    "    return tokenizer(examples[\"sentence\"], padding=\"max_length\", max_length=384, truncation=True)\n",
+    "\n",
+    "\n",
+    "calibration_dataset = quantizer.get_calibration_dataset(\n",
+    "    \"glue\",\n",
+    "    dataset_config_name=\"sst2\",\n",
+    "    preprocess_function=partial(preprocess_function, tokenizer=tokenizer),\n",
+    "    num_samples=300,\n",
+    "    dataset_split=\"train\",\n",
+    ")\n",
+    "\n",
+    "ov_config = OVConfig(quantization_config=OVQuantizationConfig())\n",
+    "\n",
+    "quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=int8_ptq_model_path)\n",
+    "tokenizer.save_pretrained(int8_ptq_model_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Benchmark model accuracy on GLUE STSB task"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we estimate accuracy impact from model quantization. We evaluate accuracy of both the baseline and quantized model on a different task from the GLUE benchmark."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import Pipeline\n",
+    "import torch.nn.functional as F\n",
+    "import torch\n",
+    "\n",
+    "\n",
+    "# copied from the model card \"sentence-transformers/all-MiniLM-L6-v2\"\n",
+    "def mean_pooling(model_output, attention_mask):\n",
+    "    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings\n",
+    "    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n",
+    "    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)\n",
+    "\n",
+    "\n",
+    "class SentenceEmbeddingPipeline(Pipeline):\n",
+    "    def _sanitize_parameters(self, **kwargs):\n",
+    "        # we don\"t have any hyperameters to sanitize\n",
+    "        preprocess_kwargs = {}\n",
+    "        return preprocess_kwargs, {}, {}\n",
+    "\n",
+    "    def preprocess(self, inputs):\n",
+    "        encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors=\"pt\")\n",
+    "        return encoded_inputs\n",
+    "\n",
+    "    def _forward(self, model_inputs):\n",
+    "        outputs = self.model(**model_inputs)\n",
+    "        return {\"outputs\": outputs, \"attention_mask\": model_inputs[\"attention_mask\"]}\n",
+    "\n",
+    "    def postprocess(self, model_outputs):\n",
+    "        # Perform pooling\n",
+    "        sentence_embeddings = mean_pooling(model_outputs[\"outputs\"], model_outputs[\"attention_mask\"])\n",
+    "        # Normalize embeddings\n",
+    "        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)\n",
+    "        return sentence_embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Compiling the model to CPU ...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Compiling the model to CPU ...\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = OVModelForFeatureExtraction.from_pretrained(base_model_path)\n",
+    "vanilla_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)\n",
+    "\n",
+    "q_model = OVModelForFeatureExtraction.from_pretrained(int8_ptq_model_path)\n",
+    "q8_emb = SentenceEmbeddingPipeline(model=q_model, tokenizer=tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "from evaluate import load\n",
+    "\n",
+    "eval_dataset = load_dataset(\"glue\", \"stsb\", split=\"validation\")\n",
+    "metric = load(\"glue\", \"stsb\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Parameter 'function'= of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5cab9e8fc58245a4b395a9575017633b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/1500 [00:00\n",
+      "[ INFO ]   PERFORMANCE_HINT: LATENCY\n",
+      "[ INFO ]   EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE\n",
+      "[ INFO ]   PERFORMANCE_HINT_NUM_REQUESTS: 0\n",
+      "[ INFO ]   ENABLE_CPU_PINNING: True\n",
+      "[ INFO ]   SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE\n",
+      "[ INFO ]   MODEL_DISTRIBUTION_POLICY: set()\n",
+      "[ INFO ]   ENABLE_HYPER_THREADING: False\n",
+      "[ INFO ]   EXECUTION_DEVICES: ['CPU']\n",
+      "[ INFO ]   CPU_DENORMALS_OPTIMIZATION: False\n",
+      "[ INFO ]   LOG_LEVEL: Level.NO\n",
+      "[ INFO ]   CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0\n",
+      "[ INFO ]   DYNAMIC_QUANTIZATION_GROUP_SIZE: 32\n",
+      "[ INFO ]   KV_CACHE_PRECISION: \n",
+      "[ INFO ]   AFFINITY: Affinity.CORE\n",
+      "[Step 9/11] Creating infer requests and preparing input tensors\n",
+      "[ WARNING ] No input files were given for input 'input_ids'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'attention_mask'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'token_type_ids'!. This input will be filled with random values!\n",
+      "[ INFO ] Fill input 'input_ids' with random values \n",
+      "[ INFO ] Fill input 'attention_mask' with random values \n",
+      "[ INFO ] Fill input 'token_type_ids' with random values \n",
+      "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
+      "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
+      "[ INFO ] First inference took 12.27 ms\n",
+      "[Step 11/11] Dumping statistics report\n",
+      "[ INFO ] Execution Devices:['CPU']\n",
+      "[ INFO ] Count:            200 iterations\n",
+      "[ INFO ] Duration:         1988.84 ms\n",
+      "[ INFO ] Latency:\n",
+      "[ INFO ]    Median:        9.74 ms\n",
+      "[ INFO ]    Average:       9.77 ms\n",
+      "[ INFO ]    Min:           9.59 ms\n",
+      "[ INFO ]    Max:           11.12 ms\n",
+      "[ INFO ] Throughput:   100.56 FPS\n"
+     ]
+    }
+   ],
+   "source": [
+    "# FP32 baseline model\n",
+    "!benchmark_app -m all-MiniLM-L6-v2/openvino_model.xml -shape \"input_ids[1,384],attention_mask[1,384],token_type_ids[1,384]\" -api sync -niter 200"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Step 1/11] Parsing and validating input arguments\n",
+      "[ INFO ] Parsing input parameters\n",
+      "[Step 2/11] Loading OpenVINO Runtime\n",
+      "[ INFO ] OpenVINO:\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
+      "[ INFO ] \n",
+      "[ INFO ] Device info:\n",
+      "[ INFO ] CPU\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
+      "[ INFO ] \n",
+      "[ INFO ] \n",
+      "[Step 3/11] Setting device configuration\n",
+      "[ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.LATENCY.\n",
+      "[Step 4/11] Reading model files\n",
+      "[ INFO ] Loading model files\n",
+      "[ INFO ] Read model took 20.87 ms\n",
+      "[ INFO ] Original model I/O parameters:\n",
+      "[ INFO ] Model inputs:\n",
+      "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,?]\n",
+      "[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] / [?,?]\n",
+      "[ INFO ]     token_type_ids (node: token_type_ids) : i64 / [...] / [?,?]\n",
+      "[ INFO ] Model outputs:\n",
+      "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [?,?,384]\n",
+      "[Step 5/11] Resizing model to match image sizes and given batch\n",
+      "[ INFO ] Model batch size: 1\n",
+      "[ INFO ] Reshaping model: 'input_ids': [1,384], 'attention_mask': [1,384], 'token_type_ids': [1,384]\n",
+      "[ INFO ] Reshape model took 3.42 ms\n",
+      "[Step 6/11] Configuring input of the model\n",
+      "[ INFO ] Model inputs:\n",
+      "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [1,384]\n",
+      "[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] / [1,384]\n",
+      "[ INFO ]     token_type_ids (node: token_type_ids) : i64 / [...] / [1,384]\n",
+      "[ INFO ] Model outputs:\n",
+      "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [1,384,384]\n",
+      "[Step 7/11] Loading the model to the device\n",
+      "[ INFO ] Compile model took 323.91 ms\n",
+      "[Step 8/11] Querying optimal runtime parameters\n",
+      "[ INFO ] Model:\n",
+      "[ INFO ]   NETWORK_NAME: Model0\n",
+      "[ INFO ]   OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1\n",
+      "[ INFO ]   NUM_STREAMS: 1\n",
+      "[ INFO ]   INFERENCE_NUM_THREADS: 18\n",
+      "[ INFO ]   PERF_COUNT: NO\n",
+      "[ INFO ]   INFERENCE_PRECISION_HINT: \n",
+      "[ INFO ]   PERFORMANCE_HINT: LATENCY\n",
+      "[ INFO ]   EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE\n",
+      "[ INFO ]   PERFORMANCE_HINT_NUM_REQUESTS: 0\n",
+      "[ INFO ]   ENABLE_CPU_PINNING: True\n",
+      "[ INFO ]   SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE\n",
+      "[ INFO ]   MODEL_DISTRIBUTION_POLICY: set()\n",
+      "[ INFO ]   ENABLE_HYPER_THREADING: False\n",
+      "[ INFO ]   EXECUTION_DEVICES: ['CPU']\n",
+      "[ INFO ]   CPU_DENORMALS_OPTIMIZATION: False\n",
+      "[ INFO ]   LOG_LEVEL: Level.NO\n",
+      "[ INFO ]   CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0\n",
+      "[ INFO ]   DYNAMIC_QUANTIZATION_GROUP_SIZE: 32\n",
+      "[ INFO ]   KV_CACHE_PRECISION: \n",
+      "[ INFO ]   AFFINITY: Affinity.CORE\n",
+      "[Step 9/11] Creating infer requests and preparing input tensors\n",
+      "[ WARNING ] No input files were given for input 'input_ids'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'attention_mask'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'token_type_ids'!. This input will be filled with random values!\n",
+      "[ INFO ] Fill input 'input_ids' with random values \n",
+      "[ INFO ] Fill input 'attention_mask' with random values \n",
+      "[ INFO ] Fill input 'token_type_ids' with random values \n",
+      "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
+      "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
+      "[ INFO ] First inference took 6.72 ms\n",
+      "[Step 11/11] Dumping statistics report\n",
+      "[ INFO ] Execution Devices:['CPU']\n",
+      "[ INFO ] Count:            200 iterations\n",
+      "[ INFO ] Duration:         853.85 ms\n",
+      "[ INFO ] Latency:\n",
+      "[ INFO ]    Median:        4.13 ms\n",
+      "[ INFO ]    Average:       4.15 ms\n",
+      "[ INFO ]    Min:           4.05 ms\n",
+      "[ INFO ]    Max:           5.13 ms\n",
+      "[ INFO ] Throughput:   234.23 FPS\n"
+     ]
+    }
+   ],
+   "source": [
+    "# INT8 counterpart\n",
+    "!benchmark_app -m all-MiniLM-L6-v2_int8/openvino_model.xml -shape \"input_ids[1,384],attention_mask[1,384],token_type_ids[1,384]\" -api sync -niter 200"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "test3.11_cpu",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
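For reference, the following is a minimal sketch of how the INT8 model saved by this notebook could be used for sentence similarity. It assumes the `all-MiniLM-L6-v2_int8` directory produced above is available locally; the pooling mirrors the model-card recipe quoted in the notebook.

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from optimum.intel import OVModelForFeatureExtraction

model_path = "all-MiniLM-L6-v2_int8"  # directory written by the quantization cell above
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = OVModelForFeatureExtraction.from_pretrained(model_path)

sentences = ["OpenVINO accelerates inference on Intel hardware.", "Quantization shrinks models."]
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
outputs = model(**inputs)

# mean pooling over non-padding tokens, then L2 normalization (as in the notebook)
mask = inputs["attention_mask"].unsqueeze(-1).float()
embeddings = (outputs.last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
embeddings = F.normalize(embeddings, p=2, dim=1)
print(float(embeddings[0] @ embeddings[1]))  # cosine similarity of the two sentences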

From 6251f83e7981ac3234d476be8d73803b0160fd24 Mon Sep 17 00:00:00 2001
From: Alexander Kozlov 
Date: Fri, 25 Oct 2024 13:09:58 +0400
Subject: [PATCH 23/53] [OV]: Updated notebooks README (#973)

---
 notebooks/openvino/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/notebooks/openvino/README.md b/notebooks/openvino/README.md
index f63c13c55..31c258099 100644
--- a/notebooks/openvino/README.md
+++ b/notebooks/openvino/README.md
@@ -12,4 +12,5 @@ The notebooks have been tested with Python 3.8 and 3.10 on Ubuntu Linux.
 |:----------|:-------------|:-------------|------:|
 | [How to run inference with the OpenVINO](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/optimum_openvino_inference.ipynb) | Explains how to export your model to OpenVINO and to run inference with OpenVINO Runtime on various tasks| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/optimum_openvino_inference.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/optimum_openvino_inference.ipynb)|
 | [How to quantize a question answering model with OpenVINO NNCF](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/question_answering_quantization.ipynb) | Show how to apply post-training quantization on a question answering model using [NNCF](https://github.com/openvinotoolkit/nncf) and to accelerate inference with OpenVINO| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/question_answering_quantization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/question_answering_quantization.ipynb)|
-| [How to quantize Stable Diffusion model with OpenVINO NNCF](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)| Show how to apply post-training hybrid quantization on a Stable Diffusion model using [NNCF](https://github.com/openvinotoolkit/nncf) and to accelerate inference with OpenVINO| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)|
\ No newline at end of file
+| [How to quantize Stable Diffusion model with OpenVINO NNCF](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)| Show how to apply post-training hybrid quantization on a Stable Diffusion model using [NNCF](https://github.com/openvinotoolkit/nncf) and to accelerate inference with OpenVINO| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)|
+| [How to quantize Sentence Transformer model with OpenVINO NNCF](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/sentence_transformer_quantization.ipynb)| Show how to apply post-training 8-bit quantization on a Sentence Transformer model using [NNCF](https://github.com/openvinotoolkit/nncf) and to accelerate inference with OpenVINO| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/sentence_transformer_quantization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/sentence_transformer_quantization.ipynb)|

From 635f939984287a6124ecf95e401577084714c480 Mon Sep 17 00:00:00 2001
From: Ella Charlaix 
Date: Fri, 25 Oct 2024 16:41:02 +0200
Subject: [PATCH 24/53] Dev version

---
 optimum/intel/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/intel/version.py b/optimum/intel/version.py
index e118ea713..16bf124e0 100644
--- a/optimum/intel/version.py
+++ b/optimum/intel/version.py
@@ -12,4 +12,4 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-__version__ = "1.20.0.dev0"
+__version__ = "1.21.0.dev0"

From 936d2729a1054f02d1ca927d2ed5fb03c19d8b2e Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Mon, 28 Oct 2024 20:43:21 +0400
Subject: [PATCH 25/53] Restore SDPA in Gemma2 models for transformers > 4.45
 (#976)

* Restore SDPA in Gemma2 models for transformers > 4.45

* Update tests/openvino/test_modeling.py

* Update tests/openvino/test_modeling.py
---
 optimum/exporters/openvino/model_patcher.py | 20 ++++++++++++++++++++
 tests/openvino/test_modeling.py             |  8 ++++++++
 2 files changed, 28 insertions(+)

diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 3bc9452ff..7e5cd76a7 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -2505,6 +2505,26 @@ def patched_forward(*args, **kwargs):
 
         self.patched_forward = patched_forward
 
+    def __enter__(self):
+        super().__enter__()
+        if is_transformers_version(">=", "4.45.0"):
+            from transformers.models.gemma2.modeling_gemma2 import GEMMA2_ATTENTION_CLASSES
+
+            sdpa_attn = GEMMA2_ATTENTION_CLASSES["sdpa"]
+            eager_attn = GEMMA2_ATTENTION_CLASSES["eager"]
+
+            for layer in self._model.model.layers:
+                if isinstance(layer.self_attn, eager_attn):
+                    layer.self_attn._orig_forward = layer.self_attn.forward
+                    layer.self_attn.forward = types.MethodType(sdpa_attn.forward, layer.self_attn)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        if is_transformers_version(">=", "4.45.0"):
+            for layer in self._model.model.layers:
+                if hasattr(layer.self_attn, "_orig_forward"):
+                    layer.self_attn.forward = layer.self_attn._orig_forward
+
 
 def _decilm_attn_forward(
     self,
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 119e00403..082ffef28 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -863,6 +863,10 @@ def test_compare_to_transformers(self, model_arch):
         if model_arch in self.REMOTE_CODE_MODELS:
             model_kwargs = {"trust_remote_code": True}
 
+        # starting from transformers 4.45.0 gemma2 uses eager attention by default, while ov - sdpa
+        if model_arch == "gemma2" and is_transformers_version(">=", "4.45.0"):
+            model_kwargs["attn_implementation"] = "sdpa"
+
         ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs)
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         self.assertTrue(ov_model.use_cache)
@@ -1094,6 +1098,10 @@ def test_beam_search(self, model_arch):
                 "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True),
                 "trust_remote_code": True,
             }
+
+        # starting from transformers 4.45.0 gemma2 uses eager attention by default, while ov - sdpa
+        if model_arch == "gemma2" and is_transformers_version(">=", "4.45.0"):
+            model_kwargs["attn_implementation"] = "sdpa"
         # Qwen tokenizer does not support padding, chatglm, glm4 testing models produce nan that incompatible with beam search
         if model_arch in ["qwen", "chatglm", "glm4"]:
             return
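The patch above temporarily rebinds each attention layer's forward to the SDPA implementation while the patcher is active and restores the original on exit. Below is a standalone sketch of that rebinding pattern using toy classes, not the optimum-intel or transformers ones.

import types

class EagerAttention:
    def forward(self, x):
        return ("eager", x)

class SdpaAttention:
    def forward(self, x):
        return ("sdpa", x)

class SwapForward:
    """Context manager that rebinds `module.forward` to another class's forward."""
    def __init__(self, module, replacement_cls):
        self.module = module
        self.replacement_cls = replacement_cls

    def __enter__(self):
        self.module._orig_forward = self.module.forward
        self.module.forward = types.MethodType(self.replacement_cls.forward, self.module)
        return self.module

    def __exit__(self, exc_type, exc_value, traceback):
        self.module.forward = self.module._orig_forward

layer = EagerAttention()
with SwapForward(layer, SdpaAttention):
    print(layer.forward(1))  # ('sdpa', 1) while the patcher is active
print(layer.forward(1))      # ('eager', 1) after restoration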

From 5f539d51286ca145540f7861dd7658aa39df7e49 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Tue, 29 Oct 2024 16:55:33 +0100
Subject: [PATCH 26/53] transformers 4.46 compatibility (#960)

* transformers 4.46 compatibility

* test

* upgrade python

* bump python

* Apply suggestions from code review

* fix trainer

* update examples requirements

* bump ipex version in tests

* update setup

* no ipex whl for python 3.12

* bump tests transformers version

* bump tests transformers version

* fix

---------

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
---
 .github/workflows/check_code_quality.yml      |  2 +-
 .github/workflows/test_generation.yml         |  2 +-
 .github/workflows/test_inc.yml                |  6 +++---
 .github/workflows/test_ipex.yml               |  8 +++----
 .github/workflows/test_offline.yaml           |  2 +-
 .github/workflows/test_openvino.yml           | 12 +++++------
 .github/workflows/test_openvino_basic.yml     | 11 ++++++----
 .github/workflows/test_openvino_examples.yml  |  2 +-
 .github/workflows/test_openvino_notebooks.yml |  2 +-
 .../audio-classification/requirements.txt     |  1 +
 .../image-classification/requirements.txt     |  1 +
 .../question-answering/requirements.txt       |  1 +
 .../text-classification/requirements.txt      |  1 +
 optimum/intel/neural_compressor/trainer.py    | 21 +++++++++++--------
 optimum/intel/openvino/trainer.py             |  5 +++++
 setup.py                                      |  4 ++--
 tests/openvino/test_quantization.py           |  3 +++
 tests/openvino/test_training.py               | 13 ++++++++++--
 18 files changed, 62 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml
index c76b6f804..4cf7d4cb0 100644
--- a/.github/workflows/check_code_quality.yml
+++ b/.github/workflows/check_code_quality.yml
@@ -24,7 +24,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8]
+        python-version: ["3.9"]
         os: [ubuntu-latest]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_generation.yml b/.github/workflows/test_generation.yml
index 3c2747318..f67cc2c0a 100644
--- a/.github/workflows/test_generation.yml
+++ b/.github/workflows/test_generation.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: ["3.9", "3.12"]
         os: [ubuntu-latest]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml
index 6e9992348..e29dc83f8 100644
--- a/.github/workflows/test_inc.yml
+++ b/.github/workflows/test_inc.yml
@@ -20,7 +20,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: ["3.9", "3.11"]
         os: [ubuntu-latest]
 
     runs-on: ${{ matrix.os }}
@@ -35,8 +35,8 @@ jobs:
         python -m pip install --upgrade pip
         pip install cmake
         pip install py-cpuinfo
-        pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --index-url https://download.pytorch.org/whl/cpu
-        pip install intel-extension-for-pytorch==2.3.0
+        pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
+        pip install intel-extension-for-pytorch==2.4.0
         pip install datasets==2.19.0
         pip install .[neural-compressor,diffusers,tests]
         pip install peft
diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml
index 4030f7d40..1c1d12870 100644
--- a/.github/workflows/test_ipex.yml
+++ b/.github/workflows/test_ipex.yml
@@ -21,13 +21,13 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.9]
+        python-version: ["3.9"]
         transformers-version: ["4.39.0", "4.44.*"]
         ipex-version: ["2.2.0", "2.3.*"]
         include:
-          - python-version: 3.8
-            transformers-version: 4.39.0
-            ipex-version: 2.2.0
+          - python-version: "3.10"
+            transformers-version: "4.39.0"
+            ipex-version: "2.2.0"
 
     steps:
       - uses: actions/checkout@v2
diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml
index a54ba2076..2e97a2f12 100644
--- a/.github/workflows/test_offline.yaml
+++ b/.github/workflows/test_offline.yaml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.9]
+        python-version: ["3.9"]
         os: [ubuntu-latest]
 
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml
index 53c210707..bfec51e48 100644
--- a/.github/workflows/test_openvino.yml
+++ b/.github/workflows/test_openvino.yml
@@ -20,8 +20,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.12"]
-        transformers-version: ["4.36.0", "4.45.*"]
+        python-version: ["3.9", "3.12"]
+        transformers-version: ["4.36.0", "latest"]
         os: [ubuntu-latest]
 
     runs-on: ${{ matrix.os }}
@@ -32,16 +32,16 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
+      - name: Install lowest compatible transformers version
+        if: ${{ matrix.transformers-version != 'latest' }}
+        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.*
+
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           # install PyTorch CPU version to avoid installing CUDA packages on GitHub runner without GPU
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
           pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
-          pip install transformers==${{ matrix.transformers-version }}
-
-      - if: ${{ matrix.transformers-version == '4.36.0' }}
-        run: pip install accelerate==0.*
 
       - name: Test with Pytest
         env:
diff --git a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml
index eefce73ab..7ea6898fa 100644
--- a/.github/workflows/test_openvino_basic.yml
+++ b/.github/workflows/test_openvino_basic.yml
@@ -24,7 +24,7 @@ jobs:
         # This also ensures that the test fails if dependencies break for Python 3.7
         python-version: ["3.9", "3.12"]
         os: ["ubuntu-22.04", "windows-latest"]
-        transformers-version: ["4.45.*"]
+        transformers-version: ["latest"]
         openvino: ["openvino openvino-tokenizers"]
         nncf: ["nncf"]
         include:
@@ -35,12 +35,12 @@ jobs:
             nncf: "nncf"
           - python-version: "3.12"
             os: "ubuntu-22.04"
-            transformers-version: "4.45.*"
+            transformers-version: "latest"
             openvino: "--pre -U openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly"
             nncf: "nncf"
           - python-version: "3.12"
             os: "ubuntu-22.04"
-            transformers-version: "4.45.*"
+            transformers-version: "latest"
             openvino: "--pre -U openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly"
             nncf: "git+https://github.com/openvinotoolkit/nncf.git"
 
@@ -53,6 +53,10 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
+      - name: Install lowest compatible transformers version
+        if: ${{ matrix.transformers-version != 'latest' }}
+        run: pip install transformers==${{ matrix.transformers-version }}
+
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -61,7 +65,6 @@ jobs:
           # Install openvino manually to prevent dependency conflicts when .[openvino] pins
           # optimum or transformers to a specific version
           pip install ${{ matrix.openvino }}
-          pip install transformers==${{ matrix.transformers-version }}
           pip install .[tests]
 
       - name: Pip freeze
diff --git a/.github/workflows/test_openvino_examples.yml b/.github/workflows/test_openvino_examples.yml
index c76374e9e..872058d24 100644
--- a/.github/workflows/test_openvino_examples.yml
+++ b/.github/workflows/test_openvino_examples.yml
@@ -22,7 +22,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.12"]
+        python-version: ["3.9", "3.12"]
 
     runs-on: ubuntu-22.04
 
diff --git a/.github/workflows/test_openvino_notebooks.yml b/.github/workflows/test_openvino_notebooks.yml
index 26a09012f..24eb3b4f1 100644
--- a/.github/workflows/test_openvino_notebooks.yml
+++ b/.github/workflows/test_openvino_notebooks.yml
@@ -23,7 +23,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.12"]
+        python-version: ["3.9", "3.12"]
 
     runs-on: ubuntu-22.04
 
diff --git a/examples/openvino/audio-classification/requirements.txt b/examples/openvino/audio-classification/requirements.txt
index f88b156da..89569575f 100644
--- a/examples/openvino/audio-classification/requirements.txt
+++ b/examples/openvino/audio-classification/requirements.txt
@@ -1,3 +1,4 @@
+transformers>=4.36.0,<4.46.0
 datasets>=1.14.0,<2.20.0
 evaluate
 librosa
diff --git a/examples/openvino/image-classification/requirements.txt b/examples/openvino/image-classification/requirements.txt
index 4c467e0d8..4ef921275 100644
--- a/examples/openvino/image-classification/requirements.txt
+++ b/examples/openvino/image-classification/requirements.txt
@@ -1,3 +1,4 @@
+transformers>=4.36.0,<4.46.0
 datasets>=1.14.0,<2.20.0
 torch >= 1.9.0
 torchvision>=0.6.0
diff --git a/examples/openvino/question-answering/requirements.txt b/examples/openvino/question-answering/requirements.txt
index 2af8f0268..b4e37df13 100644
--- a/examples/openvino/question-answering/requirements.txt
+++ b/examples/openvino/question-answering/requirements.txt
@@ -1,3 +1,4 @@
+transformers>=4.36.0,<4.46.0
 datasets>=1.14.0,<2.20.0
 torch >= 1.9.0
 evaluate
diff --git a/examples/openvino/text-classification/requirements.txt b/examples/openvino/text-classification/requirements.txt
index bcf3f8025..f8b37a9e5 100644
--- a/examples/openvino/text-classification/requirements.txt
+++ b/examples/openvino/text-classification/requirements.txt
@@ -1,3 +1,4 @@
+transformers>=4.36.0,<4.46.0
 datasets>=1.14.0,<2.20.0
 sentencepiece != 0.1.92
 scipy
diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py
index c0fe0cf6d..c4ecf8570 100644
--- a/optimum/intel/neural_compressor/trainer.py
+++ b/optimum/intel/neural_compressor/trainer.py
@@ -39,6 +39,7 @@
 from transformers import Trainer
 from transformers.data.data_collator import DataCollator
 from transformers.debug_utils import DebugOption, DebugUnderflowOverflow
+from transformers.feature_extraction_utils import FeatureExtractionMixin
 from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype, unwrap_model
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
@@ -104,7 +105,7 @@
     from neural_compressor.config import _BaseQuantizationConfig
 
 
-__version__ = "4.22.2"
+__version__ = "4.46.0"
 
 
 logger = logging.get_logger(__name__)
@@ -122,8 +123,9 @@ def __init__(
         data_collator: Optional[DataCollator] = None,
         train_dataset: Optional[Dataset] = None,
         eval_dataset: Optional[Dataset] = None,
-        tokenizer: Optional[PreTrainedTokenizerBase] = None,
+        processing_class: Optional[Union[PreTrainedTokenizerBase, FeatureExtractionMixin]] = None,
         model_init: Callable[[], PreTrainedModel] = None,
+        compute_loss_func: Optional[Callable] = None,
         compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
         callbacks: Optional[List[TrainerCallback]] = None,
         optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
@@ -132,6 +134,7 @@ def __init__(
         pruning_config: Optional[_BaseQuantizationConfig] = None,
         distillation_config: Optional[_BaseQuantizationConfig] = None,
         task: Optional[str] = None,
+        **kwargs,
     ):
         self.neftune_noise_alpha = None
 
@@ -141,12 +144,12 @@ def __init__(
             data_collator,
             train_dataset,
             eval_dataset,
-            tokenizer,
-            model_init,
-            compute_metrics,
-            callbacks,
-            optimizers,
-            preprocess_logits_for_metrics,
+            processing_class or kwargs.get("tokenizer", None),
+            model_init=model_init,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
         )
 
         if self.args.device.type == "cuda" and not is_neural_compressor_version(">", "2.0.0"):
@@ -766,7 +769,7 @@ def _get_logits(model_outputs):
         output_names = ["logits", "start_logits", "end_logits"]
         return tuple(model_outputs.get(name) for name in output_names if name in model_outputs)
 
-    def compute_loss(self, model, inputs, return_outputs=False):
+    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
         """
         How the loss is computed by Trainer. By default, all models return the loss in the first element.
         """
diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py
index a2f08b647..bac782205 100644
--- a/optimum/intel/openvino/trainer.py
+++ b/optimum/intel/openvino/trainer.py
@@ -216,6 +216,11 @@ def __init__(
         logger.warning("OVTrainer is deprecated and will be removed in optimum-intel v1.22.0.")
 
         if is_transformers_version(">=", "4.45.0"):
+            if is_transformers_version(">=", "4.46.0"):
+                raise ImportError(
+                    f"Unsupported transformers version found is {_transformers_version} which is not supported by the OVTrainer. Please downgrade to v4.44"
+                )
+
             logger.warning(
                 f"The transformers version found is {_transformers_version} which is not officially supported by the OVTrainer, use at your own risk"
             )
diff --git a/setup.py b/setup.py
index 7eb40d6c3..9ac8dce70 100644
--- a/setup.py
+++ b/setup.py
@@ -28,7 +28,7 @@
 
 INSTALL_REQUIRE = [
     "torch>=1.11",
-    "transformers>=4.36,<4.46",
+    "transformers>=4.36,<4.47",
     "optimum~=1.23",
     "datasets>=1.4.0",
     "sentencepiece",
@@ -60,7 +60,7 @@
 QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"]
 
 EXTRAS_REQUIRE = {
-    "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate"],
+    "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
     "openvino": [
         "openvino==2024.4.1.dev20240926",
         "nncf>=2.11.0",
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index f2a4dc723..0e01932b6 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -786,6 +786,9 @@ class OVTrainerTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("albert", 64, 39),)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
+    @unittest.skipIf(
+        is_transformers_version(">=", "4.46"), reason="OVTrainer is not compatible with transformers>=v4.46"
+    )
     def test_aware_training_quantization(self, model_name, expected_fake_quantize, expected_int8):
         model_id = MODEL_NAMES[model_name]
         model = AutoModelForSequenceClassification.from_pretrained(model_id, attn_implementation="eager")
diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py
index 4147a632b..76f7ec319 100644
--- a/tests/openvino/test_training.py
+++ b/tests/openvino/test_training.py
@@ -475,7 +475,10 @@ class OVTrainerTextClassificationTrainingTest(OVTrainerBaseTrainingTest):
     task = "sequence-classification"
 
     @parameterized.expand(OVTRAINER_TEXT_CLASSIFICATION_TEST_DESCRIPTORS.items())
-    @unittest.skipIf(is_transformers_version("<", "4.41.0"), reason="Mismatch in expected fake quantized op")
+    @unittest.skipIf(
+        is_transformers_version("<", "4.41") or is_transformers_version(">=", "4.46"),
+        reason="Mismatch in expected fake quantized op and incompatible with transformers v4.46",
+    )
     def test_training(self, _, desc: OVTrainerTestDescriptor):
         self.run_ovtrainer_training_checks(desc)
 
@@ -627,7 +630,10 @@ class OVTrainerImageClassificationTrainingTest(OVTrainerBaseTrainingTest):
     @parameterized.expand(OVTRAINER_IMAGE_CLASSIFICATION_TEST_DESCRIPTORS.items())
     @pytest.mark.run_slow
     @slow
-    @unittest.skipIf(is_transformers_version("<", "4.41.0"), reason="Mismatch in expected fake quantized op")
+    @unittest.skipIf(
+        is_transformers_version("<", "4.41") or is_transformers_version(">=", "4.46"),
+        reason="Mismatch in expected fake quantized op and incompatible with transformers v4.46",
+    )
     def test_training(self, _, desc: OVTrainerTestDescriptor):
         self.run_ovtrainer_training_checks(desc)
 
@@ -808,6 +814,9 @@ class OVTrainerAudioClassificationTrainingTest(OVTrainerBaseTrainingTest):
     @parameterized.expand(OVTRAINER_AUDIO_CLASSIFICATION_TEST_DESCRIPTORS.items())
     @pytest.mark.run_slow
     @slow
+    @unittest.skipIf(
+        is_transformers_version(">=", "4.46"), reason="OVTrainer is not compatible with transformers>=v4.46"
+    )
     def test_training(self, _, desc: OVTrainerTestDescriptor):
         self.run_ovtrainer_training_checks(desc)
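Among the changes in this patch, the INCTrainer hunk adapts to the transformers 4.46 Trainer signature, where `tokenizer` is superseded by `processing_class`. A minimal, standalone sketch of the keyword back-compat pattern it uses (names are illustrative):

def resolve_processing_class(processing_class=None, **kwargs):
    # prefer the new argument, fall back to a legacy `tokenizer` kwarg if provided
    return processing_class if processing_class is not None else kwargs.get("tokenizer")

print(resolve_processing_class(tokenizer="legacy-tokenizer"))      # 'legacy-tokenizer'
print(resolve_processing_class(processing_class="new-interface"))  # 'new-interface'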
 

From cead516b3b8c38c1a0b8b8937a861edccdfab839 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Thu, 31 Oct 2024 22:47:24 +0400
Subject: [PATCH 27/53] add token_type_ids in lm forward signature (#964)

* add token_type_ids in lm forward signature

* update tests

* Update tests/openvino/test_modeling.py

* add message in tests

* Update tests/openvino/test_modeling.py

* Update tests/openvino/test_modeling.py

* Update optimum/intel/openvino/modeling_decoder.py

---------

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
---
 optimum/intel/openvino/modeling_decoder.py | 3 +++
 tests/openvino/test_modeling.py            | 9 +++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 7c0bde8cd..4897db145 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -522,9 +522,12 @@ def forward(
         attention_mask: Optional[torch.LongTensor] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         position_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> CausalLMOutputWithPast:
         self.compile()
+        # added as model.generate validates model inputs based on forward signature
+        kwargs["token_type_ids"] = token_type_ids
 
         inputs = self.prepare_inputs(
             input_ids=input_ids,
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 082ffef28..8b4258adf 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -872,7 +872,6 @@ def test_compare_to_transformers(self, model_arch):
         self.assertTrue(ov_model.use_cache)
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
         tokens = tokenizer("This is a sample output", return_tensors="pt")
-        tokens.pop("token_type_ids", None)
 
         ov_outputs = ov_model(**tokens)
         self.assertTrue("logits" in ov_outputs)
@@ -909,7 +908,6 @@ def test_compare_to_transformers(self, model_arch):
         # Compare batched generation
         tokenizer.padding_side = "left"
         tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True)
-        tokens.pop("token_type_ids", None)
         ov_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
         ov_model.config.eos_token_id = None
@@ -933,7 +931,10 @@ def test_compare_to_transformers(self, model_arch):
 
             additional_inputs = {"past_key_values": DynamicCache()}
         transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config, **additional_inputs)
-        self.assertTrue(torch.allclose(ov_outputs, transformers_outputs))
+        self.assertTrue(
+            torch.allclose(ov_outputs, transformers_outputs),
+            "OV output {ov_outputs}\nTransformers output  {transformers_output}",
+        )
 
         del transformers_model
         del ov_model
@@ -1102,6 +1103,7 @@ def test_beam_search(self, model_arch):
         # starting from transformers 4.45.0 gemma2 uses eager attention by default, while ov - sdpa
         if model_arch == "gemma2" and is_transformers_version(">=", "4.45.0"):
             model_kwargs["attn_implementation"] = "sdpa"
+
         # Qwen tokenizer does not support padding, chatglm, glm4 testing models produce nan that incompatible with beam search
         if model_arch in ["qwen", "chatglm", "glm4"]:
             return
@@ -1177,7 +1179,6 @@ def test_beam_search(self, model_arch):
             from transformers.cache_utils import DynamicCache
         tokenizer.pad_token_id = tokenizer.eos_token_id
         tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True)
-        tokens.pop("token_type_ids", None)
         ov_model_stateful.generation_config.eos_token_id = None
         ov_model_stateless.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
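The modeling change in this patch surfaces `token_type_ids` in the forward signature because, as the added comment notes, `generate()` validates model inputs against that signature. A simplified, standalone sketch of that kind of signature-based validation (not the transformers implementation):

import inspect

def forward_without(input_ids, attention_mask=None):
    return "ok"

def forward_with(input_ids, attention_mask=None, token_type_ids=None):
    return "ok"

def unexpected_model_kwargs(forward_fn, model_kwargs):
    # flag kwargs that the forward signature does not explicitly declare
    declared = set(inspect.signature(forward_fn).parameters)
    return [name for name in model_kwargs if name not in declared]

kwargs = {"attention_mask": [1, 1], "token_type_ids": [0, 0]}
print(unexpected_model_kwargs(forward_without, kwargs))  # ['token_type_ids'] -> would be rejected
print(unexpected_model_kwargs(forward_with, kwargs))     # [] -> accepted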

From e5fb400b622d8b470275ba6e1999490d64716f93 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Fri, 1 Nov 2024 11:11:41 +0400
Subject: [PATCH 28/53] fix order of hidden states in text encoder (#984)

* fix order of hidden states in text encoder

* Update optimum/intel/openvino/modeling_diffusion.py
---
 optimum/intel/openvino/modeling_diffusion.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 8bca8cc9a..dd212cd76 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -995,9 +995,9 @@ def modules(self):
 class OVModelTextEncoder(OVPipelinePart):
     def __init__(self, model: openvino.runtime.Model, parent_pipeline: OVDiffusionPipeline, model_name: str = ""):
         super().__init__(model, parent_pipeline, model_name)
-        self.hidden_states_output_names = sorted(
-            {name for out in self.model.outputs for name in out.names if name.startswith("hidden_states")}
-        )
+        self.hidden_states_output_names = [
+            name for out in self.model.outputs for name in out.names if name.startswith("hidden_states")
+        ]
 
     def forward(
         self,
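A small illustration of why model output order is preferable to `sorted()` here: output names with numeric suffixes sort lexicographically, not numerically (the exact naming scheme is assumed for illustration).

names = [f"hidden_states.{i}" for i in range(12)]  # order as produced by the model outputs
print(names[:4])               # ['hidden_states.0', 'hidden_states.1', 'hidden_states.2', 'hidden_states.3']
print(sorted(set(names))[:4])  # ['hidden_states.0', 'hidden_states.1', 'hidden_states.10', 'hidden_states.11']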

From 46e778102b38869f741abdeaa4aa7e4d43755623 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Fri, 1 Nov 2024 11:12:31 +0400
Subject: [PATCH 29/53] fix getting default diffusion pipeline parameters from
 config (#983)

---
 optimum/intel/openvino/modeling_diffusion.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index dd212cd76..18d8a7506 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -409,6 +409,8 @@ def _from_pretrained(
             "tokenizer_2": None,
             "tokenizer_3": None,
             "feature_extractor": None,
+            "image_encoder": None,
+            "safety_checker": None,
         }
         for name in submodels.keys():
             if kwargs.get(name) is not None:
@@ -434,6 +436,10 @@ def _from_pretrained(
             "text_encoder_3": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER / text_encoder_3_file_name,
         }
 
+        for config_key, value in config.items():
+            if config_key not in models and config_key not in kwargs and config_key not in submodels:
+                kwargs[config_key] = value
+
         compile_only = kwargs.get("compile_only", False)
         quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
         if (quantization_config is None or quantization_config.dataset is None) and not compile_only:
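
A minimal sketch of the fallback added above, with hypothetical values: any model_index.json entry (such as SDXL's force_zeros_for_empty_prompt) that is neither an exported model nor an already provided submodel/kwarg is forwarded to the pipeline as a default keyword argument.

    config = {"force_zeros_for_empty_prompt": True, "unet": ["diffusers", "UNet2DConditionModel"]}
    models = {"unet": "unet/openvino_model.xml"}
    submodels, kwargs = {"scheduler": None}, {}
    for config_key, value in config.items():
        if config_key not in models and config_key not in kwargs and config_key not in submodels:
            kwargs[config_key] = value
    print(kwargs)  # {'force_zeros_for_empty_prompt': True}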

From 74d2161e63899c9c65c8cfa03237163a77ae1bd6 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Fri, 1 Nov 2024 11:12:50 +0400
Subject: [PATCH 30/53] fix diffusers version info in IR (#978)

---
 optimum/exporters/openvino/convert.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index 2c076827d..df2885fd0 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -39,6 +39,7 @@
     get_diffusion_models_for_export,
 )
 from optimum.intel.utils.import_utils import (
+    _diffusers_version,
     _nncf_version,
     _open_clip_version,
     _optimum_intel_version,
@@ -806,7 +807,7 @@ def _add_version_info_to_model(model: Model, library_name: Optional[str] = None)
 
             model.set_rt_info(sentence_transformers.__version__, ["optimum", "sentence_transformers_version"])
         if library_name == "diffusers":
-            model.set_rt_info(_optimum_version, ["optimum", "diffusers_version"])
+            model.set_rt_info(_diffusers_version, ["optimum", "diffusers_version"])
         elif library_name == "timm":
             model.set_rt_info(_timm_version, ["optimum", "timm_version"])
         elif library_name == "open_clip":
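
To verify the recorded value, one could read the runtime info back from an exported IR; a hedged sketch, assuming Model.get_rt_info accepts the same path form as set_rt_info and that the IR path below exists.

    import openvino as ov

    model = ov.Core().read_model("unet/openvino_model.xml")  # hypothetical path to an exported diffusers submodel
    # Before this fix the field carried the optimum version; it now carries the diffusers version.
    print(model.get_rt_info(["optimum", "diffusers_version"]))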

From a46ec679c2c78fb13a8298a6bc5fc7a39d8040ef Mon Sep 17 00:00:00 2001
From: Nikita Savelyev 
Date: Mon, 4 Nov 2024 11:42:27 +0100
Subject: [PATCH 31/53] Add SDPA to scope overrides (#982)

---
 .../configs/wav2vec2-base-jpqd.json                   |  3 +++
 .../configs/wav2vec2-base-qat.json                    |  3 +++
 .../image-classification/configs/swin-base-jpqd.json  | 11 ++++++++++-
 optimum/intel/openvino/trainer.py                     |  7 ++++++-
 4 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/examples/openvino/audio-classification/configs/wav2vec2-base-jpqd.json b/examples/openvino/audio-classification/configs/wav2vec2-base-jpqd.json
index c58903da1..41e53f5cb 100644
--- a/examples/openvino/audio-classification/configs/wav2vec2-base-jpqd.json
+++ b/examples/openvino/audio-classification/configs/wav2vec2-base-jpqd.json
@@ -54,6 +54,9 @@
             "activations": {
                 "{re}.*matmul_0": {
                     "mode": "symmetric"
+                },
+                "{re}.*scaled_dot_product_attention_0": {
+                    "mode": "symmetric"
                 }
             }
         },
diff --git a/examples/openvino/audio-classification/configs/wav2vec2-base-qat.json b/examples/openvino/audio-classification/configs/wav2vec2-base-qat.json
index 8edc51cf2..191f266a6 100644
--- a/examples/openvino/audio-classification/configs/wav2vec2-base-qat.json
+++ b/examples/openvino/audio-classification/configs/wav2vec2-base-qat.json
@@ -16,6 +16,9 @@
         "activations": {
             "{re}.*matmul_0": {
                 "mode": "symmetric"
+            },
+            "{re}.*scaled_dot_product_attention_0": {
+                "mode": "symmetric"
             }
         }
     },
diff --git a/examples/openvino/image-classification/configs/swin-base-jpqd.json b/examples/openvino/image-classification/configs/swin-base-jpqd.json
index 23b2fd3d8..a6057f6d7 100644
--- a/examples/openvino/image-classification/configs/swin-base-jpqd.json
+++ b/examples/openvino/image-classification/configs/swin-base-jpqd.json
@@ -32,7 +32,16 @@
           "num_bn_adaptation_samples": 200
         }
       },
-      "scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}},
+      "scope_overrides": {
+        "activations": {
+          "{re}.*matmul_0": {
+            "mode": "symmetric"
+          },
+          "{re}.*scaled_dot_product_attention_0": {
+            "mode": "symmetric"
+          }
+        }
+      },
       "ignored_scopes": [
         "{re}.*__add___[0-1]",
         "{re}.*layer_norm_0",
diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py
index bac782205..0edb3a730 100644
--- a/optimum/intel/openvino/trainer.py
+++ b/optimum/intel/openvino/trainer.py
@@ -148,7 +148,12 @@
         "range": {"num_init_samples": 300, "type": "mean_min_max"},
         "batchnorm_adaptation": {"num_bn_adaptation_samples": 0},
     },
-    "scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}},
+    "scope_overrides": {
+        "activations": {
+            "{re}.*matmul_0": {"mode": "symmetric"},
+            "{re}.*scaled_dot_product_attention_0": {"mode": "symmetric"},
+        }
+    },
     "ignored_scopes": [
         "{re}.*Embedding.*",
         "{re}.*add___.*",

From 54a9727399b84715499b277603e366690ffed301 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Tue, 5 Nov 2024 11:10:30 +0400
Subject: [PATCH 32/53] add minicpmv support (#972)

---
 optimum/exporters/openvino/model_configs.py   | 271 +++++++++++
 optimum/exporters/openvino/model_patcher.py   | 212 ++++++++-
 optimum/exporters/openvino/utils.py           |   2 +-
 .../openvino/modeling_visual_language.py      | 429 +++++++++++++++++-
 tests/openvino/test_modeling.py               |  92 +++-
 tests/openvino/utils_tests.py                 |   1 +
 6 files changed, 977 insertions(+), 30 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index ace5c150d..108deed57 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -75,6 +75,8 @@
     JaisModelPatcher,
     LlamaModelPatcher,
     LlavaImageEmbeddingModelPatcher,
+    MiniCPMVImageEmbeddingsModelPatcher,
+    MiniCPMVResamplerModelPatcher,
     MistralModelPatcher,
     MixtralModelPatcher,
     MPTModelPatcher,
@@ -1738,3 +1740,272 @@ def patch_model_for_export(
         self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
     ) -> ModelPatcher:
         return FluxTransfromerModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
+class DummyMiniCPMVImageInputGenerator(DummyVisionInputGenerator):
+    SUPPORTED_INPUT_NAMES = ("pixel_values", "patch_attention_mask", "position_ids")
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        width: int = DEFAULT_DUMMY_SHAPES["width"],
+        height: int = DEFAULT_DUMMY_SHAPES["height"],
+        **kwargs,
+    ):
+        super().__init__(task, normalized_config, batch_size, num_channels, width, height)
+        self.patch_size = normalized_config.config.patch_size
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name == "pixel_values":
+            return self.random_float_tensor(
+                shape=[
+                    self.batch_size,
+                    self.num_channels,
+                    self.patch_size,
+                    (self.height * self.width) // self.patch_size,
+                ],
+                framework=framework,
+                dtype=float_dtype,
+            )
+
+        if input_name == "patch_attention_mask":
+            return self.random_int_tensor(
+                shape=[self.batch_size, 1, (self.height // self.patch_size) * (self.width // self.patch_size)],
+                framework=framework,
+                dtype=float_dtype,
+                min_value=0,
+                max_value=2,
+            )
+
+        if input_name == "position_ids":
+            return self.random_int_tensor(
+                shape=[self.batch_size, (self.height // self.patch_size) * (self.width // self.patch_size)],
+                max_value=self.patch_size,
+            )
+
+
+class DummyMiniCPMVResampleInputGenerator(DummyVisionInputGenerator):
+    SUPPORTED_INPUT_NAMES = ("image_feature", "pos_embed", "key_padding_mask")
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        width: int = DEFAULT_DUMMY_SHAPES["width"],
+        height: int = DEFAULT_DUMMY_SHAPES["height"],
+        **kwargs,
+    ):
+        super().__init__(task, normalized_config, batch_size, num_channels, width, height)
+        self.patch_size = normalized_config.config.patch_size
+        self.hidden_size = normalized_config.config.hidden_size
+        self.img_hidden_size = normalized_config.config.vision_config.hidden_size
+        self.feat_size = (normalized_config.config.vision_config.image_size // self.patch_size) * (
+            normalized_config.config.vision_config.image_size // self.patch_size
+        )
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name == "image_feature":
+            return self.random_float_tensor(
+                shape=[self.batch_size, self.feat_size, self.img_hidden_size], framework=framework, dtype=float_dtype
+            )
+
+        if input_name == "key_padding_mask":
+            return self.constant_tensor(
+                shape=[self.batch_size, self.feat_size],
+                framework=framework,
+                value=1,
+                dtype=DTYPE_MAPPER.pt(float_dtype),
+            )
+
+        if input_name == "pos_embed":
+            return self.random_float_tensor(shape=[self.feat_size, self.batch_size, self.hidden_size])
+
+
+class MiniCPMVConfigBehavior(str, enum.Enum):
+    RESAMPLER = "resampler"
+    LANGUAGE = "language"
+    VISION_EMBEDDINGS = "vision_embeddings"
+    TEXT_EMBEDDINGS = "text_embeddings"
+
+
+@register_in_tasks_manager("minicpmv", *["image-text-to-text"], library_name="transformers")
+class MiniCPMVOpenVINOConfig(OnnxConfig):
+    SUPPORTED_BEHAVIORS = [model_type.value for model_type in MiniCPMVConfigBehavior]
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+    DUMMY_INPUT_GENERATOR_CLASSES = ()
+
+    def __init__(
+        self,
+        config: "PretrainedConfig",
+        task: str = "feature-extraction",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+        behavior: MiniCPMVConfigBehavior = MiniCPMVConfigBehavior.VISION_EMBEDDINGS,
+        preprocessors: Optional[List[Any]] = None,
+    ):
+        super().__init__(
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+        )
+        self._behavior = behavior
+        self._orig_config = config
+        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
+            self._config = config.vision_config
+            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVImageInputGenerator,)
+        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
+            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVResampleInputGenerator,)
+        self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
+            return {
+                "pixel_values": {0: "batch_size", 2: "height", 3: "width"},
+                "patch_attention_mask": {0: "batch_size", 1: "num_patches", 2: "patch_size"},
+                "position_ids": {0: "batch_size", 1: "patch_size"},
+            }
+        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
+            return {
+                "image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
+                "pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
+                "key_padding_mask": {0: "batch_size", 1: "patch_size"},
+            }
+        return {}
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
+            return {"last_hidden_state": {0: "batch_size", 1: "patch_height", 2: "patch_width"}}
+        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
+            return {"last_hidden_state": {0: "batch_size"}}
+
+        return {}
+
+    def with_behavior(
+        self,
+        behavior: Union[str, MiniCPMVConfigBehavior],
+    ):
+        """
+        Creates a config for a different behavior.
+        Args:
+            behavior ([`MiniCPMVConfigBehavior`]):
+                The behavior to use for the new instance.
+        """
+        if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
+            behavior = MiniCPMVConfigBehavior(behavior)
+
+        if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
+            model_type = "qwen2"
+            model_type = model_type.replace("_", "-")
+            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
+                raise ValueError(
+                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
+                )
+
+            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
+                raise ValueError(
+                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
+                )
+            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
+                "text-generation-with-past"
+            ]
+            internal_export_config = internal_export_config_class(
+                self._orig_config,
+                use_past=True,
+                use_past_in_inputs=True,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
+            export_config = InputEmbedOpenvVINOConfig(
+                self._orig_config,
+                task="feature-extraction",
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            return export_config
+
+        if behavior == MiniCPMVConfigBehavior.LANGUAGE:
+            model_type = "qwen2"
+            model_type = model_type.replace("_", "-")
+
+            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
+                raise ValueError(
+                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
+                )
+
+            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
+                raise ValueError(
+                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
+                )
+            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
+                "text-generation-with-past"
+            ]
+            internal_export_config = internal_export_config_class(
+                self._orig_config,
+                use_past=True,
+                use_past_in_inputs=True,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            export_config = LMInputEmbedsConfigHelper(internal_export_config)
+            export_config._normalized_config = internal_export_config._normalized_config
+            return export_config
+
+        if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+
+        if behavior == MiniCPMVConfigBehavior.RESAMPLER:
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+
+    def get_model_for_behavior(self, model, behavior: Union[str, MiniCPMVConfigBehavior]):
+        if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
+            behavior = MiniCPMVConfigBehavior(behavior)
+
+        if behavior == MiniCPMVConfigBehavior.LANGUAGE:
+            return model.llm
+
+        if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
+            return model.vpm
+
+        if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
+            text_embedding = model.get_input_embeddings()
+            text_embedding.config = model.llm.config
+            return text_embedding
+        if behavior == MiniCPMVConfigBehavior.RESAMPLER:
+            model.resampler.config = model.vpm.config
+            return model.resampler
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        model_kwargs = model_kwargs or {}
+        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
+            return MiniCPMVImageEmbeddingsModelPatcher(self, model, model_kwargs)
+
+        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
+            return MiniCPMVResamplerModelPatcher(self, model, model_kwargs)
+
+        return super().patch_model_for_export(model, model_kwargs)
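
A rough usage sketch of the config above, where pt_model stands for a hypothetical MiniCPM-V checkpoint loaded with transformers: the export splits the model into one submodel per behavior, pairing each behavior-specific export config with the matching torch submodule (llm, vpm, input embeddings or resampler).

    export_config = MiniCPMVOpenVINOConfig(pt_model.config, task="image-text-to-text")
    for behavior in MiniCPMVConfigBehavior:
        sub_config = export_config.with_behavior(behavior)
        submodule = export_config.get_model_for_behavior(pt_model, behavior)
        print(behavior.value, type(submodule).__name__, list(getattr(sub_config, "inputs", {})))
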
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 7e5cd76a7..b1aa7eaa9 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -21,7 +21,7 @@
 
 import torch
 import torch.nn.functional as F
-from transformers.modeling_outputs import BaseModelOutputWithPast
+from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling
 from transformers.utils import is_tf_available
 
 from optimum.exporters.onnx.model_patcher import DecoderModelPatcher, ModelPatcher, override_arguments
@@ -2763,3 +2763,213 @@ def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         if hasattr(self._model.pos_embed, "_orig_forward"):
             self._model.pos_embed.forward = self._model.pos_embed._orig_forward
+
+
+def _minicpmv_resampler_forward(self, image_feature, pos_embed, key_padding_mask):
+    bs = image_feature.shape[0]
+    image_feature = self.kv_proj(image_feature)  # B * L * D
+    image_feature = self.ln_kv(image_feature).permute(1, 0, 2)  # L * B * D
+
+    q = self.ln_q(self.query)  # Q * D
+
+    q_bs = q.unsqueeze(1).repeat(1, bs, 1)
+
+    out = self.attn(q_bs, image_feature + pos_embed, image_feature, key_padding_mask=key_padding_mask)[
+        0
+    ]  # Q * B * D  # L * B * D +  L * B * D
+    #  out: Q * B * D
+    x = out.permute(1, 0, 2)  # B * Q * D
+
+    x = self.ln_post(x)
+    x = x @ self.proj
+    return x
+
+
+def _minicpmv_siglip_vis_embed_forward(
+    self,
+    pixel_values: torch.FloatTensor,
+    patch_attention_mask: torch.BoolTensor,
+    tgt_sizes: Optional[torch.IntTensor] = None,
+    position_ids: Optional[torch.FloatTensor] = None,
+) -> torch.Tensor:
+    patch_embeds = self.patch_embedding(pixel_values)
+    embeddings = patch_embeds.flatten(2).transpose(1, 2)
+
+    if position_ids is None:
+        batch_size = pixel_values.size(0)
+        max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3)
+        max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
+        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
+        position_ids = torch.full(
+            size=(
+                batch_size,
+                max_nb_patches_h * max_nb_patches_w,
+            ),
+            fill_value=0,
+        )
+
+        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+            if tgt_sizes is not None:
+                nb_patches_h = tgt_sizes[batch_idx][0]
+                nb_patches_w = tgt_sizes[batch_idx][1]
+            else:
+                nb_patches_h = p_attn_mask[:, 0].sum()
+                nb_patches_w = p_attn_mask[0].sum()
+
+            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
+            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
+
+            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
+            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
+
+            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
+            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
+
+    position_ids = position_ids.to(self.position_embedding.weight.device)
+
+    embeddings = embeddings + self.position_embedding(position_ids)
+    return embeddings
+
+
+def _minicpmv_siglip_attn_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    output_attentions: Optional[bool] = False,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    """Input shape: Batch x Time x Channel"""
+
+    batch_size, q_len, _ = hidden_states.size()
+
+    query_states = self.q_proj(hidden_states)
+    key_states = self.k_proj(hidden_states)
+    value_states = self.v_proj(hidden_states)
+
+    query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+    attn_output = torch.nn.functional.scaled_dot_product_attention(
+        query_states, key_states, value_states, attention_mask, is_causal=attention_mask is None
+    )
+
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
+
+    attn_output = self.out_proj(attn_output)
+
+    return attn_output, None
+
+
+def _minicpmv_siglip_transformer_forward(
+    self,
+    pixel_values,
+    patch_attention_mask: Optional[torch.BoolTensor] = None,
+    tgt_sizes: Optional[torch.IntTensor] = None,
+    position_ids: Optional[torch.FloatTensor] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+) -> Union[Tuple, BaseModelOutputWithPooling]:
+    from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
+
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    batch_size = pixel_values.size(0)
+    if patch_attention_mask is None:
+        patch_attention_mask = torch.ones(
+            size=(
+                batch_size,
+                pixel_values.size(2) // self.config.patch_size,
+                pixel_values.size(3) // self.config.patch_size,
+            ),
+            dtype=torch.bool,
+            device=pixel_values.device,
+        )
+
+    hidden_states = self.embeddings(
+        pixel_values=pixel_values,
+        patch_attention_mask=patch_attention_mask,
+        tgt_sizes=tgt_sizes,
+        position_ids=position_ids,
+    )
+
+    patch_attention_mask = patch_attention_mask.view(batch_size, -1)
+    attention_mask = (
+        _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
+        if not self._use_flash_attention_2
+        else patch_attention_mask
+    )
+
+    encoder_outputs = self.encoder(
+        inputs_embeds=hidden_states,
+        attention_mask=attention_mask,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+    )
+
+    last_hidden_state = encoder_outputs[0]
+    last_hidden_state = self.post_layernorm(last_hidden_state)
+
+    if not return_dict:
+        return (last_hidden_state, None) + encoder_outputs[1:]
+
+    return BaseModelOutputWithPooling(
+        last_hidden_state=last_hidden_state,
+        pooler_output=None,
+        hidden_states=encoder_outputs.hidden_states,
+        attentions=encoder_outputs.attentions,
+    )
+
+
+class MiniCPMVResamplerModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+        model.forward = types.MethodType(_minicpmv_resampler_forward, model)
+
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
+
+
+class MiniCPMVImageEmbeddingsModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+        model.forward = types.MethodType(_minicpmv_siglip_transformer_forward, model)
+
+        super().__init__(config, model, model_kwargs)
+
+    def __enter__(self):
+        super().__enter__()
+        self._model.embeddings._orig_forward = self._model.embeddings.forward
+        self._model.embeddings.forward = types.MethodType(_minicpmv_siglip_vis_embed_forward, self._model.embeddings)
+
+        if is_torch_version(">=", "2.0.0"):
+            for layer in self._model.encoder.layers:
+                layer.self_attn._orig_forward = layer.self_attn.forward
+                layer.self_attn.forward = types.MethodType(_minicpmv_siglip_attn_forward, layer.self_attn)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
+        self._model.embeddings.forward = self._model.embeddings._orig_forward
+        if is_torch_version(">=", "2.0.0"):
+            for layer in self._model.encoder.layers:
+                layer.self_attn.forward = layer.self_attn._orig_forward
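
Both patchers above follow the usual pattern of swapping forward for an export-friendly implementation inside a context manager and restoring it on exit; a stripped-down toy sketch of that pattern (not the real ModelPatcher API):

    import types

    class ToyForwardPatcher:
        # Temporarily replaces `module.forward` and restores it on exit.
        def __init__(self, module, new_forward):
            self._module = module
            self._new_forward = new_forward

        def __enter__(self):
            self._module._orig_forward = self._module.forward
            self._module.forward = types.MethodType(self._new_forward, self._module)
            return self._module

        def __exit__(self, exc_type, exc_value, traceback):
            self._module.forward = self._module._orig_forward
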
diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py
index 75106fc2b..35e0c3017 100644
--- a/optimum/exporters/openvino/utils.py
+++ b/optimum/exporters/openvino/utils.py
@@ -208,4 +208,4 @@ def get_submodels(model):
     return custom_export, fn_get_submodels
 
 
-MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "internvl-chat"]
+MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "internvl-chat", "minicpmv"]
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 141abeb87..b071602d9 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -171,6 +171,7 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
         super().__init__(model, parent_model, model_name=self._model_name)
         self.output_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.outputs}
         self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)}
+        self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)}
         self.hidden_states_output_names = []
         if len(self.model.outputs) > 2:
             self.hidden_states_output_names = [
@@ -178,7 +179,12 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
             ]
 
     def forward(self, pixel_values, **kwargs):
-        result = self.request({"pixel_values": pixel_values})
+        inputs = {"pixel_values": pixel_values}
+        if len(self.input_names) > 1:
+            for name in self.input_names:
+                if name in kwargs:
+                    inputs[name] = kwargs[name]
+        result = self.request(inputs)
         last_hidden_state = result[0]
         hidden_states = None
         pooler_out = None
@@ -193,7 +199,22 @@ def forward(self, pixel_values, **kwargs):
         )
 
 
-MODEL_PARTS_CLS_MAPPING = {}
+class OVResampler(OVModelPart):
+    _model_name = "resampler"
+
+    def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
+        super().__init__(model, parent_model, model_name=self._model_name)
+        self.output_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.outputs}
+        self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)}
+
+    def forward(self, image_feature, pos_embed, key_padding_mask):
+        result = self.request(
+            {"image_feature": image_feature, "pos_embed": pos_embed, "key_padding_mask": key_padding_mask}
+        )[0]
+        return result
+
+
+MODEL_PARTS_CLS_MAPPING = {"resampler": OVResampler}
 
 
 class OVModelForVisualCausalLM(OVBaseModel, GenerationMixin):
@@ -513,7 +534,7 @@ def _from_transformers(
             ov_config=ov_config,
             stateful=stateful,
         )
-        config = AutoConfig.from_pretrained(save_dir_path)
+        config = AutoConfig.from_pretrained(save_dir_path, trust_remote_code=trust_remote_code)
         return cls._from_pretrained(
             model_id=save_dir_path,
             config=config,
@@ -553,6 +574,8 @@ def forward(
         image_sizes=None,
         attention_mask=None,
         position_ids=None,
+        image_bound=None,
+        tgt_sizes=None,
         **kwargs,
     ):
         inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings(
@@ -562,6 +585,8 @@ def forward(
             attention_mask=attention_mask,
             position_ids=position_ids,
             past_key_values=past_key_values,
+            image_bound=image_bound,
+            tgt_sizes=tgt_sizes,
             **kwargs,
         )
         return self.language_model.forward(
@@ -628,14 +653,14 @@ def prepare_inputs_for_generation(
             elif past_length < input_ids.shape[1]:
                 input_ids = input_ids[:, past_length:]
             # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
-            elif self.config.image_token_index in input_ids:
+            elif getattr(self.config, "image_token_index", None) in input_ids:
                 input_ids = input_ids[:, input_ids.shape[1] - 1 :]
 
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
+            if past_key_values is not None:
                 position_ids = position_ids[:, -input_ids.shape[1] :]
 
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
@@ -652,6 +677,8 @@ def prepare_inputs_for_generation(
                 "attention_mask": attention_mask,
                 "pixel_values": pixel_values,
                 "image_sizes": image_sizes,
+                "image_bound": kwargs.get("image_bound"),
+                "tgt_sizes": kwargs.get("tgt_sizes"),
             }
         )
         return model_inputs
@@ -1123,8 +1150,400 @@ def merge_vision_text_embeddings(
         return input_embeds, attention_mask, position_ids
 
 
+class _OVMiniCPMVForCausalLM(OVModelForVisualCausalLM):
+    additional_parts = ["resampler"]
+
+    def __init__(
+        self,
+        language_model: ov.Model,
+        text_embeddings: ov.Model,
+        vision_embeddings: ov.Model,
+        config: PretrainedConfig = None,
+        device: str = "CPU",
+        dynamic_shapes: bool = True,
+        ov_config: Optional[Dict[str, str]] = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            language_model,
+            text_embeddings,
+            vision_embeddings,
+            config,
+            device,
+            dynamic_shapes,
+            ov_config,
+            model_save_dir,
+            quantization_config,
+            **kwargs,
+        )
+        self.embed_dim = self.language_model.config.hidden_size
+        max_size = self.config.vision_config.image_size // self.config.vision_config.patch_size
+        self._pos_embeds = torch.from_numpy(self._get_2d_sincos_pos_embed(self.embed_dim, max_size)).float()
+        self.max_size = (max_size, max_size)
+
+    def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
+        if input_ids is not None and input_ids.shape[1] == 1:
+            return None
+        tgt_sizes = kwargs["tgt_sizes"]
+        pixel_values_list = pixel_values
+        vision_hidden_states = []
+        all_pixel_values = []
+        img_cnt = []
+        for pixel_value in pixel_values_list:
+            img_cnt.append(len(pixel_value))
+            all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_value])
+
+        vision_embedding = None
+        # at least one image is present in the inputs
+        if all_pixel_values:
+            tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)]
+            tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32)
+
+            max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1])
+
+            all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True, padding_value=0.0)
+            B, L, _ = all_pixel_values.shape
+            all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
+
+            patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool)
+            for i in range(B):
+                patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True
+            position_ids = self._prepare_vis_position_ids(
+                all_pixel_values,
+                patch_attn_mask,
+                tgt_sizes,
+                self.config.vision_config.patch_size,
+                self.config.vision_config.image_size // self.config.patch_size,
+            )
+            vision_embedding = torch.from_numpy(
+                self.vision_embeddings(
+                    pixel_values=all_pixel_values, patch_attention_mask=patch_attn_mask, position_ids=position_ids
+                )[0]
+            )
+            vision_embedding = self.resampling(vision_embedding, tgt_sizes)
+
+            start = 0
+            for pixel_value in pixel_values_list:
+                img_cnt = len(pixel_value)
+                if img_cnt > 0:
+                    vision_hidden_states.append(vision_embedding[start : start + img_cnt])
+                    start += img_cnt
+                else:
+                    vision_hidden_states.append([])
+        else:  # no image
+            dummy_feature = []
+            for _ in range(len(pixel_values_list)):
+                vision_hidden_states.append(dummy_feature)
+        return vision_hidden_states
+
+    def resampling(self, x, tgt_sizes):
+        bs = x.shape[0]
+
+        patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]
+
+        self._adjust_pos_cache(tgt_sizes)
+
+        max_patch_len = torch.max(patch_len)
+        key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool)
+
+        pos_embed = []
+        for i in range(bs):
+            tgt_h, tgt_w = tgt_sizes[i]
+            pos_embed.append(self._pos_embeds[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)))  # patches * D
+            key_padding_mask[i, patch_len[i] :] = True
+
+        pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
+            1, 0, 2
+        )  # BLD => L * B * D
+        res = torch.from_numpy(self.resampler(image_feature=x, pos_embed=pos_embed, key_padding_mask=key_padding_mask))
+        return res
+
+    def _set_2d_pos_cache(self, max_size):
+        pos_embed = torch.from_numpy(self._get_2d_sincos_pos_embed(self.embed_dim, max_size)).float()
+        self._pos_embeds = pos_embed
+
+    def _adjust_pos_cache(self, tgt_sizes):
+        max_h = torch.max(tgt_sizes[:, 0])
+        max_w = torch.max(tgt_sizes[:, 1])
+        if max_h > self.max_size[0] or max_w > self.max_size[1]:
+            self.max_size = [max(max_h, self.max_size[0]), max(max_w, self.max_size[1])]
+            self._set_2d_pos_cache(self.max_size)
+
+    def _get_2d_sincos_pos_embed(self, embed_dim, image_size):
+        """
+        image_size: image_size or (image_height, image_width)
+        return:
+        pos_embed: [image_height, image_width, embed_dim]
+        """
+        if isinstance(image_size, int):
+            grid_h_size, grid_w_size = image_size, image_size
+        else:
+            grid_h_size, grid_w_size = image_size[0], image_size[1]
+
+        grid_h = np.arange(grid_h_size, dtype=np.float32)
+        grid_w = np.arange(grid_w_size, dtype=np.float32)
+        grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+        grid = np.stack(grid, axis=0)
+
+        pos_embed = self._get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+        return pos_embed
+
+    def _get_2d_sincos_pos_embed_from_grid(self, embed_dim, grid):
+        assert embed_dim % 2 == 0
+
+        # use half of dimensions to encode grid_h
+        emb_h = self._get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[0])  # (H, W, D/2)
+        emb_w = self._get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[1])  # (H, W, D/2)
+
+        emb = np.concatenate([emb_h, emb_w], axis=-1)  # (H, W, D)
+        return emb
+
+    def _get_1d_sincos_pos_embed_from_grid_new(self, embed_dim, pos):
+        """
+        embed_dim: output dimension for each position
+        pos: a list of positions to be encoded: size (H, W)
+        out: (H, W, D)
+        """
+        assert embed_dim % 2 == 0
+        omega = np.arange(embed_dim // 2, dtype=np.float32)
+        omega /= embed_dim / 2.0
+        omega = 1.0 / 10000**omega  # (D/2,)
+
+        out = np.einsum("hw,d->hwd", pos, omega)  # (H, W, D/2), outer product
+
+        emb_sin = np.sin(out)  # (H, W, D/2)
+        emb_cos = np.cos(out)  # (H, W, D/2)
+
+        emb = np.concatenate([emb_sin, emb_cos], axis=-1)  # (H, W, D)
+        return emb
+
+    def _prepare_vis_position_ids(
+        self, pixel_values, patch_attention_mask, tgt_sizes, patch_size, num_patches_per_side
+    ):
+        batch_size = pixel_values.size(0)
+        max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3)
+        max_nb_patches_h, max_nb_patches_w = max_im_h // patch_size, max_im_w // patch_size
+        boundaries = torch.arange(1 / num_patches_per_side, 1.0, 1 / num_patches_per_side)
+        position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0)
+
+        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+            if tgt_sizes is not None:
+                nb_patches_h = tgt_sizes[batch_idx][0]
+                nb_patches_w = tgt_sizes[batch_idx][1]
+            else:
+                nb_patches_h = p_attn_mask[:, 0].sum()
+                nb_patches_w = p_attn_mask[0].sum()
+
+            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
+            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
+
+            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
+            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
+
+            pos_ids = (bucket_coords_h[:, None] * num_patches_per_side + bucket_coords_w).flatten()
+            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
+
+        return position_ids
+
+    def merge_vision_text_embeddings(
+        self, vision_embeds, input_embeds, input_ids, attention_mask, position_ids=None, **kwargs
+    ):
+        bs = input_ids.shape[0]
+        image_bound = kwargs["image_bound"]
+        vllm_embedding = torch.from_numpy(input_embeds)
+        for i in range(bs):
+            cur_vs_hs = vision_embeds[i]
+            if len(cur_vs_hs) > 0:
+                cur_vllm_emb = vllm_embedding[i]
+                cur_image_bound = image_bound[i]
+                if len(cur_image_bound) > 0:
+                    image_indices = torch.stack([torch.arange(r[0], r[1], dtype=torch.long) for r in cur_image_bound])
+
+                    cur_vllm_emb.scatter_(
+                        0,
+                        image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]),
+                        cur_vs_hs.view(-1, cur_vs_hs.shape[-1]),
+                    )
+        return vllm_embedding, attention_mask, position_ids
+
+
+class _OVNanoLlavaForCausalLM(OVModelForVisualCausalLM):
+    def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
+        if input_ids is not None and input_ids.shape[1] == 1:
+            return None
+        if isinstance(pixel_values, list) or pixel_values.ndim == 5:
+            concat_images = torch.cat(pixel_values, dim=0) if isinstance(pixel_values, list) else pixel_values
+            image_features = torch.from_numpy(self.vision_embeddings(concat_images).last_hidden_state)
+            split_sizes = [image.shape[0] for image in pixel_values]
+            image_features = torch.split(image_features, split_sizes, dim=0)
+            image_features = [x.flatten(0, 1).to(self.device) for x in image_features]
+        else:
+            image_features = self.vision_embeddings(pixel_values).last_hidden_state
+
+        return image_features
+
+    def get_multimodal_embeddings(
+        self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, **kwargs
+    ):
+        vision_embeds = None
+        IGNORE_INDEX = -100
+        IMAGE_TOKEN_INDEX = -200
+        if pixel_values is not None:
+            vision_embeds = self.get_vision_embeddings(pixel_values, input_ids=input_ids, **kwargs)
+        if vision_embeds is None:
+            inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids))
+            past_len = self.language_model._get_past_length(kwargs.get("past_key_values"))
+            if attention_mask is not None and attention_mask.shape[1] < past_len + input_ids.shape[1]:
+                attention_mask = torch.cat(
+                    [
+                        attention_mask,
+                        torch.ones(attention_mask.shape[0], past_len + input_ids.shape[1] - attention_mask.shape[1]),
+                    ],
+                    dim=1,
+                )
+                position_ids = None
+            return inputs_embeds, attention_mask, position_ids
+
+        vision_embeds = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.long)
+        if position_ids is None:
+            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+        labels = torch.full_like(input_ids, IGNORE_INDEX)
+
+        # remove the padding using attention_mask -- TODO: double check
+        input_ids = [
+            cur_input_ids[cur_attention_mask]
+            for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask.bool())
+        ]
+        labels = [
+            cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask.bool())
+        ]
+
+        new_input_embeds = []
+        new_labels = []
+        cur_image_idx = 0
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
+            if num_images == 0:
+                cur_image_features = vision_embeds[cur_image_idx]
+                cur_input_embeds_1 = torch.from_numpy(self.get_text_embeddings(cur_input_ids.unsqueeze(0))[0])
+                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
+                new_input_embeds.append(cur_input_embeds)
+                new_labels.append(labels[batch_idx])
+                cur_image_idx += 1
+                continue
+
+            image_token_indices = (
+                [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
+            )
+            cur_input_ids_noim = []
+            cur_labels = labels[batch_idx]
+            cur_labels_noim = []
+            for i in range(len(image_token_indices) - 1):
+                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
+                cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]])
+            split_sizes = [x.shape[0] for x in cur_labels_noim]
+            cur_input_embeds = torch.from_numpy(
+                self.get_text_embeddings(torch.cat(cur_input_ids_noim).unsqueeze(0))[0]
+            )
+            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
+            cur_new_input_embeds = []
+            cur_new_labels = []
+
+            for i in range(num_images + 1):
+                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
+                cur_new_labels.append(cur_labels_noim[i])
+                if i < num_images:
+                    cur_image_features = vision_embeds[cur_image_idx]
+                    cur_image_idx += 1
+                    cur_new_input_embeds.append(cur_image_features)
+                    cur_new_labels.append(
+                        torch.full(
+                            (cur_image_features.shape[0],),
+                            IGNORE_INDEX,
+                            device=cur_labels.device,
+                            dtype=cur_labels.dtype,
+                        )
+                    )
+
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
+            cur_new_labels = torch.cat(cur_new_labels)
+
+            new_input_embeds.append(cur_new_input_embeds)
+            new_labels.append(cur_new_labels)
+
+        # Truncate sequences to max length as image embeddings can make the sequence longer
+        tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
+        if tokenizer_model_max_length is not None:
+            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
+            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
+
+        # Combine them
+        max_len = max(x.shape[0] for x in new_input_embeds)
+        batch_size = len(new_input_embeds)
+
+        new_input_embeds_padded = []
+        new_labels_padded = torch.full(
+            (batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device
+        )
+        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
+        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
+
+        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
+            cur_len = cur_new_embed.shape[0]
+            if getattr(self.config, "tokenizer_padding_side", "right") == "left":
+                new_input_embeds_padded.append(
+                    torch.cat(
+                        (
+                            torch.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                                device=cur_new_embed.device,
+                            ),
+                            cur_new_embed,
+                        ),
+                        dim=0,
+                    )
+                )
+                if cur_len > 0:
+                    new_labels_padded[i, -cur_len:] = cur_new_labels
+                    attention_mask[i, -cur_len:] = True
+                    position_ids[i, -cur_len:] = torch.arange(
+                        0, cur_len, dtype=position_ids.dtype, device=position_ids.device
+                    )
+            else:
+                new_input_embeds_padded.append(
+                    torch.cat(
+                        (
+                            cur_new_embed,
+                            torch.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                                device=cur_new_embed.device,
+                            ),
+                        ),
+                        dim=0,
+                    )
+                )
+                if cur_len > 0:
+                    new_labels_padded[i, :cur_len] = cur_new_labels
+                    attention_mask[i, :cur_len] = True
+                    position_ids[i, :cur_len] = torch.arange(
+                        0, cur_len, dtype=position_ids.dtype, device=position_ids.device
+                    )
+
+        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
+
+        return new_input_embeds, attention_mask, position_ids
+
+
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
     "llava_next": _OVLlavaNextForCausalLM,
     "internvl_chat": _OvInternVLForCausalLM,
+    "minicpmv": _OVMiniCPMVForCausalLM,
 }
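
The _get_*_sincos_pos_embed* helpers above implement the standard 2D sine/cosine positional embedding; a small standalone numpy sketch of the same construction, for illustration only:

    import numpy as np

    def sincos_1d(embed_dim, pos):
        # pos: (H, W) grid of positions -> (H, W, embed_dim) embedding
        omega = np.arange(embed_dim // 2, dtype=np.float32) / (embed_dim / 2.0)
        omega = 1.0 / 10000**omega
        out = np.einsum("hw,d->hwd", pos, omega)
        return np.concatenate([np.sin(out), np.cos(out)], axis=-1)

    grid_w, grid_h = np.meshgrid(np.arange(4, dtype=np.float32), np.arange(3, dtype=np.float32))
    emb = np.concatenate([sincos_1d(32, grid_w), sincos_1d(32, grid_h)], axis=-1)
    print(emb.shape)  # (3, 4, 64): one 64-dim embedding per (h, w) grid position
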
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 8b4258adf..0dcfaac71 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -50,6 +50,7 @@
     AutoModelForSpeechSeq2Seq,
     AutoModelForTokenClassification,
     AutoModelForVision2Seq,
+    AutoProcessor,
     AutoTokenizer,
     GenerationConfig,
     Pix2StructForConditionalGeneration,
@@ -1876,12 +1877,14 @@ def test_compare_with_and_without_past_key_values(self):
 
 
 class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = [
-        "llava",
-    ]
+    SUPPORTED_ARCHITECTURES = ["llava"]
+
+    REMOTE_CODE_MODELS = ["minicpmv"]
 
     if is_transformers_version(">=", "4.40.0"):
         SUPPORTED_ARCHITECTURES += ["llava_next"]
+    if is_transformers_version(">=", "4.45.0"):
+        SUPPORTED_ARCHITECTURES += ["minicpmv"]
     TASK = "image-text-to-text"
 
     IMAGE = Image.open(
@@ -1900,19 +1903,50 @@ def get_transformer_model_class(self, model_arch):
             from transformers import LlavaNextForConditionalGeneration
 
             return LlavaNextForConditionalGeneration
-        return None
+        return AutoModelForCausalLM
+
+    def gen_inputs(self, model_arch, base_text_prompt, image=None):
+        model_id = MODEL_NAMES[model_arch]
+        if "llava" in model_arch:
+            prompt = f"\n {base_text_prompt}"
+        elif "minicpmv" in model_arch:
+            prompt = "<|im_start|>user\n(./)\n {base_text_prompt}<|im_end|>\n<|im_start|>assistant\n"
+        if model_arch != "nanollava":
+            processor = AutoProcessor.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            inputs = processor(images=[self.IMAGE.resize((600, 600))], text=[prompt], return_tensors="pt")
+        else:
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+            processor = AutoProcessor.from_pretrained(
+                config.mm_vision_tower, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            image_input = None
+            if image is not None:
+                image_input = processor(images=image, return_tensors="pt")["pixel_values"]
+            text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
+
+            input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+            attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
+            inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "images": image_input}
+        return inputs
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
-        prompt = "\n What is shown in this image?"
         model_id = MODEL_NAMES[model_arch]
-        processor = get_preprocessor(model_id)
-        transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(model_id)
-        inputs = processor(images=self.IMAGE, text=prompt, return_tensors="pt")
-        set_seed(SEED)
-        with torch.no_grad():
-            transformers_outputs = transformers_model(**inputs)
-        ov_model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
+        transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(
+            model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
+        if "nanollava" in model_arch:
+            transformers_model.get_vision_tower().load_model()
+        inputs = self.gen_inputs(model_arch, "What is shown on this image?", self.IMAGE)
+
+        ov_model = OVModelForVisualCausalLM.from_pretrained(
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
         self.assertIsInstance(ov_model, MODEL_TYPE_TO_CLS_MAPPING[ov_model.config.model_type])
         self.assertIsInstance(ov_model.vision_embeddings, OVVisionEmbedding)
         self.assertIsInstance(ov_model.language_model, OVModelWithEmbedForCausalLM)
@@ -1920,8 +1954,13 @@ def test_compare_to_transformers(self, model_arch):
             self.assertTrue(hasattr(ov_model, additional_part))
             self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part])
         self.assertIsInstance(ov_model.config, PretrainedConfig)
-        ov_outputs = ov_model(**inputs)
-        self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4))
+        # pytorch minicpmv is not designed to be used via forward
+        if "minicpmv" not in model_arch:
+            set_seed(SEED)
+            with torch.no_grad():
+                transformers_outputs = transformers_model(**inputs)
+            ov_outputs = ov_model(**inputs)
+            self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4))
 
         ov_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
@@ -1930,7 +1969,6 @@ def test_compare_to_transformers(self, model_arch):
         gen_config = GenerationConfig(
             max_new_tokens=30,
             min_new_tokens=30,
-            num_beams=3,
             do_sample=False,
             eos_token_id=None,
         )
@@ -1938,6 +1976,9 @@ def test_compare_to_transformers(self, model_arch):
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         set_seed(SEED)
         transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
+        # the original minicpmv always skips input tokens in generation results, while the transformers-based approach keeps them
+        if model_arch == "minicpmv":
+            ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         self.assertTrue(
             torch.equal(ov_outputs, transformers_outputs),
             f"generation config : {gen_config}, transformers output {transformers_outputs}, ov_model output {ov_outputs}",
@@ -1951,20 +1992,25 @@ def test_compare_to_transformers(self, model_arch):
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_generate_utils(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
-        model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
-        preprocessor = get_preprocessor(model_id)
-        question = "<image>\nDescribe image"
-        inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt")
-
+        model = OVModelForVisualCausalLM.from_pretrained(
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+        inputs = self.gen_inputs(model_arch, "What is shown on this image?", self.IMAGE)
         # General case
         outputs = model.generate(**inputs, max_new_tokens=10)
-        outputs = preprocessor.batch_decode(outputs, skip_special_tokens=True)
+        # filter out the original prompt because it may contain out-of-vocabulary tokens, e.g. nanollava's text separator = -200
+        outputs = outputs[:, inputs["input_ids"].shape[1] :]
+        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
         self.assertIsInstance(outputs[0], str)
 
+        # No input image case
         question = "Hi, how are you?"
-        inputs = preprocessor(images=None, text=question, return_tensors="pt")
+        inputs = self.gen_inputs(model_arch, question, None)
         outputs = model.generate(**inputs, max_new_tokens=10)
-        outputs = preprocessor.batch_decode(outputs, skip_special_tokens=True)
+        # filter out the original prompt because it may contain out-of-vocabulary tokens, e.g. nanollava's text separator = -200
+        outputs = outputs[:, inputs["input_ids"].shape[1] :]
+        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
         self.assertIsInstance(outputs[0], str)
         del model
 
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index e5a9f73a6..ec0ca3981 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -84,6 +84,7 @@
     "marian": "sshleifer/tiny-marian-en-de",
     "mbart": "hf-internal-testing/tiny-random-mbart",
     "minicpm": "katuni4ka/tiny-random-minicpm",
+    "minicpmv": "katuni4ka/tiny-random-minicpmv-2_6",
     "mistral": "echarlaix/tiny-random-mistral",
     "mistral-nemo": "katuni4ka/tiny-random-mistral-nemo",
     "mixtral": "TitanML/tiny-mixtral",

From f08e8b7b9e8812f8904ffbb49e8c469ffbfdcf78 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Tue, 5 Nov 2024 15:20:43 +0400
Subject: [PATCH 33/53] add support of nanollava model (#969)
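
A rough usage sketch of the added support (the tiny checkpoint, prompt and decoding below are
illustrative and mirror the new tests; they are not part of the patch itself):

    from transformers import AutoTokenizer
    from optimum.intel import OVModelForVisualCausalLM

    model_id = "katuni4ka/tiny-random-nanollava"  # tiny test checkpoint, assumed for illustration
    model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # text-only generation; image inputs are built as in gen_inputs() in the tests
    inputs = tokenizer("Hi, how are you?", return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=10)
    print(tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True))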

---
 optimum/exporters/openvino/model_configs.py   | 162 +++++++++++++++++-
 optimum/exporters/openvino/model_patcher.py   |  18 ++
 optimum/exporters/openvino/utils.py           |   2 +-
 .../openvino/modeling_visual_language.py      |  25 ++-
 tests/openvino/test_modeling.py               |   5 +-
 tests/openvino/utils_tests.py                 |   1 +
 6 files changed, 200 insertions(+), 13 deletions(-)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 108deed57..9dbcacb7f 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -17,7 +17,7 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from packaging import version
-from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel
+from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, TFPreTrainedModel
 from transformers.utils import is_tf_available
 
 from optimum.exporters.onnx.config import OnnxConfig, TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig
@@ -75,6 +75,7 @@
     JaisModelPatcher,
     LlamaModelPatcher,
     LlavaImageEmbeddingModelPatcher,
+    LlavaQwen2ImageEmbeddingsModelPatcher,
     MiniCPMVImageEmbeddingsModelPatcher,
     MiniCPMVResamplerModelPatcher,
     MistralModelPatcher,
@@ -1579,6 +1580,165 @@ def patch_model_for_export(
         return InternVLChatImageEmbeddingModelPatcher(self, model, model_kwargs)
 
 
+@register_in_tasks_manager(
+    "llava-qwen2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers"
+)
+class LlavaQwen2OpenVINOConfig(OnnxConfig):
+    SUPPORTS_PAST = True
+    MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
+    SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaConfigBehavior]
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
+
+    def __init__(
+        self,
+        config: "PretrainedConfig",
+        task: str = "feature-extraction",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+        behavior: LlavaConfigBehavior = LlavaConfigBehavior.VISION_EMBEDDINGS,
+        preprocessors: Optional[List[Any]] = None,
+        use_past: bool = False,
+    ):
+        self._behavior = behavior
+        self._orig_config = config
+        if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            config = AutoConfig.from_pretrained(config.mm_vision_tower, trust_remote_code=True)
+            if hasattr(config, "vision_config"):
+                config = config.vision_config
+        super().__init__(
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+        )
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        if not self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return {}
+        return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}}
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        if not self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return {}
+        return {"last_hidden_state": {0: "batch_size"}}
+
+    def get_model_for_behavior(self, model, behavior: Union[str, LlavaConfigBehavior]):
+        if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
+            behavior = LlavaConfigBehavior(behavior)
+
+        if behavior == LlavaConfigBehavior.LANGUAGE:
+            model.forward = super(type(model), model).forward
+            return model
+
+        if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return model
+
+        if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
+            text_embedding = model.model.embed_tokens
+            text_embedding.config = model.model.config
+            return text_embedding
+
+    def with_behavior(
+        self,
+        behavior: Union[str, LlavaConfigBehavior],
+    ):
+        """
+        Creates a config for a different behavior.
+        Args:
+            behavior ([`ConfigBehavior`]):
+                The behavior to use for the new instance.
+        """
+        if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
+            behavior = LlavaConfigBehavior(behavior)
+
+        if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
+            model_type = self._orig_config.model_type.replace("llava-", "")
+            model_type = model_type.replace("_", "-")
+            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
+                raise ValueError(
+                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
+                )
+
+            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
+                raise ValueError(
+                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
+                )
+            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
+                "text-generation-with-past"
+            ]
+            internal_export_config = internal_export_config_class(
+                self._orig_config,
+                use_past=True,
+                use_past_in_inputs=True,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
+            export_config = InputEmbedOpenvVINOConfig(
+                self._orig_config,
+                task="feature-extraction",
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            return export_config
+
+        if behavior == LlavaConfigBehavior.LANGUAGE:
+            model_type = self._orig_config.model_type.replace("llava-", "")
+            model_type = model_type.replace("_", "-")
+
+            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
+                raise ValueError(
+                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
+                )
+
+            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
+                raise ValueError(
+                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
+                )
+            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
+                "text-generation-with-past"
+            ]
+            internal_export_config = internal_export_config_class(
+                self._orig_config,
+                use_past=True,
+                use_past_in_inputs=True,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+            )
+            export_config = LMInputEmbedsConfigHelper(internal_export_config)
+            export_config._normalized_config = internal_export_config._normalized_config
+            return export_config
+
+        if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        model_kwargs = model_kwargs or {}
+        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return super().patch_model_for_export(model, model_kwargs)
+        return LlavaQwen2ImageEmbeddingsModelPatcher(self, model, model_kwargs)
+
+    def rename_ambiguous_inputs(self, inputs):
+        if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            model_inputs = {}
+            model_inputs["images"] = inputs["pixel_values"]
+            return model_inputs
+        return super().rename_ambiguous_inputs(inputs)
+
+
 class PooledProjectionsDummyInputGenerator(DummyInputGenerator):
     SUPPORTED_INPUT_NAMES = ["pooled_projections"]
 
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index b1aa7eaa9..8507d94fe 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -2973,3 +2973,21 @@ def __exit__(self, exc_type, exc_value, traceback):
         if is_torch_version(">=", "2.0.0"):
             for layer in self._model.encoder.layers:
                 layer.self_attn.forward = layer.self_attn._orig_forward
+
+
+class LlavaQwen2ImageEmbeddingsModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
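+        # swap forward for encode_images so the export traces pixel values -> image embeddings only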
+        model.__orig_forward = model.forward
+        model.forward = model.encode_images
+        super().__init__(config, model, model_kwargs)
+        if not self._model.get_vision_tower().is_loaded:
+            self._model.get_vision_tower().load_model()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py
index 35e0c3017..9286a37f7 100644
--- a/optimum/exporters/openvino/utils.py
+++ b/optimum/exporters/openvino/utils.py
@@ -208,4 +208,4 @@ def get_submodels(model):
     return custom_export, fn_get_submodels
 
 
-MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "internvl-chat", "minicpmv"]
+MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv"]
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index b071602d9..74d7c88d6 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -14,7 +14,7 @@
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 
 from ...exporters.openvino import main_export
-from ...exporters.openvino.stateful import ensure_stateful_is_available
+from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name
 from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel, OVModelPart
 from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM
@@ -122,8 +122,8 @@ def prepare_inputs(
             else:
                 position_ids = np.cumsum(attention_mask, axis=1) - 1
                 position_ids[attention_mask == 0] = 1
-                if past_key_values:
-                    position_ids = position_ids[:, -input_ids.shape[1] :]
+            if past_len:
+                position_ids = position_ids[:, -inputs_embeds.shape[1] :]
 
             inputs["position_ids"] = position_ids
 
@@ -177,9 +177,11 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
             self.hidden_states_output_names = [
                 key.get_any_name() for key in self.model.outputs[2:] if "hidden_states" in key.get_any_name()
             ]
+        self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)}
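+        # nanollava (llava-qwen2) exports its vision model with an "images" input; other models expose "pixel_values"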
+        self._main_input = "images" if model_has_input_output_name(self.model, "images") else "pixel_values"
 
     def forward(self, pixel_values, **kwargs):
-        inputs = {"pixel_values": pixel_values}
+        inputs = {self._main_input: pixel_values}
         if len(self.input_names) > 1:
             for name in self.input_names:
                 if name in kwargs:
@@ -568,7 +570,7 @@ def half(self):
     def forward(
         self,
         input_ids,
-        pixel_values,
+        pixel_values=None,
         past_key_values=None,
         inputs_embeds=None,
         image_sizes=None,
@@ -576,8 +578,11 @@ def forward(
         position_ids=None,
         image_bound=None,
         tgt_sizes=None,
+        images=None,
         **kwargs,
     ):
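+        # some models (e.g. nanollava) pass image tensors as "images"; treat them as pixel_values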
+        if pixel_values is None and images is not None:
+            pixel_values = images
         inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings(
             input_ids,
             pixel_values,
@@ -629,6 +634,7 @@ def get_multimodal_embeddings(
                 )
         return inputs_embeds, attention_mask, position_ids
 
+    # Adopted from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L521
     def prepare_inputs_for_generation(
         self,
         input_ids,
@@ -646,14 +652,15 @@ def prepare_inputs_for_generation(
             # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
             # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
             # input)
-            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
-                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            if attention_mask is not None and past_length + 1 > input_ids.shape[1]:
+                input_discount = max(attention_mask.shape[1] - past_length, 1)
+                input_ids = input_ids[:, -input_discount:]
             # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
             # input_ids based on the past_length.
             elif past_length < input_ids.shape[1]:
                 input_ids = input_ids[:, past_length:]
             # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
-            elif getattr(self.config, "image_token_index", None) in input_ids:
+            elif getattr(self.config, "image_token_index", -1) in input_ids:
                 input_ids = input_ids[:, input_ids.shape[1] - 1 :]
 
         position_ids = kwargs.get("position_ids", None)
@@ -679,6 +686,7 @@ def prepare_inputs_for_generation(
                 "image_sizes": image_sizes,
                 "image_bound": kwargs.get("image_bound"),
                 "tgt_sizes": kwargs.get("tgt_sizes"),
+                "images": kwargs.get("images"),
             }
         )
         return model_inputs
@@ -1546,4 +1554,5 @@ def get_multimodal_embeddings(
     "llava_next": _OVLlavaNextForCausalLM,
     "internvl_chat": _OvInternVLForCausalLM,
     "minicpmv": _OVMiniCPMVForCausalLM,
+    "llava-qwen2": _OVNanoLlavaForCausalLM,
 }
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 0dcfaac71..6c68438c7 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -1879,12 +1879,11 @@ def test_compare_with_and_without_past_key_values(self):
 class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = ["llava"]
 
-    REMOTE_CODE_MODELS = ["minicpmv"]
-
     if is_transformers_version(">=", "4.40.0"):
-        SUPPORTED_ARCHITECTURES += ["llava_next"]
+        SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"]
     if is_transformers_version(">=", "4.45.0"):
         SUPPORTED_ARCHITECTURES += ["minicpmv"]
+    REMOTE_CODE_MODELS = ["minicpmv", "nanollava"]
     TASK = "image-text-to-text"
 
     IMAGE = Image.open(
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index ec0ca3981..f062ded11 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -95,6 +95,7 @@
     "mpt": "hf-internal-testing/tiny-random-MptForCausalLM",
     "mpnet": "hf-internal-testing/tiny-random-MPNetModel",
     "mt5": "stas/mt5-tiny-random",
+    "nanollava": "katuni4ka/tiny-random-nanollava",
     "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel",
     "olmo": "katuni4ka/tiny-random-olmo-hf",
     "orion": "katuni4ka/tiny-random-orion",

From d35737671ec226a66c495159588b7603066e1b07 Mon Sep 17 00:00:00 2001
From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Date: Tue, 5 Nov 2024 14:01:01 +0100
Subject: [PATCH 34/53] Clean and clear CI (#975)

* clean and clear CI

* fix

* test

* fix transformers version installation

* fix

* less verbose

* faster

* fix

* fix

* reduce

* peft in testing

* move ST tests to test_modeling.py since it's a single test

* resolve convs

* simple

* use latest

* fix

* fix

* safety checker not a submodel

* fix for pipeline modules

* style

* added pip freeze
---
 .github/workflows/build_pr_documentation.yml  | 21 +++--
 .github/workflows/check_code_quality.yml      | 54 ------------
 .github/workflows/dockerfile_sanity.yml       | 52 ++++++------
 .github/workflows/quality.yml                 | 41 +++++++++
 .github/workflows/security.yml                |  1 +
 .github/workflows/test_generation.yml         | 48 +++++------
 .github/workflows/test_inc.yml                | 57 ++++++-------
 .github/workflows/test_ipex.yml               | 47 ++++++-----
 .github/workflows/test_offline.yaml           | 28 ++++---
 .github/workflows/test_openvino.yml           | 61 ++++++++------
 .github/workflows/test_openvino_basic.yml     | 83 -------------------
 .github/workflows/test_openvino_examples.yml  | 46 +++++-----
 .github/workflows/test_openvino_notebooks.yml | 57 ++++++-------
 .github/workflows/test_openvino_slow.yml      | 75 +++++++++++++++++
 optimum/intel/openvino/modeling_diffusion.py  | 18 ++--
 setup.py                                      | 12 +--
 tests/openvino/test_modeling.py               | 34 ++++++++
 .../test_modeling_sentence_transformers.py    | 74 -----------------
 tests/openvino/utils_tests.py                 |  2 +
 19 files changed, 385 insertions(+), 426 deletions(-)
 delete mode 100644 .github/workflows/check_code_quality.yml
 create mode 100644 .github/workflows/quality.yml
 delete mode 100644 .github/workflows/test_openvino_basic.yml
 create mode 100644 .github/workflows/test_openvino_slow.yml
 delete mode 100644 tests/openvino/test_modeling_sentence_transformers.py

diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index cab800759..916074ced 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -2,7 +2,8 @@ name: Build PR documentation
 
 on:
   pull_request:
-    branches: [ main ]
+    branches:
+      - main
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -10,7 +11,8 @@ concurrency:
 
 jobs:
   build_documentation:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
+
     env:
       COMMIT_SHA: ${{ github.event.pull_request.head.sha }}
       PR_NUMBER: ${{ github.event.number }}
@@ -18,20 +20,23 @@ jobs:
       PR_CLONE_URL: ${{ github.event.pull_request.head.repo.clone_url }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
-          repository: 'huggingface/doc-builder'
+          repository: "huggingface/doc-builder"
           path: doc-builder
 
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
-          repository: 'huggingface/optimum-intel'
+          repository: "huggingface/optimum-intel"
           path: optimum-intel
 
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.9
+
       - name: Setup environment
         run: |
-          python -m venv venv-doc
-          source venv-doc/bin/activate
           pip uninstall -y doc-builder
           cd doc-builder
           git pull origin main
diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml
deleted file mode 100644
index 4cf7d4cb0..000000000
--- a/.github/workflows/check_code_quality.yml
+++ /dev/null
@@ -1,54 +0,0 @@
-name: check_code_quality
-
-on:
-  push:
-    branches: [ main ]
-    paths:
-      - "optimum/**.py"
-      - "tests/**.py"
-      - "examples/**.py"
-
-  pull_request:
-    branches: [ main ]
-    paths:
-      - "optimum/**.py"
-      - "tests/**.py"
-      - "examples/**.py"
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  build:
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.9"]
-        os: [ubuntu-latest]
-
-    runs-on: ${{ matrix.os }}
-    steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Create and start a virtual environment
-      run: |
-        python -m venv venv
-        source venv/bin/activate
-    - name: Install dependencies
-      run: |
-        source venv/bin/activate
-        pip install --upgrade pip
-        pip install .[quality]
-
-    - name: Check style with black
-      run: |
-        source venv/bin/activate
-        black --check .
-    - name: Check style with ruff
-      run: |
-        source venv/bin/activate
-        ruff check .
diff --git a/.github/workflows/dockerfile_sanity.yml b/.github/workflows/dockerfile_sanity.yml
index 12be9a5b1..060b80ca4 100644
--- a/.github/workflows/dockerfile_sanity.yml
+++ b/.github/workflows/dockerfile_sanity.yml
@@ -5,40 +5,40 @@ on:
     branches:
       - main
     paths:
-      - 'docker/Dockerfile.intel'
- 
+      - "docker/Dockerfile.intel"
+
   pull_request:
     branches:
       - main
     paths:
-      - 'docker/Dockerfile.intel'
+      - "docker/Dockerfile.intel"
 
 jobs:
   build_and_run:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
 
     steps:
-    - name: Checkout code
-      uses: actions/checkout@v4
+      - name: Checkout code
+        uses: actions/checkout@v4
 
-    - name: Set up Docker Buildx
-      uses: docker/setup-buildx-action@v3
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
 
-    - name: Build and Run Docker Image
-      run: |
-        IMAGE_NAME="intel_image:latest"
-        docker build -f docker/Dockerfile.intel -t $IMAGE_NAME .
-        if [ $? -ne 0 ]; then
-          echo "Docker image build failed."
-          exit 1
-        fi
-        CONTAINER_ID=$(docker run -d $IMAGE_NAME tail -f /dev/null)
-        if docker inspect -f '{{.State.Running}}' $CONTAINER_ID 2>/dev/null | grep -q 'true'; then
-          echo "Container is running."
-        else
-          echo "Container failed to start."
-          docker logs $CONTAINER_ID 2>/dev/null || echo "No container ID found."
-          exit 1
-        fi
-        docker stop $CONTAINER_ID
-        docker rm $CONTAINER_ID
\ No newline at end of file
+      - name: Build and Run Docker Image
+        run: |
+          IMAGE_NAME="intel_image:latest"
+          docker build -f docker/Dockerfile.intel -t $IMAGE_NAME .
+          if [ $? -ne 0 ]; then
+            echo "Docker image build failed."
+            exit 1
+          fi
+          CONTAINER_ID=$(docker run -d $IMAGE_NAME tail -f /dev/null)
+          if docker inspect -f '{{.State.Running}}' $CONTAINER_ID 2>/dev/null | grep -q 'true'; then
+            echo "Container is running."
+          else
+            echo "Container failed to start."
+            docker logs $CONTAINER_ID 2>/dev/null || echo "No container ID found."
+            exit 1
+          fi
+          docker stop $CONTAINER_ID
+          docker rm $CONTAINER_ID
diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml
new file mode 100644
index 000000000..389503820
--- /dev/null
+++ b/.github/workflows/quality.yml
@@ -0,0 +1,41 @@
+name: Quality Checks
+on:
+  push:
+    branches:
+      - main
+      - v*-release
+  pull_request:
+    branches:
+      - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  quality:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.9
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          # .[quality] installs too many dependencies
+          # TODO: we should remove the version pinning at some point
+          pip install "black~=23.1" "ruff==0.4.4"
+
+      - name: Check style with black
+        run: |
+          black --check .
+
+      - name: Check style with ruff
+        run: |
+          ruff check .
diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
index 3c2dc94da..378c78da4 100644
--- a/.github/workflows/security.yml
+++ b/.github/workflows/security.yml
@@ -9,6 +9,7 @@ permissions:
 jobs:
   secrets:
     runs-on: ubuntu-latest
+
     steps:
       - shell: bash
         run: |
diff --git a/.github/workflows/test_generation.yml b/.github/workflows/test_generation.yml
index f67cc2c0a..cfa3fde40 100644
--- a/.github/workflows/test_generation.yml
+++ b/.github/workflows/test_generation.yml
@@ -1,12 +1,13 @@
-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
-name: Intel Generation Utils - Test
+name: Generation Utils - Test (deprecated)
 
 on:
   push:
-    branches: [ main ]
+    branches:
+      - main
+      - v*-release
   pull_request:
-    branches: [ main ]
+    branches:
+      - main
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -14,25 +15,22 @@ concurrency:
 
 jobs:
   build:
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.9", "3.12"]
-        os: [ubuntu-latest]
+    runs-on: ubuntu-22.04
 
-    runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install transformers==4.43.*
-        pip install optimum[exporters]
-        pip install .[tests]
-    - name: Test with Pytest
-      run: |
-        pytest tests/generation/
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.9
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install .[tests] transformers[testing]==4.43.*
+
+      - name: Test with Pytest
+        run: |
+          pytest tests/generation/
diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml
index e29dc83f8..c1a75a6e3 100644
--- a/.github/workflows/test_inc.yml
+++ b/.github/workflows/test_inc.yml
@@ -1,6 +1,4 @@
-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
-name: Intel Neural Compressor - Test
+name: INC - Test
 
 on:
   push:
@@ -20,31 +18,34 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.11"]
-        os: [ubuntu-latest]
+        torch-version: ["2.2.0", "2.3.*", "2.4.*"]
+
+    runs-on: ubuntu-22.04
 
-    runs-on: ${{ matrix.os }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install cmake
-        pip install py-cpuinfo
-        pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cpu
-        pip install intel-extension-for-pytorch==2.4.0
-        pip install datasets==2.19.0
-        pip install .[neural-compressor,diffusers,tests]
-        pip install peft
-
-    - name: Test with Pytest
-      run: |
-        pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0
-    - name: Test IPEX
-      run: |
-        pytest tests/neural_compressor/test_ipex.py
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.9
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install torch==${{ matrix.torch-version }} torchaudio torchvision --index-url https://download.pytorch.org/whl/cpu
+          pip install .[neural-compressor,ipex,diffusers,peft,tests] transformers[testing] intel-extension-for-pytorch==${{ matrix.torch-version }}
+
+      - if: ${{ matrix.torch-version == '2.2.0' }}
+        name: Downgrade Numpy
+        run: pip install numpy==1.*
+
+      - name: Assert versions
+        run: |
+          python -c "import torch; print(torch.__version__); assert torch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))"
+          python -c "import intel_extension_for_pytorch; print(intel_extension_for_pytorch.__version__); assert intel_extension_for_pytorch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))"
 
+      - name: Test with Pytest
+        run: |
+          pytest tests/neural_compressor
diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml
index 1c1d12870..a14fc7337 100644
--- a/.github/workflows/test_ipex.yml
+++ b/.github/workflows/test_ipex.yml
@@ -1,6 +1,4 @@
-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
-name: Intel IPEX - Test
+name: IPEX - Test
 
 on:
   push:
@@ -17,36 +15,39 @@ concurrency:
 
 jobs:
   build:
-    runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9"]
+        torch-version: ["2.2.0", "2.3.*", "2.4.*"]
         transformers-version: ["4.39.0", "4.44.*"]
-        ipex-version: ["2.2.0", "2.3.*"]
-        include:
-          - python-version: "3.10"
-            transformers-version: "4.39.0"
-            ipex-version: "2.2.0"
+
+    runs-on: ubuntu-22.04
 
     steps:
-      - uses: actions/checkout@v2
-      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version: 3.9
+
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
-          pip install torch==${{ matrix.ipex-version }} --extra-index-url https://download.pytorch.org/whl/cpu
-          pip install intel_extension_for_pytorch==${{ matrix.ipex-version }}
-          pip install Pillow parameterized
-          pip install transformers[testing]==${{ matrix.transformers-version }}
-          pip install .[ipex]
-
-      - if: ${{ matrix.ipex-version == '2.2.0' }}
+          pip install --upgrade pip
+          pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }}
+
+      - if: ${{ matrix.torch-version == '2.2.0' }}
+        name: Downgrade Numpy
         run: pip install numpy==1.*
 
+      - name: Assert versions
+        run: |
+          python -c "import torch; print(torch.__version__); assert torch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))"
+          python -c "import intel_extension_for_pytorch; print(intel_extension_for_pytorch.__version__); assert intel_extension_for_pytorch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))"
+          python -c "import transformers; print(transformers.__version__); assert transformers.__version__.startswith('${{ matrix.transformers-version }}'.replace('.*', ''))"
+
       - name: Test with Pytest
         run: |
-          pytest tests/ipex/
+          pytest tests/ipex
diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml
index 2e97a2f12..d2599faa5 100644
--- a/.github/workflows/test_offline.yaml
+++ b/.github/workflows/test_offline.yaml
@@ -1,10 +1,13 @@
-name: Offline usage / Python - Test
+name: Offline - Test
 
 on:
   push:
-    branches: [main]
+    branches:
+      - main
+      - v*-release
   pull_request:
-    branches: [main]
+    branches:
+      - main
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -12,22 +15,21 @@ concurrency:
 
 jobs:
   build:
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.9"]
-        os: [ubuntu-latest]
+    runs-on: ubuntu-22.04
 
-    runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@v3
-      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v3
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version: 3.9
+
       - name: Install dependencies
         run: |
           pip install .[tests,openvino]
+
       - name: Test
         run: |
           HF_HOME=/tmp/ huggingface-cli download hf-internal-testing/tiny-random-gpt2
diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml
index bfec51e48..e2889cb4e 100644
--- a/.github/workflows/test_openvino.yml
+++ b/.github/workflows/test_openvino.yml
@@ -1,5 +1,3 @@
-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 name: OpenVINO - Test
 
 on:
@@ -20,40 +18,55 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.12"]
+        test-pattern:
+          [
+            "*modeling*",
+            "*diffusion*",
+            "*quantization*",
+            "*training*",
+            "*export*",
+          ]
         transformers-version: ["4.36.0", "latest"]
-        os: [ubuntu-latest]
 
-    runs-on: ${{ matrix.os }}
+    runs-on: ubuntu-22.04
+
     steps:
-      - uses: actions/checkout@v4
-      - name: Setup Python ${{ matrix.python-version }}
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Install lowest compatible transformers version
-        if: ${{ matrix.transformers-version != 'latest' }}
-        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.*
+          python-version: 3.9
 
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
-          # install PyTorch CPU version to avoid installing CUDA packages on GitHub runner without GPU
+          pip install --upgrade pip
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
+          pip install .[openvino,openvino-tokenizers,diffusers,tests] transformers[testing]
+
+      - if: ${{ matrix.transformers-version != 'latest' }}
+        name: Downgrade Transformers and Accelerate
+        run: |
+          pip install transformers==${{ matrix.transformers-version }} accelerate==0.*
+
+      - if: ${{ matrix.test-pattern == '*modeling*' }}
+        name: Uninstall NNCF
+        run: |
+          pip uninstall -y nncf
 
       - name: Test with Pytest
+        run: |
+          pytest tests/openvino/${{ matrix.test-pattern }} --durations=0
         env:
           HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+
+      - if: ${{ matrix.test-pattern == '*modeling*' }}
+        name: Install dependencies (nightly)
         run: |
-          pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
-      - name: Test basic
-        run: |
-          pip uninstall -y nncf
-          pytest tests/openvino/test_modeling_basic.py
-      - name: Test openvino-nightly
+          pip install --upgrade --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+
+      - if: ${{ matrix.test-pattern == '*modeling*' }}
+        name: Test with Pytest (nightly)
         run: |
-          pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-          python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)"
-          optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov
+          pytest tests/openvino/test_modeling_basic.py --durations=0
diff --git a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml
deleted file mode 100644
index 7ea6898fa..000000000
--- a/.github/workflows/test_openvino_basic.yml
+++ /dev/null
@@ -1,83 +0,0 @@
-name: OpenVINO - Basic Test
-
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: "41 1 * * *" # run every day at 1:41
-  push:
-    branches:
-      - v*-release
-  pull_request:
-    types: [opened, synchronize, reopened, labeled]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  build:
-    if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') ||  (github.event_name == 'push') || contains( github.event.pull_request.labels.*.name, 'openvino-test') }}
-    strategy:
-      fail-fast: false
-      matrix:
-        # Testing lower and upper bound of supported Python versions
-        # This also ensures that the test fails if dependencies break for Python 3.7
-        python-version: ["3.9", "3.12"]
-        os: ["ubuntu-22.04", "windows-latest"]
-        transformers-version: ["latest"]
-        openvino: ["openvino openvino-tokenizers"]
-        nncf: ["nncf"]
-        include:
-          - python-version: "3.12"
-            os: "ubuntu-22.04"
-            transformers-version: "4.36.0"
-            openvino: "openvino openvino-tokenizers"
-            nncf: "nncf"
-          - python-version: "3.12"
-            os: "ubuntu-22.04"
-            transformers-version: "latest"
-            openvino: "--pre -U openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly"
-            nncf: "nncf"
-          - python-version: "3.12"
-            os: "ubuntu-22.04"
-            transformers-version: "latest"
-            openvino: "--pre -U openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly"
-            nncf: "git+https://github.com/openvinotoolkit/nncf.git"
-
-    runs-on: ${{ matrix.os }}
-
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Install lowest compatible transformers version
-        if: ${{ matrix.transformers-version != 'latest' }}
-        run: pip install transformers==${{ matrix.transformers-version }}
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages
-          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          # Install openvino manually to prevent dependency conflicts when .[openvino] pins
-          # optimum or transformers to a specific version
-          pip install ${{ matrix.openvino }}
-          pip install .[tests]
-
-      - name: Pip freeze
-        run: pip freeze
-
-      - name: Test with Pytest
-        run: |
-          pytest tests/openvino/test_modeling_basic.py
-
-      - name: Slow tests
-        run: |
-          pip install ${{ matrix.nncf }}
-          pytest tests/openvino -s -m "run_slow" --durations=0
-        env:
-          RUN_SLOW: 1
-          HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
diff --git a/.github/workflows/test_openvino_examples.yml b/.github/workflows/test_openvino_examples.yml
index 872058d24..5b1e8e9df 100644
--- a/.github/workflows/test_openvino_examples.yml
+++ b/.github/workflows/test_openvino_examples.yml
@@ -3,15 +3,15 @@ name: OpenVINO - Examples Test
 on:
   workflow_dispatch:
   schedule:
-    - cron: 0 1 * * 1  # run weekly: every Monday at 1am
+    - cron: 0 1 * * 1 # run weekly: every Monday at 1am
   push:
     paths:
-    - '.github/workflows/test_openvino_examples.yml'
-    - 'examples/openvino/**'
+      - ".github/workflows/test_openvino_examples.yml"
+      - "examples/openvino/**"
   pull_request:
     paths:
-    - '.github/workflows/test_openvino_examples.yml'
-    - 'examples/openvino/**'
+      - ".github/workflows/test_openvino_examples.yml"
+      - "examples/openvino/**"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -27,20 +27,22 @@ jobs:
     runs-on: ubuntu-22.04
 
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-
-    - name: Install dependencies
-      run: |
-        pip install .[openvino] jstyleson pytest
-        pip install -r examples/openvino/audio-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
-        pip install -r examples/openvino/image-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
-        pip install -r examples/openvino/question-answering/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
-        pip install -r examples/openvino/text-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
-
-    - name: Test examples
-      run: |
-        python -m pytest examples/openvino/test_examples.py
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          pip install -r examples/openvino/audio-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -r examples/openvino/image-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -r examples/openvino/question-answering/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -r examples/openvino/text-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install .[openvino] jstyleson pytest
+
+      - name: Test examples
+        run: |
+          pytest examples/openvino/test_examples.py
diff --git a/.github/workflows/test_openvino_notebooks.yml b/.github/workflows/test_openvino_notebooks.yml
index 24eb3b4f1..8e3095b67 100644
--- a/.github/workflows/test_openvino_notebooks.yml
+++ b/.github/workflows/test_openvino_notebooks.yml
@@ -3,16 +3,15 @@ name: OpenVINO - Notebooks Test
 on:
   workflow_dispatch:
   schedule:
-    - cron:  '14 3 * * 1'  # run weekly: every Monday at 3:14
+    - cron: "14 3 * * 1" # run weekly: every Monday at 3:14
   push:
     paths:
-    - '.github/workflows/test_openvino_notebooks.yml'
-    - 'notebooks/openvino/*'
+      - ".github/workflows/test_openvino_notebooks.yml"
+      - "notebooks/openvino/*"
   pull_request:
     paths:
-    - '.github/workflows/test_openvino_notebooks.yml'
-    - 'notebooks/openvino/*'
-
+      - ".github/workflows/test_openvino_notebooks.yml"
+      - "notebooks/openvino/*"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -28,28 +27,24 @@ jobs:
     runs-on: ubuntu-22.04
 
     steps:
-    - uses: actions/checkout@v2
-    - name: Setup Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-
-    - name: Install dependencies
-      run: |
-        # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages
-        # ffmpeg, torchaudio and pillow are required for image classification and audio classification pipelines
-        sudo apt-get install ffmpeg
-        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-        pip install -r notebooks/openvino/requirements.txt
-        pip install .[tests,openvino] nbval
-
-    - run: free -h
-    - run: lscpu
-    - run: pip freeze
-
-    - name: Test with Pytest
-      run: |
-        sed -i 's/NUM_TRAIN_ITEMS = 600/NUM_TRAIN_ITEMS = 10/' notebooks/openvino/question_answering_quantization.ipynb
-        sed -i 's/# %pip install/%pip install/' notebooks/openvino/optimum_openvino_inference.ipynb
-        python -m pytest --nbval-lax notebooks/openvino/optimum_openvino_inference.ipynb  notebooks/openvino/question_answering_quantization.ipynb
-
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          # ffmpeg is required for image classification and audio classification pipelines
+          sudo apt-get install ffmpeg
+          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install -r notebooks/openvino/requirements.txt
+          pip install .[tests,openvino] nbval
+
+      - name: Test with Pytest
+        run: |
+          sed -i 's/NUM_TRAIN_ITEMS = 600/NUM_TRAIN_ITEMS = 10/' notebooks/openvino/question_answering_quantization.ipynb
+          sed -i 's/# %pip install/%pip install/' notebooks/openvino/optimum_openvino_inference.ipynb
+          python -m pytest --nbval-lax notebooks/openvino/optimum_openvino_inference.ipynb  notebooks/openvino/question_answering_quantization.ipynb
diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml
new file mode 100644
index 000000000..bf52413a7
--- /dev/null
+++ b/.github/workflows/test_openvino_slow.yml
@@ -0,0 +1,75 @@
+name: OpenVINO - Slow Test
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "41 1 * * *" # run every day at 1:41
+  push:
+    branches:
+      - v*-release
+  pull_request:
+    types:
+      - opened
+      - labeled
+      - reopened
+      - synchronize
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') ||  (github.event_name == 'push') || contains( github.event.pull_request.labels.*.name, 'openvino-test') }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: ["ubuntu-22.04", "windows-2019"]
+        openvino-version: ["stable", "nightly"]
+        transformers-version: ["4.36.0", "latest"]
+        nncf: ["nncf", "git+https://github.com/openvinotoolkit/nncf.git"]
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.9
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install .[openvino,tests] transformers[testing]
+          pip uninstall -y nncf
+
+      - if: ${{ matrix.openvino-version == 'nightly' }}
+        name: Install nightly OpenVINO
+        run: |
+          pip install openvino openvino-tokenizers --pre --upgrade --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+
+      - if: ${{ matrix.transformers-version != 'latest' }}
+        name: Downgrade Transformers and Accelerate
+        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.*
+
+      - name: Pip freeze
+        run: pip freeze
+
+      - name: Test with Pytest (basic)
+        run: |
+          pytest tests/openvino/test_modeling_basic.py
+
+      - name: Install dependencies (slow)
+        run: |
+          pip install ${{ matrix.nncf }}
+
+      - name: Test with Pytest (slow)
+        run: |
+          pytest tests/openvino -m "run_slow" --durations=0
+        env:
+          RUN_SLOW: 1
+          HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 18d8a7506..51041a2fb 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -27,8 +27,7 @@
 import numpy as np
 import openvino
 import torch
-from diffusers.configuration_utils import ConfigMixin
-from diffusers.pipelines import (
+from diffusers import (
     AutoPipelineForImage2Image,
     AutoPipelineForInpainting,
     AutoPipelineForText2Image,
@@ -41,7 +40,9 @@
     StableDiffusionXLImg2ImgPipeline,
     StableDiffusionXLInpaintPipeline,
     StableDiffusionXLPipeline,
+    pipelines,
 )
+from diffusers.configuration_utils import ConfigMixin
 from diffusers.schedulers import SchedulerMixin
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils.constants import CONFIG_NAME
@@ -409,16 +410,19 @@ def _from_pretrained(
             "tokenizer_2": None,
             "tokenizer_3": None,
             "feature_extractor": None,
-            "image_encoder": None,
             "safety_checker": None,
+            "image_encoder": None,
         }
         for name in submodels.keys():
-            if kwargs.get(name) is not None:
+            if name in kwargs:
                 submodels[name] = kwargs.pop(name)
             elif config.get(name, (None, None))[0] is not None:
-                library_name, library_classes = config.get(name)
-                library = importlib.import_module(library_name)
-                class_obj = getattr(library, library_classes)
+                module_name, module_class = config.get(name)
+                if hasattr(pipelines, module_name):
+                    module = getattr(pipelines, module_name)
+                else:
+                    module = importlib.import_module(module_name)
+                class_obj = getattr(module, module_class)
                 load_method = getattr(class_obj, "from_pretrained")
                 # Check if the module is in a subdirectory
                 if (model_save_path / name).is_dir():
diff --git a/setup.py b/setup.py
index 9ac8dce70..7ef3652f8 100644
--- a/setup.py
+++ b/setup.py
@@ -28,8 +28,8 @@
 
 INSTALL_REQUIRE = [
     "torch>=1.11",
-    "transformers>=4.36,<4.47",
     "optimum~=1.23",
+    "transformers>=4.36,<4.47",
     "datasets>=1.4.0",
     "sentencepiece",
     "setuptools",
@@ -55,18 +55,15 @@
     "tiktoken",
     "sentence-transformers",
     "open_clip_torch>=2.26.1",
+    "peft",
 ]
 
 QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"]
 
 EXTRAS_REQUIRE = {
-    "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
-    "openvino": [
-        "openvino==2024.4.1.dev20240926",
-        "nncf>=2.11.0",
-        "openvino-tokenizers[transformers]==2024.4.1.0.dev20240926",
-    ],
     "nncf": ["nncf>=2.11.0"],
+    "openvino": ["nncf>=2.11.0", "openvino==2024.4.1.dev20240926", "openvino-tokenizers==2024.4.1.0.dev20240926"],
+    "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
     "ipex": ["intel-extension-for-pytorch", "transformers>=4.39,<4.45"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
@@ -88,7 +85,6 @@
         "Intended Audience :: Education",
         "Intended Audience :: Science/Research",
         "Operating System :: OS Independent",
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 6c68438c7..169701e4a 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -32,6 +32,7 @@
 from huggingface_hub import HfApi
 from parameterized import parameterized
 from PIL import Image
+from sentence_transformers import SentenceTransformer
 from transformers import (
     AutoConfig,
     AutoFeatureExtractor,
@@ -2414,3 +2415,36 @@ def test_functions(self):
 
         del model
         gc.collect()
+
+
+class OVModelForSTFeatureExtractionIntegrationTest(unittest.TestCase):
+    SUPPORTED_ARCHITECTURES = ("st-bert", "st-mpnet")
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    def test_compare_to_transformers(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+        set_seed(SEED)
+        ov_model = OVSentenceTransformer.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
+        self.assertIsInstance(ov_model.config, PretrainedConfig)
+        self.assertTrue(hasattr(ov_model, "encode"))
+        st_model = SentenceTransformer(model_id)
+        sentences = ["This is an example sentence", "Each sentence is converted"]
+        st_embeddings = st_model.encode(sentences)
+        ov_embeddings = ov_model.encode(sentences)
+        # Compare tensor outputs
+        self.assertTrue(np.allclose(ov_embeddings, st_embeddings, atol=1e-4))
+        del st_embeddings
+        del ov_model
+        gc.collect()
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    def test_sentence_transformers_save_and_infer(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+        ov_model = OVSentenceTransformer.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
+        with TemporaryDirectory() as tmpdirname:
+            model_save_path = os.path.join(tmpdirname, "sentence_transformers_ov_model")
+            ov_model.save_pretrained(model_save_path)
+            model = OVSentenceTransformer.from_pretrained(model_save_path)
+            sentences = ["This is an example sentence", "Each sentence is converted"]
+            model.encode(sentences)
+        gc.collect()
diff --git a/tests/openvino/test_modeling_sentence_transformers.py b/tests/openvino/test_modeling_sentence_transformers.py
deleted file mode 100644
index 0ddd60ea0..000000000
--- a/tests/openvino/test_modeling_sentence_transformers.py
+++ /dev/null
@@ -1,74 +0,0 @@
-#  Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import gc
-import os
-import unittest
-
-import numpy as np
-from parameterized import parameterized
-from sentence_transformers import SentenceTransformer
-from transformers import (
-    PretrainedConfig,
-    set_seed,
-)
-
-from optimum.intel import OVSentenceTransformer
-from optimum.intel.openvino.utils import TemporaryDirectory
-
-
-SEED = 42
-
-F32_CONFIG = {"INFERENCE_PRECISION_HINT": "f32"}
-
-MODEL_NAMES = {
-    "bert": "sentence-transformers/all-MiniLM-L6-v2",
-    "mpnet": "sentence-transformers/all-mpnet-base-v2",
-}
-
-
-class OVModelForSTFeatureExtractionIntegrationTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = (
-        "bert",
-        "mpnet",
-    )
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    def test_compare_to_transformers(self, model_arch):
-        model_id = MODEL_NAMES[model_arch]
-        set_seed(SEED)
-        ov_model = OVSentenceTransformer.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
-        self.assertIsInstance(ov_model.config, PretrainedConfig)
-        self.assertTrue(hasattr(ov_model, "encode"))
-        st_model = SentenceTransformer(model_id)
-        sentences = ["This is an example sentence", "Each sentence is converted"]
-        st_embeddings = st_model.encode(sentences)
-        ov_embeddings = ov_model.encode(sentences)
-        # Compare tensor outputs
-        self.assertTrue(np.allclose(ov_embeddings, st_embeddings, atol=1e-4))
-        del st_embeddings
-        del ov_model
-        gc.collect()
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    def test_sentence_transformers_save_and_infer(self, model_arch):
-        model_id = MODEL_NAMES[model_arch]
-        ov_model = OVSentenceTransformer.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
-        with TemporaryDirectory() as tmpdirname:
-            model_save_path = os.path.join(tmpdirname, "sentence_transformers_ov_model")
-            ov_model.save_pretrained(model_save_path)
-            model = OVSentenceTransformer.from_pretrained(model_save_path)
-            sentences = ["This is an example sentence", "Each sentence is converted"]
-            model.encode(sentences)
-        gc.collect()
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index f062ded11..2d2d71330 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -150,6 +150,8 @@
     "glm4": "katuni4ka/tiny-random-glm4",
     "open-clip": "hf-internal-testing/tiny-open-clip-model",
     "open-clip-ov": "zofinka/tiny-open-clip-model",
+    "st-bert": "sentence-transformers/all-MiniLM-L6-v2",
+    "st-mpnet": "sentence-transformers/all-mpnet-base-v2",
 }
 
 

From 222748e16f2a9c4881eb4b8f68d00cad32eda4a9 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Fri, 8 Nov 2024 11:55:09 +0400
Subject: [PATCH 35/53] fix conversion for text embeddings for fp16 models
 (#968)

* fix conversion for text embeddings for fp16 models

* fix rebasing issue

* apply review comments

* Update tests/openvino/utils_tests.py
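The patcher added below follows the same wrap-and-restore pattern as the other patchers in model_patcher.py: the embeddings module's forward is temporarily replaced with a single-argument wrapper so that 16-bit tracing keeps the expected `input` signature, and the original method is restored on exit. A minimal sketch of that pattern, with illustrative names only (not taken from the patch):

    import types

    import torch

    class ForwardSwapSketch:
        # Illustrative only: expose a single-tensor forward(input) signature for
        # tracing, then restore the original forward on exit.
        def __init__(self, module: torch.nn.Module):
            self.module = module
            self._orig_forward = module.forward

            def forward(mod, input):
                return self._orig_forward(input)

            module.forward = types.MethodType(forward, module)

        def __enter__(self):
            return self.module

        def __exit__(self, exc_type, exc_value, traceback):
            self.module.forward = self._orig_forward

    # Usage sketch: wrap the text-embeddings module while exporting it, e.g.
    # with ForwardSwapSketch(text_embeddings):
    #     ov_model = openvino.convert_model(text_embeddings, example_input=example_ids)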
---
 optimum/exporters/openvino/model_configs.py |  7 +++++++
 optimum/exporters/openvino/model_patcher.py | 21 +++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 9dbcacb7f..5276ade33 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -69,6 +69,7 @@
     GptNeoxJapaneseModelPatcher,
     GptNeoxModelPatcher,
     IBertModelPatcher,
+    InputEmbeddingPatcher,
     InternLM2Patcher,
     InternLMModelPatcher,
     InternVLChatImageEmbeddingModelPatcher,
@@ -1264,6 +1265,12 @@ def rename_ambiguous_inputs(self, inputs):
         model_inputs["input"] = inputs["input_ids"]
         return model_inputs
 
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        # making the model traceable in 16 bit overrides the embeddings input signature; this patch is required to prevent that issue
+        return InputEmbeddingPatcher(self, model, model_kwargs)
+
 
 class LlavaConfigBehavior(str, enum.Enum):
     LANGUAGE = "language"
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 8507d94fe..dbbfb5662 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -2991,3 +2991,24 @@ def __init__(
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         self._model.forward = self._model.__orig_forward
+
+
+class InputEmbeddingPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+
+        def forward(self, input):
+            return self.__orig_forward(input)
+
+        model.forward = types.MethodType(forward, model)
+
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward

From c8876100f8ebf18666d2e5301d94fdb08e6002ee Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Fri, 8 Nov 2024 11:59:51 +0400
Subject: [PATCH 36/53] fix switching between legacy and new processing for
 llava (#970)

* fix switching between legacy and new processing for llava

* extend tests

* update legacy processing path

* replace llava test model

* Update tests/openvino/test_modeling.py
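After this change, legacy processing is assumed unless new-style inputs are clearly present on the prefill step. A standalone sketch of the decision made in get_multimodal_embeddings (hypothetical helper for illustration; assumes input_ids is a torch.Tensor of shape (batch, seq_len)):

    def uses_legacy_processing(input_ids, pixel_values, config, past_key_values=None):
        # New-style processing requires images on the first forward pass, a config
        # that exposes image_seq_length, and a prompt that already contains the
        # fully expanded image tokens; otherwise fall back to legacy processing.
        if pixel_values is None or past_key_values is not None or not hasattr(config, "image_seq_length"):
            return True
        num_image_tokens = (input_ids == config.image_token_index).sum(1).max()
        return bool(num_image_tokens < config.image_seq_length)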
---
 .../openvino/modeling_visual_language.py      | 98 +++++++++----------
 tests/openvino/test_modeling.py               | 57 ++++++++++-
 tests/openvino/utils_tests.py                 |  2 +-
 3 files changed, 104 insertions(+), 53 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 74d7c88d6..80095fece 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -697,6 +697,33 @@ def can_generate(self):
 
 
 class _OVLlavaForCausalLM(OVModelForVisualCausalLM):
+    def __init__(
+        self,
+        language_model: ov.Model,
+        text_embeddings: ov.Model,
+        vision_embeddings: ov.Model,
+        config: PretrainedConfig = None,
+        device: str = "CPU",
+        dynamic_shapes: bool = True,
+        ov_config: Optional[Dict[str, str]] = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            language_model=language_model,
+            text_embeddings=text_embeddings,
+            vision_embeddings=vision_embeddings,
+            config=config,
+            device=device,
+            dynamic_shapes=dynamic_shapes,
+            ov_config=ov_config,
+            model_save_dir=model_save_dir,
+            quantization_config=quantization_config,
+            **kwargs,
+        )
+        self._support_new_processing = hasattr(self.config, "image_seq_length")
+
     def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
         if input_ids is not None and input_ids.shape[1] == 1:
             return None
@@ -725,17 +752,11 @@ def merge_vision_text_embeddings(
         input_ids,
         attention_mask,
         position_ids=None,
-        legacy_processing=None,
+        legacy_processing=False,
         **kwargs,
     ):
         image_features = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds
         inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
-        if legacy_processing is None:
-            legacy_processing = (
-                not hasattr(self.config, "image_seq_length")
-                or ((input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length)
-                or (input_ids.shape[-1] == 1)
-            )
 
         if legacy_processing:
             pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
@@ -768,15 +789,6 @@ def merge_vision_text_embeddings(
             final_attention_mask = torch.zeros(
                 batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
             )
-            # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
-            # set the corresponding tensors into their correct target device.
-            target_device = inputs_embeds.device
-            batch_indices, non_image_indices, text_to_overwrite = (
-                batch_indices.to(target_device),
-                non_image_indices.to(target_device),
-                text_to_overwrite.to(target_device),
-            )
-            attention_mask = attention_mask.to(target_device)
 
             # 4. Fill the embeddings based on the mask. If we have ["hey" "", "how", "are"]
             # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
@@ -787,7 +799,7 @@ def merge_vision_text_embeddings(
                 (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device
             )
             image_to_overwrite[batch_indices, text_to_overwrite] = False
-            image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
+            image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None]
 
             if image_to_overwrite.sum() != image_features.shape[:-1].numel():
                 raise ValueError(
@@ -795,7 +807,7 @@ def merge_vision_text_embeddings(
                     f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
                 )
 
-            final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
+            final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim)
             final_attention_mask |= image_to_overwrite
             position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
 
@@ -815,11 +827,12 @@ def merge_vision_text_embeddings(
     def get_multimodal_embeddings(
         self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, past_key_values=None, **kwargs
     ):
-        legacy_processing = (
-            not hasattr(self.config, "image_seq_length")
-            or ((input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length)
-            or (input_ids.shape[-1] == 1 and pixel_values is not None)
-        )
+        if pixel_values is not None and self._support_new_processing and past_key_values is None:
+            legacy_processing = (input_ids == self.config.image_token_index).sum(
+                1
+            ).max() < self.config.image_seq_length
+        else:
+            legacy_processing = True
         inputs_embeds, attention_mask, position_ids = super().get_multimodal_embeddings(
             input_ids, pixel_values, attention_mask, position_ids, legacy_processing=legacy_processing, **kwargs
         )
@@ -830,19 +843,9 @@ def get_multimodal_embeddings(
         return inputs_embeds, attention_mask, position_ids
 
     def _filter_unattended_tokens(self, input_ids, attention_mask, past_key_values):
-        if not self.language_model.stateful:
-            first_layer_past_key_value = torch.from_numpy(past_key_values[0][0][:, :, :, 0])
-        else:
-            first_layer_past_key_value = torch.from_numpy(
-                self.language_model.request.query_state()[0].state.data[:, :, :, 0]
-            )
-
-        # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-        batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
-
         # Get the target length
         target_length = input_ids.shape[1]
-        past_length = first_layer_past_key_value.shape[-1]
+        past_length = self.language_model._get_past_length(past_key_values)
 
         extended_attention_mask = torch.ones(
             (attention_mask.shape[0], past_length),
@@ -850,18 +853,9 @@ def _filter_unattended_tokens(self, input_ids, attention_mask, past_key_values):
             device=attention_mask.device,
         )
 
-        # Filter out only the tokens that can be un-attended, this can happen
-        # if one uses Llava + Fused modules where the cache on the
-        # first iteration is already big enough, or if one passes custom cache
-        valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
-        new_batch_index = batch_index[valid_indices]
-        new_non_attended_tokens = non_attended_tokens[valid_indices]
-
-        # Zero-out the places where we don't need to attend
-        extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
-
         attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
-        position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+        position_ids = torch.cumsum(attention_mask, axis=1) - 1
+        position_ids[attention_mask == 0] = 1
         return attention_mask, position_ids
 
 
@@ -938,11 +932,13 @@ def get_multimodal_embeddings(
 
         inputs_embeds = self.get_text_embeddings(input_ids, **kwargs)
 
-        legacy_processing = (
-            not hasattr(self.config, "image_seq_length")
-            or ((input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length)
-            or (input_ids.shape[-1] == 1 and pixel_values is not None)
-        )
+        if pixel_values is not None and self._support_new_processing and past_key_values is None:
+            legacy_processing = (input_ids == self.config.image_token_index).sum(
+                1
+            ).max() < self.config.image_seq_length
+        else:
+            legacy_processing = True
+
         if pixel_values is not None and pixel_values.size(0) > 0:
             # ! infer image_num_patches from image_sizes
             image_num_patches = [
@@ -996,7 +992,7 @@ def merge_vision_text_embeddings(
         input_ids,
         attention_mask,
         position_ids=None,
-        legacy_processing=None,
+        legacy_processing=False,
         **kwargs,
     ):
         image_token_index = self.config.image_token_index
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 169701e4a..916833602 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -1983,12 +1983,67 @@ def test_compare_to_transformers(self, model_arch):
             torch.equal(ov_outputs, transformers_outputs),
             f"generation config : {gen_config}, transformers output {transformers_outputs}, ov_model output {ov_outputs}",
         )
-
         del transformers_model
         del ov_model
 
         gc.collect()
 
+    @parameterized.expand(["llava", "llava_next"])
+    @unittest.skipIf(
+        is_transformers_version("<", "4.45.0"), reason="New preprocessing available only in transformers >= 4.45"
+    )
+    def test_llava_with_new_preprocessing(self, model_arch):
+        prompt = "\n What is shown in this image?"
+        model_id = MODEL_NAMES[model_arch]
+        config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+        processor = AutoProcessor.from_pretrained(
+            model_id,
+            patch_size=config.vision_config.patch_size,
+            vision_feature_select_strategy=config.vision_feature_select_strategy,
+            trust_remote_code=model_arch in self.REMOTE_CODE_MODELS,
+        )
+        transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(model_id)
+        ov_model = OVModelForVisualCausalLM.from_pretrained(
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
+        self.assertTrue(ov_model._support_new_processing)
+        self.assertTrue(processor.patch_size is not None)
+        self.assertTrue(processor.vision_feature_select_strategy is not None)
+        inputs = processor(images=self.IMAGE, text=prompt, return_tensors="pt")
+        self.assertTrue(
+            (inputs.input_ids == ov_model.config.image_token_index).sum(1).max() >= ov_model.config.image_seq_length
+        )
+        set_seed(SEED)
+        with torch.no_grad():
+            transformers_outputs = transformers_model(**inputs)
+        set_seed(SEED)
+        ov_outputs = ov_model(**inputs)
+        self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4))
+        ov_model.generation_config.eos_token_id = None
+        transformers_model.generation_config.eos_token_id = None
+        ov_model.config.eos_token_id = None
+        transformers_model.config.eos_token_id = None
+        gen_config = GenerationConfig(
+            max_new_tokens=30,
+            min_new_tokens=30,
+            num_beams=3,
+            do_sample=False,
+            eos_token_id=None,
+        )
+        set_seed(SEED)
+        ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
+        set_seed(SEED)
+        with torch.no_grad():
+            transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
+        self.assertTrue(
+            torch.equal(ov_outputs, transformers_outputs),
+            f"generation config : {gen_config}, transformers output {transformers_outputs}, ov_model output {ov_outputs}",
+        )
+
+        del ov_model
+        del transformers_model
+        gc.collect()
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_generate_utils(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 2d2d71330..129defc82 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -76,7 +76,7 @@
     "llama": "HuggingFaceM4/tiny-random-LlamaForCausalLM",
     "llama_awq": "HuggingFaceH4/tiny-random-LlamaForCausalLM",
     "llama_gptq": "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-    "llava": "trl-internal-testing/tiny-random-LlavaForConditionalGeneration",
+    "llava": "katuni4ka/tiny-random-llava",
     "llava_next": "katuni4ka/tiny-random-llava-next",
     "m2m_100": "hf-internal-testing/tiny-random-m2m_100",
     "opt": "hf-internal-testing/tiny-random-OPTModel",

From a8e69a357772b73f456d1ef45c235c10f04f9567 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev 
Date: Mon, 11 Nov 2024 16:27:54 +0100
Subject: [PATCH 37/53] Quantization support for CausalVisualLMs (#951)

* Quantization support for CausalVisualLMs

* Tweaks

* Add tests

* Fix test

* Added a data-aware compression test for llava-next

* Add assemble_inputs() method to OVModelForVisualCausalLM

* Add support for minicpmv

* Add support for nanollava

* Add group size

* Fix test

* Added support for cli compression

* Tweak refs

* Fix test

* Rename assemble_input; fix tests

* Addressed suggested changes
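With this patch, data-aware 4-bit compression of a visual LM can be driven directly from from_pretrained. A rough sketch (model id, group size and sample count are illustrative; the dataset and processor fields are the options added here):

    from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig

    quantization_config = OVWeightQuantizationConfig(
        bits=4,
        group_size=16,
        dataset="contextual",  # predefined visual LM calibration dataset
        processor="llava-hf/llava-v1.6-mistral-7b-hf",  # hypothetical processor id
        num_samples=32,
    )
    model = OVModelForVisualCausalLM.from_pretrained(
        "llava-hf/llava-v1.6-mistral-7b-hf",  # hypothetical model id
        export=True,
        quantization_config=quantization_config,
    )
    model.save_pretrained("llava-int4-ov")

The CLI path exercised by the new tests is equivalent, e.g. passing --task image-text-to-text --weight-format int4 --dataset contextual to optimum-cli export openvino.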
---
 optimum/commands/export/openvino.py           |  15 +-
 optimum/intel/openvino/configuration.py       |  25 ++-
 .../openvino/modeling_visual_language.py      | 134 +++++++++++---
 optimum/intel/openvino/quantization.py        |  79 ++++++++-
 optimum/intel/openvino/utils.py               |   9 +
 tests/openvino/test_exporters_cli.py          |  31 +++-
 tests/openvino/test_quantization.py           | 163 ++++++++++++++----
 tests/openvino/utils_tests.py                 |  11 +-
 8 files changed, 391 insertions(+), 76 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 70d2e4885..2b031bad9 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -329,11 +329,18 @@ def run(self):
             model.save_pretrained(self.args.output)
             if not self.args.disable_convert_tokenizer:
                 maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
-        elif task.startswith("text-generation") and quantize_with_dataset:
-            from optimum.intel import OVModelForCausalLM
+        elif (task.startswith("text-generation") or task == "image-text-to-text") and quantize_with_dataset:
+            if task.startswith("text-generation"):
+                from optimum.intel import OVModelForCausalLM
 
-            # To quantize a text-generation model with a dataset, an instantiated OVModelForCausalLM is required
-            model = OVModelForCausalLM.from_pretrained(
+                model_cls = OVModelForCausalLM
+            else:
+                from optimum.intel import OVModelForVisualCausalLM
+
+                model_cls = OVModelForVisualCausalLM
+
+            # To quantize a model with a dataset, an instance of a model class is required
+            model = model_cls.from_pretrained(
                 self.args.model,
                 export=True,
                 quantization_config=quantization_config,
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 403498ff7..6892d9308 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -26,6 +26,7 @@
 from optimum.configuration_utils import BaseConfig
 
 from ..utils.import_utils import is_nncf_available
+from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
 
 
 if is_nncf_available():
@@ -350,6 +351,11 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
         gptq (`bool`, *optional*):
             Whether to apply GPTQ algorithm. GPTQ optimizes compressed weights in a layer-wise fashion to minimize the
             difference between activations of a compressed and original layer. Dataset is required to run GPTQ.
+        processor (`str`, *optional*):
+            A transformers processor used to process inputs for multi-modal models. You can pass either:
+                - A string, the *model id* of a predefined processor hosted inside a model repo on huggingface.co.
+                - A path to a *directory* containing files required by the processor, for instance saved
+                    using the [`~AutoProcessor.save_pretrained`] method, e.g., `./my_model_directory/`.
     """
 
     def __init__(
@@ -369,6 +375,7 @@ def __init__(
         scale_estimation: bool = None,
         weight_format: Optional[str] = None,
         gptq: bool = None,
+        processor: Optional[str] = None,
         **kwargs,
     ):
         super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples)
@@ -383,6 +390,7 @@ def __init__(
         self.scale_estimation = scale_estimation
         self.weight_format = weight_format
         self.gptq = gptq
+        self.processor = processor
         self.post_init()
 
     def post_init(self):
@@ -400,16 +408,14 @@ def post_init(self):
                 f"If you wish to provide a custom dataset, please use the `OVQuantizer` instead."
             )
         if self.dataset is not None and isinstance(self.dataset, str):
-            llm_datasets = ["wikitext2", "c4", "c4-new"]
-            stable_diffusion_datasets = [
-                "conceptual_captions",
-                "laion/220k-GPT4Vision-captions-from-LIVIS",
-                "laion/filtered-wit",
-            ]
-            if self.dataset not in llm_datasets + stable_diffusion_datasets:
+            lm_datasets = ["wikitext2", "c4", "c4-new"]
+            visual_lm_datasets = list(PREDEFINED_VISUAL_LM_DATASETS.keys())
+            stable_diffusion_datasets = list(PREDEFINED_SD_DATASETS.keys())
+            if self.dataset not in lm_datasets + visual_lm_datasets + stable_diffusion_datasets:
                 raise ValueError(
                     f"""You have entered a string value for dataset. You can only choose between
-                    {llm_datasets} for LLLMs or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
+                    {lm_datasets} for LLMs, {visual_lm_datasets} for visual LLMs
+                    or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
                 )
 
         if self.bits not in [4, 8]:
@@ -444,6 +450,9 @@ def post_init(self):
         if self.tokenizer is not None and not isinstance(self.tokenizer, str):
             raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}")
 
+        if self.processor is not None and not isinstance(self.processor, str):
+            raise ValueError(f"Processor is expected to be a string, but found {self.processor}")
+
         if self.weight_format is None:
             self.weight_format = "int4" if self.bits == 4 else "int8"
         if self.weight_format not in ["int4", "int8", "mxfp4"]:
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 80095fece..8f72a7353 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -1,6 +1,8 @@
+import copy
 import logging
 import os
 import warnings
+from abc import abstractmethod
 from pathlib import Path
 from typing import Dict, Optional, Tuple, Union
 
@@ -10,11 +12,19 @@
 from huggingface_hub import hf_hub_download
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from openvino._offline_transformations import apply_moc_transformations, compress_model_transformation
-from transformers import AutoConfig, GenerationConfig, GenerationMixin, PretrainedConfig
+from PIL.Image import Image
+from transformers import (
+    AutoConfig,
+    GenerationConfig,
+    GenerationMixin,
+    PretrainedConfig,
+    PreTrainedTokenizer,
+)
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 
 from ...exporters.openvino import main_export
 from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name
+from .. import OVQuantizer
 from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel, OVModelPart
 from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM
@@ -181,6 +191,7 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
         self._main_input = "images" if model_has_input_output_name(self.model, "images") else "pixel_values"
 
     def forward(self, pixel_values, **kwargs):
+        self._compile()
         inputs = {self._main_input: pixel_values}
         if len(self.input_names) > 1:
             for name in self.input_names:
@@ -210,6 +221,7 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
         self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)}
 
     def forward(self, image_feature, pos_embed, key_padding_mask):
+        self._compile()
         result = self.request(
             {"image_feature": image_feature, "pos_embed": pos_embed, "key_padding_mask": key_padding_mask}
         )[0]
@@ -244,7 +256,7 @@ def __init__(
         self.ov_config = {} if ov_config is None else {**ov_config}
         self.preprocessors = kwargs.get("preprocessors", [])
         self.lm_model = language_model
-        self.text_embdings_model = text_embeddings
+        self.text_embeddings_model = text_embeddings
         self.vision_embeddings_model = vision_embeddings
         self._supports_cache_class = False
         self.main_input_name = "input_ids"
@@ -261,13 +273,13 @@ def __init__(
         self._set_ov_config_parameters()
         self.language_model = OVModelWithEmbedForCausalLM(
             self.lm_model,
-            self.text_embdings_model,
+            self.text_embeddings_model,
             config=config,
             deivce=device,
             ov_config=ov_config,
             model_save_dir=model_save_dir,
             quantization_config=quantization_config,
-            compile=not self._compile_only,
+            compile=not self._compile_only and enable_compilation,
             compile_only=self._compile_only,
         )
         self.vision_embeddings = OVVisionEmbedding(self.vision_embeddings_model, self)
@@ -287,6 +299,18 @@ def __init__(
         except AttributeError:
             pass
 
+    def clear_requests(self):
+        if self._compile_only:
+            raise ValueError(
+                "`clear_requests()` is not supported with `compile_only` mode, please intialize model without this option"
+            )
+
+        self.language_model.clear_requests()
+        components = [self.vision_embeddings] + [getattr(self, part) for part in self.additional_parts]
+        for component in components:
+            if component is not None:
+                component.request = None
+
     def compile(self):
         self.language_model.compile()
         self.vision_embeddings._compile()
@@ -304,11 +328,11 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
             save_directory (`str` or `Path`):
                 The directory where to save the model files.
         """
-        src_files = [self.lm_model, self.text_embdings_model, self.vision_embeddings_model]
+        src_files = [self.lm_model, self.text_embeddings_model, self.vision_embeddings_model]
         dst_file_names = [
             "openvino_language_model.xml",
             "openvino_text_embeddings_model.xml",
-            "openvino_vision_embeddings.xml",
+            "openvino_vision_embeddings_model.xml",
         ]
         for part in self.additional_parts:
             model = getattr(self, f"{part}_model", None)
@@ -387,26 +411,18 @@ def _from_pretrained(
                 raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
             token = use_auth_token
 
-        model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
-
-        quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
-        compile_only = kwargs.get("compile_only", False)
-
-        # Load model from a local directory
-        if os.path.isdir(model_id):
-            model_save_dir = Path(model_id)
         model_file_names = {
             "language_model": "openvino_language_model.xml",
             "text_embeddings": "openvino_text_embeddings_model.xml",
             "vision_embeddings": "openvino_vision_embeddings_model.xml",
         }
 
+        model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
         for part in model_cls.additional_parts:
             model_file_names[part] = f"openvino_{part}_model.xml"
-        model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
-        quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
         compile_only = kwargs.get("compile_only", False)
         if os.path.isdir(model_id):
+            # Load model from a local directory
             model_save_dir = Path(model_id)
             file_names = {k: os.path.join(model_id, model_file_names[k]) for k in model_file_names}
         else:
@@ -424,11 +440,11 @@ def _from_pretrained(
                 file_names[name] = model_cache_path
             model_save_dir = Path(model_cache_path).parent
         if not compile_only:
-            language_model = model_cls.load_model(file_names["language_model"], quantization_config)
-            text_embeddings = model_cls.load_model(file_names["text_embeddings"], quantization_config)
-            vision_embeddings = model_cls.load_model(file_names["vision_embeddings"], quantization_config)
+            language_model = model_cls.load_model(file_names["language_model"])
+            text_embeddings = model_cls.load_model(file_names["text_embeddings"])
+            vision_embeddings = model_cls.load_model(file_names["vision_embeddings"])
             for part in model_cls.additional_parts:
-                kwargs[part] = model_cls.load_model(file_names[part], quantization_config)
+                kwargs[part] = model_cls.load_model(file_names[part])
         else:
             language_model = model_cls._compile_model(
                 file_names["language_model"],
@@ -468,7 +484,12 @@ def _from_pretrained(
         except Exception:
             pass
 
-        return model_cls(
+        quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
+        to_quantize = not compile_only and quantization_config is not None
+        if to_quantize:
+            kwargs["compile"] = False
+
+        model = model_cls(
             language_model=language_model,
             text_embeddings=text_embeddings,
             vision_embeddings=vision_embeddings,
@@ -478,6 +499,15 @@ def _from_pretrained(
             **kwargs,
         )
 
+        if to_quantize:
+            quantization_config_copy = copy.deepcopy(quantization_config)
+            quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
+            potential_processor_id = config.mm_vision_tower if isinstance(model, _OVNanoLlavaForCausalLM) else model_id
+            quantization_config_copy.processor = quantization_config.processor or potential_processor_id
+            OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
+
+        return model
+
     @classmethod
     def _from_transformers(
         cls,
@@ -556,8 +586,8 @@ def half(self):
         """
         apply_moc_transformations(self.lm_model, cf=False)
         compress_model_transformation(self.lm_model)
-        apply_moc_transformations(self.text_embdings_model, cf=False)
-        compress_model_transformation(self.text_embdings_model)
+        apply_moc_transformations(self.text_embeddings_model, cf=False)
+        compress_model_transformation(self.text_embeddings_model)
         apply_moc_transformations(self.vision_embeddings_model, cf=False)
         compress_model_transformation(self.vision_embeddings_model)
         for part in self.additional_parts:
@@ -695,6 +725,18 @@ def can_generate(self):
         """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate."""
         return True
 
+    @staticmethod
+    @abstractmethod
+    def preprocess_inputs(
+        processor,
+        text: str,
+        image: Optional[Image] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+    ):
+        """
+        Preprocess input instruction and an image.
+        """
+
 
 class _OVLlavaForCausalLM(OVModelForVisualCausalLM):
     def __init__(
@@ -858,6 +900,20 @@ def _filter_unattended_tokens(self, input_ids, attention_mask, past_key_values):
         position_ids[attention_mask == 0] = 1
         return attention_mask, position_ids
 
+    @staticmethod
+    def preprocess_inputs(
+        processor,
+        text: str,
+        image: Optional[Image] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+    ):
+        if image is None:
+            raise ValueError("Image is required.")
+        chat_template = [{"role": "user", "content": [{"type": "text", "text": text}, {"type": "image"}]}]
+        prompt = processor.apply_chat_template(chat_template, add_generation_prompt=True)
+        inputs = processor(images=image, text=prompt, return_tensors="pt")
+        return inputs
+
 
 class _OVLlavaNextForCausalLM(_OVLlavaForCausalLM):
     # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_next/modeling_llava_next.py#L655
@@ -1372,6 +1428,19 @@ def merge_vision_text_embeddings(
                     )
         return vllm_embedding, attention_mask, position_ids
 
+    @staticmethod
+    def preprocess_inputs(
+        processor,
+        text: str,
+        image: Optional[Image] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+    ):
+        if image is None:
+            raise ValueError("Image is required.")
+        prompt = f"<|im_start|>user\n(./)\n{text}<|im_end|>\n<|im_start|>assistant\n"
+        inputs = processor([prompt], [image], return_tensors="pt")
+        return inputs
+
 
 class _OVNanoLlavaForCausalLM(OVModelForVisualCausalLM):
     def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
@@ -1544,6 +1613,25 @@ def get_multimodal_embeddings(
 
         return new_input_embeds, attention_mask, position_ids
 
+    @staticmethod
+    def preprocess_inputs(
+        processor,
+        text: str,
+        image: Optional[Image] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+    ):
+        if tokenizer is None:
+            raise ValueError("Tokenizer is required.")
+        messages = [{"role": "user", "content": f"\n{text}"}]
+        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("<image>")]
+        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+        attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
+        result = {"input_ids": input_ids, "attention_mask": attention_mask}
+        if image is not None:
+            result["images"] = torch.unsqueeze(processor(images=image, return_tensors="pt")["pixel_values"][0], 0)
+        return result
+
 
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index c2e880e62..06cc16d04 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -19,11 +19,14 @@
 import os
 import warnings
 from collections import deque
+from itertools import islice
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
+import datasets
 import nncf
 import openvino
+import requests
 import torch
 import transformers
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
@@ -33,9 +36,11 @@
 from nncf.torch.initialization import PTInitializingDataLoader
 from openvino._offline_transformations import compress_quantize_weights_transformation
 from openvino.runtime import Core, Tensor
+from PIL import Image
 from torch.utils._pytree import tree_map
 from torch.utils.data import DataLoader, RandomSampler
-from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
+from tqdm import tqdm
+from transformers import AutoProcessor, AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
 
@@ -62,6 +67,7 @@
     ONNX_WEIGHTS_NAME,
     OV_XML_FILE_NAME,
     PREDEFINED_SD_DATASETS,
+    PREDEFINED_VISUAL_LM_DATASETS,
 )
 
 
@@ -313,6 +319,8 @@ def _quantize_ovbasemodel(
         remove_unused_columns: bool = True,
         **kwargs,
     ):
+        from optimum.intel.openvino.modeling_visual_language import OVModelForVisualCausalLM
+
         if is_diffusers_available():
             from optimum.intel.openvino.modeling_diffusion import OVDiffusionPipeline
 
@@ -361,6 +369,8 @@ def _quantize_ovbasemodel(
 
                 if isinstance(self.model, OVModelForCausalLM):
                     calibration_dataset = self._prepare_causal_lm_dataset(quantization_config)
+                elif isinstance(self.model, OVModelForVisualCausalLM):
+                    calibration_dataset = self._prepare_visual_causal_lm_dataset(quantization_config)
                 elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
                     if not isinstance(quantization_config.dataset, str):
                         raise ValueError("Please provide dataset as one of the accepted dataset labels.")
@@ -421,6 +431,14 @@ def _quantize_ovbasemodel(
                     for sub_model in sub_models:
                         _weight_only_quantization(sub_model.model, quantization_config)
                     self.model.clear_requests()
+                elif isinstance(self.model, OVModelForVisualCausalLM):
+                    language_model = self.model.language_model
+                    _weight_only_quantization(language_model.model, quantization_config, calibration_dataset)
+                    sub_model_names = ["vision_embeddings", "text_embeddings"] + self.model.additional_parts
+                    sub_models = [getattr(self.model, f"{name}_model") for name in sub_model_names]
+                    for sub_model in sub_models:
+                        _weight_only_quantization(sub_model, OVWeightQuantizationConfig(bits=8, sym=False))
+                    self.model.clear_requests()
                 else:
                     _weight_only_quantization(self.model.model, quantization_config, calibration_dataset)
                     self.model.request = None
@@ -733,6 +751,65 @@ def _prepare_causal_lm_dataset(self, quantization_config: OVWeightQuantizationCo
 
         return calibration_dataset
 
+    def _prepare_visual_causal_lm_dataset(self, config: OVWeightQuantizationConfig):
+        dataset_name = config.dataset
+        if dataset_name not in PREDEFINED_VISUAL_LM_DATASETS:
+            raise ValueError(
+                "You have entered a string value for dataset. You can only choose between"
+                f"{list(PREDEFINED_VISUAL_LM_DATASETS.keys())}, but the {dataset_name} was found"
+            )
+        if config.processor is None:
+            raise ValueError(
+                "`processor` must be specified in order to run data-aware weight compression. "
+                "Please provide it as a model id, or a path to a directory containing all the required "
+                "configuration files."
+            )
+
+        processor = AutoProcessor.from_pretrained(config.processor, trust_remote_code=config.trust_remote_code)
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(config.tokenizer, trust_remote_code=config.trust_remote_code)
+            tokenizer_error = None
+        except Exception as tokenizer_error:  # noqa: F841
+            tokenizer = None
+
+        dataset_metadata = PREDEFINED_VISUAL_LM_DATASETS[dataset_name]
+        dataset = datasets.load_dataset(dataset_metadata["name"], split=dataset_metadata["split"]).shuffle(seed=0)
+        num_samples = min(config.num_samples or 128, len(dataset))
+        dataset = islice(dataset, num_samples)
+
+        calibration_dataset = []
+        for item in tqdm(dataset, desc="Collecting calibration dataset", total=num_samples):
+            instruction = item[dataset_metadata["inputs"]["instruction"]]
+            image_url = item[dataset_metadata["inputs"]["image_url"]]
+            image = Image.open(requests.get(image_url, stream=True).raw)
+
+            try:
+                inputs = self.model.preprocess_inputs(processor, instruction, image, tokenizer)
+            except ValueError as value_error:
+                if "Tokenizer is required." in str(value_error) and tokenizer_error is not None:
+                    raise tokenizer_error
+                raise value_error
+
+            input_ids = inputs.get("input_ids")
+            position_ids = torch.arange(input_ids.size(1)).unsqueeze(0).to(input_ids.device)
+
+            inputs_embeds, attention_mask, position_ids = self.model.get_multimodal_embeddings(
+                **inputs,
+                position_ids=position_ids,
+            )
+
+            language_model_inputs = self.model.language_model.prepare_inputs(
+                input_ids=None,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                inputs_embeds=inputs_embeds,
+            )
+
+            calibration_dataset.append(language_model_inputs)
+
+        calibration_dataset = nncf.Dataset(calibration_dataset)
+        return calibration_dataset
+
     def _prepare_text_generation_dataset(
         self, quantization_config: OVQuantizationConfig, calibration_dataloader: OVDataLoader
     ) -> nncf.Dataset:
diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py
index ca7d17720..68458c85b 100644
--- a/optimum/intel/openvino/utils.py
+++ b/optimum/intel/openvino/utils.py
@@ -116,6 +116,7 @@
     "token-classification": "OVModelForTokenClassification",
     "question-answering": "OVModelForQuestionAnswering",
     "image-classification": "OVModelForImageClassification",
+    "image-text-to-text": "OVModelForVisualCausalLM",
     "audio-classification": "OVModelForAudioClassification",
     "stable-diffusion": "OVStableDiffusionPipeline",
     "stable-diffusion-xl": "OVStableDiffusionXLPipeline",
@@ -135,6 +136,14 @@
     "laion/filtered-wit": {"split": "train", "inputs": {"prompt": "caption"}},
 }
 
+PREDEFINED_VISUAL_LM_DATASETS = {
+    "contextual": {
+        "name": "ucla-contextual/contextual_test",
+        "split": "test",
+        "inputs": {"image_url": "image_url", "instruction": "instruction"},
+    }
+}
+
 
 NEED_CONVERT_TO_FAST_TOKENIZER: Tuple[Type[PreTrainedTokenizer]] = (CLIPTokenizer,)
 
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 7542a347d..9952611e4 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -36,6 +36,7 @@
     OVModelForSeq2SeqLM,
     OVModelForSequenceClassification,
     OVModelForTokenClassification,
+    OVModelForVisualCausalLM,
     OVModelOpenCLIPForZeroShotImageClassification,
     OVModelOpenCLIPText,
     OVModelOpenCLIPVisual,
@@ -103,7 +104,7 @@ class OVCLIExportTestCase(unittest.TestCase):
     if is_transformers_version(">=", "4.45"):
         SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("stable-diffusion-3", 9, 65))
 
-    TEST_4BIT_CONFIGURATONS = [
+    TEST_4BIT_CONFIGURATIONS = [
         ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", {"int8": 4, "int4": 72}),
         ("text-generation-with-past", "opt125m", "int4 --group-size 64", {"int8": 4, "int4": 144}),
         ("text-generation-with-past", "opt125m", "mxfp4", {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}),
@@ -129,6 +130,26 @@ class OVCLIExportTestCase(unittest.TestCase):
         ),
     ]
 
+    if is_transformers_version(">=", "4.40.0"):
+        TEST_4BIT_CONFIGURATIONS.extend(
+            [
+                (
+                    "image-text-to-text",
+                    "llava_next",
+                    'int4 --group-size 16 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" '
+                    "--dataset contextual --num-samples 1",
+                    {"int8": 8, "int4": 22},
+                ),
+                (
+                    "image-text-to-text",
+                    "nanollava",
+                    'int4 --group-size 8 --ratio 0.9 --sensitivity-metric "mean_activation_variance" '
+                    "--dataset contextual --num-samples 1 --trust-remote-code",
+                    {"int8": 12, "int4": 18},
+                ),
+            ]
+        )
+
     def _openvino_export(self, model_name: str, task: str):
         with TemporaryDirectory() as tmpdir:
             main_export(
@@ -245,7 +266,7 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in
             self.assertEqual(exp_num_int8, num_weight_nodes["int8"])
             self.assertEqual(exp_num_fq, num_fq)
 
-    @parameterized.expand(TEST_4BIT_CONFIGURATONS)
+    @parameterized.expand(TEST_4BIT_CONFIGURATIONS)
     def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_num_weight_nodes: dict):
         with TemporaryDirectory() as tmpdir:
             result = subprocess.run(
@@ -255,13 +276,17 @@ def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expec
                 capture_output=True,
             )
             model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {}
+            if "--trust-remote-code" in option:
+                model_kwargs["trust_remote_code"] = True
             model = eval(
                 _HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]
                 if task.replace("-with-past", "") in _HEAD_TO_AUTOMODELS
                 else _HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]
             ).from_pretrained(tmpdir, **model_kwargs)
 
-            _, num_weight_nodes = get_num_quantized_nodes(model)
+            ov_model = model.lm_model if task == "image-text-to-text" else model.model
+
+            _, num_weight_nodes = get_num_quantized_nodes(ov_model)
             expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
             self.assertEqual(expected_num_weight_nodes, num_weight_nodes)
             self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout)
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 0e01932b6..3ee055e80 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -57,13 +57,13 @@
     OVStableDiffusionPipeline,
     OVStableDiffusionXLPipeline,
     OVStableDiffusion3Pipeline,
-    OVFluxPipeline,
     OVQuantizer,
     OVTrainer,
     OVQuantizationConfig,
     OVWeightQuantizationConfig,
     OVDynamicQuantizationConfig,
     OVModelOpenCLIPForZeroShotImageClassification,
+    OVModelForVisualCausalLM,
 )
 from optimum.intel.openvino.configuration import (
     OVQuantizationMethod,
@@ -191,17 +191,25 @@ class OVWeightCompressionTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 74),)
     SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "gpt2", 44, 44),)
 
-    LOAD_IN_4_BITS_SCOPE = (
-        (OVModelForCausalLM, "gpt2", dict(bits=4, sym=False, group_size=-1, ratio=0.8), {"int4": 30, "int8": 14}),
+    LOAD_IN_4_BITS_SCOPE = [
+        (
+            OVModelForCausalLM,  # model cls
+            "gpt2",  # model name
+            False,  # trust remote code
+            dict(bits=4, sym=False, group_size=-1, ratio=0.8),  # quantization config
+            {"int4": 30, "int8": 14},  # reference number of low-precision nodes
+        ),
         (
             OVModelForCausalLM,
             "gpt2",
+            False,
             dict(bits=4, weight_format="mxfp4", group_size=32),
             {"f4e2m1": 20, "f8e8m0": 20, "int8": 4},
         ),
         (
             OVModelForCausalLM,
             "gpt2",
+            False,
             dict(
                 bits=4,
                 sym=False,
@@ -213,12 +221,14 @@ class OVWeightCompressionTest(unittest.TestCase):
         (
             OVModelForCausalLM,
             "gpt2",
+            False,
             dict(bits=4, sym=False, group_size=-1, ratio=0.8, all_layers=True),
             {"int4": 26, "int8": 18},
         ),
         (
             OVModelForCausalLM,
             "opt",
+            False,
             dict(
                 bits=4,
                 sym=True,
@@ -232,6 +242,7 @@ class OVWeightCompressionTest(unittest.TestCase):
         (
             OVModelForCausalLM,
             "opt",
+            False,
             dict(
                 bits=4,
                 sym=True,
@@ -245,6 +256,7 @@ class OVWeightCompressionTest(unittest.TestCase):
         (
             OVModelForCausalLM,
             "llama_awq",
+            False,
             dict(
                 bits=4,
                 sym=True,
@@ -260,6 +272,7 @@ class OVWeightCompressionTest(unittest.TestCase):
         (
             OVModelForCausalLM,
             "llama_awq",
+            False,
             dict(
                 bits=4,
                 sym=True,
@@ -274,6 +287,7 @@ class OVWeightCompressionTest(unittest.TestCase):
         (
             OVModelForCausalLM,
             "llama_awq",
+            False,
             dict(
                 bits=4,
                 sym=True,
@@ -285,22 +299,87 @@ class OVWeightCompressionTest(unittest.TestCase):
             ),
             {"int4": 12, "int8": 8},
         ),
-    )
+    ]
 
-    SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION = (
-        (OVModelForCausalLM, "gpt2"),
-        (OVModelForMaskedLM, "bert"),
-        (OVModelForTokenClassification, "roberta"),
-        (OVModelForImageClassification, "vit"),
-        (OVModelForSeq2SeqLM, "t5"),
-        (OVModelForSequenceClassification, "albert"),
-        (OVModelForQuestionAnswering, "distilbert"),
-        (OVModelForAudioClassification, "wav2vec2"),
-        (OVModelForFeatureExtraction, "blenderbot"),
-        (OVStableDiffusionPipeline, "stable-diffusion"),
-        (OVStableDiffusionXLPipeline, "stable-diffusion-xl"),
-        (OVModelOpenCLIPForZeroShotImageClassification, "open-clip"),
-    )
+    if is_transformers_version(">=", "4.40.0"):
+        LOAD_IN_4_BITS_SCOPE.extend(
+            [
+                (
+                    OVModelForVisualCausalLM,
+                    "llava_next",
+                    False,
+                    dict(
+                        bits=4,
+                        group_size=16,
+                        dataset="contextual",
+                        ratio=0.8,
+                        sensitivity_metric="hessian_input_activation",
+                        num_samples=1,
+                        processor=MODEL_NAMES["llava_next"],
+                    ),
+                    {"int4": 24, "int8": 6},
+                ),
+                (
+                    OVModelForVisualCausalLM,
+                    "nanollava",
+                    True,
+                    dict(
+                        bits=4,
+                        group_size=8,
+                        dataset="contextual",
+                        ratio=0.8,
+                        sensitivity_metric="mean_activation_magnitude",
+                        num_samples=1,
+                        processor=MODEL_NAMES["nanollava_vision_tower"],
+                        tokenizer=MODEL_NAMES["nanollava"],
+                        trust_remote_code=True,
+                    ),
+                    {"int4": 16, "int8": 14},
+                ),
+            ]
+        )
+
+    if is_transformers_version(">=", "4.45.0"):
+        LOAD_IN_4_BITS_SCOPE.append(
+            (
+                OVModelForVisualCausalLM,
+                "minicpmv",
+                True,
+                dict(
+                    bits=4,
+                    group_size=16,
+                    dataset="contextual",
+                    ratio=0.8,
+                    sensitivity_metric="mean_activation_magnitude",
+                    num_samples=1,
+                    processor=MODEL_NAMES["minicpmv"],
+                    trust_remote_code=True,
+                ),
+                {"int4": 22, "int8": 8},
+            )
+        )
+
+    SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION = [
+        (OVModelForCausalLM, "gpt2", False),
+        (OVModelForMaskedLM, "bert", False),
+        (OVModelForTokenClassification, "roberta", False),
+        (OVModelForImageClassification, "vit", False),
+        (OVModelForSeq2SeqLM, "t5", False),
+        (OVModelForSequenceClassification, "albert", False),
+        (OVModelForQuestionAnswering, "distilbert", False),
+        (OVModelForAudioClassification, "wav2vec2", False),
+        (OVModelForFeatureExtraction, "blenderbot", False),
+        (OVStableDiffusionPipeline, "stable-diffusion", False),
+        (OVStableDiffusionXLPipeline, "stable-diffusion-xl", False),
+        (OVModelOpenCLIPForZeroShotImageClassification, "open-clip", False),
+        (OVModelForVisualCausalLM, "llava", False),
+    ]
+
+    if is_transformers_version(">=", "4.40.0"):
+        SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "nanollava", True))
+
+    if is_transformers_version(">=", "4.45.0"):
+        SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmv", True))
 
     SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [
         (OVStableDiffusionPipeline, "stable-diffusion", 72, 195),
@@ -429,8 +508,14 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, e
             self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict())
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION)
-    def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
-        model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True, stateful=False)
+    def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust_remote_code):
+        model = model_cls.from_pretrained(
+            MODEL_NAMES[model_type],
+            export=True,
+            load_in_8bit=True,
+            stateful=False,
+            trust_remote_code=trust_remote_code,
+        )
 
         if model_type == "open-clip":
             self.assertEqual(model.text_model._openvino_config.quantization_config.bits, 8)
@@ -448,6 +533,9 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
             models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2)
         elif model_type == "open-clip":
             models = [model.text_model, model.visual_model]
+        elif model.export_feature == "image-text-to-text":
+            models = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
+            models += [getattr(model, part) for part in model.additional_parts]
         else:
             models = [model]
 
@@ -533,26 +621,26 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_
 
     @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
     def test_ovmodel_4bit_auto_compression_with_config(
-        self, model_cls, model_name, quantization_config, expected_num_weight_nodes
+        self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes
     ):
         model_id = MODEL_NAMES[model_name]
         with TemporaryDirectory() as tmp_dir:
             quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
-            model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
+            model = model_cls.from_pretrained(
+                model_id, export=True, quantization_config=quantization_config, trust_remote_code=trust_remote_code
+            )
             if quantization_config.quant_method.lower() == "awq":
                 # TODO: Check that AWQ was actually applied
                 pass
 
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            if tokenizer.pad_token is None:
-                tokenizer.pad_token = tokenizer.eos_token
+            ov_model = model.lm_model if model_cls == OVModelForVisualCausalLM else model.model
 
-            _, num_weight_nodes = get_num_quantized_nodes(model)
+            _, num_weight_nodes = get_num_quantized_nodes(ov_model)
             expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
             self.assertEqual(expected_num_weight_nodes, num_weight_nodes)
             model.save_pretrained(tmp_dir)
 
-            wc_rt_info = model.model.get_rt_info()["nncf"]["weight_compression"]
+            wc_rt_info = ov_model.get_rt_info()["nncf"]["weight_compression"]
             self.assertEqual(quantization_config.quant_method.lower() == "awq", wc_rt_info["awq"].value == "True")
             self.assertEqual(
                 quantization_config.scale_estimation or False, wc_rt_info["scale_estimation"].value == "True"
@@ -574,8 +662,10 @@ def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_ty
         self.assertEqual(expected_ov_int8, num_weight_nodes["int8"])
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION)
-    def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type):
-        model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=False)
+    def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type, trust_remote_code):
+        model = model_cls.from_pretrained(
+            MODEL_NAMES[model_type], export=True, load_in_8bit=False, trust_remote_code=trust_remote_code
+        )
         if model.export_feature.startswith("text2text-generation"):
             models = [model.encoder, model.decoder, model.decoder_with_past]
         elif model.export_feature == "text-to-image":
@@ -583,6 +673,9 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type):
             models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2)
         elif model_type == "open-clip":
             models = [model.text_model, model.visual_model]
+        elif model.export_feature == "image-text-to-text":
+            models = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
+            models += [getattr(model, part) for part in model.additional_parts]
         else:
             models = [model]
 
@@ -670,7 +763,7 @@ def main_export_not_in_stacktrace(*args, **kwargs):
 
     @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
     def test_ovmodel_4bit_dynamic_with_config(
-        self, model_cls, model_name, quantization_config, expected_num_weight_nodes
+        self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes
     ):
         model_id = MODEL_NAMES[model_name]
         with TemporaryDirectory() as tmp_dir:
@@ -678,15 +771,15 @@ def test_ovmodel_4bit_dynamic_with_config(
             quantization_config = OVDynamicQuantizationConfig(
                 weights_group_size=group_size, activations_group_size=group_size, **quantization_config
             )
-            model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
+            model = model_cls.from_pretrained(
+                model_id, export=True, quantization_config=quantization_config, trust_remote_code=trust_remote_code
+            )
             self.assertEqual(model.ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"], str(group_size))
             self.assertEqual(model.ov_config["KV_CACHE_PRECISION"], "u8")
 
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            if tokenizer.pad_token is None:
-                tokenizer.pad_token = tokenizer.eos_token
+            ov_model = model.lm_model if model_cls == OVModelForVisualCausalLM else model.model
 
-            _, num_weight_nodes = get_num_quantized_nodes(model)
+            _, num_weight_nodes = get_num_quantized_nodes(ov_model)
             expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
             self.assertEqual(expected_num_weight_nodes, num_weight_nodes)
             model.save_pretrained(tmp_dir)
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 129defc82..fc3d97e24 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 
 import numpy as np
+import openvino as ov
 import torch
 
 
@@ -96,6 +97,7 @@
     "mpnet": "hf-internal-testing/tiny-random-MPNetModel",
     "mt5": "stas/mt5-tiny-random",
     "nanollava": "katuni4ka/tiny-random-nanollava",
+    "nanollava_vision_tower": "katuni4ka/tiny-random-siglip",
     "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel",
     "olmo": "katuni4ka/tiny-random-olmo-hf",
     "orion": "katuni4ka/tiny-random-orion",
@@ -178,10 +180,14 @@
     "open-clip": (20, 28),
     "stable-diffusion-3": (66, 42, 58, 30),
     "flux": (56, 24, 28, 64),
+    "llava": (30, 18, 2),
+    "llava_next": (30, 18, 2),
+    "minicpmv": (30, 52, 2, 12),
+    "nanollava": (30, 30, 2),
 }
 
 
-def get_num_quantized_nodes(ov_model):
+def get_num_quantized_nodes(model):
     num_fake_quantize = 0
     num_weight_nodes = {
         "int8": 0,
@@ -189,7 +195,8 @@ def get_num_quantized_nodes(ov_model):
         "f4e2m1": 0,
         "f8e8m0": 0,
     }
-    for elem in ov_model.model.get_ops():
+    ov_model = model if isinstance(model, ov.Model) else model.model
+    for elem in ov_model.get_ops():
         if "FakeQuantize" in elem.name:
             num_fake_quantize += 1
         for i in range(elem.get_output_size()):

From b3cbc951a87c58da3310b7a59f6e4c8523bc0b53 Mon Sep 17 00:00:00 2001
From: Alexander Kozlov 
Date: Mon, 11 Nov 2024 19:52:56 +0400
Subject: [PATCH 38/53] Added a fix for FP16 overflow issue on GPU/NPU (#994)

* Added a fix for FP16 overflow issue on GPU/NPU

* Style

* Updated export test

* Style
---
 optimum/exporters/openvino/convert.py | 42 ++++++++++++++++++++++++++-
 tests/openvino/test_export.py         |  9 ++++++
 2 files changed, 50 insertions(+), 1 deletion(-)
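A minimal sketch (not part of the patch) of how the ACTIVATIONS_SCALE_FACTOR hint written into rt_info by this change can be read back from an exported IR; the model path is a placeholder:

    # Read back the runtime option that _add_runtime_options_to_rt_info stores during export.
    import openvino as ov

    core = ov.Core()
    model = core.read_model("exported_model/openvino_model.xml")  # placeholder path

    if model.has_rt_info(["runtime_options", "ACTIVATIONS_SCALE_FACTOR"]):
        scale = model.get_rt_info()["runtime_options"]["ACTIVATIONS_SCALE_FACTOR"]
        print(scale)  # "8.0" for vae_* submodels and text-generation tasks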

diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index df2885fd0..11e93fb21 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -91,9 +91,31 @@
     from optimum.intel.openvino.configuration import OVConfig
 
 
-def _save_model(model, path: str, ov_config: Optional["OVConfig"] = None, library_name: Optional[str] = None):
+def _set_runtime_options(
+    models_and_export_configs: Dict[
+        str,
+        Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin", "DiffusionPipeline"], "OnnxConfig"],
+    ],
+    task: str,
+):
+    for model_name in models_and_export_configs.keys():
+        _, sub_export_config = models_and_export_configs[model_name]
+        if "vae_" in model_name or "text-generation" in task:
+            sub_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
+
+
+def _save_model(
+    model,
+    path: str,
+    ov_config: Optional["OVConfig"] = None,
+    library_name: Optional[str] = None,
+    config: OnnxConfig = None,
+):
     compress_to_fp16 = ov_config is not None and ov_config.dtype == "fp16"
     model = _add_version_info_to_model(model, library_name)
+
+    if hasattr(config, "runtime_options"):
+        model = _add_runtime_options_to_rt_info(model, config.runtime_options)
     save_model(model, path, compress_to_fp16)
 
 
@@ -213,6 +235,7 @@ def export_tensorflow(
         output.parent / output,
         ov_config=ov_config,
         library_name=library_name,
+        config=config,
     )
     del ov_model
     return input_names, output_names, True
@@ -276,6 +299,7 @@ def export_pytorch_via_onnx(
         output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output,
         ov_config=ov_config,
         library_name=library_name,
+        config=config,
     )
     del ov_model
     return input_names, output_names, True
@@ -450,6 +474,7 @@ def ts_patched_forward(*args, **kwargs):
             output,
             ov_config=ov_config,
             library_name=library_name,
+            config=config,
         )
         clear_class_registry()
         del ov_model
@@ -718,6 +743,8 @@ def export_from_model(
 
         model.save_config(output)
 
+    _set_runtime_options(models_and_export_configs, task)
+
     export_models(
         models_and_export_configs=models_and_export_configs,
         output_dir=output,
@@ -792,6 +819,19 @@ def export_tokenizer(
         save_model(model, output / file_name.format(suffix))
 
 
+def _add_runtime_options_to_rt_info(model: Model, options: Dict):
+    """
+    Add runtime options to the model rt_info
+    """
+    try:
+        for name, value in options.items():
+            model.set_rt_info(value, ["runtime_options", name])
+    except Exception:
+        pass
+
+    return model
+
+
 def _add_version_info_to_model(model: Model, library_name: Optional[str] = None):
     """
     Add dependency versions to OpenVINO model
diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py
index 6a42c4a09..80a020d2b 100644
--- a/tests/openvino/test_export.py
+++ b/tests/openvino/test_export.py
@@ -124,6 +124,15 @@ def _openvino_export(
                     self.assertEqual(
                         ov_model.model.get_rt_info()["optimum"]["transformers_version"], _transformers_version
                     )
+                    self.assertTrue(ov_model.model.has_rt_info(["runtime_options", "ACTIVATIONS_SCALE_FACTOR"]))
+
+                if library_name == "diffusers":
+                    self.assertTrue(
+                        ov_model.vae_encoder.model.has_rt_info(["runtime_options", "ACTIVATIONS_SCALE_FACTOR"])
+                    )
+                    self.assertTrue(
+                        ov_model.vae_decoder.model.has_rt_info(["runtime_options", "ACTIVATIONS_SCALE_FACTOR"])
+                    )
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_export(self, model_type: str):

From 790244de0adfd6673f3efcbc00e81a4908da1ed3 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Tue, 12 Nov 2024 16:19:49 +0400
Subject: [PATCH 39/53] add saving safety_checker (#990)

* add saving safety_checker during conversion

* add safety_checker to save_pretrained

* add test

* Update modeling_diffusion.py
---
 optimum/exporters/openvino/convert.py        |  3 +++
 optimum/intel/openvino/modeling_diffusion.py |  2 ++
 tests/openvino/test_diffusion.py             | 27 ++++++++++++++++++++
 3 files changed, 32 insertions(+)
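A hedged usage sketch of the behaviour added here, using the same tiny checkpoint as the new test (downloading it requires network access); the output directory is a placeholder:

    from optimum.intel import OVStableDiffusionPipeline

    # The safety checker is now exported and saved alongside the other pipeline components.
    pipe = OVStableDiffusionPipeline.from_pretrained(
        "katuni4ka/tiny-random-stable-diffusion-with-safety-checker"
    )
    assert pipe.safety_checker is not None
    pipe.save_pretrained("ov_sd_with_safety_checker")  # placeholder directory

    # Reloading from disk keeps the safety checker.
    reloaded = OVStableDiffusionPipeline.from_pretrained("ov_sd_with_safety_checker")
    assert reloaded.safety_checker is not None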

diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index 11e93fb21..f046c32f8 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -740,6 +740,9 @@ def export_from_model(
         tokenizer_3 = getattr(model, "tokenizer_3", None)
         if tokenizer_3 is not None:
             tokenizer_3.save_pretrained(output.joinpath("tokenizer_3"))
+        safety_checker = getattr(model, "safety_checker", None)
+        if safety_checker is not None:
+            safety_checker.save_pretrained(output.joinpath("safety_checker"))
 
         model.save_config(output)
 
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 51041a2fb..7b3d1c0f4 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -295,6 +295,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
             self.tokenizer_3.save_pretrained(save_directory / "tokenizer_3")
         if self.feature_extractor is not None:
             self.feature_extractor.save_pretrained(save_directory / "feature_extractor")
+        if getattr(self, "safety_checker", None) is not None:
+            self.safety_checker.save_pretrained(save_directory / "safety_checker")
 
         self._save_openvino_config(save_directory)
 
diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py
index 1467e5ed1..2baeba9a4 100644
--- a/tests/openvino/test_diffusion.py
+++ b/tests/openvino/test_diffusion.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 
 import unittest
+from pathlib import Path
 
 import numpy as np
 import pytest
@@ -35,6 +36,7 @@
     OVPipelineForInpainting,
     OVPipelineForText2Image,
 )
+from optimum.intel.openvino.utils import TemporaryDirectory
 from optimum.intel.utils.import_utils import is_transformers_version
 from optimum.utils.testing_utils import require_diffusers
 
@@ -309,6 +311,31 @@ def test_safety_checker(self, model_arch: str):
 
         np.testing.assert_allclose(ov_images, diffusers_images, atol=1e-4, rtol=1e-2)
 
+    @require_diffusers
+    def test_load_and_save_pipeline_with_safety_checker(self):
+        model_id = "katuni4ka/tiny-random-stable-diffusion-with-safety-checker"
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(model_id)
+        self.assertTrue(ov_pipeline.safety_checker is not None)
+        self.assertIsInstance(ov_pipeline.safety_checker, StableDiffusionSafetyChecker)
+        with TemporaryDirectory() as tmpdirname:
+            ov_pipeline.save_pretrained(tmpdirname)
+            for subdir in [
+                "text_encoder",
+                "tokenizer",
+                "unet",
+                "vae_encoder",
+                "vae_decoder",
+                "scheduler",
+                "feature_extractor",
+            ]:
+                subdir_path = Path(tmpdirname) / subdir
+                self.assertTrue(subdir_path.is_dir())
+            loaded_pipeline = self.OVMODEL_CLASS.from_pretrained(tmpdirname)
+            self.assertTrue(loaded_pipeline.safety_checker is not None)
+            self.assertIsInstance(loaded_pipeline.safety_checker, StableDiffusionSafetyChecker)
+            del loaded_pipeline
+        del ov_pipeline
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_height_width_properties(self, model_arch: str):
         batch_size, height, width, num_images_per_prompt = 2, 128, 64, 4

From 12783eef55701855dc1025eb70b054ac9e4507bf Mon Sep 17 00:00:00 2001
From: Nikita Savelyev 
Date: Tue, 12 Nov 2024 15:12:18 +0100
Subject: [PATCH 40/53] Raise exception when some compression parameter is
 given, but weight format is not. (#996)

---
 optimum/commands/export/openvino.py  |  5 +++--
 tests/openvino/test_exporters_cli.py | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)
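A small sketch mirroring the new test: after this change the command below fails instead of only warning, because --sym is a compression parameter and --weight-format is missing (model id and output path are placeholders):

    import subprocess

    result = subprocess.run(
        "optimum-cli export openvino --model gpt2 --task text-generation --sym /tmp/ov_out",
        shell=True,
        capture_output=True,
        text=True,
    )
    assert result.returncode != 0
    assert "weight format is not specified" in result.stderr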

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 2b031bad9..f092d5cb2 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -262,8 +262,9 @@ def run(self):
         if self.args.weight_format is None:
             ov_config = None
             if not no_compression_parameter_provided(self.args):
-                logger.warning(
-                    "The provided compression parameters will not affect conversion because of the missing --weight-format argument."
+                raise ValueError(
+                    "Some compression parameters are provided, but the weight format is not specified. "
+                    "Please provide it with --weight-format argument."
                 )
         elif self.args.weight_format in {"fp16", "fp32"}:
             ov_config = OVConfig(dtype=self.args.weight_format)
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 9952611e4..91e19ff7a 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -384,3 +384,20 @@ def test_exporters_cli_open_clip(self):
             model = eval(_HEAD_TO_AUTOMODELS["open_clip"]).from_pretrained(tmpdir, compile=False)
             self.assertTrue("text_features" in model.text_model.output_names)
             self.assertTrue("image_features" in model.visual_model.output_names)
+
+    def test_export_openvino_with_missed_weight_format(self):
+        # Test that exception is raised when some compression parameter is given, but weight format is not.
+        with TemporaryDirectory() as tmpdir:
+            with self.assertRaises(subprocess.CalledProcessError) as exc_info:
+                subprocess.run(
+                    f"optimum-cli export openvino --model {MODEL_NAMES['gpt2']} --task text-generation --sym {tmpdir}",
+                    shell=True,
+                    check=True,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    text=True,
+                )
+            self.assertIn(
+                "Some compression parameters are provided, but the weight format is not specified.",
+                str(exc_info.exception.stderr),
+            )

From 0447ae2fbe7f638edce1e9770af443fa9084af31 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Tue, 12 Nov 2024 19:14:08 +0400
Subject: [PATCH 41/53] add patching for update_causal_mask to falcon for >=
 4.45 (#989)

* add patching for update_causal_mask to falcon and gpt-like models for >=4.45

* fix falcon

* enable codegen2 back

* Apply suggestions from code review

Co-authored-by: Nikita Savelyev 

* Update optimum/exporters/openvino/model_patcher.py

---------

Co-authored-by: Nikita Savelyev 
---
 optimum/exporters/openvino/model_configs.py |  20 ++
 optimum/exporters/openvino/model_patcher.py | 228 ++++++++++++++++++--
 tests/openvino/test_modeling.py             |   5 +-
 3 files changed, 237 insertions(+), 16 deletions(-)
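A self-contained sketch of the generalized patch/unpatch pattern this commit introduces: the original bound method is saved on the inner model and restored when the patcher exits. The names below are illustrative, not the library's:

    import types


    class InnerModel:
        def _update_causal_mask(self, attention_mask):
            return attention_mask  # original behaviour


    def _patched_update_causal_mask(self, attention_mask):
        return attention_mask  # overflow-safe replacement would go here


    def apply_patch(model, patch_fn):
        # keep the original bound method so it can be restored later
        if hasattr(model, "_update_causal_mask"):
            model._orig_update_causal_mask = model._update_causal_mask
        model._update_causal_mask = types.MethodType(patch_fn, model)


    def remove_patch(model):
        if hasattr(model, "_orig_update_causal_mask"):
            model._update_causal_mask = model._orig_update_causal_mask


    inner = InnerModel()
    apply_patch(inner, _patched_update_causal_mask)
    # ... tracing/export runs with the patched mask computation ...
    remove_patch(inner)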

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 5276ade33..e8c8e5d13 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -29,6 +29,7 @@
     CodeGenOnnxConfig,
     FalconOnnxConfig,
     GemmaOnnxConfig,
+    GPTJOnnxConfig,
     GPTNeoXOnnxConfig,
     IBertOnnxConfig,
     LlamaOnnxConfig,
@@ -66,6 +67,7 @@
     FalconModelPatcher,
     FluxTransfromerModelPatcher,
     Gemma2ModelPatcher,
+    GptJModelPatcher,
     GptNeoxJapaneseModelPatcher,
     GptNeoxModelPatcher,
     IBertModelPatcher,
@@ -726,6 +728,24 @@ def patch_model_for_export(
         return GptNeoxJapaneseModelPatcher(self, model, model_kwargs=model_kwargs)
 
 
+@register_in_tasks_manager(
+    "gptj",
+    *[
+        "feature-extraction",
+        "feature-extraction-with-past",
+        "text-generation",
+        "text-generation-with-past",
+        "text-classification",
+    ],
+    library_name="transformers",
+)
+class GPTJOpenVINOConfig(GPTJOnnxConfig):
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return GptJModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
 @register_in_tasks_manager(
     "cohere",
     *[
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index dbbfb5662..7406e1370 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -109,11 +109,20 @@ def patch_model_with_bettertransformer(model):
     return model
 
 
-def patch_update_causal_mask(model, transformers_version):
+def patch_update_causal_mask(model, transformers_version, inner_model_name="model", patch_fn=None):
     if is_transformers_version(">=", transformers_version):
-        inner_model = getattr(model, "model", getattr(model, "transformer", None))
+        inner_model = getattr(model, inner_model_name, None)
         if inner_model is not None:
-            inner_model._update_causal_mask = types.MethodType(_llama_gemma_update_causal_mask, inner_model)
+            if hasattr(inner_model, "_update_causal_mask"):
+                inner_model._orig_update_causal_mask = inner_model._update_causal_mask
+            patch_fn = patch_fn or _llama_gemma_update_causal_mask
+            inner_model._update_causal_mask = types.MethodType(patch_fn, inner_model)
+
+
+def unpatch_update_causal_mask(model, inner_model_name="model"):
+    inner_model = getattr(model, inner_model_name, None)
+    if inner_model is not None and hasattr(inner_model, "_orig_update_causal_mask"):
+        inner_model._update_causal_mask = inner_model._orig_update_causal_mask
 
 
 # initialization of sin/cos cached in bf16/fp16 leads to accuracy loss
@@ -579,13 +588,11 @@ def __enter__(self):
 
         # llama/gemma has some accuracy issues with bf16 with transformers >= 4.39
         # fill causal mask in slightly different way for avoid overflow on some platforms
-        patch_update_causal_mask(self._model, "4.39.0")
+        patch_update_causal_mask(self._model, "4.39.0", "model" if hasattr(self._model, "model") else "transformer")
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
-        inner_model = getattr(self._model, "model", getattr(self._model, "transformer", None))
-        if hasattr(inner_model, "_orig_update_causal_mask"):
-            inner_model._update_causal_mask = inner_model._orig_update_causal_mask
+        unpatch_update_causal_mask(self._model, "model" if hasattr(self._model, "model") else "transformer")
 
 
 # copied from https://github.com/huggingface/transformers/commit/57d7594a79a9f5d835abf2d4d384db0e4818e548 to unblock export with transformers 4.42
@@ -1865,6 +1872,67 @@ def __exit__(self, exc_type, exc_value, traceback):
                 layer.self_attn.forward = layer.self_attn._orig_forward
 
 
+# copied from  https://github.com/huggingface/optimum/blob/2112e99122d7f23a1da1a9d263fef64301050ea7/optimum/bettertransformer/models/attention.py#L168
+# for preserving backward compatibility between outdated codegen remote code and new transformers
+def _codegen_wrapped_scaled_dot_product_legacy(
+    self,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    head_mask: Optional[torch.Tensor] = None,
+):
+    from optimum.bettertransformer.models.attention import raise_on_head_mask
+
+    raise_on_head_mask(head_mask)
+    batch_size = query.shape[0]
+    mask_value = torch.finfo(value.dtype).min
+    mask_value = torch.full([], mask_value, dtype=value.dtype)
+
+    if batch_size == 1 and attention_mask is not None and attention_mask[0, 0, -1, -1] < -1:
+        raise ValueError("BetterTransformer does not support padding='max_length' with a batch size of 1.")
+
+    # in codegen the query and key are always in fp32 regardless of the dtype of the model
+    # https://github.com/huggingface/transformers/blob/5b28b7833297adf65c5160a685425ddb1eee5ce2/src/transformers/models/codegen/modeling_codegen.py#L226
+    query = query.to(value.dtype)
+    key = key.to(value.dtype)
+
+    dropout_p = self.dropout_prob_attn if self.training else 0.0
+    if batch_size == 1 or self.training:
+        if query.shape[2] > 1:
+            # first step of the decoding
+            sdpa_result = torch.nn.functional.scaled_dot_product_attention(
+                query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True
+            )
+        else:
+            # in this case, which is the later decoding steps, the `causal_mask`` in
+            # https://github.com/huggingface/transformers/blob/ae54e3c3b18bac0832ad62ea9b896dfd52a09850/src/transformers/models/gpt2/modeling_gpt2.py#L195
+            # is [True, ..., True] so actually not causal
+            sdpa_result = torch.nn.functional.scaled_dot_product_attention(
+                query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=False
+            )
+    else:
+        query_length, key_length = query.size(-2), key.size(-2)
+
+        # causal_mask is always [True, ..., True] otherwise, so executing this is unnecessary
+        if query_length > 1:
+            causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to(torch.bool)
+
+            causal_mask = torch.where(causal_mask, 0, mask_value)
+
+            # torch.Tensor.expand does no memory copy
+            causal_mask = causal_mask.expand(batch_size, -1, -1, -1)
+
+            # we use torch.min to avoid having tensor(-inf)
+            attention_mask = torch.min(causal_mask, attention_mask)
+
+        sdpa_result = torch.nn.functional.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False
+        )
+
+    return sdpa_result, None
+
+
 class CodeGenModelPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()
@@ -1873,14 +1941,23 @@ def __enter__(self):
         # For avoiding breaking model on tracing stage, we reduce area of bettertransformer patch only for _attn.
         from optimum.bettertransformer.models.attention import codegen_wrapped_scaled_dot_product
 
+        attn_fn = codegen_wrapped_scaled_dot_product
+        if is_torch_version(">=", "2.1.0") and is_transformers_version(">=", "4.45"):
+            # in transformers 4.45 causal_mask const buffer was removed from the model
+            # if it still exists, it means legacy remote code was loaded
+            if hasattr(self._model.transformer.h[0].attn, "causal_mask"):
+                attn_fn = _codegen_wrapped_scaled_dot_product_legacy
+
         for layer in self._model.transformer.h:
             if is_torch_version(">=", "2.1.0") and not self._model.config.output_attentions:
                 orig_self_attn_fwd = layer.attn._attn
-                layer.attn._attn = types.MethodType(codegen_wrapped_scaled_dot_product, layer.attn)
+                layer.attn._attn = types.MethodType(attn_fn, layer.attn)
                 layer.attn._orig_attn = orig_self_attn_fwd
+        patch_update_causal_mask(self._model, "4.45.0", "transformer")
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
+        unpatch_update_causal_mask(self._model, "transformer")
         for layer in self._model.transformer.h:
             if hasattr(layer.attn, "_orig_attn"):
                 layer.attn._attn = layer.attn._orig_attn
@@ -2275,8 +2352,7 @@ def __enter__(self):
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
-        if hasattr(self._model.model, "_orig_update_causal_mask"):
-            self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask
+        unpatch_update_causal_mask(self._model)
         for layer in self._model.model.layers:
             if hasattr(layer.self_attn, "_orig_forward"):
                 layer.self_attn.forward = layer.self_attn._orig_forward
@@ -2413,8 +2489,7 @@ def __enter__(self):
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
-        if hasattr(self._model.model, "_orig_update_causal_mask"):
-            self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask
+        unpatch_update_causal_mask(self._model)
 
 
 class RotaryEmbPatcher(DecoderModelPatcher):
@@ -2425,12 +2500,119 @@ def __enter__(self):
                 _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
 
 
+def _falcon_update_causal_mask(
+    self,
+    attention_mask: torch.Tensor,
+    input_tensor: torch.Tensor,
+    cache_position: torch.Tensor,
+    past_key_values: "Cache",
+    output_attentions: bool,
+    head_mask: torch.Tensor,
+    alibi: torch.Tensor,
+):
+    # copied from  https://github.com/huggingface/transformers/blob/a30c865f991dfec9452cc64bd9a97bfbb96be036/src/transformers/models/falcon/modeling_falcon.py#L1130
+    from transformers.cache_utils import StaticCache
+    from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+
+    # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
+    # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
+    # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
+    # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
+
+    if hasattr(self, "_prepare_4d_causal_attention_mask_with_cache_position"):
+        _prepare_4d_causal_attention_mask_with_cache_position = (
+            self._prepare_4d_causal_attention_mask_with_cache_position
+        )
+    else:
+        from transformers.models.falcon.modeling_falcon import _prepare_4d_causal_attention_mask_with_cache_position
+
+    if self.config._attn_implementation == "flash_attention_2":
+        if attention_mask is not None and 0.0 in attention_mask:
+            return attention_mask
+        return None
+
+    # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+    # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+    # to infer the attention mask.
+    past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+    using_static_cache = isinstance(past_key_values, StaticCache)
+
+    # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+    if (
+        self.config._attn_implementation == "sdpa"
+        and not using_static_cache
+        and not output_attentions
+        and head_mask is None
+        and alibi is None
+    ):
+        if AttentionMaskConverter._ignore_causal_mask_sdpa(
+            attention_mask,
+            inputs_embeds=input_tensor,
+            past_key_values_length=past_seen_tokens,
+            is_training=self.training,
+        ):
+            return None
+
+    dtype, device = input_tensor.dtype, input_tensor.device
+    # difference from the original: use the float16 minimum instead of torch.finfo(dtype).min to prevent overflow during fp16/bf16 execution
+    min_dtype = torch.finfo(torch.float16).min
+    batch_size, sequence_length, _ = input_tensor.shape
+    if using_static_cache:
+        target_length = past_key_values.get_max_length()
+    else:
+        target_length = (
+            attention_mask.shape[-1]
+            if isinstance(attention_mask, torch.Tensor)
+            else past_seen_tokens + sequence_length
+        )
+
+    # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+    causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask,
+        sequence_length=sequence_length,
+        target_length=target_length,
+        dtype=dtype,
+        device=device,
+        min_dtype=min_dtype,
+        cache_position=cache_position,
+        batch_size=input_tensor.shape[0],
+    )
+
+    # We take care to integrate alibi bias in the causal_mask here
+    if head_mask is None and alibi is not None:
+        alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:])
+        causal_mask = torch.masked_fill(
+            alibi / math.sqrt(self.config.hidden_size // self.num_heads),
+            causal_mask < -1,
+            min_dtype,
+        )
+
+    if (
+        self.config._attn_implementation == "sdpa"
+        and attention_mask is not None
+        and attention_mask.device.type == "cuda"
+        and not output_attentions
+    ):
+        # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+        # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+        # Details: https://github.com/pytorch/pytorch/issues/110213
+        causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+    return causal_mask
+
+
 class FalconModelPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()
         if is_transformers_version("<", "4.44.99"):
             for layer in self._model.transformer.h:
                 _reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb)
+        else:
+            patch_update_causal_mask(self._model, "4.45.0", "transformer", _falcon_update_causal_mask)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        unpatch_update_causal_mask(self._model, "transformer")
 
 
 class GptNeoxModelPatcher(DecoderModelPatcher):
@@ -2439,6 +2621,22 @@ def __enter__(self):
         if is_transformers_version("<", "4.44.99"):
             for layer in self._model.gpt_neox.layers:
                 _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
+        else:
+            patch_update_causal_mask(self._model, "4.45.0", "gpt_neox")
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        unpatch_update_causal_mask(self._model, "gpt_neox")
+
+
+class GptJModelPatcher(DecoderModelPatcher):
+    def __enter__(self):
+        super().__enter__()
+        patch_update_causal_mask(self._model, "4.45.0", "transformer")
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        unpatch_update_causal_mask(self._model, "transformer")
 
 
 class GptNeoxJapaneseModelPatcher(DecoderModelPatcher):
@@ -2447,6 +2645,12 @@ def __enter__(self):
         if is_transformers_version("<", "4.44.99"):
             for layer in self._model.gpt_neox_japanese.layers:
                 _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
+        else:
+            patch_update_causal_mask(self._model, "4.45.0", "gpt_neox_japanese")
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        unpatch_update_causal_mask(self._model, "gpt_neox_japanese")
 
 
 class Gemma2ModelPatcher(LlamaModelPatcher):
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 916833602..0218f6d0e 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -773,6 +773,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "bloom",
         "chatglm",
         "codegen",
+        "codegen2",
         "gpt2",
         "gpt_neo",
         "gpt_neox",
@@ -821,10 +822,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
             "mistral-nemo",
         )
 
-    # custom modeling defined in https://huggingface.co/katuni4ka/tiny-random-codegen2 differs from transformers after v4.45 resulting in unadapted patching
-    if is_transformers_version("<", "4.45.0"):
-        SUPPORTED_ARCHITECTURES += ("codegen2",)
-
     GENERATION_LENGTH = 100
     REMOTE_CODE_MODELS = (
         "chatglm",

From 5c879b96cd6f4ceca34df8343a3f05c619308887 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Wed, 13 Nov 2024 08:46:07 +0400
Subject: [PATCH 42/53] fix config saving when check on misplaced args broken
 (#966)

* fix config saving when check on misplaced args broken

* add internvl test

* fix tests

* fix tests

* numeric stability in tests

* fix code style

* update and reuse preprocess_inputs

* Update optimum/exporters/openvino/utils.py

Co-authored-by: Nikita Savelyev 

* Update tests/openvino/test_modeling.py

Co-authored-by: Nikita Savelyev 

* change preprocess_inputs signature

* fix quantization after signature update

* fix preparing generation config

* Update optimum/intel/openvino/modeling_visual_language.py

Co-authored-by: Nikita Savelyev 

---------

Co-authored-by: Nikita Savelyev 
---
 optimum/exporters/openvino/convert.py         |   9 +-
 optimum/exporters/openvino/model_configs.py   |   2 +-
 optimum/exporters/openvino/utils.py           |  18 ++
 optimum/intel/openvino/modeling_base.py       |   6 +-
 .../intel/openvino/modeling_base_seq2seq.py   |   6 +-
 .../openvino/modeling_visual_language.py      | 215 ++++++++++++++++--
 optimum/intel/openvino/quantization.py        |   4 +-
 tests/openvino/test_modeling.py               | 101 ++++----
 tests/openvino/utils_tests.py                 |   1 +
 9 files changed, 296 insertions(+), 66 deletions(-)
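A hedged usage sketch of the reworked preprocess_inputs signature (text first, with optional image/processor/tokenizer keyword arguments); the checkpoint and image path are placeholders, not part of this patch:

    from PIL import Image
    from transformers import AutoProcessor
    from optimum.intel import OVModelForVisualCausalLM

    model_id = "llava-hf/llava-1.5-7b-hf"  # placeholder checkpoint
    model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
    processor = AutoProcessor.from_pretrained(model_id)

    image = Image.open("example.jpg")  # placeholder image
    inputs = model.preprocess_inputs(text="What is shown on the image?", image=image, processor=processor)
    outputs = model.generate(**inputs, max_new_tokens=20)
    print(processor.batch_decode(outputs, skip_special_tokens=True))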

diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index f046c32f8..fdee8a3ef 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -71,6 +71,7 @@
     _get_open_clip_submodels_fn_and_export_configs,
     clear_class_registry,
     remove_none_from_dummy_inputs,
+    save_config,
 )
 
 
@@ -684,7 +685,11 @@ def export_from_model(
         files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
     elif library_name != "diffusers":
         if is_transformers_version(">=", "4.44.99"):
-            misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
+            # some model configs may have issues with loading without parameter initialization
+            try:
+                misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
+            except KeyError:
+                misplaced_generation_parameters = {}
             if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
                 logger.warning(
                     "Moving the following attributes in the config to the generation config: "
@@ -696,7 +701,7 @@ def export_from_model(
                     setattr(model.config, param_name, None)
 
         # Saving the model config and preprocessor as this is needed sometimes.
-        model.config.save_pretrained(output)
+        save_config(model.config, output)
         generation_config = getattr(model, "generation_config", None)
         if generation_config is not None:
             try:
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index e8c8e5d13..876672db4 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -1464,7 +1464,7 @@ class InternVLChatConfigBehavior(str, enum.Enum):
 @register_in_tasks_manager("internvl-chat", *["image-text-to-text"], library_name="transformers")
 class InternVLChatOpenVINOConfig(OnnxConfig):
     SUPPORTED_BEHAVIORS = [model_type.value for model_type in InternVLChatConfigBehavior]
-    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
 
     def __init__(
diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py
index 9286a37f7..701334209 100644
--- a/optimum/exporters/openvino/utils.py
+++ b/optimum/exporters/openvino/utils.py
@@ -13,7 +13,9 @@
 #  limitations under the License.
 
 import inspect
+import logging
 from collections import namedtuple
+from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 from transformers.utils import is_torch_available
@@ -25,6 +27,9 @@
 from optimum.utils import is_diffusers_available
 
 
+logger = logging.getLogger(__name__)
+
+
 InputInfo = namedtuple("InputInfo", ["name", "shape", "type", "example"])
 
 
@@ -209,3 +214,16 @@ def get_submodels(model):
 
 
 MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv"]
+
+
+def save_config(config, save_dir):
+    try:
+        config.save_pretrained(save_dir)
+    except Exception as exp:
+        logger.warning(
+            f"Attempt to save config using standard API has failed with {exp}. There may be an issue with model config, please check its correctness before usage."
+        )
+        save_dir = Path(save_dir)
+        save_dir.mkdir(exist_ok=True, parents=True)
+        output_config_file = Path(save_dir / "config.json")
+        config.to_json_file(output_config_file, use_diff=True)
diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index ed3cdadb5..320d77c4c 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -136,7 +136,11 @@ def __init__(
             self.generation_config = generation_config or GenerationConfig.from_model_config(config)
 
             if is_transformers_version(">=", "4.44.99"):
-                misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+                # some model configs may have issues with loading without parameter initialization
+                try:
+                    misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+                except KeyError:
+                    misplaced_generation_parameters = {}
                 if len(misplaced_generation_parameters) > 0:
                     logger.warning(
                         "Moving the following attributes in the config to the generation config: "
diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py
index 06c601148..0ce15641f 100644
--- a/optimum/intel/openvino/modeling_base_seq2seq.py
+++ b/optimum/intel/openvino/modeling_base_seq2seq.py
@@ -84,7 +84,11 @@ def __init__(
         self.generation_config = generation_config or GenerationConfig.from_model_config(config)
 
         if is_transformers_version(">=", "4.44.99"):
-            misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+            # some model configs may have issues with loading without parameter initialization
+            try:
+                misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+            except KeyError:
+                misplaced_generation_parameters = {}
             if len(misplaced_generation_parameters) > 0:
                 logger.warning(
                     "Moving the following attributes in the config to the generation config: "
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 8f72a7353..b7bf96a0d 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -15,6 +15,7 @@
 from PIL.Image import Image
 from transformers import (
     AutoConfig,
+    AutoImageProcessor,
     GenerationConfig,
     GenerationMixin,
     PretrainedConfig,
@@ -24,6 +25,7 @@
 
 from ...exporters.openvino import main_export
 from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name
+from ...exporters.openvino.utils import save_config
 from .. import OVQuantizer
 from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel, OVModelPart
@@ -319,6 +321,13 @@ def compile(self):
             if part_model is not None:
                 part_model._compile()
 
+    def _save_config(self, save_directory):
+        """
+        Saves a model configuration into a directory, so that it can be re-loaded using the
+        [`from_pretrained`] class method.
+        """
+        save_config(self.config, save_directory)
+
     def _save_pretrained(self, save_directory: Union[str, Path]):
         """
         Saves the model to the OpenVINO IR format so that it can be re-loaded using the
@@ -728,9 +737,9 @@ def can_generate(self):
     @staticmethod
     @abstractmethod
     def preprocess_inputs(
-        processor,
         text: str,
         image: Optional[Image] = None,
+        processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
     ):
         """
@@ -902,15 +911,23 @@ def _filter_unattended_tokens(self, input_ids, attention_mask, past_key_values):
 
     @staticmethod
     def preprocess_inputs(
-        processor,
         text: str,
         image: Optional[Image] = None,
+        processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
     ):
-        if image is None:
-            raise ValueError("Image is required.")
-        chat_template = [{"role": "user", "content": [{"type": "text", "text": text}, {"type": "image"}]}]
-        prompt = processor.apply_chat_template(chat_template, add_generation_prompt=True)
+        if processor is None:
+            raise ValueError("Processor is required.")
+        if getattr(processor, "chat_template", None) is not None:
+            chat_prompt = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+            if image is not None:
+                chat_prompt[0]["content"].append({"type": "image"})
+            prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
+        else:
+            if image is not None and "<image>" not in text:
+                prompt = "<image>\n" + text
+            else:
+                prompt = text
         inputs = processor(images=image, text=prompt, return_tensors="pt")
         return inputs
 
@@ -1209,6 +1226,159 @@ def merge_vision_text_embeddings(
         input_embeds = input_embeds.reshape(B, N, C)
         return input_embeds, attention_mask, position_ids
 
+    def preprocess_inputs(
+        self,
+        text: str,
+        image: Optional[Image] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+    ):
+        if tokenizer is None:
+            raise ValueError("Tokenizer is required.")
+        import torchvision.transforms as T
+        from torchvision.transforms.functional import InterpolationMode
+
+        IMG_START_TOKEN = "<img>"
+        IMG_END_TOKEN = "</img>"
+        IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
+
+        IMAGENET_MEAN = (0.485, 0.456, 0.406)
+        IMAGENET_STD = (0.229, 0.224, 0.225)
+
+        def build_transform(input_size):
+            MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+            transform = T.Compose(
+                [
+                    T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+                    T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+                    T.ToTensor(),
+                    T.Normalize(mean=MEAN, std=STD),
+                ]
+            )
+            return transform
+
+        def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+            best_ratio_diff = float("inf")
+            best_ratio = (1, 1)
+            area = width * height
+            for ratio in target_ratios:
+                target_aspect_ratio = ratio[0] / ratio[1]
+                ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+                if ratio_diff < best_ratio_diff:
+                    best_ratio_diff = ratio_diff
+                    best_ratio = ratio
+                elif ratio_diff == best_ratio_diff:
+                    if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                        best_ratio = ratio
+            return best_ratio
+
+        def dynamic_preprocess(image, min_num=1, max_num=12, image_size=28, use_thumbnail=False):
+            orig_width, orig_height = image.size
+            aspect_ratio = orig_width / orig_height
+
+            # calculate the existing image aspect ratio
+            target_ratios = {
+                (i, j)
+                for n in range(min_num, max_num + 1)
+                for i in range(1, n + 1)
+                for j in range(1, n + 1)
+                if i * j <= max_num and i * j >= min_num
+            }
+            target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+            # find the closest aspect ratio to the target
+            target_aspect_ratio = find_closest_aspect_ratio(
+                aspect_ratio, target_ratios, orig_width, orig_height, image_size
+            )
+
+            # calculate the target width and height
+            target_width = image_size * target_aspect_ratio[0]
+            target_height = image_size * target_aspect_ratio[1]
+            blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+            # resize the image
+            resized_img = image.resize((target_width, target_height))
+            processed_images = []
+            for i in range(blocks):
+                box = (
+                    (i % (target_width // image_size)) * image_size,
+                    (i // (target_width // image_size)) * image_size,
+                    ((i % (target_width // image_size)) + 1) * image_size,
+                    ((i // (target_width // image_size)) + 1) * image_size,
+                )
+                # split the image
+                split_img = resized_img.crop(box)
+                processed_images.append(split_img)
+            assert len(processed_images) == blocks
+            if use_thumbnail and len(processed_images) != 1:
+                thumbnail_img = image.resize((image_size, image_size))
+                processed_images.append(thumbnail_img)
+            return processed_images
+
+        def load_image(image, input_size=448, max_num=12):
+            transform = build_transform(input_size=input_size)
+            images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+            pixel_values = [transform(image) for image in images]
+            pixel_values = torch.stack(pixel_values)
+            return pixel_values
+
+        if image is not None:
+            if "" not in text:
+                text = "\n" + text
+            pixel_values = load_image(image, input_size=self.config.vision_config.image_size)
+            num_patches = pixel_values.shape[0]
+            num_image_token = int(
+                (self.config.vision_config.image_size // self.config.vision_config.patch_size) ** 2
+                * (self.config.downsample_ratio**2)
+            )
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_image_token * num_patches + IMG_END_TOKEN
+            text = text.replace("<image>", image_tokens, 1)
+            text_inputs = tokenizer(text, return_tensors="pt")
+            inputs = dict(text_inputs)
+            inputs.update({"pixel_values": pixel_values})
+        else:
+            inputs = tokenizer(text, return_tensors="pt")
+        return inputs
+
+    # internvl has an issue with the _get_non_default_parameters check, so as a workaround we override _prepare_generation_config
+    def _prepare_generation_config(
+        self, generation_config: Optional[GenerationConfig], **kwargs: Dict
+    ) -> Tuple[GenerationConfig, Dict]:
+        using_model_generation_config = False
+        if generation_config is None:
+            if (
+                self.generation_config._from_model_config  # 1)
+                and self.generation_config._original_object_hash == hash(self.generation_config)  # 2)
+            ):
+                new_generation_config = GenerationConfig.from_model_config(self.config)
+                if new_generation_config != self.generation_config:  # 4)
+                    warnings.warn(
+                        "You have modified the pretrained model configuration to control generation. This is a"
+                        " deprecated strategy to control generation and will be removed in v5."
+                        " Please use and modify the model generation configuration (see"
+                        " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
+                        UserWarning,
+                    )
+                    self.generation_config = new_generation_config
+
+            generation_config = self.generation_config
+            using_model_generation_config = True
+
+        generation_config = copy.deepcopy(generation_config)
+        model_kwargs = generation_config.update(**kwargs)
+        # If `generation_config` is provided, let's fallback ALL special tokens to the default values for the model
+        if not using_model_generation_config:
+            if generation_config.bos_token_id is None:
+                generation_config.bos_token_id = self.generation_config.bos_token_id
+            if generation_config.eos_token_id is None:
+                generation_config.eos_token_id = self.generation_config.eos_token_id
+            if generation_config.pad_token_id is None:
+                generation_config.pad_token_id = self.generation_config.pad_token_id
+            if generation_config.decoder_start_token_id is None:
+                generation_config.decoder_start_token_id = self.generation_config.decoder_start_token_id
+
+        return generation_config, model_kwargs
+
 
 class _OVMiniCPMVForCausalLM(OVModelForVisualCausalLM):
     additional_parts = ["resampler"]
@@ -1430,14 +1600,22 @@ def merge_vision_text_embeddings(
 
     @staticmethod
     def preprocess_inputs(
-        processor,
         text: str,
         image: Optional[Image] = None,
+        processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
     ):
-        if image is None:
-            raise ValueError("Image is required.")
-        prompt = f"<|im_start|>user\n(./)\n{text}<|im_end|>\n<|im_start|>assistant\n"
+        if processor is None:
+            raise ValueError("Processor is required.")
+        if getattr(processor, "chat_template", None) is not None:
+            messages = [{"role": "user", "content": text if image is None else "(./)\n" + text}]
+            prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        else:
+            prompt = (
+                f"<|im_start|>user\n(./)\n{text}<|im_end|>\n<|im_start|>assistant\n"
+                if image is not None
+                else text
+            )
         inputs = processor([prompt], [image], return_tensors="pt")
         return inputs
 
@@ -1615,17 +1793,24 @@ def get_multimodal_embeddings(
 
     @staticmethod
     def preprocess_inputs(
-        processor,
         text: str,
         image: Optional[Image] = None,
+        processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
     ):
         if tokenizer is None:
             raise ValueError("Tokenizer is required.")
-        messages = [{"role": "user", "content": f"\n{text}"}]
-        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("")]
-        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+        if image is not None and processor is None:
+            raise ValueError("Processor is required.")
+        text_content = f"\n{text}" if image is not None else text
+        messages = [{"role": "user", "content": text_content}]
+        if tokenizer.chat_template is not None:
+            text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        if image is not None:
+            text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("<image>")]
+            input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+        else:
+            input_ids = tokenizer(text, return_tensors="pt").input_ids
         attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
         result = {"input_ids": input_ids, "attention_mask": attention_mask}
         if image is not None:
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 06cc16d04..a84b3e8f4 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -784,7 +784,9 @@ def _prepare_visual_causal_lm_dataset(self, config: OVWeightQuantizationConfig):
             image = Image.open(requests.get(image_url, stream=True).raw)
 
             try:
-                inputs = self.model.preprocess_inputs(processor, instruction, image, tokenizer)
+                inputs = self.model.preprocess_inputs(
+                    text=instruction, image=image, processor=processor, tokenizer=tokenizer
+                )
             except ValueError as value_error:
                 if "Tokenizer is required." in str(value_error) and tokenizer_error is not None:
                     raise tokenizer_error
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 0218f6d0e..12bb9e3e8 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -1880,9 +1880,9 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
     if is_transformers_version(">=", "4.40.0"):
         SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"]
     if is_transformers_version(">=", "4.45.0"):
-        SUPPORTED_ARCHITECTURES += ["minicpmv"]
-    REMOTE_CODE_MODELS = ["minicpmv", "nanollava"]
+        SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2"]
     TASK = "image-text-to-text"
+    REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava"]
 
     IMAGE = Image.open(
         requests.get(
@@ -1902,45 +1902,25 @@ def get_transformer_model_class(self, model_arch):
             return LlavaNextForConditionalGeneration
         return AutoModelForCausalLM
 
-    def gen_inputs(self, model_arch, base_text_prompt, image=None):
-        model_id = MODEL_NAMES[model_arch]
-        if "llava" in model_arch:
-            prompt = f"\n {base_text_prompt}"
-        elif "minicpmv" in model_arch:
-            prompt = "<|im_start|>user\n(./)\n {base_text_prompt}<|im_end|>\n<|im_start|>assistant\n"
-        if model_arch != "nanollava":
-            processor = AutoProcessor.from_pretrained(
-                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
-            )
-            inputs = processor(images=[self.IMAGE.resize((600, 600))], text=[prompt], return_tensors="pt")
-        else:
-            config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
-            processor = AutoProcessor.from_pretrained(
-                config.mm_vision_tower, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
-            )
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
-            )
-            image_input = None
-            if image is not None:
-                image_input = processor(images=image, return_tensors="pt")["pixel_values"]
-            text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
-
-            input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
-            attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
-            inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "images": image_input}
-        return inputs
-
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
+        prompt = "What is shown in this image?"
         model_id = MODEL_NAMES[model_arch]
+        set_seed(SEED)
         transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(
             model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
         )
+        transformers_model.eval()
+        if "internvl2" in model_arch:
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            img_context_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
+            transformers_model.img_context_token_id = img_context_token_id
         if "nanollava" in model_arch:
             transformers_model.get_vision_tower().load_model()
-        inputs = self.gen_inputs(model_arch, "What is shown on this image?", self.IMAGE)
-
+        preprocessors = self.get_preprocessors(model_arch)
+        set_seed(SEED)
         ov_model = OVModelForVisualCausalLM.from_pretrained(
             model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
         )
@@ -1951,13 +1931,18 @@ def test_compare_to_transformers(self, model_arch):
             self.assertTrue(hasattr(ov_model, additional_part))
             self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part])
         self.assertIsInstance(ov_model.config, PretrainedConfig)
-        # pytorch minicpmv is not designed to be used via forward
-        if "minicpmv" not in model_arch:
+        inputs = ov_model.preprocess_inputs(**preprocessors, text=prompt, image=self.IMAGE.resize((600, 600)))
+        # pytorch minicpmv and internvl are not designed to be used via forward
+        if model_arch not in ["minicpmv", "internvl2"]:
+            set_seed(SEED)
+            ov_outputs = ov_model(**inputs)
             set_seed(SEED)
             with torch.no_grad():
                 transformers_outputs = transformers_model(**inputs)
-            ov_outputs = ov_model(**inputs)
-            self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4))
+            self.assertTrue(
+                torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4),
+                f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}",
+            )
 
         ov_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
@@ -1972,9 +1957,11 @@ def test_compare_to_transformers(self, model_arch):
         set_seed(SEED)
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         set_seed(SEED)
-        transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
-        # original minicpmv always skip input tokens in generation results, while transformers based approach provide them
-        if model_arch == "minicpmv":
+        with torch.no_grad():
+            transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
+
+        # the original minicpmv and internvl implementations always skip input tokens in generation results, while the transformers-based approach keeps them
+        if model_arch in ["minicpmv", "internvl2"]:
             ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         self.assertTrue(
             torch.equal(ov_outputs, transformers_outputs),
@@ -2047,18 +2034,19 @@ def test_generate_utils(self, model_arch):
         model = OVModelForVisualCausalLM.from_pretrained(
             model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
         )
+
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
-        inputs = self.gen_inputs(model_arch, "What is shown on this image?", self.IMAGE)
+        question = "Describe image"
+        preprocessors = self.get_preprocessors(model_arch)
+        inputs = model.preprocess_inputs(**preprocessors, text=question, image=self.IMAGE.resize((600, 600)))
         # General case
         outputs = model.generate(**inputs, max_new_tokens=10)
-        # filter out original prompt becuase it may contains out of tokenizer tokens e.g. in nanollva text separator = -200
-        outputs = outputs[:, inputs["input_ids"].shape[1] :]
-        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
         self.assertIsInstance(outputs[0], str)
 
         # No input image case
         question = "Hi, how are you?"
-        inputs = self.gen_inputs(model_arch, question, None)
+        inputs = model.preprocess_inputs(**preprocessors, text=question, image=None)
         outputs = model.generate(**inputs, max_new_tokens=10)
         # filter out original prompt becuase it may contains out of tokenizer tokens e.g. in nanollva text separator = -200
         outputs = outputs[:, inputs["input_ids"].shape[1] :]
@@ -2068,6 +2056,29 @@ def test_generate_utils(self, model_arch):
 
         gc.collect()
 
+    def get_preprocessors(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+        if model_arch == "nanollava":
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+            processor = AutoProcessor.from_pretrained(
+                config.mm_vision_tower, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            preprocessors = {"processor": processor, "tokenizer": tokenizer}
+        elif model_arch == "internvl2":
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            preprocessors = {"processor": None, "tokenizer": tokenizer}
+        else:
+            processor = AutoProcessor.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            preprocessors = {"processor": processor, "tokenizer": None}
+        return preprocessors
+
 
 class OVModelForSpeechSeq2SeqIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = ("whisper",)
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index fc3d97e24..313120833 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -71,6 +71,7 @@
     "ibert": "hf-internal-testing/tiny-random-ibert",
     "internlm": "katuni4ka/tiny-random-internlm",
     "internlm2": "katuni4ka/tiny-random-internlm2",
+    "internvl2": "katuni4ka/tiny-random-internvl2",
     "jais": "katuni4ka/tiny-random-jais",
     "levit": "hf-internal-testing/tiny-random-LevitModel",
     "longt5": "hf-internal-testing/tiny-random-longt5",

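Note: with this refactoring, preprocess_inputs is called with keyword arguments and each architecture-specific subclass builds its own prompt. A minimal usage sketch (the checkpoint name, prompt and image path below are placeholders, not part of this patch):

    from PIL import Image
    from transformers import AutoTokenizer
    from optimum.intel import OVModelForVisualCausalLM

    model_id = "katuni4ka/tiny-random-internvl2"  # placeholder checkpoint used in the tests above
    model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # internvl2 only needs a tokenizer; other architectures pass a processor instead (see get_preprocessors)
    inputs = model.preprocess_inputs(
        text="Describe image", image=Image.open("cat.png"), processor=None, tokenizer=tokenizer
    )
    outputs = model.generate(**inputs, max_new_tokens=10)
    print(tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0])
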
From 41637d0f37d55bbded064c7fdf6f178d0df1beb5 Mon Sep 17 00:00:00 2001
From: Liubov Talamanova 
Date: Wed, 13 Nov 2024 11:27:55 +0000
Subject: [PATCH 43/53] Introduce support for NF4 data type for OV weight
 compression (#988)

* Add NF4 weight format

* remove test

* Update optimum/intel/openvino/configuration.py

Co-authored-by: Nikita Savelyev 

* Update optimum/intel/openvino/configuration.py

Co-authored-by: Nikita Savelyev 

* Add extra checks

* apply black

---------

Co-authored-by: Nikita Savelyev 
---
 optimum/commands/export/openvino.py     |  2 +-
 optimum/intel/openvino/configuration.py | 18 ++++++++++--------
 optimum/intel/openvino/quantization.py  |  2 ++
 tests/openvino/test_exporters_cli.py    |  3 ++-
 tests/openvino/test_quantization.py     |  7 +++++++
 tests/openvino/utils_tests.py           |  3 +++
 6 files changed, 25 insertions(+), 10 deletions(-)

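A minimal usage sketch of the new format (the model id and output path are placeholders); nf4 requires bits=4 and, per the checks added below, AWQ and scale estimation are rejected for it:

    from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

    quantization_config = OVWeightQuantizationConfig(bits=4, weight_format="nf4", group_size=32)
    model = OVModelForCausalLM.from_pretrained("gpt2", export=True, quantization_config=quantization_config)
    model.save_pretrained("gpt2-ov-nf4")

The same compression is reachable from the CLI via the --weight-format nf4 choice added below.
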
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index f092d5cb2..32c8c0dc1 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -71,7 +71,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--weight-format",
         type=str,
-        choices=["fp32", "fp16", "int8", "int4", "mxfp4"],
+        choices=["fp32", "fp16", "int8", "int4", "mxfp4", "nf4"],
         default=None,
         help="The weight format of the exported model.",
     )
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 6892d9308..1dba6c32f 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -347,7 +347,7 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
             Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and
             compressed layers. Providing a dataset is required to run scale estimation.
         weight_format (`str`, defaults to 'int'):
-            Data format weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4'].
+            Data format weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4', 'nf4'].
         qptq (`bool`, *optional*):
             Whether to apply GPTQ algorithm. GPTQ optimizes compressed weights in a layer-wise fashion to minimize the
             difference between activations of a compressed and original layer. Dataset is required to run GPTQ.
@@ -455,20 +455,22 @@ def post_init(self):
 
         if self.weight_format is None:
             self.weight_format = "int4" if self.bits == 4 else "int8"
-        if self.weight_format not in ["int4", "int8", "mxfp4"]:
+        if self.weight_format not in ["int4", "int8", "mxfp4", "nf4"]:
             raise ValueError(
-                f"Weight format must be one of the following: ['int4', 'int8', 'mxfp4'], but found: {self.weight_format}."
+                f"Weight format must be one of the following: ['int4', 'int8', 'mxfp4', 'nf4'], but found: {self.weight_format}."
             )
-        if self.weight_format == "mxfp4":
+        if self.weight_format in ["mxfp4", "nf4"]:
             if self.bits != 4:
                 raise ValueError(
-                    f"When applying weight compression with 'mxfp4' weight format the `bits` parameters must be set to 4, but found {self.bits}"
+                    f"When applying weight compression with '{self.weight_format}' weight format, the `bits` parameter must be set to 4, but found {self.bits}"
                 )
             if self.quant_method == OVQuantizationMethod.AWQ:
-                raise ValueError("The AWQ algorithm is not supported for 'mxfp4' weight format")
+                raise ValueError(f"The AWQ algorithm is not supported for '{self.weight_format}' weight format")
             if self.scale_estimation:
-                raise ValueError("The Scale Estimation algorithm is not supported for 'mxfp4' weight format")
-            if self.gptq:
+                raise ValueError(
+                    f"The Scale Estimation algorithm is not supported for '{self.weight_format}' weight format"
+                )
+            if self.weight_format == "mxfp4" and self.gptq:
                 raise ValueError("The GPTQ algorithm is not supported for 'mxfp4' weight format")
 
 
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index a84b3e8f4..899153626 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -930,6 +930,8 @@ def _weight_only_quantization(
 
     if config.weight_format == "mxfp4":
         mode = CompressWeightsMode.E2M1
+    elif config.weight_format == "nf4":
+        mode = CompressWeightsMode.NF4
     else:
         if config.bits == 8:
             mode = CompressWeightsMode.INT8_SYM if config.sym else CompressWeightsMode.INT8_ASYM
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 91e19ff7a..be73b6815 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -108,6 +108,7 @@ class OVCLIExportTestCase(unittest.TestCase):
         ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", {"int8": 4, "int4": 72}),
         ("text-generation-with-past", "opt125m", "int4 --group-size 64", {"int8": 4, "int4": 144}),
         ("text-generation-with-past", "opt125m", "mxfp4", {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}),
+        ("text-generation-with-past", "opt125m", "nf4", {"int8": 4, "nf4": 72}),
         ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 8 --all-layers", {"int4": 16}),
         (
             "text-generation-with-past",
@@ -267,7 +268,7 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in
             self.assertEqual(exp_num_fq, num_fq)
 
     @parameterized.expand(TEST_4BIT_CONFIGURATIONS)
-    def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_num_weight_nodes: dict):
+    def test_exporters_cli_4bit(self, task: str, model_type: str, option: str, expected_num_weight_nodes: dict):
         with TemporaryDirectory() as tmpdir:
             result = subprocess.run(
                 f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 3ee055e80..7a415c3a3 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -206,6 +206,13 @@ class OVWeightCompressionTest(unittest.TestCase):
             dict(bits=4, weight_format="mxfp4", group_size=32),
             {"f4e2m1": 20, "f8e8m0": 20, "int8": 4},
         ),
+        (
+            OVModelForCausalLM,
+            "gpt2",
+            False,
+            dict(bits=4, weight_format="nf4", group_size=32),
+            {"nf4": 20, "int8": 4},
+        ),
         (
             OVModelForCausalLM,
             "gpt2",
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 313120833..3822b7646 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -195,6 +195,7 @@ def get_num_quantized_nodes(model):
         "int4": 0,
         "f4e2m1": 0,
         "f8e8m0": 0,
+        "nf4": 0,
     }
     ov_model = model if isinstance(model, ov.Model) else model.model
     for elem in ov_model.get_ops():
@@ -210,4 +211,6 @@ def get_num_quantized_nodes(model):
                 num_weight_nodes["f4e2m1"] += 1
             if type_name == "f8e8m0":
                 num_weight_nodes["f8e8m0"] += 1
+            if type_name == "nf4":
+                num_weight_nodes["nf4"] += 1
     return num_fake_quantize, num_weight_nodes

From febc50e59acb27d65dffb505416da1cc11f54838 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Thu, 14 Nov 2024 18:37:11 +0400
Subject: [PATCH 44/53] phi3 vision (#977)

* wip

* wip wip

* fix images processing

* add test

* add input preprocessing

* Update tests/openvino/test_modeling.py

* Update optimum/exporters/openvino/__main__.py

* Update optimum/intel/openvino/modeling_visual_language.py

Co-authored-by: Nikita Savelyev 

* refactor export configs

---------

Co-authored-by: Nikita Savelyev 
---
 optimum/exporters/openvino/__main__.py        |   6 +
 optimum/exporters/openvino/convert.py         |  17 +-
 optimum/exporters/openvino/model_configs.py   | 426 +++++++++---------
 optimum/exporters/openvino/model_patcher.py   |  21 +
 optimum/exporters/openvino/utils.py           |   2 +-
 .../openvino/modeling_visual_language.py      | 165 ++++++-
 tests/openvino/test_modeling.py               |   4 +-
 tests/openvino/utils_tests.py                 |   1 +
 8 files changed, 426 insertions(+), 216 deletions(-)

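A rough usage sketch for the new phi3-v support (the checkpoint name and image path are placeholders, and it is assumed here that the processor-based preprocess_inputs path applies to phi3-v):

    from PIL import Image
    from transformers import AutoProcessor
    from optimum.intel import OVModelForVisualCausalLM

    model_id = "microsoft/Phi-3-vision-128k-instruct"  # placeholder phi3-v checkpoint
    model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

    inputs = model.preprocess_inputs(
        text="What is shown in this image?", image=Image.open("cat.png"), processor=processor
    )
    outputs = model.generate(**inputs, max_new_tokens=20)
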
diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
index ee61563c9..dba4628d7 100644
--- a/optimum/exporters/openvino/__main__.py
+++ b/optimum/exporters/openvino/__main__.py
@@ -45,6 +45,8 @@
 from .utils import _MAX_UNCOMPRESSED_SIZE, MULTI_MODAL_TEXT_GENERATION_MODELS, clear_class_registry
 
 
+FORCE_ATTN_MODEL_CLASSES = {"phi3-v": "eager"}
+
 if TYPE_CHECKING:
     from optimum.intel.openvino.configuration import OVConfig
 
@@ -264,6 +266,10 @@ def main_export(
 
         if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED:
             loading_kwargs["attn_implementation"] = "eager"
+
+        # some models force flash_attn attention by default, which does not support loading the model on CPU
+        if is_transformers_version(">=", "4.36") and model_type in FORCE_ATTN_MODEL_CLASSES:
+            loading_kwargs["_attn_implementation"] = FORCE_ATTN_MODEL_CLASSES[model_type]
         # there are some difference between remote and in library representation of past key values for some models,
         # for avoiding confusion we disable remote code for them
         if (
diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index fdee8a3ef..a84ecfabd 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -712,7 +712,18 @@ def export_from_model(
                 )
 
         model_name_or_path = model.config._name_or_path
-        maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code)
+        if preprocessors is not None:
+            # the phi3-vision processor has no chat_template attribute, which breaks saving the processor to disk
+            if is_transformers_version(">=", "4.45") and model_type == "phi3-v" and len(preprocessors) > 1:
+                if not hasattr(preprocessors[1], "chat_template"):
+                    preprocessors[1].chat_template = getattr(preprocessors[0], "chat_template", None)
+            for processor in preprocessors:
+                try:
+                    processor.save_pretrained(output)
+                except Exception as ex:
+                    logger.error(f"Saving {type(processor)} failed with {ex}")
+        else:
+            maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code)
 
         files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
 
@@ -891,6 +902,10 @@ def _get_multi_modal_submodels_and_export_configs(
     if model_type == "internvl-chat" and preprocessors is not None:
         model.config.img_context_token_id = preprocessors[0].convert_tokens_to_ids("<IMG_CONTEXT>")
 
+    if model_type == "phi3-v":
+        model.config.glb_GN = model.model.vision_embed_tokens.glb_GN.tolist()
+        model.config.sub_GN = model.model.vision_embed_tokens.sub_GN.tolist()
+
     if hasattr(model, "image_newline"):
         model.config.image_newline = model.image_newline.tolist()
     main_config_cls = TasksManager.get_exporter_config_constructor(
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 876672db4..b8310882b 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -86,6 +86,7 @@
     MPTModelPatcher,
     PersimmonModelPatcher,
     Phi3ModelPatcher,
+    Phi3VisionImageEmbeddingsPatcher,
     QwenModelPatcher,
     RotaryEmbPatcher,
     UpdateCausalMaskModelPatcher,
@@ -1292,6 +1293,48 @@ def patch_model_for_export(
         return InputEmbeddingPatcher(self, model, model_kwargs)
 
 
+def get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype):
+    model_type = model_type.replace("_", "-")
+
+    if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
+        raise ValueError(
+            f"Unsupported language model type provided `{model_type}`. Please define custom export config"
+        )
+
+    if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
+        raise ValueError(
+            f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
+        )
+    export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]["text-generation-with-past"]
+    export_config = export_config_class(
+        model_config,
+        use_past=True,
+        use_past_in_inputs=True,
+        int_dtype=int_dtype,
+        float_dtype=float_dtype,
+    )
+    return export_config
+
+
+def get_vlm_text_embeddings_config(model_type, model_config, int_dtype, float_dtype):
+    internal_export_config = get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype)
+    InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
+    export_config = InputEmbedOpenvVINOConfig(
+        model_config,
+        task="feature-extraction",
+        int_dtype=int_dtype,
+        float_dtype=float_dtype,
+    )
+    return export_config
+
+
+def get_vlm_text_generation_config(model_type, model_config, int_dtype, float_dtype):
+    internal_export_config = get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype)
+    export_config = LMInputEmbedsConfigHelper(internal_export_config)
+    export_config._normalized_config = internal_export_config._normalized_config
+    return export_config
+
+
 class LlavaConfigBehavior(str, enum.Enum):
     LANGUAGE = "language"
     VISION_EMBEDDINGS = "vision_embeddings"
@@ -1355,61 +1398,15 @@ def with_behavior(
 
         if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
             model_type = self._orig_config.text_config.model_type
-            model_type = model_type.replace("_", "-")
-            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
-                raise ValueError(
-                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
-                )
-
-            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
-                raise ValueError(
-                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
-                )
-            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
-                "text-generation-with-past"
-            ]
-            internal_export_config = internal_export_config_class(
-                self._orig_config.text_config,
-                use_past=True,
-                use_past_in_inputs=True,
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
+            return get_vlm_text_embeddings_config(
+                model_type, self._orig_config.text_config, self.int_dtype, self.float_dtype
             )
-            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
-            export_config = InputEmbedOpenvVINOConfig(
-                self._orig_config.text_config,
-                task="feature-extraction",
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
-            )
-            return export_config
 
         if behavior == LlavaConfigBehavior.LANGUAGE:
             model_type = self._orig_config.text_config.model_type
-            model_type = model_type.replace("_", "-")
-
-            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
-                raise ValueError(
-                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
-                )
-
-            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
-                raise ValueError(
-                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
-                )
-            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
-                "text-generation-with-past"
-            ]
-            internal_export_config = internal_export_config_class(
-                self._orig_config.text_config,
-                use_past=True,
-                use_past_in_inputs=True,
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
+            return get_vlm_text_generation_config(
+                model_type, self._orig_config.text_config, self.int_dtype, self.float_dtype
             )
-            export_config = LMInputEmbedsConfigHelper(internal_export_config)
-            export_config._normalized_config = internal_export_config._normalized_config
-            return export_config
 
         if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
             return self.__class__(
@@ -1517,61 +1514,15 @@ def with_behavior(
 
         if behavior == InternVLChatConfigBehavior.TEXT_EMBEDDINGS:
             model_type = self._orig_config.llm_config.model_type
-            model_type = model_type.replace("_", "-")
-            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
-                raise ValueError(
-                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
-                )
-
-            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
-                raise ValueError(
-                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
-                )
-            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
-                "text-generation-with-past"
-            ]
-            internal_export_config = internal_export_config_class(
-                self._orig_config.llm_config,
-                use_past=True,
-                use_past_in_inputs=True,
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
-            )
-            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
-            export_config = InputEmbedOpenvVINOConfig(
-                self._orig_config.llm_config,
-                task="feature-extraction",
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
+            return get_vlm_text_embeddings_config(
+                model_type, self._orig_config.llm_config, self.int_dtype, self.float_dtype
             )
-            return export_config
 
         if behavior == InternVLChatConfigBehavior.LANGUAGE:
             model_type = self._orig_config.llm_config.model_type
-            model_type = model_type.replace("_", "-")
-
-            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
-                raise ValueError(
-                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
-                )
-
-            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
-                raise ValueError(
-                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
-                )
-            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
-                "text-generation-with-past"
-            ]
-            internal_export_config = internal_export_config_class(
-                self._orig_config.llm_config,
-                use_past=True,
-                use_past_in_inputs=True,
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
+            return get_vlm_text_generation_config(
+                model_type, self._orig_config.llm_config, self.int_dtype, self.float_dtype
             )
-            export_config = LMInputEmbedsConfigHelper(internal_export_config)
-            export_config._normalized_config = internal_export_config._normalized_config
-            return export_config
 
         if behavior == InternVLChatConfigBehavior.VISION_EMBEDDINGS:
             return self.__class__(
@@ -1583,7 +1534,8 @@ def with_behavior(
                 preprocessors=self._preprocessors,
             )
 
-    def get_model_for_behavior(self, model, behavior: Union[str, LlavaConfigBehavior]):
+    @staticmethod
+    def get_model_for_behavior(model, behavior: Union[str, LlavaConfigBehavior]):
         if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
             behavior = InternVLChatConfigBehavior(behavior)
 
@@ -1653,7 +1605,8 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
             return {}
         return {"last_hidden_state": {0: "batch_size"}}
 
-    def get_model_for_behavior(self, model, behavior: Union[str, LlavaConfigBehavior]):
+    @staticmethod
+    def get_model_for_behavior(model, behavior: Union[str, LlavaConfigBehavior]):
         if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
             behavior = LlavaConfigBehavior(behavior)
 
@@ -1684,61 +1637,11 @@ def with_behavior(
 
         if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
             model_type = self._orig_config.model_type.replace("llava-", "")
-            model_type = model_type.replace("_", "-")
-            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
-                raise ValueError(
-                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
-                )
-
-            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
-                raise ValueError(
-                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
-                )
-            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
-                "text-generation-with-past"
-            ]
-            internal_export_config = internal_export_config_class(
-                self._orig_config,
-                use_past=True,
-                use_past_in_inputs=True,
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
-            )
-            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
-            export_config = InputEmbedOpenvVINOConfig(
-                self._orig_config,
-                task="feature-extraction",
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
-            )
-            return export_config
+            return get_vlm_text_embeddings_config(model_type, self._orig_config, self.int_dtype, self.float_dtype)
 
         if behavior == LlavaConfigBehavior.LANGUAGE:
             model_type = self._orig_config.model_type.replace("llava-", "")
-            model_type = model_type.replace("_", "-")
-
-            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
-                raise ValueError(
-                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
-                )
-
-            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
-                raise ValueError(
-                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
-                )
-            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
-                "text-generation-with-past"
-            ]
-            internal_export_config = internal_export_config_class(
-                self._orig_config,
-                use_past=True,
-                use_past_in_inputs=True,
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
-            )
-            export_config = LMInputEmbedsConfigHelper(internal_export_config)
-            export_config._normalized_config = internal_export_config._normalized_config
-            return export_config
+            return get_vlm_text_generation_config(model_type, self._orig_config, self.int_dtype, self.float_dtype)
 
         if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
             return self.__class__(
@@ -2090,62 +1993,10 @@ def with_behavior(
             behavior = MiniCPMVConfigBehavior(behavior)
 
         if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
-            model_type = "qwen2"
-            model_type = model_type.replace("_", "-")
-            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
-                raise ValueError(
-                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
-                )
-
-            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
-                raise ValueError(
-                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
-                )
-            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
-                "text-generation-with-past"
-            ]
-            internal_export_config = internal_export_config_class(
-                self._orig_config,
-                use_past=True,
-                use_past_in_inputs=True,
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
-            )
-            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
-            export_config = InputEmbedOpenvVINOConfig(
-                self._orig_config,
-                task="feature-extraction",
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
-            )
-            return export_config
+            return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
 
         if behavior == MiniCPMVConfigBehavior.LANGUAGE:
-            model_type = "qwen2"
-            model_type = model_type.replace("_", "-")
-
-            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
-                raise ValueError(
-                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
-                )
-
-            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
-                raise ValueError(
-                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
-                )
-            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
-                "text-generation-with-past"
-            ]
-            internal_export_config = internal_export_config_class(
-                self._orig_config,
-                use_past=True,
-                use_past_in_inputs=True,
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
-            )
-            export_config = LMInputEmbedsConfigHelper(internal_export_config)
-            export_config._normalized_config = internal_export_config._normalized_config
-            return export_config
+            return get_vlm_text_generation_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
 
         if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
             return self.__class__(
@@ -2167,7 +2018,8 @@ def with_behavior(
                 preprocessors=self._preprocessors,
             )
 
-    def get_model_for_behavior(self, model, behavior: Union[str, MiniCPMVConfigBehavior]):
+    @staticmethod
+    def get_model_for_behavior(model, behavior: Union[str, MiniCPMVConfigBehavior]):
         if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
             behavior = MiniCPMVConfigBehavior(behavior)
 
@@ -2196,3 +2048,159 @@ def patch_model_for_export(
             return MiniCPMVResamplerModelPatcher(self, model, model_kwargs)
 
         return super().patch_model_for_export(model, model_kwargs)
+
+
+class Phi3VisionConfigBehavior(str, enum.Enum):
+    LANGUAGE = "language"
+    VISION_PROJECTION = "vision_projection"
+    VISION_EMBEDDINGS = "vision_embeddings"
+    TEXT_EMBEDDINGS = "text_embeddings"
+
+
+class DummyPhi3VisionProjectionInputGenerator(DummyVisionInputGenerator):
+    SUPPORTED_INPUT_NAMES = ("input",)
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        width: int = 336,
+        height: int = 336,
+        **kwargs,
+    ):
+        self.batch_size = batch_size
+        self._embed_layer_realization = normalized_config.config.embd_layer["embedding_cls"]
+        self.image_dim_out = normalized_config.config.img_processor["image_dim_out"]
+        self.height = height
+        self.width = width
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        h = self.height // 336
+        w = self.width // 336
+        feat_size = (h * w + 1) * 144 + 1 + (h + 1) * 12
+        if self._embed_layer_realization == "linear":
+            shape = [self.batch_size, feat_size, self.image_dim_out]
+        else:
+            shape = [self.batch_size, feat_size, self.image_dim_out * 4]
+        return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
+
+
+@register_in_tasks_manager("phi3-v", *["image-text-to-text"], library_name="transformers")
+class Phi3VisionOpenVINOConfig(OnnxConfig):
+    SUPPORTED_BEHAVIORS = [model_type.value for model_type in Phi3VisionConfigBehavior]
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
+    MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
+
+    def __init__(
+        self,
+        config: "PretrainedConfig",
+        task: str = "feature-extraction",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+        behavior: Phi3VisionConfigBehavior = Phi3VisionConfigBehavior.VISION_EMBEDDINGS,
+        preprocessors: Optional[List[Any]] = None,
+    ):
+        super().__init__(
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+        )
+        self._behavior = behavior
+        self._orig_config = config
+        if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "img_processor"):
+            self._config = AutoConfig.from_pretrained(
+                config.img_processor["model_name"], trust_remote_code=True
+            ).vision_config
+            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
+            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
+        if self._behavior == Phi3VisionConfigBehavior.VISION_PROJECTION and hasattr(config, "img_processor"):
+            self._config = config
+            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
+            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyPhi3VisionProjectionInputGenerator,)
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS:
+            return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}}
+        if self._behavior == Phi3VisionConfigBehavior.VISION_PROJECTION:
+            return {"input": {0: "batch_size", 1: "img_feat_size"}}
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        if self._behavior in [Phi3VisionConfigBehavior.VISION_EMBEDDINGS, Phi3VisionConfigBehavior.VISION_PROJECTION]:
+            return {"last_hidden_state": {0: "batch_size", 1: "height_width_projection"}}
+        return {}
+
+    def with_behavior(
+        self,
+        behavior: Union[str, Phi3VisionConfigBehavior],
+    ):
+        """
+        Creates a config for different behaviour.
+        Args:
+            behavior ([`ConfigBehavior`]):
+                The behavior to use for the new instance.
+        """
+        if isinstance(behavior, str) and not isinstance(behavior, Phi3VisionConfigBehavior):
+            behavior = Phi3VisionConfigBehavior(behavior)
+
+        if behavior == Phi3VisionConfigBehavior.TEXT_EMBEDDINGS:
+            return get_vlm_text_embeddings_config("phi3", self._orig_config, self.int_dtype, self.float_dtype)
+
+        if behavior == Phi3VisionConfigBehavior.LANGUAGE:
+            return get_vlm_text_generation_config("phi3", self._orig_config, self.int_dtype, self.float_dtype)
+
+        if behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS:
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+        if behavior == Phi3VisionConfigBehavior.VISION_PROJECTION:
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+
+    @staticmethod
+    def get_model_for_behavior(model, behavior: Union[str, Phi3VisionConfigBehavior]):
+        if isinstance(behavior, str) and not isinstance(behavior, Phi3VisionConfigBehavior):
+            behavior = Phi3VisionConfigBehavior(behavior)
+
+        if behavior == Phi3VisionConfigBehavior.LANGUAGE:
+            return model
+
+        if behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS:
+            vision_embeddings = model.model.vision_embed_tokens
+            vision_embeddings.config = model.config
+            return vision_embeddings
+
+        if behavior == Phi3VisionConfigBehavior.VISION_PROJECTION:
+            projection = model.model.vision_embed_tokens.img_projection
+            projection.config = model.config
+            return projection
+
+        if behavior == Phi3VisionConfigBehavior.TEXT_EMBEDDINGS:
+            text_embedding = model.model.embed_tokens
+            text_embedding.config = model.config
+            return text_embedding
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        model_kwargs = model_kwargs or {}
+        if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS:
+            return Phi3VisionImageEmbeddingsPatcher(self, model, model_kwargs)
+        return super().patch_model_for_export(model, model_kwargs)
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 7406e1370..58659e637 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -1369,6 +1369,7 @@ def phi3_442_forward(
     output_attentions: Optional[bool] = None,
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
+    **kwargs,
 ) -> Union[Tuple, BaseModelOutputWithPast]:
     from transformers.cache_utils import Cache, DynamicCache
     from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
@@ -3216,3 +3217,23 @@ def forward(self, input):
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         self._model.forward = self._model.__orig_forward
+
+
+def phi3_vision_embeddings_forward(self, pixel_values: torch.FloatTensor):
+    return self.get_img_features(pixel_values)
+
+
+class Phi3VisionImageEmbeddingsPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+        model.forward = types.MethodType(phi3_vision_embeddings_forward, model)
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py
index 701334209..7fb1bb5f1 100644
--- a/optimum/exporters/openvino/utils.py
+++ b/optimum/exporters/openvino/utils.py
@@ -213,7 +213,7 @@ def get_submodels(model):
     return custom_export, fn_get_submodels
 
 
-MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv"]
+MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv", "phi3-v"]
 
 
 def save_config(config, save_dir):
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index b7bf96a0d..35d91488d 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -230,7 +230,15 @@ def forward(self, image_feature, pos_embed, key_padding_mask):
         return result
 
 
-MODEL_PARTS_CLS_MAPPING = {"resampler": OVResampler}
+class OVVisionProjection(OVModelPart):
+    _model_name = "vision_projection"
+
+    def forward(self, img_features):
+        self._compile()
+        return self.request(img_features)[0]
+
+
+MODEL_PARTS_CLS_MAPPING = {"resampler": OVResampler, "vision_projection": OVVisionProjection}
 
 
 class OVModelForVisualCausalLM(OVBaseModel, GenerationMixin):
@@ -1802,8 +1810,8 @@ def preprocess_inputs(
             raise ValueError("Tokenizer is required.")
         if image is not None and processor is None:
             raise ValueError("Processor is required.")
-        text_content = f"\n{text}" if image is not None else text
-        messages = [{"role": "user", "content": text_content}]
+        text = f"\n{text}" if image is not None else text
+        messages = [{"role": "user", "content": text}]
         if tokenizer.chat_template is not None:
             text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         if image is not None:
@@ -1818,10 +1826,161 @@ def preprocess_inputs(
         return result
 
 
+class _OVPhi3VisionForCausalLM(OVModelForVisualCausalLM):
+    additional_parts = ["vision_projection"]
+
+    def __init__(
+        self,
+        language_model: ov.Model,
+        text_embeddings: ov.Model,
+        vision_embeddings: ov.Model,
+        config: PretrainedConfig = None,
+        device: str = "CPU",
+        dynamic_shapes: bool = True,
+        ov_config: Optional[Dict[str, str]] = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            language_model,
+            text_embeddings,
+            vision_embeddings,
+            config,
+            device,
+            dynamic_shapes,
+            ov_config,
+            model_save_dir,
+            quantization_config,
+            **kwargs,
+        )
+        self.sub_GN = torch.tensor(self.config.sub_GN)
+        self.glb_GN = torch.tensor(self.config.glb_GN)
+
+    def get_vision_embeddings(self, pixel_values, image_sizes, **kwargs):
+        num_images, num_crops, c, h, w = pixel_values.shape
+        img_features = self.vision_embeddings(pixel_values.flatten(0, 1)).last_hidden_state.reshape(
+            num_images, num_crops, -1, self.config.img_processor["image_dim_out"]
+        )
+        image_features_proj = self.hd_feature_transform(img_features, image_sizes)
+        return image_features_proj
+
+    def hd_feature_transform(self, image_features, image_sizes):
+        """
+        image_features: (num_images, num_crops+1, 24*24, 1024)
+        """
+
+        image_features = torch.from_numpy(image_features)
+        global_image_features = image_features[:, 0]  # (num_images, 24*24, 1024)
+        # global feature can be viewed as a special HD case with num_crops 1x1
+        global_image_features_hd = self.reshape_hd_patches_2x2merge(global_image_features, 1, 1)
+        global_image_features_hd_newline = self.add_image_newline(global_image_features_hd)
+
+        all_image_embeddings = []
+        # need a for loop to process each image because of different image sizes
+        # (patch arrangement is different for each image)
+        for i, img_size in enumerate(image_sizes):
+            h, w = img_size
+            h_crop = h // 336
+            w_crop = w // 336
+            num_crops = h_crop * w_crop
+
+            # NOTE: real num_crops is padded
+            # (num_crops, 24*24, 1024)
+            sub_image_features = image_features[i, 1 : 1 + num_crops]
+            sub_image_features_hd = self.reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop)
+            sub_image_features_hd_newline = self.add_image_newline(sub_image_features_hd)
+
+            # [sub features, separator, global features]
+            all_image_embeddings.extend(
+                [
+                    sub_image_features_hd_newline.squeeze(0),  # (h_crop*12*(w_crop*12+1), 4096)
+                    self.glb_GN.squeeze(0),
+                    global_image_features_hd_newline[i],
+                ]
+            )
+        image_features_proj = self.vision_projection(torch.cat(all_image_embeddings, dim=0).unsqueeze(0))[0]
+
+        return image_features_proj
+
+    def reshape_hd_patches_2x2merge(self, image_features, h_crop, w_crop):
+        """
+        image_features: (num_images*num_crops, 24*24, 1024)
+        output: (num_images, h_crop*12, w_crop*12, 4096), h_crop*w_crop == num_crops
+        """
+        N, L, C = image_features.shape
+        assert L == 24 * 24 and C == 1024 and N % (h_crop * w_crop) == 0
+        num_images = N // (h_crop * w_crop)
+        H = int(L**0.5)
+        image_features_hd = (
+            image_features.reshape(N, H, H, C)  # N, 24, 24, 1024
+            .reshape(N, H // 2, 2, H // 2, 2, C)  # N, 12, 2, 12, 2, 1024
+            .permute(0, 1, 3, 2, 4, 5)  # N, 12, 12, 2, 2, 1024
+            .reshape(N, -1, 4 * C)  # N, 144, 4096
+            .reshape(num_images, h_crop, w_crop, H // 2, H // 2, -1)  # n_img, h_crop, w_crop, 12, 12, 4096
+            .permute(0, 1, 3, 2, 4, 5)  # n_img, h_crop, 12, w_crop, 12, 4096
+            .reshape(num_images, h_crop * H // 2, w_crop * H // 2, 4 * C)  # n_img, h_crop*12, w_crop*12, 4096
+        )
+
+        return image_features_hd
+
+    def add_image_newline(self, image_features_hd):
+        """
+        image_features_hd: (num_images, h_crop*12, w_crop*12, 4096)
+        output: (num_images, (h_crop*12) * (w_crop*12+1), 4096)
+        """
+        num_images, h, w, hid_dim = image_features_hd.shape
+        # add the newline token to the HD image feature patches
+        newline_embeddings = self.sub_GN.expand(num_images, h, -1, -1)  # (n_img, h, 1, hid_dim)
+        image_features_hd_newline = torch.cat([image_features_hd, newline_embeddings], dim=2).reshape(
+            num_images, -1, hid_dim
+        )
+        return image_features_hd_newline
+
+    def get_multimodal_embeddings(
+        self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, image_sizes=None, **kwargs
+    ):
+        MAX_INPUT_ID = int(1e9)
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+
+        # positions for image tokens
+        positions = torch.nonzero((input_ids < 0) & (input_ids > -MAX_INPUT_ID), as_tuple=True)
+        has_image = len(positions[0].tolist()) > 0
+        input_ids = input_ids.clamp_min(0).clamp_max(self.config.vocab_size)
+        inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids, **kwargs))
+        if has_image:
+            vision_embeds = self.get_vision_embeddings(
+                pixel_values, input_ids=input_ids, image_sizes=image_sizes, **kwargs
+            )
+            image_features_proj = torch.from_numpy(vision_embeds)
+            inputs_embeds = inputs_embeds.index_put(positions, image_features_proj, accumulate=False)
+
+        return inputs_embeds, attention_mask, position_ids
+
+    @staticmethod
+    def preprocess_inputs(
+        text: str,
+        image: Optional[Image] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+    ):
+        if processor is None:
+            raise ValueError("Processor is required.")
+        if image is not None and "<|image_1|>" not in text:
+            text = "<|image_1|>\n" + text
+        if getattr(processor.tokenizer, "chat_template", None) is not None:
+            chat_prompt = [{"role": "user", "content": text}]
+            text = processor.tokenizer.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
+        inputs = processor(images=image, text=text, return_tensors="pt")
+        return inputs
+
+
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
     "llava_next": _OVLlavaNextForCausalLM,
     "internvl_chat": _OvInternVLForCausalLM,
     "minicpmv": _OVMiniCPMVForCausalLM,
     "llava-qwen2": _OVNanoLlavaForCausalLM,
+    "phi3_v": _OVPhi3VisionForCausalLM,
 }
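
For readers following the reshape chain in reshape_hd_patches_2x2merge, here is a minimal standalone torch sketch (not the class above) that reproduces the shape flow: 24x24 patches of width 1024 are merged 2x2 into 12x12 patches of width 4096, then the crops of each image are tiled back into an HD grid.

    import torch

    def merge_2x2(image_features: torch.Tensor, h_crop: int, w_crop: int) -> torch.Tensor:
        # image_features: (num_images * num_crops, 24*24, 1024)
        N, L, C = image_features.shape
        H = int(L ** 0.5)                       # 24
        num_images = N // (h_crop * w_crop)
        return (
            image_features.reshape(N, H, H, C)
            .reshape(N, H // 2, 2, H // 2, 2, C)
            .permute(0, 1, 3, 2, 4, 5)          # group each 2x2 block of patches together
            .reshape(N, -1, 4 * C)              # (N, 144, 4096)
            .reshape(num_images, h_crop, w_crop, H // 2, H // 2, -1)
            .permute(0, 1, 3, 2, 4, 5)          # tile the crops into one grid per image
            .reshape(num_images, h_crop * H // 2, w_crop * H // 2, 4 * C)
        )

    crops = torch.randn(2, 24 * 24, 1024)        # one image split into 2x1 crops
    print(merge_2x2(crops, h_crop=2, w_crop=1).shape)   # torch.Size([1, 24, 12, 4096])
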
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 12bb9e3e8..d9921e91e 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -1880,9 +1880,9 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
     if is_transformers_version(">=", "4.40.0"):
         SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"]
     if is_transformers_version(">=", "4.45.0"):
-        SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2"]
+        SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v"]
     TASK = "image-text-to-text"
-    REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava"]
+    REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v"]
 
     IMAGE = Image.open(
         requests.get(
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 3822b7646..394151cc3 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -109,6 +109,7 @@
     "pix2struct": "fxmarty/pix2struct-tiny-random",
     "phi": "echarlaix/tiny-random-PhiForCausalLM",
     "phi3": "Xenova/tiny-random-Phi3ForCausalLM",
+    "phi3_v": "katuni4ka/tiny-random-phi3-vision",
     "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel",
     "qwen": "katuni4ka/tiny-random-qwen",
     "qwen2": "fxmarty/tiny-dummy-qwen2",

From 040ee1270f0c38d953ee5c2a00281b1d3046ddbd Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Fri, 15 Nov 2024 10:03:08 +0400
Subject: [PATCH 45/53] fix device selection for language model compilation in
 vlm and model saving (#967)

* fix device selection for language model compilation in vlm

* add more tests

* extend tests for vlm

* update tests

* update after rebase

* disable test for old transformers

* Update tests/openvino/test_modeling.py

* fix typo

* Apply suggestions from code review

Co-authored-by: Nikita Savelyev 

* add components

* fix after rebase

* reuse test image

* Update tests/openvino/test_modeling.py

Co-authored-by: Nikita Savelyev 

---------

Co-authored-by: Nikita Savelyev 
---
 optimum/intel/openvino/modeling_base.py       |   5 +-
 .../openvino/modeling_visual_language.py      | 150 ++++++++++++------
 optimum/intel/openvino/utils.py               |   3 +
 tests/openvino/test_export.py                 |  17 +-
 tests/openvino/test_exporters_cli.py          |   3 +
 tests/openvino/test_modeling.py               | 150 +++++++++++++++---
 tests/openvino/utils_tests.py                 |   2 +
 7 files changed, 255 insertions(+), 75 deletions(-)

diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index 320d77c4c..8e936e09c 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -782,7 +782,7 @@ def __init__(
             for inputs in self.model.inputs
         }
         self.ov_config = ov_config or {**self.parent_model.ov_config}
-        self.request = None
+        self.request = None if not self.parent_model._compile_only else self.model
         self._model_name = model_name
         self.config = self.parent_model.config
         self._model_dir = Path(model_dir or parent_model._model_save_dir)
@@ -832,3 +832,6 @@ def __call__(self, *args, **kwargs):
 
     def forward(self, *args, **kwargs):
         raise NotImplementedError
+
+    def clear_requests(self):
+        self.request = None
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 35d91488d..e438d69e8 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -16,6 +16,7 @@
 from transformers import (
     AutoConfig,
     AutoImageProcessor,
+    AutoModelForCausalLM,
     GenerationConfig,
     GenerationMixin,
     PretrainedConfig,
@@ -30,7 +31,23 @@
 from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel, OVModelPart
 from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM
-from .utils import TemporaryDirectory
+from .utils import (
+    OV_LANGUAGE_MODEL_NAME,
+    OV_TEXT_EMBEDDINGS_MODEL_NAME,
+    OV_VISION_EMBEDDINGS_MODEL_NAME,
+    TemporaryDirectory,
+)
+
+
+try:
+    from transformers import LlavaForConditionalGeneration
+except ImportError:
+    LlavaForConditionalGeneration = None
+
+try:
+    from transformers import LlavaNextForConditionalGeneration
+except ImportError:
+    LlavaNextForConditionalGeneration = None
 
 
 logger = logging.getLogger(__name__)
@@ -67,13 +84,19 @@ def __init__(
     def compile(self):
         if self.request is None:
             logger.info(f"Compiling the Language model to {self._device} ...")
-            self.request = core.compile_model(self.model, self._device, self.ov_config).create_infer_request()
+            super().compile()
         self._compile_text_emb()
 
     def _compile_text_emb(self):
         if self.text_emb_request is None:
             logger.info(f"Compiling the Text embeddings model to {self._device} ...")
-            self.text_emb_request = core.compile_model(self.text_emb_model, self._device, self.ov_config)
+            if self._compile_only:
+                self.text_emb_request = self.text_emb_model
+            else:
+                logger.info(f"Compiling the Text embeddings model to {self._device} ...")
+                self.text_emb_request = self._compile_model(
+                    self.text_emb_model, self._device, self.ov_config, self.model_save_dir
+                )
 
     def clear_requests(self):
         if self._compile_only:
@@ -238,12 +261,18 @@ def forward(self, img_features):
         return self.request(img_features)[0]
 
 
-MODEL_PARTS_CLS_MAPPING = {"resampler": OVResampler, "vision_projection": OVVisionProjection}
+MODEL_PARTS_CLS_MAPPING = {
+    "resampler": OVResampler,
+    "language_model": OVModelWithEmbedForCausalLM,
+    "vision_embeddings": OVVisionEmbedding,
+    "vision_projection": OVVisionProjection,
+}
 
 
 class OVModelForVisualCausalLM(OVBaseModel, GenerationMixin):
     export_feature = "image-text-to-text"
     additional_parts = []
+    auto_model_class = AutoModelForCausalLM
 
     def __init__(
         self,
@@ -285,11 +314,11 @@ def __init__(
             self.lm_model,
             self.text_embeddings_model,
             config=config,
-            deivce=device,
+            device=device,
             ov_config=ov_config,
             model_save_dir=model_save_dir,
             quantization_config=quantization_config,
-            compile=not self._compile_only and enable_compilation,
+            compile=self._compile_only or enable_compilation,
             compile_only=self._compile_only,
         )
         self.vision_embeddings = OVVisionEmbedding(self.vision_embeddings_model, self)
@@ -315,19 +344,15 @@ def clear_requests(self):
                 "`clear_requests()` is not supported with `compile_only` mode, please intialize model without this option"
             )
 
-        self.language_model.clear_requests()
-        components = [self.vision_embeddings] + [getattr(self, part) for part in self.additional_parts]
-        for component in components:
-            if component is not None:
-                component.request = None
+        for _, component in self.components.items():
+            component.clear_requests()
 
     def compile(self):
-        self.language_model.compile()
-        self.vision_embeddings._compile()
-        for part in self.additional_parts:
-            part_model = getattr(self, part, None)
-            if part_model is not None:
-                part_model._compile()
+        for _, component in self.components.items():
+            if isinstance(component, OVModelPart):
+                component._compile()
+            else:
+                component.compile()
 
     def _save_config(self, save_directory):
         """
@@ -345,21 +370,21 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
             save_directory (`str` or `Path`):
                 The directory where to save the model files.
         """
-        src_files = [self.lm_model, self.text_embeddings_model, self.vision_embeddings_model]
-        dst_file_names = [
-            "openvino_language_model.xml",
-            "openvino_text_embeddings_model.xml",
-            "openvino_vision_embeddings_model.xml",
-        ]
-        for part in self.additional_parts:
-            model = getattr(self, f"{part}_model", None)
-            if model is not None:
-                src_files.append(model)
-                dst_file_names.append(f"openvino_{part}_model.xml")
+        src_models = self.submodels
+        dst_file_names = {
+            "lm_model": OV_LANGUAGE_MODEL_NAME,
+            "text_embeddings_model": OV_TEXT_EMBEDDINGS_MODEL_NAME,
+            "vision_embeddings_model": OV_VISION_EMBEDDINGS_MODEL_NAME,
+        }
+        for name in self._submodel_names:
+            if name not in dst_file_names:
+                dst_file_names[name] = f"openvino_{name}.xml"
 
-        for src_file, dst_file_name in zip(src_files, dst_file_names):
+        for name in self._submodel_names:
+            model = src_models[name]
+            dst_file_name = dst_file_names[name]
             dst_path = os.path.join(save_directory, dst_file_name)
-            ov.save_model(src_file, dst_path, compress_to_fp16=False)
+            ov.save_model(model, dst_path, compress_to_fp16=False)
 
         self._save_openvino_config(save_directory)
         if self.generation_config is not None:
@@ -429,14 +454,18 @@ def _from_pretrained(
             token = use_auth_token
 
         model_file_names = {
-            "language_model": "openvino_language_model.xml",
-            "text_embeddings": "openvino_text_embeddings_model.xml",
-            "vision_embeddings": "openvino_vision_embeddings_model.xml",
+            "language_model": OV_LANGUAGE_MODEL_NAME,
+            "language_model_bin": OV_LANGUAGE_MODEL_NAME.replace(".xml", ".bin"),
+            "text_embeddings": OV_TEXT_EMBEDDINGS_MODEL_NAME,
+            "text_embeddings_bin": OV_TEXT_EMBEDDINGS_MODEL_NAME.replace(".xml", ".bin"),
+            "vision_embeddings": OV_VISION_EMBEDDINGS_MODEL_NAME,
+            "vision_embeddings_bin": OV_VISION_EMBEDDINGS_MODEL_NAME.replace(".xml", ".bin"),
         }
 
         model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
         for part in model_cls.additional_parts:
             model_file_names[part] = f"openvino_{part}_model.xml"
+            model_file_names[part + "_bin"] = f"openvino_{part}_model.bin"
         compile_only = kwargs.get("compile_only", False)
         if os.path.isdir(model_id):
             # Load model from a local directory
@@ -593,6 +622,28 @@ def _from_transformers(
             **kwargs,
         )
 
+    @property
+    def _component_names(self):
+        base_components = ["language_model", "vision_embeddings"]
+        additional_components = [part for part in self.additional_parts if getattr(self, part, None) is not None]
+        return base_components + additional_components
+
+    @property
+    def components(self):
+        return {component_name: getattr(self, component_name) for component_name in self._component_names}
+
+    @property
+    def _submodel_names(self):
+        model_names = ["lm_model", "text_embeddings_model", "vision_embeddings_model"]
+        for part in self.additional_parts:
+            if getattr(self, part, None) is not None:
+                model_names.append(part + "_model")
+        return model_names
+
+    @property
+    def submodels(self):
+        return {submodel_name: getattr(self, submodel_name) for submodel_name in self._submodel_names}
+
     def reshape(self, batch_size: int, sequence_length: int):
         logger.warning("Static shapes are not supported for causal language model.")
         return self
@@ -601,17 +652,14 @@ def half(self):
         """
         Converts all the model weights to FP16 for more efficient inference on GPU.
         """
-        apply_moc_transformations(self.lm_model, cf=False)
-        compress_model_transformation(self.lm_model)
-        apply_moc_transformations(self.text_embeddings_model, cf=False)
-        compress_model_transformation(self.text_embeddings_model)
-        apply_moc_transformations(self.vision_embeddings_model, cf=False)
-        compress_model_transformation(self.vision_embeddings_model)
-        for part in self.additional_parts:
-            model = getattr(self, f"{part}_model", None)
-            if model is not None:
-                apply_moc_transformations(model, cf=False)
-                compress_model_transformation(model)
+        for _, submodel in self.submodels.items():
+            apply_moc_transformations(submodel, cf=False)
+            compress_model_transformation(submodel)
+        return self
+
+    def to(self, device):
+        self.language_model.to(device)
+        super().to(device)
         return self
 
     def forward(
@@ -625,11 +673,8 @@ def forward(
         position_ids=None,
         image_bound=None,
         tgt_sizes=None,
-        images=None,
         **kwargs,
     ):
-        if pixel_values is None and images is not None:
-            pixel_values = images
         inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings(
             input_ids,
             pixel_values,
@@ -733,7 +778,6 @@ def prepare_inputs_for_generation(
                 "image_sizes": image_sizes,
                 "image_bound": kwargs.get("image_bound"),
                 "tgt_sizes": kwargs.get("tgt_sizes"),
-                "images": kwargs.get("images"),
             }
         )
         return model_inputs
@@ -756,6 +800,8 @@ def preprocess_inputs(
 
 
 class _OVLlavaForCausalLM(OVModelForVisualCausalLM):
+    auto_model_class = LlavaForConditionalGeneration
+
     def __init__(
         self,
         language_model: ov.Model,
@@ -941,6 +987,8 @@ def preprocess_inputs(
 
 
 class _OVLlavaNextForCausalLM(_OVLlavaForCausalLM):
+    auto_model_class = LlavaNextForConditionalGeneration
+
     # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_next/modeling_llava_next.py#L655
     def pack_image_features(self, image_features, image_sizes, image_newline=None):
         from transformers.models.llava_next.modeling_llava_next import get_anyres_image_grid_shape, unpad_image
@@ -1211,7 +1259,7 @@ def get_text_embeddings(self, input_ids, **kwargs):
         return super().get_text_embeddings(for_inputs_embeds_ids, **kwargs)
 
 
-class _OvInternVLForCausalLM(OVModelForVisualCausalLM):
+class _OVInternVLForCausalLM(OVModelForVisualCausalLM):
     def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
         if input_ids is not None and input_ids.shape[1] == 1:
             return None
@@ -1822,7 +1870,7 @@ def preprocess_inputs(
         attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
         result = {"input_ids": input_ids, "attention_mask": attention_mask}
         if image is not None:
-            result["images"] = torch.unsqueeze(processor(images=image, return_tensors="pt")["pixel_values"][0], 0)
+            result["pixel_values"] = processor(images=[image], return_tensors="pt")["pixel_values"]
         return result
 
 
@@ -1979,8 +2027,8 @@ def preprocess_inputs(
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
     "llava_next": _OVLlavaNextForCausalLM,
-    "internvl_chat": _OvInternVLForCausalLM,
     "minicpmv": _OVMiniCPMVForCausalLM,
     "llava-qwen2": _OVNanoLlavaForCausalLM,
     "phi3_v": _OVPhi3VisionForCausalLM,
+    "internvl_chat": _OVInternVLForCausalLM,
 }
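
With the new components and submodels properties, downstream code can loop over every part of a vision-language model instead of naming each one by hand. A small usage sketch (the tiny checkpoint below is the one used by the tests; the printed names are illustrative):

    from optimum.intel import OVModelForVisualCausalLM

    model = OVModelForVisualCausalLM.from_pretrained("katuni4ka/tiny-random-llava-ov", compile=False)

    for name, component in model.components.items():       # runtime wrappers (language_model, vision_embeddings, ...)
        print(name, type(component).__name__)
    for name, submodel in model.submodels.items():          # raw ov.Model graphs
        print(name, submodel.get_friendly_name())

    model.half()             # compresses every submodel to FP16 in one loop
    model.compile()          # compiles every component on the current device
    model.clear_requests()   # releases all infer requests again
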
diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py
index 68458c85b..cf5060f42 100644
--- a/optimum/intel/openvino/utils.py
+++ b/optimum/intel/openvino/utils.py
@@ -42,6 +42,9 @@
 OV_ENCODER_NAME = "openvino_encoder_model.xml"
 OV_DECODER_NAME = "openvino_decoder_model.xml"
 OV_DECODER_WITH_PAST_NAME = "openvino_decoder_with_past_model.xml"
+OV_TEXT_EMBEDDINGS_MODEL_NAME = "openvino_text_embeddings_model.xml"
+OV_LANGUAGE_MODEL_NAME = "openvino_language_model.xml"
+OV_VISION_EMBEDDINGS_MODEL_NAME = "openvino_vision_embeddings_model.xml"
 
 OV_TOKENIZER_NAME = "openvino_tokenizer{}.xml"
 OV_DETOKENIZER_NAME = "openvino_detokenizer{}.xml"
diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py
index 80a020d2b..4c42f8a33 100644
--- a/tests/openvino/test_export.py
+++ b/tests/openvino/test_export.py
@@ -41,12 +41,14 @@
     OVModelForSequenceClassification,
     OVModelForSpeechSeq2Seq,
     OVModelForTokenClassification,
+    OVModelForVisualCausalLM,
     OVStableDiffusion3Pipeline,
     OVStableDiffusionPipeline,
     OVStableDiffusionXLImg2ImgPipeline,
     OVStableDiffusionXLPipeline,
 )
 from optimum.intel.openvino.modeling_base import OVBaseModel
+from optimum.intel.openvino.modeling_visual_language import MODEL_TYPE_TO_CLS_MAPPING
 from optimum.intel.openvino.utils import TemporaryDirectory
 from optimum.intel.utils.import_utils import _transformers_version, is_transformers_version
 from optimum.utils.save_utils import maybe_load_preprocessors
@@ -70,12 +72,13 @@ class ExportModelTest(unittest.TestCase):
         "stable-diffusion-xl": OVStableDiffusionXLPipeline,
         "stable-diffusion-xl-refiner": OVStableDiffusionXLImg2ImgPipeline,
         "latent-consistency": OVLatentConsistencyModelPipeline,
+        "llava": OVModelForVisualCausalLM,
     }
 
     if is_transformers_version(">=", "4.45"):
         SUPPORTED_ARCHITECTURES.update({"stable-diffusion-3": OVStableDiffusion3Pipeline, "flux": OVFluxPipeline})
 
-    GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper")
+    GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper", "llava")
 
     def _openvino_export(
         self,
@@ -93,6 +96,10 @@ def _openvino_export(
             model_class = TasksManager.get_model_class_for_task(task, library=library_name)
             model = model_class(f"hf_hub:{model_name}", pretrained=True, exportable=True)
             TasksManager.standardize_model_attributes(model_name, model, library_name=library_name)
+        elif model_type == "llava":
+            model = MODEL_TYPE_TO_CLS_MAPPING[model_type].auto_model_class.from_pretrained(
+                model_name, **loading_kwargs
+            )
         else:
             model = auto_model.auto_model_class.from_pretrained(model_name, **loading_kwargs)
 
@@ -144,8 +151,12 @@ def test_export_with_custom_gen_config(self, model_type):
         task = auto_model.export_feature
         model_name = MODEL_NAMES[model_type]
         loading_kwargs = {"attn_implementation": "eager"} if model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED else {}
-
-        model = auto_model.auto_model_class.from_pretrained(model_name, **loading_kwargs)
+        if model_type == "llava":
+            model = MODEL_TYPE_TO_CLS_MAPPING[model_type].auto_model_class.from_pretrained(
+                model_name, **loading_kwargs
+            )
+        else:
+            model = auto_model.auto_model_class.from_pretrained(model_name, **loading_kwargs)
 
         model.generation_config.top_k = 42
         model.generation_config.do_sample = True
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index be73b6815..f218fa05b 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -93,6 +93,7 @@ class OVCLIExportTestCase(unittest.TestCase):
         "stable-diffusion-xl": 4 if is_tokenizers_version("<", "0.20") else 0,
         "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") else 2,
         "flux": 4 if is_tokenizers_version("<", "0.20") else 0,
+        "llava": 2 if is_tokenizers_version("<", "0.20") else 0,
     }
 
     SUPPORTED_SD_HYBRID_ARCHITECTURES = [
@@ -244,6 +245,8 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
             elif model_type.startswith("stable-diffusion") or model_type.startswith("flux"):
                 models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder]
                 models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2)
+            elif task.startswith("image-text-to-text"):
+                models = [model.language_model, model.vision_embeddings]
             else:
                 models = [model]
 
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index d9921e91e..a9d0600e5 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+import copy
 import gc
 import os
 import tempfile
@@ -61,7 +62,7 @@
 )
 from transformers.onnx.utils import get_preprocessor
 from transformers.testing_utils import slow
-from utils_tests import MODEL_NAMES
+from utils_tests import MODEL_NAMES, TEST_IMAGE_URL
 
 from optimum.exporters.openvino.model_patcher import patch_update_causal_mask
 from optimum.intel import (
@@ -93,10 +94,14 @@
 from optimum.intel.openvino.modeling_visual_language import (
     MODEL_PARTS_CLS_MAPPING,
     MODEL_TYPE_TO_CLS_MAPPING,
-    OVModelWithEmbedForCausalLM,
-    OVVisionEmbedding,
 )
-from optimum.intel.openvino.utils import TemporaryDirectory, _print_compiled_model_properties
+from optimum.intel.openvino.utils import (
+    OV_LANGUAGE_MODEL_NAME,
+    OV_TEXT_EMBEDDINGS_MODEL_NAME,
+    OV_VISION_EMBEDDINGS_MODEL_NAME,
+    TemporaryDirectory,
+    _print_compiled_model_properties,
+)
 from optimum.intel.pipelines import pipeline as optimum_pipeline
 from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version
 from optimum.intel.utils.modeling_utils import _find_files_matching_pattern
@@ -135,6 +140,7 @@ def __init__(self, *args, **kwargs):
         self.OV_DECODER_MODEL_ID = "helenai/gpt2-ov"
         self.OV_SEQ2SEQ_MODEL_ID = "echarlaix/t5-small-openvino"
         self.OV_DIFFUSION_MODEL_ID = "hf-internal-testing/tiny-stable-diffusion-openvino"
+        self.OV_VLM_MODEL_ID = "katuni4ka/tiny-random-llava-ov"
 
     def test_load_from_hub_and_save_model(self):
         tokenizer = AutoTokenizer.from_pretrained(self.OV_MODEL_ID)
@@ -223,6 +229,76 @@ def test_load_from_hub_and_save_decoder_model(self, use_cache):
         del model
         gc.collect()
 
+    @unittest.skipIf(
+        is_transformers_version("<", "4.45"),
+        "model tokenizer exported with tokenizers 0.20 is not compatible with old transformers",
+    )
+    def test_load_from_hub_and_save_visual_language_model(self):
+        model_id = self.OV_VLM_MODEL_ID
+        processor = get_preprocessor(model_id)
+        prompt = "\n What is shown in this image?"
+        image = Image.open(
+            requests.get(
+                TEST_IMAGE_URL,
+                stream=True,
+            ).raw
+        )
+        loaded_model = OVModelForVisualCausalLM.from_pretrained(model_id)
+        self.assertIsInstance(loaded_model, MODEL_TYPE_TO_CLS_MAPPING[loaded_model.config.model_type])
+        for component_name, component in loaded_model.components.items():
+            self.assertIsInstance(component, MODEL_PARTS_CLS_MAPPING[component_name])
+        self.assertIsInstance(loaded_model.config, PretrainedConfig)
+        # Test that PERFORMANCE_HINT is set to LATENCY by default
+        self.assertEqual(loaded_model.ov_config.get("PERFORMANCE_HINT"), "LATENCY")
+
+        for component_name, component in loaded_model.components.items():
+            self.assertIsInstance(component.model, ov.Model)
+            if component_name == "language_model":
+                self.assertEqual(component.request.get_compiled_model().get_property("PERFORMANCE_HINT"), "LATENCY")
+                self.assertIsInstance(component.text_emb_model, ov.Model)
+                self.assertEqual(component.text_emb_request.get_property("PERFORMANCE_HINT"), "LATENCY")
+            else:
+                self.assertEqual(component.request.get_property("PERFORMANCE_HINT"), "LATENCY")
+
+        inputs = processor(images=image, text=prompt, return_tensors="pt")
+        set_seed(SEED)
+        loaded_model_outputs = loaded_model(**inputs)
+
+        with TemporaryDirectory() as tmpdirname:
+            loaded_model.save_pretrained(tmpdirname)
+            folder_contents = os.listdir(tmpdirname)
+            model_files = [
+                OV_LANGUAGE_MODEL_NAME,
+                OV_TEXT_EMBEDDINGS_MODEL_NAME,
+                OV_VISION_EMBEDDINGS_MODEL_NAME,
+            ]
+            model_files += [f"openvino_{part}_model.xml" for part in loaded_model.additional_parts]
+            for xml_file_name in model_files:
+                self.assertTrue(xml_file_name in folder_contents)
+                self.assertTrue(xml_file_name.replace(".xml", ".bin") in folder_contents)
+            model = OVModelForVisualCausalLM.from_pretrained(tmpdirname)
+            compile_only_model = OVModelForVisualCausalLM.from_pretrained(tmpdirname, compile_only=True)
+            for _, submodel in compile_only_model.submodels.items():
+                self.assertIsInstance(submodel, ov.runtime.CompiledModel)
+            for component_name, component in compile_only_model.components.items():
+                self.assertIsInstance(component.model, ov.runtime.CompiledModel)
+                if component_name == "language_model":
+                    self.assertIsInstance(component.request, ov.runtime.InferRequest)
+                    self.assertIsInstance(component.text_emb_model, ov.runtime.CompiledModel)
+                    self.assertIsInstance(component.text_emb_request, ov.runtime.CompiledModel)
+                else:
+                    self.assertIsInstance(component.request, ov.runtime.CompiledModel)
+
+            outputs = compile_only_model(**inputs)
+            self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits))
+            del compile_only_model
+
+        outputs = model(**inputs)
+        self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits))
+        del loaded_model
+        del model
+        gc.collect()
+
     def test_load_from_hub_and_save_seq2seq_model(self):
         tokenizer = AutoTokenizer.from_pretrained(self.OV_SEQ2SEQ_MODEL_ID)
         tokens = tokenizer("This is a sample input", return_tensors="pt")
@@ -1332,7 +1408,7 @@ def test_compare_to_transformers(self, model_arch):
         set_seed(SEED)
         transformers_model = AutoModelForImageClassification.from_pretrained(model_id)
         preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         inputs = preprocessor(images=image, return_tensors="pt")
         with torch.no_grad():
@@ -1358,7 +1434,7 @@ def test_pipeline(self, model_arch):
         model.eval()
         preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
         pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor)
-        inputs = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        inputs = TEST_IMAGE_URL
         outputs = pipe(inputs)
         self.assertEqual(pipe.device, model.device)
         self.assertGreaterEqual(outputs[0]["score"], 0.0)
@@ -1379,7 +1455,7 @@ def test_compare_to_timm(self, model_id):
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         timm_model = timm.create_model(model_id, pretrained=True)
         preprocessor = TimmImageProcessor.from_pretrained(model_id)
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         inputs = preprocessor(images=image, return_tensors="pt")
         with torch.no_grad():
@@ -1886,7 +1962,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
 
     IMAGE = Image.open(
         requests.get(
-            "http://images.cocodataset.org/val2017/000000039769.jpg",
+            TEST_IMAGE_URL,
             stream=True,
         ).raw
     )
@@ -1902,6 +1978,15 @@ def get_transformer_model_class(self, model_arch):
             return LlavaNextForConditionalGeneration
         return AutoModelForCausalLM
 
+    def _check_device_and_request(self, ov_model, expected_device, has_request):
+        request_check_fn = self.assertFalse if has_request else self.assertTrue
+        self.assertEqual(ov_model._device, expected_device)
+        for component_name, component in ov_model.components.items():
+            if component_name == "language_model":
+                request_check_fn(component.text_emb_request is None)
+            self.assertEqual(component._device, expected_device)
+            request_check_fn(component.request is None)
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
         prompt = "What is shown in this image?"
@@ -1922,23 +2007,35 @@ def test_compare_to_transformers(self, model_arch):
         preprocessors = self.get_preprocessors(model_arch)
         set_seed(SEED)
         ov_model = OVModelForVisualCausalLM.from_pretrained(
-            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS, compile=False
         )
         self.assertIsInstance(ov_model, MODEL_TYPE_TO_CLS_MAPPING[ov_model.config.model_type])
-        self.assertIsInstance(ov_model.vision_embeddings, OVVisionEmbedding)
-        self.assertIsInstance(ov_model.language_model, OVModelWithEmbedForCausalLM)
-        for additional_part in ov_model.additional_parts:
-            self.assertTrue(hasattr(ov_model, additional_part))
-            self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part])
+        for component_name, component in ov_model.components.items():
+            self.assertIsInstance(component, MODEL_PARTS_CLS_MAPPING[component_name])
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         inputs = ov_model.preprocess_inputs(**preprocessors, text=prompt, image=self.IMAGE.resize((600, 600)))
-        # pytorch minicpmv and internvl are not designed to be used via forward
+        transformers_inputs = copy.deepcopy(inputs)
+        test_device = "AUTO"
+        ov_model.to(test_device)
+        self._check_device_and_request(ov_model, test_device, False)
+        test_device = "CPU"
+        ov_model.to(test_device)
+        ov_model.compile()
+        self._check_device_and_request(ov_model, test_device, True)
+        ov_model.clear_requests()
+        self._check_device_and_request(ov_model, test_device, False)
+
+        # nanollava pixel_values input named as images
+        if model_arch == "nanollava":
+            pixel_values = transformers_inputs.pop("pixel_values", None)
+            transformers_inputs["images"] = pixel_values
+        # pytorch minicpmv is not designed to be used via forward
         if model_arch not in ["minicpmv", "internvl2"]:
             set_seed(SEED)
             ov_outputs = ov_model(**inputs)
             set_seed(SEED)
             with torch.no_grad():
-                transformers_outputs = transformers_model(**inputs)
+                transformers_outputs = transformers_model(**transformers_inputs)
             self.assertTrue(
                 torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4),
                 f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}",
@@ -1958,7 +2055,7 @@ def test_compare_to_transformers(self, model_arch):
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         set_seed(SEED)
         with torch.no_grad():
-            transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
+            transformers_outputs = transformers_model.generate(**transformers_inputs, generation_config=gen_config)
 
         # original minicpmv and internvl always skip input tokens in generation results, while the transformers-based approach provides them
         if model_arch in ["minicpmv", "internvl2"]:
@@ -2079,6 +2176,19 @@ def get_preprocessors(self, model_arch):
             preprocessors = {"processor": processor, "tokenizer": None}
         return preprocessors
 
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    def test_model_can_be_loaded_after_saving(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+        with TemporaryDirectory() as save_dir:
+            ov_model = OVModelForVisualCausalLM.from_pretrained(
+                model_id, compile=False, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            ov_model.save_pretrained(save_dir)
+            ov_restored_model = OVModelForVisualCausalLM.from_pretrained(
+                save_dir, compile=False, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            self.assertIsInstance(ov_restored_model, type(ov_model))
+
 
 class OVModelForSpeechSeq2SeqIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = ("whisper",)
@@ -2156,7 +2266,7 @@ class OVModelForVision2SeqIntegrationTest(unittest.TestCase):
     SPEEDUP_CACHE = 1.1
 
     def _get_sample_image(self):
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         return image
 
@@ -2263,7 +2373,7 @@ class OVModelForCustomTasksIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES_WITH_HIDDEN_STATES = ["vit-with-hidden-states"]
 
     def _get_sample_image(self):
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         return image
 
@@ -2347,7 +2457,7 @@ class OVModelForOpenCLIPZeroShortImageClassificationTest(unittest.TestCase):
     OV_MODEL_ID_IR = MODEL_NAMES["open-clip-ov"]
 
     def _get_sample_image(self):
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         return image
 
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 394151cc3..dde7bafd3 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -188,6 +188,8 @@
     "nanollava": (30, 30, 2),
 }
 
+TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"
+
 
 def get_num_quantized_nodes(model):
     num_fake_quantize = 0

From ef558f9e28e2464000be4ab7d7b88986b7f84d54 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev 
Date: Sat, 16 Nov 2024 06:06:53 +0100
Subject: [PATCH 46/53] Add compression tests to internvl2 and phi3v (#999)

* Fix NanoLLava quantization

* Add internvl2 compression tests

* Revert "Fix NanoLLava quantization"

This reverts commit 3eba1de92dd29b9d259dfd31b8e5bb3b3dd74c29.

* Add phi3 compression tests; fix phi3 preprocessors saving with optimum-cli quantization

* Trigger Tests

* Trigger Tests

* Trigger Tests
---
 optimum/commands/export/openvino.py   |  9 ++--
 optimum/exporters/openvino/convert.py | 37 ++++++++++------
 tests/openvino/test_exporters_cli.py  | 20 +++++++++
 tests/openvino/test_quantization.py   | 64 ++++++++++++++++++++-------
 4 files changed, 96 insertions(+), 34 deletions(-)

diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 32c8c0dc1..3b6b4de69 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -21,9 +21,10 @@
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 
 from ...exporters import TasksManager
+from ...exporters.openvino.convert import save_preprocessors
 from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available
 from ...intel.utils.modeling_utils import _infer_library_from_model_name_or_path
-from ...utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors
+from ...utils.save_utils import maybe_load_preprocessors
 from ..base import BaseOptimumCLICommand, CommandInfo
 
 
@@ -350,11 +351,9 @@ def run(self):
             )
             model.save_pretrained(self.args.output)
 
-            maybe_save_preprocessors(self.args.model, self.args.output, trust_remote_code=self.args.trust_remote_code)
+            preprocessors = maybe_load_preprocessors(self.args.model, trust_remote_code=self.args.trust_remote_code)
+            save_preprocessors(preprocessors, model.config, self.args.output, self.args.trust_remote_code)
             if not self.args.disable_convert_tokenizer:
-                preprocessors = maybe_load_preprocessors(
-                    self.args.model, trust_remote_code=self.args.trust_remote_code
-                )
                 maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors, task=task)
         else:
             # TODO : add input shapes
diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index a84ecfabd..fdcfbecf5 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -21,6 +21,7 @@
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import onnx
+from transformers import PretrainedConfig
 from transformers.generation import GenerationMixin
 from transformers.utils import is_tf_available, is_torch_available
 
@@ -711,19 +712,7 @@ def export_from_model(
                     f"The generation config will not be saved, saving failed with following error:\n{exception}"
                 )
 
-        model_name_or_path = model.config._name_or_path
-        if preprocessors is not None:
-            # phi3-vision processor does not have chat_template attribute that breaks Processor saving on disk
-            if is_transformers_version(">=", "4.45") and model_type == "phi3-v" and len(preprocessors) > 1:
-                if not hasattr(preprocessors[1], "chat_template"):
-                    preprocessors[1].chat_template = getattr(preprocessors[0], "chat_template", None)
-            for processor in preprocessors:
-                try:
-                    processor.save_pretrained(output)
-                except Exception as ex:
-                    logger.error(f"Saving {type(processor)} failed with {ex}")
-        else:
-            maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code)
+        save_preprocessors(preprocessors, model.config, output, trust_remote_code)
 
         files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
 
@@ -838,6 +827,28 @@ def export_tokenizer(
         save_model(model, output / file_name.format(suffix))
 
 
+def save_preprocessors(
+    preprocessors: List, config: PretrainedConfig, output: Union[str, Path], trust_remote_code: bool
+):
+    model_name_or_path = config._name_or_path
+    if hasattr(config, "export_model_type"):
+        model_type = config.export_model_type.replace("_", "-")
+    else:
+        model_type = config.model_type.replace("_", "-")
+    if preprocessors is not None:
+        # phi3-vision processor does not have chat_template attribute that breaks Processor saving on disk
+        if is_transformers_version(">=", "4.45") and model_type == "phi3-v" and len(preprocessors) > 1:
+            if not hasattr(preprocessors[1], "chat_template"):
+                preprocessors[1].chat_template = getattr(preprocessors[0], "chat_template", None)
+        for processor in preprocessors:
+            try:
+                processor.save_pretrained(output)
+            except Exception as ex:
+                logger.error(f"Saving {type(processor)} failed with {ex}")
+    else:
+        maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code)
+
+
 def _add_runtime_options_to_rt_info(model: Model, options: Dict):
     """
     Add runtime options
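
The refactor lets the optimum-cli path shown earlier and the export path share one helper. A sketch of the intended call pattern (the checkpoint and output directory are illustrative):

    from transformers import AutoConfig
    from optimum.exporters.openvino.convert import save_preprocessors
    from optimum.utils.save_utils import maybe_load_preprocessors

    model_id = "microsoft/Phi-3-vision-128k-instruct"   # illustrative checkpoint
    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

    preprocessors = maybe_load_preprocessors(model_id, trust_remote_code=True)
    # applies the phi3-v chat_template workaround, then saves every preprocessor to disk
    save_preprocessors(preprocessors, config, "ov_phi3_v", trust_remote_code=True)
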
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index f218fa05b..783b994c1 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -152,6 +152,26 @@ class OVCLIExportTestCase(unittest.TestCase):
             ]
         )
 
+    if is_transformers_version(">=", "4.45.0"):
+        TEST_4BIT_CONFIGURATIONS.extend(
+            [
+                (
+                    "image-text-to-text",
+                    "internvl2",
+                    'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "hessian_input_activation" '
+                    "--dataset contextual --num-samples 1 --trust-remote-code",
+                    {"int8": 6, "int4": 24},
+                ),
+                (
+                    "image-text-to-text",
+                    "phi3_v",
+                    'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" '
+                    "--dataset contextual --num-samples 1 --trust-remote-code",
+                    {"int8": 4, "int4": 14},
+                ),
+            ]
+        )
+
     def _openvino_export(self, model_name: str, task: str):
         with TemporaryDirectory() as tmpdir:
             main_export(
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 7a415c3a3..48a36f604 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -347,23 +347,55 @@ class OVWeightCompressionTest(unittest.TestCase):
         )
 
     if is_transformers_version(">=", "4.45.0"):
-        LOAD_IN_4_BITS_SCOPE.append(
-            (
-                OVModelForVisualCausalLM,
-                "minicpmv",
-                True,
-                dict(
-                    bits=4,
-                    group_size=16,
-                    dataset="contextual",
-                    ratio=0.8,
-                    sensitivity_metric="mean_activation_magnitude",
-                    num_samples=1,
-                    processor=MODEL_NAMES["minicpmv"],
-                    trust_remote_code=True,
+        LOAD_IN_4_BITS_SCOPE.extend(
+            [
+                (
+                    OVModelForVisualCausalLM,
+                    "minicpmv",
+                    True,
+                    dict(
+                        bits=4,
+                        group_size=16,
+                        dataset="contextual",
+                        ratio=0.8,
+                        sensitivity_metric="mean_activation_magnitude",
+                        num_samples=1,
+                        processor=MODEL_NAMES["minicpmv"],
+                        trust_remote_code=True,
+                    ),
+                    {"int4": 22, "int8": 8},
                 ),
-                {"int4": 22, "int8": 8},
-            )
+                (
+                    OVModelForVisualCausalLM,
+                    "internvl2",
+                    True,
+                    dict(
+                        bits=4,
+                        group_size=4,
+                        dataset="contextual",
+                        ratio=0.8,
+                        sensitivity_metric="mean_activation_magnitude",
+                        num_samples=1,
+                        trust_remote_code=True,
+                    ),
+                    {"int4": 22, "int8": 8},
+                ),
+                (
+                    OVModelForVisualCausalLM,
+                    "phi3_v",
+                    True,
+                    dict(
+                        bits=4,
+                        group_size=16,
+                        dataset="contextual",
+                        ratio=0.8,
+                        sensitivity_metric="mean_activation_magnitude",
+                        num_samples=1,
+                        trust_remote_code=True,
+                    ),
+                    {"int4": 14, "int8": 4},
+                ),
+            ]
         )
 
     SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION = [

From 47089ba751c3274d529a27c9797eaa6010a08486 Mon Sep 17 00:00:00 2001
From: Emmanuel Ferdman 
Date: Mon, 18 Nov 2024 08:25:54 +0200
Subject: [PATCH 47/53] Update `run_ocr_post_training.py` reference (#974)

Signed-off-by: Emmanuel Ferdman 
---
 .../neural_compressor/optical-character-recognition/README.md   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/neural_compressor/optical-character-recognition/README.md b/examples/neural_compressor/optical-character-recognition/README.md
index f701badfc..7a80ed035 100644
--- a/examples/neural_compressor/optical-character-recognition/README.md
+++ b/examples/neural_compressor/optical-character-recognition/README.md
@@ -16,7 +16,7 @@ limitations under the License.
 
 # Optical Character Recognition
 
-The script [`run_ocr.py`](https://github.com/huggingface/optimum-intel/blob/main/examples/neural_compressor/optical-character-recognition/run_ocr.py)
+The script [`run_ocr_post_training.py`](https://github.com/huggingface/optimum-intel/blob/main/examples/neural_compressor/optical-character-recognition/run_ocr_post_training.py)
 allows us to apply different quantization approaches (such as dynamic and static quantization) 
 using the [Intel Neural Compressor ](https://github.com/intel/neural-compressor) library for optical character recognition tasks and [IAM](https://fki.tic.heia-fr.ch/databases/iam-handwriting-database) datasets.
 

From 7636733fe67898dbb18c357d9d08144864af466e Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Mon, 18 Nov 2024 12:50:36 +0400
Subject: [PATCH 48/53] align minicpm preprocessing with original model inputs,
 make internvl preproc static (#1003)

---
 .../intel/openvino/modeling_visual_language.py  | 17 +++++++++++++----
 optimum/intel/openvino/quantization.py          |  2 +-
 tests/openvino/test_modeling.py                 |  3 ++-
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index e438d69e8..a1b531a1f 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -793,6 +793,7 @@ def preprocess_inputs(
         image: Optional[Image] = None,
         processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
     ):
         """
         Preprocess input instruction and an image.
@@ -969,6 +970,7 @@ def preprocess_inputs(
         image: Optional[Image] = None,
         processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
     ):
         if processor is None:
             raise ValueError("Processor is required.")
@@ -1282,12 +1284,13 @@ def merge_vision_text_embeddings(
         input_embeds = input_embeds.reshape(B, N, C)
         return input_embeds, attention_mask, position_ids
 
+    @staticmethod
     def preprocess_inputs(
-        self,
         text: str,
         image: Optional[Image] = None,
         processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
     ):
         if tokenizer is None:
             raise ValueError("Tokenizer is required.")
@@ -1379,13 +1382,15 @@ def load_image(image, input_size=448, max_num=12):
             return pixel_values
 
         if image is not None:
+            if config is None:
+                raise ValueError("Config is required.")
             if "" not in text:
                 text = "\n" + text
-            pixel_values = load_image(image, input_size=self.config.vision_config.image_size)
+            pixel_values = load_image(image, input_size=config.vision_config.image_size)
             num_patches = pixel_values.shape[0]
             num_image_token = int(
-                (self.config.vision_config.image_size // self.config.vision_config.patch_size) ** 2
-                * (self.config.downsample_ratio**2)
+                (config.vision_config.image_size // config.vision_config.patch_size) ** 2
+                * (config.downsample_ratio**2)
             )
             image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_image_token * num_patches + IMG_END_TOKEN
             text = text.replace("", image_tokens, 1)
@@ -1660,6 +1665,7 @@ def preprocess_inputs(
         image: Optional[Image] = None,
         processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
     ):
         if processor is None:
             raise ValueError("Processor is required.")
@@ -1673,6 +1679,7 @@ def preprocess_inputs(
                 else text
             )
         inputs = processor([prompt], [image], return_tensors="pt")
+        inputs.pop("image_sizes", None)
         return inputs
 
 
@@ -1853,6 +1860,7 @@ def preprocess_inputs(
         image: Optional[Image] = None,
         processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
     ):
         if tokenizer is None:
             raise ValueError("Tokenizer is required.")
@@ -2012,6 +2020,7 @@ def preprocess_inputs(
         image: Optional[Image] = None,
         processor: Optional[AutoImageProcessor] = None,
         tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
     ):
         if processor is None:
             raise ValueError("Processor is required.")
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 899153626..92cd5e79b 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -785,7 +785,7 @@ def _prepare_visual_causal_lm_dataset(self, config: OVWeightQuantizationConfig):
 
             try:
                 inputs = self.model.preprocess_inputs(
-                    text=instruction, image=image, processor=processor, tokenizer=tokenizer
+                    text=instruction, image=image, processor=processor, tokenizer=tokenizer, config=self.model.config
                 )
             except ValueError as value_error:
                 if "Tokenizer is required." in str(value_error) and tokenizer_error is not None:
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index a9d0600e5..23b49efb3 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -2165,10 +2165,11 @@ def get_preprocessors(self, model_arch):
             )
             preprocessors = {"processor": processor, "tokenizer": tokenizer}
         elif model_arch == "internvl2":
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
             tokenizer = AutoTokenizer.from_pretrained(
                 model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
             )
-            preprocessors = {"processor": None, "tokenizer": tokenizer}
+            preprocessors = {"processor": None, "tokenizer": tokenizer, "config": config}
         else:
             processor = AutoProcessor.from_pretrained(
                 model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS

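With this change, `preprocess_inputs` becomes a static method, so the model configuration has to be passed in explicitly rather than read from `self.config`, as the updated call in quantization.py shows. A minimal sketch of the new calling convention; the checkpoint id is a hypothetical InternVL2 model exported to OpenVINO, and only the keyword arguments visible in the diff are used:

    from transformers import AutoConfig, AutoTokenizer
    from optimum.intel import OVModelForVisualCausalLM

    # Hypothetical model id; any InternVL2 checkpoint convertible to OpenVINO would do.
    model_id = "OpenGVLab/InternVL2-1B"

    model = OVModelForVisualCausalLM.from_pretrained(model_id, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

    # The method is static now, so the config is an explicit argument.
    inputs = model.preprocess_inputs(
        text="Describe the image.",
        image=None,  # pass a PIL.Image to exercise the vision branch
        tokenizer=tokenizer,
        config=config,
    )
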
From c7d6227f87ed6b6b5a1a7f9cc53a4b75fa412aec Mon Sep 17 00:00:00 2001
From: Nikita Savelyev 
Date: Mon, 18 Nov 2024 11:19:04 +0100
Subject: [PATCH 49/53] Compress VLM model components to int8_sym instead of
 int8_asym (#1002)

* Compress VLM model components to int8_sym instead of int8_asym

* Tweak references

* Update reference values
---
 optimum/intel/openvino/quantization.py | 2 +-
 tests/openvino/utils_tests.py          | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 92cd5e79b..1b36c98b4 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -437,7 +437,7 @@ def _quantize_ovbasemodel(
                     sub_model_names = ["vision_embeddings", "text_embeddings"] + self.model.additional_parts
                     sub_models = [getattr(self.model, f"{name}_model") for name in sub_model_names]
                     for sub_model in sub_models:
-                        _weight_only_quantization(sub_model, OVWeightQuantizationConfig(bits=8, sym=False))
+                        _weight_only_quantization(sub_model, OVWeightQuantizationConfig(bits=8, sym=True))
                     self.model.clear_requests()
                 else:
                     _weight_only_quantization(self.model.model, quantization_config, calibration_dataset)
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index dde7bafd3..b646b5b52 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -182,10 +182,10 @@
     "open-clip": (20, 28),
     "stable-diffusion-3": (66, 42, 58, 30),
     "flux": (56, 24, 28, 64),
-    "llava": (30, 18, 2),
-    "llava_next": (30, 18, 2),
-    "minicpmv": (30, 52, 2, 12),
-    "nanollava": (30, 30, 2),
+    "llava": (30, 9, 1),
+    "llava_next": (30, 9, 1),
+    "minicpmv": (30, 26, 1, 6),
+    "nanollava": (30, 15, 1),
 }
 
 TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"

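This commit switches the 8-bit weight-only compression applied to the vision and text embedding submodels of VLM pipelines from asymmetric to symmetric quantization, which is why the int8 reference counts in utils_tests.py roughly halve. The configuration now constructed internally is the same public class exposed by optimum-intel:

    from optimum.intel import OVWeightQuantizationConfig

    # Applied to each VLM submodel (vision_embeddings, text_embeddings, ...);
    # previously this was OVWeightQuantizationConfig(bits=8, sym=False), i.e. int8_asym.
    int8_sym_config = OVWeightQuantizationConfig(bits=8, sym=True)
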
From e3031f058fff4763a9fd917464e26aab9994449f Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Mon, 18 Nov 2024 12:23:35 +0100
Subject: [PATCH 50/53] [`fix`] Extend when a model repository/directory
 already has an exported OV model (#1000)

* Also accept e.g. "openvino_model_qint8_quantized.xml"

* Add test case

* Add missing subfolder call to test
---
 optimum/intel/openvino/modeling_base.py |  2 +-
 tests/openvino/test_modeling.py         | 24 +++++++++++++++++++++---
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index 8e936e09c..4c91169bb 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -444,7 +444,7 @@ def from_pretrained(
 
             ov_files = _find_files_matching_pattern(
                 model_dir,
-                pattern=r"(.*)?openvino(.*)?\_model.xml$",
+                pattern=r"(.*)?openvino(.*)?\_model(.*)?.xml$",
                 subfolder=subfolder,
                 use_auth_token=token,
                 revision=revision,
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 23b49efb3..f7f677bf8 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -431,7 +431,7 @@ def test_infer_export_when_loading(self):
 
     def test_find_files_matching_pattern(self):
         model_id = "echarlaix/tiny-random-PhiForCausalLM"
-        pattern = r"(.*)?openvino(.*)?\_model.xml$"
+        pattern = r"(.*)?openvino(.*)?\_model(.*)?.xml$"
         # hub model
         for revision in ("main", "ov", "itrex"):
             ov_files = _find_files_matching_pattern(
@@ -452,7 +452,7 @@ def test_find_files_matching_pattern(self):
 
     @parameterized.expand(("stable-diffusion", "stable-diffusion-openvino"))
     def test_find_files_matching_pattern_sd(self, model_arch):
-        pattern = r"(.*)?openvino(.*)?\_model.xml$"
+        pattern = r"(.*)?openvino(.*)?\_model(.*)?.xml$"
         model_id = MODEL_NAMES[model_arch]
         # hub model
         ov_files = _find_files_matching_pattern(model_id, pattern=pattern)
@@ -470,7 +470,7 @@ def test_find_files_matching_pattern_sd(self, model_arch):
     def test_find_files_matching_pattern_with_config_in_root(self, subfolder):
         # Notably, the model has a config.json file in the root directory and not in the subfolder
         model_id = "sentence-transformers-testing/stsb-bert-tiny-openvino"
-        pattern = r"(.*)?openvino(.*)?\_model.xml$"
+        pattern = r"(.*)?openvino(.*)?\_model(.*)?.xml$"
         # hub model
         ov_files = _find_files_matching_pattern(model_id, pattern=pattern, subfolder=subfolder)
         self.assertTrue(len(ov_files) == 1 if subfolder == "openvino" else len(ov_files) == 0)
@@ -483,6 +483,24 @@ def test_find_files_matching_pattern_with_config_in_root(self, subfolder):
             ov_files = _find_files_matching_pattern(local_dir, pattern=pattern, subfolder=subfolder)
             self.assertTrue(len(ov_files) == 1 if subfolder == "openvino" else len(ov_files) == 0)
 
+    def test_find_files_matching_pattern_with_quantized_ov_model(self):
+        # This model only has "openvino/openvino_model_qint8_quantized.xml" and "openvino/openvino_model_qint8_quantized.bin"
+        # We want to ensure that this model is found, so the `export` isn't forced to True
+        model_id = "sentence-transformers-testing/stsb-bert-tiny-openvino-quantized-only"
+        subfolder = "openvino"
+        pattern = r"(.*)?openvino(.*)?\_model(.*)?.xml$"
+        # hub model
+        ov_files = _find_files_matching_pattern(model_id, pattern=pattern, subfolder=subfolder)
+        self.assertTrue(len(ov_files) == 1)
+
+        # local model
+        api = HfApi()
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            local_dir = Path(tmpdirname) / "model"
+            api.snapshot_download(repo_id=model_id, local_dir=local_dir)
+            ov_files = _find_files_matching_pattern(local_dir, pattern=pattern, subfolder=subfolder)
+            self.assertTrue(len(ov_files) == 1)
+
 
 class PipelineTest(unittest.TestCase):
     def test_load_model_from_hub(self):

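The relaxed pattern now tolerates a suffix between "_model" and ".xml", so quantized exports are detected and `export` is not forced back to True. A quick check of both patterns against the filenames mentioned in the commit:

    import re

    old_pattern = r"(.*)?openvino(.*)?\_model.xml$"
    new_pattern = r"(.*)?openvino(.*)?\_model(.*)?.xml$"

    for name in ("openvino_model.xml", "openvino_model_qint8_quantized.xml"):
        print(name, bool(re.search(old_pattern, name)), bool(re.search(new_pattern, name)))
    # openvino_model.xml True True
    # openvino_model_qint8_quantized.xml False True
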
From f75559025fc50f0ba719df613167a887698c7282 Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova 
Date: Wed, 20 Nov 2024 18:33:57 +0400
Subject: [PATCH 51/53] fix backward compatibility for case loading
 preconverted sd without saved safety checker (#1004)

---
 optimum/intel/openvino/modeling_diffusion.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 7b3d1c0f4..3ce1cc73f 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -429,6 +429,14 @@ def _from_pretrained(
                 # Check if the module is in a subdirectory
                 if (model_save_path / name).is_dir():
                     submodels[name] = load_method(model_save_path / name)
+                # For backward compatibility with models exported using previous optimum version, where safety_checker saving was disabled
+                elif name == "safety_checker":
+                    logger.warning(
+                        "Pipeline config contains `safety_checker` subcomponent, while `safety_checker` is not available in model directory. "
+                        "`safety_checker` will be disabled. If you want to enable it please set it explicitly to `from_pretrained` method "
+                        "or reexport model with new optimum-intel version"
+                    )
+                    submodels[name] = None
                 else:
                     submodels[name] = load_method(model_save_path)
 

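With this fix, a pipeline whose model_index.json lists a `safety_checker` that was never saved to disk loads again: the component is set to None with a warning instead of failing. A hedged usage sketch; the repository id is hypothetical and stands for any Stable Diffusion model exported with an older optimum-intel version:

    from optimum.intel import OVStableDiffusionPipeline

    # Config lists `safety_checker`, but the subfolder is absent from the export.
    pipe = OVStableDiffusionPipeline.from_pretrained("some-user/sd-ov-no-safety-checker")
    # A warning is emitted, pipe.safety_checker is None, and generation still works.
    # To keep the checker, pass it explicitly to from_pretrained (per the warning),
    # or re-export the model with a newer optimum-intel version.
    image = pipe("a photo of an astronaut riding a horse").images[0]
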
From 595246df6305759ac6310e3ede8db5356d7877e2 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev 
Date: Thu, 21 Nov 2024 10:13:42 +0100
Subject: [PATCH 52/53] Support NNCF 2.14 and OV 2024.5 (#997)

* Tweak reference values

* Update requirements to OV 2024.5

* Update tokenizers condition in tests

* Fix condition

* Trigger Tests

* Trigger Tests
---
 setup.py                             |  2 +-
 tests/openvino/test_exporters_cli.py | 17 +++++++++--------
 tests/openvino/test_quantization.py  |  4 ++--
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/setup.py b/setup.py
index 7ef3652f8..0c4bbc154 100644
--- a/setup.py
+++ b/setup.py
@@ -62,7 +62,7 @@
 
 EXTRAS_REQUIRE = {
     "nncf": ["nncf>=2.11.0"],
-    "openvino": ["nncf>=2.11.0", "openvino==2024.4.1.dev20240926", "openvino-tokenizers==2024.4.1.0.dev20240926"],
+    "openvino": ["nncf>=2.11.0", "openvino==2024.5.0", "openvino-tokenizers==2024.5.0"],
     "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
     "ipex": ["intel-extension-for-pytorch", "transformers>=4.39,<4.45"],
     "diffusers": ["diffusers"],
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 783b994c1..67511bb84 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -50,6 +50,7 @@
 from optimum.intel.utils.import_utils import (
     compare_versions,
     is_openvino_tokenizers_available,
+    is_openvino_version,
     is_tokenizers_version,
     is_transformers_version,
 )
@@ -80,20 +81,20 @@ class OVCLIExportTestCase(unittest.TestCase):
     if is_transformers_version(">=", "4.45"):
         SUPPORTED_ARCHITECTURES.extend([("text-to-image", "stable-diffusion-3"), ("text-to-image", "flux")])
     EXPECTED_NUMBER_OF_TOKENIZER_MODELS = {
-        "gpt2": 2 if is_tokenizers_version("<", "0.20") else 0,
+        "gpt2": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
         "t5": 0,  # no .model file in the repository
         "albert": 0,  # not supported yet
         "distilbert": 1,  # no detokenizer
-        "roberta": 2 if is_tokenizers_version("<", "0.20") else 0,
+        "roberta": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
         "vit": 0,  # no tokenizer for image model
         "wav2vec2": 0,  # no tokenizer
         "bert": 1,  # no detokenizer
-        "blenderbot": 2 if is_tokenizers_version("<", "0.20") else 0,
-        "stable-diffusion": 2 if is_tokenizers_version("<", "0.20") else 0,
-        "stable-diffusion-xl": 4 if is_tokenizers_version("<", "0.20") else 0,
-        "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") else 2,
-        "flux": 4 if is_tokenizers_version("<", "0.20") else 0,
-        "llava": 2 if is_tokenizers_version("<", "0.20") else 0,
+        "blenderbot": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
+        "stable-diffusion": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
+        "stable-diffusion-xl": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
+        "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 2,
+        "flux": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
+        "llava": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
     }
 
     SUPPORTED_SD_HYBRID_ARCHITECTURES = [
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 48a36f604..2869acf83 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -258,7 +258,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 sensitivity_metric="mean_activation_magnitude",
                 dataset=["one two, " * i for i in range(10)],
             ),
-            {"int4": 25, "int8": 14},
+            {"int4": 24, "int8": 16},
         ),
         (
             OVModelForCausalLM,
@@ -915,7 +915,7 @@ def preprocess_function(examples, tokenizer):
 
 
 class OVTrainerTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("albert", 64, 39),)
+    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("albert", 63, 39),)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
     @unittest.skipIf(

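The updated tokenizer expectations combine two version guards, using the helpers imported at the top of the test diff. A minimal sketch of the same check outside the test class:

    from optimum.intel.utils.import_utils import is_openvino_version, is_tokenizers_version

    # Tokenizer/detokenizer models are expected whenever tokenizers < 0.20
    # or OpenVINO >= 2024.5 is installed, mirroring the updated test table.
    expect_ov_tokenizers = is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5")
    expected_gpt2_models = 2 if expect_ov_tokenizers else 0
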
From 080180bf175c1c395015ba1a1404f49b9e8808b7 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Thu, 21 Nov 2024 16:21:05 +0100
Subject: [PATCH 53/53] Bump python documentation dockerfile base image (#1006)

---
 docs/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/Dockerfile b/docs/Dockerfile
index 4acde4e65..40142a2c0 100644
--- a/docs/Dockerfile
+++ b/docs/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.8
+FROM python:3.9
 
 ARG commit_sha
 ARG clone_url