[OV]: Fixed inference after 4 bit weight compression (huggingface#569)
* [OV]: Fixed inference after 4 bit weight compression

* Fixed issue

* Update optimum/intel/openvino/modeling_decoder.py

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>

* Applied comments

* Fixed issue when request is None

---------

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
2 people authored and PenghuiCheng committed Mar 13, 2024
1 parent 416b528 commit d62964a
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions optimum/intel/openvino/modeling_decoder.py
@@ -419,7 +419,8 @@ def prepare_inputs(
             # past_key_values are not used explicitly, instead they are handled inside the model
             if past_key_values is None:
                 # This is the first iteration in a sequence, reset all states
-                self.request.reset_state()
+                if self.request is not None:
+                    self.request.reset_state()
                 # Set initial value for the next beam_idx input that will be used at the current iteration
                 # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used
                 self.next_beam_idx = np.arange(batch_size, dtype=int)
@@ -592,7 +593,10 @@ def _from_pretrained(
         else:
             init_cls = cls

-        causal_model = init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs)
+        enable_compilation = kwargs.pop("compile", True) and not load_in_4bit
+        causal_model = init_cls(
+            model=model, config=config, model_save_dir=model_cache_path.parent, compile=enable_compilation, **kwargs
+        )

         if load_in_4bit:
             if not is_nncf_available():
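For reference, a minimal usage sketch (not part of the commit, with a placeholder model ID and prompt) of the scenario this fix targets: loading an OpenVINO causal LM with 4-bit weight compression. With this change, compilation is deferred when load_in_4bit is set, so the inference request can still be None when states are first reset, which is why the reset_state() call is now guarded.

from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

model_id = "gpt2"  # placeholder model ID, for illustration only
tokenizer = AutoTokenizer.from_pretrained(model_id)

# load_in_4bit requests NNCF 4-bit weight compression; with this commit the model
# is no longer compiled before compression (compile is forced off for 4-bit loads)
model = OVModelForCausalLM.from_pretrained(model_id, export=True, load_in_4bit=True)

inputs = tokenizer("Hello, OpenVINO!", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))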
