[OV]: Fixed inference after 4 bit weight compression (huggingface#569)
* [OV]: Fixed inference after 4 bit weight compression

* Fixed issue

* Update optimum/intel/openvino/modeling_decoder.py

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>

* Applied comments

* Fixed issue when request is None

---------

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
2 people authored and PenghuiCheng committed Mar 13, 2024
1 parent 416b528 commit d62964a
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions optimum/intel/openvino/modeling_decoder.py
@@ -419,7 +419,8 @@ def prepare_inputs(
             # past_key_values are not used explicitly, instead they are handled inside the model
             if past_key_values is None:
                 # This is the first iteration in a sequence, reset all states
-                self.request.reset_state()
+                if self.request is not None:
+                    self.request.reset_state()
                 # Set initial value for the next beam_idx input that will be used at the current iteration
                 # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used
                 self.next_beam_idx = np.arange(batch_size, dtype=int)
@@ -592,7 +593,10 @@ def _from_pretrained(
         else:
             init_cls = cls

-        causal_model = init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs)
+        enable_compilation = kwargs.pop("compile", True) and not load_in_4bit
+        causal_model = init_cls(
+            model=model, config=config, model_save_dir=model_cache_path.parent, compile=enable_compilation, **kwargs
+        )

         if load_in_4bit:
             if not is_nncf_available():
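For reference, a minimal usage sketch (not part of the commit, with a placeholder model ID and prompt) of the scenario this fix targets: loading an OpenVINO causal LM with 4-bit weight compression. With this change, compilation is deferred when load_in_4bit is set, so the inference request can still be None when states are first reset, which is why the reset_state() call is now guarded.

from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

model_id = "gpt2"  # placeholder model ID, for illustration only
tokenizer = AutoTokenizer.from_pretrained(model_id)

# load_in_4bit requests NNCF 4-bit weight compression; with this commit the model
# is no longer compiled before compression (compile is forced off for 4-bit loads)
model = OVModelForCausalLM.from_pretrained(model_id, export=True, load_in_4bit=True)

inputs = tokenizer("Hello, OpenVINO!", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))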
