Revert rotary embedding patching for recovering gpu accuracy (#855)

* revert rotary embedding patching for recovering gpu accuracy
* revert tests
* Update test_exporters_cli.py
huggingface · Aug 6, 2024 · 7c8650d · 7c8650d
1 parent bb38cce
commit 7c8650d
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 16 deletions.
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
@@ -561,17 +561,11 @@ def __enter__(self):
         # fill causal mask in slightly different way for avoid overflow on some platforms
         patch_update_causal_mask(self._model, "4.39.0")
 
-        if is_transformers_version(">=", "4.39.0"):
-            register_sin_cos_buffer(self._model)
-
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         if hasattr(self._model.model, "_orig_update_causal_mask"):
             self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask
 
-            for layer in self._model.model.layers:
-                layer.self_attn.rotary_emb.forward = layer.self_attn.rotary_emb._orig_forward
-
 
 # copied from https://github.com/huggingface/transformers/commit/57d7594a79a9f5d835abf2d4d384db0e4818e548 to unblock export with transformers 4.42
 def _mistral_update_causal_mask(
@@ -692,10 +686,6 @@ def __enter__(self):
             self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask
             self._model.model._update_causal_mask = types.MethodType(_mistral_update_causal_mask, self._model.model)
 
-            # mistral has some accuracy issues with bf16 with transformers >= 4.42
-            # prefill rotary emb sin/cos for avoid this issue
-            register_sin_cos_buffer(self._model)
-
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
 

diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
@@ -41,7 +41,7 @@
 )
 from optimum.intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS
 from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS
-from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
+from optimum.intel.utils.import_utils import is_openvino_tokenizers_available
 
 
 class OVCLIExportTestCase(unittest.TestCase):
@@ -95,21 +95,21 @@ class OVCLIExportTestCase(unittest.TestCase):
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 8 --all-layers",
             0,
-            32 if is_transformers_version("<", "4.39.0") else 34,
+            32,
         ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --awq --dataset wikitext2 --num-samples 100 "
             "--sensitivity-metric max_activation_variance",
-            6 if is_transformers_version(">=", "4.39") else 4,
+            4,
             28,
         ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --scale-estimation --dataset wikitext2 --num-samples 100 ",
-            6 if is_transformers_version(">=", "4.39") else 4,
+            4,
             28,
         ),
     ]

diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
@@ -236,7 +236,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 quant_method=QuantizationMethod.AWQ,
                 scale_estimation=True,
             ),
-            18 if is_transformers_version(">=", "4.39") else 16,
+            16,
         ),
         (
             OVModelForCausalLM,
@@ -250,7 +250,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 dataset="c4",
                 quant_method="awq",
             ),
-            18 if is_transformers_version(">=", "4.39") else 16,
+            16,
         ),
     )