Skip to content

Commit

Permalink
xfail for bug with f16 zp (openvinotoolkit#3220)
Browse files Browse the repository at this point in the history
### Changes

Mark as xfail the PTWC Torch tests that count the number of int4/int8 ops.

### Reason for changes

Starting from OV 2025, the zero point (ZP) is represented as f16 in some cases (bug:
160006).

### Related tickets

159993
160006

### Tests

- [x] openvino-nightly/job/post_training_weight_compression/73

![image](https://github.com/user-attachments/assets/bac6b5f0-0d8a-4890-9558-3ce6028ca56c)
- [x] manual/job/post_training_weight_compression/303

![image](https://github.com/user-attachments/assets/0d76fa7a-baf6-4515-bdd5-7bdda11b046e)
  • Loading branch information
ljaljushkin authored Jan 29, 2025
1 parent 0333814 commit 553b21b
Show file tree
Hide file tree
Showing 9 changed files with 81 additions and 34 deletions.
8 changes: 8 additions & 0 deletions tests/post_training/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,11 @@ To mark a test as expected to fail (xfail) when a validation metric does not mee
...
metrics_xfail_reason: "Issue-<jira ticket number>"
```
To mark a test as expected to fail (xfail) when the number of compression operations does not meet expectations, add the following line to the reference data:
```yml
<Name from model scopes>_backend_<BACKEND>:
...
num_compressed_xfail_reason: "Issue-<jira ticket number>"
```
10 changes: 10 additions & 0 deletions tests/post_training/data/wc_reference_data_2025.0.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Reference data for weight-compression tests run against OpenVINO 2025.0.
# The num_compressed_xfail_reason entries mark the int4/int8 op-count checks
# as expected failures: starting from OV 2025 the zero point is represented
# as f16 in some cases (ticket 160006).
tinyllama_int8_data_free_backend_TORCH:
  metric_value: 0.95624
  num_int4: 0
  num_int8: 312
  num_compressed_xfail_reason: "Issue-160006"
tinyllama_int4_data_free_backend_TORCH:
  metric_value: 0.73873
  num_int4: 114
  num_int8: 84
  num_compressed_xfail_reason: "Issue-160006"
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
from tests.post_training.pipelines.base import LIMIT_LENGTH_OF_STATUS
from tests.post_training.pipelines.base import PT_BACKENDS
from tests.post_training.pipelines.base import BackendType
from tests.post_training.pipelines.base import ErrorReason
from tests.post_training.pipelines.base import ErrorReport
from tests.post_training.pipelines.base import NumCompressNodes
from tests.post_training.pipelines.base import RunInfo
from tests.post_training.pipelines.image_classification_timm import ImageClassificationTimm
Expand Down Expand Up @@ -170,13 +172,14 @@ def _compress(self):
)

def _validate(self):
    """Extend base validation with a check on the number of sparse activations.

    :return: Error reports from the base validation plus, when the measured
        number of sparse activations differs from the reference value
        ("num_sparse_activations" in the reference data, defaulting to 0),
        an additional NUM_COMPRESSED error report.
    """
    errors = super()._validate()
    ref_num_sparse_activations = self.reference_data.get("num_sparse_activations", 0)
    num_sparse_activations = self.run_info.num_compress_nodes.num_sparse_activations
    if num_sparse_activations != ref_num_sparse_activations:
        # Parenthesized literal concatenation instead of a backslash-continued
        # f-string, which embedded the continuation line's leading whitespace
        # into the message.
        status_msg = (
            f"Regression: The number of sparse activations is {num_sparse_activations}, "
            f"which differs from reference {ref_num_sparse_activations}."
        )
        errors.append(ErrorReport(ErrorReason.NUM_COMPRESSED, status_msg))
    return errors


class LMSparsifyActivations(SAPipelineMixin, LMWeightCompression):
Expand Down
56 changes: 43 additions & 13 deletions tests/post_training/pipelines/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from datetime import timedelta
from enum import Enum
from pathlib import Path
from typing import Dict, Optional
from typing import Dict, List, Optional

import numpy as np
import onnx
Expand All @@ -36,7 +36,18 @@
from tools.memory_monitor import memory_monitor_context

DEFAULT_VAL_THREADS = 4
METRICS_XFAIL_REASON = "metrics_xfail_reason"
XFAIL_SUFFIX = "_xfail_reason"


# Categories of validation failure. Each member's value is the prefix of the
# "<reason>_xfail_reason" key that is looked up in the reference data when
# deciding whether a failure is an expected one (XFAIL).
ErrorReason = Enum("ErrorReason", [("METRICS", "metrics"), ("NUM_COMPRESSED", "num_compressed")])


@dataclass
class ErrorReport:
    """A single validation failure: its category and a human-readable message."""

    # Failure category; its value prefixes the "<reason>_xfail_reason" key
    # searched in the reference data to decide whether the failure is expected.
    reason: ErrorReason
    # Human-readable description of the failure.
    msg: str


class BackendType(Enum):
Expand Down Expand Up @@ -278,9 +289,31 @@ def get_num_compressed(self) -> None:
def run_bench(self) -> None:
"""Run a benchmark to collect performance statistics."""

@abstractmethod
def _validate(self) -> None:
"""Validate IR."""
def _validate(self) -> List[ErrorReport]:
    """Run pipeline-specific validation checks.

    The base implementation performs no checks; subclasses override this to
    collect validation failures.

    :return: Error reports produced during validation (empty by default).
    """
    errors: List[ErrorReport] = []
    return errors

def _process_errors(self, errors: "List[ErrorReport]") -> str:
    """
    Processes a list of error reports and updates the run status.

    Errors whose "<reason>_xfail_reason" key is present in the reference data
    are treated as expected failures (XFAIL) and folded into the run status;
    any remaining errors abort the run.

    :param errors: A list of error reports.
    :return: A string representing the concatenated statuses of the processed errors.
    :raises ValueError: If at least one error has no matching xfail reason.
    """
    xfails, msg_list = [], []
    for report in errors:
        xfail_reason = report.reason.value + XFAIL_SUFFIX
        if xfail_reason in self.reference_data:
            xfails.append(f"XFAIL: {self.reference_data[xfail_reason]} - {report.msg}")
        else:
            msg_list.append(report.msg)
    if msg_list:
        raise ValueError("\n".join(msg_list))
    # Bug fix: the signature is annotated "-> str" and the docstring promises
    # the concatenated statuses, but the original body returned None.
    status = "\n".join(xfails)
    self.run_info.status = status
    return status

def prepare(self):
"""
Expand All @@ -302,7 +335,7 @@ def validate(self) -> None:
return
print("Validation...")

self._validate()
errors = self._validate()

metric_value = self.run_info.metric_value
metric_reference = self.reference_data.get("metric_value")
Expand All @@ -311,22 +344,19 @@ def validate(self) -> None:
if metric_value is not None and metric_value_fp32 is not None:
self.run_info.metric_diff = round(self.run_info.metric_value - self.reference_data["metric_value_fp32"], 5)

status_msg = None
if (
metric_value is not None
and metric_reference is not None
and not np.isclose(metric_value, metric_reference, atol=self.reference_data.get("atol", 0.001))
):
status_msg = None
if metric_value < metric_reference:
status_msg = f"Regression: Metric value is less than reference {metric_value} < {metric_reference}"
if metric_value > metric_reference:
status_msg = f"Improvement: Metric value is better than reference {metric_value} > {metric_reference}"

if status_msg is not None:
if METRICS_XFAIL_REASON in self.reference_data:
self.run_info.status = f"XFAIL: {self.reference_data[METRICS_XFAIL_REASON]} - {status_msg}"
else:
raise ValueError(status_msg)
if status_msg:
errors.append(ErrorReport(ErrorReason.METRICS, status_msg))
self._process_errors(errors)

def run(self) -> None:
"""
Expand Down
3 changes: 0 additions & 3 deletions tests/post_training/pipelines/causal_language_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,3 @@ def prepare_calibration_dataset(self):

if self.backend == BackendType.OPTIMUM:
self.calibration_dataset = calibration_dataset

def _validate(self):
pass
3 changes: 0 additions & 3 deletions tests/post_training/pipelines/gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,3 @@ def preprocess_function(examples):
self.calibration_dataset = calibration_dataset
else:
self.calibration_dataset = nncf.Dataset(calibration_dataset, self.get_transform_calibration_fn())

def _validate(self):
pass
5 changes: 4 additions & 1 deletion tests/post_training/pipelines/image_classification_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import copy
import os
from typing import List

import numpy as np
import openvino as ov
Expand All @@ -21,6 +22,7 @@
import nncf
from nncf.common.logging.track_progress import track
from tests.post_training.pipelines.base import DEFAULT_VAL_THREADS
from tests.post_training.pipelines.base import ErrorReport
from tests.post_training.pipelines.base import PTQTestPipeline


Expand All @@ -33,7 +35,7 @@ def prepare_calibration_dataset(self):

self.calibration_dataset = nncf.Dataset(loader, self.get_transform_calibration_fn())

def _validate(self):
def _validate(self) -> List[ErrorReport]:
val_dataset = datasets.ImageFolder(root=self.data_dir / "imagenet" / "val", transform=self.transform)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, num_workers=2, shuffle=False)

Expand Down Expand Up @@ -78,3 +80,4 @@ def process_result(request, userdata):

self.run_info.metric_name = "Acc@1"
self.run_info.metric_value = acc_top1
return []
20 changes: 11 additions & 9 deletions tests/post_training/pipelines/lm_weight_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import shutil
import time
from dataclasses import dataclass
from typing import Dict, Optional
from typing import Dict, List, Optional

import numpy as np
import openvino as ov
Expand All @@ -31,6 +31,8 @@
from tests.cross_fw.shared.paths import TEST_ROOT
from tests.post_training.pipelines.base import BackendType
from tests.post_training.pipelines.base import BaseTestPipeline
from tests.post_training.pipelines.base import ErrorReason
from tests.post_training.pipelines.base import ErrorReport
from tests.post_training.pipelines.base import StatsFromOutput
from tools.memory_monitor import MemoryType
from tools.memory_monitor import MemoryUnit
Expand Down Expand Up @@ -257,7 +259,8 @@ def _compress(self):
**self.compression_params,
)

def _validate(self):
def _validate(self) -> List[ErrorReport]:
errors = []
is_stateful = self.params.get("is_stateful", False)
core = ov.Core()

Expand Down Expand Up @@ -309,12 +312,11 @@ def _validate(self):
num_int4_value = self.run_info.num_compress_nodes.num_int4
num_int8_value = self.run_info.num_compress_nodes.num_int8

template = "Regression: The number of int{} ops is different than reference {} != {}"
if num_int4_reference != num_int4_value:
status_msg = f"Regression: The number of int4 ops is different \
than reference {num_int4_reference} != {num_int4_value}"
raise ValueError(status_msg)

status_msg = template.format(4, num_int4_reference, num_int4_value)
errors.append(ErrorReport(ErrorReason.NUM_COMPRESSED, status_msg))
if num_int8_reference != num_int8_value:
status_msg = f"Regression: The number of int8 ops is different \
than reference {num_int8_reference} != {num_int8_value}"
raise ValueError(status_msg)
status_msg = template.format(8, num_int8_reference, num_int8_value)
errors.append(ErrorReport(ErrorReason.NUM_COMPRESSED, status_msg))
return errors
3 changes: 0 additions & 3 deletions tests/post_training/pipelines/masked_language_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,3 @@ def preprocess_function(examples):
self.calibration_dataset = calibration_dataset
else:
self.calibration_dataset = nncf.Dataset(calibration_dataset, self.get_transform_calibration_fn())

def _validate(self):
pass

0 comments on commit 553b21b

Please sign in to comment.