From 125b2c3e7b38a0ee02eaa16e3b93f3371cb7b6ab Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Wed, 26 Jan 2022 22:11:12 +0300 Subject: [PATCH 01/11] Improve simple merge --- datumaro/components/errors.py | 29 +++++- datumaro/components/media.py | 13 ++- datumaro/components/operations.py | 111 +++++++++++++++++------ datumaro/plugins/voc_format/extractor.py | 23 +++-- tests/test_dataset.py | 46 +++++++++- tests/test_images.py | 7 +- 6 files changed, 184 insertions(+), 45 deletions(-) diff --git a/datumaro/components/errors.py b/datumaro/components/errors.py index 627ebe2997..6cbf486fde 100644 --- a/datumaro/components/errors.py +++ b/datumaro/components/errors.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: MIT +from typing import Any, Optional, Tuple + from attr import attrib, attrs @@ -223,14 +225,35 @@ def _my__init__(self, msg=None, *, sources=None): @attrs class MismatchingImageInfoError(DatasetMergeError): - item_id = attrib() - a = attrib() - b = attrib() + item_id: Optional[Tuple[str, str]] = attrib() + a: int = attrib() + b: int = attrib() def __str__(self): return "Item %s: mismatching image size info: %s vs %s" % \ (self.item_id, self.a, self.b) +@attrs +class MismatchingImagePathError(DatasetMergeError): + item_id: Optional[Tuple[str, str]] = attrib() + a: str = attrib() + b: str = attrib() + + def __str__(self): + return "Item %s: mismatching image path info: %s vs %s" % \ + (self.item_id, self.a, self.b) + +@attrs +class MismatchingAttributesError(DatasetMergeError): + item_id: Optional[Tuple[str, str]] = attrib() + key: str = attrib() + a: Any = attrib() + b: Any = attrib() + + def __str__(self): + return "Item %s: mismatching image attribute %s: %s vs %s" % \ + (self.item_id or '', self.key, self.a, self.b) + class ConflictingCategoriesError(DatasetMergeError): pass diff --git a/datumaro/components/media.py b/datumaro/components/media.py index e72afe35a4..1e5c53a126 100644 --- a/datumaro/components/media.py +++ b/datumaro/components/media.py @@ -49,9 +49,14 @@ def __init__(self, if size is not None: assert len(size) == 2 and 0 < size[0] and 0 < size[1], size size = tuple(map(int, size)) + self._size = size # (H, W) - if not self._size and isinstance(data, np.ndarray): - self._size = data.shape[:2] + + if isinstance(data, np.ndarray): + if not self._size: + self._size = data.shape[:2] + else: + assert self._size == data.shape[:2] assert path is None or isinstance(path, str), path if path is None: @@ -72,8 +77,8 @@ def __init__(self, self._ext = ext if not isinstance(data, np.ndarray): - assert path or callable(data), "Image can not be empty" - assert data is None or callable(data) + assert path or callable(data) or size, "Image can not be empty" + assert data is None or callable(data) or size if path and osp.isfile(path) or data: data = lazy_image(path, loader=data) self._data = data diff --git a/datumaro/components/operations.py b/datumaro/components/operations.py index ff8ef4fdbf..04c963df7a 100644 --- a/datumaro/components/operations.py +++ b/datumaro/components/operations.py @@ -4,7 +4,7 @@ from collections import OrderedDict from copy import deepcopy -from typing import Callable, Dict, Optional, Set, Tuple +from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple from unittest import TestCase import hashlib import logging as log @@ -15,17 +15,19 @@ import numpy as np from datumaro.components.annotation import ( - AnnotationType, Bbox, Label, LabelCategories, MaskCategories, + Annotation, AnnotationType, Bbox, Label, LabelCategories, MaskCategories, 
PointsCategories, ) from datumaro.components.cli_plugin import CliPlugin from datumaro.components.dataset import Dataset, DatasetItemStorage, IDataset from datumaro.components.errors import ( AnnotationsTooCloseError, ConflictingCategoriesError, DatasetMergeError, - FailedAttrVotingError, FailedLabelVotingError, MismatchingImageInfoError, - NoMatchingAnnError, NoMatchingItemError, WrongGroupError, + FailedAttrVotingError, FailedLabelVotingError, MismatchingAttributesError, + MismatchingImageInfoError, MismatchingImagePathError, NoMatchingAnnError, + NoMatchingItemError, WrongGroupError, ) from datumaro.components.extractor import CategoriesInfo, DatasetItem +from datumaro.components.media import Image from datumaro.util import filter_dict, find from datumaro.util.annotation_util import ( OKS, approximate_line, bbox_iou, find_instances, max_bbox, mean_bbox, @@ -106,40 +108,95 @@ def merge(cls, *sources): def merge_items(cls, existing_item, current_item): return existing_item.wrap( image=cls.merge_images(existing_item, current_item), + attributes=cls.merge_attrs( + existing_item.attributes, current_item.attributes, + item_id=(existing_item.id, existing_item.subset)), annotations=cls.merge_anno( existing_item.annotations, current_item.annotations)) @staticmethod - def merge_images(existing_item, current_item): - image = None - if existing_item.has_image and current_item.has_image: - if existing_item.image.has_data: - image = existing_item.image + def merge_attrs(a: Dict, b: Dict, + item_id: Optional[Tuple[str, str]] = None) -> Dict: + merged = {} + + for name in set(a) | set(b): + a_val = a.get(name, None) + b_val = b.get(name, None) + + if name not in a: + m_val = b_val + elif name not in b: + m_val = a_val + elif a_val != b_val: + raise MismatchingAttributesError(item_id, name, a_val, b_val) else: - image = current_item.image - - if existing_item.image.path != current_item.image.path: - if not existing_item.image.path: - image._path = current_item.image.path - - if all([existing_item.image._size, current_item.image._size]): - if existing_item.image._size != current_item.image._size: - raise MismatchingImageInfoError( - (existing_item.id, existing_item.subset), - existing_item.image._size, current_item.image._size) - elif existing_item.image._size: - image._size = existing_item.image._size + m_val = a_val + + merged[name] = m_val + + return merged + + @staticmethod + def merge_images(item_a: DatasetItem, item_b: DatasetItem) -> Image: + image = None + + if item_a.has_image and item_b.has_image: + if item_a.image.path and item_b.image.path and \ + item_a.image.path != item_b.image.path and \ + item_a.image.has_data is item_b.image.has_data: + # We use has_data as a replacement for path existence check + # - If only one image has data, we'll use it. The other + # one is just a path metainfo, which is not significant + # in this case. + # - If both images have data or both don't, we need + # to compare paths. 
+ # + # Different paths can aclually point to the same file, + # but it's not the case we'd like to allow here to be + # a "simple" merging strategy used for extractor joining + raise MismatchingImagePathError( + (item_a.id, item_a.subset), + item_a.image.path, item_b.image.path) + + if item_a.image.has_size and item_b.image.has_size and \ + item_a.image.size != item_b.image.size: + raise MismatchingImageInfoError( + (item_a.id, item_a.subset), + item_a.image.size, item_b.image.size) + + # Avoid direct comparison here for better performance + # If there are 2 "data-only" images, they won't be compared and + # we just use the first one + if item_a.image.has_data: + image = item_a.image + elif item_b.image.has_data: + image = item_b.image + elif item_a.image.path: + image = item_a.image + elif item_b.image.path: + image = item_b.image + elif item_a.image.has_size: + image = item_a.image + elif item_b.image.has_size: + image = item_b.image else: - image._size = current_item.image._size - elif existing_item.has_image: - image = existing_item.image + image = item_a.image + + if not image.has_data or not image.has_size: + if item_a.image._size: + image._size = item_a.image._size + elif item_b.image._size: + image._size = item_b.image._size + elif item_a.has_image: + image = item_a.image else: - image = current_item.image + image = item_b.image return image @staticmethod - def merge_anno(a, b): + def merge_anno(a: Iterable[Annotation], + b: Iterable[Annotation]) -> List[Annotation]: return merge_annotations_equal(a, b) @staticmethod diff --git a/datumaro/plugins/voc_format/extractor.py b/datumaro/plugins/voc_format/extractor.py index d57b2e25a4..97b60d3d81 100644 --- a/datumaro/plugins/voc_format/extractor.py +++ b/datumaro/plugins/voc_format/extractor.py @@ -130,12 +130,20 @@ def __init__(self, path, task): super().__init__(path, task) def __iter__(self): + image_dir = osp.join(self._dataset_dir, VocPath.IMAGES_DIR) + if osp.isdir(image_dir): + images = { + osp.splitext(osp.relpath(p, image_dir))[0].replace('\\', '/'): p + for p in find_images(image_dir, recursive=True) + } + else: + images = {} + anno_dir = osp.join(self._dataset_dir, VocPath.ANNOTATIONS_DIR) for item_id in self._items: log.debug("Reading item '%s'" % item_id) - image = item_id + VocPath.IMAGE_EXT - height, width = 0, 0 + size = None anns = [] ann_file = osp.join(anno_dir, item_id + '.xml') @@ -147,14 +155,17 @@ def __iter__(self): width = root_elem.find('size/width') if width is not None: width = int(width.text) + if height and width: + size = (height, width) filename_elem = root_elem.find('filename') if filename_elem is not None: - image = filename_elem.text + image = osp.join(image_dir, filename_elem.text) anns = self._parse_annotations(root_elem) + else: + image = images.pop(item_id, None) - image = osp.join(self._dataset_dir, VocPath.IMAGES_DIR, image) - if height and width: - image = Image(path=image, size=(height, width)) + if image or size: + image = Image(path=image, size=size) yield DatasetItem(id=item_id, subset=self._subset, image=image, annotations=anns) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index f69f1216b4..f399fab35d 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -17,7 +17,9 @@ ) from datumaro.components.environment import Environment from datumaro.components.errors import ( - ConflictingCategoriesError, DatasetNotFoundError, MultipleFormatsMatchError, + ConflictingCategoriesError, DatasetNotFoundError, + MismatchingAttributesError, MismatchingImageInfoError, + 
MismatchingImagePathError, MultipleFormatsMatchError, NoMatchingFormatsError, RepeatedItemError, UnknownFormatError, ) from datumaro.components.extractor import ( @@ -390,14 +392,14 @@ def test_can_join_annotations(self): DatasetItem(id=1, subset='train', annotations=[ Label(1, id=3), Label(2, attributes={ 'x': 1 }), - ]) + ], attributes={'x': 1, 'y': 2}) ], categories=['a', 'b', 'c', 'd']) b = Dataset.from_iterable([ DatasetItem(id=1, subset='train', annotations=[ Label(2, attributes={ 'x': 1 }), Label(3, id=4), - ]) + ], attributes={'z': 3, 'y': 2}) ], categories=['a', 'b', 'c', 'd']) expected = Dataset.from_iterable([ @@ -405,7 +407,7 @@ def test_can_join_annotations(self): Label(1, id=3), Label(2, attributes={ 'x': 1 }), Label(3, id=4), - ]) + ], attributes={'x': 1, 'y': 2, 'z': 3}) ], categories=['a', 'b', 'c', 'd']) merged = Dataset.from_extractors(a, b) @@ -420,6 +422,42 @@ def test_cant_join_different_categories(self): with self.assertRaises(ConflictingCategoriesError): Dataset.from_extractors(s1, s2) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_cant_join_different_image_info(self): + s1 = Dataset.from_iterable([ + DatasetItem(1, image=Image(path='1.png', size=(2, 4))) + ]) + s2 = Dataset.from_iterable([ + DatasetItem(1, image=Image(path='1.png', size=(4, 2))) + ]) + + with self.assertRaises(MismatchingImageInfoError): + Dataset.from_extractors(s1, s2) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_cant_join_different_images(self): + s1 = Dataset.from_iterable([ + DatasetItem(1, image=Image(path='1.png')) + ]) + s2 = Dataset.from_iterable([ + DatasetItem(1, image=Image(path='2.png')) + ]) + + with self.assertRaises(MismatchingImagePathError): + Dataset.from_extractors(s1, s2) + + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + def test_cant_join_different_attrs(self): + s1 = Dataset.from_iterable([ + DatasetItem(1, attributes={'x': 1}) + ]) + s2 = Dataset.from_iterable([ + DatasetItem(1, attributes={'x': 2}) + ]) + + with self.assertRaises(MismatchingAttributesError): + Dataset.from_extractors(s1, s2) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_can_join_datasets(self): s1 = Dataset.from_iterable([ DatasetItem(0), DatasetItem(1) ]) diff --git a/tests/test_images.py b/tests/test_images.py index 829ff0dcbb..717ab5ca7e 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -98,11 +98,15 @@ def test_ctors(self): np.testing.assert_array_equal(img.data, image) self.assertEqual(img.size, tuple(image.shape[:2])) + with self.subTest(): + img = Image(size=(2, 4)) + self.assertEqual(img.size, (2, 4)) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ctor_errors(self): with self.subTest('no data specified'): with self.assertRaisesRegex(Exception, "can not be empty"): - Image(ext='jpg', size=(1, 2)) + Image(ext='jpg') with self.subTest('either path or ext'): with self.assertRaisesRegex(Exception, "both 'path' and 'ext'"): @@ -135,6 +139,7 @@ def test_ctors(self): { 'data': image_bytes, 'path': path, 'size': (2, 4) }, { 'path': path }, { 'path': path, 'size': (2, 4) }, + { 'path': path, 'size': (2, 4) }, ]: with self.subTest(**args): img = ByteImage(**args) From 9249983e30166a06e9910cb954b2b09a250b51b4 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Thu, 27 Jan 2022 10:56:08 +0300 Subject: [PATCH 02/11] Update changelog --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ce2db9e22c..54b8daabff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,6 +42,12 @@ and 
this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 () - OpenVINO telemetry library 2022.1.0 from PyPI. () +- Allowed `Image` creation from just `size` info + () +- Added image search in VOC XML-based subformats + () +- Added image path equality checks in simple merge, when applicable + () ### Deprecated - TBD @@ -70,6 +76,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Numeric warnings that sometimes occurred in `stats` command (e.g. ) () +- Added missing item attribute merging in simple merge + () ### Security - TBD From 41a19eb4cea958a2f74ae2a1782d89ec2f36f5ab Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Thu, 27 Jan 2022 11:16:33 +0300 Subject: [PATCH 03/11] Add image docs --- datumaro/components/media.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/datumaro/components/media.py b/datumaro/components/media.py index 1e5c53a126..31242d6a67 100644 --- a/datumaro/components/media.py +++ b/datumaro/components/media.py @@ -44,7 +44,25 @@ def __init__(self, *, path: Optional[str] = None, ext: Optional[str] = None, - size: Optional[Tuple[int, int]] = None): + size: Optional[Tuple[int, int]] = None) -> None: + """ + Creates an image. + + Any combinations of the `data`, `path` and `size` arguments are + possible, but at least one of them must be provided. + The `ext` parameter cannot be used as a single argument for + construction. + + Args: + data - Image pixels or a function to retrieve them. The expected + image shape is (H, W [, C]). If a function is provided, + it must accept image path as the first argument. + path - Image path + ext - Image extension. Cannot be used together with `path`. It is + useful for saving with a custom extension. + size - A pair (H, W), which represents image size. 
+ """ + assert size is None or len(size) == 2, size if size is not None: assert len(size) == 2 and 0 < size[0] and 0 < size[1], size @@ -102,6 +120,7 @@ def has_data(self) -> bool: @property def has_size(self) -> bool: + """Indicates that size info is cached and won't require image loading""" return self._size is not None or isinstance(self._data, np.ndarray) @property From e6545ed87d3b265c52cd78c4e5eb5bd9baf4c853 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Thu, 27 Jan 2022 11:28:31 +0300 Subject: [PATCH 04/11] Fix tests --- tests/cli/test_image_zip_format.py | 2 +- tests/test_images.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/cli/test_image_zip_format.py b/tests/cli/test_image_zip_format.py index 7dde4707a1..ca64cd8ab3 100644 --- a/tests/cli/test_image_zip_format.py +++ b/tests/cli/test_image_zip_format.py @@ -49,7 +49,7 @@ def test_can_save_and_load(self): def test_can_export_zip_images_from_coco_dataset(self): with TestDir() as test_dir: coco_dir = osp.join(__file__[:__file__.rfind(osp.join('tests', ''))], - 'tests', 'assets', 'coco_dataset') + 'tests', 'assets', 'coco_dataset', 'coco') run(self, 'create', '-o', test_dir) run(self, 'import', '-p', test_dir, '-f', 'coco', coco_dir) diff --git a/tests/test_images.py b/tests/test_images.py index 717ab5ca7e..fc0b651c8e 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -65,8 +65,8 @@ class ImageTest(TestCase): def test_lazy_image_shape(self): data = np.ones((5, 6, 3)) - image_lazy = Image(data=data, size=(2, 4)) - image_eager = Image(data=data) + image_lazy = Image(data=lambda _: data, size=(2, 4)) + image_eager = Image(data=lambda _: data) self.assertEqual((2, 4), image_lazy.size) self.assertEqual((5, 6), image_eager.size) From 022d40eb5b7cc3ff9060746fed060f8e56501033 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 1 Feb 2022 17:57:30 +0300 Subject: [PATCH 05/11] Make secondary methods of ExactMatcher private --- datumaro/components/operations.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/datumaro/components/operations.py b/datumaro/components/operations.py index e7ac8f0248..023a061194 100644 --- a/datumaro/components/operations.py +++ b/datumaro/components/operations.py @@ -4,7 +4,7 @@ from collections import OrderedDict from copy import deepcopy -from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple +from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple from unittest import TestCase import hashlib import logging as log @@ -107,7 +107,7 @@ def merge(cls, *sources: IDataset) -> DatasetItemStorage: existing_item = items.get(item.id, item.subset) if existing_item is not None: try: - item = cls.merge_items(existing_item, item) + item = cls._merge_items(existing_item, item) except DatasetMergeError as e: e.sources = set(range(source_idx)) raise e @@ -116,19 +116,19 @@ def merge(cls, *sources: IDataset) -> DatasetItemStorage: return items @classmethod - def merge_items(cls, existing_item: DatasetItem, + def _merge_items(cls, existing_item: DatasetItem, current_item: DatasetItem) -> DatasetItem: return existing_item.wrap( - image=cls.merge_images(existing_item, current_item), - attributes=cls.merge_attrs( + image=cls._merge_images(existing_item, current_item), + attributes=cls._merge_attrs( existing_item.attributes, current_item.attributes, item_id=(existing_item.id, existing_item.subset)), - annotations=cls.merge_anno( + annotations=cls._merge_anno( existing_item.annotations, 
current_item.annotations)) @staticmethod - def merge_attrs(a: Dict, b: Dict, - item_id: Optional[Tuple[str, str]] = None) -> Dict: + def _merge_attrs(a: Dict[str, Any], b: Dict[str, Any], + item_id: Tuple[str, str]) -> Dict: merged = {} for name in set(a) | set(b): @@ -149,7 +149,7 @@ def merge_attrs(a: Dict, b: Dict, return merged @staticmethod - def merge_images(item_a: DatasetItem, item_b: DatasetItem) -> Image: + def _merge_images(item_a: DatasetItem, item_b: DatasetItem) -> Image: image = None if item_a.has_image and item_b.has_image: @@ -207,7 +207,7 @@ def merge_images(item_a: DatasetItem, item_b: DatasetItem) -> Image: return image @staticmethod - def merge_anno(a: Iterable[Annotation], b: Iterable[Annotation]) \ + def _merge_anno(a: Iterable[Annotation], b: Iterable[Annotation]) \ -> List[Annotation]: return merge_annotations_equal(a, b) From 166062297263ca9673e7e1cee376a044c081ae33 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 1 Feb 2022 17:57:59 +0300 Subject: [PATCH 06/11] Use define and field in errors --- datumaro/components/errors.py | 268 +++++++++++++++++----------------- 1 file changed, 134 insertions(+), 134 deletions(-) diff --git a/datumaro/components/errors.py b/datumaro/components/errors.py index 6cbf486fde..a6afba9471 100644 --- a/datumaro/components/errors.py +++ b/datumaro/components/errors.py @@ -1,10 +1,10 @@ -# Copyright (C) 2020-2021 Intel Corporation +# Copyright (C) 2020-2022 Intel Corporation # # SPDX-License-Identifier: MIT -from typing import Any, Optional, Tuple +from typing import Any, Tuple -from attr import attrib, attrs +from attrs import define, field class ImmutableObjectError(Exception): @@ -26,9 +26,9 @@ class ReadonlyProjectError(VcsError): def __str__(self): return "Can't change a read-only project" -@attrs +@define class UnknownRefError(VcsError): - ref = attrib() + ref = field() def __str__(self): return f"Can't parse ref '{self.ref}'" @@ -39,9 +39,9 @@ class MissingObjectError(VcsError): class MismatchingObjectError(VcsError): pass -@attrs +@define class UnsavedChangesError(VcsError): - paths = attrib() + paths = field() def __str__(self): return "There are some uncommitted changes: %s" % ', '.join(self.paths) @@ -99,47 +99,47 @@ def __str__(self): """ -@attrs +@define class ProjectNotFoundError(DatumaroError): - path = attrib() + path = field() def __str__(self): return f"Can't find project at '{self.path}'" -@attrs +@define class ProjectAlreadyExists(DatumaroError): - path = attrib() + path = field() def __str__(self): return f"Can't create project: a project already exists " \ f"at '{self.path}'" -@attrs +@define class UnknownSourceError(DatumaroError): - name = attrib() + name = field() def __str__(self): return f"Unknown source '{self.name}'" -@attrs +@define class UnknownTargetError(DatumaroError): - name = attrib() + name = field() def __str__(self): return f"Unknown target '{self.name}'" -@attrs +@define class UnknownFormatError(DatumaroError): - format = attrib() + format = field() def __str__(self): return f"Unknown source format '{self.format}'. 
To make it " \ "available, add the corresponding Extractor implementation " \ "to the environment" -@attrs +@define class SourceExistsError(DatumaroError): - name = attrib() + name = field() def __str__(self): return f"Source '{self.name}' already exists" @@ -148,16 +148,16 @@ def __str__(self): class DatasetImportError(DatumaroError): pass -@attrs +@define class DatasetNotFoundError(DatasetImportError): - path = attrib() + path = field() def __str__(self): return f"Failed to find dataset at '{self.path}'" -@attrs +@define class MultipleFormatsMatchError(DatasetImportError): - formats = attrib() + formats = field() def __str__(self): return "Failed to detect dataset format automatically:" \ @@ -176,9 +176,9 @@ class CategoriesRedefinedError(DatasetError): def __str__(self): return "Categories can only be set once for a dataset" -@attrs +@define class RepeatedItemError(DatasetError): - item_id = attrib() + item_id = field() def __str__(self): return f"Item {self.item_id} is repeated in the source sequence." @@ -187,23 +187,23 @@ def __str__(self): class DatasetQualityError(DatasetError): pass -@attrs +@define class AnnotationsTooCloseError(DatasetQualityError): - item_id = attrib() - a = attrib() - b = attrib() - distance = attrib() + item_id = field() + a = field() + b = field() + distance = field() def __str__(self): return "Item %s: annotations are too close: %s, %s, distance = %s" % \ (self.item_id, self.a, self.b, self.distance) -@attrs +@define class WrongGroupError(DatasetQualityError): - item_id = attrib() - found = attrib(converter=set) - expected = attrib(converter=set) - group = attrib(converter=list) + item_id = field() + found = field(converter=set) + expected = field(converter=set) + group = field(converter=list) def __str__(self): return "Item %s: annotation group has wrong labels: " \ @@ -211,9 +211,9 @@ def __str__(self): (self.item_id, self.found, self.expected, self.group) -@attrs(init=False) +@define(init=False) class DatasetMergeError(DatasetError): - sources = attrib(converter=set, factory=set, kw_only=True) + sources = field(converter=set, factory=set, kw_only=True) def _my__init__(self, msg=None, *, sources=None): super().__init__(msg) @@ -223,84 +223,84 @@ def _my__init__(self, msg=None, *, sources=None): # when __init__ is defined directly setattr(DatasetMergeError, '__init__', DatasetMergeError._my__init__) -@attrs +@define class MismatchingImageInfoError(DatasetMergeError): - item_id: Optional[Tuple[str, str]] = attrib() - a: int = attrib() - b: int = attrib() + item_id: Tuple[str, str] + a: Tuple[int, int] + b: Tuple[int, int] def __str__(self): return "Item %s: mismatching image size info: %s vs %s" % \ (self.item_id, self.a, self.b) -@attrs +@define class MismatchingImagePathError(DatasetMergeError): - item_id: Optional[Tuple[str, str]] = attrib() - a: str = attrib() - b: str = attrib() + item_id: Tuple[str, str] + a: str + b: str def __str__(self): return "Item %s: mismatching image path info: %s vs %s" % \ (self.item_id, self.a, self.b) -@attrs +@define class MismatchingAttributesError(DatasetMergeError): - item_id: Optional[Tuple[str, str]] = attrib() - key: str = attrib() - a: Any = attrib() - b: Any = attrib() + item_id: Tuple[str, str] + key: str + a: Any + b: Any def __str__(self): return "Item %s: mismatching image attribute %s: %s vs %s" % \ - (self.item_id or '', self.key, self.a, self.b) + (self.item_id, self.key, self.a, self.b) class ConflictingCategoriesError(DatasetMergeError): pass -@attrs +@define class NoMatchingAnnError(DatasetMergeError): - 
item_id = attrib() - ann = attrib() + item_id = field() + ann = field() def __str__(self): return "Item %s: can't find matching annotation " \ "in sources %s, annotation is %s" % \ (self.item_id, self.sources, self.ann) -@attrs +@define class NoMatchingItemError(DatasetMergeError): - item_id = attrib() + item_id = field() def __str__(self): return "Item %s: can't find matching item in sources %s" % \ (self.item_id, self.sources) -@attrs +@define class FailedLabelVotingError(DatasetMergeError): - item_id = attrib() - votes = attrib() - ann = attrib(default=None) + item_id = field() + votes = field() + ann = field(default=None) def __str__(self): return "Item %s: label voting failed%s, votes %s, sources %s" % \ (self.item_id, 'for ann %s' % self.ann if self.ann else '', self.votes, self.sources) -@attrs +@define class FailedAttrVotingError(DatasetMergeError): - item_id = attrib() - attr = attrib() - votes = attrib() - ann = attrib() + item_id = field() + attr = field() + votes = field() + ann = field() def __str__(self): return "Item %s: attribute voting failed " \ "for ann %s, votes %s, sources %s" % \ (self.item_id, self.ann, self.votes, self.sources) -@attrs +@define class DatasetValidationError(DatumaroError): - severity = attrib() + severity = field() def to_dict(self): return { @@ -310,10 +310,10 @@ def to_dict(self): } -@attrs +@define class DatasetItemValidationError(DatasetValidationError): - item_id = attrib() - subset = attrib() + item_id = field() + subset = field() def to_dict(self): dict_repr = super().to_dict() @@ -321,103 +321,103 @@ def to_dict(self): dict_repr['subset'] = self.subset return dict_repr -@attrs +@define class MissingLabelCategories(DatasetValidationError): def __str__(self): return "Metadata (ex. LabelCategories) should be defined" \ " to validate a dataset." -@attrs +@define class MissingAnnotation(DatasetItemValidationError): - ann_type = attrib() + ann_type = field() def __str__(self): return f"Item needs '{self.ann_type}' annotation(s), " \ "but not found." -@attrs +@define class MultiLabelAnnotations(DatasetItemValidationError): def __str__(self): return 'Item needs a single label but multiple labels are found.' -@attrs +@define class MissingAttribute(DatasetItemValidationError): - label_name = attrib() - attr_name = attrib() + label_name = field() + attr_name = field() def __str__(self): return f"Item needs the attribute '{self.attr_name}' " \ f"for the label '{self.label_name}'." -@attrs +@define class UndefinedLabel(DatasetItemValidationError): - label_name = attrib() + label_name = field() def __str__(self): return f"Item has the label '{self.label_name}' which " \ "is not defined in metadata." -@attrs +@define class UndefinedAttribute(DatasetItemValidationError): - label_name = attrib() - attr_name = attrib() + label_name = field() + attr_name = field() def __str__(self): return f"Item has the attribute '{self.attr_name}' for the " \ f"label '{self.label_name}' which is not defined in metadata." -@attrs +@define class LabelDefinedButNotFound(DatasetValidationError): - label_name = attrib() + label_name = field() def __str__(self): return f"The label '{self.label_name}' is defined in " \ "metadata, but not found in the dataset." -@attrs +@define class AttributeDefinedButNotFound(DatasetValidationError): - label_name = attrib() - attr_name = attrib() + label_name = field() + attr_name = field() def __str__(self): return f"The attribute '{self.attr_name}' for the label " \ f"'{self.label_name}' is defined in metadata, but not " \ "found in the dataset." 
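
This commit is largely a mechanical rename from the legacy attr API (@attrs/attrib) to the modern attrs one (@define/field). A minimal sketch of how the two spellings line up (the class and field names below are invented for illustration, they are not taken from the patch):

    from attr import attrs, attrib
    from attrs import define, field

    @attrs                  # legacy API, as used before this commit
    class PointOld:
        x = attrib()
        y = attrib(default=0)

    @define                 # modern replacement used throughout this commit
    class PointNew:
        x = field()
        y = field(default=0)

    # Both variants generate __init__, __repr__ and a field-based __eq__
    assert PointOld(1) == PointOld(1, 0)
    assert PointNew(1) == PointNew(1, 0)

One behavioural difference worth keeping in mind: @define produces slotted classes by default, while the legacy @attrs does not.
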
-@attrs +@define class OnlyOneLabel(DatasetValidationError): - label_name = attrib() + label_name = field() def __str__(self): return f"The dataset has only one label '{self.label_name}'." -@attrs +@define class OnlyOneAttributeValue(DatasetValidationError): - label_name = attrib() - attr_name = attrib() - value = attrib() + label_name = field() + attr_name = field() + value = field() def __str__(self): return "The dataset has the only attribute value " \ f"'{self.value}' for the attribute '{self.attr_name}' for the " \ f"label '{self.label_name}'." -@attrs +@define class FewSamplesInLabel(DatasetValidationError): - label_name = attrib() - count = attrib() + label_name = field() + count = field() def __str__(self): return f"The number of samples in the label '{self.label_name}'" \ f" might be too low. Found '{self.count}' samples." -@attrs +@define class FewSamplesInAttribute(DatasetValidationError): - label_name = attrib() - attr_name = attrib() - attr_value = attrib() - count = attrib() + label_name = field() + attr_name = field() + attr_value = field() + count = field() def __str__(self): return "The number of samples for attribute = value " \ @@ -425,69 +425,69 @@ def __str__(self): f"'{self.label_name}' might be too low. " \ f"Found '{self.count}' samples." -@attrs +@define class ImbalancedLabels(DatasetValidationError): def __str__(self): return 'There is an imbalance in the label distribution.' -@attrs +@define class ImbalancedAttribute(DatasetValidationError): - label_name = attrib() - attr_name = attrib() + label_name = field() + attr_name = field() def __str__(self): return "There is an imbalance in the distribution of attribute" \ f" '{self. attr_name}' for the label '{self.label_name}'." -@attrs +@define class ImbalancedDistInLabel(DatasetValidationError): - label_name = attrib() - prop = attrib() + label_name = field() + prop = field() def __str__(self): return f"Values of '{self.prop}' are not evenly " \ f"distributed for '{self.label_name}' label." -@attrs +@define class ImbalancedDistInAttribute(DatasetValidationError): - label_name = attrib() - attr_name = attrib() - attr_value = attrib() - prop = attrib() + label_name = field() + attr_name = field() + attr_value = field() + prop = field() def __str__(self): return f"Values of '{self.prop}' are not evenly " \ f"distributed for '{self.attr_name}' = '{self.attr_value}' for " \ f"the '{self.label_name}' label." -@attrs +@define class NegativeLength(DatasetItemValidationError): - ann_id = attrib() - prop = attrib() - val = attrib() + ann_id = field() + prop = field() + val = field() def __str__(self): return f"Annotation '{self.ann_id}' in " \ "the item should have a positive value of " \ f"'{self.prop}' but got '{self.val}'." -@attrs +@define class InvalidValue(DatasetItemValidationError): - ann_id = attrib() - prop = attrib() + ann_id = field() + prop = field() def __str__(self): return f"Annotation '{self.ann_id}' in " \ 'the item has an inf or a NaN value of ' \ f"'{self.prop}'." -@attrs +@define class FarFromLabelMean(DatasetItemValidationError): - label_name = attrib() - ann_id = attrib() - prop = attrib() - mean = attrib() - val = attrib() + label_name = field() + ann_id = field() + prop = field() + mean = field() + val = field() def __str__(self): return f"Annotation '{self.ann_id}' in " \ @@ -495,15 +495,15 @@ def __str__(self): "is too far from the label average. (mean of " \ f"'{self.label_name}' label: {self.mean}, got '{self.val}')." 
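
The merge-related errors earlier in this file's diff (MismatchingImageInfoError and friends) go one step further and drop field() entirely: under @define, bare type annotations are collected as fields in declaration order. A small stand-in example (the class below is hypothetical, not part of the patch):

    from typing import Tuple

    from attrs import define

    @define
    class SizeMismatchDemo:
        item_id: Tuple[str, str]
        a: Tuple[int, int]
        b: Tuple[int, int]

    err = SizeMismatchDemo(('0001', 'train'), (2, 4), (4, 2))
    print(err.item_id, err.a, err.b)    # ('0001', 'train') (2, 4) (4, 2)
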
-@attrs +@define class FarFromAttrMean(DatasetItemValidationError): - label_name = attrib() - ann_id = attrib() - attr_name = attrib() - attr_value = attrib() - prop = attrib() - mean = attrib() - val = attrib() + label_name = field() + ann_id = field() + attr_name = field() + attr_value = field() + prop = field() + mean = field() + val = field() def __str__(self): return f"Annotation '{self.ann_id}' in the " \ From f2bc5ab2962d3eed8c799e0e8b53d0b8a9eece96 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 1 Feb 2022 18:13:55 +0300 Subject: [PATCH 07/11] Fix image tests --- datumaro/components/media.py | 7 ------- tests/test_images.py | 9 +++------ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/datumaro/components/media.py b/datumaro/components/media.py index 31242d6a67..d024ee7d51 100644 --- a/datumaro/components/media.py +++ b/datumaro/components/media.py @@ -67,15 +67,8 @@ def __init__(self, if size is not None: assert len(size) == 2 and 0 < size[0] and 0 < size[1], size size = tuple(map(int, size)) - self._size = size # (H, W) - if isinstance(data, np.ndarray): - if not self._size: - self._size = data.shape[:2] - else: - assert self._size == data.shape[:2] - assert path is None or isinstance(path, str), path if path is None: path = '' diff --git a/tests/test_images.py b/tests/test_images.py index fc0b651c8e..4de3662ec1 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -62,14 +62,12 @@ def tearDown(self) -> None: class ImageTest(TestCase): @mark_requirement(Requirements.DATUM_GENERAL_REQ) - def test_lazy_image_shape(self): + def test_can_report_cached_size(self): data = np.ones((5, 6, 3)) - image_lazy = Image(data=lambda _: data, size=(2, 4)) - image_eager = Image(data=lambda _: data) + image = Image(data=lambda _: data, size=(2, 4)) - self.assertEqual((2, 4), image_lazy.size) - self.assertEqual((5, 6), image_eager.size) + self.assertEqual((2, 4), image.size) @mark_requirement(Requirements.DATUM_GENERAL_REQ) def test_ctors(self): @@ -139,7 +137,6 @@ def test_ctors(self): { 'data': image_bytes, 'path': path, 'size': (2, 4) }, { 'path': path }, { 'path': path, 'size': (2, 4) }, - { 'path': path, 'size': (2, 4) }, ]: with self.subTest(**args): img = ByteImage(**args) From 807d0ccd2c4a72f7c6a21947d7d7f11c9dc5bf87 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 1 Feb 2022 18:31:07 +0300 Subject: [PATCH 08/11] Fix wording in Image doc --- datumaro/components/media.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/datumaro/components/media.py b/datumaro/components/media.py index b4b450e957..587a5ba7f3 100644 --- a/datumaro/components/media.py +++ b/datumaro/components/media.py @@ -48,8 +48,8 @@ def __init__(self, """ Creates an image. - Any combinations of the `data`, `path` and `size` arguments are - possible, but at least one of them must be provided. + Any combination of the `data`, `path` and `size` is possible, + but at least one of these arguments must be provided. The `ext` parameter cannot be used as a single argument for construction. @@ -58,8 +58,9 @@ def __init__(self, image shape is (H, W [, C]). If a function is provided, it must accept image path as the first argument. path - Image path - ext - Image extension. Cannot be used together with `path`. It is - useful for saving with a custom extension. + ext - Image extension. Cannot be used together with `path`. It can + be used for saving with a custom extension - in that case, + the image need to have the `data` and `ext` fields defined. 
size - A pair (H, W), which represents image size. """ From d0e39ed7e43de51d997d9d855cca4713b2a10741 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 1 Feb 2022 18:32:49 +0300 Subject: [PATCH 09/11] Throw an error --- datumaro/components/operations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datumaro/components/operations.py b/datumaro/components/operations.py index d5b95b3ae4..1fd73e28a7 100644 --- a/datumaro/components/operations.py +++ b/datumaro/components/operations.py @@ -189,7 +189,7 @@ def _merge_images(item_a: DatasetItem, item_b: DatasetItem) -> Image: elif item_b.image.has_size: image = item_b.image else: - image = item_a.image + assert False, "Unknown image field combination" if not image.has_data or not image.has_size: if item_a.image._size: From f88fcf76b7412c2764dc155252085ef55d627ed8 Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 1 Feb 2022 19:28:32 +0300 Subject: [PATCH 10/11] Fix test error --- datumaro/components/errors.py | 86 +++++++++++++++++------------------ 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/datumaro/components/errors.py b/datumaro/components/errors.py index a6afba9471..2b555cc26a 100644 --- a/datumaro/components/errors.py +++ b/datumaro/components/errors.py @@ -26,7 +26,7 @@ class ReadonlyProjectError(VcsError): def __str__(self): return "Can't change a read-only project" -@define +@define(auto_exc=False) class UnknownRefError(VcsError): ref = field() @@ -39,7 +39,7 @@ class MissingObjectError(VcsError): class MismatchingObjectError(VcsError): pass -@define +@define(auto_exc=False) class UnsavedChangesError(VcsError): paths = field() @@ -99,14 +99,14 @@ def __str__(self): """ -@define +@define(auto_exc=False) class ProjectNotFoundError(DatumaroError): path = field() def __str__(self): return f"Can't find project at '{self.path}'" -@define +@define(auto_exc=False) class ProjectAlreadyExists(DatumaroError): path = field() @@ -114,21 +114,21 @@ def __str__(self): return f"Can't create project: a project already exists " \ f"at '{self.path}'" -@define +@define(auto_exc=False) class UnknownSourceError(DatumaroError): name = field() def __str__(self): return f"Unknown source '{self.name}'" -@define +@define(auto_exc=False) class UnknownTargetError(DatumaroError): name = field() def __str__(self): return f"Unknown target '{self.name}'" -@define +@define(auto_exc=False) class UnknownFormatError(DatumaroError): format = field() @@ -137,7 +137,7 @@ def __str__(self): "available, add the corresponding Extractor implementation " \ "to the environment" -@define +@define(auto_exc=False) class SourceExistsError(DatumaroError): name = field() @@ -148,14 +148,14 @@ def __str__(self): class DatasetImportError(DatumaroError): pass -@define +@define(auto_exc=False) class DatasetNotFoundError(DatasetImportError): path = field() def __str__(self): return f"Failed to find dataset at '{self.path}'" -@define +@define(auto_exc=False) class MultipleFormatsMatchError(DatasetImportError): formats = field() @@ -176,7 +176,7 @@ class CategoriesRedefinedError(DatasetError): def __str__(self): return "Categories can only be set once for a dataset" -@define +@define(auto_exc=False) class RepeatedItemError(DatasetError): item_id = field() @@ -187,7 +187,7 @@ def __str__(self): class DatasetQualityError(DatasetError): pass -@define +@define(auto_exc=False) class AnnotationsTooCloseError(DatasetQualityError): item_id = field() a = field() @@ -198,7 +198,7 @@ def __str__(self): return "Item %s: annotations are too close: %s, %s, 
distance = %s" % \ (self.item_id, self.a, self.b, self.distance) -@define +@define(auto_exc=False) class WrongGroupError(DatasetQualityError): item_id = field() found = field(converter=set) @@ -211,7 +211,7 @@ def __str__(self): (self.item_id, self.found, self.expected, self.group) -@define(init=False) +@define(auto_exc=False, init=False) class DatasetMergeError(DatasetError): sources = field(converter=set, factory=set, kw_only=True) @@ -223,7 +223,7 @@ def _my__init__(self, msg=None, *, sources=None): # when __init__ is defined directly setattr(DatasetMergeError, '__init__', DatasetMergeError._my__init__) -@define +@define(auto_exc=False) class MismatchingImageInfoError(DatasetMergeError): item_id: Tuple[str, str] a: Tuple[int, int] @@ -233,7 +233,7 @@ def __str__(self): return "Item %s: mismatching image size info: %s vs %s" % \ (self.item_id, self.a, self.b) -@define +@define(auto_exc=False) class MismatchingImagePathError(DatasetMergeError): item_id: Tuple[str, str] a: str @@ -243,7 +243,7 @@ def __str__(self): return "Item %s: mismatching image path info: %s vs %s" % \ (self.item_id, self.a, self.b) -@define +@define(auto_exc=False) class MismatchingAttributesError(DatasetMergeError): item_id: Tuple[str, str] key: str @@ -257,7 +257,7 @@ def __str__(self): class ConflictingCategoriesError(DatasetMergeError): pass -@define +@define(auto_exc=False) class NoMatchingAnnError(DatasetMergeError): item_id = field() ann = field() @@ -267,7 +267,7 @@ def __str__(self): "in sources %s, annotation is %s" % \ (self.item_id, self.sources, self.ann) -@define +@define(auto_exc=False) class NoMatchingItemError(DatasetMergeError): item_id = field() @@ -275,7 +275,7 @@ def __str__(self): return "Item %s: can't find matching item in sources %s" % \ (self.item_id, self.sources) -@define +@define(auto_exc=False) class FailedLabelVotingError(DatasetMergeError): item_id = field() votes = field() @@ -286,7 +286,7 @@ def __str__(self): (self.item_id, 'for ann %s' % self.ann if self.ann else '', self.votes, self.sources) -@define +@define(auto_exc=False) class FailedAttrVotingError(DatasetMergeError): item_id = field() attr = field() @@ -298,7 +298,7 @@ def __str__(self): "for ann %s, votes %s, sources %s" % \ (self.item_id, self.ann, self.votes, self.sources) -@define +@define(auto_exc=False) class DatasetValidationError(DatumaroError): severity = field() @@ -310,7 +310,7 @@ def to_dict(self): } -@define +@define(auto_exc=False) class DatasetItemValidationError(DatasetValidationError): item_id = field() subset = field() @@ -321,14 +321,14 @@ def to_dict(self): dict_repr['subset'] = self.subset return dict_repr -@define +@define(auto_exc=False) class MissingLabelCategories(DatasetValidationError): def __str__(self): return "Metadata (ex. LabelCategories) should be defined" \ " to validate a dataset." -@define +@define(auto_exc=False) class MissingAnnotation(DatasetItemValidationError): ann_type = field() @@ -336,12 +336,12 @@ def __str__(self): return f"Item needs '{self.ann_type}' annotation(s), " \ "but not found." -@define +@define(auto_exc=False) class MultiLabelAnnotations(DatasetItemValidationError): def __str__(self): return 'Item needs a single label but multiple labels are found.' -@define +@define(auto_exc=False) class MissingAttribute(DatasetItemValidationError): label_name = field() attr_name = field() @@ -350,7 +350,7 @@ def __str__(self): return f"Item needs the attribute '{self.attr_name}' " \ f"for the label '{self.label_name}'." 
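
The auto_exc=False flag added throughout this commit is what actually fixes the failing test: with the default auto_exc=True, attrs treats Exception subclasses like ordinary exceptions and does not generate __eq__/__hash__ for them, so two error objects with identical fields compare unequal. Presumably the tests compare error instances by value, which only works with the field-based equality restored here. A minimal sketch of the difference (the demo classes are invented, not taken from the patch):

    from attrs import define

    @define
    class DemoError(Exception):
        code: int

    @define(auto_exc=False)
    class ComparableDemoError(Exception):
        code: int

    # Default behaviour for exception subclasses: identity comparison only
    assert DemoError(1) != DemoError(1)

    # auto_exc=False keeps the generated, field-based __eq__
    assert ComparableDemoError(1) == ComparableDemoError(1)
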
-@define +@define(auto_exc=False) class UndefinedLabel(DatasetItemValidationError): label_name = field() @@ -358,7 +358,7 @@ def __str__(self): return f"Item has the label '{self.label_name}' which " \ "is not defined in metadata." -@define +@define(auto_exc=False) class UndefinedAttribute(DatasetItemValidationError): label_name = field() attr_name = field() @@ -367,7 +367,7 @@ def __str__(self): return f"Item has the attribute '{self.attr_name}' for the " \ f"label '{self.label_name}' which is not defined in metadata." -@define +@define(auto_exc=False) class LabelDefinedButNotFound(DatasetValidationError): label_name = field() @@ -375,7 +375,7 @@ def __str__(self): return f"The label '{self.label_name}' is defined in " \ "metadata, but not found in the dataset." -@define +@define(auto_exc=False) class AttributeDefinedButNotFound(DatasetValidationError): label_name = field() attr_name = field() @@ -385,14 +385,14 @@ def __str__(self): f"'{self.label_name}' is defined in metadata, but not " \ "found in the dataset." -@define +@define(auto_exc=False) class OnlyOneLabel(DatasetValidationError): label_name = field() def __str__(self): return f"The dataset has only one label '{self.label_name}'." -@define +@define(auto_exc=False) class OnlyOneAttributeValue(DatasetValidationError): label_name = field() attr_name = field() @@ -403,7 +403,7 @@ def __str__(self): f"'{self.value}' for the attribute '{self.attr_name}' for the " \ f"label '{self.label_name}'." -@define +@define(auto_exc=False) class FewSamplesInLabel(DatasetValidationError): label_name = field() count = field() @@ -412,7 +412,7 @@ def __str__(self): return f"The number of samples in the label '{self.label_name}'" \ f" might be too low. Found '{self.count}' samples." -@define +@define(auto_exc=False) class FewSamplesInAttribute(DatasetValidationError): label_name = field() attr_name = field() @@ -425,12 +425,12 @@ def __str__(self): f"'{self.label_name}' might be too low. " \ f"Found '{self.count}' samples." -@define +@define(auto_exc=False) class ImbalancedLabels(DatasetValidationError): def __str__(self): return 'There is an imbalance in the label distribution.' -@define +@define(auto_exc=False) class ImbalancedAttribute(DatasetValidationError): label_name = field() attr_name = field() @@ -439,7 +439,7 @@ def __str__(self): return "There is an imbalance in the distribution of attribute" \ f" '{self. attr_name}' for the label '{self.label_name}'." -@define +@define(auto_exc=False) class ImbalancedDistInLabel(DatasetValidationError): label_name = field() prop = field() @@ -448,7 +448,7 @@ def __str__(self): return f"Values of '{self.prop}' are not evenly " \ f"distributed for '{self.label_name}' label." -@define +@define(auto_exc=False) class ImbalancedDistInAttribute(DatasetValidationError): label_name = field() attr_name = field() @@ -460,7 +460,7 @@ def __str__(self): f"distributed for '{self.attr_name}' = '{self.attr_value}' for " \ f"the '{self.label_name}' label." -@define +@define(auto_exc=False) class NegativeLength(DatasetItemValidationError): ann_id = field() prop = field() @@ -471,7 +471,7 @@ def __str__(self): "the item should have a positive value of " \ f"'{self.prop}' but got '{self.val}'." -@define +@define(auto_exc=False) class InvalidValue(DatasetItemValidationError): ann_id = field() prop = field() @@ -481,7 +481,7 @@ def __str__(self): 'the item has an inf or a NaN value of ' \ f"'{self.prop}'." 
-@define +@define(auto_exc=False) class FarFromLabelMean(DatasetItemValidationError): label_name = field() ann_id = field() @@ -495,7 +495,7 @@ def __str__(self): "is too far from the label average. (mean of " \ f"'{self.label_name}' label: {self.mean}, got '{self.val}')." -@define +@define(auto_exc=False) class FarFromAttrMean(DatasetItemValidationError): label_name = field() ann_id = field() From 88ed8497b0bb8df0642474bcb801faca22b4438a Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Fri, 4 Feb 2022 15:34:35 +0300 Subject: [PATCH 11/11] Remove extra casting --- datumaro/components/operations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datumaro/components/operations.py b/datumaro/components/operations.py index 1fd73e28a7..2a263cdc28 100644 --- a/datumaro/components/operations.py +++ b/datumaro/components/operations.py @@ -128,7 +128,7 @@ def _merge_attrs(a: Dict[str, Any], b: Dict[str, Any], item_id: Tuple[str, str]) -> Dict: merged = {} - for name in set(a) | set(b): + for name in a.keys() | b.keys(): a_val = a.get(name, None) b_val = b.get(name, None)
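
Taken together, the series allows an Image to be built from size information alone, merges partial image info (path, size, pixel data) more carefully during simple merge, and turns conflicting paths, sizes and item attributes into explicit errors. A usage sketch, assuming a Datumaro checkout with this whole series applied (item ids, file names and sizes below are invented):

    from datumaro.components.dataset import Dataset
    from datumaro.components.errors import MismatchingImageInfoError
    from datumaro.components.extractor import DatasetItem
    from datumaro.components.media import Image

    # One source only knows the image path, the other only its size;
    # the merged item ends up carrying both pieces of information.
    a = Dataset.from_iterable([DatasetItem(1, image=Image(path='frame1.png'))])
    b = Dataset.from_iterable([DatasetItem(1, image=Image(size=(480, 640)))])
    merged = Dataset.from_extractors(a, b)
    item = next(iter(merged))
    print(item.image.path, item.image.size)

    # Genuinely conflicting size info is now reported instead of being ignored.
    c = Dataset.from_iterable([
        DatasetItem(1, image=Image(path='frame1.png', size=(640, 480)))])
    d = Dataset.from_iterable([
        DatasetItem(1, image=Image(path='frame1.png', size=(480, 640)))])
    try:
        Dataset.from_extractors(c, d)
    except MismatchingImageInfoError as e:
        print(e)

(The final commit is a small cleanup on top: dict key views already support set union, so a.keys() | b.keys() avoids building two temporary sets.)
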