Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve simple merge #634

Merged
merged 14 commits into from
Feb 4, 2022
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/621>)
- OpenVINO telemetry library 2022.1.0 from PyPI.
(<https://github.com/openvinotoolkit/datumaro/pull/625>)
- Allowed `Image` creation from just `size` info
(<https://github.com/openvinotoolkit/datumaro/pull/634>)
- Added image search in VOC XML-based subformats
(<https://github.com/openvinotoolkit/datumaro/pull/634>)
- Added image path equality checks in simple merge, when applicable
(<https://github.com/openvinotoolkit/datumaro/pull/634>)

### Deprecated
- TBD
Expand Down Expand Up @@ -70,6 +76,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Numeric warnings that sometimes occurred in `stats` command
(e.g. <https://github.com/openvinotoolkit/datumaro/issues/607>)
(<https://github.com/openvinotoolkit/datumaro/pull/621>)
- Added missing item attribute merging in simple merge
(<https://github.com/openvinotoolkit/datumaro/pull/634>)

### Security
- TBD
Expand Down
29 changes: 26 additions & 3 deletions datumaro/components/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: MIT

from typing import Any, Optional, Tuple

from attr import attrib, attrs


Expand Down Expand Up @@ -223,14 +225,35 @@ def _my__init__(self, msg=None, *, sources=None):

@attrs
class MismatchingImageInfoError(DatasetMergeError):
item_id = attrib()
a = attrib()
b = attrib()
item_id: Optional[Tuple[str, str]] = attrib()
a: int = attrib()
IRDonch marked this conversation as resolved.
Show resolved Hide resolved
b: int = attrib()

def __str__(self):
return "Item %s: mismatching image size info: %s vs %s" % \
(self.item_id, self.a, self.b)

@attrs
IRDonch marked this conversation as resolved.
Show resolved Hide resolved
class MismatchingImagePathError(DatasetMergeError):
item_id: Optional[Tuple[str, str]] = attrib()
a: str = attrib()
b: str = attrib()

def __str__(self):
return "Item %s: mismatching image path info: %s vs %s" % \
(self.item_id, self.a, self.b)

@attrs
class MismatchingAttributesError(DatasetMergeError):
item_id: Optional[Tuple[str, str]] = attrib()
key: str = attrib()
a: Any = attrib()
b: Any = attrib()

def __str__(self):
return "Item %s: mismatching image attribute %s: %s vs %s" % \
(self.item_id or '', self.key, self.a, self.b)
IRDonch marked this conversation as resolved.
Show resolved Hide resolved

class ConflictingCategoriesError(DatasetMergeError):
pass

Expand Down
34 changes: 29 additions & 5 deletions datumaro/components/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,37 @@ def __init__(self,
*,
path: Optional[str] = None,
ext: Optional[str] = None,
size: Optional[Tuple[int, int]] = None):
size: Optional[Tuple[int, int]] = None) -> None:
"""
Creates an image.

Any combination of the `data`, `path` and `size` arguments is
possible, but at least one of them must be provided.
The `ext` parameter cannot be used as a single argument for
construction.

Args:
data - Image pixels or a function to retrieve them. The expected
image shape is (H, W [, C]). If a function is provided,
it must accept image path as the first argument.
path - Image path
ext - Image extension. Cannot be used together with `path`. It is
useful for saving with a custom extension.
IRDonch marked this conversation as resolved.
Show resolved Hide resolved
size - A pair (H, W), which represents image size.
"""

assert size is None or len(size) == 2, size
if size is not None:
assert len(size) == 2 and 0 < size[0] and 0 < size[1], size
size = tuple(map(int, size))

self._size = size # (H, W)
if not self._size and isinstance(data, np.ndarray):
self._size = data.shape[:2]

if isinstance(data, np.ndarray):
if not self._size:
self._size = data.shape[:2]
else:
assert self._size == data.shape[:2]

assert path is None or isinstance(path, str), path
if path is None:
Expand All @@ -72,8 +95,8 @@ def __init__(self,
self._ext = ext

if not isinstance(data, np.ndarray):
assert path or callable(data), "Image can not be empty"
assert data is None or callable(data)
assert path or callable(data) or size, "Image can not be empty"
assert data is None or callable(data) or size
IRDonch marked this conversation as resolved.
Show resolved Hide resolved
if path and osp.isfile(path) or data:
data = lazy_image(path, loader=data)
self._data = data
Expand All @@ -97,6 +120,7 @@ def has_data(self) -> bool:

@property
def has_size(self) -> bool:
"""Indicates that size info is cached and won't require image loading"""
return self._size is not None or isinstance(self._data, np.ndarray)

@property
Expand Down
111 changes: 84 additions & 27 deletions datumaro/components/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from collections import OrderedDict
from copy import deepcopy
from typing import Callable, Dict, Optional, Set, Tuple
from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple
from unittest import TestCase
import hashlib
import logging as log
Expand All @@ -15,17 +15,19 @@
import numpy as np

from datumaro.components.annotation import (
AnnotationType, Bbox, Label, LabelCategories, MaskCategories,
Annotation, AnnotationType, Bbox, Label, LabelCategories, MaskCategories,
PointsCategories,
)
from datumaro.components.cli_plugin import CliPlugin
from datumaro.components.dataset import Dataset, DatasetItemStorage, IDataset
from datumaro.components.errors import (
AnnotationsTooCloseError, ConflictingCategoriesError, DatasetMergeError,
FailedAttrVotingError, FailedLabelVotingError, MismatchingImageInfoError,
NoMatchingAnnError, NoMatchingItemError, WrongGroupError,
FailedAttrVotingError, FailedLabelVotingError, MismatchingAttributesError,
MismatchingImageInfoError, MismatchingImagePathError, NoMatchingAnnError,
NoMatchingItemError, WrongGroupError,
)
from datumaro.components.extractor import CategoriesInfo, DatasetItem
from datumaro.components.media import Image
from datumaro.util import filter_dict, find
from datumaro.util.annotation_util import (
OKS, approximate_line, bbox_iou, find_instances, max_bbox, mean_bbox,
Expand Down Expand Up @@ -106,40 +108,95 @@ def merge(cls, *sources):
def merge_items(cls, existing_item, current_item):
return existing_item.wrap(
image=cls.merge_images(existing_item, current_item),
attributes=cls.merge_attrs(
existing_item.attributes, current_item.attributes,
item_id=(existing_item.id, existing_item.subset)),
annotations=cls.merge_anno(
existing_item.annotations, current_item.annotations))

@staticmethod
def merge_images(existing_item, current_item):
image = None
if existing_item.has_image and current_item.has_image:
if existing_item.image.has_data:
image = existing_item.image
def merge_attrs(a: Dict, b: Dict,
IRDonch marked this conversation as resolved.
Show resolved Hide resolved
item_id: Optional[Tuple[str, str]] = None) -> Dict:
merged = {}

for name in set(a) | set(b):
IRDonch marked this conversation as resolved.
Show resolved Hide resolved
a_val = a.get(name, None)
b_val = b.get(name, None)

if name not in a:
m_val = b_val
elif name not in b:
m_val = a_val
elif a_val != b_val:
raise MismatchingAttributesError(item_id, name, a_val, b_val)
else:
image = current_item.image

if existing_item.image.path != current_item.image.path:
if not existing_item.image.path:
image._path = current_item.image.path

if all([existing_item.image._size, current_item.image._size]):
if existing_item.image._size != current_item.image._size:
raise MismatchingImageInfoError(
(existing_item.id, existing_item.subset),
existing_item.image._size, current_item.image._size)
elif existing_item.image._size:
image._size = existing_item.image._size
m_val = a_val

merged[name] = m_val

return merged

@staticmethod
def merge_images(item_a: DatasetItem, item_b: DatasetItem) -> Image:
image = None

if item_a.has_image and item_b.has_image:
if item_a.image.path and item_b.image.path and \
item_a.image.path != item_b.image.path and \
item_a.image.has_data is item_b.image.has_data:
# We use has_data as a replacement for path existence check
# - If only one image has data, we'll use it. The other
# one is just a path metainfo, which is not significant
# in this case.
# - If both images have data or both don't, we need
# to compare paths.
#
# Different paths can actually point to the same file,
# but we don't allow that case here, so that this stays
# a "simple" merging strategy used for extractor joining
raise MismatchingImagePathError(
(item_a.id, item_a.subset),
item_a.image.path, item_b.image.path)

if item_a.image.has_size and item_b.image.has_size and \
item_a.image.size != item_b.image.size:
raise MismatchingImageInfoError(
(item_a.id, item_a.subset),
item_a.image.size, item_b.image.size)

# Avoid direct comparison here for better performance
# If there are 2 "data-only" images, they won't be compared and
# we just use the first one
if item_a.image.has_data:
image = item_a.image
elif item_b.image.has_data:
image = item_b.image
elif item_a.image.path:
image = item_a.image
elif item_b.image.path:
image = item_b.image
elif item_a.image.has_size:
image = item_a.image
elif item_b.image.has_size:
image = item_b.image
else:
image._size = current_item.image._size
elif existing_item.has_image:
image = existing_item.image
image = item_a.image
IRDonch marked this conversation as resolved.
Show resolved Hide resolved

if not image.has_data or not image.has_size:
if item_a.image._size:
image._size = item_a.image._size
elif item_b.image._size:
image._size = item_b.image._size
elif item_a.has_image:
image = item_a.image
else:
image = current_item.image
image = item_b.image

return image

@staticmethod
def merge_anno(a, b):
def merge_anno(a: Iterable[Annotation],
b: Iterable[Annotation]) -> List[Annotation]:
return merge_annotations_equal(a, b)

@staticmethod
Expand Down
23 changes: 17 additions & 6 deletions datumaro/plugins/voc_format/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,20 @@ def __init__(self, path, task):
super().__init__(path, task)

def __iter__(self):
image_dir = osp.join(self._dataset_dir, VocPath.IMAGES_DIR)
if osp.isdir(image_dir):
images = {
osp.splitext(osp.relpath(p, image_dir))[0].replace('\\', '/'): p
for p in find_images(image_dir, recursive=True)
}
else:
images = {}

anno_dir = osp.join(self._dataset_dir, VocPath.ANNOTATIONS_DIR)

for item_id in self._items:
log.debug("Reading item '%s'" % item_id)
image = item_id + VocPath.IMAGE_EXT
height, width = 0, 0
size = None

anns = []
ann_file = osp.join(anno_dir, item_id + '.xml')
Expand All @@ -147,14 +155,17 @@ def __iter__(self):
width = root_elem.find('size/width')
if width is not None:
width = int(width.text)
if height and width:
size = (height, width)
filename_elem = root_elem.find('filename')
if filename_elem is not None:
image = filename_elem.text
image = osp.join(image_dir, filename_elem.text)
anns = self._parse_annotations(root_elem)
else:
image = images.pop(item_id, None)

image = osp.join(self._dataset_dir, VocPath.IMAGES_DIR, image)
if height and width:
image = Image(path=image, size=(height, width))
if image or size:
image = Image(path=image, size=size)

yield DatasetItem(id=item_id, subset=self._subset,
image=image, annotations=anns)
Expand Down
2 changes: 1 addition & 1 deletion tests/cli/test_image_zip_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_can_save_and_load(self):
def test_can_export_zip_images_from_coco_dataset(self):
with TestDir() as test_dir:
coco_dir = osp.join(__file__[:__file__.rfind(osp.join('tests', ''))],
'tests', 'assets', 'coco_dataset')
'tests', 'assets', 'coco_dataset', 'coco')

run(self, 'create', '-o', test_dir)
run(self, 'import', '-p', test_dir, '-f', 'coco', coco_dir)
Expand Down
Loading