Add initial support for the Open Images dataset (#291)
* Support reading of labels in Open Images (v4, v5, v6)

* Add tests for the Open Images extractor/importer

* Add Open Images documentation

* Update changelog
Roman Donchenko authored Jun 25, 2021
1 parent d2073b8 commit 5209d42
Showing 18 changed files with 468 additions and 0 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -15,6 +15,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `ItemTransform` class, which describes item-wise dataset `Transform`s (<https://github.com/openvinotoolkit/datumaro/pull/297>)
- `keep-empty` export parameter in VOC format (<https://github.com/openvinotoolkit/datumaro/pull/297>)
- A base class for dataset validation plugins (<https://github.com/openvinotoolkit/datumaro/pull/299>)
- Partial support for the Open Images format;
only reading is supported, and only images and image-level labels can be read
(<https://github.com/openvinotoolkit/datumaro/pull/291>).

### Changed
- Tensorflow AVX check is made optional in API and is disabled by default (<https://github.com/openvinotoolkit/datumaro/pull/305>)
216 changes: 216 additions & 0 deletions datumaro/plugins/open_images_format.py
@@ -0,0 +1,216 @@
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

import contextlib
import csv
import fnmatch
import glob
import json
import os
import os.path as osp
import re

from attr import attrs

from datumaro.components.errors import DatasetError, RepeatedItemError, UndefinedLabel
from datumaro.components.extractor import (
    AnnotationType, DatasetItem, Importer, Label, LabelCategories, Extractor,
)
from datumaro.components.validator import Severity
from datumaro.util.image import find_images

# A regex to check whether a subset name can be used as a "normal" path
# component.
# Accepting a subset name that doesn't match this regex could lead
# to accessing data outside of the expected directory, so it's best
# to reject them.
_RE_INVALID_SUBSET = re.compile(r'''
    # empty
    | \.\.? # special path component
    | .*[/\\\0].* # contains special characters
''', re.VERBOSE)
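# For example, 'train' or 'validation' pass this check, while '', '.', '..',
# and any name containing '/', '\' or a NUL byte are rejected.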

@attrs(auto_attribs=True)
class UnsupportedSubsetNameError(DatasetError):
    subset: str

    def __str__(self):
        return "Item %s has an unsupported subset name %r." % (self.item_id, self.subset)

class OpenImagesPath:
    ANNOTATIONS_DIR = 'annotations'
    FULL_IMAGE_DESCRIPTION_NAME = 'image_ids_and_rotation.csv'
    SUBSET_IMAGE_DESCRIPTION_PATTERNS = (
        '*-images-with-rotation.csv',
        '*-images-with-labels-with-rotation.csv',
    )

class OpenImagesExtractor(Extractor):
    def __init__(self, path):
        if not osp.isdir(path):
            raise FileNotFoundError("Can't read dataset directory '%s'" % path)

        super().__init__()

        self._dataset_dir = path

        self._annotation_files = os.listdir(
            osp.join(path, OpenImagesPath.ANNOTATIONS_DIR))

        self._categories = {}
        self._items = []

        self._load_categories()
        self._load_items()

    def __iter__(self):
        return iter(self._items)

    def categories(self):
        return self._categories

    @contextlib.contextmanager
    def _open_csv_annotation(self, file_name):
        absolute_path = osp.join(self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, file_name)

        with open(absolute_path, 'r', encoding='utf-8', newline='') as f:
            yield csv.DictReader(f)

    def _glob_annotations(self, pattern):
        for annotation_file in self._annotation_files:
            if fnmatch.fnmatch(annotation_file, pattern):
                yield annotation_file

    def _load_categories(self):
        label_categories = LabelCategories()

        # In OID v6, the class description file is prefixed with `oidv6-`, whereas
        # in the previous versions, it isn't. We try to find it regardless.
        # We use a wildcard so that if, say, OID v7 is released in the future with
        # a similar layout as v6, it's automatically supported.
        # If the file doesn't exist with either name, we'll fail trying to open
        # `class-descriptions.csv`.

        V5_CLASS_DESCRIPTIONS = 'class-descriptions.csv'

        annotation_name = [
            *self._glob_annotations('oidv*-class-descriptions.csv'),
            V5_CLASS_DESCRIPTIONS,
        ][0]

        with self._open_csv_annotation(annotation_name) as class_description_reader:
            # Prior to OID v6, this file didn't contain a header row.
            if annotation_name == V5_CLASS_DESCRIPTIONS:
                class_description_reader.fieldnames = ('LabelName', 'DisplayName')

            for class_description in class_description_reader:
                label_name = class_description['LabelName']
                label_categories.add(label_name)

        self._categories[AnnotationType.label] = label_categories

        self._load_label_category_parents()

    def _load_label_category_parents(self):
        label_categories = self._categories[AnnotationType.label]

        hierarchy_path = osp.join(
            self._dataset_dir, OpenImagesPath.ANNOTATIONS_DIR, 'bbox_labels_600_hierarchy.json')

        try:
            with open(hierarchy_path, 'rb') as hierarchy_file:
                root_node = json.load(hierarchy_file)
        except FileNotFoundError:
            return

        def set_parents_from_node(node, category):
            for child_node in node.get('Subcategory', []):
                _, child_category = label_categories.find(child_node['LabelName'])

                if category is not None and child_category is not None:
                    child_category.parent = category.name

                set_parents_from_node(child_node, child_category)

        _, root_category = label_categories.find(root_node['LabelName'])
        set_parents_from_node(root_node, root_category)

    def _load_items(self):
        image_paths_by_id = {
            osp.splitext(osp.basename(path))[0]: path
            for path in find_images(
                osp.join(self._dataset_dir, 'images'),
                recursive=True, max_depth=1)
        }

        items_by_id = {}

        def load_from(annotation_name):
            with self._open_csv_annotation(annotation_name) as image_reader:
                for image_description in image_reader:
                    image_id = image_description['ImageID']
                    if image_id in items_by_id:
                        raise RepeatedItemError(item_id=image_id)

                    subset = image_description['Subset']

                    if _RE_INVALID_SUBSET.fullmatch(subset):
                        raise UnsupportedSubsetNameError(item_id=image_id, subset=subset)

                    items_by_id[image_id] = DatasetItem(
                        id=image_id,
                        image=image_paths_by_id.get(image_id),
                        subset=subset,
                    )

        # It's preferable to load the combined image description file,
        # because it contains descriptions for training images without human-annotated labels
        # (the file specific to the training set doesn't).
        # However, if it's missing, we'll try loading subset-specific files instead, so that
        # this extractor can be used on individual subsets of the dataset.
        try:
            load_from(OpenImagesPath.FULL_IMAGE_DESCRIPTION_NAME)
        except FileNotFoundError:
            for pattern in OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_PATTERNS:
                for path in self._glob_annotations(pattern):
                    load_from(path)

        self._items.extend(items_by_id.values())

        self._load_labels(items_by_id)

    def _load_labels(self, items_by_id):
        label_categories = self._categories[AnnotationType.label]

        # TODO: implement reading of machine-annotated labels

        for label_path in self._glob_annotations('*-human-imagelabels.csv'):
            with self._open_csv_annotation(label_path) as label_reader:
                for label_description in label_reader:
                    image_id = label_description['ImageID']
                    item = items_by_id[image_id]

                    confidence = float(label_description['Confidence'])

                    label_name = label_description['LabelName']
                    label_index, _ = label_categories.find(label_name)
                    if label_index is None:
                        raise UndefinedLabel(
                            item_id=item.id, subset=item.subset,
                            label_name=label_name, severity=Severity.error)
                    item.annotations.append(Label(
                        label=label_index, attributes={'score': confidence}))


class OpenImagesImporter(Importer):
    @classmethod
    def find_sources(cls, path):
        for pattern in [
            OpenImagesPath.FULL_IMAGE_DESCRIPTION_NAME,
            *OpenImagesPath.SUBSET_IMAGE_DESCRIPTION_PATTERNS,
        ]:
            if glob.glob(osp.join(glob.escape(path), OpenImagesPath.ANNOTATIONS_DIR, pattern)):
                return [{'url': path, 'format': 'open_images'}]

        return []
135 changes: 135 additions & 0 deletions docs/formats/open_images_user_manual.md
@@ -0,0 +1,135 @@
# Open Images user manual

## Contents

- [Format specification](#format-specification)
- [Load Open Images dataset](#load-open-images-dataset)
- [Export to other formats](#export-to-other-formats)
- [Export to Open Images](#export-to-open-images)
- [Particular use cases](#particular-use-cases)

## Format specification

A description of the Open Images Dataset (OID) format is available
on [its website](https://storage.googleapis.com/openimages/web/download.html).
Datumaro supports versions 4, 5 and 6.

Datumaro currently supports only the human-verified image-level label annotations from this dataset.
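
As a quick illustration, here is a minimal sketch (the dataset path is a placeholder)
of inspecting the loaded image-level labels from Python. Each one is read as a `Label`
annotation whose `score` attribute holds the original confidence value:

```python
from datumaro.components.dataset import Dataset
from datumaro.components.extractor import AnnotationType

# Placeholder path to an extracted Open Images layout
dataset = Dataset.import_from('./open-images-dataset', 'open_images')
labels = dataset.categories()[AnnotationType.label]

for item in dataset:
    for ann in item.annotations:
        if ann.type == AnnotationType.label:
            print(item.id, item.subset, labels.items[ann.label].name,
                ann.attributes.get('score'))
```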

## Load Open Images dataset

The Open Images dataset is available for free download.

See the [`open-images-dataset` GitHub repository](https://github.com/cvdfoundation/open-images-dataset)
for information on how to download the images.

Datumaro also requires the image description files,
which can be downloaded from the following URLs:

- [complete set](https://storage.googleapis.com/openimages/2018_04/image_ids_and_rotation.csv)
- [train set](https://storage.googleapis.com/openimages/v6/oidv6-train-images-with-labels-with-rotation.csv)
- [validation set](https://storage.googleapis.com/openimages/2018_04/validation/validation-images-with-rotation.csv)
- [test set](https://storage.googleapis.com/openimages/2018_04/test/test-images-with-rotation.csv)

Datumaro expects at least one of the files above to be present.

In addition, the following metadata file must be present:

- [class descriptions](https://storage.googleapis.com/openimages/v6/oidv6-class-descriptions.csv)

You can optionally download the following additional metadata file:

- [class hierarchy](https://storage.googleapis.com/openimages/2018_04/bbox_labels_600_hierarchy.json)

Annotations can be downloaded from the following URLs:

- [train image labels](https://storage.googleapis.com/openimages/v6/oidv6-train-annotations-human-imagelabels.csv)
- [validation image labels](https://storage.googleapis.com/openimages/v5/validation-annotations-human-imagelabels.csv)
- [test image labels](https://storage.googleapis.com/openimages/v5/test-annotations-human-imagelabels.csv)

The annotations are optional.

There are two ways to create a Datumaro project and add the OID to it:

``` bash
datum import --format open_images --input-path <path/to/dataset>
# or
datum create
datum add path -f open_images <path/to/dataset>
```

It is possible to specify the project name and project directory; run
`datum create --help` for more information.

The Open Images dataset directory should have the following structure:

```
└─ Dataset/
    ├── annotations/
    │   ├── bbox_labels_600_hierarchy.json
    │   ├── image_ids_and_rotation.csv
    │   ├── oidv6-class-descriptions.csv
    │   └── *-human-imagelabels.csv
    └── images/
        ├── test/
        │   ├── <image_name1.jpg>
        │   ├── <image_name2.jpg>
        │   └── ...
        ├── train/
        │   ├── <image_name1.jpg>
        │   ├── <image_name2.jpg>
        │   └── ...
        └── validation/
            ├── <image_name1.jpg>
            ├── <image_name2.jpg>
            └── ...
```

To use per-subset image description files instead of `image_ids_and_rotation.csv`,
place them in the `annotations` subdirectory.

## Export to other formats

Datumaro can convert an OID dataset into any other format [Datumaro supports](../user_manual.md#supported-formats).
To get the expected result, convert the dataset to a format
that supports image-level labels.
There are a few ways to convert OID to other dataset formats:

``` bash
datum project import -f open_images -i <path/to/open_images>
datum export -f cvat -o <path/to/output/dir>
# or
datum convert -if open_images -i <path/to/open_images> -f cvat -o <path/to/output/dir>
```

Some formats provide extra options for conversion.
These options are passed after the double dash (`--`) on the command line.
To get information about them, run

`datum export -f <FORMAT> -- -h`

## Export to Open Images

Converting datasets to the Open Images format is currently not supported.

## Particular use cases

Datumaro supports filtering, transformation, merging, etc. for all formats,
including Open Images. See the
[user manual](../user_manual.md)
for more information about these operations.

Here is an example of using Datumaro operations to solve
a particular problem with the Open Images dataset:

### Example. How to load the Open Images dataset and convert it to the format used by CVAT

```bash
datum create -o project
datum add path -p project -f open_images ./open-images-dataset/
datum stats -p project
datum export -p project -o dataset -f cvat --overwrite -- --save-images
```

More examples of working with OID from code can be found in
[tests](../../tests/test_open_images_format.py).
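
For instance, here is a minimal Python sketch of the CVAT conversion above
(the paths are placeholders, and it assumes the converter accepts the same
`save_images` option as the `--save-images` CLI flag):

```python
from datumaro.components.dataset import Dataset

# Placeholder paths; adjust them to your local copies
dataset = Dataset.import_from('./open-images-dataset', 'open_images')
dataset.export('./dataset', 'cvat', save_images=True)
```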
@@ -0,0 +1,2 @@
/m/0,Generic label #0
/m/1,Generic label #1
@@ -0,0 +1,2 @@
ImageID,Subset,OriginalURL,OriginalLandingURL,License,AuthorProfileURL,Author,Title,OriginalSize,OriginalMD5,Thumbnail300KURL,Rotation
cc,test,,,,,Intel,Test Image CC,,,,0
@@ -0,0 +1,2 @@
ImageID,Subset,OriginalURL,OriginalLandingURL,License,AuthorProfileURL,Author,Title,OriginalSize,OriginalMD5,Thumbnail300KURL,Rotation
aa,train,,,,,Intel,Test Image AA,,,,0
@@ -0,0 +1,13 @@
{
"LabelName": "/m/x",
"Subcategory": [
{
"LabelName": "/m/0",
"Subcategory": [
{
"LabelName": "/m/1"
}
]
}
]
}
@@ -0,0 +1,5 @@
ImageID,Subset,OriginalURL,OriginalLandingURL,License,AuthorProfileURL,Author,Title,OriginalSize,OriginalMD5,Thumbnail300KURL,Rotation
a,train,,,,,Intel,Test Image A,,,,0
b,train,,,,,Intel,Test Image B,,,,0
c,test,,,,,Intel,Test Image C,,,,0
d,validation,,,,,Intel,Test Image D,,,,0
@@ -0,0 +1,5 @@
LabelName,DisplayName
/m/0,Generic label #0
/m/1,Generic label #1
/m/2,Generic label #2
/m/3,Generic label #3