Skip to content

Commit

Permalink
feat: support basins with blown indices
Browse files Browse the repository at this point in the history
  • Loading branch information
paulmueller committed May 27, 2024
1 parent 42de023 commit 436e707
Show file tree
Hide file tree
Showing 5 changed files with 173 additions and 8 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
0.59.0
- feat: support basins with blown indices
- enh: increase verbosity when failing to resolve basins
- enh: implement dtype for `H5ScalarEvent`
0.58.8
- enh: existence of "index_online" section is no longer required
0.58.7
Expand Down
7 changes: 6 additions & 1 deletion dclab/rtdc_dataset/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import hashlib
import os.path
import pathlib
import traceback
from typing import Literal
import uuid
import random
Expand Down Expand Up @@ -291,9 +292,13 @@ def _get_basin_feature_data(
data = bn.get_feature_data(feat)
# The data are available, we may abort the search.
break
except BaseException:
except (KeyError, OSError, PermissionError, RecursionError):
# Basin data not available
pass
except BaseException:
warnings.warn(f"Could not access {feat} in {self}:\n"
f"{traceback.format_exc()}")
pass
return data

@staticmethod
Expand Down
130 changes: 123 additions & 7 deletions dclab/rtdc_dataset/feat_basin.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from __future__ import annotations

import abc
import numbers
import threading
from typing import Dict, List, Literal
import weakref
Expand Down Expand Up @@ -269,13 +270,10 @@ def load_dataset(self, location, **kwargs):
"""
ds = self._load_dataset(location, **kwargs)
if self.mapping != "same":
# apply filter
ds.filter.manual[:] = False
ds.filter.manual[self.basinmap] = True
ds.apply_filter()
# return hierarchy child
from .fmt_hierarchy import RTDC_Hierarchy # avoid circular imports
ds_bn = RTDC_Hierarchy(ds)
# The array `self.basinmap` may contain duplicate elements,
# which is why we cannot use hierarchy children to access the
# data (sometimes the data must be blown-up rather than gated).
ds_bn = BasinProxy(ds=ds, basinmap=self.basinmap)
else:
ds_bn = ds
return ds_bn
Expand Down Expand Up @@ -315,6 +313,124 @@ def verify_basin(self, availability=True, run_identifier=True):
return check_rid and check_avail


class BasinProxy:
    def __init__(self, ds, basinmap):
        """Proxy for accessing data in basin datasets

        The idea of a basin proxy is to give access to the data of an
        :class:`.RTDCBase` that is mapped, i.e. the indices defined for
        the basin do not coincide with the indices in the downstream
        dataset.

        This class achieves two things:

        1. Subset indexing: For every event in the downstream dataset,
           there is *only* one corresponding event in the basin dataset.
           This could also be achieved via hierarchy children
           (:class:`RTDCHierarchy`).
        2. Blown indexing: Two different events in the downstream dataset
           can refer to one event in the basin dataset. I.e. the basin
           dataset contains fewer events than the downstream dataset,
           because e.g. it is a raw image recording series that has been
           processed and multiple events were found in one frame.

        Parameters
        ----------
        ds: RTDCBase
            the basin dataset
        basinmap: np.ndarray
            1D integer indexing array that maps the events of the basin
            dataset to the downstream dataset
        """
        # underlying basin dataset
        self.ds = ds
        # downstream-index -> basin-index mapping (may contain duplicates)
        self.basinmap = basinmap
        # per-feature cache of BasinProxyFeature wrappers
        self._features = {}

    def __contains__(self, item):
        # Feature membership is decided by the basin dataset itself.
        return item in self.ds

    def __getattr__(self, item):
        # Only a fixed set of attributes is forwarded to the basin
        # dataset; everything else is an explicit error so that callers
        # do not silently rely on unproxied state.
        forwarded = {
            "basins",
            "close",
            "features",
            "features_ancillary",
            "features_basin",
            "features_innate",
            "features_loaded",
            "features_local",
            "features_scalar",
            "get_measurement_identifier",
        }
        if item not in forwarded:
            raise AttributeError(
                f"BasinProxy does not implement {item}")
        return getattr(self.ds, item)

    def __getitem__(self, feat):
        # Wrap each basin feature in a mapping proxy exactly once.
        try:
            return self._features[feat]
        except KeyError:
            feat_obj = BasinProxyFeature(feat_obj=self.ds[feat],
                                         basinmap=self.basinmap)
            self._features[feat] = feat_obj
            return feat_obj

    def __len__(self):
        # Length of the *downstream* dataset, i.e. one event per map entry.
        return len(self.basinmap)


class BasinProxyFeature:
    def __init__(self, feat_obj, basinmap):
        """Wrap around a feature object, mapping it upon data access

        Parameters
        ----------
        feat_obj:
            feature object of the basin dataset (anything array-like
            that supports ``shape``, ``dtype``, slicing and integer
            indexing, e.g. an HDF5 dataset)
        basinmap: np.ndarray
            1D integer indexing array mapping basin events to the
            downstream dataset; may contain duplicate entries
            ("blown" indices)
        """
        self.feat_obj = feat_obj
        self.basinmap = basinmap
        # Cache of the fully-mapped data; only used for scalar features
        # (caching non-scalar data such as images would be too costly).
        self._cache = None
        # Scalar features are 1D in the basin dataset.
        self.is_scalar = bool(len(self.feat_obj.shape) == 1)

    def __array__(self):
        """Return the complete, mapped feature data as a numpy array"""
        if self.is_scalar:
            # BUGFIX: previously the cache was populated on the first
            # call but ignored on subsequent calls (the old condition
            # `_cache is None and is_scalar` sent every later call into
            # the expensive per-event loop). Reuse the cache instead.
            if self._cache is None:
                self._cache = self.feat_obj[:][self.basinmap]
            return self._cache
        # Non-scalar data (image, mask, ...) are never cached.
        # This is dangerous territory in terms of memory usage.
        out_arr = np.empty((len(self.basinmap),) + self.feat_obj.shape[1:],
                           dtype=self.feat_obj.dtype)
        for ii, idx in enumerate(self.basinmap):
            out_arr[ii] = self.feat_obj[idx]
        return out_arr

    def __getattr__(self, item):
        # Forward read-only metadata of the underlying feature object.
        # NOTE(review): `shape`/`size` reflect the *basin* feature, not
        # the mapped downstream view — confirm that callers expect this.
        if item in [
            "dtype",
            "shape",
            "size",
        ]:
            return getattr(self.feat_obj, item)
        else:
            raise AttributeError(
                f"BasinProxyFeature does not implement {item}")

    def __getitem__(self, index):
        """Return mapped feature data for `index` (downstream indexing)"""
        if self._cache is None and isinstance(index, numbers.Integral):
            # single index, cheap operation
            return self.feat_obj[self.basinmap[index]]
        elif not self.is_scalar:
            # image, mask, etc.
            # BUGFIX: guard the slice comparison with `isinstance`; the
            # old bare `index == slice(None)` breaks (elementwise
            # comparison) when `index` is a numpy fancy-indexing array.
            if isinstance(index, slice) and index == slice(None):
                indices = self.basinmap
            else:
                indices = self.basinmap[index]
            out_arr = np.empty((len(indices),) + self.feat_obj.shape[1:],
                               dtype=self.feat_obj.dtype)
            for ii, idx in enumerate(indices):
                out_arr[ii] = self.feat_obj[idx]
            return out_arr
        else:
            # scalar feature, non-trivial index:
            # sets the cache if not already set
            return self.__array__()[index]

    def __len__(self):
        # NOTE(review): this is the length of the basin feature, while
        # `__array__` yields `len(self.basinmap)` events — verify that
        # no caller conflates the two.
        return len(self.feat_obj)


def basin_priority_sorted_key(bdict: Dict):
"""Yield a sorting value for a given basin that can be used with `sorted`
Expand Down
4 changes: 4 additions & 0 deletions dclab/rtdc_dataset/fmt_hdf5/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,10 @@ def mean(self, *args, **kwargs):
    def min(self, *args, **kwargs):
        """Return the minimum of the feature data.

        Delegates to ``self._fetch_ufunc_attr`` with ``np.nanmin`` as
        the computation function; presumably a stored "min" attribute
        is used when available — confirm in ``_fetch_ufunc_attr``.
        Positional and keyword arguments are accepted for numpy API
        compatibility but ignored.
        """
        return self._fetch_ufunc_attr("min", np.nanmin)

    @property
    def dtype(self):
        """numpy dtype of the underlying HDF5 dataset"""
        return self.h5ds.dtype

    @property
    def shape(self):
        """Shape of the underlying HDF5 dataset"""
        return self.h5ds.shape
Expand Down
36 changes: 36 additions & 0 deletions tests/test_rtdc_feat_basin_mapped.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,42 @@ def test_basin_inception():
)


@pytest.mark.parametrize("basinmap", [
    [1, 3, 4],  # very simple case
    [1, 1, 1, 2],  # not trivial, not realizable with hierarchy children
])
def test_basin_mapped(basinmap):
    """A mapped file basin must expose correctly re-indexed features"""
    path = retrieve_data("fmt-hdf5_image-mask-blood_2021.zip")
    # Remove circularity so that no ancillary feature computation is
    # triggered during this test.
    with h5py.File(path, "a") as h5:
        del h5["events"]["circ"]

    path_out = path.with_name("level1.rtdc")
    basinmap = np.array(basinmap, dtype=np.uint64)

    # Write a downstream dataset that refers to `path` via a mapped basin.
    with dclab.new_dataset(path) as ds0, dclab.RTDCWriter(path_out) as hw1:
        hw1.store_metadata(ds0.config.as_dict(pop_filtering=True))
        hw1.store_basin(basin_name="level1",
                        basin_type="file",
                        basin_format="hdf5",
                        basin_locs=[path],
                        basin_map=basinmap
                        )

    # Verify that the basin resolves and that the data are mapped.
    with dclab.new_dataset(path) as ds0, dclab.new_dataset(path_out) as ds1:
        assert len(ds1.basins) == 1
        bn = ds1.basins[0]
        assert bn.verify_basin()
        assert "mapped" in str(bn)
        assert "deform" in bn.features
        assert np.all(ds1["basinmap0"] == basinmap)
        # scalar feature
        assert np.all(ds1["deform"] == ds0["deform"][basinmap])
        # non-scalar features
        assert np.all(ds1["image"] == ds0["image"][:][basinmap])
        assert np.all(ds1["mask"] == ds0["mask"][:][basinmap])


def test_error_when_basinmap_not_given():
"""This is a test for when the basinmap feature for mapping is missing
Expand Down

0 comments on commit 436e707

Please sign in to comment.