Skip to content

Commit

Permalink
enh: change measurement identifier upon filtered export
Browse files Browse the repository at this point in the history
  • Loading branch information
paulmueller committed Nov 2, 2023
1 parent 301285a commit d6da97d
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 8 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
- enh: more caching of event size and shape for HDF5 format
- enh: faster computation of contour length for DCOR format
- enh: use dcserv version 2 in DCOR format (fast S3 access)
- enh: change measurement identifier upon filtered export
- setup: pin s3fs>=2023.10.0
- setup: pin upper bounds of dependencies
0.54.2
Expand Down
7 changes: 7 additions & 0 deletions dclab/rtdc_dataset/export.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Export RT-DC measurement data"""
import codecs
import pathlib
import uuid
import warnings

import h5py
Expand Down Expand Up @@ -240,6 +241,12 @@ def hdf5(self, path, features=None, filtered=True,
# add user-defined metadata
if "user" in self.rtdc_ds.config:
meta["user"] = self.rtdc_ds.config["user"].copy()
if filtered:
# Define a new measurement identifier, so that we are not running
# into any problems with basins being defined for filtered data.
ds_run_id = self.rtdc_ds.get_measurement_identifier()
random_ap = str(uuid.uuid4())[:4]
meta["experiment"]["run identifier"] = f"{ds_run_id}-{random_ap}"

if filtered:
filtarr = self.rtdc_ds.filter.all
Expand Down
51 changes: 43 additions & 8 deletions tests/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,49 @@ def test_hdf5_filtered_index():
assert ds2.config["experiment"]["event count"] == n - 1


@pytest.mark.filterwarnings(
    "ignore::dclab.rtdc_dataset.config.WrongConfigurationTypeWarning")
def test_hdf5_identifier_different():
    """A filtered HDF5 export must carry a new measurement identifier

    The identifier of the exported file is expected to start with the
    identifier of the input dataset, but it must not be equal to it.
    """
    path_in = retrieve_data("fmt-hdf5_image-bg_2020.zip")
    path_out = path_in.with_name("exported.rtdc")

    with new_dataset(path_in) as ds_in:
        id_in = ds_in.get_measurement_identifier()
        assert len(ds_in) == 5
        assert id_in
        # Restrict the deformation to the range [mean, max], so that
        # only a subset of the events passes the filter.
        lower_bound = np.mean(ds_in["deform"])
        upper_bound = np.max(ds_in["deform"])
        ds_in.config["filtering"]["deform min"] = lower_bound
        ds_in.config["filtering"]["deform max"] = upper_bound
        ds_in.apply_filter()
        ds_in.export.hdf5(path_out,
                          features=["area_um", "deform"],
                          filtered=True)

    with new_dataset(path_out) as ds_out:
        id_out = ds_out.get_measurement_identifier()
        # same prefix as the input identifier, but not identical
        assert id_out.startswith(id_in)
        assert id_out != id_in
        # only the events that passed the filter were exported
        assert len(ds_out) == 3


@pytest.mark.filterwarnings(
    "ignore::dclab.rtdc_dataset.config.WrongConfigurationTypeWarning")
def test_hdf5_identifier_same_unfiltered():
    """Make sure the identifier is kept upon unfiltered export

    A filter is set up and applied to the input dataset, but the data
    are exported with `filtered=False`. The exported file must then
    contain all events and the measurement identifier of the input.
    """
    pin = retrieve_data("fmt-hdf5_image-bg_2020.zip")
    pout = pin.with_name("exported.rtdc")
    with new_dataset(pin) as din:
        mid_in = din.get_measurement_identifier()
        assert len(din) == 5
        assert mid_in
        # Apply a filter on purpose: with `filtered=False` it must not
        # have any effect on the exported data or on the identifier.
        din.config["filtering"]["deform min"] = np.mean(din["deform"])
        din.config["filtering"]["deform max"] = np.max(din["deform"])
        din.apply_filter()
        din.export.hdf5(pout, ["area_um", "deform"], filtered=False)

    with new_dataset(pout) as dout:
        mid_out = dout.get_measurement_identifier()
        # identifier is unchanged for an unfiltered export
        assert mid_out == mid_in
        # all events of the input dataset are present
        assert len(dout) == 5


def test_hdf5_image_bg():
n = 65
keys = ["image", "image_bg"]
Expand Down Expand Up @@ -618,11 +661,3 @@ def test_tsv_not_filtered():
edest = tempfile.mkdtemp()
f1 = join(edest, "test.tsv")
ds.export.tsv(f1, keys, filtered=False)


if __name__ == "__main__":
    # Manual runner: call every module-level object whose name starts
    # with "test_" and that has a `__call__` attribute.
    _namespace = locals()
    # iterate over a snapshot of the names, because the loop itself
    # adds new names to the namespace
    for _name in tuple(_namespace):
        if not _name.startswith("test_"):
            continue
        _candidate = _namespace[_name]
        if hasattr(_candidate, "__call__"):
            _candidate()

0 comments on commit d6da97d

Please sign in to comment.