Skip to content

Commit

Permalink
enh: priority-based basin sorting (file over remote, http over dcor)
Browse files Browse the repository at this point in the history
  • Loading branch information
paulmueller committed Jan 1, 2024
1 parent 4fa0f4a commit 0804911
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 2 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
0.56.1
- enh: priority-based basin sorting (file over remote, http over dcor)
0.56.0
- feat: allow nested basins
- feat: implement DCOR basins
Expand Down
5 changes: 4 additions & 1 deletion dclab/rtdc_dataset/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,7 +716,10 @@ def basins_retrieve(self):
basins = []
bc = feat_basin.get_basin_classes()
muid = self.get_measurement_identifier()
for bdict in self.basins_get_dicts():
# Sort basins according to priority
bdicts_srt = sorted(self.basins_get_dicts(),
key=feat_basin.basin_priority_sorted_key)
for bdict in bdicts_srt:
if bdict["format"] not in bc:
warnings.warn(f"Encountered unsupported basin "
f"format '{bdict['format']}'!")
Expand Down
25 changes: 25 additions & 0 deletions dclab/rtdc_dataset/feat_basin.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from __future__ import annotations

import abc
from typing import Dict


class Basin(abc.ABC):
Expand Down Expand Up @@ -158,6 +159,30 @@ def load_dataset(self, location, **kwargs):
"""Subclasses should return an instance of :class:`.RTDCBase`"""


def basin_priority_sorted_key(bdict: Dict):
    """Return the priority sort key for one basin dictionary

    The returned value is meant to be passed as `key` to `sorted`, so
    that a list of basin dictionaries ends up ordered by priority.
    Keys that compare lower are sorted first:

    - type: "file" comes before "remote"
    - format: "hdf5" comes before "http", then "s3", then "dcor"

    A missing or unrecognized type or format is ranked last ("z").
    The type has precedence over the format, because its rank is the
    first character of the returned two-character string.
    """
    # Rank tables: a lexicographically smaller letter is a higher priority.
    type_ranks = {"file": "a", "remote": "b"}
    format_ranks = {"hdf5": "a", "http": "b", "s3": "c", "dcor": "d"}

    type_rank = type_ranks.get(bdict.get("type"), "z")
    format_rank = format_ranks.get(bdict.get("format"), "z")

    # Type first, format second.
    return "".join((type_rank, format_rank))


def get_basin_classes():
bc = {}
for bcls in Basin.__subclasses__():
Expand Down
3 changes: 2 additions & 1 deletion dclab/rtdc_dataset/fmt_hdf5/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ def hash(self):
def basins_get_dicts(self):
"""Return list of dicts for all basins defined in `self.h5file`"""
basins = []
for bk in sorted(self.h5file.get("basins", [])): # `sorted` priority
# Do not sort anything here, sorting is done in `RTDCBase`.
for bk in self.h5file.get("basins", []):
bdat = list(self.h5file["basins"][bk])
if isinstance(bdat[0], bytes):
bdat = [bi.decode("utf") for bi in bdat]
Expand Down
32 changes: 32 additions & 0 deletions tests/test_rtdc_feat_basin.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,43 @@
import pytest

from dclab import new_dataset, rtdc_dataset, RTDCWriter
from dclab.rtdc_dataset import feat_basin


from helper_methods import retrieve_data


def test_basin_sorting_basic():
    """Sorting by priority key puts file first and unknown types last"""
    unsorted_basins = [
        {"type": "remote", "format": "dcor", "ident": 0},
        {"type": "file", "format": "hdf5", "ident": 1},
        {"type": "hans", "format": "hdf5", "ident": 2},
        {"type": "remote", "format": "http", "ident": 3},
    ]
    by_priority = sorted(unsorted_basins,
                         key=feat_basin.basin_priority_sorted_key)
    # Expected order: file/hdf5, remote/http, remote/dcor, then the
    # basin with the unknown type "hans".
    ident_order = [bn["ident"] for bn in by_priority]
    assert ident_order == [1, 3, 0, 2]


@pytest.mark.parametrize(
    "btype,bformat,sortval",
    [
        ("file", "hdf5", "aa"),
        ("remote", "http", "bb"),
        ("remote", "s3", "bc"),
        ("remote", "dcor", "bd"),
        ("peter", "hdf5", "za"),
        ("remote", "hans", "bz"),
        ("hans", "peter", "zz"),
    ],
)
def test_basin_sorting_key(btype, bformat, sortval):
    """The sort key is the type rank followed by the format rank"""
    key = feat_basin.basin_priority_sorted_key(
        {"type": btype, "format": bformat})
    assert key == sortval


def test_basin_unsupported_basin_format():
h5path = retrieve_data("fmt-hdf5_fl_wide-channel_2023.zip")
h5path_small = h5path.with_name("smaller.rtdc")
Expand Down

0 comments on commit 0804911

Please sign in to comment.