diff --git a/CHANGELOG b/CHANGELOG index 4c2678e9..317673cb 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,6 @@ 0.56.0 - feat: allow nested basins + - feat: implement DCOR basins - fix: make sure basins are always closed on context exit (#238) - enh: requests session pooling for fmt_http and fmt_dcor - enh: implement context manager for RTDCBase diff --git a/dclab/rtdc_dataset/fmt_dcor/__init__.py b/dclab/rtdc_dataset/fmt_dcor/__init__.py index bff5b6f7..c11a3720 100644 --- a/dclab/rtdc_dataset/fmt_dcor/__init__.py +++ b/dclab/rtdc_dataset/fmt_dcor/__init__.py @@ -4,3 +4,4 @@ from .base import ( DCOR_CERTS_SEARCH_PATHS, RTDC_DCOR, get_server_cert_path, is_dcor_url ) +from .basin import DCORBasin diff --git a/dclab/rtdc_dataset/fmt_dcor/base.py b/dclab/rtdc_dataset/fmt_dcor/base.py index b6cb66d0..97a66ccf 100644 --- a/dclab/rtdc_dataset/fmt_dcor/base.py +++ b/dclab/rtdc_dataset/fmt_dcor/base.py @@ -20,7 +20,7 @@ #: Regular expression for matching a DCOR resource URL REGEXP_DCOR_URL = re.compile( - r"^(https?:\/\/)?" # protocol + r"^(https?:\/\/)?" # scheme r"([a-z0-9-\.]*\/?api\/3\/action\/dcserv\?id=)?" # host with API r"[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$") # id diff --git a/dclab/rtdc_dataset/fmt_dcor/basin.py b/dclab/rtdc_dataset/fmt_dcor/basin.py new file mode 100644 index 00000000..a0146e94 --- /dev/null +++ b/dclab/rtdc_dataset/fmt_dcor/basin.py @@ -0,0 +1,72 @@ +import re + +from ..feat_basin import Basin + +from .api import REQUESTS_AVAILABLE, APIHandler, DCORAccessError +from .base import RTDC_DCOR + + +REGEXP_FULL_DCOR_URL = re.compile( + r"^https?:\/\/" # scheme + r"[a-z0-9-\.]*\.[a-z0-9-\.]*\/?api\/3\/action\/dcserv\?id=" # host and API + r"[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$") # id + + +class DCORBasin(Basin): + basin_format = "dcor" + basin_type = "remote" + + def __init__(self, *args, **kwargs): + """Access to private and public DCOR resources + + Since version 2 of the DCOR data API, all feature data are + accessed via :class:`.HTTPBasin`s on S3. The DCOR basin is just + a wrapper around those `HTTPBasin`s. + + For private resources, the DCOR format facilitates authentication + via access tokens. Behind the scenes, DCOR creates a pre-signed + URL to access private data on an S3 object storage provider. + Note that you must let dclab know your DCOR access + token via :func:`.APIHandler.add_api_key` for this to work. + + The `location` must be a full DCOR URL, including the scheme + and netloc, e.g: + + https://dcor.mpl.mpg.de/api/3/action/dcserv? + id=b1404eb5-f661-4920-be79-5ff4e85915d5 + """ + self._available_verified = None + super(DCORBasin, self).__init__(*args, **kwargs) + + def load_dataset(self, location, **kwargs): + return RTDC_DCOR(location, enable_basins=True, **kwargs) + + def is_available(self): + """Check whether a DCOR resource is available + + Notes + ----- + - Make sure that your DCOR access token is stored in + :class:`.APIHandler`. You can add tokens with + :func:`.APIHandler.add_api_key`. + """ + if not REQUESTS_AVAILABLE: + # don't even bother + self._available_verified = False + elif not is_full_dcor_url(self.location): + # not a full DCOR URL + self._available_verified = False + if self._available_verified is None: + api = APIHandler(self.location) + try: + self._available_verified = api.get("valid") + except DCORAccessError: + self._available_verified = False + return self._available_verified + + +def is_full_dcor_url(string): + if not isinstance(string, str): + return False + else: + return REGEXP_FULL_DCOR_URL.match(string.strip()) diff --git a/tests/test_rtdc_fmt_dcor_basin.py b/tests/test_rtdc_fmt_dcor_basin.py new file mode 100644 index 00000000..cc3cd13d --- /dev/null +++ b/tests/test_rtdc_fmt_dcor_basin.py @@ -0,0 +1,204 @@ +import json +import uuid +import socket + +import h5py +import numpy as np + +import pytest + +from dclab import new_dataset, RTDCWriter +from dclab.rtdc_dataset.fmt_dcor import DCORBasin, RTDC_DCOR + + +from helper_methods import retrieve_data + + +pytest.importorskip("requests") + + +dcor_url = ("https://dcor.mpl.mpg.de/api/3/action/dcserv?id=" + "fb719fb2-bd9f-817a-7d70-f4002af916f0") + + +with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + s.connect(("dcor.mpl.mpg.de", 443)) + except (socket.gaierror, OSError): + pytest.skip("No connection to DCOR", + allow_module_level=True) + + +@pytest.mark.filterwarnings( + "ignore::dclab.rtdc_dataset.config.WrongConfigurationTypeWarning") +def test_basin_as_dict(tmp_path): + tmp_path = tmp_path.resolve() + h5path = tmp_path / "test_basin_dcor.rtdc" + + with h5py.File(h5path, "a") as dst, RTDC_DCOR(dcor_url) as src: + # Store non-existent basin information + with RTDCWriter(dst, mode="append") as hw: + meta = src.config.as_dict(pop_filtering=True) + hw.store_metadata(meta) + hw.store_basin(basin_name="example basin", + basin_type="remote", + basin_format="dcor", + basin_locs=[dcor_url], + basin_descr="an example DCOR test basin", + ) + + with new_dataset(h5path) as ds: + assert ds._enable_basins + bdict = ds.basins[0].as_dict() + assert bdict["basin_name"] == "example basin" + assert bdict["basin_type"] == "remote" + assert bdict["basin_format"] == "dcor" + assert bdict["basin_locs"] == [dcor_url] + assert bdict["basin_descr"] == "an example DCOR test basin" + + # Now use the data from `bdict` to create a new basin + h5path_two = h5path.with_name("smaller_two.rtdc") + + # Dataset creation + with RTDCWriter(h5path_two) as hw: + # first, copy all the scalar features to the new file + hw.store_metadata(meta) + hw.store_basin(**bdict) + + with new_dataset(h5path_two) as ds2: + bdict2 = ds2.basins[0].as_dict() + assert bdict2["basin_name"] == "example basin" + assert bdict2["basin_type"] == "remote" + assert bdict2["basin_format"] == "dcor" + assert bdict2["basin_locs"] == [dcor_url] + assert bdict2["basin_descr"] == "an example DCOR test basin" + + +@pytest.mark.parametrize("url", [ + "https://example.com/nonexistentbucket/nonexistentkey", + f"https://objectstore.hpccloud.mpcdf.mpg.de/noexist-{uuid.uuid4()}/key", +]) +def test_basin_not_available(url): + h5path = retrieve_data("fmt-hdf5_fl_wide-channel_2023.zip") + + # Dataset creation + with h5py.File(h5path, "a") as dst: + # Store non-existent basin information + bdat = { + "type": "remote", + "format": "dcor", + "urls": [ + # does not exist + url + ] + } + blines = json.dumps(bdat, indent=2).split("\n") + basins = dst.require_group("basins") + with RTDCWriter(dst, mode="append") as hw: + hw.write_text(basins, "my_basin", blines) + + # Open the dataset and check whether basin is missing + with new_dataset(h5path) as ds: + assert not ds.features_basin + # This is a very subtle test for checking whether invalid basins + # are just ignored: + _ = ds["index"] + + # Also test that on a lower level + bn = DCORBasin("https://dcor.mpl.mpg.de/api/3/action/dcserv?id=" + "00000000-0000-0000-0000-000000000000") + assert not bn.is_available() + with pytest.raises(ValueError, match="is not available"): + _ = bn.ds + + +def test_create_basin_file_non_matching_identifier(tmp_path): + h5path = tmp_path / "test_basin_dcor.rtdc" + + with h5py.File(h5path, "a") as dst, RTDC_DCOR(dcor_url) as src: + # Store non-existent basin information + bdat = { + "type": "remote", + "format": "dcor", + "urls": [dcor_url], + "features": ["deform"], + } + blines = json.dumps(bdat, indent=2).split("\n") + basins = dst.require_group("basins") + with RTDCWriter(dst, mode="append") as hw: + hw.write_text(basins, "my_basin", blines) + meta = src.config.as_dict(pop_filtering=True) + meta["experiment"]["run identifier"] = "hoolahoop" + hw.store_metadata(meta) + + with new_dataset(h5path) as ds: + assert ds.basins + # The feature shows up as available... + assert ds.features_basin == ["deform"] + # ...but it is actually not, since the run identifier does not match + # and therefore dclab does not allow the user to access it. + with pytest.raises(KeyError, match="deform"): + _ = ds["deform"] + + +@pytest.mark.filterwarnings( + "ignore::dclab.rtdc_dataset.config.WrongConfigurationTypeWarning") +def test_create_basin_file_with_no_data(tmp_path): + h5path = tmp_path / "test_basin_dcor.rtdc" + + with h5py.File(h5path, "a") as dst, RTDC_DCOR(dcor_url) as src: + # Store non-existent basin information + bdat = { + "type": "remote", + "format": "dcor", + "urls": [dcor_url] + } + blines = json.dumps(bdat, indent=2).split("\n") + basins = dst.require_group("basins") + with RTDCWriter(dst, mode="append") as hw: + hw.write_text(basins, "my_basin", blines) + meta = src.config.as_dict(pop_filtering=True) + hw.store_metadata(meta) + + with new_dataset(h5path) as ds: + # This is essentially a nested basin features test. The basin is + # a DCOR dataset which has two basins, the condensed version of the + # data and the full version of the data as HTTP basins. + assert len(ds.basins) == 1 + bn = ds.basins[0] + assert len(bn.ds.basins) == 2 + assert ds.features_basin + assert len(ds) == 5000 + assert np.allclose(ds["deform"][0], 0.009741939, + atol=0, rtol=1e-5) + + +@pytest.mark.filterwarnings( + "ignore::dclab.rtdc_dataset.config.WrongConfigurationTypeWarning") +def test_create_basin_file_with_one_feature(tmp_path): + h5path = tmp_path / "test_basin_dcor.rtdc" + + with h5py.File(h5path, "a") as dst, RTDC_DCOR(dcor_url) as src: + # Store non-existent basin information + bdat = { + "type": "remote", + "format": "dcor", + "urls": [dcor_url], + "features": ["deform"], + } + blines = json.dumps(bdat, indent=2).split("\n") + basins = dst.require_group("basins") + with RTDCWriter(dst, mode="append") as hw: + hw.write_text(basins, "my_basin", blines) + meta = src.config.as_dict(pop_filtering=True) + hw.store_metadata(meta) + + with new_dataset(h5path) as ds: + assert ds.features_basin + assert len(ds) == 5000 + assert "deform" in ds.features_basin + assert "area_um" not in ds.features_basin + assert "deform" in ds + assert "area_um" not in ds + assert np.allclose(ds["deform"][0], 0.009741939, + atol=0, rtol=1e-5)