Skip to content

Commit

Permalink
feat: implement DCOR basins
Browse files Browse the repository at this point in the history
  • Loading branch information
paulmueller committed Dec 30, 2023
1 parent 1ee7009 commit 68c8852
Show file tree
Hide file tree
Showing 5 changed files with 279 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
0.56.0
- feat: allow nested basins
- feat: implement DCOR basins
- fix: make sure basins are always closed on context exit (#238)
- enh: requests session pooling for fmt_http and fmt_dcor
- enh: implement context manager for RTDCBase
Expand Down
1 change: 1 addition & 0 deletions dclab/rtdc_dataset/fmt_dcor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
from .base import (
DCOR_CERTS_SEARCH_PATHS, RTDC_DCOR, get_server_cert_path, is_dcor_url
)
from .basin import DCORBasin
2 changes: 1 addition & 1 deletion dclab/rtdc_dataset/fmt_dcor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

#: Regular expression for matching a DCOR resource URL
REGEXP_DCOR_URL = re.compile(
r"^(https?:\/\/)?" # protocol
r"^(https?:\/\/)?" # scheme
r"([a-z0-9-\.]*\/?api\/3\/action\/dcserv\?id=)?" # host with API
r"[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$") # id

Expand Down
72 changes: 72 additions & 0 deletions dclab/rtdc_dataset/fmt_dcor/basin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import re

from ..feat_basin import Basin

from .api import REQUESTS_AVAILABLE, APIHandler, DCORAccessError
from .base import RTDC_DCOR


#: Regular expression for matching a *full* DCOR resource URL. All three
#: parts are mandatory: the http(s) scheme, a host name containing at
#: least one dot followed by the "api/3/action/dcserv?id=" endpoint, and
#: the resource identifier in 8-4-4-4-12 lower-case hexadecimal notation.
REGEXP_FULL_DCOR_URL = re.compile(
r"^https?:\/\/" # scheme
r"[a-z0-9-\.]*\.[a-z0-9-\.]*\/?api\/3\/action\/dcserv\?id=" # host and API
r"[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$") # id


class DCORBasin(Basin):
    """Remote basin whose data are served by a DCOR instance"""
    basin_format = "dcor"
    basin_type = "remote"

    def __init__(self, *args, **kwargs):
        """Access to private and public DCOR resources

        Since version 2 of the DCOR data API, all feature data are
        accessed via :class:`.HTTPBasin`s on S3. The DCOR basin is just
        a wrapper around those `HTTPBasin`s.

        For private resources, the DCOR format facilitates authentication
        via access tokens. Behind the scenes, DCOR creates a pre-signed
        URL to access private data on an S3 object storage provider.
        Note that you must let dclab know your DCOR access
        token via :func:`.APIHandler.add_api_key` for this to work.

        The `location` must be a full DCOR URL, including the scheme
        and netloc, e.g:

            https://dcor.mpl.mpg.de/api/3/action/dcserv?
            id=b1404eb5-f661-4920-be79-5ff4e85915d5

        All positional and keyword arguments are forwarded unchanged
        to the parent class initializer.
        """
        # Cached result of `is_available`; `None` means "not checked yet".
        # Assigned before the parent initializer runs so the attribute
        # already exists at that point (presumably the parent may query
        # availability during initialization - verify against `Basin`).
        self._available_verified = None
        super().__init__(*args, **kwargs)

    def load_dataset(self, location, **kwargs):
        """Open `location` as an :class:`RTDC_DCOR` with basins enabled

        Additional keyword arguments are passed on to `RTDC_DCOR`.
        """
        return RTDC_DCOR(location, enable_basins=True, **kwargs)

    def is_available(self):
        """Check whether a DCOR resource is available

        The basin is reported as unavailable if `requests` is not
        available or if `self.location` is not a full DCOR URL.
        Otherwise, the "valid" API endpoint is queried once and the
        outcome is cached for subsequent calls; a `DCORAccessError`
        raised by that query counts as "not available".

        Notes
        -----
        - Make sure that your DCOR access token is stored in
          :class:`.APIHandler`. You can add tokens with
          :func:`.APIHandler.add_api_key`.
        """
        # Short-circuit: the URL is only inspected when `requests` exists.
        unusable = (not REQUESTS_AVAILABLE
                    or not is_full_dcor_url(self.location))
        if unusable:
            self._available_verified = False
        elif self._available_verified is None:
            # Nothing cached yet, ask the DCOR server.
            handler = APIHandler(self.location)
            try:
                verdict = handler.get("valid")
            except DCORAccessError:
                verdict = False
            self._available_verified = verdict
        return self._available_verified


def is_full_dcor_url(string):
    """Return whether `string` is a full DCOR resource URL

    A full DCOR URL is one that matches `REGEXP_FULL_DCOR_URL`.
    Leading and trailing whitespace is stripped before matching.

    Parameters
    ----------
    string: object
        The candidate location; anything that is not a `str`
        is rejected without raising.

    Returns
    -------
    is_full: bool
        `True` if `string` is a `str` matching the pattern, else `False`.
    """
    if not isinstance(string, str):
        return False
    # Convert the match result to a real boolean so that this predicate
    # never leaks an `re.Match` object or `None` to its callers.
    return REGEXP_FULL_DCOR_URL.match(string.strip()) is not None
204 changes: 204 additions & 0 deletions tests/test_rtdc_fmt_dcor_basin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
import json
import uuid
import socket

import h5py
import numpy as np

import pytest

from dclab import new_dataset, RTDCWriter
from dclab.rtdc_dataset.fmt_dcor import DCORBasin, RTDC_DCOR


from helper_methods import retrieve_data


# Skip this entire module if `requests` cannot be imported.
pytest.importorskip("requests")


#: DCOR resource URL used as the remote basin location in the tests below
dcor_url = ("https://dcor.mpl.mpg.de/api/3/action/dcserv?id="
            "fb719fb2-bd9f-817a-7d70-f4002af916f0")


# Probe the DCOR host once at import time and skip every test in this
# module if no TCP connection to port 443 can be established.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
    try:
        probe.connect(("dcor.mpl.mpg.de", 443))
    except OSError:
        # `socket.gaierror` (failed name resolution) is a subclass of
        # `OSError` and is therefore handled here as well.
        pytest.skip("No connection to DCOR",
                    allow_module_level=True)


@pytest.mark.filterwarnings(
    "ignore::dclab.rtdc_dataset.config.WrongConfigurationTypeWarning")
def test_basin_as_dict(tmp_path):
    """Round-trip a DCOR basin definition through `as_dict`"""
    # What `as_dict` is expected to report for the stored basin
    expected = {
        "basin_name": "example basin",
        "basin_type": "remote",
        "basin_format": "dcor",
        "basin_locs": [dcor_url],
        "basin_descr": "an example DCOR test basin",
    }
    workdir = tmp_path.resolve()
    path_one = workdir / "test_basin_dcor.rtdc"

    with h5py.File(path_one, "a") as dst, RTDC_DCOR(dcor_url) as src:
        # Write the metadata of the DCOR resource plus a basin that
        # points back at that very resource.
        with RTDCWriter(dst, mode="append") as hw:
            meta = src.config.as_dict(pop_filtering=True)
            hw.store_metadata(meta)
            hw.store_basin(basin_name="example basin",
                           basin_type="remote",
                           basin_format="dcor",
                           basin_locs=[dcor_url],
                           basin_descr="an example DCOR test basin",
                           )

    with new_dataset(path_one) as ds:
        assert ds._enable_basins
        bdict = ds.basins[0].as_dict()
        for key, value in expected.items():
            assert bdict[key] == value

    # Now use the data from `bdict` to create a new basin
    path_two = workdir / "smaller_two.rtdc"

    with RTDCWriter(path_two) as hw:
        # same metadata as before, basin defined solely via `bdict`
        hw.store_metadata(meta)
        hw.store_basin(**bdict)

    with new_dataset(path_two) as ds2:
        bdict2 = ds2.basins[0].as_dict()
        for key, value in expected.items():
            assert bdict2[key] == value


@pytest.mark.parametrize("url", [
    "https://example.com/nonexistentbucket/nonexistentkey",
    f"https://objectstore.hpccloud.mpcdf.mpg.de/noexist-{uuid.uuid4()}/key",
])
def test_basin_not_available(url):
    """Basins with unusable locations must be ignored silently"""
    rtdc_path = retrieve_data("fmt-hdf5_fl_wide-channel_2023.zip")

    # Basin definition that points to a location that does not exist
    basin_def = {
        "type": "remote",
        "format": "dcor",
        "urls": [url],
    }
    basin_lines = json.dumps(basin_def, indent=2).split("\n")

    # Append the basin definition to the local dataset
    with h5py.File(rtdc_path, "a") as dst:
        basin_group = dst.require_group("basins")
        with RTDCWriter(dst, mode="append") as hw:
            hw.write_text(basin_group, "my_basin", basin_lines)

    # Open the dataset and check whether basin is missing
    with new_dataset(rtdc_path) as ds:
        assert not ds.features_basin
        # This is a very subtle test for checking whether invalid basins
        # are just ignored:
        _ = ds["index"]

    # Also test that on a lower level: a well-formed DCOR URL with an
    # all-zero resource identifier
    basin = DCORBasin("https://dcor.mpl.mpg.de/api/3/action/dcserv?id="
                      "00000000-0000-0000-0000-000000000000")
    assert not basin.is_available()
    with pytest.raises(ValueError, match="is not available"):
        _ = basin.ds


def test_create_basin_file_non_matching_identifier(tmp_path):
    """Basin features are inaccessible if the run identifier differs"""
    rtdc_path = tmp_path / "test_basin_dcor.rtdc"

    # Basin definition that exposes only the "deform" feature
    basin_def = {
        "type": "remote",
        "format": "dcor",
        "urls": [dcor_url],
        "features": ["deform"],
    }
    basin_lines = json.dumps(basin_def, indent=2).split("\n")

    with h5py.File(rtdc_path, "a") as dst, RTDC_DCOR(dcor_url) as src:
        basin_group = dst.require_group("basins")
        with RTDCWriter(dst, mode="append") as hw:
            hw.write_text(basin_group, "my_basin", basin_lines)
            meta = src.config.as_dict(pop_filtering=True)
            # Deliberately break the link between this file and the basin
            meta["experiment"]["run identifier"] = "hoolahoop"
            hw.store_metadata(meta)

    with new_dataset(rtdc_path) as ds:
        assert ds.basins
        # The feature shows up as available...
        assert ds.features_basin == ["deform"]
        # ...but it is actually not, since the run identifier does not match
        # and therefore dclab does not allow the user to access it.
        with pytest.raises(KeyError, match="deform"):
            _ = ds["deform"]


@pytest.mark.filterwarnings(
    "ignore::dclab.rtdc_dataset.config.WrongConfigurationTypeWarning")
def test_create_basin_file_with_no_data(tmp_path):
    """A file holding only metadata gets its features from the basin"""
    rtdc_path = tmp_path / "test_basin_dcor.rtdc"

    # Basin definition without any restriction on the features
    basin_def = {
        "type": "remote",
        "format": "dcor",
        "urls": [dcor_url],
    }
    basin_lines = json.dumps(basin_def, indent=2).split("\n")

    with h5py.File(rtdc_path, "a") as dst, RTDC_DCOR(dcor_url) as src:
        basin_group = dst.require_group("basins")
        with RTDCWriter(dst, mode="append") as hw:
            hw.write_text(basin_group, "my_basin", basin_lines)
            hw.store_metadata(src.config.as_dict(pop_filtering=True))

    with new_dataset(rtdc_path) as ds:
        # This is essentially a nested basin features test. The basin is
        # a DCOR dataset which has two basins, the condensed version of the
        # data and the full version of the data as HTTP basins.
        assert len(ds.basins) == 1
        outer_basin = ds.basins[0]
        assert len(outer_basin.ds.basins) == 2
        assert ds.features_basin
        assert len(ds) == 5000
        first_deform = ds["deform"][0]
        assert np.allclose(first_deform, 0.009741939,
                           atol=0, rtol=1e-5)


@pytest.mark.filterwarnings(
    "ignore::dclab.rtdc_dataset.config.WrongConfigurationTypeWarning")
def test_create_basin_file_with_one_feature(tmp_path):
    """A basin restricted to "deform" must expose only that feature"""
    rtdc_path = tmp_path / "test_basin_dcor.rtdc"

    # Basin definition that exposes only the "deform" feature
    basin_def = {
        "type": "remote",
        "format": "dcor",
        "urls": [dcor_url],
        "features": ["deform"],
    }
    basin_lines = json.dumps(basin_def, indent=2).split("\n")

    with h5py.File(rtdc_path, "a") as dst, RTDC_DCOR(dcor_url) as src:
        basin_group = dst.require_group("basins")
        with RTDCWriter(dst, mode="append") as hw:
            hw.write_text(basin_group, "my_basin", basin_lines)
            hw.store_metadata(src.config.as_dict(pop_filtering=True))

    with new_dataset(rtdc_path) as ds:
        assert ds.features_basin
        assert len(ds) == 5000
        # Check the basin feature list first, then the dataset itself
        for holder in (ds.features_basin, ds):
            assert "deform" in holder
            assert "area_um" not in holder
        first_deform = ds["deform"][0]
        assert np.allclose(first_deform, 0.009741939,
                           atol=0, rtol=1e-5)

0 comments on commit 68c8852

Please sign in to comment.