-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
1ee7009
commit 68c8852
Showing
5 changed files
with
279 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import re | ||
|
||
from ..feat_basin import Basin | ||
|
||
from .api import REQUESTS_AVAILABLE, APIHandler, DCORAccessError | ||
from .base import RTDC_DCOR | ||
|
||
|
||
# Pattern for a complete DCOR "dcserv" API URL: scheme, host, API endpoint
# and a UUID-shaped resource identifier.
# The slash between the host and the API path is mandatory. It used to be
# optional, which let malformed locations such as
# "https://dcor.mpl.mpg.deapi/3/action/dcserv?id=..." pass the check.
REGEXP_FULL_DCOR_URL = re.compile(
    r"^https?://"  # scheme
    r"[a-z0-9.-]*\.[a-z0-9.-]*"  # host (must contain at least one dot)
    r"/api/3/action/dcserv\?id="  # API endpoint
    r"[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$")  # id
|
||
|
||
class DCORBasin(Basin):
    basin_format = "dcor"
    basin_type = "remote"

    def __init__(self, *args, **kwargs):
        """Basin that resolves public and private DCOR resources

        Starting with version 2 of the DCOR data API, feature data
        are served through :class:`.HTTPBasin`s located on S3; this
        basin merely wraps those `HTTPBasin`s.

        The DCOR format takes care of authentication with access
        tokens for private resources. DCOR then hands out a pre-signed
        URL that grants access to the private data on the S3 object
        storage provider. For this to work, dclab must know your DCOR
        access token; register it with
        :func:`.APIHandler.add_api_key`.

        The `location` has to be a full DCOR URL with scheme and
        netloc, e.g.:
        https://dcor.mpl.mpg.de/api/3/action/dcserv?
        id=b1404eb5-f661-4920-be79-5ff4e85915d5
        """
        # Cached availability; `None` means "not determined yet".
        self._available_verified = None
        super().__init__(*args, **kwargs)

    def load_dataset(self, location, **kwargs):
        """Open `location` as an `RTDC_DCOR` dataset with basins enabled"""
        dataset = RTDC_DCOR(location, enable_basins=True, **kwargs)
        return dataset

    def is_available(self):
        """Tell whether the DCOR resource can be accessed

        Notes
        -----
        - Your DCOR access token must be known to
          :class:`.APIHandler`; tokens are registered via
          :func:`.APIHandler.add_api_key`.
        """
        if not REQUESTS_AVAILABLE or not is_full_dcor_url(self.location):
            # Either `requests` is missing (don't even bother) or the
            # location is not a full DCOR URL.
            self._available_verified = False
        elif self._available_verified is None:
            # Nothing cached yet: ask the DCOR API whether access works.
            handler = APIHandler(self.location)
            try:
                self._available_verified = handler.get("valid")
            except DCORAccessError:
                self._available_verified = False
        return self._available_verified
|
||
|
||
def is_full_dcor_url(string):
    """Return whether `string` is a full DCOR API URL

    Parameters
    ----------
    string: object
        Candidate location. Anything that is not a `str` is rejected.
        Leading and trailing whitespace is ignored.

    Returns
    -------
    is_full_url: bool
        `True` if the stripped string matches `REGEXP_FULL_DCOR_URL`,
        `False` otherwise.
    """
    if not isinstance(string, str):
        return False
    # `Pattern.match` yields a match object or `None`; convert the result
    # so that every code path of this predicate returns a real boolean.
    return REGEXP_FULL_DCOR_URL.match(string.strip()) is not None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
import json | ||
import uuid | ||
import socket | ||
|
||
import h5py | ||
import numpy as np | ||
|
||
import pytest | ||
|
||
from dclab import new_dataset, RTDCWriter | ||
from dclab.rtdc_dataset.fmt_dcor import DCORBasin, RTDC_DCOR | ||
|
||
|
||
from helper_methods import retrieve_data | ||
|
||
|
||
# All tests in this module talk to DCOR through `requests`.
pytest.importorskip("requests")


# Resource on the public DCOR instance that the tests below read from
dcor_url = (
    "https://dcor.mpl.mpg.de"
    "/api/3/action/dcserv"
    "?id=fb719fb2-bd9f-817a-7d70-f4002af916f0"
)


# Skip the entire module when the DCOR server cannot be reached.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
    try:
        sock.connect(("dcor.mpl.mpg.de", 443))
    except OSError:
        # `socket.gaierror` is a subclass of `OSError` and handled here, too
        pytest.skip("No connection to DCOR",
                    allow_module_level=True)
|
||
|
||
@pytest.mark.filterwarnings(
    "ignore::dclab.rtdc_dataset.config.WrongConfigurationTypeWarning")
def test_basin_as_dict(tmp_path):
    """Round-trip a DCOR basin definition through `as_dict`"""
    # Values every basin dictionary in this test must reproduce
    expected = {
        "basin_name": "example basin",
        "basin_type": "remote",
        "basin_format": "dcor",
        "basin_locs": [dcor_url],
        "basin_descr": "an example DCOR test basin",
    }
    path_one = tmp_path.resolve() / "test_basin_dcor.rtdc"

    with h5py.File(path_one, "a") as dst, RTDC_DCOR(dcor_url) as src:
        # Write the metadata of the DCOR resource and a basin that
        # points back to that resource
        with RTDCWriter(dst, mode="append") as hw:
            meta = src.config.as_dict(pop_filtering=True)
            hw.store_metadata(meta)
            hw.store_basin(basin_name="example basin",
                           basin_type="remote",
                           basin_format="dcor",
                           basin_locs=[dcor_url],
                           basin_descr="an example DCOR test basin",
                           )

    with new_dataset(path_one) as ds:
        assert ds._enable_basins
        bdict = ds.basins[0].as_dict()
        for key, value in expected.items():
            assert bdict[key] == value

    # Feed the exported dictionary into a second file ...
    path_two = path_one.with_name("smaller_two.rtdc")
    with RTDCWriter(path_two) as hw:
        hw.store_metadata(meta)
        hw.store_basin(**bdict)

    # ... and make sure the basin read from there is described identically
    with new_dataset(path_two) as ds2:
        bdict2 = ds2.basins[0].as_dict()
        for key, value in expected.items():
            assert bdict2[key] == value
|
||
|
||
@pytest.mark.parametrize("url", [
    "https://example.com/nonexistentbucket/nonexistentkey",
    f"https://objectstore.hpccloud.mpcdf.mpg.de/noexist-{uuid.uuid4()}/key",
])
def test_basin_not_available(url):
    """Basins with unusable locations must be ignored"""
    h5path = retrieve_data("fmt-hdf5_fl_wide-channel_2023.zip")

    # Basin definition whose only URL does not exist
    basin_info = {
        "type": "remote",
        "format": "dcor",
        "urls": [url],
    }
    basin_lines = json.dumps(basin_info, indent=2).split("\n")

    # Attach the basin definition to the dataset
    with h5py.File(h5path, "a") as dst:
        basin_group = dst.require_group("basins")
        with RTDCWriter(dst, mode="append") as hw:
            hw.write_text(basin_group, "my_basin", basin_lines)

    # The basin must not contribute any features when the dataset is opened
    with new_dataset(h5path) as ds:
        assert not ds.features_basin
        # Subtle check that invalid basins are simply ignored:
        # accessing a feature must still work.
        _ = ds["index"]

    # Same check on a lower level, using a well-formed DCOR URL
    basin = DCORBasin("https://dcor.mpl.mpg.de/api/3/action/dcserv?id="
                      "00000000-0000-0000-0000-000000000000")
    assert not basin.is_available()
    with pytest.raises(ValueError, match="is not available"):
        _ = basin.ds
|
||
|
||
def test_create_basin_file_non_matching_identifier(tmp_path):
    """Basin features are inaccessible if the run identifier differs"""
    h5path = tmp_path / "test_basin_dcor.rtdc"

    # Basin definition that offers only the "deform" feature
    basin_info = {
        "type": "remote",
        "format": "dcor",
        "urls": [dcor_url],
        "features": ["deform"],
    }
    basin_lines = json.dumps(basin_info, indent=2).split("\n")

    with h5py.File(h5path, "a") as dst, RTDC_DCOR(dcor_url) as src:
        basin_group = dst.require_group("basins")
        with RTDCWriter(dst, mode="append") as hw:
            hw.write_text(basin_group, "my_basin", basin_lines)
            # Store the metadata with a deliberately wrong run identifier
            meta = src.config.as_dict(pop_filtering=True)
            meta["experiment"]["run identifier"] = "hoolahoop"
            hw.store_metadata(meta)

    with new_dataset(h5path) as ds:
        assert ds.basins
        # The feature is listed as a basin feature...
        assert ds.features_basin == ["deform"]
        # ...but the run identifier does not match, so dclab does not
        # allow the user to access it.
        with pytest.raises(KeyError, match="deform"):
            _ = ds["deform"]
|
||
|
||
@pytest.mark.filterwarnings(
    "ignore::dclab.rtdc_dataset.config.WrongConfigurationTypeWarning")
def test_create_basin_file_with_no_data(tmp_path):
    """A file holding only metadata gets its features from the basin"""
    h5path = tmp_path / "test_basin_dcor.rtdc"

    # Basin definition without a feature restriction
    basin_info = {
        "type": "remote",
        "format": "dcor",
        "urls": [dcor_url],
    }
    basin_lines = json.dumps(basin_info, indent=2).split("\n")

    with h5py.File(h5path, "a") as dst, RTDC_DCOR(dcor_url) as src:
        basin_group = dst.require_group("basins")
        with RTDCWriter(dst, mode="append") as hw:
            hw.write_text(basin_group, "my_basin", basin_lines)
            meta = src.config.as_dict(pop_filtering=True)
            hw.store_metadata(meta)

    with new_dataset(h5path) as ds:
        # In essence, this tests nested basin features: the basin is a
        # DCOR dataset that itself has two HTTP basins, the condensed and
        # the full version of the data.
        assert len(ds.basins) == 1
        outer_basin = ds.basins[0]
        assert len(outer_basin.ds.basins) == 2
        assert ds.features_basin
        assert len(ds) == 5000
        first_deform = ds["deform"][0]
        assert np.allclose(first_deform, 0.009741939,
                           atol=0, rtol=1e-5)
|
||
|
||
@pytest.mark.filterwarnings(
    "ignore::dclab.rtdc_dataset.config.WrongConfigurationTypeWarning")
def test_create_basin_file_with_one_feature(tmp_path):
    """A basin restricted to one feature exposes only that feature"""
    h5path = tmp_path / "test_basin_dcor.rtdc"

    # Basin definition that offers only the "deform" feature
    basin_info = {
        "type": "remote",
        "format": "dcor",
        "urls": [dcor_url],
        "features": ["deform"],
    }
    basin_lines = json.dumps(basin_info, indent=2).split("\n")

    with h5py.File(h5path, "a") as dst, RTDC_DCOR(dcor_url) as src:
        basin_group = dst.require_group("basins")
        with RTDCWriter(dst, mode="append") as hw:
            hw.write_text(basin_group, "my_basin", basin_lines)
            meta = src.config.as_dict(pop_filtering=True)
            hw.store_metadata(meta)

    with new_dataset(h5path) as ds:
        assert ds.features_basin
        assert len(ds) == 5000
        # "deform" is provided by the basin, "area_um" is not
        assert "deform" in ds.features_basin
        assert "area_um" not in ds.features_basin
        assert "deform" in ds
        assert "area_um" not in ds
        first_deform = ds["deform"][0]
        assert np.allclose(first_deform, 0.009741939,
                           atol=0, rtol=1e-5)