Skip to content

Commit

Permalink
ref: make everything ready for next DCOR dcserv version
Browse files Browse the repository at this point in the history
(only the condensed datasets need to be populated in S3 for it to work)
  • Loading branch information
paulmueller committed Oct 24, 2023
1 parent fcbcd27 commit 8eb38a0
Show file tree
Hide file tree
Showing 9 changed files with 59 additions and 19 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
0.54.3
- fix: implement `__contains__` for DCOR logs and tables
- enh: add requests timeout for DCOR data
- enh: more caching of event size and shape for HDF5 format
- enh: faster computation of contour length for DCOR format
- fix: implement `__contains__` for DCOR logs and tables
- enh: use dcserv version 2 in DCOR format (fast S3 access)
- setup: pin s3fs>=2023.10.0
0.54.2
Expand Down
23 changes: 16 additions & 7 deletions dclab/rtdc_dataset/fmt_dcor/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class APIHandler:
#: DCOR API Keys/Tokens in the current session
api_keys = []

def __init__(self, url, api_key="", cert_path=None):
def __init__(self, url, api_key="", cert_path=None, dcserv_api_version=2):
"""
Parameters
Expand All @@ -40,6 +40,8 @@ def __init__(self, url, api_key="", cert_path=None):
self.verify = cert_path or True
#: DCOR API Token
self.api_key = api_key
#: ckanext-dc_serve dcserv API version
self.dcserv_api_version = dcserv_api_version
self._cache = {}

@classmethod
Expand All @@ -56,30 +58,37 @@ def _get(self, query, feat=None, trace=None, event=None, api_key="",
retries=3):
# "version=2" introduced in dclab 0.54.3
# (supported since ckanext.dc_serve 0.13.2)
qstr = f"&version=2&query={query}"
qstr = f"&version={self.dcserv_api_version}&query={query}"
if feat is not None:
qstr += f"&feature={feat}"
if trace is not None:
qstr += f"&trace={trace}"
if event is not None:
qstr += f"&event={event}"
apicall = self.url + qstr
fail_reasons = []
for _ in range(retries):
req = requests.get(apicall,
headers={"Authorization": api_key},
verify=self.verify)
verify=self.verify,
timeout=3.1,
)
try:
jreq = req.json()
except json.decoder.JSONDecodeError:
fail_reasons.append("invalid json")
time.sleep(0.1) # wait a bit, maybe the server is overloaded
continue
except requests.urllib3.exceptions.ConnectionError:
fail_reasons.append("connection problem")
time.sleep(5) # wait a bit, maybe the server is overloaded
continue
else:
break
else:
raise DCORAccessError(f"Could not complete query '{apicall}', "
"because the response did not contain any "
f"JSON-parseable data. Retried {retries} "
"times.")
raise DCORAccessError(f"Could not complete query '{apicall}'. "
f"I retried {retries} times. "
f"Messages: {fail_reasons}")
return jreq

def get(self, query, feat=None, trace=None, event=None):
Expand Down
13 changes: 10 additions & 3 deletions dclab/rtdc_dataset/fmt_dcor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@

class RTDC_DCOR(RTDCBase):
def __init__(self, url, host="dcor.mpl.mpg.de", api_key="",
use_ssl=None, cert_path=None, *args, **kwargs):
use_ssl=None, cert_path=None, dcserv_api_version=2,
*args, **kwargs):
"""Wrap around the DCOR API
Parameters
Expand All @@ -52,6 +53,10 @@ def __init__(self, url, host="dcor.mpl.mpg.de", api_key="",
The (optional) path to a server CA bundle; this should only
be necessary for DCOR instances in the intranet with a custom
CA or for certificate pinning.
dcserv_api_version: int
Version of the dcserv API to use. In version 0.13.2 of
ckanext-dc_serve, version 2 was introduced, which entails
serving an S3-basin-only dataset.
*args:
Arguments for `RTDCBase`
**kwargs:
Expand All @@ -74,8 +79,10 @@ def __init__(self, url, host="dcor.mpl.mpg.de", api_key="",
if cert_path is None:
cert_path = get_server_cert_path(get_host_from_url(self.path))

self.api = api.APIHandler(url=self.path, api_key=api_key,
cert_path=cert_path)
self.api = api.APIHandler(url=self.path,
api_key=api_key,
cert_path=cert_path,
dcserv_api_version=dcserv_api_version)

# Parse configuration
self.config = Configuration(cfg=self.api.get(query="metadata"))
Expand Down
1 change: 0 additions & 1 deletion dclab/rtdc_dataset/fmt_hdf5/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,4 +233,3 @@ def shape(self):
atrace = list(self.h5group.keys())[0]
self._shape = tuple([len(self)] + list(self.h5group[atrace].shape))
return self._shape

2 changes: 1 addition & 1 deletion dclab/rtdc_dataset/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def new_dataset(data, identifier=None, **kwargs):
identifier: str
A unique identifier for this dataset. If set to `None`
an identifier is generated.
kwargs: dict
kwargs:
Additional parameters passed to the RTDCBase subclass
Returns
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ dynamic = ["version"]

[project.optional-dependencies]
all = ["dclab[dcor,export,lme4,ml,s3,tdms]"]
dcor = ["requests>=2.31.0"] # CVE-2023-32681
dcor = ["requests>=2.31.0", # CVE-2023-32681
"dclab[s3]" # fast access via basins
]
export = ["fcswrite>=0.5.0", # fcs export
"imageio[ffmpeg]", # avi export
]
Expand Down
28 changes: 25 additions & 3 deletions tests/test_rtdc_fmt_dcor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,15 @@
from helper_methods import retrieve_data


try:
from s3fs import S3FileSystem
except ImportError:
S3FileSystem = None


pytest.importorskip("requests")
pytest.importorskip("s3fs")


with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
try:
Expand All @@ -20,6 +28,14 @@
allow_module_level=True)


@pytest.fixture(autouse=True)
def s3fs_cleanup():
# Clear the cache, so we get a clean slate every time we instantiate
# an S3FileSystem.
yield
S3FileSystem.cachable = False


class MockAPIHandler(dclab.rtdc_dataset.fmt_dcor.api.APIHandler):
def get(self, query, feat=None, trace=None, event=None):
"""Mocks communication with the DCOR API"""
Expand Down Expand Up @@ -77,8 +93,10 @@ def test_dcor_base(monkeypatch):


def test_dcor_cache_scalar():
# Testing the cache is only relevant for api version 1
# calibration beads
with dclab.new_dataset("fb719fb2-bd9f-817a-7d70-f4002af916f0") as ds:
with dclab.new_dataset("fb719fb2-bd9f-817a-7d70-f4002af916f0",
dcserv_api_version=1) as ds:
# sanity checks
assert len(ds) == 5000
assert "area_um" in ds
Expand All @@ -91,8 +109,10 @@ def test_dcor_cache_scalar():


def test_dcor_cache_trace():
# Testing the cache is only relevant for api version 1
# calibration beads
with dclab.new_dataset("fb719fb2-bd9f-817a-7d70-f4002af916f0") as ds:
with dclab.new_dataset("fb719fb2-bd9f-817a-7d70-f4002af916f0",
dcserv_api_version=1) as ds:
# sanity checks
assert len(ds) == 5000
assert "trace" in ds
Expand Down Expand Up @@ -135,9 +155,10 @@ def test_dcor_hierarchy(monkeypatch):

def test_dcor_logs():
with dclab.new_dataset("fb719fb2-bd9f-817a-7d70-f4002af916f0") as ds:
assert len(ds.logs) == 1
assert len(ds.logs) >= 2 # there might be others
assert ds.logs["log"][0] \
== "[LOG] number of written datasets 0 10:04:05.893"
assert "dclab-condense" in ds.logs


def test_dcor_shape_contour():
Expand Down Expand Up @@ -185,6 +206,7 @@ def test_dcor_slicing_contour(idxs):
"""Test slicing of contour data"""
# reticulocytes.rtdc contains contour data
ds = dclab.new_dataset("13247dd0-3d8b-711d-a410-468b4de6fb7a")

data_ref = [
ds["contour"][0],
ds["contour"][2],
Expand Down
2 changes: 1 addition & 1 deletion tests/test_rtdc_fmt_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

@pytest.fixture(autouse=True)
def s3fs_cleanup():
# Clear the cache so we get a clean slate every time we instantiate
# Clear the cache, so we get a clean slate every time we instantiate
# an S3FileSystem.
yield
S3FileSystem.cachable = False
Expand Down
2 changes: 1 addition & 1 deletion tests/test_rtdc_fmt_s3_basin.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

@pytest.fixture()
def s3fs_cleanup():
# Clear the cache so we get a clean slate every time we instantiate
# Clear the cache, so we get a clean slate every time we instantiate
# an S3FileSystem.
yield
S3FileSystem.cachable = False
Expand Down

0 comments on commit 8eb38a0

Please sign in to comment.