diff --git a/conda/meta.yaml b/conda/meta.yaml
index b0aae100..36a7cef6 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -9,6 +9,7 @@ requirements:
     - setuptools
   run:
     - python>=3.8
+    - python-dateutil
     - scipp
     - h5py
diff --git a/pyproject.toml b/pyproject.toml
index e5f54638..a5cf6602 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,10 @@ build-backend = "setuptools.build_meta"
 [tool.pytest.ini_options]
 addopts = "-ra -v"
 testpaths = "tests"
+filterwarnings = [
+    "error",
+    "ignore::UserWarning",
+]
 
 [tool.mypy]
 mypy_path = "src"
diff --git a/setup.cfg b/setup.cfg
index 18c631f8..26089eb4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -23,6 +23,7 @@ package_dir =
     = src
 packages = find:
 install_requires =
+    python-dateutil
     scipp>=0.12
     h5py
 python_requires = >=3.8
diff --git a/src/scippnexus/_common.py b/src/scippnexus/_common.py
index b8b9bade..03daca88 100644
--- a/src/scippnexus/_common.py
+++ b/src/scippnexus/_common.py
@@ -9,7 +9,6 @@
 
 def convert_time_to_datetime64(
         raw_times: sc.Variable,
-        group_path: str,
         start: str = None,
         scaling_factor: Union[float, np.float_] = None) -> sc.Variable:
     """
@@ -25,8 +24,6 @@ def convert_time_to_datetime64(
 
     Args:
         raw_times: The raw time data from a nexus file.
-        group_path: The path within the nexus file to the log being read.
-            Used to generate warnings if loading the log fails.
         start: Optional, the start time of the log in an ISO8601 string.
             If not provided, defaults to the beginning of the unix epoch
             (1970-01-01T00:00:00).
@@ -34,31 +31,21 @@ def convert_time_to_datetime64(
             time series data and the unit of the raw_times Variable. If not
             provided, defaults to 1 (a no-op scaling factor).
     """
-    try:
-        raw_times_ns = sc.to_unit(raw_times, sc.units.ns, copy=False)
-    except sc.UnitError:
-        raise sc.UnitError(
-            f"The units of time in the entry at "
-            f"'{group_path}/time{{units}}' must be convertible to seconds, "
-            f"but this cannot be done for '{raw_times.unit}'. Skipping "
-            f"loading group at '{group_path}'.")
-
-    try:
-        _start_ts = sc.scalar(value=np.datetime64(start or "1970-01-01T00:00:00"),
-                              unit=sc.units.ns,
-                              dtype=sc.DType.datetime64)
-    except ValueError:
-        raise ValueError(
-            f"The date string '{start}' in the entry at "
-            f"'{group_path}/time@start' failed to parse as an ISO8601 date. "
-            f"Skipping loading group at '{group_path}'")
+    if (raw_times.dtype
+            in (sc.DType.float64, sc.DType.float32)) or scaling_factor is not None:
+        unit = sc.units.ns
+    else:
+        # determine more precise unit
+        ratio = sc.scalar(1.0, unit=start.unit) / sc.scalar(
+            1.0, unit=raw_times.unit).to(unit=start.unit)
+        unit = start.unit if ratio.value < 1.0 else raw_times.unit
 
     if scaling_factor is None:
-        times = raw_times_ns.astype(sc.DType.int64, copy=False)
+        times = raw_times
     else:
-        _scale = sc.scalar(value=scaling_factor)
-        times = (raw_times_ns * _scale).astype(sc.DType.int64, copy=False)
-    return _start_ts + times
+        times = raw_times * sc.scalar(value=scaling_factor)
+    return start.to(unit=unit, copy=False) + times.to(
+        dtype=sc.DType.int64, unit=unit, copy=False)
 
 
 def _to_canonical_select(dims: List[str],
diff --git a/src/scippnexus/docs/our-interpretation-of-the-nexus-format.md b/src/scippnexus/docs/our-interpretation-of-the-nexus-format.md
index ab3931be..022d8a0f 100644
--- a/src/scippnexus/docs/our-interpretation-of-the-nexus-format.md
+++ b/src/scippnexus/docs/our-interpretation-of-the-nexus-format.md
@@ -47,6 +47,16 @@ More concretely this means that, e.g., for loading an `NXdetector` from a NH, th
 
 If the above yields no more than one item, the group can be loaded.
 
+## Datetime fields
+
+HDF5 does not support storing date and time information such as `np.datetime64`.
+`NXlog` and `NXevent_data` define dedicated attributes for fields that have to be interpreted as date and time, in particular [NXlog/time@start](https://manual.nexusformat.org/classes/base_classes/NXlog.html#nxlog-time-start-attribute) and [NXevent_data/event_time_zero@offset](https://manual.nexusformat.org/classes/base_classes/NXevent_data.html#nxevent-data-event-time-zero-offset-attribute).
+No *general* definition or intention is documented in the NF, but according to TR this is nevertheless standard.
+Due to the attribute naming mismatch in the two cases where it *is* specified, we have to assume that the naming is arbitrary.
+Therefore, we search *all* attributes of a field for a date and time offset, provided that the field's unit is a time unit.
+It is unclear what should be done in the case of multiple matches.
+As of April 2022 we ignore the date and time offsets in this case, since guessing which one to use based on the attribute name does not seem desirable.
+
 ## Bin edges
 
 For [NXdetector](https://manual.nexusformat.org/classes/base_classes/NXdetector.html) the NF defines a [time_of_flight](https://manual.nexusformat.org/classes/base_classes/NXdetector.html#nxdetector-time-of-flight-field) field, exceeding the data shape by one, i.e., it is meant as bin-edges.
diff --git a/src/scippnexus/nxevent_data.py b/src/scippnexus/nxevent_data.py
index 5340bbc0..5786c261 100644
--- a/src/scippnexus/nxevent_data.py
+++ b/src/scippnexus/nxevent_data.py
@@ -5,7 +5,7 @@
 import numpy as np
 import scipp as sc
 
-from ._common import to_plain_index, convert_time_to_datetime64
+from ._common import to_plain_index
 from .nxobject import NXobject, ScippIndex, NexusStructureError
 
 _event_dimension = "event"
@@ -57,11 +57,7 @@ def _getitem(self, select: ScippIndex) -> sc.DataArray:
         index = slice(start, stop, stride)
 
         event_index = self['event_index'][index].values
-        event_time_zero = self['event_time_zero']
-        event_time_zero = convert_time_to_datetime64(
-            event_time_zero[index],
-            start=event_time_zero.attrs.get('offset'),
-            group_path=self.name)
+        event_time_zero = self['event_time_zero'][index]
 
         num_event = self["event_time_offset"].shape[0]
         # Some files contain uint64 "max" indices, which turn into negatives during
diff --git a/src/scippnexus/nxlog.py b/src/scippnexus/nxlog.py
index 08673771..640da128 100644
--- a/src/scippnexus/nxlog.py
+++ b/src/scippnexus/nxlog.py
@@ -4,7 +4,6 @@
 from typing import List, Union
 
 import scipp as sc
-from ._common import convert_time_to_datetime64
 from .nxobject import NXobject, ScippIndex
 from .nxdata import NXdata
 
@@ -38,20 +37,11 @@ def _nxbase(self) -> NXdata:
         return NXdata(self._group, signal_name_default='value', axes=axes)
 
     def _getitem(self, select: ScippIndex) -> sc.DataArray:
-        data: sc.DataArray = self._nxbase[select]
-        # The 'time' field in NXlog contains extra properties 'start' and
-        # 'scaling_factor' that are not handled by NXdata. These are used
-        # to transform to a datetime-coord.
-        if 'time' in self:
-            if 'time' not in data.coords:
-                raise sc.DimensionError(
-                    "NXlog is time-dependent, but failed to load `time` dataset")
-            data.coords['time'] = convert_time_to_datetime64(
-                raw_times=data.coords.pop('time'),
-                start=self['time'].attrs.get('start'),
-                scaling_factor=self['time'].attrs.get('scaling_factor'),
-                group_path=self['time'].name)
-        return data
+        base = self._nxbase
+        # Field loads datetime offset attributes automatically, but for NXlog this
+        # may apparently be omitted and must then be interpreted as relative to epoch.
+        base.child_params['time'] = {'is_time': True}
+        return base[select]
 
     def _get_field_dims(self, name: str) -> Union[None, List[str]]:
         return self._nxbase._get_field_dims(name)
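For illustration, the epoch fallback added in `nxlog.py` means that an NXlog `time` field without any datetime attribute is read as offsets from the Unix epoch. A minimal sketch of the resulting values, assuming scipp's datetime arithmetic:

```python
import scipp as sc

# Values as stored in the file, e.g. an NXlog 'time' dataset in milliseconds.
raw = sc.arange('time', 3, unit='ms')
# is_time=True but no offset attribute found: fall back to the Unix epoch.
loaded = sc.epoch(unit='ms') + raw
print(loaded.values)  # ['1970-01-01T00:00:00.000', ..., '1970-01-01T00:00:00.002']
```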
diff --git a/src/scippnexus/nxobject.py b/src/scippnexus/nxobject.py
index 0bfaaf28..ddb7b306 100644
--- a/src/scippnexus/nxobject.py
+++ b/src/scippnexus/nxobject.py
@@ -2,7 +2,10 @@
 # Copyright (c) 2022 Scipp contributors (https://github.com/scipp)
 # @author Simon Heybrock
 from __future__ import annotations
+import re
 import warnings
+import datetime
+import dateutil.parser
 from enum import Enum, auto
 import functools
 from typing import List, Union, Any, Dict, Tuple, Protocol
@@ -14,6 +17,7 @@
 from ._hdf5_nexus import _ensure_supported_int_type, _warn_latin1_decode
 from .typing import H5Group, H5Dataset, ScippIndex
 from ._common import to_plain_index
+from ._common import convert_time_to_datetime64
 
 NXobjectIndex = Union[str, ScippIndex]
 
@@ -84,6 +88,9 @@ def __getitem__(self, name: str) -> Any:
     def __setitem__(self, name: str, val: Any):
         self._attrs[name] = val
 
+    def __iter__(self):
+        yield from self._attrs
+
     def get(self, name: str, default=None) -> Any:
         return self[name] if name in self else default
 
@@ -91,14 +98,57 @@ def keys(self):
         return self._attrs.keys()
 
 
+def _is_time(obj):
+    dummy = sc.empty(dims=[], shape=[], unit=obj.unit)
+    try:
+        dummy.to(unit='s')
+        return True
+    except sc.UnitError:
+        return False
+
+
+def _as_datetime(obj: Any):
+    if isinstance(obj, str):
+        try:
+            # NumPy and scipp cannot handle timezone information. We therefore
+            # apply it, i.e., convert to UTC. We would like to use dateutil
+            # directly, but with Python's datetime we do not get nanosecond
+            # precision. Therefore we combine numpy and dateutil parsing.
+            date_only = 'T' not in obj
+            if date_only:
+                return sc.datetime(obj)
+            date, time = obj.split('T')
+            time_and_timezone_offset = re.split(r'Z|\+|-', time)
+            time = time_and_timezone_offset[0]
+            if len(time_and_timezone_offset) == 1:
+                # No timezone, parse directly (scipp based on numpy)
+                return sc.datetime(f'{date}T{time}')
+            else:
+                # There is timezone info. Parse with dateutil.
+                dt = dateutil.parser.isoparse(obj)
+                dt = dt.replace(microsecond=0)  # fraction re-attached below
+                dt = dt.astimezone(datetime.timezone.utc)
+                dt = dt.replace(tzinfo=None).isoformat()
+                # We operate with string operations here and thus end up parsing date
+                # and time twice. The reason is that the timezone-offset arithmetic
+                # cannot be done, e.g., in nanoseconds without causing rounding errors.
+                if '.' in time:
+                    dt += f".{time.split('.')[1]}"
+                return sc.datetime(dt)
        except ValueError:
            pass
    return None


 class Field:
     """NeXus field.
 
     In HDF5 fields are represented as dataset.
     """
-    def __init__(self, dataset: H5Dataset, dims=None):
+    def __init__(self, dataset: H5Dataset, dims=None, is_time=None):
         self._dataset = dataset
         self._shape = list(self._dataset.shape)
+        self._is_time = is_time
         # NeXus treats [] and [1] interchangeably. In general this is ill-defined, but
         # the best we can do appears to be squeezing unless the file provides names for
         # dimensions. The shape property of this class does thus not necessarily return
@@ -137,6 +187,18 @@ def __getitem__(self, select) -> sc.Variable:
             self._dataset.read_direct(variable.values, source_sel=index)
         else:
             variable.values = self._dataset[index]
+        if self._is_time or _is_time(variable):
+            starts = []
+            for name in self.attrs:
+                if (dt := _as_datetime(self.attrs[name])) is not None:
+                    starts.append(dt)
+            if self._is_time and len(starts) == 0:
+                starts.append(sc.epoch(unit='ns'))
+            if len(starts) == 1:
+                variable = convert_time_to_datetime64(
+                    variable,
+                    start=starts[0],
+                    scaling_factor=self.attrs.get('scaling_factor'))
         return variable
 
     def __repr__(self) -> str:
@@ -208,6 +270,7 @@ class NXobject:
     """
     def __init__(self, group: H5Group):
         self._group = group
+        self.child_params = {}
 
     def _get_child(
             self,
@@ -220,7 +283,7 @@ def _get_child(
             item = self._group[name]
             if hasattr(item, 'shape'):
                 dims = self._get_field_dims(name) if use_field_dims else None
-                return Field(item, dims=dims)
+                return Field(item, dims=dims, **self.child_params.get(name, {}))
             else:
                 return _make(item)
         da = self._getitem(name)
@@ -322,9 +385,14 @@ def create_field(self, name: str, data: DimensionedArray, **kwargs) -> Field:
         values = data.values
         if data.dtype == sc.DType.string:
             values = np.array(data.values, dtype=object)
+        elif data.dtype == sc.DType.datetime64:
+            start = sc.epoch(unit=data.unit)
+            values = (data - start).values
         dataset = self._group.create_dataset(name, data=values, **kwargs)
         if data.unit is not None:
             dataset.attrs['units'] = str(data.unit)
+        if data.dtype == sc.DType.datetime64:
+            dataset.attrs['start'] = str(start.value)
         return Field(dataset, data.dims)
 
     def create_class(self, name: str, nx_class: NX_class) -> NXobject:
diff --git a/tests/nexus_test.py b/tests/nexus_test.py
index 2ae31d29..0124434c 100644
--- a/tests/nexus_test.py
+++ b/tests/nexus_test.py
@@ -227,6 +227,63 @@ def test_field_of_extended_ascii_in_ascii_encoded_dataset_is_loaded_correctly():
         sc.array(dims=['dim_0'], values=["run at rot=90°", "run at rot=90°x"]))
 
 
+def test_ms_field_with_second_datetime_attribute_loaded_as_ms_datetime(nxroot):
+    nxroot['mytime'] = sc.arange('ignored', 2, unit='ms')
+    nxroot['mytime'].attrs['start_time'] = '2022-12-12T12:13:14'
+    assert sc.identical(
+        nxroot['mytime'][...],
+        sc.datetimes(dims=['dim_0'],
+                     unit='ms',
+                     values=['2022-12-12T12:13:14.000', '2022-12-12T12:13:14.001']))
+
+
+def test_ns_field_with_second_datetime_attribute_loaded_as_ns_datetime(nxroot):
+    nxroot['mytime'] = sc.arange('ignored', 2, unit='ns')
+    nxroot['mytime'].attrs['start_time'] = '1970-01-01T00:00:00'
+    assert sc.identical(
+        nxroot['mytime'][...],
+        sc.datetimes(
+            dims=['dim_0'],
+            unit='ns',
+            values=['1970-01-01T00:00:00.000000000', '1970-01-01T00:00:00.000000001']))
+
+
+def test_second_field_with_ns_datetime_attribute_loaded_as_ns_datetime(nxroot):
+    nxroot['mytime'] = sc.arange('ignored', 2, unit='s')
+    nxroot['mytime'].attrs['start_time'] = '1984-01-01T00:00:00.000000000'
+    assert sc.identical(
+        nxroot['mytime'][...],
+        sc.datetimes(dims=['dim_0'],
+                     unit='ns',
+                     values=['1984-01-01T00:00:00', '1984-01-01T00:00:01']))
+
+
+@pytest.mark.parametrize('timezone,hhmm', [('Z', '12:00'), ('+04', '08:00'),
+                                           ('+00', '12:00'), ('-02', '14:00'),
+                                           ('+1130', '00:30'), ('-0930', '21:30'),
+                                           ('+11:30', '00:30'), ('-09:30', '21:30')])
+def test_timezone_information_in_datetime_attribute_is_applied(nxroot, timezone, hhmm):
+    nxroot['mytime'] = sc.scalar(value=3, unit='s')
+    nxroot['mytime'].attrs['start_time'] = f'1984-01-01T12:00:00{timezone}'
+    assert sc.identical(nxroot['mytime'][...],
+                        sc.datetime(unit='s', value=f'1984-01-01T{hhmm}:03'))
+
+
+def test_timezone_information_in_datetime_attribute_preserves_ns_precision(nxroot):
+    nxroot['mytime'] = sc.scalar(value=3, unit='s')
+    nxroot['mytime'].attrs['start_time'] = '1984-01-01T12:00:00.123456789+0200'
+    assert sc.identical(nxroot['mytime'][...],
+                        sc.datetime(unit='ns', value='1984-01-01T10:00:03.123456789'))
+
+
+def test_loads_bare_timestamps_if_multiple_candidate_datetime_offsets_found(nxroot):
+    offsets = sc.arange('ignored', 2, unit='ms')
+    nxroot['mytime'] = offsets
+    nxroot['mytime'].attrs['offset'] = '2022-12-12T12:13:14'
+    nxroot['mytime'].attrs['start_time'] = '2022-12-12T12:13:15'
+    assert sc.identical(nxroot['mytime'][...], offsets.rename(ignored='dim_0'))
+
+
 def create_event_data_ids_1234(group):
     group['event_id'] = sc.array(dims=[''], unit=None, values=[1, 2, 4, 1, 2, 2])
     group['event_time_offset'] = sc.array(dims=[''],
diff --git a/tests/nxdata_test.py b/tests/nxdata_test.py
index 55023cfe..0da0f0a8 100644
--- a/tests/nxdata_test.py
+++ b/tests/nxdata_test.py
@@ -1,4 +1,5 @@
 import h5py
+import numpy as np
 import scipp as sc
 from scippnexus import NXroot, NX_class
 import pytest
@@ -226,6 +227,15 @@ def test_create_field_from_variable(nxroot, unit):
     assert sc.identical(loaded, var.rename(xx=loaded.dim))
 
 
+def test_create_datetime_field_from_variable(nxroot):
+    var = sc.datetime(np.datetime64('now'), unit='ns') + sc.arange(
+        'time', 1, 4, dtype='int64', unit='ns')
+    nxroot.create_field('field', var)
+    loaded = nxroot['field'][...]
+    # NeXus does not support storing dim labels
+    assert sc.identical(loaded, var.rename(time=loaded.dim))
+
+
 @pytest.mark.parametrize("nx_class", [NX_class.NXdata, NX_class.NXlog])
 def test_create_class(nxroot, nx_class):
     group = nxroot.create_class('group', nx_class)
@@ -304,3 +314,20 @@ def test_unnamed_extra_dims_of_multidim_coords_are_squeezed(nxroot):
     assert data['xx'].ndim == 1
     assert data['xx'].shape == [2]
     assert sc.identical(data['xx'][...], xx['ignored', 0])
+
+
+def test_fields_with_datetime_attribute_are_loaded_as_datetime(nxroot):
+    da = sc.DataArray(
+        sc.epoch(unit='s') +
+        sc.array(dims=['xx', 'yy'], unit='s', values=[[1, 2, 3], [4, 5, 6]]))
+    da.coords['xx'] = da.data['yy', 0]
+    da.coords['xx2'] = da.data['yy', 1]
+    da.coords['yy'] = da.data['xx', 0]
+    data = nxroot.create_class('data1', NX_class.NXdata)
+    data.attrs['axes'] = da.dims
+    data.attrs['signal'] = 'signal'
+    data.create_field('signal', da.data)
+    data.create_field('xx', da.coords['xx'])
+    data.create_field('xx2', da.coords['xx2'])
+    data.create_field('yy', da.coords['yy'])
+    assert sc.identical(data[...], da)
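The timezone tests above exercise the two-stage parsing in `_as_datetime`: dateutil resolves the timezone (Python datetimes only reach microsecond precision), and the fractional seconds are re-attached textually so numpy can parse them at nanosecond precision; a timezone offset is always a whole number of minutes, so shifting to UTC cannot change the fraction. A standalone sketch of this idea, assuming a timezone-aware input string; the helper name `parse_iso_utc_ns` is illustrative and not part of the PR:

```python
import datetime
import re

import dateutil.parser
import numpy as np

def parse_iso_utc_ns(iso: str) -> np.datetime64:
    """Convert a timezone-aware ISO8601 string to naive UTC at ns precision."""
    time = iso.split('T')[1]
    # dateutil resolves the timezone; drop the fraction since Python's
    # datetime would truncate it to microseconds anyway.
    dt = dateutil.parser.isoparse(iso).replace(microsecond=0)
    dt = dt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    out = dt.isoformat()
    if '.' in time:
        # Re-attach the full fractional part, stripped of any trailing
        # timezone designator, for numpy to parse at ns precision.
        out += '.' + re.split(r'Z|\+|-', time.split('.')[1])[0]
    return np.datetime64(out)

print(parse_iso_utc_ns('1984-01-01T12:00:00.123456789+0200'))
# 1984-01-01T10:00:00.123456789
```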