Merge branch 'main' into some-typing-improvements
SimonHeybrock authored Apr 13, 2022
2 parents ea09fca + c1185a6 commit 7871899
Showing 10 changed files with 189 additions and 48 deletions.
1 change: 1 addition & 0 deletions conda/meta.yaml
@@ -9,6 +9,7 @@ requirements:
     - setuptools
   run:
     - python>=3.8
+    - python-dateutil
     - scipp
     - h5py

4 changes: 4 additions & 0 deletions pyproject.toml
@@ -10,6 +10,10 @@ build-backend = "setuptools.build_meta"
 [tool.pytest.ini_options]
 addopts = "-ra -v"
 testpaths = "tests"
+filterwarnings = [
+    "error",
+    "ignore::UserWarning",
+]
 
 [tool.mypy]
 mypy_path = "src"
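For reference, pytest applies `filterwarnings` entries in order, with later entries taking precedence, so this configuration turns every warning into a test failure except `UserWarning`. A minimal sketch of the effect (hypothetical tests, not part of this commit):

import warnings
import pytest

def test_any_warning_becomes_an_error():
    # "error" promotes un-ignored warnings to exceptions during tests.
    with pytest.raises(DeprecationWarning):
        warnings.warn("old API", DeprecationWarning)

def test_user_warning_is_still_ignored():
    # "ignore::UserWarning" takes precedence over "error" for UserWarning.
    warnings.warn("heads-up", UserWarning)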
1 change: 1 addition & 0 deletions setup.cfg
@@ -23,6 +23,7 @@ package_dir =
     = src
 packages = find:
 install_requires =
+    python-dateutil
     scipp>=0.12
     h5py
 python_requires = >=3.8
37 changes: 12 additions & 25 deletions src/scippnexus/_common.py
@@ -9,7 +9,6 @@
 
 def convert_time_to_datetime64(
         raw_times: sc.Variable,
-        group_path: str,
         start: str = None,
         scaling_factor: Union[float, np.float_] = None) -> sc.Variable:
     """
@@ -25,40 +24,28 @@ def convert_time_to_datetime64(
     Args:
         raw_times: The raw time data from a nexus file.
-        group_path: The path within the nexus file to the log being read.
-            Used to generate warnings if loading the log fails.
         start: Optional, the start time of the log in an ISO8601
             string. If not provided, defaults to the beginning of the
             unix epoch (1970-01-01T00:00:00).
         scaling_factor: Optional, the scaling factor between the provided
             time series data and the unit of the raw_times Variable. If
             not provided, defaults to 1 (a no-op scaling factor).
     """
-    try:
-        raw_times_ns = sc.to_unit(raw_times, sc.units.ns, copy=False)
-    except sc.UnitError:
-        raise sc.UnitError(
-            f"The units of time in the entry at "
-            f"'{group_path}/time{{units}}' must be convertible to seconds, "
-            f"but this cannot be done for '{raw_times.unit}'. Skipping "
-            f"loading group at '{group_path}'.")
-
-    try:
-        _start_ts = sc.scalar(value=np.datetime64(start or "1970-01-01T00:00:00"),
-                              unit=sc.units.ns,
-                              dtype=sc.DType.datetime64)
-    except ValueError:
-        raise ValueError(
-            f"The date string '{start}' in the entry at "
-            f"'{group_path}/time@start' failed to parse as an ISO8601 date. "
-            f"Skipping loading group at '{group_path}'")
+    if (raw_times.dtype
+            in (sc.DType.float64, sc.DType.float32)) or scaling_factor is not None:
+        unit = sc.units.ns
+    else:
+        # determine more precise unit
+        ratio = sc.scalar(1.0, unit=start.unit) / sc.scalar(
+            1.0, unit=raw_times.unit).to(unit=start.unit)
+        unit = start.unit if ratio.value < 1.0 else raw_times.unit
 
     if scaling_factor is None:
-        times = raw_times_ns.astype(sc.DType.int64, copy=False)
+        times = raw_times
     else:
-        _scale = sc.scalar(value=scaling_factor)
-        times = (raw_times_ns * _scale).astype(sc.DType.int64, copy=False)
-    return _start_ts + times
+        times = raw_times * sc.scalar(value=scaling_factor)
+    return start.to(unit=unit, copy=False) + times.to(
+        dtype=sc.DType.int64, unit=unit, copy=False)
 
 
 def _to_canonical_select(dims: List[str],
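With `group_path` gone, error reporting is left to scipp, and the result unit is now the finer of the offsets' unit and `start`'s unit (falling back to ns for float offsets or when a scaling factor is given). A rough sketch of the new behaviour, assuming integer millisecond offsets and a start time in seconds:

import scipp as sc
from scippnexus._common import convert_time_to_datetime64

raw = sc.array(dims=['time'], values=[0, 1, 2], unit='ms')
start = sc.datetime('2022-04-13T00:00:00', unit='s')
# ms is finer than s, so the datetimes come back in ms and no offset is lost:
out = convert_time_to_datetime64(raw, start=start)
assert out.unit == sc.Unit('ms')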
10 changes: 10 additions & 0 deletions src/scippnexus/docs/our-interpretation-of-the-nexus-format.md
@@ -47,6 +47,16 @@ More concretely this means that, e.g., for loading an `NXdetector` from a NH, th
 
 If the above yields no more than one item, the group can be loaded.
 
+## Datetime fields
+
+HDF5 does not support storing date and time information such as `np.datetime64`.
+`NXlog` and `NXevent_data` define dedicated attributes for fields that are to be interpreted as date and time information, in particular [NXlog/time@start](https://manual.nexusformat.org/classes/base_classes/NXlog.html#nxlog-time-start-attribute) and [NXevent_data/event_time_offset@offset](https://manual.nexusformat.org/classes/base_classes/NXevent_data.html#nxevent-data-event-time-offset-field).
+No *general* definition or intention is documented in the NF, but according to TR this is nevertheless standard.
+Given the attribute-naming mismatch between the two cases where it *is* specified, we have to assume that the naming is arbitrary.
+Therefore, we search *all* attributes of a field for a date and time offset, provided that the field's unit is a time unit.
+It is unclear what should be done in the case of multiple matches.
+As of April 2022 we ignore the date and time offsets in this case, since guessing which one to use based on the attribute name does not seem desirable.
 
 ## Bin edges
 
 For [NXdetector](https://manual.nexusformat.org/classes/base_classes/NXdetector.html) the NF defines a [time_of_flight](https://manual.nexusformat.org/classes/base_classes/NXdetector.html#nxdetector-time-of-flight-field) field, exceeding the data shape by one, i.e., it is meant as bin-edges.
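In file terms, the rule added above means that any attribute whose value parses as an ISO8601 datetime marks a time-unit field as datetime data. A sketch of such a field (hypothetical layout, assuming the `scippnexus.File` entry point behaves like the `nxroot` fixture in the tests below):

import h5py
import scippnexus as snx

with h5py.File('example.nxs', 'w') as f:
    ds = f.create_dataset('mytime', data=[0, 1])
    ds.attrs['units'] = 'ms'
    ds.attrs['start_time'] = '2022-12-12T12:13:14'  # any ISO8601-valued attribute

with snx.File('example.nxs', 'r') as f:
    times = f['mytime'][...]  # loaded as datetime64 in ms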
8 changes: 2 additions & 6 deletions src/scippnexus/nxevent_data.py
@@ -5,7 +5,7 @@
 import numpy as np
 import scipp as sc
 
-from ._common import to_plain_index, convert_time_to_datetime64
+from ._common import to_plain_index
 from .nxobject import NXobject, ScippIndex, NexusStructureError
 
 _event_dimension = "event"
@@ -57,11 +57,7 @@ def _getitem(self, select: ScippIndex) -> sc.DataArray:
         index = slice(start, stop, stride)
 
         event_index = self['event_index'][index].values
-        event_time_zero = self['event_time_zero']
-        event_time_zero = convert_time_to_datetime64(
-            event_time_zero[index],
-            start=event_time_zero.attrs.get('offset'),
-            group_path=self.name)
+        event_time_zero = self['event_time_zero'][index]
 
         num_event = self["event_time_offset"].shape[0]
         # Some files contain uint64 "max" indices, which turn into negatives during
20 changes: 5 additions & 15 deletions src/scippnexus/nxlog.py
@@ -4,7 +4,6 @@
 
 from typing import List, Union
 import scipp as sc
-from ._common import convert_time_to_datetime64
 from .nxobject import NXobject, ScippIndex
 from .nxdata import NXdata
 
@@ -38,20 +37,11 @@ def _nxbase(self) -> NXdata:
         return NXdata(self._group, signal_name_default='value', axes=axes)
 
     def _getitem(self, select: ScippIndex) -> sc.DataArray:
-        data: sc.DataArray = self._nxbase[select]
-        # The 'time' field in NXlog contains extra properties 'start' and
-        # 'scaling_factor' that are not handled by NXdata. These are used
-        # to transform to a datetime-coord.
-        if 'time' in self:
-            if 'time' not in data.coords:
-                raise sc.DimensionError(
-                    "NXlog is time-dependent, but failed to load `time` dataset")
-            data.coords['time'] = convert_time_to_datetime64(
-                raw_times=data.coords.pop('time'),
-                start=self['time'].attrs.get('start'),
-                scaling_factor=self['time'].attrs.get('scaling_factor'),
-                group_path=self['time'].name)
-        return data
+        base = self._nxbase
+        # Field loads datetime offset attributes automatically, but for NXlog this
+        # may apparently be omitted and must then be interpreted as relative to epoch.
+        base.child_params['time'] = {'is_time': True}
+        return base[select]
 
     def _get_field_dims(self, name: str) -> Union[None, List[str]]:
         return self._nxbase._get_field_dims(name)
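Instead of converting the `time` coord itself, `NXlog` now only flags the field via `child_params`; `Field.__getitem__` performs the conversion and, because of `is_time=True`, substitutes the Unix epoch when no offset attribute exists. A sketch of that fallback (hypothetical file, same `scippnexus.File` assumption as above):

import h5py
import scippnexus as snx

with h5py.File('log.nxs', 'w') as f:
    log = f.create_group('mylog')
    log.attrs['NX_class'] = 'NXlog'
    log['value'] = [1.0, 2.0]
    log['time'] = [0, 10]
    log['time'].attrs['units'] = 's'  # note: no 'start' attribute

with snx.File('log.nxs', 'r') as f:
    da = f['mylog'][...]
    # da.coords['time'] is datetime64, measured from 1970-01-01T00:00:00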
72 changes: 70 additions & 2 deletions src/scippnexus/nxobject.py
@@ -2,7 +2,10 @@
 # Copyright (c) 2022 Scipp contributors (https://github.com/scipp)
 # @author Simon Heybrock
 from __future__ import annotations
+import re
 import warnings
+import datetime
+import dateutil.parser
 from enum import Enum, auto
 import functools
 from typing import List, Union, Any, Dict, Tuple, Protocol
@@ -14,6 +17,7 @@
 from ._hdf5_nexus import _ensure_supported_int_type, _warn_latin1_decode
 from .typing import H5Group, H5Dataset, ScippIndex
 from ._common import to_plain_index
+from ._common import convert_time_to_datetime64
 
 NXobjectIndex = Union[str, ScippIndex]
@@ -84,21 +88,67 @@ def __getitem__(self, name: str) -> Any:
     def __setitem__(self, name: str, val: Any):
         self._attrs[name] = val
 
+    def __iter__(self):
+        yield from self._attrs
+
     def get(self, name: str, default=None) -> Any:
         return self[name] if name in self else default
 
     def keys(self):
         return self._attrs.keys()
 
 
+def _is_time(obj):
+    dummy = sc.empty(dims=[], shape=[], unit=obj.unit)
+    try:
+        dummy.to(unit='s')
+        return True
+    except sc.UnitError:
+        return False
+
+
+def _as_datetime(obj: Any):
+    if isinstance(obj, str):
+        try:
+            # NumPy and scipp cannot handle timezone information. We therefore apply it,
+            # i.e., convert to UTC.
+            # Would like to use dateutil directly, but with Python's datetime we do not
+            # get nanosecond precision. Therefore we combine numpy and dateutil parsing.
+            date_only = 'T' not in obj
+            if date_only:
+                return sc.datetime(obj)
+            date, time = obj.split('T')
+            time_and_timezone_offset = re.split(r'Z|\+|-', time)
+            time = time_and_timezone_offset[0]
+            if len(time_and_timezone_offset) == 1:
+                # No timezone, parse directly (scipp based on numpy)
+                return sc.datetime(f'{date}T{time}')
+            else:
+                # There is timezone info. Parse with dateutil.
+                dt = dateutil.parser.isoparse(obj)
+                dt = dt.replace(microsecond=0)  # handled by numpy
+                dt = dt.astimezone(datetime.timezone.utc)
+                dt = dt.replace(tzinfo=None).isoformat()
+                # We operate with string operations here and thus end up parsing date
+                # and time twice. The reason is that the timezone-offset arithmetic
+                # cannot be done, e.g., in nanoseconds without causing rounding errors.
+                if '.' in time:
+                    dt += f".{time.split('.')[1]}"
+                return sc.datetime(dt)
+        except ValueError:
+            pass
+    return None
+
+
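# For illustration, the effect of _as_datetime on attribute strings as used in
# the tests added below:
#   _as_datetime('2022-12-12T12:13:14')                -> sc.datetime('2022-12-12T12:13:14')
#   _as_datetime('1984-01-01T12:00:00.123456789+0200') -> sc.datetime('1984-01-01T10:00:00.123456789')
#     (timezone folded into UTC; fractional seconds re-attached as a string so
#      nanosecond precision survives)
#   _as_datetime('not a datetime')                     -> None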
 class Field:
     """NeXus field.
     In HDF5 fields are represented as datasets.
     """
-    def __init__(self, dataset: H5Dataset, dims=None):
+    def __init__(self, dataset: H5Dataset, dims=None, is_time=None):
         self._dataset = dataset
         self._shape = list(self._dataset.shape)
+        self._is_time = is_time
         # NeXus treats [] and [1] interchangeably. In general this is ill-defined, but
         # the best we can do appears to be squeezing unless the file provides names for
         # dimensions. The shape property of this class does thus not necessarily return
@@ -137,6 +187,18 @@ def __getitem__(self, select) -> sc.Variable:
             self._dataset.read_direct(variable.values, source_sel=index)
         else:
             variable.values = self._dataset[index]
+        if self._is_time or _is_time(variable):
+            starts = []
+            for name in self.attrs:
+                if (dt := _as_datetime(self.attrs[name])) is not None:
+                    starts.append(dt)
+            if self._is_time and len(starts) == 0:
+                starts.append(sc.epoch(unit='ns'))
+            if len(starts) == 1:
+                variable = convert_time_to_datetime64(
+                    variable,
+                    start=starts[0],
+                    scaling_factor=self.attrs.get('scaling_factor'))
         return variable
 
     def __repr__(self) -> str:
@@ -208,6 +270,7 @@ class NXobject:
     """
     def __init__(self, group: H5Group):
        self._group = group
+        self.child_params = {}
 
    def _get_child(
            self,
@@ -220,7 +283,7 @@ def _get_child(
         item = self._group[name]
         if hasattr(item, 'shape'):
             dims = self._get_field_dims(name) if use_field_dims else None
-            return Field(item, dims=dims)
+            return Field(item, dims=dims, **self.child_params.get(name, {}))
         else:
             return _make(item)
         da = self._getitem(name)
@@ -322,9 +385,14 @@ def create_field(self, name: str, data: DimensionedArray, **kwargs) -> Field:
         values = data.values
         if data.dtype == sc.DType.string:
             values = np.array(data.values, dtype=object)
+        elif data.dtype == sc.DType.datetime64:
+            start = sc.epoch(unit=data.unit)
+            values = (data - start).values
         dataset = self._group.create_dataset(name, data=values, **kwargs)
         if data.unit is not None:
             dataset.attrs['units'] = str(data.unit)
+        if data.dtype == sc.DType.datetime64:
+            dataset.attrs['start'] = str(start.value)
         return Field(dataset, data.dims)
 
     def create_class(self, name: str, nx_class: NX_class) -> NXobject:
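Together with the loading path above, `create_field` gives a datetime round trip: datetimes are written as integer offsets plus a `start` attribute recording the epoch, which `Field.__getitem__` later detects. A minimal sketch (file name hypothetical, and assuming `scippnexus.File` exposes `create_field` like any `NXobject`):

import scipp as sc
import scippnexus as snx

dt = sc.datetimes(dims=['time'], unit='ns',
                  values=['2022-04-13T12:00:00', '2022-04-13T12:00:01'])
with snx.File('out.nxs', 'w') as f:
    f.create_field('timestamps', dt)
    # stored as int64 offsets with attrs 'units' = 'ns' and 'start' = the epoch

with snx.File('out.nxs', 'r') as f:
    assert f['timestamps'][...].dtype == sc.DType.datetime64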
57 changes: 57 additions & 0 deletions tests/nexus_test.py
@@ -227,6 +227,63 @@ def test_field_of_extended_ascii_in_ascii_encoded_dataset_is_loaded_correctly():
         sc.array(dims=['dim_0'], values=["run at rot=90°", "run at rot=90°x"]))
 
 
+def test_ms_field_with_second_datetime_attribute_loaded_as_ms_datetime(nxroot):
+    nxroot['mytime'] = sc.arange('ignored', 2, unit='ms')
+    nxroot['mytime'].attrs['start_time'] = '2022-12-12T12:13:14'
+    assert sc.identical(
+        nxroot['mytime'][...],
+        sc.datetimes(dims=['dim_0'],
+                     unit='ms',
+                     values=['2022-12-12T12:13:14.000', '2022-12-12T12:13:14.001']))
+
+
+def test_ns_field_with_second_datetime_attribute_loaded_as_ns_datetime(nxroot):
+    nxroot['mytime'] = sc.arange('ignored', 2, unit='ns')
+    nxroot['mytime'].attrs['start_time'] = '1970-01-01T00:00:00'
+    assert sc.identical(
+        nxroot['mytime'][...],
+        sc.datetimes(
+            dims=['dim_0'],
+            unit='ns',
+            values=['1970-01-01T00:00:00.000000000', '1970-01-01T00:00:00.000000001']))
+
+
+def test_second_field_with_ns_datetime_attribute_loaded_as_ns_datetime(nxroot):
+    nxroot['mytime'] = sc.arange('ignored', 2, unit='s')
+    nxroot['mytime'].attrs['start_time'] = '1984-01-01T00:00:00.000000000'
+    assert sc.identical(
+        nxroot['mytime'][...],
+        sc.datetimes(dims=['dim_0'],
+                     unit='ns',
+                     values=['1984-01-01T00:00:00', '1984-01-01T00:00:01']))
+
+
+@pytest.mark.parametrize('timezone,hhmm', [('Z', '12:00'), ('+04', '08:00'),
+                                           ('+00', '12:00'), ('-02', '14:00'),
+                                           ('+1130', '00:30'), ('-0930', '21:30'),
+                                           ('+11:30', '00:30'), ('-09:30', '21:30')])
+def test_timezone_information_in_datetime_attribute_is_applied(nxroot, timezone, hhmm):
+    nxroot['mytime'] = sc.scalar(value=3, unit='s')
+    nxroot['mytime'].attrs['start_time'] = f'1984-01-01T12:00:00{timezone}'
+    assert sc.identical(nxroot['mytime'][...],
+                        sc.datetime(unit='s', value=f'1984-01-01T{hhmm}:03'))
+
+
+def test_timezone_information_in_datetime_attribute_preserves_ns_precision(nxroot):
+    nxroot['mytime'] = sc.scalar(value=3, unit='s')
+    nxroot['mytime'].attrs['start_time'] = '1984-01-01T12:00:00.123456789+0200'
+    assert sc.identical(nxroot['mytime'][...],
+                        sc.datetime(unit='ns', value='1984-01-01T10:00:03.123456789'))
+
+
+def test_loads_bare_timestamps_if_multiple_candidate_datetime_offsets_found(nxroot):
+    offsets = sc.arange('ignored', 2, unit='ms')
+    nxroot['mytime'] = offsets
+    nxroot['mytime'].attrs['offset'] = '2022-12-12T12:13:14'
+    nxroot['mytime'].attrs['start_time'] = '2022-12-12T12:13:15'
+    assert sc.identical(nxroot['mytime'][...], offsets.rename(ignored='dim_0'))
+
+
 def create_event_data_ids_1234(group):
     group['event_id'] = sc.array(dims=[''], unit=None, values=[1, 2, 4, 1, 2, 2])
     group['event_time_offset'] = sc.array(dims=[''],