Merge branch 'main' into some-typing-improvements
SimonHeybrock authored Apr 13, 2022
2 parents ea09fca + c1185a6 commit 7871899
Showing 10 changed files with 189 additions and 48 deletions.
1 change: 1 addition & 0 deletions conda/meta.yaml
@@ -9,6 +9,7 @@ requirements:
     - setuptools
   run:
     - python>=3.8
+    - python-dateutil
     - scipp
     - h5py

4 changes: 4 additions & 0 deletions pyproject.toml
@@ -10,6 +10,10 @@ build-backend = "setuptools.build_meta"
 [tool.pytest.ini_options]
 addopts = "-ra -v"
 testpaths = "tests"
+filterwarnings = [
+    "error",
+    "ignore::UserWarning",
+]
 
 [tool.mypy]
 mypy_path = "src"
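For reference, pytest applies `filterwarnings` entries in order, with later entries taking precedence, so this configuration turns every warning into a test failure except `UserWarning`. A minimal sketch of the effect (hypothetical tests, not part of this commit):

import warnings
import pytest

def test_any_warning_becomes_an_error():
    # "error" promotes un-ignored warnings to exceptions during tests.
    with pytest.raises(DeprecationWarning):
        warnings.warn("old API", DeprecationWarning)

def test_user_warning_is_still_ignored():
    # "ignore::UserWarning" takes precedence over "error" for UserWarning.
    warnings.warn("heads-up", UserWarning)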
1 change: 1 addition & 0 deletions setup.cfg
@@ -23,6 +23,7 @@ package_dir =
     = src
 packages = find:
 install_requires =
+    python-dateutil
     scipp>=0.12
     h5py
 python_requires = >=3.8
37 changes: 12 additions & 25 deletions src/scippnexus/_common.py
@@ -9,7 +9,6 @@
 
 def convert_time_to_datetime64(
         raw_times: sc.Variable,
-        group_path: str,
         start: str = None,
         scaling_factor: Union[float, np.float_] = None) -> sc.Variable:
     """
@@ -25,40 +24,28 @@ def convert_time_to_datetime64(
     Args:
         raw_times: The raw time data from a nexus file.
-        group_path: The path within the nexus file to the log being read.
-            Used to generate warnings if loading the log fails.
         start: Optional, the start time of the log in an ISO8601
             string. If not provided, defaults to the beginning of the
             unix epoch (1970-01-01T00:00:00).
         scaling_factor: Optional, the scaling factor between the provided
             time series data and the unit of the raw_times Variable. If
             not provided, defaults to 1 (a no-op scaling factor).
     """
-    try:
-        raw_times_ns = sc.to_unit(raw_times, sc.units.ns, copy=False)
-    except sc.UnitError:
-        raise sc.UnitError(
-            f"The units of time in the entry at "
-            f"'{group_path}/time{{units}}' must be convertible to seconds, "
-            f"but this cannot be done for '{raw_times.unit}'. Skipping "
-            f"loading group at '{group_path}'.")
-
-    try:
-        _start_ts = sc.scalar(value=np.datetime64(start or "1970-01-01T00:00:00"),
-                              unit=sc.units.ns,
-                              dtype=sc.DType.datetime64)
-    except ValueError:
-        raise ValueError(
-            f"The date string '{start}' in the entry at "
-            f"'{group_path}/time@start' failed to parse as an ISO8601 date. "
-            f"Skipping loading group at '{group_path}'")
+    if (raw_times.dtype
+            in (sc.DType.float64, sc.DType.float32)) or scaling_factor is not None:
+        unit = sc.units.ns
+    else:
+        # determine more precise unit
+        ratio = sc.scalar(1.0, unit=start.unit) / sc.scalar(
+            1.0, unit=raw_times.unit).to(unit=start.unit)
+        unit = start.unit if ratio.value < 1.0 else raw_times.unit
 
     if scaling_factor is None:
-        times = raw_times_ns.astype(sc.DType.int64, copy=False)
+        times = raw_times
     else:
-        _scale = sc.scalar(value=scaling_factor)
-        times = (raw_times_ns * _scale).astype(sc.DType.int64, copy=False)
-    return _start_ts + times
+        times = raw_times * sc.scalar(value=scaling_factor)
+    return start.to(unit=unit, copy=False) + times.to(
+        dtype=sc.DType.int64, unit=unit, copy=False)
 
 
 def _to_canonical_select(dims: List[str],
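With `group_path` gone, error reporting is left to scipp, and the result unit is now the finer of the offsets' unit and `start`'s unit (falling back to ns for float offsets or when a scaling factor is given). A rough sketch of the new behaviour, assuming integer millisecond offsets and a start time in seconds:

import scipp as sc
from scippnexus._common import convert_time_to_datetime64

raw = sc.array(dims=['time'], values=[0, 1, 2], unit='ms')
start = sc.datetime('2022-04-13T00:00:00', unit='s')
# ms is finer than s, so the datetimes come back in ms and no offset is lost:
out = convert_time_to_datetime64(raw, start=start)
assert out.unit == sc.Unit('ms')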
10 changes: 10 additions & 0 deletions src/scippnexus/docs/our-interpretation-of-the-nexus-format.md
@@ -47,6 +47,16 @@ More concretely this means that, e.g., for loading an `NXdetector` from a NH, th
 
 If the above yields no more than one item, the group can be loaded.
 
+## Datetime fields
+
+HDF5 does not support storing date and time information such as `np.datetime64`.
+`NXlog` and `NXevent_data` define dedicated attributes for fields that are to be interpreted as date and time information, in particular [NXlog/time@start](https://manual.nexusformat.org/classes/base_classes/NXlog.html#nxlog-time-start-attribute) and [NXevent_data/event_time_offset@offset](https://manual.nexusformat.org/classes/base_classes/NXevent_data.html#nxevent-data-event-time-offset-field).
+No *general* definition or intention is documented in the NF, but according to TR this is nevertheless standard.
+Given the attribute-naming mismatch between the two cases where it *is* specified, we have to assume that the naming is arbitrary.
+Therefore, we search *all* attributes of a field for a date and time offset, provided that the field's unit is a time unit.
+It is unclear what should be done in the case of multiple matches.
+As of April 2022 we ignore the date and time offsets in this case, since guessing which one to use based on the attribute name does not seem desirable.
 
 ## Bin edges
 
 For [NXdetector](https://manual.nexusformat.org/classes/base_classes/NXdetector.html) the NF defines a [time_of_flight](https://manual.nexusformat.org/classes/base_classes/NXdetector.html#nxdetector-time-of-flight-field) field, exceeding the data shape by one, i.e., it is meant as bin-edges.
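In file terms, the rule added above means that any attribute whose value parses as an ISO8601 datetime marks a time-unit field as datetime data. A sketch of such a field (hypothetical layout, assuming the `scippnexus.File` entry point behaves like the `nxroot` fixture in the tests below):

import h5py
import scippnexus as snx

with h5py.File('example.nxs', 'w') as f:
    ds = f.create_dataset('mytime', data=[0, 1])
    ds.attrs['units'] = 'ms'
    ds.attrs['start_time'] = '2022-12-12T12:13:14'  # any ISO8601-valued attribute

with snx.File('example.nxs', 'r') as f:
    times = f['mytime'][...]  # loaded as datetime64 in ms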
8 changes: 2 additions & 6 deletions src/scippnexus/nxevent_data.py
@@ -5,7 +5,7 @@
 import numpy as np
 import scipp as sc
 
-from ._common import to_plain_index, convert_time_to_datetime64
+from ._common import to_plain_index
 from .nxobject import NXobject, ScippIndex, NexusStructureError
 
 _event_dimension = "event"
@@ -57,11 +57,7 @@ def _getitem(self, select: ScippIndex) -> sc.DataArray:
         index = slice(start, stop, stride)
 
         event_index = self['event_index'][index].values
-        event_time_zero = self['event_time_zero']
-        event_time_zero = convert_time_to_datetime64(
-            event_time_zero[index],
-            start=event_time_zero.attrs.get('offset'),
-            group_path=self.name)
+        event_time_zero = self['event_time_zero'][index]
 
         num_event = self["event_time_offset"].shape[0]
         # Some files contain uint64 "max" indices, which turn into negatives during
20 changes: 5 additions & 15 deletions src/scippnexus/nxlog.py
@@ -4,7 +4,6 @@
 
 from typing import List, Union
 import scipp as sc
-from ._common import convert_time_to_datetime64
 from .nxobject import NXobject, ScippIndex
 from .nxdata import NXdata
 
@@ -38,20 +37,11 @@ def _nxbase(self) -> NXdata:
         return NXdata(self._group, signal_name_default='value', axes=axes)
 
     def _getitem(self, select: ScippIndex) -> sc.DataArray:
-        data: sc.DataArray = self._nxbase[select]
-        # The 'time' field in NXlog contains extra properties 'start' and
-        # 'scaling_factor' that are not handled by NXdata. These are used
-        # to transform to a datetime-coord.
-        if 'time' in self:
-            if 'time' not in data.coords:
-                raise sc.DimensionError(
-                    "NXlog is time-dependent, but failed to load `time` dataset")
-            data.coords['time'] = convert_time_to_datetime64(
-                raw_times=data.coords.pop('time'),
-                start=self['time'].attrs.get('start'),
-                scaling_factor=self['time'].attrs.get('scaling_factor'),
-                group_path=self['time'].name)
-        return data
+        base = self._nxbase
+        # Field loads datetime offset attributes automatically, but for NXlog this
+        # may apparently be omitted and must then be interpreted as relative to epoch.
+        base.child_params['time'] = {'is_time': True}
+        return base[select]
 
     def _get_field_dims(self, name: str) -> Union[None, List[str]]:
         return self._nxbase._get_field_dims(name)
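Instead of converting the `time` coord itself, `NXlog` now only flags the field via `child_params`; `Field.__getitem__` performs the conversion and, because of `is_time=True`, substitutes the Unix epoch when no offset attribute exists. A sketch of that fallback (hypothetical file, same `scippnexus.File` assumption as above):

import h5py
import scippnexus as snx

with h5py.File('log.nxs', 'w') as f:
    log = f.create_group('mylog')
    log.attrs['NX_class'] = 'NXlog'
    log['value'] = [1.0, 2.0]
    log['time'] = [0, 10]
    log['time'].attrs['units'] = 's'  # note: no 'start' attribute

with snx.File('log.nxs', 'r') as f:
    da = f['mylog'][...]
    # da.coords['time'] is datetime64, measured from 1970-01-01T00:00:00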
72 changes: 70 additions & 2 deletions src/scippnexus/nxobject.py
@@ -2,7 +2,10 @@
 # Copyright (c) 2022 Scipp contributors (https://github.com/scipp)
 # @author Simon Heybrock
 from __future__ import annotations
+import re
 import warnings
+import datetime
+import dateutil.parser
 from enum import Enum, auto
 import functools
 from typing import List, Union, Any, Dict, Tuple, Protocol
@@ -14,6 +17,7 @@
 from ._hdf5_nexus import _ensure_supported_int_type, _warn_latin1_decode
 from .typing import H5Group, H5Dataset, ScippIndex
 from ._common import to_plain_index
+from ._common import convert_time_to_datetime64
 
 NXobjectIndex = Union[str, ScippIndex]
@@ -84,21 +88,67 @@ def __getitem__(self, name: str) -> Any:
     def __setitem__(self, name: str, val: Any):
         self._attrs[name] = val
 
+    def __iter__(self):
+        yield from self._attrs
+
     def get(self, name: str, default=None) -> Any:
         return self[name] if name in self else default
 
     def keys(self):
         return self._attrs.keys()
 
 
+def _is_time(obj):
+    dummy = sc.empty(dims=[], shape=[], unit=obj.unit)
+    try:
+        dummy.to(unit='s')
+        return True
+    except sc.UnitError:
+        return False
+
+
+def _as_datetime(obj: Any):
+    if isinstance(obj, str):
+        try:
+            # NumPy and scipp cannot handle timezone information. We therefore apply it,
+            # i.e., convert to UTC.
+            # Would like to use dateutil directly, but with Python's datetime we do not
+            # get nanosecond precision. Therefore we combine numpy and dateutil parsing.
+            date_only = 'T' not in obj
+            if date_only:
+                return sc.datetime(obj)
+            date, time = obj.split('T')
+            time_and_timezone_offset = re.split(r'Z|\+|-', time)
+            time = time_and_timezone_offset[0]
+            if len(time_and_timezone_offset) == 1:
+                # No timezone, parse directly (scipp based on numpy)
+                return sc.datetime(f'{date}T{time}')
+            else:
+                # There is timezone info. Parse with dateutil.
+                dt = dateutil.parser.isoparse(obj)
+                dt = dt.replace(microsecond=0)  # handled by numpy
+                dt = dt.astimezone(datetime.timezone.utc)
+                dt = dt.replace(tzinfo=None).isoformat()
+                # We operate with string operations here and thus end up parsing date
+                # and time twice. The reason is that the timezone-offset arithmetic
+                # cannot be done, e.g., in nanoseconds without causing rounding errors.
+                if '.' in time:
+                    dt += f".{time.split('.')[1]}"
+                return sc.datetime(dt)
+        except ValueError:
+            pass
+    return None
+
+
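# For illustration, the effect of _as_datetime on attribute strings as used in
# the tests added below:
#   _as_datetime('2022-12-12T12:13:14')                -> sc.datetime('2022-12-12T12:13:14')
#   _as_datetime('1984-01-01T12:00:00.123456789+0200') -> sc.datetime('1984-01-01T10:00:00.123456789')
#     (timezone folded into UTC; fractional seconds re-attached as a string so
#      nanosecond precision survives)
#   _as_datetime('not a datetime')                     -> None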
 class Field:
     """NeXus field.
     In HDF5 fields are represented as datasets.
     """
-    def __init__(self, dataset: H5Dataset, dims=None):
+    def __init__(self, dataset: H5Dataset, dims=None, is_time=None):
         self._dataset = dataset
         self._shape = list(self._dataset.shape)
+        self._is_time = is_time
         # NeXus treats [] and [1] interchangeably. In general this is ill-defined, but
         # the best we can do appears to be squeezing unless the file provides names for
         # dimensions. The shape property of this class does thus not necessarily return
@@ -137,6 +187,18 @@ def __getitem__(self, select) -> sc.Variable:
             self._dataset.read_direct(variable.values, source_sel=index)
         else:
             variable.values = self._dataset[index]
+        if self._is_time or _is_time(variable):
+            starts = []
+            for name in self.attrs:
+                if (dt := _as_datetime(self.attrs[name])) is not None:
+                    starts.append(dt)
+            if self._is_time and len(starts) == 0:
+                starts.append(sc.epoch(unit='ns'))
+            if len(starts) == 1:
+                variable = convert_time_to_datetime64(
+                    variable,
+                    start=starts[0],
+                    scaling_factor=self.attrs.get('scaling_factor'))
         return variable
 
     def __repr__(self) -> str:
@@ -208,6 +270,7 @@ class NXobject:
     """
     def __init__(self, group: H5Group):
        self._group = group
+        self.child_params = {}
 
    def _get_child(
            self,
@@ -220,7 +283,7 @@ def _get_child(
         item = self._group[name]
         if hasattr(item, 'shape'):
             dims = self._get_field_dims(name) if use_field_dims else None
-            return Field(item, dims=dims)
+            return Field(item, dims=dims, **self.child_params.get(name, {}))
         else:
             return _make(item)
         da = self._getitem(name)
@@ -322,9 +385,14 @@ def create_field(self, name: str, data: DimensionedArray, **kwargs) -> Field:
         values = data.values
         if data.dtype == sc.DType.string:
             values = np.array(data.values, dtype=object)
+        elif data.dtype == sc.DType.datetime64:
+            start = sc.epoch(unit=data.unit)
+            values = (data - start).values
         dataset = self._group.create_dataset(name, data=values, **kwargs)
         if data.unit is not None:
             dataset.attrs['units'] = str(data.unit)
+        if data.dtype == sc.DType.datetime64:
+            dataset.attrs['start'] = str(start.value)
         return Field(dataset, data.dims)
 
     def create_class(self, name: str, nx_class: NX_class) -> NXobject:
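Together with the loading path above, `create_field` gives a datetime round trip: datetimes are written as integer offsets plus a `start` attribute recording the epoch, which `Field.__getitem__` later detects. A minimal sketch (file name hypothetical, and assuming `scippnexus.File` exposes `create_field` like any `NXobject`):

import scipp as sc
import scippnexus as snx

dt = sc.datetimes(dims=['time'], unit='ns',
                  values=['2022-04-13T12:00:00', '2022-04-13T12:00:01'])
with snx.File('out.nxs', 'w') as f:
    f.create_field('timestamps', dt)
    # stored as int64 offsets with attrs 'units' = 'ns' and 'start' = the epoch

with snx.File('out.nxs', 'r') as f:
    assert f['timestamps'][...].dtype == sc.DType.datetime64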
57 changes: 57 additions & 0 deletions tests/nexus_test.py
@@ -227,6 +227,63 @@ def test_field_of_extended_ascii_in_ascii_encoded_dataset_is_loaded_correctly():
         sc.array(dims=['dim_0'], values=["run at rot=90°", "run at rot=90°x"]))
 
 
+def test_ms_field_with_second_datetime_attribute_loaded_as_ms_datetime(nxroot):
+    nxroot['mytime'] = sc.arange('ignored', 2, unit='ms')
+    nxroot['mytime'].attrs['start_time'] = '2022-12-12T12:13:14'
+    assert sc.identical(
+        nxroot['mytime'][...],
+        sc.datetimes(dims=['dim_0'],
+                     unit='ms',
+                     values=['2022-12-12T12:13:14.000', '2022-12-12T12:13:14.001']))
+
+
+def test_ns_field_with_second_datetime_attribute_loaded_as_ns_datetime(nxroot):
+    nxroot['mytime'] = sc.arange('ignored', 2, unit='ns')
+    nxroot['mytime'].attrs['start_time'] = '1970-01-01T00:00:00'
+    assert sc.identical(
+        nxroot['mytime'][...],
+        sc.datetimes(
+            dims=['dim_0'],
+            unit='ns',
+            values=['1970-01-01T00:00:00.000000000', '1970-01-01T00:00:00.000000001']))
+
+
+def test_second_field_with_ns_datetime_attribute_loaded_as_ns_datetime(nxroot):
+    nxroot['mytime'] = sc.arange('ignored', 2, unit='s')
+    nxroot['mytime'].attrs['start_time'] = '1984-01-01T00:00:00.000000000'
+    assert sc.identical(
+        nxroot['mytime'][...],
+        sc.datetimes(dims=['dim_0'],
+                     unit='ns',
+                     values=['1984-01-01T00:00:00', '1984-01-01T00:00:01']))
+
+
+@pytest.mark.parametrize('timezone,hhmm', [('Z', '12:00'), ('+04', '08:00'),
+                                           ('+00', '12:00'), ('-02', '14:00'),
+                                           ('+1130', '00:30'), ('-0930', '21:30'),
+                                           ('+11:30', '00:30'), ('-09:30', '21:30')])
+def test_timezone_information_in_datetime_attribute_is_applied(nxroot, timezone, hhmm):
+    nxroot['mytime'] = sc.scalar(value=3, unit='s')
+    nxroot['mytime'].attrs['start_time'] = f'1984-01-01T12:00:00{timezone}'
+    assert sc.identical(nxroot['mytime'][...],
+                        sc.datetime(unit='s', value=f'1984-01-01T{hhmm}:03'))
+
+
+def test_timezone_information_in_datetime_attribute_preserves_ns_precision(nxroot):
+    nxroot['mytime'] = sc.scalar(value=3, unit='s')
+    nxroot['mytime'].attrs['start_time'] = '1984-01-01T12:00:00.123456789+0200'
+    assert sc.identical(nxroot['mytime'][...],
+                        sc.datetime(unit='ns', value='1984-01-01T10:00:03.123456789'))
+
+
+def test_loads_bare_timestamps_if_multiple_candidate_datetime_offsets_found(nxroot):
+    offsets = sc.arange('ignored', 2, unit='ms')
+    nxroot['mytime'] = offsets
+    nxroot['mytime'].attrs['offset'] = '2022-12-12T12:13:14'
+    nxroot['mytime'].attrs['start_time'] = '2022-12-12T12:13:15'
+    assert sc.identical(nxroot['mytime'][...], offsets.rename(ignored='dim_0'))
+
+
 def create_event_data_ids_1234(group):
     group['event_id'] = sc.array(dims=[''], unit=None, values=[1, 2, 4, 1, 2, 2])
     group['event_time_offset'] = sc.array(dims=[''],