Skip to content

Commit

Permalink
file: support saving a cropped region of the h5 file
Browse files Browse the repository at this point in the history
  • Loading branch information
JoepVanlier committed Jul 20, 2023
1 parent 11dbf1b commit 8380ffa
Show file tree
Hide file tree
Showing 6 changed files with 289 additions and 16 deletions.
1 change: 1 addition & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
* Added `PowerSpectrum.identify_peaks()` method to the [`PowerSpectrum`](https://lumicks-pylake.readthedocs.io/en/v1.0.0/_api/lumicks.pylake.force_calibration.power_spectrum.PowerSpectrum.html) class. This method uses probability to identify peaks in the spectrum that are not due to the movement of beads in an optical trap. This is beta functionality. While usable, this has not yet been tested in a large number of different scenarios. The API can still be subject to change without any prior deprecation notice!
* Added support for accessing `Kymo`, `Scan` and `PointScan` by path (e.g. `file["Kymograph"]["my_kymo"]` or `file["Kymograph/my_kymo"]`).
* Added support for slicing `PointScan`.
* Added the ability to specify a cropping region when exporting to an h5-file using `file.save_as(filename, crop_time_range=(starting_timestamp, ending_timestamp))`.

#### Other changes

Expand Down
7 changes: 7 additions & 0 deletions docs/whatsnew/1.2.0/1_2_0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,10 @@ emission wavelength using the `from_wavelength()` method of :data:`~lumicks.pyla

Kymographs showing tracks in three color channels using the default colormaps (left) and colormaps
corresponding to the actual emission colors (right).

Cropping h5 files
-----------------

You can now use `File.save_as(filename, crop_time_range=(start_timestamp, stop_timestamp))` to export a specific time range to a new `h5` file.
This is useful when you want to export only a specific part of the timeline or a partial kymograph, for instance.
Exporting a partial file helps keep file size down and makes it easier to share only the relevant parts of your data with others.
55 changes: 55 additions & 0 deletions lumicks/pylake/channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,25 @@ def from_dataset(dset, y_label="y", calibration=None):
calibration=calibration,
)

def to_dataset(self, parent, name, **kwargs):
    """Store the source data in a new h5 dataset tagged as "Continuous"

    The dataset contents are taken from `self._src_data`; the sample rate and the start
    and stop time are stored as attributes on the created dataset.

    Parameters
    ----------
    parent : h5py.Group or h5py.File
        location to save to.
    name : str
        name of the new dataset
    **kwargs
        forwarded to h5py.Group.create_dataset()

    Returns
    -------
    The dataset object returned by `parent.create_dataset()`.
    """
    dataset = parent.create_dataset(name, data=self._src_data, **kwargs)

    # Attributes are written in a fixed order, one assignment per entry.
    attributes = (
        ("Kind", "Continuous"),
        ("Sample rate (Hz)", self.sample_rate),
        ("Start time (ns)", self.start),
        ("Stop time (ns)", self.stop),
    )
    for key, value in attributes:
        dataset.attrs[key] = value

    return dataset

@property
def data(self) -> npt.ArrayLike:
if self._cached_data is None:
Expand Down Expand Up @@ -686,6 +705,26 @@ def __len__(self):
calibration=calibration,
)

def to_dataset(self, parent, name, **kwargs):
    """Store timestamps and values in a new h5 dataset tagged as "TimeSeries"

    The dataset holds one record per sample with an integer "Timestamp" field and a
    floating point "Value" field. The start and stop time are stored as attributes on
    the created dataset.

    Parameters
    ----------
    parent : h5py.Group or h5py.File
        location to save to.
    name : str
        name of the new dataset
    **kwargs
        forwarded to h5py.Group.create_dataset()

    Returns
    -------
    The dataset object returned by `parent.create_dataset()`.
    """
    compound_type = np.dtype([("Timestamp", np.int64), ("Value", float)])
    timestamps = np.asarray(self.timestamps)
    values = np.asarray(self.data)

    # Pair samples up to the length of the shorter sequence.
    num_samples = min(len(timestamps), len(values))

    # Fill the record fields column-wise rather than creating one Python tuple per sample;
    # this keeps the conversion inside NumPy for long time series.
    data = np.empty(num_samples, dtype=compound_type)
    data["Timestamp"] = timestamps[:num_samples]
    data["Value"] = values[:num_samples]

    dset = parent.create_dataset(name, data=data, **kwargs)
    dset.attrs["Kind"] = b"TimeSeries"
    dset.attrs["Start time (ns)"] = self.start
    dset.attrs["Stop time (ns)"] = self.stop
    return dset

@property
def data(self) -> npt.ArrayLike:
if self._cached_data is None:
Expand Down Expand Up @@ -762,6 +801,22 @@ def _apply_mask(self, mask):
def from_dataset(dset, y_label="y"):
return Slice(TimeTags(dset))

def to_dataset(self, parent, name, **kwargs):
    """Store `self.data` in a new h5 dataset tagged as "TimeTags"

    Parameters
    ----------
    parent : h5py.Group or h5py.File
        location to save to.
    name : str
        name of the new dataset
    **kwargs
        forwarded to h5py.Group.create_dataset()

    Returns
    -------
    The dataset object returned by `parent.create_dataset()`.
    """
    dataset = parent.create_dataset(name, data=self.data, **kwargs)
    # The kind is the only attribute written for this dataset.
    dataset.attrs["Kind"] = "TimeTags"
    return dataset

@property
def timestamps(self) -> npt.ArrayLike:
# For time tag data, the data is the timestamps!
Expand Down
86 changes: 75 additions & 11 deletions lumicks/pylake/detail/h5_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,106 @@
from fnmatch import fnmatch


def write_h5(h5_file, output_filename, compression_level=5, omit_data=None):
def _write_numerical_data(
    lk_file, out_file, name, node, compression_level, crop_time_range, verbose
):
    """Write a numerical dataset to the output file using gzip compression

    Without a crop range, `node` is copied together with its attributes. With a crop
    range, `lk_file[name]` is sliced to that range and the sliced source writes itself
    via `to_dataset()`; when the slice is empty nothing is written.

    Parameters
    ----------
    lk_file : lk.File
        pylake file handle; only accessed when cropping.
    out_file : h5py.File
        file to write to.
    name : str
        h5 path of the dataset.
    node : h5py.Dataset
        dataset being exported.
    compression_level : int
        Compression level for gzip compression.
    crop_time_range : tuple of np.int64 or None
        start and stop time to crop to.
    verbose : bool
        Print a message when a dataset is dropped.
    """
    gzip_options = {"compression": "gzip", "compression_opts": compression_level}

    if not crop_time_range:
        # No cropping requested: copy the data and attributes as they are.
        out_file.create_dataset(name, data=node, **gzip_options)
        out_file[name].attrs.update(node.attrs)
        return

    cropped = lk_file[name][slice(*crop_time_range)]
    if cropped:
        cropped._src.to_dataset(out_file, name, **gzip_options)
    elif verbose:
        print(f"{name} dropped from dataset (no data within time window)")


def _write_cropped_metadata(lk_file, out_file, name, node, crop_time_range, verbose):
    """Write a non-numerical dataset, overriding its time range when cropping

    Without a crop range, `node` is copied together with its attributes. With a crop
    range, `lk_file[name]` is sliced to that range and the start and stop of the sliced
    item replace the "Start time (ns)" and "Stop time (ns)" attributes of the copy. The
    item is only written when the sliced item overlaps the crop range and has a non-zero
    duration.

    Parameters
    ----------
    lk_file : lk.File
        pylake file handle; only accessed when cropping.
    out_file : h5py.File
        file to write to.
    name : str
        h5 path of the dataset.
    node : h5py.Dataset
        dataset being exported.
    crop_time_range : tuple of np.int64 or None
        start and stop time to crop to.
    verbose : bool
        Print a message when an item is removed or could not be cropped.
    """

    def write_node():
        out_file.create_dataset(name, data=node)
        out_file[name].attrs.update(node.attrs)

    if not crop_time_range:
        write_node()
        return

    # Override time ranges. Items know how to crop themselves.
    try:
        # Crop once and read both bounds from the result, rather than slicing the item
        # separately for each bound.
        cropped = lk_file[name][slice(*crop_time_range)]
        start, stop = cropped.start, cropped.stop

        overlaps = stop >= crop_time_range[0] and start < crop_time_range[1]
        if overlaps and (stop - start) > 0:
            write_node()
            out_file[name].attrs["Start time (ns)"] = start
            out_file[name].attrs["Stop time (ns)"] = stop
        elif verbose:
            print(f"{name} removed from file (out of cropping range)")
    except (IndexError, TypeError):
        # NOTE(review): nothing is written to the output file on this path, even though
        # the message reads "not cropped" -- confirm that omitting the item is intended.
        if verbose:
            print(f"{name} not cropped")


def write_h5(
    lk_file,
    output_filename,
    compression_level=5,
    omit_data=None,
    *,
    crop_time_range=None,
    verbose=False,
):
    """Write a modified h5 file to disk.

    Every node of `lk_file.h5` is visited. Groups are recreated with their attributes,
    non-numerical datasets are written without compression and numerical datasets are
    written with gzip compression. When `crop_time_range` is given, datasets are cropped
    to that time range while being written.

    Parameters
    ----------
    lk_file : lk.File
        pylake file handle
    output_filename : str
        Output file name.
    compression_level : int
        Compression level for gzip compression.
    omit_data : str or iterable of str, optional
        Which data sets to omit. Should be a set of h5 paths.
    crop_time_range : tuple of np.int64
        Specify a time interval to crop to (tuple of a start and stop time). Interval must be
        specified in nanoseconds since epoch (the same format as timestamps).
    verbose : bool, optional.
        Print verbose output. Default: False.
    """
    if omit_data is None:
        omit_patterns = ()
    elif isinstance(omit_data, str):
        omit_patterns = (omit_data,)
    else:
        # The patterns are checked again for every visited node, so materialize them once;
        # a one-shot iterable would otherwise be exhausted after the first node.
        omit_patterns = tuple(omit_data)
    h5_file = lk_file.h5

    with h5py.File(output_filename, "w") as out_file:

        def traversal_function(name, node):
            if any(fnmatch(name, pattern) for pattern in omit_patterns):
                if verbose:
                    print(f"Omitted {name} from export")
                return

            if isinstance(node, h5py.Dataset):
                if node.dtype.kind == "O":
                    # Non-numerical data doesn't support compression
                    _write_cropped_metadata(lk_file, out_file, name, node, crop_time_range, verbose)
                else:
                    # Numerical data can benefit a lot from compression
                    _write_numerical_data(
                        lk_file, out_file, name, node, compression_level, crop_time_range, verbose
                    )
            else:
                out_file.create_group(f"{name}")
                # Dataset attributes are handled by the write helpers; only groups are
                # updated here.
                out_file[name].attrs.update(node.attrs)

        h5_file.visititems(traversal_function)
        out_file.attrs.update(h5_file.attrs)
22 changes: 20 additions & 2 deletions lumicks/pylake/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,9 @@ def notes(self) -> Dict[str, Note]:
"""Notes stored in the file"""
return self._get_object_dictionary("Note", Note)

def save_as(self, filename, compression_level=5, omit_data=None):
def save_as(
self, filename, compression_level=5, omit_data=None, *, crop_time_range=None, verbose=True
):
"""Write a modified h5 file to disk.
When transferring data, it can be beneficial to omit some channels from the h5 file, or use
Expand All @@ -318,6 +320,11 @@ def save_as(self, filename, compression_level=5, omit_data=None):
Which data sets to omit. Should be a set of h5 paths (e.g. {"Force HF/Force 1y"}).
`fnmatch` patterns are used to specify which fields to omit, which means you can use
wildcards as well (see examples below).
crop_time_range : tuple of np.int64, optional
Specify a time interval to crop to (tuple of a start and stop time). Interval must be
specified in nanoseconds since epoch (the same format as timestamps).
verbose : bool, optional
Print verbose output. Default: True.
Examples
--------
Expand All @@ -344,5 +351,16 @@ def save_as(self, filename, compression_level=5, omit_data=None):
# Omit Scan "1"
file.save_as("no_scan_1.h5", omit_data="Scan/1")
# Save only the region that contains the kymograph `kymo1`.
kymo = file.kymos["kymo1"]
file.save_as("only_kymo.h5", crop_time_range=(kymo.start, kymo.stop))
"""
write_h5(self.h5, filename, compression_level, omit_data)
write_h5(
self,
filename,
compression_level,
omit_data,
crop_time_range=crop_time_range,
verbose=verbose,
)
Loading

0 comments on commit 8380ffa

Please sign in to comment.