file: support saving a cropped region of the h5 file

lumicks · Jul 20, 2023 · 8380ffa · 8380ffa
1 parent 11dbf1b
commit 8380ffa
Show file tree

Hide file tree

Showing 6 changed files with 289 additions and 16 deletions.
diff --git a/changelog.md b/changelog.md
@@ -10,6 +10,7 @@
 * Added `PowerSpectrum.identify_peaks()` method to the [`PowerSpectrum`](https://lumicks-pylake.readthedocs.io/en/v1.0.0/_api/lumicks.pylake.force_calibration.power_spectrum.PowerSpectrum.html) class. This method uses probability to identify peaks in the spectrum that are not due to the movement of beads in an optical trap. This is beta functionality. While usable, this has not yet been tested in a large number of different scenarios. The API can still be subject to change without any prior deprecation notice!
 * Added support for accessing `Kymo`, `Scan` and `PointScan` by path (e.g. `file["Kymograph"]["my_kymo"]` or `file["Kymograph/my_kymo"]`).
 * Added support for slicing `PointScan`.
+* Added the ability to specify a cropping region when exporting to an h5-file using `file.save_as(filename, crop_time_range=(starting_timestamp, ending_timestamp))`.
 
 #### Other changes
 

diff --git a/docs/whatsnew/1.2.0/1_2_0.rst b/docs/whatsnew/1.2.0/1_2_0.rst
@@ -18,3 +18,10 @@ emission wavelength using the `from_wavelength()` method of :data:`~lumicks.pyla
 
     Kymographs showing tracks in three color channels using the default colormaps (left) and colormaps
     corresponding to the actual emission colors (right).
+
+Cropping h5 files
+-----------------
+
+You can now use `File.save_as(filename, crop_time_range=(start_timestamp, stop_timestamp))` to export a specific time range to a new `h5` file.
+This is useful when you want to export only a specific part of the timeline or, for instance, a partial kymograph.
+Exporting a partial file helps keep file size down and makes it easier to share only the relevant parts of your data with others.
diff --git a/lumicks/pylake/channel.py b/lumicks/pylake/channel.py
@@ -575,6 +575,25 @@ def from_dataset(dset, y_label="y", calibration=None):
             calibration=calibration,
         )
 
+    def to_dataset(self, parent, name, **kwargs):
+        """Write this continuous data to a new h5 dataset
+
+        The source data is stored under `name`. The attributes `Kind` (set to "Continuous"),
+        `Sample rate (Hz)`, `Start time (ns)` and `Stop time (ns)` are set on the new dataset.
+
+        Parameters
+        ----------
+        parent : h5py.Group or h5py.File
+            Location to create the dataset in.
+        name : str
+            Name of the new dataset.
+        **kwargs
+            Forwarded to h5py.Group.create_dataset()
+
+        Returns
+        -------
+        dataset
+            The dataset returned by `parent.create_dataset()`.
+        """
+        dataset = parent.create_dataset(name, data=self._src_data, **kwargs)
+        attributes = dataset.attrs
+        attributes["Kind"] = "Continuous"
+        attributes["Sample rate (Hz)"] = self.sample_rate
+        attributes["Start time (ns)"] = self.start
+        attributes["Stop time (ns)"] = self.stop
+        return dataset
+
     @property
     def data(self) -> npt.ArrayLike:
         if self._cached_data is None:
@@ -686,6 +705,26 @@ def __len__(self):
             calibration=calibration,
         )
 
+    def to_dataset(self, parent, name, **kwargs):
+        """Write this time series data to a new h5 dataset
+
+        Timestamps and values are stored together as a compound array with the fields
+        `Timestamp` (int64) and `Value` (float). The attributes `Kind`, `Start time (ns)` and
+        `Stop time (ns)` are set on the new dataset.
+
+        Parameters
+        ----------
+        parent : h5py.Group or h5py.File
+            Location to create the dataset in.
+        name : str
+            Name of the new dataset.
+        **kwargs
+            Forwarded to h5py.Group.create_dataset()
+
+        Returns
+        -------
+        dataset
+            The dataset returned by `parent.create_dataset()`.
+        """
+        compound_type = np.dtype([("Timestamp", np.int64), ("Value", float)])
+        timestamps = np.asarray(self.timestamps)
+        values = np.asarray(self.data)
+        # Pair samples up to the shorter of the two inputs (what `zip()` would do), but fill the
+        # fields with vectorized assignments rather than building one Python tuple per sample.
+        num_samples = min(len(timestamps), len(values))
+        data = np.empty(num_samples, dtype=compound_type)
+        data["Timestamp"] = timestamps[:num_samples]
+        data["Value"] = values[:num_samples]
+
+        dset = parent.create_dataset(name, data=data, **kwargs)
+        # NOTE(review): `Kind` is stored as bytes here, whereas the neighbouring `to_dataset`
+        # implementations store a str. Left unchanged; confirm that readers handle both.
+        dset.attrs["Kind"] = b"TimeSeries"
+        dset.attrs["Start time (ns)"] = self.start
+        dset.attrs["Stop time (ns)"] = self.stop
+        return dset
+
     @property
     def data(self) -> npt.ArrayLike:
         if self._cached_data is None:
@@ -762,6 +801,22 @@ def _apply_mask(self, mask):
     def from_dataset(dset, y_label="y"):
         return Slice(TimeTags(dset))
 
+    def to_dataset(self, parent, name, **kwargs):
+        """Write this time tag data to a new h5 dataset
+
+        The data is stored under `name` and the attribute `Kind` is set to "TimeTags".
+
+        Parameters
+        ----------
+        parent : h5py.Group or h5py.File
+            Location to create the dataset in.
+        name : str
+            Name of the new dataset.
+        **kwargs
+            Forwarded to h5py.Group.create_dataset()
+
+        Returns
+        -------
+        dataset
+            The dataset returned by `parent.create_dataset()`.
+        """
+        dataset = parent.create_dataset(name, data=self.data, **kwargs)
+        dataset.attrs["Kind"] = "TimeTags"
+        return dataset
+
     @property
     def timestamps(self) -> npt.ArrayLike:
         # For time tag data, the data is the timestamps!

diff --git a/lumicks/pylake/detail/h5_helper.py b/lumicks/pylake/detail/h5_helper.py
@@ -2,42 +2,106 @@
 from fnmatch import fnmatch
 
 
-def write_h5(h5_file, output_filename, compression_level=5, omit_data=None):
+def _write_numerical_data(
+    lk_file, out_file, name, node, compression_level, crop_time_range, verbose
+):
+    """Write a numerical dataset to the output file using gzip compression
+
+    Without a cropping range, `node` is copied together with its attributes. With a cropping
+    range, the item `lk_file[name]` is sliced to that range and the result is written through
+    the `to_dataset()` method of its source. Items without data in the range are not written.
+    """
+    gzip_options = {"compression": "gzip", "compression_opts": compression_level}
+
+    if not crop_time_range:
+        out_file.create_dataset(name, data=node, **gzip_options)
+        out_file[name].attrs.update(node.attrs)
+        return
+
+    cropped = lk_file[name][slice(*crop_time_range)]
+    if cropped:
+        cropped._src.to_dataset(out_file, name, **gzip_options)
+    elif verbose:
+        print(f"{name} dropped from dataset (no data within time window)")
+
+
+def _write_cropped_metadata(lk_file, out_file, name, node, crop_time_range, verbose):
+    """Write a non-numerical (object dtype) dataset, adjusting its time range when cropping
+
+    Without a cropping range, `node` is copied together with its attributes. With a cropping
+    range, the item `lk_file[name]` is sliced to that range:
+
+    - If the sliced item overlaps the range and has a non-zero duration, `node` is written and
+      its `Start time (ns)` and `Stop time (ns)` attributes are overridden with the cropped
+      values.
+    - If it does not, the item is left out of the output file.
+    - If the item cannot be sliced (`IndexError` or `TypeError`), it is written unmodified.
+    """
+
+    def write_node():
+        out_file.create_dataset(name, data=node)
+        out_file[name].attrs.update(node.attrs)
+
+    if not crop_time_range:
+        write_node()
+        return
+
+    # Only the cropping itself is guarded. Errors raised while writing to the output file are
+    # not cropping failures and should propagate, as they do when no cropping is requested.
+    try:
+        # Items know how to crop themselves. Slice once and read the resulting time range.
+        cropped = lk_file[name][slice(*crop_time_range)]
+        start, stop = cropped.start, cropped.stop
+        in_range = (
+            stop >= crop_time_range[0] and start < crop_time_range[1] and (stop - start) > 0
+        )
+    except (IndexError, TypeError):
+        # The item does not support cropping: keep it as is, as the message states, rather
+        # than silently dropping it from the exported file.
+        if verbose:
+            print(f"{name} not cropped")
+        write_node()
+        return
+
+    if in_range:
+        write_node()
+        out_file[name].attrs["Start time (ns)"] = start
+        out_file[name].attrs["Stop time (ns)"] = stop
+    elif verbose:
+        print(f"{name} removed from file (out of cropping range)")
+
+
+def write_h5(
+    lk_file,
+    output_filename,
+    compression_level=5,
+    omit_data=None,
+    *,
+    crop_time_range=None,
+    verbose=False,
+):
     """Write a modified h5 file to disk.
 
     Parameters
     ----------
-    h5_file : h5py.File
-        loaded h5 file
+    lk_file : lk.File
+        pylake file handle
     output_filename : str
         Output file name.
     compression_level : int
         Compression level for gzip compression.
     omit_data : str or iterable of str, optional
         Which data sets to omit. Should be a set of h5 paths.
+    crop_time_range : tuple of np.int64, optional
+        Specify a time interval to crop to (tuple of a start and stop time). Interval must be
+        specified in nanoseconds since epoch (the same format as timestamps).
+    verbose : bool, optional
+        Print verbose output. Default: False.
     """
     omit_data = {omit_data} if isinstance(omit_data, str) else omit_data
+    h5_file = lk_file.h5
 
     with h5py.File(output_filename, "w") as out_file:
 
         def traversal_function(name, node):
             if omit_data and any([fnmatch(name, o) for o in omit_data]):
-                print(f"Omitted {name} from export")
+                if verbose:
+                    print(f"Omitted {name} from export")
                 return
 
             if isinstance(node, h5py.Dataset):
                 if node.dtype.kind == "O":
-                    # Non-numerical data doesn't support compression
-                    out_file.create_dataset(name, data=node)
+                    # Non-numerical (object dtype) data is written without compression
+                    _write_cropped_metadata(lk_file, out_file, name, node, crop_time_range, verbose)
                 else:
-                    # Numerical data can benefit a lot from compression
-                    out_file.create_dataset(
-                        name, data=node, compression="gzip", compression_opts=compression_level
+                    # Numerical data is written with gzip compression
+                    _write_numerical_data(
+                        lk_file, out_file, name, node, compression_level, crop_time_range, verbose
                     )
+
             else:
+                # Groups are recreated with their attributes. Dataset attributes are handled
+                # by the two dataset writers called above.
                 out_file.create_group(f"{name}")
-
-            out_file[name].attrs.update(node.attrs)
+                out_file[name].attrs.update(node.attrs)
 
         h5_file.visititems(traversal_function)
         out_file.attrs.update(h5_file.attrs)
diff --git a/lumicks/pylake/file.py b/lumicks/pylake/file.py
@@ -299,7 +299,9 @@ def notes(self) -> Dict[str, Note]:
         """Notes stored in the file"""
         return self._get_object_dictionary("Note", Note)
 
-    def save_as(self, filename, compression_level=5, omit_data=None):
+    def save_as(
+        self, filename, compression_level=5, omit_data=None, *, crop_time_range=None, verbose=True
+    ):
         """Write a modified h5 file to disk.
 
         When transferring data, it can be beneficial to omit some channels from the h5 file, or use
@@ -318,6 +320,11 @@ def save_as(self, filename, compression_level=5, omit_data=None):
             Which data sets to omit. Should be a set of h5 paths (e.g. {"Force HF/Force 1y"}).
             `fnmatch` patterns are used to specify which fields to omit, which means you can use
             wildcards as well (see examples below).
+        crop_time_range : tuple of np.int64, optional
+            Specify a time interval to crop to (tuple of a start and stop time). Interval must be
+            specified in nanoseconds since epoch (the same format as timestamps).
+        verbose : bool, optional
+            Print verbose output. Default: True.
 
         Examples
         --------
@@ -344,5 +351,16 @@ def save_as(self, filename, compression_level=5, omit_data=None):
 
             # Omit Scan "1"
             file.save_as("no_scan_1.h5", omit_data="Scan/1")
+
+            # Save only the region that contains the kymograph `kymo1`.
+            kymo = file.kymos["kymo1"]
+            file.save_as("only_kymo.h5", crop_time_range=(kymo.start, kymo.stop))
         """
-        write_h5(self.h5, filename, compression_level, omit_data)
+        write_h5(
+            self,
+            filename,
+            compression_level,
+            omit_data,
+            crop_time_range=crop_time_range,
+            verbose=verbose,
+        )