Support Fitsio (#54)
* add subfolder for io funcs

* minimal SDSS dr16 fits load success  (fixes #25)

* add docs for SDSS and w.i.p. fits support
cbyrohl authored Aug 23, 2023
1 parent 97797e6 commit 0891683
Showing 9 changed files with 286 additions and 51 deletions.
6 changes: 5 additions & 1 deletion docs/supported_data.md
@@ -12,17 +12,21 @@ If you want to use a dataset that is not listed here, read on [here](dataset_str
| [Gaia](https://www.cosmos.esa.int/web/gaia/dr3) | :material-database-check-outline:[^1] | *Observations* of a billion nearby stars |
| [Illustris](https://www.illustris-project.org/) | :material-check-all: | Cosmological galaxy formation *simulations* |
| [LGalaxies](customs/lgalaxies.md) | :material-check-all: | Semi-analytical model for [Millennium](https://wwwmpa.mpa-garching.mpg.de/galform/virgo/millennium/) simulations |
| [SDSS DR16](https://www.sdss.org/dr16/) | :material-check: | *Observations* for millions of galaxies |
| [SIMBA](http://simba.roe.ac.uk/) | :material-check-all: | Cosmological galaxy formation *simulations* |
| [TNG](https://www.tng-project.org/) | :material-check-all: | Cosmological galaxy formation *simulations* |
| TNG-Cluster | :material-check-all: | Cosmological zoom-in galaxy formation *simulations* |


A :material-check-all: checkmark indicates support out-of-the-box, a :material-check: checkmark indicates support by creating a suitable configuration file.

A :material-check-all: checkmark indicates support out-of-the-box, a :material-check: checkmark indicates work-in-progress support or the need to create a suitable configuration file.
A :material-database-check-outline: checkmark indicates support for converted HDF5 versions of the original data.


# File-format requirements

As of now, two underlying file formats are supported: hdf5 and zarr. Multi-file hdf5 is also supported: in this case a directory is passed as *path*, containing only hdf5 files of the pattern *prefix.XXX.hdf5*, where *prefix* is determined automatically and *XXX* is a contiguous sequence of integers indicating the order in which the hdf5 files are merged. The hdf5 files are expected to share the same structure, and all fields, i.e. hdf5 datasets, are concatenated along their first axis.
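
For example, a chunked snapshot spread across such a directory can be loaded by pointing *load* at the directory itself (the path below is a placeholder):

```python
from scida import load

# hypothetical directory containing snap_099.0.hdf5, snap_099.1.hdf5, ...
ds = load("/data/sim/snapdir_099")
```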

Support for FITS is work-in-progress; see [here](../tutorial/observations/#fits-files) for a proof of concept.

[^1]: The HDF5 version of GAIA DR3 is available [here](https://www.tng-project.org/data/obs/).
42 changes: 42 additions & 0 deletions docs/tutorial/observations.md
@@ -163,3 +163,45 @@ We discuss more advanced and interactive visualization methods [here](../visuali
1. The *compute()* on `im2d` results in a two-dimensional array which we can display.

![2D histogram example](../images/simple_hist2d_obs.png)


## FITS files

Observations are often stored in [FITS](https://en.wikipedia.org/wiki/FITS) files. Support in scida is work-in-progress
and requires the [astropy](https://www.astropy.org/) package.

Here we demonstrate loading the SDSS DR16 catalog:

```pycon
>>> from scida import load
>>> import matplotlib.pyplot as plt
>>> import numpy as np
>>> path = "/virgotng/mpia/obs/SDSS/specObj-dr16.fits"
>>> ds = load(path)
>>>
>>> cx = ds.data["CX"].compute()
>>> cy = ds.data["CY"].compute()
>>> cz = ds.data["CZ"].compute()
>>> z = ds.data["Z"].compute()
>>>
>>> # order by redshift for scatter plotting
>>> idx = np.argsort(z)
>>>
>>> theta = np.arccos(cz.magnitude / np.sqrt(cx**2 + cy**2 + cz**2).magnitude)
>>> phi = np.arctan2(cy.magnitude,cx.magnitude)
>>>
>>> fig = plt.figure(figsize=(10,5))
>>> ax = fig.add_subplot(111, projection="aitoff")
>>> ra = phi[idx]
>>> dec = -(theta[idx] - np.pi/2.0)
>>> sc = ax.scatter(ra, dec, s=0.05, c=z[idx], rasterized=True)
>>> fig.colorbar(sc, label="redshift")
>>> ax.set_xticklabels(['14h','16h','18h','20h','22h','0h','2h','4h','6h','8h','10h'])
>>> ax.set_xlabel("RA")
>>> ax.set_ylabel("DEC")
>>> ax.grid(True)
>>> plt.savefig("sdss_dr16.png", dpi=150)
>>> plt.show()
```

![SDSS DR16 Aitoff projection](../images/sdss_dr16.png)
3 changes: 2 additions & 1 deletion src/scida/interfaces/mixins/units.py
@@ -419,7 +419,8 @@ def add_units(container: FieldContainer, basepath: str):
log.debug("Field %s already has units, overwriting." % k)
container[k] = container[k].magnitude * unit
else:
container[k] = unit * container[k]
if np.issubdtype(container[k].dtype, np.number):
container[k] = unit * container[k]
if units == "cgs" and isinstance(container[k], pint.Quantity):
container[k] = container[k].to_base_units()

8 changes: 8 additions & 0 deletions src/scida/io/__init__.py
@@ -0,0 +1,8 @@
from scida.io._base import (
ChunkedHDF5Loader,
load,
load_datadict_old,
load_metadata,
load_metadata_all,
walk_hdf5file,
)
183 changes: 134 additions & 49 deletions src/scida/io.py → src/scida/io/_base.py
@@ -15,6 +15,7 @@
from scida.config import get_config
from scida.fields import FieldContainer, walk_container
from scida.helpers_hdf5 import create_mergedhdf5file, walk_hdf5file, walk_zarrfile
from scida.io.fits import fitsrecords_to_daskarrays
from scida.misc import get_container_from_path, return_hdf5cachepath

log = logging.getLogger(__name__)
@@ -35,6 +36,59 @@ def load(self):
pass


class FITSLoader(Loader):
def __init__(self, path):
super().__init__(path)
self.location = self.path

def load(
self,
overwrite=False,
fileprefix="",
token="",
chunksize="auto",
virtualcache=False,
**kwargs
):
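"""Load the FITS table into a FieldContainer of dask arrays (one per column) and return it together with an (empty) metadata dict."""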
log.warning("FITS file support is work-in-progress.")
self.location = self.path
from astropy.io import fits

ext = 1
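# the table data is expected in the first extension HDU; the primary HDU (ext 0) holds only header information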
with fits.open(self.location, memmap=True, mode="denywrite") as hdulist:
arr = hdulist[ext].data
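# map each column of the FITS record array to a (lazy) dask array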
darrs = fitsrecords_to_daskarrays(arr)

derivedfields_kwargs = kwargs.get("derivedfields_kwargs", {})
withunits = kwargs.get("withunits", False)
rootcontainer = FieldContainer(
fieldrecipes_kwargs=derivedfields_kwargs, withunits=withunits
)

for name, darr in darrs.items():
rootcontainer[name] = darr

metadata = {}
return rootcontainer, metadata

def load_metadata(self, **kwargs):
"""Take a quick glance at the metadata, only attributes."""
from astropy.io import fits

ext = 0
with fits.open(self.location, memmap=True, mode="denywrite") as hdulist:
return hdulist[ext].header

def load_metadata_all(self, **kwargs):
"""Take a quick glance at metadata."""
from astropy.io import fits

ext = 1
with fits.open(self.location, memmap=True, mode="denywrite") as hdulist:
return hdulist[ext].header
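
# NOTE: purely illustrative sketch of the column conversion used above;
# the actual fitsrecords_to_daskarrays lives in src/scida/io/fits.py and is
# not part of the hunks shown on this page. Only its usage (FITS record
# array in, dict of per-column dask arrays out) is taken from the diff.
def _fitsrecords_to_daskarrays_sketch(fitsrecords):
    import dask.array as da
    import numpy as np

    darrs = {}
    for name in fitsrecords.names:
        # copy each column out of the FITS record array and wrap it as a
        # chunked dask array keyed by the column name
        col = np.asarray(fitsrecords[name])
        darrs[name] = da.from_array(col, chunks="auto", name=name)
    return darrs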


class HDF5Loader(Loader):
def __init__(self, path):
super().__init__(path)
Expand All @@ -53,11 +107,11 @@ def load(
tree = {}
walk_hdf5file(self.location, tree=tree)
file = h5py.File(self.location, "r")
datadict = load_datadict_old(
data, metadata = load_datadict_old(
self.path, file, token=token, chunksize=chunksize, **kwargs
)
self.file = file
return datadict
return data, metadata

def load_metadata(self, **kwargs):
"""Take a quick glance at the metadata, only attributes."""
@@ -90,15 +144,15 @@ def load(
tree = {}
walk_zarrfile(self.location, tree=tree)
self.file = zarr.open(self.location)
datadict = load_datadict_old(
data, metadata = load_datadict_old(
self.path,
self.file,
token=token,
chunksize=chunksize,
filetype="zarr",
**kwargs
)
return datadict
return data, metadata

def load_metadata(self, **kwargs):
"""Take a quick glance at the metadata."""
@@ -159,18 +213,18 @@ def load(
)

try:
datadict = self.load_cachefile(
data, metadata = self.load_cachefile(
cachefp, token=token, chunksize=chunksize, **kwargs
)
except InvalidCacheError:
# if we get an error, we try to create a new cache file (once)
log.info("Invalid cache file, attempting to create new one.")
os.remove(cachefp)
self.create_cachefile(fileprefix=fileprefix, virtualcache=virtualcache)
datadict = self.load_cachefile(
data, metadata = self.load_cachefile(
cachefp, token=token, chunksize=chunksize, **kwargs
)
return datadict
return data, metadata

def get_chunkedfiles(self, fileprefix: Optional[str] = "") -> list:
"""
@@ -318,47 +372,19 @@ def load_datadict_old(
)
continue

if lazy and i > 0: # need one non-lazy entry though later on...

def field(
arrs,
snap=None,
h5path="",
chunksize="",
name="",
inline_array=False,
file=None,
**kwargs
):
hds = file[h5path]
arr = da.from_array(
hds,
chunks=chunksize,
name=name,
inline_array=inline_array,
)
return arr

fnc = partial(
field,
h5path=dataset[0],
chunksize=chunksize,
name=name,
inline_array=inline_array,
file=file,
)
container.register_field(
name=fieldname, description=fieldname + ": lazy field from disk"
)(fnc)
else:
hds = file[dataset[0]]
ds = da.from_array(
hds,
chunks=chunksize,
name=name,
inline_array=inline_array,
)
container[fieldname] = ds
lz = lazy and i > 0 # need one non-lazy field (?)
fieldpath = dataset[0]
_add_hdf5arr_to_fieldcontainer(
file,
container,
fieldpath,
fieldname,
name,
chunksize,
inline_array=inline_array,
lazy=lz,
)

data = rootcontainer

dtsdict = {k[0]: k[1:] for k in tree["datasets"]}
@@ -396,9 +422,15 @@ def determine_loader(path, **kwargs):
else:
# otherwise expect this is a chunked HDF5 file
loader = ChunkedHDF5Loader(path)
else:
elif not os.path.exists(path):
raise ValueError("Path '%s' does not exist" % path)
elif str(path).endswith(".hdf5"):
# we are directly given a target file
loader = HDF5Loader(path)
elif str(path).endswith(".fits"):
loader = FITSLoader(path)
else:
raise ValueError("Unknown filetype of path '%s'" % path)
return loader


@@ -441,6 +473,59 @@ def _cachefile_available_in_path(
return False


def _add_hdf5arr_to_fieldcontainer(
file,
container,
fieldpath,
fieldname,
name,
chunksize,
inline_array=False,
lazy=True,
):
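"""Attach an HDF5 dataset to the field container as a dask array: eagerly if lazy is False, otherwise as a field recipe that is only evaluated on first access."""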
if not lazy:
hds = file[fieldpath]
ds = da.from_array(
hds,
chunks=chunksize,
name=name,
inline_array=inline_array,
)
container[fieldname] = ds
return
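
# lazy path: register a field recipe that wraps the HDF5 dataset in a dask array only when the field is first requested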

def field(
arrs,
snap=None,
h5path="",
chunksize="",
name="",
inline_array=False,
file=None,
**kwargs
):
hds = file[h5path]
arr = da.from_array(
hds,
chunks=chunksize,
name=name,
inline_array=inline_array,
)
return arr

fnc = partial(
field,
h5path=fieldpath,
chunksize=chunksize,
name=name,
inline_array=inline_array,
file=file,
)
container.register_field(
name=fieldname, description=fieldname + ": lazy field from disk"
)(fnc)


def _get_chunkedfiles(path, fileprefix: Optional[str] = "") -> list:
"""
Get all files in directory with given prefix.