From 08916832b26de596d8a967e4075392554e51cc7a Mon Sep 17 00:00:00 2001
From: Chris Byrohl <9221545+cbyrohl@users.noreply.github.com>
Date: Wed, 23 Aug 2023 11:47:37 +0200
Subject: [PATCH] Support Fitsio (#54)

* add subfolder for io funcs

* minimal SDSS dr16 fits load success (fixes #25)

* add docs for SDSS and w.i.p. fits support

---
 docs/supported_data.md               |   6 +-
 docs/tutorial/observations.md        |  42 ++++++
 src/scida/interfaces/mixins/units.py |   3 +-
 src/scida/io/__init__.py             |   8 ++
 src/scida/{io.py => io/_base.py}     | 183 ++++++++++++++++++++-------
 src/scida/io/fits.py                 |  61 +++++++++
 tests/conftest.py                    |   1 +
 tests/test_iofits.py                 |  28 ++++
 tests/testdata.yaml                  |   5 +
 9 files changed, 286 insertions(+), 51 deletions(-)
 create mode 100644 src/scida/io/__init__.py
 rename src/scida/{io.py => io/_base.py} (79%)
 create mode 100644 src/scida/io/fits.py
 create mode 100644 tests/test_iofits.py

diff --git a/docs/supported_data.md b/docs/supported_data.md
index 87c6abbd..aa326739 100644
--- a/docs/supported_data.md
+++ b/docs/supported_data.md
@@ -12,12 +12,14 @@ If you want to use a dataset that is not listed here, read on [here](dataset_str
 | [Gaia](https://www.cosmos.esa.int/web/gaia/dr3) | :material-database-check-outline:[^1] | *Observations* of a billion nearby stars |
 | [Illustris](https://www.illustris-project.org/) | :material-check-all: | Cosmological galaxy formation *simulations* |
 | [LGalaxies](customs/lgalaxies.md) | :material-check-all: | Semi-analytical model for [Millennium](https://wwwmpa.mpa-garching.mpg.de/galform/virgo/millennium/) simulations |
+| [SDSS DR16](https://www.sdss.org/dr16/) | :material-check: | *Observations* of millions of galaxies |
 | [SIMBA](http://simba.roe.ac.uk/) | :material-check-all: | Cosmological galaxy formation *simulations* |
 | [TNG](https://www.tng-project.org/) | :material-check-all: | Cosmological galaxy formation *simulations* |
 | TNG-Cluster | :material-check-all: | Cosmological zoom-in galaxy formation *simulations* |

-A :material-check-all: checkmark indicates support out-of-the-box, a :material-check: checkmark indicates support by creating a suitable configuration file.
+
+A :material-check-all: checkmark indicates out-of-the-box support; a :material-check: checkmark indicates work-in-progress support or support via a suitable configuration file.
 A :material-database-check-outline: checkmark indicates support for converted HDF5 versions of the original data.

@@ -25,4 +27,6 @@ A :material-database-check-outline: checkmark indicates support for converted HD
 As of now, two underlying file formats are supported: hdf5 and zarr. Multi-file hdf5 is supported: a directory is passed as *path* and may contain only hdf5 files following the pattern *prefix.XXX.hdf5*, where *prefix* is determined automatically and *XXX* is a contiguous range of integers giving the order in which the hdf5 files are merged.
 Hdf5 files are expected to share the same structure; all fields, i.e. hdf5 datasets, are concatenated along their first axis.

+Support for FITS is work in progress; see [here](../tutorial/observations/#fits-files) for a proof of concept.
+
 [^1]: The HDF5 version of GAIA DR3 is available [here](https://www.tng-project.org/data/obs/).
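The multi-file convention described above can be exercised directly with scida's top-level `load`. A minimal sketch, assuming a hypothetical directory `snapdir/` that contains only `snap.0.hdf5`, `snap.1.hdf5`, ...:

```python
from scida import load

# "snap" is inferred as the prefix; the per-file hdf5 datasets are
# concatenated along their first axis into one merged dataset.
ds = load("snapdir")
```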
diff --git a/docs/tutorial/observations.md b/docs/tutorial/observations.md
index ae8da65b..4aaf153a 100644
--- a/docs/tutorial/observations.md
+++ b/docs/tutorial/observations.md
@@ -163,3 +163,45 @@ We discuss more advanced and interactive visualization methods [here](../visuali
 1.  The *compute()* on `im2d` results in a two-dimensional array which we can display.

 ![2D histogram example](../images/simple_hist2d_obs.png)
+
+
+## FITS files
+
+Observations are often stored in [FITS](https://en.wikipedia.org/wiki/FITS) files. Support in scida is work-in-progress
+and requires the [astropy](https://www.astropy.org/) package.
+
+Here we show the use of the SDSS DR16 spectroscopic catalog:
+
+```pycon
+>>> from scida import load
+>>> import matplotlib.pyplot as plt
+>>> import numpy as np
+>>> path = "/virgotng/mpia/obs/SDSS/specObj-dr16.fits"
+>>> ds = load(path)
+>>>
+>>> cx = ds.data["CX"].compute()
+>>> cy = ds.data["CY"].compute()
+>>> cz = ds.data["CZ"].compute()
+>>> z = ds.data["Z"].compute()
+>>>
+>>> # order by redshift for scatter plotting
+>>> idx = np.argsort(z)
+>>>
+>>> theta = np.arccos(cz.magnitude / np.sqrt(cx**2 + cy**2 + cz**2).magnitude)
+>>> phi = np.arctan2(cy.magnitude, cx.magnitude)
+>>>
+>>> fig = plt.figure(figsize=(10, 5))
+>>> ax = fig.add_subplot(111, projection="aitoff")
+>>> ra = phi[idx]
+>>> dec = -(theta - np.pi / 2.0)
+>>> sc = ax.scatter(ra, dec[idx], s=0.05, c=z[idx], rasterized=True)
+>>> fig.colorbar(sc, label="redshift")
+>>> ax.set_xticklabels(['14h','16h','18h','20h','22h','0h','2h','4h','6h','8h','10h'])
+>>> ax.set_xlabel("RA")
+>>> ax.set_ylabel("DEC")
+>>> ax.grid(True)
+>>> plt.savefig("sdss_dr16.png", dpi=150)
+>>> plt.show()
+```
+
+![SDSS DR16 Aitoff projection](../images/sdss_dr16.png)
diff --git a/src/scida/interfaces/mixins/units.py b/src/scida/interfaces/mixins/units.py
index a0cc80c0..d609d066 100644
--- a/src/scida/interfaces/mixins/units.py
+++ b/src/scida/interfaces/mixins/units.py
@@ -419,7 +419,8 @@ def add_units(container: FieldContainer, basepath: str):
             log.debug("Field %s already has units, overwriting." % k)
             container[k] = container[k].magnitude * unit
         else:
-            container[k] = unit * container[k]
+            if np.issubdtype(container[k].dtype, np.number):
+                container[k] = unit * container[k]

         if units == "cgs" and isinstance(container[k], pint.Quantity):
             container[k] = container[k].to_base_units()
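The `np.issubdtype` guard added above matters for observational catalogs: FITS tables routinely mix numeric columns with string columns, and attaching a pint unit to a non-numeric array would fail. A minimal sketch of the check, with hypothetical column data:

```python
import numpy as np
import pint

ureg = pint.UnitRegistry()

def attach_unit(arr, unit):
    # only numeric columns can carry units; string columns pass through unchanged
    if np.issubdtype(arr.dtype, np.number):
        return unit * arr
    return arr

velocity = attach_unit(np.array([120.0, 350.0]), ureg("km/s"))  # pint.Quantity
klass = attach_unit(np.array(["GALAXY", "QSO"]), ureg("km/s"))  # plain ndarray
```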
diff --git a/src/scida/io/__init__.py b/src/scida/io/__init__.py
new file mode 100644
index 00000000..d439fc97
--- /dev/null
+++ b/src/scida/io/__init__.py
@@ -0,0 +1,8 @@
+from scida.io._base import (
+    ChunkedHDF5Loader,
+    load,
+    load_datadict_old,
+    load_metadata,
+    load_metadata_all,
+    walk_hdf5file,
+)
diff --git a/src/scida/io.py b/src/scida/io/_base.py
similarity index 79%
rename from src/scida/io.py
rename to src/scida/io/_base.py
index 9715cf40..d25bf45a 100644
--- a/src/scida/io.py
+++ b/src/scida/io/_base.py
@@ -15,6 +15,7 @@
 from scida.config import get_config
 from scida.fields import FieldContainer, walk_container
 from scida.helpers_hdf5 import create_mergedhdf5file, walk_hdf5file, walk_zarrfile
+from scida.io.fits import fitsrecords_to_daskarrays
 from scida.misc import get_container_from_path, return_hdf5cachepath

 log = logging.getLogger(__name__)
@@ -35,6 +36,59 @@
     def load(self):
         pass

+
+class FITSLoader(Loader):
+    def __init__(self, path):
+        super().__init__(path)
+        self.location = self.path
+
+    def load(
+        self,
+        overwrite=False,
+        fileprefix="",
+        token="",
+        chunksize="auto",
+        virtualcache=False,
+        **kwargs
+    ):
+        log.warning("FITS file support is work-in-progress.")
+        self.location = self.path
+        from astropy.io import fits
+
+        ext = 1
+        with fits.open(self.location, memmap=True, mode="denywrite") as hdulist:
+            arr = hdulist[ext].data
+            darrs = fitsrecords_to_daskarrays(arr)
+
+        derivedfields_kwargs = kwargs.get("derivedfields_kwargs", {})
+        withunits = kwargs.get("withunits", False)
+        rootcontainer = FieldContainer(
+            fieldrecipes_kwargs=derivedfields_kwargs, withunits=withunits
+        )
+
+        for name, darr in darrs.items():
+            rootcontainer[name] = darr
+
+        metadata = {}
+        return rootcontainer, metadata
+
+    def load_metadata(self, **kwargs):
+        """Take a quick glance at the metadata, only attributes."""
+        from astropy.io import fits
+
+        ext = 0
+        with fits.open(self.location, memmap=True, mode="denywrite") as hdulist:
+            return hdulist[ext].header
+
+    def load_metadata_all(self, **kwargs):
+        """Take a quick glance at all metadata."""
+        from astropy.io import fits
+
+        ext = 1
+        with fits.open(self.location, memmap=True, mode="denywrite") as hdulist:
+            # the table header lives on the HDU, not on the record array
+            return hdulist[ext].header
+
+
 class HDF5Loader(Loader):
     def __init__(self, path):
         super().__init__(path)
@@ -53,11 +107,11 @@ def load(
         tree = {}
         walk_hdf5file(self.location, tree=tree)
         file = h5py.File(self.location, "r")
-        datadict = load_datadict_old(
+        data, metadata = load_datadict_old(
             self.path, file, token=token, chunksize=chunksize, **kwargs
         )
         self.file = file
-        return datadict
+        return data, metadata

     def load_metadata(self, **kwargs):
         """Take a quick glance at the metadata, only attributes."""
@@ -90,7 +144,7 @@ def load(
         tree = {}
         walk_zarrfile(self.location, tree=tree)
         self.file = zarr.open(self.location)
-        datadict = load_datadict_old(
+        data, metadata = load_datadict_old(
             self.path,
             self.file,
             token=token,
@@ -98,7 +152,7 @@ def load(
             filetype="zarr",
             **kwargs
         )
-        return datadict
+        return data, metadata

     def load_metadata(self, **kwargs):
         """Take a quick glance at the metadata."""
@@ -159,7 +213,7 @@ def load(
         )

         try:
-            datadict = self.load_cachefile(
+            data, metadata = self.load_cachefile(
                 cachefp, token=token, chunksize=chunksize, **kwargs
             )
         except InvalidCacheError:
@@ -167,10 +221,10 @@ def load(
             log.info("Invalid cache file, attempting to create new one.")
             os.remove(cachefp)
             self.create_cachefile(fileprefix=fileprefix, virtualcache=virtualcache)
-            datadict = self.load_cachefile(
+            data, metadata = self.load_cachefile(
                 cachefp, token=token, chunksize=chunksize, **kwargs
             )
-        return datadict
+        return data, metadata

     def get_chunkedfiles(self, fileprefix: Optional[str] = "") -> list:
         """
@@ -318,47 +372,19 @@ def load_datadict_old(
             )
             continue

-        if lazy and i > 0:  # need one non-lazy entry though later on...
-
-            def field(
-                arrs,
-                snap=None,
-                h5path="",
-                chunksize="",
-                name="",
-                inline_array=False,
-                file=None,
-                **kwargs
-            ):
-                hds = file[h5path]
-                arr = da.from_array(
-                    hds,
-                    chunks=chunksize,
-                    name=name,
-                    inline_array=inline_array,
-                )
-                return arr
-
-            fnc = partial(
-                field,
-                h5path=dataset[0],
-                chunksize=chunksize,
-                name=name,
-                inline_array=inline_array,
-                file=file,
-            )
-            container.register_field(
-                name=fieldname, description=fieldname + ": lazy field from disk"
-            )(fnc)
-        else:
-            hds = file[dataset[0]]
-            ds = da.from_array(
-                hds,
-                chunks=chunksize,
-                name=name,
-                inline_array=inline_array,
-            )
-            container[fieldname] = ds
+        lz = lazy and i > 0  # keep the first field non-lazy
+        fieldpath = dataset[0]
+        _add_hdf5arr_to_fieldcontainer(
+            file,
+            container,
+            fieldpath,
+            fieldname,
+            name,
+            chunksize,
+            inline_array=inline_array,
+            lazy=lz,
+        )

     data = rootcontainer

     dtsdict = {k[0]: k[1:] for k in tree["datasets"]}
@@ -396,9 +422,15 @@ def determine_loader(path, **kwargs):
         else:
             # otherwise expect this is a chunked HDF5 file
             loader = ChunkedHDF5Loader(path)
-    else:
+    elif not os.path.exists(path):
+        raise ValueError("Path '%s' does not exist" % path)
+    elif str(path).endswith(".hdf5"):
         # we are directly given a target file
         loader = HDF5Loader(path)
+    elif str(path).endswith(".fits"):
+        loader = FITSLoader(path)
+    else:
+        raise ValueError("Unknown filetype of path '%s'" % path)
     return loader

@@ -441,6 +473,59 @@ def _cachefile_available_in_path(
     return False

+
+def _add_hdf5arr_to_fieldcontainer(
+    file,
+    container,
+    fieldpath,
+    fieldname,
+    name,
+    chunksize,
+    inline_array=False,
+    lazy=True,
+):
+    if not lazy:
+        hds = file[fieldpath]
+        ds = da.from_array(
+            hds,
+            chunks=chunksize,
+            name=name,
+            inline_array=inline_array,
+        )
+        container[fieldname] = ds
+        return
+
+    def field(
+        arrs,
+        snap=None,
+        h5path="",
+        chunksize="",
+        name="",
+        inline_array=False,
+        file=None,
+        **kwargs
+    ):
+        hds = file[h5path]
+        arr = da.from_array(
+            hds,
+            chunks=chunksize,
+            name=name,
+            inline_array=inline_array,
+        )
+        return arr
+
+    fnc = partial(
+        field,
+        h5path=fieldpath,
+        chunksize=chunksize,
+        name=name,
+        inline_array=inline_array,
+        file=file,
+    )
+    container.register_field(
+        name=fieldname, description=fieldname + ": lazy field from disk"
+    )(fnc)
+
+
 def _get_chunkedfiles(path, fileprefix: Optional[str] = "") -> list:
     """
     Get all files in directory with given prefix.
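To illustrate the extended dispatch in `determine_loader`, a minimal sketch; the file names are hypothetical, and the files must exist on disk since a missing path now raises `ValueError` before the suffix is inspected:

```python
from scida.io._base import determine_loader

loader = determine_loader("specObj-dr16.fits")  # -> FITSLoader
loader = determine_loader("snapshot_099.hdf5")  # -> HDF5Loader
determine_loader("catalog.csv")  # -> ValueError (unknown filetype, or missing path)
```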
diff --git a/src/scida/io/fits.py b/src/scida/io/fits.py
new file mode 100644
index 00000000..b634f5da
--- /dev/null
+++ b/src/scida/io/fits.py
@@ -0,0 +1,61 @@
+import dask
+import dask.array as da
+import numpy as np
+from dask import delayed
+
+_sizeunits = {
+    "B": 1,
+    "KB": 10**3,
+    "MB": 10**6,
+    "GB": 10**9,
+    "TB": 10**12,
+    "KiB": 1024,
+    "MiB": 1024**2,
+    "GiB": 1024**3,
+}
+_sizeunits = {k.lower(): v for k, v in _sizeunits.items()}
+
+
+def parse_size(size):
+    # split the numeric prefix from the unit suffix, e.g. "128MiB" -> "128", "MiB"
+    idx = 0
+    for c in size:
+        if not c.isnumeric():
+            break
+        idx += 1
+    number = size[:idx]
+    unit = size[idx:]
+    return int(float(number) * _sizeunits[unit.lower().strip()])
+
+
+def fitsrecords_to_daskarrays(fitsrecords):
+    load_arr = delayed(lambda slc, field: fitsrecords[slc][field])
+    shape = fitsrecords.shape
+    darrdict = {}
+    csize = dask.config.get("array.chunk-size")
+
+    csize = parse_size(csize)  # need int
+
+    nbytes_dtype_max = 1
+    for fieldname in fitsrecords.dtype.names:
+        nbytes_dtype = fitsrecords.dtype[fieldname].itemsize
+        nbytes_dtype_max = max(nbytes_dtype_max, nbytes_dtype)
+    chunksize = csize // nbytes_dtype_max
+
+    for fieldname in fitsrecords.dtype.names:
+        chunks = []
+        for index in range(0, shape[-1], chunksize):
+            dtype = fitsrecords.dtype[fieldname]
+            chunk_size = min(chunksize, shape[-1] - index)
+            slc = slice(index, index + chunk_size)
+            shp = (chunk_size,)
+            if dtype.subdtype is not None:
+                # for now, we expect this to be void type
+                assert dtype.type is np.void
+                break  # do not handle void type for now => skip field
+                # shp = shp + dtype.subdtype[0].shape
+                # dtype = dtype.subdtype[0].base
+            chunk = da.from_delayed(load_arr(slc, fieldname), shape=shp, dtype=dtype)
+            chunks.append(chunk)
+        if len(chunks) > 0:
+            darrdict[fieldname] = da.concatenate(chunks, axis=0)
+    return darrdict
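A minimal usage sketch for the helpers above, assuming a local copy of the SDSS file: `parse_size` splits a dask chunk-size string into a numeric prefix and a unit suffix, and `fitsrecords_to_daskarrays` turns the record columns into lazy dask arrays:

```python
from astropy.io import fits

from scida.io.fits import fitsrecords_to_daskarrays, parse_size

assert parse_size("128MiB") == 128 * 1024**2  # dask's default chunk-size string

with fits.open("specObj-dr16.fits", memmap=True, mode="denywrite") as hdulist:
    darrs = fitsrecords_to_daskarrays(hdulist[1].data)
    zmax = darrs["Z"].max().compute()  # evaluated lazily, chunk by chunk
```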
diff --git a/tests/conftest.py b/tests/conftest.py
index c730f6de..581865a5 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -9,6 +9,7 @@
 from tests.helpers import DummyGadgetCatalogFile, DummyGadgetSnapshotFile, DummyTNGFile

 flag_test_long = False  # Set to true to run time-taking tests.
+flag_test_big = False  # Set to true to run memory-intensive tests.

 scope_snapshot = "function"
diff --git a/tests/test_iofits.py b/tests/test_iofits.py
new file mode 100644
index 00000000..ab21351c
--- /dev/null
+++ b/tests/test_iofits.py
@@ -0,0 +1,28 @@
+import dask.array as da
+
+from scida import load
+from scida.io.fits import fitsrecords_to_daskarrays
+from tests.testdata_properties import require_testdata_path
+
+
+@require_testdata_path("fits", only=["SDSS_DR16_fits"])
+def test_fitsread(testdatapath):
+    path = testdatapath
+
+    from astropy.io import fits
+
+    ext = 1
+    with fits.open(path, memmap=True, mode="denywrite") as hdulist:
+        arr = hdulist[ext].data
+        darrs = fitsrecords_to_daskarrays(arr)
+    assert len(darrs) > 0
+    for darr in darrs.values():
+        assert isinstance(darr, da.Array)
+
+
+@require_testdata_path("fits", only=["SDSS_DR16_fits"])
+def test_fitsdataset(testdatapath):
+    path = testdatapath
+
+    ds = load(path)
+    assert "CX" in ds.data
diff --git a/tests/testdata.yaml b/tests/testdata.yaml
index 4ebe591c..0114dd45 100644
--- a/tests/testdata.yaml
+++ b/tests/testdata.yaml
@@ -260,3 +260,8 @@ testdata:
     types:
       - interface
     fn: gaia_dr3_minimal.hdf5
+
+  SDSS_DR16_fits:
+    types:
+      - fits
+    fn: "specObj-dr16.fits"
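A minimal sketch for exercising just the new FITS tests, assuming the `SDSS_DR16_fits` testdata entry resolves to a local copy of `specObj-dr16.fits` (the `require_testdata_path` decorator presumably skips the tests otherwise):

```python
import pytest

# equivalent to running pytest on the file from the command line
raise SystemExit(pytest.main(["-v", "tests/test_iofits.py"]))
```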