From a784bd85e635c2c71aff3748d545df832c19b550 Mon Sep 17 00:00:00 2001 From: Chris Byrohl <9221545+cbyrohl@users.noreply.github.com> Date: Wed, 9 Aug 2023 16:52:21 +0200 Subject: [PATCH 1/4] add LocalSubhaloID and SubhaloID field --- src/scida/customs/arepo/dataset.py | 99 ++++++++++++++++++++++-------- tests/test_fields.py | 2 +- 2 files changed, 76 insertions(+), 25 deletions(-) diff --git a/src/scida/customs/arepo/dataset.py b/src/scida/customs/arepo/dataset.py index a271f28f..99807ef3 100644 --- a/src/scida/customs/arepo/dataset.py +++ b/src/scida/customs/arepo/dataset.py @@ -319,13 +319,12 @@ def add_catalogIDs(self) -> None: return glen = self.data["Group"]["GroupLenType"] - da_halocelloffsets = ( - da.concatenate( # TODO: Do not hardcode shape of 6 particle types! - [ - np.zeros((1, 6), dtype=np.int64), - da.cumsum(glen, axis=0, dtype=np.int64), - ] - ) + ngrp = glen.shape[0] + da_halocelloffsets = da.concatenate( + [ + np.zeros((1, 6), dtype=np.int64), + da.cumsum(glen, axis=0, dtype=np.int64), + ] ) # remove last entry to match shapematch shape self.data["Group"]["GroupOffsetsType"] = da_halocelloffsets[:-1].rechunk( @@ -370,24 +369,48 @@ def add_catalogIDs(self) -> None: if hasattr(subhalocellcounts, "magnitude"): subhalocellcounts = subhalocellcounts.magnitude + grp = self.data["Group"] + if "GroupFirstSub" not in grp or "GroupNsubs" not in grp: + # if not provided, we calculate: + # "GroupFirstSub": First subhalo index for each halo + # "GroupNsubs": Number of subhalos for each halo + dlyd = delayed(get_shcounts_shcells)(subhalogrnr, ngrp) + grp["GroupFirstSub"] = dask.compute(dlyd[1])[0] + grp["GroupNsubs"] = dask.compute(dlyd[0])[0] + + # remove "units" for numba funcs + grpfirstsub = grp["GroupFirstSub"] + if hasattr(grpfirstsub, "magnitude"): + grpfirstsub = grpfirstsub.magnitude + grpnsubs = grp["GroupNsubs"] + if hasattr(grpnsubs, "magnitude"): + grpnsubs = grpnsubs.magnitude + for key in self.data: if not (key.startswith("PartType")): continue num = int(key[-1]) + pdata = self.data[key] if "uid" not in self.data[key]: continue # can happen for empty containers - gidx = self.data[key]["uid"] - dlyd = delayed(get_shcounts_shcells)( - subhalogrnr, halocelloffsets[:, num].shape[0] - ) - sidx = compute_subhaloindex( + gidx = pdata["uid"] + + sidx = compute_localsubhaloindex( gidx, halocelloffsets[:, num], - dlyd[1], - dlyd[0], + grpfirstsub, + grpnsubs, subhalocellcounts[:, num], ) - self.data[key]["SubhaloID"] = sidx + + pdata["LocalSubhaloID"] = sidx + + # reconstruct SubhaloID from Group's GroupFirstSub and LocalSubhaloID + # should be easier to do it directly, but quicker to write down like this: + + # calculate first subhalo of each halo that a particle belongs to + self.add_groupquantity_to_particles("GroupFirstSub", parttype=key) + pdata["SubhaloID"] = pdata["GroupFirstSub"] + pdata["LocalSubhaloID"] @computedecorator def map_halo_operation( @@ -714,7 +737,7 @@ def compute_haloquantity(gidx, halocelloffsets, hvals, *args): @jit(nopython=True) -def get_shidx( +def get_localshidx( gidx_start: int, gidx_count: int, celloffsets: NDArray[np.int64], @@ -722,6 +745,23 @@ def get_shidx( shcounts, shcellcounts, ): + """ + Get the local subhalo index for each particle. This is the subhalo index within each + halo group. Particles belonging to the central galaxies will have index 0, particles + belonging to the first satellite will have index 1, etc. + Parameters + ---------- + gidx_start + gidx_count + celloffsets + shnumber + shcounts + shcellcounts + + Returns + ------- + + """ res = -1 * np.ones(gidx_count, dtype=np.int32) # fuzz has negative index. # find initial Group we are in @@ -785,7 +825,7 @@ def get_shidx( return res -def get_shidx_daskwrap( +def get_local_shidx_daskwrap( gidx: NDArray[np.int64], halocelloffsets: NDArray[np.int64], shnumber, @@ -794,16 +834,16 @@ def get_shidx_daskwrap( ) -> np.ndarray: gidx_start = gidx[0] gidx_count = gidx.shape[0] - return get_shidx( + return get_localshidx( gidx_start, gidx_count, halocelloffsets, shnumber, shcounts, shcellcounts ) -def compute_subhaloindex( +def compute_localsubhaloindex( gidx, halocelloffsets, shnumber, shcounts, shcellcounts ) -> da.Array: return da.map_blocks( - get_shidx_daskwrap, + get_local_shidx_daskwrap, gidx, halocelloffsets, shnumber, @@ -815,11 +855,22 @@ def compute_subhaloindex( @jit(nopython=True) def get_shcounts_shcells(SubhaloGrNr, hlength): - """Returns the number offset and count of subhalos per halo.""" - shcounts = np.zeros(hlength, dtype=np.int32) - shnumber = np.zeros(hlength, dtype=np.int32) + """ + Returns the id of the first subhalo and count of subhalos per halo. + Parameters + ---------- + SubhaloGrNr: np.ndarray + The group identifier that each subhalo belongs to respectively + hlength: int + The number of halos in the snapshot + + Returns + ------- + + """ + shcounts = np.zeros(hlength, dtype=np.int32) # number of subhalos per halo + shnumber = np.zeros(hlength, dtype=np.int32) # index of first subhalo per halo i = 0 - hid = 0 hid_old = 0 while i < SubhaloGrNr.shape[0]: hid = SubhaloGrNr[i] diff --git a/tests/test_fields.py b/tests/test_fields.py index 9cfe6cbc..aa647d57 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -24,7 +24,7 @@ def test_fieldtypes(testdatapath): snp = load(testdatapath) gas = snp.data["PartType0"] fnames = list(gas.keys(withrecipes=False)) - assert len(fnames) < 5, "not lazy loading fields (into recipes)" + assert len(fnames) < 10, "not lazy loading fields (into recipes)" fnames = list(gas.keys()) assert len(fnames) > 5, "not correctly considering recipes" assert gas.fieldcount == len(fnames) From de89d156bcf2569cea63236a09346a19cd7b4ace Mon Sep 17 00:00:00 2001 From: Chris Byrohl <9221545+cbyrohl@users.noreply.github.com> Date: Wed, 9 Aug 2023 17:40:56 +0200 Subject: [PATCH 2/4] update docs on halo catalogs --- docs/halocatalogs.md | 30 +++++++++++++++++++++++++++--- mkdocs.yml | 3 +++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/docs/halocatalogs.md b/docs/halocatalogs.md index 79ff14f4..636e29cf 100644 --- a/docs/halocatalogs.md +++ b/docs/halocatalogs.md @@ -14,6 +14,7 @@ ds = load("TNG50-4_snapshot") # (1)! The dataset itself passed to load does not possess information about the FoF/Subfind outputs as they are commonly saved in a separate folder or hdf5 file. For typical folder structures of GADGET/AREPO style simulations, an attempt is made to automatically discover and add such information. The path to the catalog can otherwise explicitly be passed to *load()* via the *catalog=...* keyword. +## Accessing halo/galaxy catalog information Groups and subhalo information is added into the dataset with the data containers *Group* and *Subhalo*. For example, we can obtain the masses of each group as: @@ -22,16 +23,39 @@ Groups and subhalo information is added into the dataset with the data container group_mass = ds.data["Group"]["GroupMass"] ``` +## Accessing particle-level halo/galaxy information + In addition to these two data containers, new information is added to all other containers about their belonging to a given group and subhalo. ``` py -groupid = ds.data["PartType0"]["GroupID"] +groupid = ds.data["PartType0"]["GroupID"] #(1)! subhaloid = ds.data["PartType0"]["SubhaloID"] +localsubhaloid = ds.data["PartType0"]["LocalSubhaloID"] ``` -In above example, we fetch the virtual dask arrays holding the group and subhalo belonging of each *PartType0* particle. The *groupid* field returns the integer position into the group catalog that given particle belongs to. For the *subhaloid* the subhalo id within a given *groupid* is given, *where subhaloid==0* marks the belonging to the central subhalo in a halo. +1. This information is also available for the other particle types. + +In above example, we fetch the virtual dask arrays holding information about the halo and subhalo association for each particle. + +`GroupID` + +: The group ID of the group the particle belongs to. This is the index into the group catalog. + +`SubhaloID` + +: The subhalo ID of the subhalo the particle belongs to. This is the index into the subhalo catalog. + +`LocalSubhaloID` + +: This is the Subhalo ID relative to the central subhalo of a given group. For the central subhalo, this is 0. + Satellites accordingly start at index 1. + +Particles that are not associated with a group or subhalo that are queried for such ID +will return `ds.misc['unboundID']'`. This is currently set to 9223372036854775807, but might change to -1 in the future. + -This operation allows us to efficiently query the belonging of given particles. So, for example, we can compute the group IDs of the particles 1000-1099 by running +This operation allows us to efficiently query the belonging of given particles. +So, for example, we can compute the group IDs of the gas particles 1000-1099 by running ``` py groupid[1000:1100].compute() diff --git a/mkdocs.yml b/mkdocs.yml index 94d31071..c35a05df 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -26,6 +26,9 @@ nav: - 'Derived fields': derived_fields.md - 'Data series': series.md - 'Large datasets': largedatasets.md + - 'Advanced Topics': + - 'Arepo Simulations': + - 'Halo Catalogs': halocatalogs.md # - 'Cookbook': # - 'Units': notebooks/cookbook/units.ipynb - 'FAQ': faq.md From 67f07ef1b1966eed2506d5438a852fce1b3827f3 Mon Sep 17 00:00:00 2001 From: Chris Byrohl <9221545+cbyrohl@users.noreply.github.com> Date: Fri, 11 Aug 2023 13:25:05 +0200 Subject: [PATCH 3/4] fix missing test/attribute for halo-assoc. --- src/scida/customs/arepo/dataset.py | 66 +++++++++++++++---- .../simulations/test_tngcatalog_operations.py | 18 +++++ 2 files changed, 71 insertions(+), 13 deletions(-) create mode 100644 tests/simulations/test_tngcatalog_operations.py diff --git a/src/scida/customs/arepo/dataset.py b/src/scida/customs/arepo/dataset.py index 99807ef3..e17c3b11 100644 --- a/src/scida/customs/arepo/dataset.py +++ b/src/scida/customs/arepo/dataset.py @@ -102,6 +102,7 @@ def __init__(self, path, chunksize="auto", catalog=None, **kwargs) -> None: self.config = {} self.parameters = {} self._grouplengths = {} + self.misc = {} # for storing misc info prfx = kwargs.pop("fileprefix", None) if prfx is None: prfx = self._get_fileprefix(path) @@ -307,15 +308,21 @@ def add_catalogIDs(self) -> None: # TODO: make these delayed objects and properly pass into (delayed?) numba functions: # https://docs.dask.org/en/stable/delayed-best-practices.html#avoid-repeatedly-putting-large-inputs-into-delayed-calls + maxint = np.iinfo(np.int64).max + self.misc["unboundID"] = maxint + # Group ID if "Group" not in self.data: # can happen for empty catalogs for key in self.data: if not (key.startswith("PartType")): continue - maxint = np.iinfo(np.int64).max uid = self.data[key]["uid"] - self.data[key]["GroupID"] = maxint * da.ones_like(uid, dtype=np.int64) - self.data[key]["SubhaloID"] = -1 * da.ones_like(uid, dtype=np.int64) + self.data[key]["GroupID"] = self.misc["unboundID"] * da.ones_like( + uid, dtype=np.int64 + ) + self.data[key]["SubhaloID"] = self.misc["unboundID"] * da.ones_like( + uid, dtype=np.int64 + ) return glen = self.data["Group"]["GroupLenType"] @@ -332,6 +339,8 @@ def add_catalogIDs(self) -> None: ) halocelloffsets = da_halocelloffsets.rechunk(-1) + index_unbound = self.misc["unboundID"] + for key in self.data: if not (key.startswith("PartType")): continue @@ -339,7 +348,9 @@ def add_catalogIDs(self) -> None: if "uid" not in self.data[key]: continue # can happen for empty containers gidx = self.data[key]["uid"] - hidx = compute_haloindex(gidx, halocelloffsets[:, num]) + hidx = compute_haloindex( + gidx, halocelloffsets[:, num], index_unbound=index_unbound + ) self.data[key]["GroupID"] = hidx # Subhalo ID @@ -401,6 +412,7 @@ def add_catalogIDs(self) -> None: grpfirstsub, grpnsubs, subhalocellcounts[:, num], + index_unbound=index_unbound, ) pdata["LocalSubhaloID"] = sidx @@ -411,6 +423,9 @@ def add_catalogIDs(self) -> None: # calculate first subhalo of each halo that a particle belongs to self.add_groupquantity_to_particles("GroupFirstSub", parttype=key) pdata["SubhaloID"] = pdata["GroupFirstSub"] + pdata["LocalSubhaloID"] + pdata["SubHaloID"] = da.where( + pdata["SubhaloID"] == index_unbound, index_unbound, pdata["SubhaloID"] + ) @computedecorator def map_halo_operation( @@ -661,7 +676,7 @@ def wrap_func_scalar( @jit(nopython=True) -def get_hidx(gidx_start, gidx_count, celloffsets): +def get_hidx(gidx_start, gidx_count, celloffsets, index_unbound=None): """Get halo index of a given cell Parameters @@ -673,9 +688,13 @@ def get_hidx(gidx_start, gidx_count, celloffsets): celloffsets : array An array holding the starting cell offset for each halo. Needs to include the offset after the last halo. The required shape is thus (Nhalo+1,). + index_unbound : integer, optional + The index to use for unbound particles. If None, the maximum integer value + of the dtype is used. """ dtype = np.int64 - index_unbound = np.iinfo(dtype).max + if index_unbound is None: + index_unbound = np.iinfo(dtype).max res = index_unbound * np.ones(gidx_count, dtype=dtype) # find initial celloffset hidx_idx = np.searchsorted(celloffsets, gidx_start, side="right") - 1 @@ -698,10 +717,12 @@ def get_hidx(gidx_start, gidx_count, celloffsets): return res -def get_hidx_daskwrap(gidx, halocelloffsets): +def get_hidx_daskwrap(gidx, halocelloffsets, index_unbound=None): gidx_start = gidx[0] gidx_count = gidx.shape[0] - return get_hidx(gidx_start, gidx_count, halocelloffsets) + return get_hidx( + gidx_start, gidx_count, halocelloffsets, index_unbound=index_unbound + ) def get_haloquantity_daskwrap(gidx, halocelloffsets, valarr): @@ -713,10 +734,14 @@ def get_haloquantity_daskwrap(gidx, halocelloffsets, valarr): return result -def compute_haloindex(gidx, halocelloffsets, *args): +def compute_haloindex(gidx, halocelloffsets, *args, index_unbound=None): """Computes the halo index for each particle with dask.""" return da.map_blocks( - get_hidx_daskwrap, gidx, halocelloffsets, meta=np.array((), dtype=np.int64) + get_hidx_daskwrap, + gidx, + halocelloffsets, + index_unbound=index_unbound, + meta=np.array((), dtype=np.int64), ) @@ -744,6 +769,7 @@ def get_localshidx( shnumber, shcounts, shcellcounts, + index_unbound=None, ): """ Get the local subhalo index for each particle. This is the subhalo index within each @@ -757,12 +783,18 @@ def get_localshidx( shnumber shcounts shcellcounts + index_unbound: integer, optional + The index to use for unbound particles. If None, the maximum integer value + of the dtype is used. Returns ------- """ - res = -1 * np.ones(gidx_count, dtype=np.int32) # fuzz has negative index. + dtype = np.int32 + if index_unbound is None: + index_unbound = np.iinfo(dtype).max + res = index_unbound * np.ones(gidx_count, dtype=dtype) # fuzz has negative index. # find initial Group we are in hidx_start_idx = np.searchsorted(celloffsets, gidx_start, side="right") - 1 @@ -831,16 +863,23 @@ def get_local_shidx_daskwrap( shnumber, shcounts, shcellcounts, + index_unbound=None, ) -> np.ndarray: gidx_start = gidx[0] gidx_count = gidx.shape[0] return get_localshidx( - gidx_start, gidx_count, halocelloffsets, shnumber, shcounts, shcellcounts + gidx_start, + gidx_count, + halocelloffsets, + shnumber, + shcounts, + shcellcounts, + index_unbound=index_unbound, ) def compute_localsubhaloindex( - gidx, halocelloffsets, shnumber, shcounts, shcellcounts + gidx, halocelloffsets, shnumber, shcounts, shcellcounts, index_unbound=None ) -> da.Array: return da.map_blocks( get_local_shidx_daskwrap, @@ -849,6 +888,7 @@ def compute_localsubhaloindex( shnumber, shcounts, shcellcounts, + index_unbound=index_unbound, meta=np.array((), dtype=np.int64), ) diff --git a/tests/simulations/test_tngcatalog_operations.py b/tests/simulations/test_tngcatalog_operations.py new file mode 100644 index 00000000..c4c4e7e4 --- /dev/null +++ b/tests/simulations/test_tngcatalog_operations.py @@ -0,0 +1,18 @@ +from testdata_properties import require_testdata_path + +from scida import load + + +@require_testdata_path("interface", only=["TNG50-4_snapshot"]) +def test_groupids_for_particles(testdatapath): + obj = load(testdatapath, units=True) + pdata = obj.data["PartType0"] + # still missing some tests + # first particles should belong to first halo and subhalo in TNG + assert pdata["GroupID"][0].compute() == 0 + assert pdata["LocalSubhaloID"][0].compute() == 0 + assert pdata["SubhaloID"][0].compute() == 0 + # last particles should be unbound + assert pdata["GroupID"][-1].compute() == obj.misc["unboundID"] + assert pdata["LocalSubhaloID"][-1].compute() == obj.misc["unboundID"] + assert pdata["SubhaloID"][-1].compute() == obj.misc["unboundID"] From 72791fb383349f609d9c61778acf0aa6968dbd2a Mon Sep 17 00:00:00 2001 From: Chris Byrohl <9221545+cbyrohl@users.noreply.github.com> Date: Fri, 11 Aug 2023 13:59:55 +0200 Subject: [PATCH 4/4] minor --- tests/simulations/test_tngcatalog_operations.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/simulations/test_tngcatalog_operations.py b/tests/simulations/test_tngcatalog_operations.py index c4c4e7e4..c9197759 100644 --- a/tests/simulations/test_tngcatalog_operations.py +++ b/tests/simulations/test_tngcatalog_operations.py @@ -1,6 +1,5 @@ -from testdata_properties import require_testdata_path - from scida import load +from tests.testdata_properties import require_testdata_path @require_testdata_path("interface", only=["TNG50-4_snapshot"])