Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug/universe loading #359

Merged
merged 5 commits into from
Sep 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 9 additions & 17 deletions sfaira/data/dataloaders/base/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,17 +1120,12 @@ def read_ontology_class_map(self, fn):
if self.cell_type_obs_key is not None:
warnings.warn(f"file {fn} does not exist but cell_type_obs_key {self.cell_type_obs_key} is given")

def project_free_to_ontology(self, attr: str, copy: bool = False):
def project_free_to_ontology(self, attr: str):
"""
Project free text cell type names to ontology based on mapping table.

ToDo: add ontology ID setting here.
ToDo: only for cell type right now, extend to other meta data in the future.

:param copy: If True, a dataframe with the celltype annotation is returned, otherwise self.adata.obs is updated
inplace.

:return:
"""
ontology_map = attr + "_map"
if hasattr(self, ontology_map):
Expand All @@ -1139,7 +1134,6 @@ def project_free_to_ontology(self, attr: str, copy: bool = False):
ontology_map = None
print(f"WARNING: did not find ontology map for {attr} which was only defined by free annotation")
adata_fields = self._adata_ids
results = {}
col_original = attr + adata_fields.onto_original_suffix
labels_original = self.adata.obs[col_original].values
if ontology_map is not None: # only if this was defined
Expand Down Expand Up @@ -1173,19 +1167,17 @@ def project_free_to_ontology(self, attr: str, copy: bool = False):
# TODO this could be changed in the future, this allows this function to be used both on cell type name
# mapping files with and without the ID in the third column.
# This mapping blocks progression in the unit test if not deactivated.
results[getattr(adata_fields, attr)] = labels_mapped
self.adata.obs[getattr(adata_fields, attr)] = labels_mapped
self.__project_ontology_ids_obs(attr=attr, map_exceptions=map_exceptions, from_id=False,
adata_ids=adata_fields)
else:
results[getattr(adata_fields, attr)] = labels_original
results[getattr(adata_fields, attr) + adata_fields.onto_id_suffix] = \
# Assumes that the original labels are the correct ontology symbols, because of a lack of ontology,
# ontology IDs cannot be inferred.
# TODO is this necessary in the future?
self.adata.obs[getattr(adata_fields, attr)] = labels_original
self.adata.obs[getattr(adata_fields, attr) + adata_fields.onto_id_suffix] = \
[adata_fields.unknown_metadata_identifier] * self.adata.n_obs
results[getattr(adata_fields, attr) + adata_fields.onto_original_suffix] = labels_original
if copy:
return pd.DataFrame(results, index=self.adata.obs.index)
else:
for k, v in results.items():
self.adata.obs[k] = v
self.adata.obs[getattr(adata_fields, attr) + adata_fields.onto_original_suffix] = labels_original

def __impute_ontology_cols_obs(
self,
Expand Down Expand Up @@ -1238,7 +1230,7 @@ def __impute_ontology_cols_obs(
# Original annotation (free text):
original_present = col_original in self.adata.obs.columns
if original_present and not symbol_present and not id_present: # 1)
self.project_free_to_ontology(attr=attr, copy=False)
self.project_free_to_ontology(attr=attr)
if symbol_present or id_present: # 2)
if symbol_present and not id_present: # 2a)
self.__project_ontology_ids_obs(attr=attr, from_id=False, adata_ids=adata_ids)
Expand Down
2 changes: 1 addition & 1 deletion sfaira/data/dataloaders/base/dataset_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,7 @@ def __init__(
elif package_source == "sfaira_extension":
package_source = "sfairae"
else:
raise ValueError(f"invalid package source {package_source} for {self._cwd}, {self.collection_id}")
raise ValueError(f"invalid package source {package_source} for {self._cwd}")
except IndexError as e:
raise IndexError(f"{e} for {self._cwd}")
loader_pydoc_path_sfaira = "sfaira.data.dataloaders.loaders."
Expand Down
15 changes: 9 additions & 6 deletions sfaira/data/dataloaders/loaders/super_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,15 @@ def __init__(
if f[:len(dir_prefix)] == dir_prefix and f not in dir_exclude: # Narrow down to data set directories
path_dsg = str(pydoc.locate(f"sfaira.data.dataloaders.loaders.{f}.FILE_PATH"))
if path_dsg is not None:
dataset_groups.append(DatasetGroupDirectoryOriented(
file_base=path_dsg,
data_path=data_path,
meta_path=meta_path,
cache_path=cache_path
))
try:
dataset_groups.append(DatasetGroupDirectoryOriented(
file_base=path_dsg,
data_path=data_path,
meta_path=meta_path,
cache_path=cache_path
))
except IndexError as e:
raise IndexError(f"{e} for '{cwd}', '{f}', '{path_dsg}'")
else:
warn(f"DatasetGroupDirectoryOriented was None for {f}")
super().__init__(dataset_groups=dataset_groups)
26 changes: 26 additions & 0 deletions sfaira/data/utils_scripts/survey_obs_annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import numpy as np
import sfaira
import sys

# Survey the cell-wise annotation (.obs) of every dataset in the sfaira universe.
#
# Usage: python survey_obs_annotation.py <data_path> <path_meta> <path_cache>
#
# For each dataset the script calls load(), prints up to MAX_VALUES_SHOWN sorted unique values of
# every .obs column, and then calls clear() on the dataset before moving on to the next one.

# Set global variables.
print("sys.argv", sys.argv)

# Fail with a usage hint instead of a bare IndexError if a path argument is missing.
if len(sys.argv) < 4:
    sys.exit(f"usage: {sys.argv[0]} <data_path> <path_meta> <path_cache>")

# sys.argv entries are already str, so no conversion is needed.
data_path, path_meta, path_cache = sys.argv[1:4]

# Maximum number of distinct values printed per .obs column.
MAX_VALUES_SHOWN = 20

universe = sfaira.data.dataloaders.Universe(
    data_path=data_path, meta_path=path_meta, cache_path=path_cache
)
for k, v in universe.datasets.items():
    print(k)
    v.load(
        load_raw=False,
        allow_caching=True,
    )
    for col in v.adata.obs.columns:
        # np.unique already returns its result sorted, so no additional sort is required; the
        # slice is a no-op for columns with at most MAX_VALUES_SHOWN distinct values.
        val = np.unique(v.adata.obs[col].values)[:MAX_VALUES_SHOWN]
        print(f"{k}: {col}: {val}")
    # NOTE(review): presumably clear() releases the loaded data so memory stays bounded across
    # datasets - confirm against the dataset class.
    v.clear()