diff --git a/.bandit.yml b/.bandit.yml index 21917d362..48945b021 100644 --- a/.bandit.yml +++ b/.bandit.yml @@ -4,4 +4,4 @@ tests: [] # (optional) list skipped tests here: -skips: ['B101', 'B403', 'B404', 'B603', 'B607', 'B301', 'B303', 'B311', 'B310', 'B506'] +skips: ['B101', 'B403', 'B404', 'B603', 'B607', 'B301', 'B303', 'B311', 'B310', 'B506', 'B321', 'B402'] diff --git a/requirements.txt b/requirements.txt index e6d8dad87..830c646c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ anndata>=0.7.6 crossref_commons -cellxgene-schema>=2.0.3 dask docutils fuzzywuzzy @@ -21,7 +20,7 @@ PyYAML scanpy>=1.7.0 scipy>=1.2.1 seaborn -tensorflow>=2.0.0 # TODO remove as soon as # 70 is solved +tensorflow # TODO remove as soon as # 70 is solved tqdm requests versioneer diff --git a/sfaira/__init__.py b/sfaira/__init__.py index 6a84da418..117908b05 100644 --- a/sfaira/__init__.py +++ b/sfaira/__init__.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """A Data and Model Zoo for Single-Cell Genomics.""" +from ._settings import settings import sfaira.consts import sfaira.data import sfaira.genomes @@ -22,7 +23,7 @@ "Lukas Heumos" ]) __email__ = ', '.join([ - "leander.dony@helmholtz-muenchen.de", - "david.fischer@helmholtz-muenchen.de", - "lukas.heumos@helmholtz-muenchen.de" + "leander.dony@helmholtz-munich.de", + "david.fischer@helmholtz-munich.de", + "lukas.heumos@helmholtz-munich.de" ]) diff --git a/sfaira/_settings.py b/sfaira/_settings.py new file mode 100644 index 000000000..a7a00984d --- /dev/null +++ b/sfaira/_settings.py @@ -0,0 +1,77 @@ +""" +Settings class which, for example, holds paths to cache directories used throughout the code. +""" + +import os + + +SFAIRA_REPO_URL = "https://zenodo.org/record/4836517/files/" + + +class SfairaConfig: + """\ + Config manager for sfaira. + """ + + def __init__(self): + self.sfaira_repo_url = SFAIRA_REPO_URL + self._cachedir_base = os.path.join(os.path.expanduser("~"), ".cache", "sfaira") + self._cachedir_databases = os.path.join(self._cachedir_base, "dataset_meta") + self._cachedir_databases_cellxgene = os.path.join(self._cachedir_databases, "cellxgene") + self._cachedir_genomes = os.path.join(self._cachedir_base, "genomes") + self._cachedir_ontologies = os.path.join(self._cachedir_base, "ontologies") + + @property + def cachedir_base(self) -> str: + os.makedirs(self._cachedir_base, exist_ok=True) + return self._cachedir_base + + @cachedir_base.setter + def cachedir_base(self, cachedir_base): + if not isinstance(cachedir_base, str): + raise ValueError(f"cachedir_base needs to be provided as a string, was {type(cachedir_base)}") + if cachedir_base == "repo": + cachedir_base = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "cache") + self._cachedir_base = cachedir_base + + @property + def cachedir_databases(self) -> str: + os.makedirs(self._cachedir_databases, exist_ok=True) + return self._cachedir_databases + + @cachedir_databases.setter + def cachedir_databases(self, cachedir_databases): + raise ValueError("cachedir_databases cannot be set manually as it is defined as a subdirectory of" + " cachedir_base.
please modify cachedir_base instead") + + @property + def cachedir_databases_cellxgene(self) -> str: + os.makedirs(self._cachedir_databases_cellxgene, exist_ok=True) + return self._cachedir_databases_cellxgene + + @cachedir_databases_cellxgene.setter + def cachedir_databases_cellxgene(self, cachedir_databases_cellxgene): + raise ValueError("cachedir_databases_cellxgene cannot be set manually as it is defined as a subdirectory" + " of cachedir_base. please modify cachedir_base instead") + + @property + def cachedir_genomes(self) -> str: + os.makedirs(self._cachedir_genomes, exist_ok=True) + return self._cachedir_genomes + + @cachedir_genomes.setter + def cachedir_genomes(self, cachedir_genomes): + raise ValueError("cachedir_genomes cannot be set manually as it is defined as a subdirectory of cachedir_base." + " please modify cachedir_base instead") + + @property + def cachedir_ontologies(self) -> str: + os.makedirs(self._cachedir_ontologies, exist_ok=True) + return self._cachedir_ontologies + + @cachedir_ontologies.setter + def cachedir_ontologies(self, cachedir_ontologies): + raise ValueError("cachedir_ontologies cannot be set manually as it is defined as a subdirectory of cachedir_base. please modify cachedir_base instead") + + +settings = SfairaConfig() diff --git a/sfaira/consts/__init__.py b/sfaira/consts/__init__.py index ffcc7740f..2ecf519ba 100644 --- a/sfaira/consts/__init__.py +++ b/sfaira/consts/__init__.py @@ -1,7 +1,6 @@ from sfaira.consts.adata_fields import AdataIds, AdataIdsSfaira, AdataIdsCellxgene, AdataIdsCellxgene_v2_0_0 -from sfaira.consts.directories import CACHE_DIR, SFAIRA_REPO_URL from sfaira.consts.meta_data_files import META_DATA_FIELDS -from sfaira.consts.ontologies import OntologyContainerSfaira +from sfaira.consts.ontologies import OntologyContainerSfaira, OTHER_ORGANISM_KEY from sfaira.consts.utils import clean_cache OCS = OntologyContainerSfaira() diff --git a/sfaira/consts/adata_fields.py b/sfaira/consts/adata_fields.py index 5409ce45f..3313b3df8 100644 --- a/sfaira/consts/adata_fields.py +++ b/sfaira/consts/adata_fields.py @@ -89,9 +89,7 @@ def __init__(self): self.onto_original_suffix = "_original" self.feature_kwargs = { - "match_to_reference": { - "human": "Homo_sapiens.GRCh38.104", - "mouse": "Mus_musculus.GRCm39.104"}, + "match_to_release": "104", "remove_gene_version": True, "subset_genes_to_type": None} @@ -305,6 +303,4 @@ class AdataIdsCellxgene_v2_0_0(AdataIdsCellxgene): def __init__(self): super(AdataIdsCellxgene_v2_0_0, self).__init__() - self.feature_kwargs["match_to_reference"] = { - "human": "Homo_sapiens.GRCh38.104", - "mouse": "Mus_musculus.GRCm39.104"} + self.feature_kwargs["match_to_release"] = "104"
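# Usage sketch for the settings object introduced above; the target path is hypothetical:
import sfaira

sfaira.settings.cachedir_base = "/data/sfaira_cache"  # relocates all sfaira caches
print(sfaira.settings.cachedir_genomes)  # /data/sfaira_cache/genomes, created lazily on first access
sfaira.settings.cachedir_base = "repo"   # special value restoring the old in-repository cache location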
-""" - -import os - -CACHE_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "cache") - -CACHE_DIR_DATABASES = os.path.join(CACHE_DIR, "dataset_meta") -CACHE_DIR_DATABASES_CELLXGENE = os.path.join(CACHE_DIR_DATABASES, "cellxgene") - -CACHE_DIR_GENOMES = os.path.join(CACHE_DIR, "genomes") - -CACHE_DIR_ONTOLOGIES = os.path.join(CACHE_DIR, "ontologies") - -SFAIRA_REPO_URL = "https://zenodo.org/record/4836517/files/" diff --git a/sfaira/consts/ontologies.py b/sfaira/consts/ontologies.py index d2a92c80a..d8c31ab6f 100644 --- a/sfaira/consts/ontologies.py +++ b/sfaira/consts/ontologies.py @@ -2,13 +2,16 @@ from sfaira.versions.metadata import OntologyList, OntologyCl from sfaira.versions.metadata import OntologyCellosaurus, OntologyHancestro, OntologyHsapdv, OntologyMondo, \ - OntologyMmusdv, OntologyEfo, OntologySex, OntologyUberon + OntologyMmusdv, OntologyEfo, OntologySex, OntologyTaxon, OntologyUberon, OntologyUberonLifecyclestage + +OTHER_ORGANISM_KEY = "other" DEFAULT_CL = "v2021-08-10" DEFAULT_HSAPDV = "master" DEFAULT_MONDO = "v2021-08-11" DEFAULT_MMUSDV = "master" DEFAULT_PATO = "v2021-08-06" +DEFAULT_NCBITAXON = "v2021-06-10" DEFAULT_UBERON = "v2021-07-27" @@ -26,9 +29,12 @@ class OntologyContainerSfaira: _development_stage: Union[None, Dict[str, Union[OntologyHsapdv, OntologyMmusdv]]] _ethnicity: Union[None, Dict[str, Union[OntologyHancestro, None]]] _organ: Union[None, OntologyUberon] + _organism: Union[None, OntologyTaxon] _sex: Union[None, OntologySex] def __init__(self): + self.key_other = OTHER_ORGANISM_KEY + self.annotated = OntologyList(terms=[True, False]) self.author = None self.assay_differentiation = None @@ -50,7 +56,7 @@ def __init__(self): self.individual = None self.normalization = None self._organ = None - self.organism = OntologyList(terms=["mouse", "human"]) # TODO introduce NCBItaxon here + self._organism = None self.primary_data = OntologyList(terms=[True, False]) self.sample_source = OntologyList(terms=["primary_tissue", "2d_culture", "3d_culture", "tumor"]) self._sex = None @@ -60,6 +66,12 @@ def __init__(self): self.year = OntologyList(terms=list(range(2000, 3000))) def reload_ontology(self, attr): + """ + Complex alternative to attribute-wise setters. + + :param attr: + :return: + """ kwargs = {"recache": True} if attr == "assay_sc": self._assay_sc = OntologyEfo(**kwargs) @@ -69,37 +81,40 @@ def reload_ontology(self, attr): self._cell_type = OntologyCl(branch=DEFAULT_CL, **kwargs) elif attr == "development_stage": self._development_stage = { - "human": OntologyHsapdv(**kwargs), - "mouse": OntologyMmusdv(**kwargs), + "homosapiens": OntologyHsapdv(**kwargs), + "musmusculus": OntologyMmusdv(**kwargs), + self.key_other: OntologyUberonLifecyclestage(branch=DEFAULT_UBERON, **kwargs), } elif attr == "disease": self._disease = OntologyMondo(**kwargs) elif attr == "ethnicity": self._ethnicity = { - "human": OntologyHancestro(), - "mouse": None, + "homosapiens": OntologyHancestro(), + self.key_other: None, } elif attr == "organ": self._organ = OntologyUberon(**kwargs) + elif attr == "organism": + self._organism = OntologyTaxon(**kwargs) elif attr == "sex": self._sex = OntologySex(**kwargs) return self._assay_sc @property def assay_sc(self): - if self._assay_sc is None: + if self._assay_sc is None: # Lazy loading after class instantiation. self._assay_sc = OntologyEfo() return self._assay_sc @property def cell_line(self): - if self._cell_line is None: + if self._cell_line is None: # Lazy loading after class instantiation. 
self._cell_line = OntologyCellosaurus() return self._cell_line @property def cell_type(self): - if self._cell_type is None: + if self._cell_type is None: # Lazy loading after class instantiation. self._cell_type = OntologyCl(branch=DEFAULT_CL) return self._cell_type @@ -109,36 +124,43 @@ def cell_type(self, x: str): @property def development_stage(self): - if self._development_stage is None: + if self._development_stage is None: # Lazy loading after class instantiation. self._development_stage = { - "human": OntologyHsapdv(branch=DEFAULT_HSAPDV), - "mouse": OntologyMmusdv(branch=DEFAULT_MMUSDV), + "Homo sapiens": OntologyHsapdv(branch=DEFAULT_HSAPDV), + "Mus musculus": OntologyMmusdv(branch=DEFAULT_MMUSDV), + self.key_other: OntologyUberonLifecyclestage(branch=DEFAULT_UBERON), } return self._development_stage @property def disease(self): - if self._disease is None: + if self._disease is None: # Lazy loading after class instantiation. self._disease = OntologyMondo(branch=DEFAULT_MONDO) return self._disease @property def ethnicity(self): - if self._ethnicity is None: + if self._ethnicity is None: # Lazy loading after class instantiation. self._ethnicity = { - "human": OntologyHancestro(), - "mouse": None, + "Homo sapiens": OntologyHancestro(), + self.key_other: None, } return self._ethnicity @property def organ(self): - if self._organ is None: + if self._organ is None: # Lazy loading after class instantiation. self._organ = OntologyUberon(branch=DEFAULT_UBERON) return self._organ + @property + def organism(self): + if self._organism is None: # Lazy loading after class instantiation. + self._organism = OntologyTaxon(branch=DEFAULT_NCBITAXON) + return self._organism + @property def sex(self): - if self._sex is None: + if self._sex is None: # Lazy loading after class instantiation.
self._sex = OntologySex(branch=DEFAULT_PATO) return self._sex diff --git a/sfaira/consts/utils.py b/sfaira/consts/utils.py index 7a304a92a..67379ca6b 100644 --- a/sfaira/consts/utils.py +++ b/sfaira/consts/utils.py @@ -2,7 +2,7 @@ import shutil from typing import Union -from sfaira.consts.directories import CACHE_DIR, CACHE_DIR_DATABASES, CACHE_DIR_GENOMES, CACHE_DIR_ONTOLOGIES +from sfaira import settings def clean_cache(cache: Union[None, str] = None): @@ -13,10 +13,10 @@ def clean_cache(cache: Union[None, str] = None): """ if cache is not None: cache_dir_dict = { - "all": CACHE_DIR, - "dataset_meta": CACHE_DIR_DATABASES, - "genomes": CACHE_DIR_GENOMES, - "ontologies": CACHE_DIR_ONTOLOGIES, + "all": settings.cachedir_base, + "dataset_meta": settings.cachedir_databases, + "genomes": settings.cachedir_genomes, + "ontologies": settings.cachedir_ontologies, } if cache not in cache_dir_dict.keys(): raise ValueError(f"Did not find cache directory input {cache} in support list: " diff --git a/sfaira/data/dataloaders/base/dataset.py b/sfaira/data/dataloaders/base/dataset.py index dda603d66..d03df3cbf 100644 --- a/sfaira/data/dataloaders/base/dataset.py +++ b/sfaira/data/dataloaders/base/dataset.py @@ -39,7 +39,6 @@ class DatasetBase(abc.ABC): adata: Union[None, anndata.AnnData] - class_maps: dict _meta: Union[None, pandas.DataFrame] data_dir_base: Union[None, str] meta_path: Union[None, str] @@ -78,11 +77,11 @@ class DatasetBase(abc.ABC): _bio_sample: Union[None, str] _year: Union[None, int] + _bio_sample_obs_key: Union[None, str] + _tech_sample_obs_key: Union[None, str] assay_sc_obs_key: Union[None, str] assay_differentiation_obs_key: Union[None, str] assay_type_differentiation_obs_key: Union[None, str] - assay_cell_line_obs_key: Union[None, str] - bio_sample_obs_key: Union[None, str] cell_type_obs_key: Union[None, str] development_stage_obs_key: Union[None, str] disease_obs_key: Union[None, str] @@ -93,7 +92,6 @@ class DatasetBase(abc.ABC): sample_source_obs_key: Union[None, str] sex_obs_key: Union[None, str] state_exact_obs_key: Union[None, str] - tech_sample_obs_key: Union[None, str] gene_id_symbols_var_key: Union[None, str] gene_id_ensembl_var_key: Union[None, str] @@ -205,8 +203,6 @@ def __init__( self.gene_id_symbols_var_key = None self.gene_id_ensembl_var_key = None - self.class_maps = {"0": {}} - self._celltype_universe = None self._ontology_class_map = None @@ -510,7 +506,7 @@ def collapse_counts(self): def streamline_features( self, - match_to_reference: Union[str, Dict[str, str], None] = None, + match_to_release: Union[str, Dict[str, str], None] = None, remove_gene_version: bool = True, subset_genes_to_type: Union[None, str, List[str]] = None, schema: Union[str, None] = None, ): """ This also adds missing ensid or gene symbol columns if match_to_reference is not set to False and removes all adata.var columns that are not defined as gene_id_ensembl_var_key or gene_id_symbol_var_key in the dataloader. - :param match_to_reference: Which annotation to map the feature space to. Can be: - - str: Provide the name of the annotation in the format Organism.Assembly.Release - - dict: Mapping of organism to name of the annotation (see str format). Chooses annotation for each + :param match_to_release: Which genome annotation release to map the feature space to. Note that assemblies from + ensembl are usually named as Organism.Assembly.Release; this is the Release string. Can be: + - str: Provide the name of the release.
+ - dict: Mapping of organism to name of the release (see str format). Chooses release for each + data set based on organism annotation. :param remove_gene_version: Whether to remove the version number after the colon sometimes found in ensembl gene ids. :param subset_genes_to_type: Type(s) to subset to. Can be a single type or a list of types or None. Types can be: - None: All genes in assembly. - "protein_coding": All protein coding genes in assembly. """ if schema is not None: # schema_version = schema.split(":")[-1] if ":" in schema else None schema_type = schema.split(":")[0] if schema_type == "cellxgene": adata_target_ids = AdataIdsCellxgene_v2_0_0() else: raise ValueError(f"did not recognize schema {schema}") - match_to_reference = adata_target_ids.feature_kwargs["match_to_reference"] + match_to_release = adata_target_ids.feature_kwargs["match_to_release"] remove_gene_version = adata_target_ids.feature_kwargs["remove_gene_version"] subset_genes_to_type = adata_target_ids.feature_kwargs["subset_genes_to_type"] # Set genome container if mapping of gene labels is requested - if isinstance(match_to_reference, dict): - match_to_reference = match_to_reference[self.organism] - self._set_genome(assembly=match_to_reference) - self.mapped_features = self.genome_container.assembly + if isinstance(match_to_release, dict): + match_to_release = match_to_release[self.organism] + self._set_genome(organism=self.organism, release=match_to_release) + self.mapped_features = self.genome_container.release self.remove_gene_version = remove_gene_version self.subset_gene_type = subset_genes_to_type # Streamline feature space: - self._add_missing_featurenames(match_to_reference=match_to_reference) + self._add_missing_featurenames(match_to_reference=match_to_release) for key in [self.gene_id_ensembl_var_key, self.gene_id_symbols_var_key]: # Make features unique (to avoid na-matches in converted columns to be collapsed by # _collapse_ensembl_gene_id_versions() below. @@ -640,7 +637,7 @@ def streamline_features( var=var_new, uns=self.adata.uns ) - self.adata.uns[self._adata_ids.mapped_features] = match_to_reference + self.adata.uns[self._adata_ids.mapped_features] = match_to_release
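# Usage sketch for the renamed argument above; the dataset instance `ds` is hypothetical.
# match_to_release takes an ensembl release string, or a dict resolved via ds.organism:
ds.streamline_features(match_to_release="104", remove_gene_version=True, subset_genes_to_type="protein_coding")
ds.streamline_features(match_to_release={"Homo sapiens": "104", "Mus musculus": "104"})
ds.streamline_features(schema="cellxgene")  # schema defaults override the arguments, here release "104"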
def streamline_metadata( self, @@ -759,9 +756,9 @@ def streamline_metadata( if hasattr(self, k) and getattr(self, k) is not None: val = getattr(self, k) elif hasattr(self, f"{k}_obs_key") and getattr(self, f"{k}_obs_key") is not None: - val = np.sort(self.adata.obs[getattr(self, f"{k}_obs_key")].unique().astype(str)).tolist() + val = self.adata.obs[getattr(self, f"{k}_obs_key")].unique().astype(str).tolist() elif getattr(self._adata_ids, k) in self.adata.obs.columns: - val = np.sort(self.adata.obs[getattr(self._adata_ids, k)].unique().astype(str)).tolist() + val = self.adata.obs[getattr(self._adata_ids, k)].unique().astype(str).tolist() else: val = None while hasattr(val, '__len__') and not isinstance(val, str) and len(val) == 1: # Unpack nested lists/tuples. val = val[0] @@ -794,8 +791,7 @@ # * original labels: "attr" + self._adata_ids.onto_original_suffix obs_new = pd.DataFrame(index=self.adata.obs.index) for k in [x for x in adata_target_ids.obs_keys]: - if k in experiment_batch_labels and getattr(self, f"{k}_obs_key") is not None and \ - "*" in getattr(self, f"{k}_obs_key"): + if k in experiment_batch_labels and getattr(self, f"{k}_obs_key") is not None: # Handle batch-annotation columns which can be provided as a combination of columns separated by an # asterisk. # The queried meta data are always: # in .obs set. print(f"WARNING: attribute {batch_col} of data set {self.id} was not found in columns.") # Build a combination label out of all columns used to describe this group. + # Add data set label into this label so that these groups are unique across data sets. val = [ - "_".join([str(xxx) for xxx in xx]) + self.doi_cleaned_id + "_".join([str(xxx) for xxx in xx]) for xx in zip(*[self.adata.obs[batch_col].values.tolist() for batch_col in batch_cols]) ] else: @@ -969,89 +966,25 @@ def write_distributed_store( else: raise ValueError() - def write_backed( - self, - adata_backed: anndata.AnnData, - genome: str, - idx: np.ndarray, - load_raw: bool = False, - allow_caching: bool = True - ): - """ - Loads data set into slice of backed anndata object. - - Note: scatter updates to backed sparse arrays are not yet supported by anndata. Accordingly, we need to work - around below using .append() of the backed matrix. - - :param adata_backed: - :param genome: Genome name to use as refernce. - :param idx: Indices in adata_backed to write observations to. This can be used to immediately create a - shuffled object. - :param load_raw: See .load(). - :param allow_caching: See .load(). - :return: New row index for next element to be written into backed anndata. - """ - self.load(load_raw=load_raw, allow_caching=allow_caching) - # Check if writing to sparse or dense matrix: - if isinstance(adata_backed.X, np.ndarray) or \ - isinstance(adata_backed.X, h5py._hl.dataset.Dataset): # backed dense - if isinstance(self.adata.X, scipy.sparse.csr_matrix) or \ - isinstance(self.adata.X, scipy.sparse.csc_matrix) or \ - isinstance(self.adata.X, scipy.sparse.lil_matrix): - # map to dense array - x_new = self.adata.X.toarray() - else: - x_new = self.adata.X - - adata_backed.X[np.sort(idx), :] = x_new[np.argsort(idx), :] - for k in adata_backed.obs.columns: - if k == self._adata_ids.dataset: - adata_backed.obs.loc[np.sort(idx), self._adata_ids.dataset] = [ - self.id for _ in range(len(idx))] - elif k in self.adata.obs.columns: - adata_backed.obs.loc[np.sort(idx), k] = self.adata.obs[k].values[np.argsort(idx)] - elif k in list(self.adata.uns.keys()): - adata_backed.obs.loc[np.sort(idx), k] = [self.adata.uns[k] for i in range(len(idx))] - else: - # Need to fill this instead of throwing an exception as this condition can trigger for one element - # within a loop over multiple data sets (ie in data set human). - adata_backed.obs.loc[idx, k] = ["key_not_found" for i in range(len(idx))] - elif isinstance(adata_backed.X, anndata._core.sparse_dataset.SparseDataset): # backed sparse - # cannot scatter update on backed sparse yet! assert that updated block is meant to be appended: - assert np.all(idx == np.arange(adata_backed.shape[0], adata_backed.shape[0] + len(idx))) - if not isinstance(self.adata.X, scipy.sparse.csr_matrix): - x_new = self.adata.X.tocsr() - else: - x_new = self.adata.X - adata_backed.X.append(x_new[np.argsort(idx)]) - adata_backed._n_obs = adata_backed.X.shape[0] # not automatically updated after append - adata_backed.obs = adata_backed.obs.append( # .obs was not broadcasted to the right shape!
- pandas.DataFrame(dict([ - (k, [self.id for i in range(len(idx))]) if k == self._adata_ids.dataset - else (k, self.adata.obs[k].values[np.argsort(idx)]) if k in self.adata.obs.columns - else (k, [self.adata.uns[k] for _ in range(len(idx))]) if k in list(self.adata.uns.keys()) - else (k, ["key_not_found" for _ in range(len(idx))]) - for k in adata_backed.obs.columns - ])) - ) - self.clear() - else: - raise ValueError(f"Did not recognize backed AnnData.X format {type(adata_backed.X)}") - - def _set_genome(self, assembly: Union[str, None]): - self.genome_container = GenomeContainer( - assembly=assembly, - ) + def _set_genome(self, organism: str, release: str): + self.genome_container = GenomeContainer(organism=organism, release=release) @property def doi_cleaned_id(self): return "_".join(self.id.split("_")[:-1]) - def get_ontology(self, k) -> OntologyHierarchical: + def get_ontology(self, k) -> Union[OntologyHierarchical, None]: x = getattr(self.ontology_container_sfaira, k) if hasattr(self.ontology_container_sfaira, k) else None - if isinstance(x, dict): - assert isinstance(self.organism, str) - x = x[self.organism] + if x is not None and isinstance(x, dict): + assert isinstance(self.organism, str), self.organism + # Check if organism-specific option is available, otherwise choose generic option: + if self.organism in x.keys(): + k = self.organism + else: + k = self.ontology_container_sfaira.key_other + assert k in x.keys(), x.keys() # Sanity check on dictionary keys. + x = x[k] + assert x is None or isinstance(x, Ontology), x # Sanity check on dictionary element. return x @property @@ -1166,7 +1099,7 @@ def project_free_to_ontology(self, attr: str): # This protection blocks progression in the unit test if not deactivated. self._value_protection( attr=attr, - allowed=getattr(self.ontology_container_sfaira, attr), + allowed=self.get_ontology(k=attr), attempted=[x for x in list(set(labels_mapped)) if x not in map_exceptions], ) # Add cell type IDs into object: @@ -1450,7 +1383,7 @@ def assay_sc(self) -> Union[None, str]: @assay_sc.setter def assay_sc(self, x: str): - x = self._value_protection(attr="assay_sc", allowed=self.ontology_container_sfaira.assay_sc, attempted=x) + x = self._value_protection(attr="assay_sc", allowed=self.get_ontology(k="assay_sc"), attempted=x) self._assay_sc = x @property @@ -1468,7 +1401,7 @@ def assay_differentiation(self) -> Union[None, str]: @assay_differentiation.setter def assay_differentiation(self, x: str): x = self._value_protection(attr="assay_differentiation", - allowed=self.ontology_container_sfaira.assay_differentiation, attempted=x) + allowed=self.get_ontology(k="assay_differentiation"), attempted=x) self._assay_differentiation = x @property @@ -1486,25 +1419,59 @@ def assay_type_differentiation(self) -> Union[None, str]: @assay_type_differentiation.setter def assay_type_differentiation(self, x: str): x = self._value_protection(attr="assay_type_differentiation", - allowed=self.ontology_container_sfaira.assay_type_differentiation, attempted=x) + allowed=self.get_ontology(k="assay_type_differentiation"), attempted=x) self._assay_type_differentiation = x @property def bio_sample(self) -> Union[None, str]: - if self._bio_sample is not None: + if self._bio_sample is not None and self._bio_sample != self._adata_ids.unknown_metadata_identifier: return self._bio_sample else: - if self.meta is None: - self.load_meta(fn=None) - if self.meta is not None and self._adata_ids.bio_sample in self.meta.columns: - return self.meta[self._adata_ids.bio_sample] - else: -
return None + # Define biological sample automatically. + bio_sample = "*".join([x for x in [ + self.assay_sc, + self.assay_differentiation, + self.assay_type_differentiation, + self.development_stage, + self.disease, + self.ethnicity, + self.individual, + self.organ, + self.organism, + self.sex, + ] if x is not None]) + return bio_sample if bio_sample != "" else None @bio_sample.setter def bio_sample(self, x: str): self._bio_sample = x + @property + def bio_sample_obs_key(self) -> Union[None, str]: + if self._bio_sample_obs_key is not None and \ + self._bio_sample_obs_key != self._adata_ids.unknown_metadata_identifier: + return self._bio_sample_obs_key + else: + # Define biological sample automatically. + bio_sample_obs_key = "*".join([x for x in [ + self.assay_sc_obs_key, + self.assay_differentiation_obs_key, + self.assay_type_differentiation_obs_key, + self.development_stage_obs_key, + self.disease_obs_key, + self.ethnicity_obs_key, + self.individual_obs_key, + self.organ_obs_key, + self.organism_obs_key, + self.sex_obs_key, + self.state_exact_obs_key, + ] if x is not None]) + return bio_sample_obs_key if bio_sample_obs_key != "" else None + + @bio_sample_obs_key.setter + def bio_sample_obs_key(self, x: str): + self._bio_sample_obs_key = x + @property def cell_line(self) -> Union[None, str]: if self._cell_line is not None: @@ -1573,7 +1540,7 @@ def default_embedding(self) -> Union[None, str]: @default_embedding.setter def default_embedding(self, x: str): - x = self._value_protection(attr="default_embedding", allowed=self.ontology_container_sfaira.default_embedding, + x = self._value_protection(attr="default_embedding", allowed=self.get_ontology(k="default_embedding"), attempted=x) self._default_embedding = x @@ -1592,7 +1559,7 @@ def development_stage(self) -> Union[None, str]: @development_stage.setter def development_stage(self, x: str): x = self._value_protection(attr="development_stage", - allowed=self.ontology_container_sfaira.development_stage[self.organism], + allowed=self.get_ontology(k="development_stage"), attempted=x) self._development_stage = x @@ -1610,7 +1577,7 @@ def disease(self) -> Union[None, str]: @disease.setter def disease(self, x: str): - x = self._value_protection(attr="disease", allowed=self.ontology_container_sfaira.disease, + x = self._value_protection(attr="disease", allowed=self.get_ontology(k="disease"), attempted=x) self._disease = x @@ -1733,7 +1700,7 @@ def ethnicity(self) -> Union[None, str]: @ethnicity.setter def ethnicity(self, x: str): - x = self._value_protection(attr="ethnicity", allowed=self.ontology_container_sfaira.ethnicity[self.organism], + x = self._value_protection(attr="ethnicity", allowed=self.get_ontology(k="ethnicity"), attempted=x) self._ethnicity = x @@ -1819,7 +1786,7 @@ def normalization(self) -> Union[None, str]: @normalization.setter def normalization(self, x: str): - x = self._value_protection(attr="normalization", allowed=self.ontology_container_sfaira.normalization, + x = self._value_protection(attr="normalization", allowed=self.get_ontology(k="normalization"), attempted=x) self._normalization = x @@ -1837,7 +1804,7 @@ def primary_data(self) -> Union[None, bool]: @primary_data.setter def primary_data(self, x: bool): - x = self._value_protection(attr="primary_data", allowed=self.ontology_container_sfaira.primary_data, + x = self._value_protection(attr="primary_data", allowed=self.get_ontology(k="primary_data"), attempted=x) self._primary_data = x @@ -1855,7 +1822,7 @@ def organ(self) -> Union[None, str]: @organ.setter def
organ(self, x: str): - x = self._value_protection(attr="organ", allowed=self.ontology_container_sfaira.organ, attempted=x) + x = self._value_protection(attr="organ", allowed=self.get_ontology(k="organ"), attempted=x) self._organ = x @property @@ -1872,7 +1839,7 @@ def organism(self) -> Union[None, str]: @organism.setter def organism(self, x: str): - x = self._value_protection(attr="organism", allowed=self.ontology_container_sfaira.organism, attempted=x) + x = self._value_protection(attr="organism", allowed=self.get_ontology(k="organism"), attempted=x) # Update ontology container so that correct ontologies are queried: self.ontology_container_sfaira.organism_cache = x self._organism = x @@ -1891,7 +1858,7 @@ def sample_source(self) -> Union[None, str]: @sample_source.setter def sample_source(self, x: str): - x = self._value_protection(attr="sample_source", allowed=self.ontology_container_sfaira.sample_source, + x = self._value_protection(attr="sample_source", allowed=self.get_ontology(k="sample_source"), attempted=x) self._sample_source = x @@ -1909,7 +1876,7 @@ def sex(self) -> Union[None, str]: @sex.setter def sex(self, x: str): - x = self._value_protection(attr="sex", allowed=self.ontology_container_sfaira.sex, attempted=x) + x = self._value_protection(attr="sex", allowed=self.get_ontology(k="sex"), attempted=x) self._sex = x @property @@ -1938,20 +1905,29 @@ def state_exact(self, x: str): @property def tech_sample(self) -> Union[None, str]: - if self._tech_sample is not None: + if self._tech_sample is not None and self._tech_sample != self._adata_ids.unknown_metadata_identifier: return self._tech_sample else: - if self.meta is None: - self.load_meta(fn=None) - if self.meta is not None and self._adata_ids.tech_sample in self.meta.columns: - return self.meta[self._adata_ids.tech_sample] - else: - return None + # Define technical batch automatically as biological sample. + return self.bio_sample @tech_sample.setter def tech_sample(self, x: str): self._tech_sample = x + @property + def tech_sample_obs_key(self) -> Union[None, str]: + if self._tech_sample_obs_key is not None and \ + self._tech_sample_obs_key != self._adata_ids.unknown_metadata_identifier: + return self._tech_sample_obs_key + else: + # Define technical batch automatically as biological sample. + return self.bio_sample_obs_key + + @tech_sample_obs_key.setter + def tech_sample_obs_key(self, x: str): + self._tech_sample_obs_key = x + @property def year(self) -> Union[None, int]: if self._year is not None: @@ -1966,7 +1942,7 @@ def year(self) -> Union[None, int]: @year.setter def year(self, x: int): - x = self._value_protection(attr="year", allowed=self.ontology_container_sfaira.year, attempted=x) + x = self._value_protection(attr="year", allowed=self.get_ontology(k="year"), attempted=x) self._year = x @property
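# Sketch of the new automatic batch keys defined above; `ds` and its column names are hypothetical.
# Unset bio_sample/tech_sample obs keys are composed from all annotated *_obs_key attributes,
# joined by "*", the same separator that streamline_metadata splits batch columns on:
ds.assay_sc_obs_key = "assay"
ds.individual_obs_key = "donor"
print(ds.bio_sample_obs_key)   # "assay*donor"
print(ds.tech_sample_obs_key)  # "assay*donor", falls back to bio_sample_obs_key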
diff --git a/sfaira/data/dataloaders/base/dataset_group.py b/sfaira/data/dataloaders/base/dataset_group.py index c6495c0c3..6105476af 100644 --- a/sfaira/data/dataloaders/base/dataset_group.py +++ b/sfaira/data/dataloaders/base/dataset_group.py @@ -229,25 +229,27 @@ def streamline_metadata( def streamline_features( self, - match_to_reference: Union[str, Dict[str, str], None] = None, + match_to_release: Union[str, Dict[str, str], None] = None, remove_gene_version: bool = True, subset_genes_to_type: Union[None, str, List[str]] = None, schema: Union[str, None] = None, ): """ Subset and sort genes to genes defined in an assembly or genes of a particular type, such as protein coding. - :param match_to_reference: Which annotation to map the feature space to. Can be: - - str: Provide the name of the annotation in the format Organism.Assembly.Release - - dict: Mapping of organism to name of the annotation (see str format). Chooses annotation for each data set - based on organism annotation. - :param remove_gene_version: Whether to remove the version number after the colon sometimes found in ensembl gene ids. - :param subset_genes_to_type: Type(s) to subset to. Can be a single type or a list of types or None. Types can be: - - None: All genes in assembly. - - "protein_coding": All protein coding genes in assembly. + :param match_to_release: Which genome annotation release to map the feature space to. Note that assemblies from + ensembl are usually named as Organism.Assembly.Release; this is the Release string. Can be: + - str: Provide the name of the release. + - dict: Mapping of organism to name of the release (see str format). Chooses release for each + data set based on organism annotation. :param remove_gene_version: Whether to remove the version + number after the colon sometimes found in ensembl gene ids. + :param subset_genes_to_type: Type(s) to subset to. Can be a single type or a list of types or None. + Types can be: + - None: All genes in assembly. + - "protein_coding": All protein coding genes in assembly. """ for x in self.ids: self.datasets[x].streamline_features( - match_to_reference=match_to_reference, + match_to_release=match_to_release, remove_gene_version=remove_gene_version, subset_genes_to_type=subset_genes_to_type, schema=schema, ) @@ -415,21 +417,24 @@ def adata(self): adata_concat = adata_ls[0] adata_concat.obs[self._adata_ids.dataset] = ds_id else: - # Check that all individual adata objects in linked Dataset instances have identicall streamlined features and metadata + # Check that all individual adata objects in linked Dataset instances have identical streamlined features + # and metadata match_ref_list = [] rm_gene_ver_list = [] gene_type_list = [] for d_id in self.ids: if self.datasets[d_id].adata is not None: - assert self.datasets[d_id].mapped_features, f"Dataset {d_id} does not seem to have a streamlined " \ - f"featurespace. To obtain an adata object from this " \ - f"DatasetGroup, all contained Datasets need to have a " \ - f"streamlined featurespace. Run .streamline_features()" \ - f" first." - assert self.datasets[d_id].streamlined_meta, f"Dataset {d_id} does not seem to have streamlined " \ - f"metadata. To obtain an adata object from this " \ - f"DatasetGroup, all contained Datasets need to have " \ - f"streamlined metadata. Run .streamline_metadata() first." + assert self.datasets[d_id].mapped_features, \ f"Dataset {d_id} does not seem to have a streamlined " \ f"featurespace. To obtain an adata object from this " \ f"DatasetGroup, all contained Datasets need to have a " \ f"streamlined featurespace. Run .streamline_features()" \ f" first." + assert self.datasets[d_id].streamlined_meta, \ f"Dataset {d_id} does not seem to have streamlined " \ f"metadata. To obtain an adata object from this " \ f"DatasetGroup, all contained Datasets need to have " \ f"streamlined metadata. Run .streamline_metadata() first." match_ref_list.append(self.datasets[d_id].mapped_features) rm_gene_ver_list.append(self.datasets[d_id].remove_gene_version) gene_type_list.append(self.datasets[d_id].subset_gene_type) @@ -438,8 +443,9 @@ "'match_to_reference' of method .streamline_features())."
\ "This is however a prerequisite for creating a combined adata object." assert len(set(rm_gene_ver_list)) == 1, \ - "Not all datasets in this group have had their gene version removed (argument 'remove_gene_version' of " \ - "method .streamline_features()). This is however a prerequisite for creating a combined adata object." + "Not all datasets in this group have had their gene version removed (argument 'remove_gene_version' " \ + "of method .streamline_features()). This is however a prerequisite for creating a combined adata " \ + "object." assert len(set(gene_type_list)) == 1, \ "Not all datasets in this group had their featurespace subsetted to the same gene type (argument " \ "'subset_gene_type' of method .streamline_features()). This is however a prerequisite for creating a " \ @@ -861,16 +867,8 @@ def ids(self): ids.extend(x.ids) return ids - def get_gc( - self, - genome: str = None - ): - if genome.lower().startswith("homo_sapiens") or genome.lower().startswith("mus_musculus"): - g = GenomeContainer( - assembly=genome - ) - else: - raise ValueError(f"Genome {genome} not recognised. Needs to start with 'Mus_Musculus' or 'Homo_Sapiens'.") + def get_gc(self, genome: str = None): + g = GenomeContainer(release=genome) return g def ncells_bydataset(self, annotated_only: bool = False): @@ -930,7 +928,7 @@ def load( **kwargs ): """ - Loads data set human into anndata object. + Loads data set homosapiens into anndata object. :param annotated_only: :param load_raw: See .load(). @@ -950,25 +948,28 @@ def load( def streamline_features( self, - match_to_reference: Union[str, Dict[str, str], None] = None, + match_to_release: Union[str, Dict[str, str], None] = None, remove_gene_version: bool = True, subset_genes_to_type: Union[None, str, List[str]] = None, schema: Union[str, None] = None, ): """ Subset and sort genes to genes defined in an assembly or genes of a particular type, such as protein coding. - :param match_to_reference: Which annotation to map the feature space to. Can be: - - str: Provide the name of the annotation in the format Organism.Assembly.Release - - dict: Mapping of organism to name of the annotation (see str format). Chooses annotation for each data set - based on organism annotation. - :param remove_gene_version: Whether to remove the version number after the colon sometimes found in ensembl gene ids. - :param subset_genes_to_type: Type(s) to subset to. Can be a single type or a list of types or None. Types can be: - - None: All genes in assembly. - - "protein_coding": All protein coding genes in assembly. + + :param match_to_release: Which genome annotation release to map the feature space to. Note that assemblies from + ensembl are usually named as Organism.Assembly.Release, this is the Release string. Can be: + - str: Provide the name of the release. + - dict: Mapping of organism to name of the release (see str format). Chooses release for each + data set based on organism annotation.:param remove_gene_version: Whether to remove the version + number after the colon sometimes found in ensembl gene ids. + :param subset_genes_to_type: Type(s) to subset to. Can be a single type or a list of types or None. + Types can be: + - None: All genes in assembly. + - "protein_coding": All protein coding genes in assembly. 
""" for x in self.dataset_groups: x.streamline_features( - match_to_reference=match_to_reference, + match_to_release=match_to_release, remove_gene_version=remove_gene_version, subset_genes_to_type=subset_genes_to_type, schema=schema, @@ -1100,129 +1101,6 @@ def write_distributed_store( x.write_distributed_store(dir_cache=dir_cache, store_format=store_format, dense=dense, compression_kwargs=compression_kwargs, chunks=chunks) - def write_backed( - self, - fn_backed: PathLike, - genome: str, - shuffled: bool = False, - as_dense: bool = False, - annotated_only: bool = False, - load_raw: bool = False, - allow_caching: bool = True, - ): - """ - Loads data set human into backed anndata object. - - TODO replace streamlining in here by required call to .streamline() before. - - Example usage: - - ds = DatasetSuperGroup([...]) - ds.write_backed( - fn_backed="...", - target_genome="...", - annotated_only=False - ) - adata_backed = anndata.read(ds.fn_backed, backed='r') - adata_slice = ad_full[idx] - - :param fn_backed: File name to save backed anndata to temporarily. - :param genome: ID of target genomes. - :param shuffled: Whether to shuffle data when writing to backed. - :param as_dense: Whether to load into dense count matrix. - :param annotated_only: - :param load_raw: See .load(). - :param allow_caching: See .load(). - """ - if shuffled and not as_dense: - raise ValueError("cannot write backed shuffled and sparse") - scatter_update = shuffled or as_dense - self.fn_backed = fn_backed - n_cells = self.ncells(annotated_only=annotated_only) - gc = self.get_gc(genome=genome) - n_genes = gc.n_var - if scatter_update: - self.adata = anndata.AnnData( - scipy.sparse.csr_matrix((n_cells, n_genes), dtype=np.float32) - ) # creates an empty anndata object with correct dimensions that can be filled with cells from data sets - else: - self.adata = anndata.AnnData( - scipy.sparse.csr_matrix((0, n_genes), dtype=np.float32) - ) - self.adata.filename = fn_backed # setting this attribute switches this anndata to a backed object - # Note that setting .filename automatically redefines .X as dense, so we have to redefine it as sparse: - if not as_dense: - X = scipy.sparse.csr_matrix(self.adata.X) # redefines this backed anndata as sparse - X.indices = X.indices.astype(np.int64) - X.indptr = X.indptr.astype(np.int64) - self.adata.X = X - keys = [ - self._adata_ids.annotated, - self._adata_ids.assay_sc, - self._adata_ids.assay_differentiation, - self._adata_ids.assay_type_differentiation, - self._adata_ids.author, - self._adata_ids.cell_line, - self._adata_ids.dataset, - self._adata_ids.cell_type, - self._adata_ids.development_stage, - self._adata_ids.normalization, - self._adata_ids.organ, - self._adata_ids.bio_sample, - self._adata_ids.state_exact, - self._adata_ids.year, - ] - if scatter_update: - self.adata.obs = pandas.DataFrame({ - k: ["nan" for _ in range(n_cells)] for k in keys - }) - else: - for k in keys: - self.adata.obs[k] = [] - # Define index vectors to write to: - idx_vector = np.arange(0, n_cells) - if shuffled: - np.random.shuffle(idx_vector) - idx_ls = [] - row = 0 - ncells = self.ncells_bydataset(annotated_only=annotated_only) - if np.all([len(x) == 0 for x in ncells]): - raise ValueError("no datasets found") - for x in ncells: - temp_ls = [] - for y in x: - temp_ls.append(idx_vector[row:(row + y)]) - row += y - idx_ls.append(temp_ls) - print("checking expected and received data set sizes, rerun meta data generation if mismatch is found:") - print(self.ncells_bydataset(annotated_only=annotated_only)) 
- print([[len(x) for x in xx] for xx in idx_ls]) - for i, x in enumerate(self.dataset_groups): - x.write_backed( - adata_backed=self.adata, - genome=genome, - idx=idx_ls[i], - annotated_only=annotated_only, - load_raw=load_raw, - allow_caching=allow_caching, - ) - # If the sparse non-shuffled approach is used, make sure that self.adata.obs.index is unique() before saving - if not scatter_update: - self.adata.obs.index = pd.RangeIndex(0, len(self.adata.obs.index)) - # Explicitly write backed file to disk again to make sure that obs are included and that n_obs is set correctly - self.adata.write() - # Saving obs separately below is therefore no longer required (hence commented out) - # fn_backed_obs = ".".join(self.fn_backed.split(".")[:-1]) + "_obs.csv" - # self.adata.obs.to_csv(fn_backed_obs) - - def delete_backed(self): - del self.adata - self.adata = None - os.remove(str(self.fn_backed)) - - def load_cached_backed(self, fn: PathLike): - self.adata = anndata.read(fn, backed='r') def streamline_metadata( self, schema: str = "sfaira", diff --git a/sfaira/data/dataloaders/databases/cellxgene/cellxgene_loader.py b/sfaira/data/dataloaders/databases/cellxgene/cellxgene_loader.py index f7df5acbc..9c024c04f 100644 --- a/sfaira/data/dataloaders/databases/cellxgene/cellxgene_loader.py +++ b/sfaira/data/dataloaders/databases/cellxgene/cellxgene_loader.py @@ -8,9 +8,9 @@ from typing import List, Union import uuid +from sfaira import settings from sfaira.data.dataloaders.base import DatasetBase from sfaira.consts import AdataIdsCellxgene, AdataIdsCellxgene_v2_0_0 -from sfaira.consts.directories import CACHE_DIR_DATABASES_CELLXGENE from sfaira.data.dataloaders.databases.cellxgene.rest_helpers import get_collection, get_data from sfaira.data.dataloaders.databases.cellxgene.rest_helpers import CELLXGENE_PRODUCTION_ENDPOINT, DOWNLOAD_DATASET @@ -25,19 +25,9 @@ def clean_cellxgene_meta_obs(k, val, adata_ids) -> Union[str, List[str]]: :param val: Found meta data entry. :returns: Cleaned meta data entry. """ - if k == "disease": - # TODO normal state label varies in disease annotation. This can be removed once streamlined. - val = ["healthy" if (v.lower() == "normal" or v.lower() == "healthy") else v for v in val] - elif k == "organ": + if k == adata_ids.organ: # Organ labels contain labels on tissue type also, such as 'UBERON:0001911 (cell culture)'. val = [v.split(" ")[0] for v in val] - elif k == "organism": - # TODO deprecate map once same organism naming is used.
- organism_map = { - "Homo sapiens": "human", - "Mus musculus": "mouse", - } - val = [organism_map[v] if v in organism_map.keys() else v for v in val] return val @@ -49,27 +39,11 @@ def clean_cellxgene_meta_uns(k, val, adata_ids) -> Union[str, List[str]]: """ x_clean = [] for v in val: - if k == "sex": - v = v[0] - else: - # Decide if labels are read from name or ontology ID: - if k == "disease" and (v["label"].lower() == "normal" or v["label"].lower() == "healthy"): - # TODO normal state label varies in disease annotation. This can be removed once streamlined. - v = "healthy" - elif k in ["assay_sc", "disease", "organ"] and \ - v["ontology_term_id"] != adata_ids.unknown_metadata_identifier: - v = v["ontology_term_id"] - else: - v = v["label"] - # Organ labels contain labels on tissue type also, such as 'UBERON:0001911 (cell culture)'. - if k == "organ": - v = v.split(" ")[0] - if k == "organism": - # TODO deprecate map once same organism naming is used. - organism_map = { - "Homo sapiens": "human", - "Mus musculus": "mouse"} - v = organism_map[v] if v in organism_map.keys() else v + # Decide if labels are read from name or ontology ID: + v = v[adata_ids.onto_id_suffix[1:]] + # Organ labels contain labels on tissue type also, such as 'UBERON:0001911 (cell culture)'. + if k == adata_ids.organ: + v = v.split(" ")[0] if v != adata_ids.unknown_metadata_identifier and v != adata_ids.invalid_metadata_identifier: x_clean.append(v) return x_clean @@ -132,27 +106,19 @@ def __init__( val = self._collection_dataset[getattr(self._adata_ids_cellxgene, k)] # Unique label if list is length 1: # Otherwise do not set property and resort to cell-wise labels. - if isinstance(val, dict) or k == "sex": - val = [val] v_clean = clean_cellxgene_meta_uns(k=k, val=val, adata_ids=self._adata_ids_cellxgene) + # Set as single element or list if multiple entries are given. + if len(v_clean) == 1: + v_clean = v_clean[0] try: - # Set as single element or list if multiple entries are given. - if len(v_clean) == 1: - setattr(self, k, v_clean[0]) - else: - setattr(self, k, v_clean) + setattr(self, k, v_clean) except ValueError as e: if verbose > 0: print(f"WARNING: {e} in {self.collection_id} and data set {self.id}") - if self.organism == "human": - self._adata_ids_cellxgene = AdataIdsCellxgene_v2_0_0() - elif self.organism == "mouse": - self._adata_ids_cellxgene = AdataIdsCellxgene_v2_0_0() - else: - assert False, self.organism + self._adata_ids_cellxgene = AdataIdsCellxgene_v2_0_0() # Add author information. # TODO need to change this to contributor? - setattr(self, "author", "cellxgene") + self.author = "cellxgene" # The h5ad objects from cellxgene follow a particular structure and the following attributes are guaranteed to # be in place. Note that these point at the anndata instance and will only be available for evaluation after # download. See below for attributes that are lazily available @@ -161,7 +127,7 @@ def __init__( self.disease_obs_key = self._adata_ids_cellxgene.disease self.ethnicity_obs_key = self._adata_ids_cellxgene.ethnicity self.sex_obs_key = self._adata_ids_cellxgene.sex - self.organ_obs_key = self._adata_ids_cellxgene.organism + self.organ_obs_key = self._adata_ids_cellxgene.organ self.state_exact_obs_key = self._adata_ids_cellxgene.state_exact self.gene_id_symbols_var_key = self._adata_ids_cellxgene.feature_symbol @@ -171,11 +137,9 @@ @property def _collection_cache_dir(self): """ - The cache dir is in a cache directory in the sfaira installation that is excempt from git versioning. + The cache dir defaults to a cache directory in the user's home directory and can be modified by the user. """ - cache_dir_path = pathlib.Path(CACHE_DIR_DATABASES_CELLXGENE) - cache_dir_path.mkdir(parents=True, exist_ok=True) - return CACHE_DIR_DATABASES_CELLXGENE + return settings.cachedir_databases_cellxgene @property def _collection_cache_fn(self):
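# Sketch of the simplified uns cleaning above: entries are now read from the ontology-ID field,
# assuming adata_ids.onto_id_suffix == "_ontology_term_id" (inferred from the deleted branch):
val = [{"label": "some organ", "ontology_term_id": "UBERON:0001911 (cell culture)"}]
clean_cellxgene_meta_uns(k=adata_ids.organ, val=val, adata_ids=adata_ids)  # -> ["UBERON:0001911"]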
diff --git a/sfaira/data/dataloaders/export_adaptors/cellxgene.py b/sfaira/data/dataloaders/export_adaptors/cellxgene.py index eeb58862a..268da4293 100644 --- a/sfaira/data/dataloaders/export_adaptors/cellxgene.py +++ b/sfaira/data/dataloaders/export_adaptors/cellxgene.py @@ -52,10 +52,10 @@ def cellxgene_export_adaptor_1_1_0(adata: anndata.AnnData, adata_ids: AdataIdsCe # TODO port this into organism ontology handling. # Infer organism from adata object. organism = np.unique(adata.obs[adata_ids.organism].values)[0] - if organism == "mouse": + if organism == "musmusculus": adata.uns["organism"] = "Mus musculus" adata.uns["organism_ontology_term_id"] = "NCBITaxon:10090" - elif organism == "human": + elif organism == "homosapiens": adata.uns["organism"] = "Homo sapiens" adata.uns["organism_ontology_term_id"] = "NCBITaxon:9606" else: @@ -105,14 +105,6 @@ } if obs_keys_batch is not None: adata.uns["batch_condition"] = obs_keys_batch.split("*") - # TODO port this into organism ontology handling. - # Infer organism from adata object. - organism_map = {"mouse": "Mus musculus", "human": "Homo sapiens"} - organism_id_map = {"mouse": "NCBITaxon:10090", "human": "NCBITaxon:9606"} - organism_id = organism_id_map[np.unique(adata.obs[adata_ids.organism].values)[0]] - adata.obs[adata_ids.organism] = [organism_map[x] for x in adata.obs[adata_ids.organism].values] - adata.obs[adata_ids.organism + adata_ids.onto_id_suffix] = \ - [organism_id_map[x] for x in adata.obs[adata_ids.organism + adata_ids.onto_id_suffix].values] # 2) Modify .obs # a) Correct unknown cell type entries: adata.obs[adata_ids.cell_type] = [ @@ -147,7 +139,7 @@ adata.X.data = np.rint(adata.X.data) # 4) Modify .var: adata.var[adata_ids.feature_biotype] = "gene" - adata.var[adata_ids.feature_reference] = organism_id + adata.var[adata_ids.feature_reference] = adata.obs[adata_ids.organism + adata_ids.onto_id_suffix].values[0] adata.var[adata_ids.feature_is_filtered] = False adata.var[adata_ids.feature_biotype] = pd.Categorical(adata.var[adata_ids.feature_biotype].values.tolist()) # Modify ensembl ID writing:
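# A quick sanity check after running the 1.1.0 adaptor above on a mouse data set,
# per the uns assignments it makes (the `adata` instance is hypothetical):
assert adata.uns["organism"] == "Mus musculus"
assert adata.uns["organism_ontology_term_id"] == "NCBITaxon:10090"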
"musmusculus-3T3_dge.txt.gz", # TODO: sort out meta data for these "Muscle_dge.txt.gz", "NeonatalCalvaria1_dge.txt.gz", "NeonatalCalvaria2_dge.txt.gz", @@ -134,7 +134,7 @@ def __init__(self, **kwargs): "FetalMaleGonad_dge.txt.gz": "testis", "FetalPancreas_dge.txt.gz": "pancreas", "FetalStomach_dge.txt.gz": "stomach", - "human-293T_dge.txt.gz": None, + "homosapiens-293T_dge.txt.gz": None, "Kidney1_dge.txt.gz": "kidney", "Kidney2_dge.txt.gz": "kidney", "Liver1_dge.txt.gz": "liver", @@ -158,7 +158,7 @@ def __init__(self, **kwargs): "mES.CJ7_dge.txt.gz": "blastocyst", "MesenchymalStemCells_dge.txt.gz": "mesenchyme", "MesenchymalStemCellsPrimary_dge.txt.gz": "mesenchyme", - "mouse-3T3_dge.txt.gz": None, + "musmusculus-3T3_dge.txt.gz": None, "Muscle_dge.txt.gz": "skeletal muscle organ", "NeonatalCalvaria1_dge.txt.gz": "vault of skull", "NeonatalCalvaria2_dge.txt.gz": "vault of skull", @@ -227,7 +227,7 @@ def __init__(self, **kwargs): "FetalMaleGonad_dge.txt.gz": "fetal", "FetalPancreas_dge.txt.gz": "fetal", "FetalStomach_dge.txt.gz": "fetal", - "human-293T_dge.txt.gz": None, + "homosapiens-293T_dge.txt.gz": None, "Kidney1_dge.txt.gz": "adult", "Kidney2_dge.txt.gz": "adult", "Liver1_dge.txt.gz": "adult", @@ -251,7 +251,7 @@ def __init__(self, **kwargs): "mES.CJ7_dge.txt.gz": "embryonic", "MesenchymalStemCells_dge.txt.gz": "embryonic", "MesenchymalStemCellsPrimary_dge.txt.gz": "embryonic", - "mouse-3T3_dge.txt.gz": None, + "musmusculus-3T3_dge.txt.gz": None, "Muscle_dge.txt.gz": "adult", "NeonatalCalvaria1_dge.txt.gz": "neonatal", "NeonatalCalvaria2_dge.txt.gz": "neonatal", @@ -304,7 +304,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1016/j.cell.2018.02.001" self.normalization = "raw" self.primary_data = True - self.organism = "mouse" + self.organism = "Mus musculus" self.assay_sc = "microwell-seq" self.year = 2018 self.sample_source = "primary_tissue" diff --git a/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2018_08_067/human_laminapropriaofmucosaofcolon_2019_10xsequencing_kinchen_001.py b/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2018_08_067/human_laminapropriaofmucosaofcolon_2019_10xsequencing_kinchen_001.py index 2efb01ddc..64afe3aa8 100644 --- a/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2018_08_067/human_laminapropriaofmucosaofcolon_2019_10xsequencing_kinchen_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2018_08_067/human_laminapropriaofmucosaofcolon_2019_10xsequencing_kinchen_001.py @@ -17,6 +17,6 @@ def load(data_dir, sample_fn, **kwargs): adata.obs = obs s_dict = {"F": "female", "M": "male"} adata.obs['Sex'] = [s_dict[i] for i in adata.obs['Sex']] - adata.obs['Age'] = [str(x) + "-year-old human stage" for x in adata.obs['Age'].values] + adata.obs['Age'] = [str(x) + "-year-old homosapiens stage" for x in adata.obs['Age'].values] return adata diff --git a/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2018_08_067/human_laminapropriaofmucosaofcolon_2019_10xsequencing_kinchen_001.yaml b/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2018_08_067/human_laminapropriaofmucosaofcolon_2019_10xsequencing_kinchen_001.yaml index c7aa0818e..7b571d97a 100644 --- a/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2018_08_067/human_laminapropriaofmucosaofcolon_2019_10xsequencing_kinchen_001.yaml +++ b/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2018_08_067/human_laminapropriaofmucosaofcolon_2019_10xsequencing_kinchen_001.yaml @@ -39,7 +39,7 @@ dataset_or_observation_wise: individual_obs_key: organ: "lamina propria of mucosa of colon" organ_obs_key: - organism: 
"human" + organism: "Homo sapiens" organism_obs_key: sample_source: "primary_tissue" sample_source_obs_key: diff --git a/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2019_06_029/human_colonicepithelium_2019_10xsequencing_smilie_001.py b/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2019_06_029/human_colonicepithelium_2019_10xsequencing_smilie_001.py index 737cd2f10..092f17579 100644 --- a/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2019_06_029/human_colonicepithelium_2019_10xsequencing_smilie_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2019_06_029/human_colonicepithelium_2019_10xsequencing_smilie_001.py @@ -20,7 +20,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1016/j.cell.2019.06.029" self.normalization = "raw" self.organ = "colonic epithelium" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2019 diff --git a/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2019_08_008/human_ileum_2019_10xsequencing_martin_001.py b/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2019_08_008/human_ileum_2019_10xsequencing_martin_001.py index c9c247543..4a09137cd 100644 --- a/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2019_08_008/human_ileum_2019_10xsequencing_martin_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2019_08_008/human_ileum_2019_10xsequencing_martin_001.py @@ -19,7 +19,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1016/j.cell.2019.08.008" self.normalization = "raw" self.organ = "ileum" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2019 diff --git a/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2021_01_053/human_na_2021_na_ren_001.py b/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2021_01_053/human_na_2021_na_ren_001.py index e5a030203..bb7ed448f 100644 --- a/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2021_01_053/human_na_2021_na_ren_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2021_01_053/human_na_2021_na_ren_001.py @@ -29,13 +29,14 @@ "F": "female", "M": "male", "unknown": "unknown", + "nan": "unknown", } def load(data_dir, sample_fn, **kwargs): fn = os.path.join(data_dir, sample_fn) adata = anndata.read(fn) - adata.X = scipy.sparse.csc_matrix(adata.X) + adata.X = scipy.sparse.csr_matrix(adata.X) adata.obs["assay_sc"] = adata.obs["Single cell sequencing platform"].map(assay_sc_map) adata.obs["disease"] = adata.obs["SARS-CoV-2"].map(disease_map) diff --git a/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2021_01_053/human_na_2021_na_ren_001.yaml b/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2021_01_053/human_na_2021_na_ren_001.yaml index 75a58aad4..5af68b8c7 100644 --- a/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2021_01_053/human_na_2021_na_ren_001.yaml +++ b/sfaira/data/dataloaders/loaders/d10_1016_j_cell_2021_01_053/human_na_2021_na_ren_001.yaml @@ -8,7 +8,7 @@ dataset_wise: doi_preprint: doi_journal: "10.1016/j.cell.2021.01.053" download_url_data: "https://drive.google.com/file/d/1IwWcn4W-YKgNbz4DpNweM2cKxlx1hbM0/view" - download_url_meta: "None" + download_url_meta: primary_data: True normalization: "raw" year: 2021 @@ -33,7 +33,7 @@ dataset_or_observation_wise: individual_obs_key: "PatientID" organ: organ_obs_key: "organ" - organism: "human" + organism: "Homo sapiens" organism_obs_key: sample_source: "primary_tissue" sample_source_obs_key: diff --git 
a/sfaira/data/dataloaders/loaders/d10_1016_j_celrep_2018_11_086/human_prostategland_2018_10xsequencing_henry_001.py b/sfaira/data/dataloaders/loaders/d10_1016_j_celrep_2018_11_086/human_prostategland_2018_10xsequencing_henry_001.py index 2315661c1..66a93b1c1 100644 --- a/sfaira/data/dataloaders/loaders/d10_1016_j_celrep_2018_11_086/human_prostategland_2018_10xsequencing_henry_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1016_j_celrep_2018_11_086/human_prostategland_2018_10xsequencing_henry_001.py @@ -26,7 +26,7 @@ def __init__(self, **kwargs): self.sample_source = "primary_tissue" self.state_exact = "healthy" self.organ = "prostate gland" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.year = 2018 diff --git a/sfaira/data/dataloaders/loaders/d10_1016_j_cels_2016_08_011/human_pancreas_2016_indrop_baron_001.py b/sfaira/data/dataloaders/loaders/d10_1016_j_cels_2016_08_011/human_pancreas_2016_indrop_baron_001.py index e12e51abc..55ec8fe3f 100644 --- a/sfaira/data/dataloaders/loaders/d10_1016_j_cels_2016_08_011/human_pancreas_2016_indrop_baron_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1016_j_cels_2016_08_011/human_pancreas_2016_indrop_baron_001.py @@ -19,7 +19,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1016/j.cels.2016.08.011" self.normalization = "raw" self.organ = "pancreas" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.state_exact = "healthy" diff --git a/sfaira/data/dataloaders/loaders/d10_1016_j_cmet_2016_08_020/human_pancreas_2016_smartseq2_segerstolpe_001.py b/sfaira/data/dataloaders/loaders/d10_1016_j_cmet_2016_08_020/human_pancreas_2016_smartseq2_segerstolpe_001.py index d480b0617..bc7f3cf0f 100644 --- a/sfaira/data/dataloaders/loaders/d10_1016_j_cmet_2016_08_020/human_pancreas_2016_smartseq2_segerstolpe_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1016_j_cmet_2016_08_020/human_pancreas_2016_smartseq2_segerstolpe_001.py @@ -20,7 +20,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1016/j.cmet.2016.08.020" self.normalization = "raw" self.organ = "pancreas" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2016 diff --git a/sfaira/data/dataloaders/loaders/d10_1016_j_cmet_2019_01_021/mouse_pancreas_2019_10xsequencing_thompson_x.py b/sfaira/data/dataloaders/loaders/d10_1016_j_cmet_2019_01_021/mouse_pancreas_2019_10xsequencing_thompson_x.py index ed91f5bc3..9174de4b6 100644 --- a/sfaira/data/dataloaders/loaders/d10_1016_j_cmet_2019_01_021/mouse_pancreas_2019_10xsequencing_thompson_x.py +++ b/sfaira/data/dataloaders/loaders/d10_1016_j_cmet_2019_01_021/mouse_pancreas_2019_10xsequencing_thompson_x.py @@ -33,7 +33,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1016/j.cmet.2019.01.021" self.normalization = "raw" self.organ = "pancreas" - self.organism = "mouse" + self.organism = "Mus musculus" self.primary_data = True self.sample_source = "primary_tissue" self.state_exact = "diabetic" diff --git a/sfaira/data/dataloaders/loaders/d10_1016_j_devcel_2020_01_033/human_lung_2020_10xsequencing_miller_001.py b/sfaira/data/dataloaders/loaders/d10_1016_j_devcel_2020_01_033/human_lung_2020_10xsequencing_miller_001.py index c9d4d2928..7283437e8 100644 --- a/sfaira/data/dataloaders/loaders/d10_1016_j_devcel_2020_01_033/human_lung_2020_10xsequencing_miller_001.py +++ 
b/sfaira/data/dataloaders/loaders/d10_1016_j_devcel_2020_01_033/human_lung_2020_10xsequencing_miller_001.py @@ -19,7 +19,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1016/j.devcel.2020.01.033" self.normalization = "raw" self.organ = "lung" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.state_exact = "healthy" diff --git a/sfaira/data/dataloaders/loaders/d10_1016_j_neuron_2019_06_011/human_brain_2019_dropseq_polioudakis_001.py b/sfaira/data/dataloaders/loaders/d10_1016_j_neuron_2019_06_011/human_brain_2019_dropseq_polioudakis_001.py index e2d8b8435..8d5d4cffc 100644 --- a/sfaira/data/dataloaders/loaders/d10_1016_j_neuron_2019_06_011/human_brain_2019_dropseq_polioudakis_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1016_j_neuron_2019_06_011/human_brain_2019_dropseq_polioudakis_001.py @@ -6,8 +6,8 @@ def load(data_dir, **kwargs): age_dict = { - 17: "17th week post-fertilization human stage", - 18: "18th week post-fertilization human stage", + 17: "17th week post-fertilization homosapiens stage", + 18: "18th week post-fertilization homosapiens stage", } ct_dict = { "End": "Endothelial", diff --git a/sfaira/data/dataloaders/loaders/d10_1016_j_neuron_2019_06_011/human_brain_2019_dropseq_polioudakis_001.yaml b/sfaira/data/dataloaders/loaders/d10_1016_j_neuron_2019_06_011/human_brain_2019_dropseq_polioudakis_001.yaml index a190b694e..f89776f52 100644 --- a/sfaira/data/dataloaders/loaders/d10_1016_j_neuron_2019_06_011/human_brain_2019_dropseq_polioudakis_001.yaml +++ b/sfaira/data/dataloaders/loaders/d10_1016_j_neuron_2019_06_011/human_brain_2019_dropseq_polioudakis_001.yaml @@ -33,7 +33,7 @@ dataset_or_observation_wise: individual_obs_key: "Donor" organ: "brain" organ_obs_key: - organism: "human" + organism: "Homo sapiens" organism_obs_key: sample_source: "primary_tissue" sample_source_obs_key: diff --git a/sfaira/data/dataloaders/loaders/d10_1038_nmeth_4407/human_brain_2017_droncseq_habib_001.py b/sfaira/data/dataloaders/loaders/d10_1038_nmeth_4407/human_brain_2017_droncseq_habib_001.py index 2f995a6b6..8b88fc951 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_nmeth_4407/human_brain_2017_droncseq_habib_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_nmeth_4407/human_brain_2017_droncseq_habib_001.py @@ -19,7 +19,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1038/nmeth.4407" self.normalization = "raw" self.organ = "brain" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2017 diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41422_018_0099_2/human_testis_2018_10xsequencing_guo_001.py b/sfaira/data/dataloaders/loaders/d10_1038_s41422_018_0099_2/human_testis_2018_10xsequencing_guo_001.py index 418ea97e1..0f37aa1dd 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41422_018_0099_2/human_testis_2018_10xsequencing_guo_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41422_018_0099_2/human_testis_2018_10xsequencing_guo_001.py @@ -19,7 +19,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1038/s41422-018-0099-2" self.normalization = "raw" self.organ = "testis" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2018 diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41467_018_06318_7/human_caudatelobeofliver_2018_10xsequencing_macparland_001.py 
b/sfaira/data/dataloaders/loaders/d10_1038_s41467_018_06318_7/human_caudatelobeofliver_2018_10xsequencing_macparland_001.py index 0f77d2a86..28de5c360 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41467_018_06318_7/human_caudatelobeofliver_2018_10xsequencing_macparland_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41467_018_06318_7/human_caudatelobeofliver_2018_10xsequencing_macparland_001.py @@ -18,7 +18,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1038/s41467-018-06318-7" self.normalization = "raw" self.organ = "caudate lobe of liver" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2018 diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41467_019_10861_2/human_kidney_2019_droncseq_lake_001.py b/sfaira/data/dataloaders/loaders/d10_1038_s41467_019_10861_2/human_kidney_2019_droncseq_lake_001.py index 1d62b8bbe..8a7053d2c 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41467_019_10861_2/human_kidney_2019_droncseq_lake_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41467_019_10861_2/human_kidney_2019_droncseq_lake_001.py @@ -20,7 +20,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1038/s41467-019-10861-2" self.normalization = "raw" self.organ = "kidney" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2019 diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41467_019_12464_3/human_x_2019_10xsequencing_szabo_001.py b/sfaira/data/dataloaders/loaders/d10_1038_s41467_019_12464_3/human_x_2019_10xsequencing_szabo_001.py index 31c55d92f..0db9b6ee7 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41467_019_12464_3/human_x_2019_10xsequencing_szabo_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41467_019_12464_3/human_x_2019_10xsequencing_szabo_001.py @@ -64,7 +64,7 @@ def __init__(self, **kwargs): self.individual = SAMPLE_DICT[self.sample_fn][1] self.normalization = "raw" self.organ = SAMPLE_DICT[self.sample_fn][0] - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.state_exact = SAMPLE_DICT[self.sample_fn][2] diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41467_019_12780_8/human_retina_2019_10xsequencing_menon_001.py b/sfaira/data/dataloaders/loaders/d10_1038_s41467_019_12780_8/human_retina_2019_10xsequencing_menon_001.py index 43742a38d..dde2afefb 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41467_019_12780_8/human_retina_2019_10xsequencing_menon_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41467_019_12780_8/human_retina_2019_10xsequencing_menon_001.py @@ -17,7 +17,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1038/s41467-019-12780-8" self.normalization = "raw" self.organ = "retina" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2019 diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41586_018_0698_6/human_placenta_2018_x_ventotormo_001.py b/sfaira/data/dataloaders/loaders/d10_1038_s41586_018_0698_6/human_placenta_2018_x_ventotormo_001.py index 50926a4b1..142401306 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41586_018_0698_6/human_placenta_2018_x_ventotormo_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41586_018_0698_6/human_placenta_2018_x_ventotormo_001.py @@ -25,7 +25,7 @@ def __init__(self, **kwargs): self.doi_journal = 
"10.1038/s41586-018-0698-6" self.normalization = "raw" self.organ = "placenta" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2018 diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1373_2/human_liver_2019_celseq2_aizarani_001.py b/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1373_2/human_liver_2019_celseq2_aizarani_001.py index 09c21c468..ae724d69a 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1373_2/human_liver_2019_celseq2_aizarani_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1373_2/human_liver_2019_celseq2_aizarani_001.py @@ -19,7 +19,7 @@ def __init__(self, **kwargs): self.normalization = "raw" self.sample_source = "primary_tissue" self.organ = "liver" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.year = 2019 diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1631_3/human_liver_2019_10xsequencing_ramachandran_001.py b/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1631_3/human_liver_2019_10xsequencing_ramachandran_001.py index 73d332a05..c847de6f8 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1631_3/human_liver_2019_10xsequencing_ramachandran_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1631_3/human_liver_2019_10xsequencing_ramachandran_001.py @@ -19,7 +19,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1038/s41586-019-1631-3" self.normalization = "raw" self.organ = "liver" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2019 diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1652_y/human_liver_2019_10xsequencing_popescu_001.py b/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1652_y/human_liver_2019_10xsequencing_popescu_001.py index a03bd0525..7cacaae70 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1652_y/human_liver_2019_10xsequencing_popescu_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1652_y/human_liver_2019_10xsequencing_popescu_001.py @@ -17,7 +17,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1038/s41586-019-1652-y" self.normalization = "raw" self.organ = "liver" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.state_exact = "healthy" diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1654_9/human_brain_2019_10x3v2sequencing_kanton_001.yaml b/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1654_9/human_brain_2019_10x3v2sequencing_kanton_001.yaml index 516190e4d..c3837e408 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1654_9/human_brain_2019_10x3v2sequencing_kanton_001.yaml +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41586_019_1654_9/human_brain_2019_10x3v2sequencing_kanton_001.yaml @@ -35,7 +35,7 @@ dataset_or_observation_wise: individual_obs_key: organ: "brain" organ_obs_key: - organism: "human" + organism: "Homo sapiens" organism_obs_key: sample_source: "3d_culture" sample_source_obs_key: diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41586_020_2157_4/human_x_2020_microwellseq_han_x.py b/sfaira/data/dataloaders/loaders/d10_1038_s41586_020_2157_4/human_x_2020_microwellseq_han_x.py index a1c18c208..5752f504a 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41586_020_2157_4/human_x_2020_microwellseq_han_x.py +++ 
b/sfaira/data/dataloaders/loaders/d10_1038_s41586_020_2157_4/human_x_2020_microwellseq_han_x.py @@ -22,7 +22,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1038/s41586-020-2157-4" self.healthy = True self.normalization = "raw" - self.organism = "human" + self.organism = "Homo sapiens" self.assay_sc = "microwell-seq" self.state_exact = "healthy" self.year = 2020 @@ -150,66 +150,66 @@ def load(data_dir, **kwargs): 'Placenta_1': 'placenta', } sample_dev_stage_dict = { - 'AdultAdipose_1': 'human adult stage', - 'AdultAdrenalGland_2': 'human adult stage', - 'AdultAdrenalGland_3': 'human adult stage', - 'AdultArtery_1': 'human adult stage', - 'AdultAscendingColon_1': 'human adult stage', - 'AdultBladder_1': 'human adult stage', - 'AdultBladder_2': 'human adult stage', - 'AdultCerebellum_1': 'human adult stage', - 'AdultCervix_1': 'human adult stage', - 'AdultColon_1': 'human adult stage', - 'AdultDuodenum_1': 'human adult stage', - 'AdultEpityphlon_1': 'human adult stage', - 'AdultEsophagus_1': 'human adult stage', - 'AdultEsophagus_2': 'human adult stage', - 'AdultFallopiantube_1': 'human adult stage', - 'AdultGallbladder_1': 'human adult stage', - 'AdultGallbladder_2': 'human adult stage', - 'AdultHeart_1': 'human adult stage', - 'AdultHeart_2': 'human adult stage', - 'AdultIleum_2': 'human adult stage', - 'AdultJejunum_2': 'human adult stage', - 'AdultKidney_2': 'human adult stage', - 'AdultKidney_3': 'human adult stage', - 'AdultKidney_4': 'human adult stage', - 'AdultLiver_1': 'human adult stage', - 'AdultLiver_2': 'human adult stage', - 'AdultLiver_4': 'human adult stage', - 'AdultLung_1': 'human adult stage', - 'AdultLung_2': 'human adult stage', - 'AdultLung_3': 'human adult stage', - 'AdultMuscle_1': 'human adult stage', - 'AdultOmentum_1': 'human adult stage', - 'AdultOmentum_2': 'human adult stage', - 'AdultOmentum_3': 'human adult stage', - 'AdultPancreas_1': 'human adult stage', - 'AdultPeripheralBlood_3': 'human adult stage', - 'AdultPeripheralBlood_4': 'human adult stage', - 'AdultPleura_1': 'human adult stage', - 'AdultProstate_1': 'human adult stage', - 'AdultRectum_1': 'human adult stage', - 'AdultSigmoidColon_1': 'human adult stage', - 'AdultSpleenParenchyma_1': 'human adult stage', - 'AdultSpleen_1': 'human adult stage', - 'AdultStomach_1': 'human adult stage', - 'AdultStomach_2': 'human adult stage', - 'AdultStomach_3': 'human adult stage', - 'AdultTemporalLobe_1': 'human adult stage', - 'AdultThyroid_1': 'human adult stage', - 'AdultThyroid_2': 'human adult stage', - 'AdultTrachea_2': 'human adult stage', - 'AdultTransverseColon_2': 'human adult stage', - 'AdultUreter_1': 'human adult stage', - 'AdultUterus_1': 'human adult stage', - 'BoneMarrow_1': 'human adult stage', - 'BoneMarrow_2': 'human adult stage', - 'ChorionicVillus_1': 'human adult stage', - 'CordBloodCD34P_1': 'human adult stage', - 'CordBloodCD34P_2': 'human adult stage', - 'CordBlood_1': 'human adult stage', - 'CordBlood_2': 'human adult stage', + 'AdultAdipose_1': 'homosapiens adult stage', + 'AdultAdrenalGland_2': 'homosapiens adult stage', + 'AdultAdrenalGland_3': 'homosapiens adult stage', + 'AdultArtery_1': 'homosapiens adult stage', + 'AdultAscendingColon_1': 'homosapiens adult stage', + 'AdultBladder_1': 'homosapiens adult stage', + 'AdultBladder_2': 'homosapiens adult stage', + 'AdultCerebellum_1': 'homosapiens adult stage', + 'AdultCervix_1': 'homosapiens adult stage', + 'AdultColon_1': 'homosapiens adult stage', + 'AdultDuodenum_1': 'homosapiens adult stage', + 'AdultEpityphlon_1': 
'homosapiens adult stage', + 'AdultEsophagus_1': 'homosapiens adult stage', + 'AdultEsophagus_2': 'homosapiens adult stage', + 'AdultFallopiantube_1': 'homosapiens adult stage', + 'AdultGallbladder_1': 'homosapiens adult stage', + 'AdultGallbladder_2': 'homosapiens adult stage', + 'AdultHeart_1': 'homosapiens adult stage', + 'AdultHeart_2': 'homosapiens adult stage', + 'AdultIleum_2': 'homosapiens adult stage', + 'AdultJejunum_2': 'homosapiens adult stage', + 'AdultKidney_2': 'homosapiens adult stage', + 'AdultKidney_3': 'homosapiens adult stage', + 'AdultKidney_4': 'homosapiens adult stage', + 'AdultLiver_1': 'homosapiens adult stage', + 'AdultLiver_2': 'homosapiens adult stage', + 'AdultLiver_4': 'homosapiens adult stage', + 'AdultLung_1': 'homosapiens adult stage', + 'AdultLung_2': 'homosapiens adult stage', + 'AdultLung_3': 'homosapiens adult stage', + 'AdultMuscle_1': 'homosapiens adult stage', + 'AdultOmentum_1': 'homosapiens adult stage', + 'AdultOmentum_2': 'homosapiens adult stage', + 'AdultOmentum_3': 'homosapiens adult stage', + 'AdultPancreas_1': 'homosapiens adult stage', + 'AdultPeripheralBlood_3': 'homosapiens adult stage', + 'AdultPeripheralBlood_4': 'homosapiens adult stage', + 'AdultPleura_1': 'homosapiens adult stage', + 'AdultProstate_1': 'homosapiens adult stage', + 'AdultRectum_1': 'homosapiens adult stage', + 'AdultSigmoidColon_1': 'homosapiens adult stage', + 'AdultSpleenParenchyma_1': 'homosapiens adult stage', + 'AdultSpleen_1': 'homosapiens adult stage', + 'AdultStomach_1': 'homosapiens adult stage', + 'AdultStomach_2': 'homosapiens adult stage', + 'AdultStomach_3': 'homosapiens adult stage', + 'AdultTemporalLobe_1': 'homosapiens adult stage', + 'AdultThyroid_1': 'homosapiens adult stage', + 'AdultThyroid_2': 'homosapiens adult stage', + 'AdultTrachea_2': 'homosapiens adult stage', + 'AdultTransverseColon_2': 'homosapiens adult stage', + 'AdultUreter_1': 'homosapiens adult stage', + 'AdultUterus_1': 'homosapiens adult stage', + 'BoneMarrow_1': 'homosapiens adult stage', + 'BoneMarrow_2': 'homosapiens adult stage', + 'ChorionicVillus_1': 'homosapiens adult stage', + 'CordBloodCD34P_1': 'homosapiens adult stage', + 'CordBloodCD34P_2': 'homosapiens adult stage', + 'CordBlood_1': 'homosapiens adult stage', + 'CordBlood_2': 'homosapiens adult stage', 'FetalAdrenalGland_2': 'fetal stage', 'FetalAdrenalGland_3': 'fetal stage', 'FetalAdrenalGland_4': 'fetal stage', @@ -250,11 +250,11 @@ def load(data_dir, **kwargs): 'FetalThymus_1': 'fetal stage', 'FetalThymus_2': 'fetal stage', 'HESC_1': 'blastula stage', - 'Liver_1': 'human adult stage', - 'Liver_2': 'human adult stage', - 'NeonatalAdrenalGland_1': 'newborn human stage', - 'PeripheralBlood_1': 'human adult stage', - 'Placenta_1': 'human adult stage', + 'Liver_1': 'homosapiens adult stage', + 'Liver_2': 'homosapiens adult stage', + 'NeonatalAdrenalGland_1': 'newborn homosapiens stage', + 'PeripheralBlood_1': 'homosapiens adult stage', + 'Placenta_1': 'homosapiens adult stage', } sex_dict = { 'Male': "male", @@ -312,7 +312,7 @@ def load(data_dir, **kwargs): ]], axis=1) assert np.all(a_idx == adata.obs.index) - # remove mouse cells from the object # ToDo: add this back in as mouse data sets? + # remove musmusculus cells from the object # ToDo: add this back in as musmusculus data sets? 
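The sample_dev_stage_dict assembled above is fully determined by sample-name prefixes, so the literal dictionary can be derived from a handful of rules. A minimal illustrative sketch of that derivation (not part of the patch; sample_names stands in for the keys listed above):

# Sketch: derive the dev-stage label for each Han et al. sample from its name prefix.
def dev_stage_for_sample(name: str) -> str:
    if name.startswith("Fetal"):
        return "fetal stage"
    if name.startswith("Neonatal"):
        return "newborn homosapiens stage"
    if name.startswith("HESC"):
        return "blastula stage"
    # All remaining samples (Adult*, BoneMarrow, CordBlood*, Liver, Placenta, ...) are adult.
    return "homosapiens adult stage"

sample_names = ["AdultAdipose_1", "FetalBrain_3", "NeonatalAdrenalGland_1", "HESC_1", "CordBlood_1"]
sample_dev_stage_dict = {name: dev_stage_for_sample(name) for name in sample_names}
assert sample_dev_stage_dict["NeonatalAdrenalGland_1"] == "newborn homosapiens stage"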
adata = adata[adata.obs["Source"] != "MCA2.0"].copy() # tidy up the column names of the obs annotations diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41586_020_2922_4/human_lung_2020_x_travaglini_001.yaml b/sfaira/data/dataloaders/loaders/d10_1038_s41586_020_2922_4/human_lung_2020_x_travaglini_001.yaml index 3797eb3c1..da8dee2be 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41586_020_2922_4/human_lung_2020_x_travaglini_001.yaml +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41586_020_2922_4/human_lung_2020_x_travaglini_001.yaml @@ -41,7 +41,7 @@ dataset_or_observation_wise: individual_obs_key: "patient" organ: "lung" organ_obs_key: - organism: "human" + organism: "Homo sapiens" organism_obs_key: sample_source: "primary_tissue" sample_source_obs_key: diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41590_020_0602_z/human_colon_2020_10xsequencing_james_001.py b/sfaira/data/dataloaders/loaders/d10_1038_s41590_020_0602_z/human_colon_2020_10xsequencing_james_001.py index fefcdccbe..b2f07e281 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41590_020_0602_z/human_colon_2020_10xsequencing_james_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41590_020_0602_z/human_colon_2020_10xsequencing_james_001.py @@ -20,7 +20,7 @@ def __init__(self, **kwargs): self.individual_obs_key = "donor" self.normalization = "raw" self.organ = "colon" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.state_exact = "healthy" diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41590_021_00933_1/human_blood_2021_citeseq_nathan_001.py b/sfaira/data/dataloaders/loaders/d10_1038_s41590_021_00933_1/human_blood_2021_citeseq_nathan_001.py index ba46939a6..b89eb24dd 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41590_021_00933_1/human_blood_2021_citeseq_nathan_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41590_021_00933_1/human_blood_2021_citeseq_nathan_001.py @@ -13,8 +13,7 @@ def load(data_dir, sample_fn, **kwargs): adata.X = scipy.sparse.csr_matrix(adata.X) metadata = pd.read_csv(os.path.join(data_dir, "GSE158769_meta_data.txt.gz"), sep="\t") adata.obs = adata.obs.join(metadata.set_index("cell_id")) - adata.obs["cluster_name"] = adata.obs["cluster_name"].astype("category") - adata.obs["cluster_name"].cat.add_categories("unknown").fillna("unknown") + adata.obs["cluster_name"] = adata.obs["cluster_name"].astype("str") adata.obs["disease"] = adata.obs["TB_status"].map(disease_map) diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41590_021_00933_1/human_blood_2021_citeseq_nathan_001.tsv b/sfaira/data/dataloaders/loaders/d10_1038_s41590_021_00933_1/human_blood_2021_citeseq_nathan_001.tsv index 7307751ad..1aa0b3558 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41590_021_00933_1/human_blood_2021_citeseq_nathan_001.tsv +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41590_021_00933_1/human_blood_2021_citeseq_nathan_001.tsv @@ -28,4 +28,4 @@ CD8+ activated activated CD8-positive, alpha-beta T cell CL:0000906 CD8+ central central memory CD8-positive, alpha-beta T cell CL:0000907 Vd1 gamma-delta T cell CL:0000798 Vd2 gamma-delta T cell CL:0000798 -unknown UNKNOWN UNKNOWN +nan UNKNOWN UNKNOWN diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41590_021_00933_1/human_blood_2021_citeseq_nathan_001.yaml b/sfaira/data/dataloaders/loaders/d10_1038_s41590_021_00933_1/human_blood_2021_citeseq_nathan_001.yaml index cb1283d17..c3eabc6cc 100644 --- 
a/sfaira/data/dataloaders/loaders/d10_1038_s41590_021_00933_1/human_blood_2021_citeseq_nathan_001.yaml +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41590_021_00933_1/human_blood_2021_citeseq_nathan_001.yaml @@ -33,7 +33,7 @@ dataset_or_observation_wise: individual_obs_key: organ: "blood" organ_obs_key: - organism: "human" + organism: "Homo sapiens" organism_obs_key: sample_source: "primary_tissue" sample_source_obs_key: diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41591_019_0468_5/human_lung_2019_dropseq_braga_001.py b/sfaira/data/dataloaders/loaders/d10_1038_s41591_019_0468_5/human_lung_2019_dropseq_braga_001.py index fb9cdbce0..535f0f006 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41591_019_0468_5/human_lung_2019_dropseq_braga_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41591_019_0468_5/human_lung_2019_dropseq_braga_001.py @@ -18,7 +18,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1038/s41591-019-0468-5" self.normalization = "raw" self.organ = "lung" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.state_exact = "uninvolved areas of tumour resection material" diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41591_019_0468_5/human_x_2019_10xsequencing_braga_x.py b/sfaira/data/dataloaders/loaders/d10_1038_s41591_019_0468_5/human_x_2019_10xsequencing_braga_x.py index ed1ac4e5f..7954fa0c4 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41591_019_0468_5/human_x_2019_10xsequencing_braga_x.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41591_019_0468_5/human_x_2019_10xsequencing_braga_x.py @@ -23,7 +23,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1038/s41591-019-0468-5" self.normalization = "scaled" self.organ = "bronchus" if self.sample_fn == "vieira19_Bronchi_anonymised.processed.h5ad" else "lung parenchyma" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2019 diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41593_019_0393_4/mouse_x_2019_10xsequencing_hove_001.py b/sfaira/data/dataloaders/loaders/d10_1038_s41593_019_0393_4/mouse_x_2019_10xsequencing_hove_001.py index 9028204b2..ed486eb32 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41593_019_0393_4/mouse_x_2019_10xsequencing_hove_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41593_019_0393_4/mouse_x_2019_10xsequencing_hove_001.py @@ -21,7 +21,7 @@ def __init__(self, **kwargs): self.disease = "healthy" self.doi_journal = "10.1038/s41593-019-0393-4" self.normalization = "raw" - self.organism = "mouse" + self.organism = "Mus musculus" self.primary_data = True self.sample_source = "primary_tissue" self.state_exact = "healthy" diff --git a/sfaira/data/dataloaders/loaders/d10_1038_s41597_019_0351_8/human_kidney_2020_10xsequencing_liao_001.py b/sfaira/data/dataloaders/loaders/d10_1038_s41597_019_0351_8/human_kidney_2020_10xsequencing_liao_001.py index a5a67a430..3ba0ccecd 100644 --- a/sfaira/data/dataloaders/loaders/d10_1038_s41597_019_0351_8/human_kidney_2020_10xsequencing_liao_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1038_s41597_019_0351_8/human_kidney_2020_10xsequencing_liao_001.py @@ -20,7 +20,7 @@ def __init__(self, **kwargs): self.disease = "healthy" self.normalization = "raw" self.organ = "kidney" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2020 diff --git 
a/sfaira/data/dataloaders/loaders/d10_1073_pnas_1914143116/human_retina_2019_10xsequencing_voigt_001.py b/sfaira/data/dataloaders/loaders/d10_1073_pnas_1914143116/human_retina_2019_10xsequencing_voigt_001.py index 611ae2788..727dd683b 100644 --- a/sfaira/data/dataloaders/loaders/d10_1073_pnas_1914143116/human_retina_2019_10xsequencing_voigt_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1073_pnas_1914143116/human_retina_2019_10xsequencing_voigt_001.py @@ -18,7 +18,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1073/pnas.1914143116" self.normalization = "norm" self.organ = "retina" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2019 diff --git a/sfaira/data/dataloaders/loaders/d10_1084_jem_20191130/human_x_2019_10xsequencing_wang_001.py b/sfaira/data/dataloaders/loaders/d10_1084_jem_20191130/human_x_2019_10xsequencing_wang_001.py index 38b58e575..365a7458e 100644 --- a/sfaira/data/dataloaders/loaders/d10_1084_jem_20191130/human_x_2019_10xsequencing_wang_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1084_jem_20191130/human_x_2019_10xsequencing_wang_001.py @@ -27,7 +27,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1084/jem.20191130" self.normalization = "raw" self.organ = organ - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2019 diff --git a/sfaira/data/dataloaders/loaders/d10_1101_2020_03_13_991455/human_lung_2020_10xsequencing_lukassen_001.py b/sfaira/data/dataloaders/loaders/d10_1101_2020_03_13_991455/human_lung_2020_10xsequencing_lukassen_001.py index 1a04698fd..61281afe1 100644 --- a/sfaira/data/dataloaders/loaders/d10_1101_2020_03_13_991455/human_lung_2020_10xsequencing_lukassen_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1101_2020_03_13_991455/human_lung_2020_10xsequencing_lukassen_001.py @@ -25,7 +25,7 @@ def __init__(self, **kwargs): self.doi_preprint = "10.1101/2020.03.13.991455" self.normalization = "raw" self.organ = "lung" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2020 diff --git a/sfaira/data/dataloaders/loaders/d10_1101_2020_10_12_335331/human_blood_2020_10x_hao_001.yaml b/sfaira/data/dataloaders/loaders/d10_1101_2020_10_12_335331/human_blood_2020_10x_hao_001.yaml index 5cd1ca3e7..636b7ea18 100644 --- a/sfaira/data/dataloaders/loaders/d10_1101_2020_10_12_335331/human_blood_2020_10x_hao_001.yaml +++ b/sfaira/data/dataloaders/loaders/d10_1101_2020_10_12_335331/human_blood_2020_10x_hao_001.yaml @@ -33,7 +33,7 @@ dataset_or_observation_wise: individual_obs_key: "donor" organ: "blood" organ_obs_key: - organism: "human" + organism: "Homo sapiens" organism_obs_key: sample_source: "primary_tissue" sample_source_obs_key: diff --git a/sfaira/data/dataloaders/loaders/d10_1101_661728/mouse_x_2019_x_pisco_x.py b/sfaira/data/dataloaders/loaders/d10_1101_661728/mouse_x_2019_x_pisco_x.py index f35571467..86e7f4c77 100644 --- a/sfaira/data/dataloaders/loaders/d10_1101_661728/mouse_x_2019_x_pisco_x.py +++ b/sfaira/data/dataloaders/loaders/d10_1101_661728/mouse_x_2019_x_pisco_x.py @@ -86,7 +86,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1038/s41586-020-2496-1" self.doi_preprint = "10.1101/661728" self.normalization = "norm" - self.organism = "mouse" + self.organism = "Mus musculus" self.organ = organ self.primary_data = True self.assay_sc = "10x 3' v2" if self.sample_fn.split("-")[3] == 
"droplet" else "Smart-seq2" diff --git a/sfaira/data/dataloaders/loaders/d10_1101_753806/human_lungparenchyma_2020_10xsequencing_habermann_001.py b/sfaira/data/dataloaders/loaders/d10_1101_753806/human_lungparenchyma_2020_10xsequencing_habermann_001.py index 84d6114e7..b4da8c6cc 100644 --- a/sfaira/data/dataloaders/loaders/d10_1101_753806/human_lungparenchyma_2020_10xsequencing_habermann_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1101_753806/human_lungparenchyma_2020_10xsequencing_habermann_001.py @@ -33,7 +33,7 @@ def __init__(self, **kwargs): self.doi_preprint = "10.1101/753806" self.normalization = "raw" self.organ = "lung parenchyma" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.assay_sc_obs_key = "Chemistry" self.year = 2020 diff --git a/sfaira/data/dataloaders/loaders/d10_1126_science_aat5031/human_kidney_2019_10xsequencing_stewart_001.py b/sfaira/data/dataloaders/loaders/d10_1126_science_aat5031/human_kidney_2019_10xsequencing_stewart_001.py index 005a6cc9a..691972147 100644 --- a/sfaira/data/dataloaders/loaders/d10_1126_science_aat5031/human_kidney_2019_10xsequencing_stewart_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1126_science_aat5031/human_kidney_2019_10xsequencing_stewart_001.py @@ -25,7 +25,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.1126/science.aat5031" self.normalization = "norm" self.organ = "kidney" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.state_exact = "healthy" diff --git a/sfaira/data/dataloaders/loaders/d10_1126_science_aay3224/human_thymus_2020_10xsequencing_park_001.py b/sfaira/data/dataloaders/loaders/d10_1126_science_aay3224/human_thymus_2020_10xsequencing_park_001.py index 3f0e06ac0..63a9d6a0d 100644 --- a/sfaira/data/dataloaders/loaders/d10_1126_science_aay3224/human_thymus_2020_10xsequencing_park_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1126_science_aay3224/human_thymus_2020_10xsequencing_park_001.py @@ -26,7 +26,7 @@ def __init__(self, **kwargs): self.individual_obs_key = "donor" self.normalization = "norm" self.organ = "thymus" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.sex_obs_key = "Gender" diff --git a/sfaira/data/dataloaders/loaders/d10_1126_science_aba7721/human_x_2020_scirnaseq_cao_001.py b/sfaira/data/dataloaders/loaders/d10_1126_science_aba7721/human_x_2020_scirnaseq_cao_001.py index a8ee1d628..065eb0dfa 100644 --- a/sfaira/data/dataloaders/loaders/d10_1126_science_aba7721/human_x_2020_scirnaseq_cao_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1126_science_aba7721/human_x_2020_scirnaseq_cao_001.py @@ -12,24 +12,24 @@ def load(data_dir, **kwargs): } dev_stage_dict = { - 72: "11th week post-fertilization human stage", - 74: "11th week post-fertilization human stage", - 85: "13th week post-fertilization human stage", - 89: "13th week post-fertilization human stage", - 90: "13th week post-fertilization human stage", - 94: "14th week post-fertilization human stage", - 96: "14th week post-fertilization human stage", - 100: "15th week post-fertilization human stage", - 110: "16th week post-fertilization human stage", - 112: "17th week post-fertilization human stage", - 113: "17th week post-fertilization human stage", - 115: "17th week post-fertilization human stage", - 117: "17th week post-fertilization human stage", - 119: "18th week post-fertilization human stage", - 120: "18th week post-fertilization human 
stage", - 122: "18th week post-fertilization human stage", - 125: "18th week post-fertilization human stage", - 129: "19th week post-fertilization human stage", + 72: "11th week post-fertilization homosapiens stage", + 74: "11th week post-fertilization homosapiens stage", + 85: "13th week post-fertilization homosapiens stage", + 89: "13th week post-fertilization homosapiens stage", + 90: "13th week post-fertilization homosapiens stage", + 94: "14th week post-fertilization homosapiens stage", + 96: "14th week post-fertilization homosapiens stage", + 100: "15th week post-fertilization homosapiens stage", + 110: "16th week post-fertilization homosapiens stage", + 112: "17th week post-fertilization homosapiens stage", + 113: "17th week post-fertilization homosapiens stage", + 115: "17th week post-fertilization homosapiens stage", + 117: "17th week post-fertilization homosapiens stage", + 119: "18th week post-fertilization homosapiens stage", + 120: "18th week post-fertilization homosapiens stage", + 122: "18th week post-fertilization homosapiens stage", + 125: "18th week post-fertilization homosapiens stage", + 129: "19th week post-fertilization homosapiens stage", } organ_dict = { diff --git a/sfaira/data/dataloaders/loaders/d10_1126_science_aba7721/human_x_2020_scirnaseq_cao_001.yaml b/sfaira/data/dataloaders/loaders/d10_1126_science_aba7721/human_x_2020_scirnaseq_cao_001.yaml index 9c1da6595..3ec56ec19 100644 --- a/sfaira/data/dataloaders/loaders/d10_1126_science_aba7721/human_x_2020_scirnaseq_cao_001.yaml +++ b/sfaira/data/dataloaders/loaders/d10_1126_science_aba7721/human_x_2020_scirnaseq_cao_001.yaml @@ -33,7 +33,7 @@ dataset_or_observation_wise: individual_obs_key: "Fetus_id" organ: organ_obs_key: "Organ" - organism: "human" + organism: "Homo sapiens" organism_obs_key: sample_source: "primary_tissue" sample_source_obs_key: diff --git a/sfaira/data/dataloaders/loaders/d10_1186_s13059_019_1906_x/human_x_2019_10xsequencing_madissoon_001.py b/sfaira/data/dataloaders/loaders/d10_1186_s13059_019_1906_x/human_x_2019_10xsequencing_madissoon_001.py index c8070d7b0..7c9747eb9 100644 --- a/sfaira/data/dataloaders/loaders/d10_1186_s13059_019_1906_x/human_x_2019_10xsequencing_madissoon_001.py +++ b/sfaira/data/dataloaders/loaders/d10_1186_s13059_019_1906_x/human_x_2019_10xsequencing_madissoon_001.py @@ -40,7 +40,7 @@ def __init__(self, **kwargs): self.normalization = "raw" # ToDo "madissoon19_lung.processed.h5ad" is close to integer but not quire (~1e-4) self.organ = "lung parenchyma" if self.sample_fn == "madissoon19_lung.processed.h5ad" else \ "esophagus" if self.sample_fn == "oesophagus.cellxgene.h5ad" else "spleen" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.year = 2019 self.sample_source = "primary_tissue" diff --git a/sfaira/data/dataloaders/loaders/d10_15252_embj_2018100811/human_retina_2019_10xsequencing_lukowski_001.py b/sfaira/data/dataloaders/loaders/d10_15252_embj_2018100811/human_retina_2019_10xsequencing_lukowski_001.py index 8a96740f6..30e4b3c85 100644 --- a/sfaira/data/dataloaders/loaders/d10_15252_embj_2018100811/human_retina_2019_10xsequencing_lukowski_001.py +++ b/sfaira/data/dataloaders/loaders/d10_15252_embj_2018100811/human_retina_2019_10xsequencing_lukowski_001.py @@ -20,7 +20,7 @@ def __init__(self, **kwargs): self.doi_journal = "10.15252/embj.2018100811" self.normalization = "raw" self.organ = "retina" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" 
self.year = 2019 diff --git a/sfaira/data/dataloaders/loaders/dno_doi_10x_genomics/human_blood_2019_10xsequencing_10xgenomics_001.py b/sfaira/data/dataloaders/loaders/dno_doi_10x_genomics/human_blood_2019_10xsequencing_10xgenomics_001.py index 9c5702c69..20492d086 100644 --- a/sfaira/data/dataloaders/loaders/dno_doi_10x_genomics/human_blood_2019_10xsequencing_10xgenomics_001.py +++ b/sfaira/data/dataloaders/loaders/dno_doi_10x_genomics/human_blood_2019_10xsequencing_10xgenomics_001.py @@ -21,7 +21,7 @@ def __init__(self, **kwargs): self.doi_journal = "no_doi_10x_genomics" self.normalization = "raw" self.organ = "blood" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2019 diff --git a/sfaira/data/dataloaders/loaders/dno_doi_regev/human_x_2018_10xsequencing_regev_001.py b/sfaira/data/dataloaders/loaders/dno_doi_regev/human_x_2018_10xsequencing_regev_001.py index b1d8c86ad..4339abdc6 100644 --- a/sfaira/data/dataloaders/loaders/dno_doi_regev/human_x_2018_10xsequencing_regev_001.py +++ b/sfaira/data/dataloaders/loaders/dno_doi_regev/human_x_2018_10xsequencing_regev_001.py @@ -23,7 +23,7 @@ def __init__(self, **kwargs): self.doi_journal = "no_doi_regev" self.normalization = "raw" self.organ_obs_key = "derived_organ_parts_label" - self.organism = "human" + self.organism = "Homo sapiens" self.primary_data = True self.sample_source = "primary_tissue" self.year = 2018 diff --git a/sfaira/data/interactive/loader.py b/sfaira/data/interactive/loader.py index ff4f459b1..1bcc44bd2 100644 --- a/sfaira/data/interactive/loader.py +++ b/sfaira/data/interactive/loader.py @@ -9,12 +9,8 @@ class DatasetInteractive(DatasetBase): def __init__( self, data: anndata.AnnData, - organism: str, - organ: str, gene_symbol_col: Union[str, None] = 'index', gene_ens_col: Union[str, None] = None, - obs_key_celltypes: Union[str, None] = None, - class_maps: dict = {}, dataset_id: str = "interactive_dataset", data_path: Union[str, None] = ".", meta_path: Union[str, None] = ".", @@ -24,12 +20,8 @@ def __init__( Load data set into sfaira data format. :param data: Data set. - :param organism: Organism of data set. - :param organ: Organ of data set. :param gene_symbol_col: Column name in .var which contains gene symbols. Set to "index" to use the index. :param gene_ens_col: Column name in .var which contains ENSG symbols. Set to "index" to use the index. - :param obs_key_celltypes: .obs column name which contains cell type labels. - :param class_maps: Cell type class maps. :param dataset_id: Identifier of data set. :param data_path: :param meta_path: @@ -40,47 +32,12 @@ def __init__( self.author = "interactive_dataset" self.doi_journal = "interactive_dataset" - self.download_url_data = "." self.download_url_meta = "."
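With the organism, organ, cell type and class-map arguments removed above, constructing an interactive dataset reduces to an AnnData object plus the gene identifier columns. A minimal usage sketch against the slimmed-down signature, assuming DatasetInteractive remains importable from sfaira.data.interactive.loader and that the constructor accepts the arguments shown in the diff (the toy adata is a placeholder):

import anndata
import numpy as np
from sfaira.data.interactive.loader import DatasetInteractive

# Placeholder AnnData with gene symbols in the .var index.
adata = anndata.AnnData(X=np.ones((10, 3)))
adata.var.index = ["VTA1", "MLXIPL", "BAZ1B"]

ds = DatasetInteractive(
    data=adata,
    gene_symbol_col="index",  # gene symbols are taken from .var.index
    gene_ens_col=None,        # this toy object has no ENSG column
    dataset_id="interactive_dataset",
)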
- # self.age # not currently supported - # self.assay_sc # not currently supported - # self.assay_differentiation # not currently supported - # self.assay_type_differentiation # not currently supported - # self.cell_line # not currently supported - # self.dev_stage # not currently supported - # self.ethnicity # not currently supported - # self.healthy # not currently supported - # self.normalisation # not currently supported - self.organ = organ - self.organism = organism - # self.sample_source # not currently supported - # self.sex # not currently supported - # self.state_exact # not currently supported - # self.year # not currently supported - - self.obs_key_cell_types_original = obs_key_celltypes - - # self.obs_key_age # not currently supported - # self.obs_key_assay_sc # not currently supported - # self.obs_key_assay_differentiation # not currently supported - # self.obs_key_assay_type_differentiation # not currently supported - # self.obs_key_cell_line # not currently supported - # self.obs_key_dev_stage # not currently supported - # self.obs_key_ethnicity # not currently supported - # self.obs_key_healthy # not currently supported - # self.obs_key_organ # not currently supported - # self.obs_key_organism # not currently supported - # self.obs_key_sample_source # not currently supported - # self.obs_key_sex # not currently supported - # self.obs_key_state_exact # not currently supported - self.gene_id_symbols_var_key = gene_symbol_col self.gene_id_ensembl_var_key = gene_ens_col - self.class_maps = class_maps - self.adata = data def _load(self): diff --git a/sfaira/data/store/batch_schedule.py b/sfaira/data/store/batch_schedule.py index 6a2a10718..fdbf46e23 100644 --- a/sfaira/data/store/batch_schedule.py +++ b/sfaira/data/store/batch_schedule.py @@ -2,17 +2,6 @@ from typing import List, Tuple -def _get_batch_start_ends(idx: np.ndarray, batch_size: int): - n_obs = len(idx) - remainder = n_obs % batch_size if n_obs > 0 else 0 - n_batches = int(n_obs // batch_size + int(remainder > 0)) if n_obs > 0 else 0 - batch_starts_ends = [ - (int(x * batch_size), int(np.minimum((x * batch_size) + batch_size, n_obs))) - for x in np.arange(0, n_batches) - ] - return batch_starts_ends - - def _randomize_batch_start_ends(batch_starts_ends): batch_range = np.arange(0, len(batch_starts_ends)) np.random.shuffle(batch_range) @@ -31,6 +20,17 @@ def __init__(self, retrieval_batch_size: int, randomized_batch_access: bool, ran self.randomized_batch_access = randomized_batch_access self.random_access = random_access + @staticmethod + def _get_batch_start_ends(idx: np.ndarray, batch_size: int): + n_obs = len(idx) + remainder = n_obs % batch_size if n_obs > 0 else 0 + n_batches = int(n_obs // batch_size + int(remainder > 0)) if n_obs > 0 else 0 + batch_starts_ends = [ + (int(x * batch_size), int(np.minimum((x * batch_size) + batch_size, n_obs))) + for x in np.arange(0, n_batches) + ] + return batch_starts_ends + @property def batch_bounds(self): """ @@ -48,7 +48,7 @@ def idx(self): @idx.setter def idx(self, x): - self._batch_bounds = _get_batch_start_ends(idx=x, batch_size=self.retrieval_batch_size) + self._batch_bounds = self._get_batch_start_ends(idx=x, batch_size=self.retrieval_batch_size) self._idx = np.sort(x) # Sorted indices improve accession efficiency in some cases. 
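The _get_batch_start_ends helper moved onto the class above tiles len(idx) observations into batch_size-sized half-open intervals, with a shorter final batch holding the remainder. A standalone, runnable copy of that logic (for illustration; the real method now lives on BatchDesignBase):

import numpy as np

def get_batch_start_ends(idx: np.ndarray, batch_size: int):
    # Number of full batches plus one partial batch if a remainder exists.
    n_obs = len(idx)
    remainder = n_obs % batch_size if n_obs > 0 else 0
    n_batches = int(n_obs // batch_size + int(remainder > 0)) if n_obs > 0 else 0
    return [
        (int(x * batch_size), int(np.minimum((x * batch_size) + batch_size, n_obs)))
        for x in np.arange(0, n_batches)
    ]

# Ten observations in batches of four: the final batch holds the remainder of two.
assert get_batch_start_ends(np.arange(10), 4) == [(0, 4), (4, 8), (8, 10)]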
@property @@ -123,7 +123,85 @@ def design(self) -> Tuple[np.ndarray, np.ndarray, List[Tuple[int, int]]]: return idx_proc, self.idx[idx_proc], batch_bounds +class BatchDesignBlocks(BatchDesignBase): + + """Yields metadata-defined blocks of observations in each iteration.""" + + def __init__(self, grouping, random_access: bool, **kwargs): + """ + :param grouping: Group label for each entry in idx. + :param random_access: Whether to yield the blocks in random order in each iteration. + """ + super(BatchDesignBlocks, self).__init__(random_access=random_access, **kwargs) + if not random_access: + print("WARNING: random_access==False is dangerous if you do not work with a large shuffle buffer " + "downstream of the sfaira generator.") + # Create integer group assignment array. + self.grouping = grouping + + @property + def grouping(self): + return self._grouping[self.idx] + + @grouping.setter + def grouping(self, x): + self._grouping = x + # Reset: + self._groups = None + self.grouping_sizes = None + self.idx_sorted = None + + @property + def groups(self): + if self._groups is None: + self._groups = np.unique(self.grouping) + return self._groups + + def _get_batch_start_ends(self, idx: np.ndarray, batch_size: int): + n_batches = len(self.groups) + batch_starts_ends = [] + for x in np.arange(0, n_batches): + s = 0 if x == 0 else batch_starts_ends[-1][1] + batch_starts_ends.append((s, int(s + self.grouping_sizes[x]))) + return batch_starts_ends + + @property + def idx(self): + """ + Protects property from uncontrolled changing. + Changes to _idx require changes to _batch_bounds. + """ + return self._idx + + @idx.setter + def idx(self, x): + self._idx = x + # Reset: + self._groups = None + grouping_sizes = np.zeros((self.groups.shape[0],), dtype="int32") + idx_sorted = [] + for i, g in enumerate(self.groups): + idx_group = np.where(self.grouping == g)[0] + grouping_sizes[i] = len(idx_group) + idx_sorted.append(idx_group) + self.grouping_sizes = grouping_sizes + self.idx_sorted = np.concatenate(idx_sorted) + self._batch_bounds = self._get_batch_start_ends(idx=x, batch_size=self.retrieval_batch_size) + + @property + def design(self) -> Tuple[np.ndarray, np.ndarray, List[Tuple[int, int]]]: + idx_proc = self.idx_sorted.copy() + batch_bounds = self.batch_bounds.copy() + if self.random_access: # Shuffle the order of blocks; within-block order is preserved. + batch_bounds = np.asarray(batch_bounds) + np.random.shuffle(batch_bounds) + batch_bounds = batch_bounds.tolist() + return idx_proc, self.idx[idx_proc], batch_bounds + + BATCH_SCHEDULE = { "base": BatchDesignBasic, "balanced": BatchDesignBalanced, + "blocks": BatchDesignBlocks, } diff --git a/sfaira/data/store/generators.py b/sfaira/data/store/generators.py index 173c94e76..dc51f7dd2 100644 --- a/sfaira/data/store/generators.py +++ b/sfaira/data/store/generators.py @@ -101,11 +101,10 @@ def __init__(self, batch_schedule, batch_size, map_fn, obs_idx, obs_keys, var_id :param obs_keys: .obs columns to return in the generator. These have to be a subset of the columns available in self.adata_by_key. :param var_idx: The features to emit. + :param split_to_obs: Whether to split tensors to observation-wise slices at the emission stage of the generator.
""" self.var_idx = var_idx self._obs_idx = None - if not batch_size == 1: - raise ValueError(f"Only batch size==1 is supported, found {batch_size}.") self.batch_schedule = batch_schedule self.batch_size = batch_size self.map_fn = map_fn diff --git a/sfaira/data/store/multi_store.py b/sfaira/data/store/multi_store.py index a041c7b86..bfb9940b3 100644 --- a/sfaira/data/store/multi_store.py +++ b/sfaira/data/store/multi_store.py @@ -247,8 +247,15 @@ def __init__(self, adatas: Union[anndata.AnnData, List[anndata.AnnData], Tuple[a indices = {} if isinstance(adatas, anndata.AnnData): adatas = [adatas] - for adata in adatas: - organism = adata.uns[self._adata_ids_sfaira.organism] + for i, adata in enumerate(adatas): + # Check if adata has a unique ID, if not, add one: + if self._adata_ids_sfaira.id not in adata.uns.keys(): + adata.uns[self._adata_ids_sfaira.id] = f"adata_{i}" + if self._adata_ids_sfaira.organism in adata.uns.keys(): + organism = adata.uns[self._adata_ids_sfaira.organism] + else: + # Declare as unknown organism and genome and make a group of its own: + organism = adata.uns[self._adata_ids_sfaira.id] if isinstance(organism, list): if len(organism) == 1: organism = organism[0] diff --git a/sfaira/data/store/single_store.py b/sfaira/data/store/single_store.py index 6a0598f9e..429ccfecf 100644 --- a/sfaira/data/store/single_store.py +++ b/sfaira/data/store/single_store.py @@ -13,7 +13,7 @@ from sfaira.data.dataloaders.base.utils import is_child, UNS_STRING_META_IN_OBS from sfaira.data.store.base import DistributedStoreBase from sfaira.data.store.generators import GeneratorAnndata, GeneratorDask, GeneratorSingle -from sfaira.versions.genomes.genomes import GenomeContainer +from sfaira.versions.genomes.genomes import GenomeContainer, ReactiveFeatureContainer """ Distributed stores are array-like classes that sit on groups of on disk representations of anndata instances files. @@ -35,8 +35,8 @@ def _process_batch_size(batch_size: int, retrival_batch_size: int) -> Tuple[int, int]: - if batch_size != 1: - raise ValueError("batch size is only supported as 1") + if batch_size not in [0, 1]: + raise ValueError("batch size is only supported other than 0 or 1") return batch_size, retrival_batch_size @@ -67,6 +67,7 @@ class DistributedStoreSingleFeatureSpace(DistributedStoreBase): _indices: Dict[str, np.ndarray] _obs_by_key: Union[None, Dict[str, dask.dataframe.DataFrame]] data_source: str + _dataset_weights: Dict[str, float] def __init__(self, adata_by_key: Dict[str, anndata.AnnData], indices: Dict[str, np.ndarray], obs_by_key: Union[None, Dict[str, dask.dataframe.DataFrame]] = None, data_source: str = "X"): @@ -190,12 +191,17 @@ def genome_container(self) -> Union[GenomeContainer, None]: return self._genome_container @genome_container.setter - def genome_container(self, x: Union[GenomeContainer]): - var_names = self._validate_feature_space_homogeneity() - # Validate genome container choice: - # Make sure that all var names defined in genome container are also contained in loaded data sets. - assert np.all([y in var_names for y in x.ensembl]), \ - "did not find variable names from genome container in store" + def genome_container(self, x: Union[GenomeContainer, None]): + if x is not None: + var_names = self._validate_feature_space_homogeneity() + if isinstance(x, ReactiveFeatureContainer): + # Load stores feature names into container. 
+ x.symbols = var_names + else: + # Validate genome container choice: + # Make sure that all var names defined in genome container are also contained in loaded data sets. + assert np.all([y in var_names for y in x.ensembl]), \ + "did not find variable names from genome container in store" self._genome_container = x @property @@ -236,11 +242,11 @@ def get_subset_idx(self, attr_key, values: Union[str, List[str], None], assert (values is None or excluded_values is not None) or (values is not None or excluded_values is None), \ "supply either values or excluded_values" - def get_idx(adata, obs, k, v, xv, dataset): + def get_idx(adata, obs, k, v, xv, dataset) -> np.ndarray: # Use cell-wise annotation if data set-wide maps are ambiguous: # This can happen if the different cell-wise annotations are summarised as a union in .uns. read_from_uns = (getattr(self._adata_ids_sfaira, k) in adata.uns.keys() and - adata.uns[getattr(self._adata_ids_sfaira, k)] != UNS_STRING_META_IN_OBS and + np.all(adata.uns[getattr(self._adata_ids_sfaira, k)] != UNS_STRING_META_IN_OBS) and getattr(self._adata_ids_sfaira, k) not in obs.columns) read_from_obs = not read_from_uns and getattr(self._adata_ids_sfaira, k) in obs.columns if read_from_uns: @@ -255,31 +261,32 @@ def get_idx(adata, obs, k, v, xv, dataset): # Replicate unique property along cell dimension. values_found = [values_found[0] for _ in range(adata.n_obs)] elif read_from_obs: - values_found = obs[getattr(self._adata_ids_sfaira, k)].values + values_found = obs[getattr(self._adata_ids_sfaira, k)].to_numpy() else: values_found = [] print(f"WARNING: did not find attribute {k} in data set {dataset}") - values_found_unique = np.unique(values_found) + try: ontology = getattr(self.ontology_container, k) except AttributeError: raise ValueError(f"{k} not a valid property of ontology_container object") - # Test only unique elements found in ontology to save time. + if v is not None: - values_found_unique_matched = [ - x for x in values_found_unique if np.any([ - is_child(query=x, ontology=ontology, ontology_parent=y) - for y in v - ]) - ] + v_xv_selector = v else: - values_found_unique_matched = [ - x for x in values_found_unique if np.all([ - not is_child(query=x, ontology=ontology, ontology_parent=y) - for y in xv - ]) - ] - idx = np.where([x in values_found_unique_matched for x in values_found])[0] + v_xv_selector = xv + values_found_unique_matched = [] + for x in pd.unique(values_found): + # don't run ontology checks for 'unknown' placeholders + if any([ + x == self._adata_ids_sfaira.unknown_metadata_identifier, + x == self._adata_ids_sfaira.not_a_cell_celltype_identifier + ]): + values_found_unique_matched.append(x) + elif np.all([is_child(query=x, ontology=ontology, ontology_parent=y) for y in v_xv_selector]): + values_found_unique_matched.append(x) + + idx = np.where(np.isin(values_found, values_found_unique_matched))[0] return idx indices = {} @@ -293,10 +300,10 @@ def get_idx(adata, obs, k, v, xv, dataset): # Cannot index on view here as indexing on view of views of backed anndata objects is not yet supported. idx_subset = get_idx(adata=adata_k, obs=obs_k, k=attr_key, v=values, xv=excluded_values, dataset=key) # Keep intersection of old and new hits.
- idx_new = np.sort(list(set(np.asarray(idx_old).tolist()).intersection( - set(np.asarray(idx_subset).tolist())))) + idx_new = np.intersect1d(idx_old, idx_subset) if len(idx_new) > 0: indices[key] = np.asarray(idx_new, dtype="int32") + return indices def subset(self, attr_key, values: Union[str, List[str], None] = None, @@ -322,6 +329,7 @@ def subset(self, attr_key, values: Union[str, List[str], None] = None, - "state_exact" points to self.state_exact_obs_key :param values: Classes to overlap to. Supply either values or excluded_values. :param excluded_values: Classes to exclude from match list. Supply either values or excluded_values. + :param verbose: If >0, print a warning message if the store is empty after subsetting. """ self.indices = self.get_subset_idx(attr_key=attr_key, values=values, excluded_values=excluded_values) if self.n_obs == 0 and verbose > 0: @@ -401,7 +409,7 @@ def _index_curation_helper( :param batch_size: Number of observations read from disk in each batched access (generator invocation). :return: Tuple: - var_idx: Processed feature index vector for generator to access. - - batch_size: Processed batch size for generator to access. + - batch_size: Processed batch size for generator to access. Choose as 0 to keep batch schedule batches. - retrival_batch_size: Processed retrieval batch size for generator to access. """ # Make sure that features are ordered in the same way in each object so that generator yields consistent cell @@ -410,8 +418,11 @@ def _index_curation_helper( # Use feature space sub-selection based on assembly if provided, will use full feature space otherwise. if self.genome_container is not None: var_names_target = self.genome_container.ensembl - # Check if index vector is just full ordered list of indices, in this case, sub-setting is unnecessary. - if len(var_names_target) == len(var_names) and np.all(var_names_target == var_names): + if var_names_target is None: + # Check if genome container does not constrain features. + var_idx = None + elif len(var_names_target) == len(var_names) and np.all(var_names_target == var_names): + # Check if index vector is just full ordered list of indices, in this case, sub-setting is unnecessary. var_idx = None else: # Check if variable names are continuous stretch in reference list, indexing this is much faster. @@ -462,7 +473,7 @@ def generator( batch_size: int = 1, retrieval_batch_size: int = 128, map_fn=None, - obs_keys: List[str] = [], + obs_keys: List[str] = None, return_dense: bool = True, randomized_batch_access: bool = False, random_access: bool = False, @@ -481,8 +492,8 @@ def generator( along all observations in self.adata_by_key, ordered along a hypothetical concatenation along the keys of self.adata_by_key. If None, all observations are selected. :param batch_size: Number of observations to yield in each access (generator invocation). - :param retrieval_batch_size: Number of observations read from disk in each batched access (data-backend generator - invocation). + :param retrieval_batch_size: Number of observations read from disk in each batched access (data-backend + generator invocation). :param map_fn: Map function to apply to output tuple of raw generator. Each draw i from the generator is then: `yield map_fn(x[i, var_idx], obs[i, obs_keys])` :param obs_keys: .obs columns to return in the generator. These have to be a subset of the columns available @@ -509,11 +520,14 @@ def generator( :return: Generator function which yields batch_size at every invocation. The generator returns a tuple of (.X, .obs).
""" + if obs_keys is None: + obs_keys = [] var_idx, batch_size, retrieval_batch_size = self._index_curation_helper( batch_size=batch_size, retrival_batch_size=retrieval_batch_size) batch_schedule_kwargs = {"randomized_batch_access": randomized_batch_access, "random_access": random_access, "retrieval_batch_size": retrieval_batch_size} + kwargs['return_dense'] = return_dense gen = self._get_generator(batch_schedule=batch_schedule, batch_size=batch_size, map_fn=map_fn, obs_idx=idx, obs_keys=obs_keys, var_idx=var_idx, **batch_schedule_kwargs, **kwargs) return gen @@ -584,8 +598,8 @@ def X_slice(self, idx: np.ndarray, as_sparse: bool = True, **kwargs) -> Union[np """ batch_size = min(len(idx), 128) - def map_fn(x, obs): - return (x, ), + def map_fn(x_, obs): + return (x_, ), g = self.generator(idx=idx, retrieval_batch_size=batch_size, return_dense=True, random_access=False, randomized_batch_access=False, map_fn=map_fn, **kwargs) diff --git a/sfaira/data/utils_scripts/create_anatomical_configs_store.py b/sfaira/data/utils_scripts/create_anatomical_configs_store.py index cb2e2ce36..409998229 100644 --- a/sfaira/data/utils_scripts/create_anatomical_configs_store.py +++ b/sfaira/data/utils_scripts/create_anatomical_configs_store.py @@ -15,7 +15,7 @@ configs_to_write = { - "human": [ + "Homo sapiens": [ "adipose tissue", "adrenal gland", "artery", @@ -52,7 +52,7 @@ "uterus", "vault of skull", ], - "mouse": [ + "Mus musculus": [ "adipose tissue", "blood", "bone marrow", diff --git a/sfaira/data/utils_scripts/test_store.py b/sfaira/data/utils_scripts/test_store.py index 64bd0ecdd..8f1fa7464 100644 --- a/sfaira/data/utils_scripts/test_store.py +++ b/sfaira/data/utils_scripts/test_store.py @@ -55,8 +55,8 @@ def time_gen(_store, store_format, kwargs) -> List[float]: if store_format == "h5ad": del kwargs["random_access"] if kwargs["var_subset"]: - gc = sfaira.versions.genomes.genomes.GenomeContainer(assembly="Homo_sapiens.GRCh38.102") - gc.subset(symbols=["VTA1", "MLXIPL", "BAZ1B", "RANBP9", "PPARGC1A", "DDX25", "CRYAB"]) + gc = sfaira.versions.genomes.genomes.GenomeContainer(release="Homo_sapiens.GRCh38.102") + gc.set(symbols=["VTA1", "MLXIPL", "BAZ1B", "RANBP9", "PPARGC1A", "DDX25", "CRYAB"]) _store.genome_container = gc del kwargs["var_subset"] _gen, _ = _store.iterator(**kwargs) @@ -81,14 +81,14 @@ def get_idx_dataset_start(_store, k_target): # Define data objects to be comparable: store = sfaira.data.load_store(cache_path=path_store_dao, store_format="dao") -store.subset(attr_key="organism", values="human") -store = store.stores["human"] +store.subset(attr_key="organism", values="homosapiens") +store = store.stores["homosapiens"] k_datasets_dao = list(store.indices.keys()) # Sort by size: k_datasets_dao = np.asarray(k_datasets_dao)[np.argsort([len(v) for v in store.indices.values()])].tolist() store = sfaira.data.load_store(cache_path=path_store_h5ad, store_format="h5ad") -store.subset(attr_key="organism", values="human") -store = store.stores["human"] +store.subset(attr_key="organism", values="homosapiens") +store = store.stores["homosapiens"] k_datasets_h5ad = list(store.indices.keys()) # Only retain intersection of data sets while keeping order. 
k_datasets = [x for x in k_datasets_dao if x in k_datasets_h5ad] @@ -121,8 +121,8 @@ def get_idx_dataset_start(_store, k_target): time_measurements["load_random_from_one_dataset_todense_varsubet"][store_type_i] = {} time_measurements["load_random_from_many_datasets_todense_varsubet"][store_type_i] = {} store = sfaira.data.load_store(cache_path=path_store, store_format=store_type_i) - store.subset(attr_key="organism", values="human") - store = store.stores["human"] + store.subset(attr_key="organism", values="homosapiens") + store = store.stores["homosapiens"] idx_dataset_start = get_idx_dataset_start(_store=store, k_target=k_datasets) idx_dataset_end = [i + len(store.indices[x]) for i, x in zip(idx_dataset_start, k_datasets)] for bs in BATCH_SIZES: diff --git a/sfaira/data/utils_scripts/write_store.py b/sfaira/data/utils_scripts/write_store.py index 42cd7f90b..a8c0d0dcf 100644 --- a/sfaira/data/utils_scripts/write_store.py +++ b/sfaira/data/utils_scripts/write_store.py @@ -42,7 +42,7 @@ ) ds.streamline_features( remove_gene_version=True, - match_to_reference={"human": "Homo_sapiens.GRCh38.102", "mouse": "Mus_musculus.GRCm38.102"}, + match_to_release={"Homo sapiens": "104", "Mus musculus": "104"}, subset_genes_to_type="protein_coding" ) ds.streamline_metadata(schema="sfaira", clean_obs=True, clean_var=True, clean_uns=True, clean_obs_names=True) diff --git a/sfaira/estimators/keras.py b/sfaira/estimators/keras.py index 2aa3d4154..004f30507 100644 --- a/sfaira/estimators/keras.py +++ b/sfaira/estimators/keras.py @@ -68,14 +68,14 @@ def split_idx(data: DistributedStoreSingleFeatureSpace, test_split, val_split): print(f"Found {len(idx_test)} out of {data.n_obs} cells that correspond to test data set") assert len(idx_test) < data.n_obs, f"test set covers full data set, apply a more restrictive test " \ f"data definiton ({len(idx_test)}, {data.n_obs})" - idx_train_eval = np.array([x for x in all_idx if x not in idx_test]) + idx_train_eval = all_idx[~np.isin(all_idx, idx_test)] np.random.seed(1) idx_eval = np.sort(np.random.choice( a=idx_train_eval, size=round(len(idx_train_eval) * val_split), replace=False )) - idx_train = np.sort([x for x in idx_train_eval if x not in idx_eval]) + idx_train = np.sort(idx_train_eval[~np.isin(idx_train_eval, idx_eval)]) # Check that none of the train, test, eval partitions are empty if not len(idx_test): @@ -213,7 +213,7 @@ def model_type(self): @property def organism(self): - return {"homo_sapiens": "human", "mus_musculus": "mouse"}[self.topology_container.organism] + return self.topology_container.organism def load_pretrained_weights(self): """ diff --git a/sfaira/train/summaries.py b/sfaira/train/summaries.py index 40c5147b7..ccf69aee5 100644 --- a/sfaira/train/summaries.py +++ b/sfaira/train/summaries.py @@ -1445,7 +1445,7 @@ def get_gradients_by_celltype( if data_organ is not None: u.subset("organ", data_organ) u.load(allow_caching=False) - u.streamline_features(match_to_reference=genome, subset_genes_to_type=gene_type) + u.streamline_features(match_to_release=genome, subset_genes_to_type=gene_type) u.streamline_metadata() adata = u.adata else: diff --git a/sfaira/train/train_model.py b/sfaira/train/train_model.py index f0c791055..f5bde44e0 100644 --- a/sfaira/train/train_model.py +++ b/sfaira/train/train_model.py @@ -128,7 +128,8 @@ def init_estim( data=self.data, model_dir=self.model_dir, model_id=self.zoo.model_id, - model_topology=self.zoo.topology_container + model_topology=self.zoo.topology_container, + cache_path=self.model_dir, ) 
self.estimator.init_model(override_hyperpar=override_hyperpar) print(f"TRAINER: initialised model with {self.estimator.topology_container.n_var} features.") @@ -195,7 +196,8 @@ def init_estim( data=self.data, model_dir=self.model_dir, model_id=self.zoo.model_id, - model_topology=self.zoo.topology_container + model_topology=self.zoo.topology_container, + cache_path=self.model_dir, ) self.estimator.celltype_universe.load_target_universe(self.fn_target_universe) self.estimator.init_model(override_hyperpar=override_hyperpar) diff --git a/sfaira/ui/model_zoo.py b/sfaira/ui/model_zoo.py index a6c757db2..db117384f 100644 --- a/sfaira/ui/model_zoo.py +++ b/sfaira/ui/model_zoo.py @@ -171,7 +171,12 @@ def model_name(self): def model_organism(self): # TODO: this relies on theislab model_name formatting assert self.model_id is not None, "set model_id first" - return self.model_id.split('_')[1].split("-")[0] + return { + "homosapiens": "Homo sapiens", + "musmusculus": "Mus musculus", + "human": "Homo sapiens", # necessary for old sfaira model uploads + "mouse": "Mus musculus", # necessary for old sfaira model uploads + }[self.model_id.split('_')[1].split("-")[0]] @property def model_organ(self): diff --git a/sfaira/ui/user_interface.py b/sfaira/ui/user_interface.py index ef6edd950..d079154c5 100644 --- a/sfaira/ui/user_interface.py +++ b/sfaira/ui/user_interface.py @@ -8,7 +8,8 @@ import warnings import time -from sfaira.consts import AdataIdsSfaira, AdataIds, OCS, SFAIRA_REPO_URL +from sfaira import settings +from sfaira.consts import AdataIdsSfaira, AdataIds, OCS from sfaira.data import DatasetInteractive from sfaira.estimators import EstimatorKerasEmbedding, EstimatorKerasCelltype from sfaira.ui.model_zoo import ModelZoo @@ -61,7 +62,7 @@ def __init__( self.adata_ids = AdataIdsSfaira() if sfaira_repo: # check if public sfaira repository should be accessed - self.model_lookuptable = self._load_lookuptable(SFAIRA_REPO_URL) + self.model_lookuptable = self._load_lookuptable(settings.sfaira_repo_url) if custom_repo: if isinstance(custom_repo, str): @@ -341,7 +342,6 @@ def load_data( gene_symbol_col: Union[str, None] = None, gene_ens_col: Union[str, None] = None, obs_key_celltypes: Union[str, None] = None, - class_maps: dict = {}, ): """ Loads the provided AnnData object into sfaira. @@ -355,7 +355,6 @@ def load_data( :param gene_symbol_col: Var column name (or 'index') which contains gene symbols :param gene_ens_col: ar column name (or 'index') which contains ensembl ids :param obs_key_celltypes: .obs column name which contains cell type labels. - :param class_maps: Cell type class maps. 
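+
+ A minimal usage sketch (illustrative only; `adata` and the column names are placeholders, and the
+ organism/organ argument names follow the method body):
+
+ ui.load_data(
+ adata,
+ organism="Homo sapiens",
+ organ="lung",
+ gene_ens_col="index",
+ obs_key_celltypes="free_annotation",
+ )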
""" if self.zoo_embedding.model_organism is not None and self.zoo_celltype.model_organism is not None: assert self.zoo_embedding.model_organism == self.zoo_celltype.model_organism, \ @@ -382,16 +381,15 @@ def load_data( self.data = DatasetInteractive( data=data, - organism=organism, - organ=organ, gene_symbol_col=gene_symbol_col, gene_ens_col=gene_ens_col, - obs_key_celltypes=obs_key_celltypes, - class_maps=class_maps, ) + self.data.organism = organism + self.data.organ = organ + self.data.cell_type_obs_key = obs_key_celltypes # Align to correct featurespace self.data.streamline_features( - match_to_reference=self.zoo_embedding.topology_container.gc.assembly, + match_to_release=self.zoo_embedding.topology_container.gc.release, subset_genes_to_type=list(set(self.zoo_embedding.topology_container.gc.biotype)) ) # Transfer required metadata from the Dataset instance to the adata object @@ -433,14 +431,15 @@ def load_model_embedding(self): :return: Model ID loaded. """ assert self.zoo_embedding.model_id is not None, "choose embedding model first" - if self.zoo_celltype.topology_container.gc.assembly is not None: - assert self.zoo_embedding.topology_container.gc.assembly == \ - self.zoo_celltype.topology_container.gc.assembly, f"genome assemblies defined in the topology " \ - f"containers if the embedding and the celltype " \ - f"prediction model are not equivalent " \ - f"({self.zoo_embedding.topology_container.gc.assembly} " \ - f"and {self.zoo_celltype.topology_container.gc.assembly} " \ - f"respectively, aborting.)" + if self.zoo_celltype.topology_container.gc.release is not None: + assert self.zoo_embedding.topology_container.gc.release == \ + self.zoo_celltype.topology_container.gc.release, \ + "genome assemblies defined in the topology " \ + "containers if the embedding and the celltype " \ + "prediction model are not equivalent " \ + f"({self.zoo_embedding.topology_container.gc.release} " \ + f"and {self.zoo_celltype.topology_container.gc.release} " \ + f"respectively, aborting.)" model_weights_file = self.model_lookuptable["model_file_path"].loc[self.model_lookuptable["model_id"] == self.zoo_embedding.model_id].iloc[0] md5 = self.model_lookuptable["md5"].loc[self.model_lookuptable["model_id"] == @@ -470,14 +469,15 @@ def load_model_celltype(self): :return: Model ID loaded. 
""" assert self.zoo_celltype.model_id is not None, "choose cell type model first" - if self.zoo_embedding.topology_container.gc.assembly is not None: - assert self.zoo_embedding.topology_container.gc.assembly == \ - self.zoo_celltype.topology_container.gc.assembly, f"genome assemblies defined in the topology " \ - f"containers if the embedding and the celltype " \ - f"prediction model are not equivalent " \ - f"({self.zoo_embedding.topology_container.gc.assembly} " \ - f"and {self.zoo_celltype.topology_container.gc.assembly} " \ - f"respectively, aborting.)" + if self.zoo_embedding.topology_container.gc.release is not None: + assert self.zoo_embedding.topology_container.gc.release == \ + self.zoo_celltype.topology_container.gc.release, \ + "genome assemblies defined in the topology " \ + "containers if the embedding and the celltype " \ + "prediction model are not equivalent " \ + f"({self.zoo_embedding.topology_container.gc.release} " \ + f"and {self.zoo_celltype.topology_container.gc.release} " \ + f"respectively, aborting.)" model_weights_file = self.model_lookuptable["model_file_path"].loc[self.model_lookuptable["model_id"] == self.zoo_celltype.model_id].iloc[0] md5 = self.model_lookuptable["md5"].loc[self.model_lookuptable["model_id"] == diff --git a/sfaira/unit_tests/data_for_tests/loaders/__init__.py b/sfaira/unit_tests/data_for_tests/loaders/__init__.py index f2096a71b..1cbe9e514 100644 --- a/sfaira/unit_tests/data_for_tests/loaders/__init__.py +++ b/sfaira/unit_tests/data_for_tests/loaders/__init__.py @@ -1,3 +1,3 @@ -from .consts import ASSEMBLY_HUMAN, ASSEMBLY_MOUSE +from .consts import ASSEMBLY_HUMAN, ASSEMBLY_MOUSE, RELEASE_HUMAN, RELEASE_MOUSE from .loaders import DatasetSuperGroupMock -from .utils import prepare_dsg, prepare_store +from .utils import PrepareData diff --git a/sfaira/unit_tests/data_for_tests/loaders/consts.py b/sfaira/unit_tests/data_for_tests/loaders/consts.py index ef6cae2a5..47d215d61 100644 --- a/sfaira/unit_tests/data_for_tests/loaders/consts.py +++ b/sfaira/unit_tests/data_for_tests/loaders/consts.py @@ -1,5 +1,7 @@ -ASSEMBLY_HUMAN = "Homo_sapiens.GRCh38.104" -ASSEMBLY_MOUSE = "Mus_musculus.GRCm39.104" +RELEASE_HUMAN = "104" +RELEASE_MOUSE = "104" +ASSEMBLY_HUMAN = f"Homo_sapiens.GRCh38.{RELEASE_HUMAN}" +ASSEMBLY_MOUSE = f"Mus_musculus.GRCm39.{RELEASE_MOUSE}" CELLTYPES = ["adventitial cell", "endothelial cell", "acinar cell", "pancreatic PP cell", "type B pancreatic cell"] CL_VERSION = "v2021-08-10" diff --git a/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock1/human_lung_2021_10xtechnology_mock1_001.yaml b/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock1/human_lung_2021_10xtechnology_mock1_001.yaml index 2fb03f533..8a22018d8 100644 --- a/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock1/human_lung_2021_10xtechnology_mock1_001.yaml +++ b/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock1/human_lung_2021_10xtechnology_mock1_001.yaml @@ -33,7 +33,7 @@ dataset_or_observation_wise: individual_obs_key: organ: "lung" organ_obs_key: - organism: "human" + organism: "Homo sapiens" organism_obs_key: sample_source: "primary_tissue" sample_source_obs_key: diff --git a/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock2/mouse_pancreas_2021_10xtechnology_mock2_001.yaml b/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock2/mouse_pancreas_2021_10xtechnology_mock2_001.yaml index 68c14f8d7..8f64a3b1f 100644 --- 
a/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock2/mouse_pancreas_2021_10xtechnology_mock2_001.yaml +++ b/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock2/mouse_pancreas_2021_10xtechnology_mock2_001.yaml @@ -33,7 +33,7 @@ dataset_or_observation_wise: individual_obs_key: organ: "pancreas" organ_obs_key: - organism: "mouse" + organism: "Mus musculus" organism_obs_key: sample_source: "primary_tissue" sample_source_obs_key: diff --git a/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock3/human_lung_2021_10xtechnology_mock3_001.yaml b/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock3/human_lung_2021_10xtechnology_mock3_001.yaml index f67c394aa..50ba7c960 100644 --- a/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock3/human_lung_2021_10xtechnology_mock3_001.yaml +++ b/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock3/human_lung_2021_10xtechnology_mock3_001.yaml @@ -33,7 +33,7 @@ dataset_or_observation_wise: individual_obs_key: organ: "lung" organ_obs_key: - organism: "human" + organism: "Homo sapiens" organism_obs_key: sample_source: "primary_tissue" sample_source_obs_key: diff --git a/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock4/human_lung_2021_10xtechnology_mock4_001.yaml b/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock4/human_lung_2021_10xtechnology_mock4_001.yaml index 83c928c8a..d55156f8c 100644 --- a/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock4/human_lung_2021_10xtechnology_mock4_001.yaml +++ b/sfaira/unit_tests/data_for_tests/loaders/loaders/dno_doi_mock4/human_lung_2021_10xtechnology_mock4_001.yaml @@ -33,7 +33,7 @@ dataset_or_observation_wise: individual_obs_key: organ: "lung" organ_obs_key: - organism: "human" + organism: "Homo sapiens" organism_obs_key: sample_source: "primary_tissue" sample_source_obs_key: diff --git a/sfaira/unit_tests/data_for_tests/loaders/utils.py b/sfaira/unit_tests/data_for_tests/loaders/utils.py index 75d0398c3..ac87212f1 100644 --- a/sfaira/unit_tests/data_for_tests/loaders/utils.py +++ b/sfaira/unit_tests/data_for_tests/loaders/utils.py @@ -4,22 +4,25 @@ import os import pandas as pd import pathlib + +from sfaira.data.store.multi_store import DistributedStoresAnndata from sfaira.versions.genomes import GenomeContainer from sfaira.unit_tests.directories import DIR_DATA_LOADERS_CACHE, DIR_DATA_LOADERS_STORE_DAO, \ DIR_DATA_LOADERS_STORE_H5AD, save_delete -from .consts import ASSEMBLY_HUMAN, ASSEMBLY_MOUSE +from .consts import RELEASE_HUMAN, RELEASE_MOUSE from .loaders import DatasetSuperGroupMock -MATCH_TO_REFERENCE = {"human": ASSEMBLY_HUMAN, "mouse": ASSEMBLY_MOUSE} +MATCH_TO_RELEASE = {"Homo sapiens": RELEASE_HUMAN, + "Mus musculus": RELEASE_MOUSE} def _create_adata(celltypes, ncells, ngenes, assembly) -> anndata.AnnData: """ Usesd by mock data loaders. 
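+
+ For example (illustrative), assembly "Mus_musculus.GRCm39.104" is split below into
+ organism "Mus musculus" and release "104" for the GenomeContainer call.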
""" - gc = GenomeContainer(assembly=assembly) - gc.subset(biotype="protein_coding") + gc = GenomeContainer(organism=" ".join(assembly.split(".")[0].split("_")), release=assembly.split(".")[-1]) + gc.set(biotype="protein_coding") genes = gc.ensembl[:ngenes] x = scipy.sparse.csc_matrix(np.random.randint(low=0, high=100, size=(ncells, ngenes))) var = pd.DataFrame(index=genes) @@ -32,62 +35,70 @@ def _create_adata(celltypes, ncells, ngenes, assembly) -> anndata.AnnData: return adata -def _load_script(dsg, rewrite: bool, match_to_reference): +def _load_script(dsg, rewrite: bool, match_to_release): dsg.load(allow_caching=True, load_raw=rewrite) - dsg.streamline_features(remove_gene_version=True, match_to_reference=match_to_reference) + dsg.streamline_features(remove_gene_version=True, match_to_release=match_to_release) dsg.streamline_metadata(schema="sfaira", clean_obs=True, clean_var=True, clean_uns=True, clean_obs_names=True) return dsg -def prepare_dsg(rewrite: bool = False, load: bool = True, match_to_reference=None) -> DatasetSuperGroupMock: - """ - Prepares data set super group of mock data and returns instance. +class PrepareData: + CLS_DSG = DatasetSuperGroupMock - Use this do testing involving a data set group. - """ - # Make sure cache exists: - if not os.path.exists(DIR_DATA_LOADERS_CACHE): - pathlib.Path(DIR_DATA_LOADERS_CACHE).mkdir(parents=True, exist_ok=True) - dsg = DatasetSuperGroupMock() - if match_to_reference is None: - match_to_reference = MATCH_TO_REFERENCE - if load: - dsg = _load_script(dsg=dsg, rewrite=rewrite, match_to_reference=match_to_reference) - return dsg + def prepare_dsg(self, rewrite: bool = False, load: bool = True, match_to_release=None): + """ + Prepares data set super group of mock data and returns instance. + Use this do testing involving a data set group. + """ + # Make sure cache exists: + if not os.path.exists(DIR_DATA_LOADERS_CACHE): + pathlib.Path(DIR_DATA_LOADERS_CACHE).mkdir(parents=True, exist_ok=True) + dsg = self.CLS_DSG() + if match_to_release is None: + match_to_release = MATCH_TO_RELEASE + if load: + dsg = _load_script(dsg=dsg, rewrite=rewrite, match_to_release=match_to_release) + return dsg -def prepare_store(store_format: str, rewrite: bool = False, rewrite_store: bool = False, - match_to_reference=None) -> str: - """ - Prepares mock data store and returns path to store. + def prepare_store_anndata(self, match_to_reference=None) -> DistributedStoresAnndata: + dsg = self.prepare_dsg(load=True, match_to_release=match_to_reference) + store = DistributedStoresAnndata(adatas=dsg.adata_ls) + return store - Use this do testing involving a data set store. - """ - dir_store_formatted = { - "dao": DIR_DATA_LOADERS_STORE_DAO, - "h5ad": DIR_DATA_LOADERS_STORE_H5AD, - }[store_format] - if not os.path.exists(dir_store_formatted): - pathlib.Path(dir_store_formatted).mkdir(parents=True, exist_ok=True) - dsg = prepare_dsg(rewrite=rewrite, load=False, match_to_reference=match_to_reference) - for k, ds in dsg.datasets.items(): - if store_format == "dao": - compression_kwargs = {"compressor": "default", "overwrite": True, "order": "C"} - else: - compression_kwargs = {} - if store_format == "dao": - anticipated_fn = os.path.join(dir_store_formatted, ds.doi_cleaned_id) - elif store_format == "h5ad": - anticipated_fn = os.path.join(dir_store_formatted, ds.doi_cleaned_id + ".h5ad") - else: - assert False - if rewrite_store and os.path.exists(anticipated_fn): - # Can't write if h5ad already exists. - # Delete store to writing if forced. 
- save_delete(anticipated_fn)
- # Only rewrite if necessary
- if rewrite_store or not os.path.exists(anticipated_fn):
- ds = _load_script(dsg=ds, rewrite=rewrite, match_to_reference=MATCH_TO_REFERENCE)
- ds.write_distributed_store(dir_cache=dir_store_formatted, store_format=store_format, dense=True,
- chunks=128, compression_kwargs=compression_kwargs)
- return dir_store_formatted
+ def prepare_store(self, store_format: str, rewrite: bool = False, rewrite_store: bool = False,
+ match_to_reference=None) -> str:
+ """
+ Prepares mock data store and returns path to store.
+
+ Use this to do testing involving a data set store.
+ """
+ dir_store_formatted = {
+ "dao": DIR_DATA_LOADERS_STORE_DAO,
+ "h5ad": DIR_DATA_LOADERS_STORE_H5AD,
+ }[store_format]
+ if not os.path.exists(dir_store_formatted):
+ pathlib.Path(dir_store_formatted).mkdir(parents=True, exist_ok=True)
+ dsg = self.prepare_dsg(rewrite=rewrite, load=False, match_to_release=match_to_reference)
+ for k, ds in dsg.datasets.items():
+ print(k)
+ if store_format == "dao":
+ compression_kwargs = {"compressor": "default", "overwrite": True, "order": "C"}
+ else:
+ compression_kwargs = {}
+ if store_format == "dao":
+ anticipated_fn = os.path.join(dir_store_formatted, ds.doi_cleaned_id)
+ elif store_format == "h5ad":
+ anticipated_fn = os.path.join(dir_store_formatted, ds.doi_cleaned_id + ".h5ad")
+ else:
+ assert False
+ if rewrite_store and os.path.exists(anticipated_fn):
+ # Can't write if h5ad already exists.
+ # Delete store before writing if forced.
+ save_delete(anticipated_fn)
+ # Only rewrite if necessary.
+ if rewrite_store or not os.path.exists(anticipated_fn):
+ ds = _load_script(dsg=ds, rewrite=rewrite, match_to_release=MATCH_TO_RELEASE)
+ ds.write_distributed_store(dir_cache=dir_store_formatted, store_format=store_format, dense=True,
+ chunks=128, compression_kwargs=compression_kwargs)
+ return dir_store_formatted
diff --git a/sfaira/unit_tests/tests_by_submodule/data/databases/test_database_intput.py b/sfaira/unit_tests/tests_by_submodule/data/databases/test_database_intput.py
index c9d44c95f..09b356456 100644
--- a/sfaira/unit_tests/tests_by_submodule/data/databases/test_database_intput.py
+++ b/sfaira/unit_tests/tests_by_submodule/data/databases/test_database_intput.py
@@ -1,56 +1,59 @@
 import os

 import pytest
-from typing import List

 from sfaira.consts import AdataIdsSfaira
 from sfaira.data.store.io_dao import read_dao
 from sfaira.unit_tests.data_for_tests.databases.utils import prepare_dsg_database
 from sfaira.unit_tests.data_for_tests.databases.consts import CELLXGENE_DATASET_ID
-from sfaira.unit_tests.data_for_tests.loaders import ASSEMBLY_HUMAN, ASSEMBLY_MOUSE
+from sfaira.unit_tests.data_for_tests.loaders import RELEASE_HUMAN, RELEASE_MOUSE
 from sfaira.unit_tests.directories import DIR_DATABASE_STORE_DAO

+MATCH_TO_RELEASE = {"Homo sapiens": RELEASE_HUMAN, "Mus musculus": RELEASE_MOUSE}

-@pytest.mark.parametrize("database", ["cellxgene", ])
-@pytest.mark.parametrize("subset_args", [["id", CELLXGENE_DATASET_ID], ])
-@pytest.mark.parametrize("match_to_reference", [{"human": ASSEMBLY_HUMAN, "mouse": ASSEMBLY_MOUSE}, ])
+
+@pytest.mark.parametrize("database", [
+ ("cellxgene", ["id", CELLXGENE_DATASET_ID]),
+])
 @pytest.mark.parametrize("subset_genes_to_type", [None, "protein_coding", ])
-def test_streamline_features(database: str, subset_args: List[str], match_to_reference: dict,
- subset_genes_to_type: str):
+def test_streamline_features(database: str, subset_genes_to_type: str):
+ database, subset_args = database
 dsg = 
prepare_dsg_database(database=database) dsg.subset(key=subset_args[0], values=subset_args[1]) dsg.load() - dsg.streamline_features(match_to_reference=match_to_reference, subset_genes_to_type=subset_genes_to_type) + dsg.streamline_features(match_to_release=MATCH_TO_RELEASE, subset_genes_to_type=subset_genes_to_type) -@pytest.mark.parametrize("database", ["cellxgene", ]) -@pytest.mark.parametrize("subset_args", [["id", CELLXGENE_DATASET_ID], ]) +@pytest.mark.parametrize("database", [ + ("cellxgene", ["id", CELLXGENE_DATASET_ID]), +]) @pytest.mark.parametrize("format", ["sfaira", ]) -def test_streamline_metadata(database: str, subset_args: List[str], format: str): +def test_streamline_metadata(database: str, format: str): + database, subset_args = database dsg = prepare_dsg_database(database=database) dsg.subset(key=subset_args[0], values=subset_args[1]) dsg.load() - dsg.streamline_features(match_to_reference={"human": ASSEMBLY_HUMAN, "mouse": ASSEMBLY_MOUSE}, - subset_genes_to_type="protein_coding") + dsg.streamline_features(match_to_release=MATCH_TO_RELEASE, subset_genes_to_type="protein_coding") dsg.streamline_metadata(schema=format) adata = dsg.datasets[subset_args[1]].adata ids = AdataIdsSfaira() assert "CL:0000128" in adata.obs[ids.cell_type + ids.onto_id_suffix].values assert "oligodendrocyte" in adata.obs[ids.cell_type].values - assert "HsapDv:0000087" in adata.obs[ids.development_stage + ids.onto_id_suffix].values - assert "human adult stage" in adata.obs[ids.development_stage].values - assert "UBERON:0000956" in adata.obs[ids.organ + ids.onto_id_suffix].values - assert "cerebral cortex" in adata.obs[ids.organ].values + assert "MmusDv:0000061" in adata.obs[ids.development_stage + ids.onto_id_suffix].values + assert "early adult stage" in adata.obs[ids.development_stage].values + assert "UBERON:0002436" in adata.obs[ids.organ + ids.onto_id_suffix].values + assert "primary visual cortex" in adata.obs[ids.organ].values @pytest.mark.parametrize("store", ["dao", ]) -@pytest.mark.parametrize("database", ["cellxgene", ]) -@pytest.mark.parametrize("subset_args", [["id", CELLXGENE_DATASET_ID], ]) -def test_output_to_store(store: str, database: str, subset_args: List[str]): +@pytest.mark.parametrize("database", [ + ("cellxgene", ["id", CELLXGENE_DATASET_ID]), +]) +def test_output_to_store(store: str, database: str): + database, subset_args = database dsg = prepare_dsg_database(database=database) dsg.subset(key=subset_args[0], values=subset_args[1]) dsg.load() - dsg.streamline_features(match_to_reference={"human": ASSEMBLY_HUMAN, "mouse": ASSEMBLY_MOUSE}, - subset_genes_to_type="protein_coding") + dsg.streamline_features(match_to_release=MATCH_TO_RELEASE, subset_genes_to_type="protein_coding") dsg.streamline_metadata(schema="sfaira", clean_obs=True, clean_uns=True, clean_var=True, clean_obs_names=True, keep_id_obs=True, keep_orginal_obs=False, keep_symbol_obs=True) dsg.write_distributed_store(dir_cache=DIR_DATABASE_STORE_DAO, store_format=store, dense=True) @@ -59,7 +62,5 @@ def test_output_to_store(store: str, database: str, subset_args: List[str]): ids = AdataIdsSfaira() assert "CL:0000128" in adata.obs[ids.cell_type + ids.onto_id_suffix].values assert "oligodendrocyte" in adata.obs[ids.cell_type].values - assert "HsapDv:0000087" in adata.obs[ids.development_stage + ids.onto_id_suffix].values - assert "human adult stage" in adata.obs[ids.development_stage].values - assert "UBERON:0000956" in adata.obs[ids.organ + ids.onto_id_suffix].values - assert "cerebral cortex" in 
adata.obs[ids.organ].values + assert "MmusDv:0000061" in adata.obs[ids.development_stage + ids.onto_id_suffix].values + assert "early adult stage" in adata.obs[ids.development_stage].values diff --git a/sfaira/unit_tests/tests_by_submodule/data/databases/test_databases_basic.py b/sfaira/unit_tests/tests_by_submodule/data/databases/test_databases_basic.py index 503577c6d..3a5a39b10 100644 --- a/sfaira/unit_tests/tests_by_submodule/data/databases/test_databases_basic.py +++ b/sfaira/unit_tests/tests_by_submodule/data/databases/test_databases_basic.py @@ -9,14 +9,16 @@ # Execute this one first so that data sets are only downloaded once. Named test_a for this reason. -@pytest.mark.parametrize("database", ["cellxgene", ]) -@pytest.mark.parametrize("subset_args", [None, ["id", CELLXGENE_DATASET_ID], ]) -def test_a_dsgs_download(database: str, subset_args: List[str]): +@pytest.mark.parametrize("database", [ + ("cellxgene", ["id", CELLXGENE_DATASET_ID]), +]) +def test_a_dsgs_download(database: str): """ Tests if downloading of data base entries works. Warning, deletes entire database unit test cache. """ + database, subset_args = database if os.path.exists(DIR_DATA_DATABASES_CACHE): shutil.rmtree(DIR_DATA_DATABASES_CACHE) dsg = prepare_dsg_database(database=database, download=False) @@ -25,11 +27,14 @@ def test_a_dsgs_download(database: str, subset_args: List[str]): dsg.download() -@pytest.mark.parametrize("database", ["cellxgene", ]) -@pytest.mark.parametrize("subset_args", [["id", CELLXGENE_DATASET_ID], ["organism", "human"], ]) -def test_dsgs_subset(database: str, subset_args: List[str]): +@pytest.mark.parametrize("database", [ + ("cellxgene", ["id", CELLXGENE_DATASET_ID]), + ("cellxgene", ["organism", "Homo sapiens"]), +]) +def test_dsgs_subset(database: str): """ Tests if subsetting results only in datasets of the desired characteristics. """ + database, subset_args = database dsg = prepare_dsg_database(database=database) dsg.subset(key=subset_args[0], values=subset_args[1]) diff --git a/sfaira/unit_tests/tests_by_submodule/data/dataset/test_dataset.py b/sfaira/unit_tests/tests_by_submodule/data/dataset/test_dataset.py index 4c3324635..1689afd0a 100644 --- a/sfaira/unit_tests/tests_by_submodule/data/dataset/test_dataset.py +++ b/sfaira/unit_tests/tests_by_submodule/data/dataset/test_dataset.py @@ -2,10 +2,12 @@ import os import pytest -from sfaira.data import DatasetSuperGroup +import sfaira.versions.genomes +from sfaira.consts import AdataIdsSfaira +from sfaira.data import DatasetSuperGroup, DatasetInteractive from sfaira.data import Universe -from sfaira.unit_tests.data_for_tests.loaders import ASSEMBLY_HUMAN, prepare_dsg +from sfaira.unit_tests.data_for_tests.loaders import RELEASE_HUMAN, PrepareData from sfaira.unit_tests.directories import DIR_TEMP, DIR_DATA_LOADERS_CACHE @@ -33,39 +35,39 @@ def test_dsgs_subset_dataset_wise(organ: str): """ Tests if subsetting results only in datasets of the desired characteristics. 
""" - ds = prepare_dsg(load=False) - ds.subset(key="organism", values=["human"]) + ds = PrepareData().prepare_dsg(load=False) + ds.subset(key="organism", values=["Homo sapiens"]) ds.subset(key="organ", values=[organ]) ds.load() for x in ds.dataset_groups: for k, v in x.datasets.items(): - assert v.organism == "human", v.organism + assert v.organism == "Homo sapiens", v.organism assert v.ontology_container_sfaira.organ.is_a(query=v.organ, reference=organ), v.organ def test_dsgs_config_write_load(): fn = os.path.join(DIR_TEMP, "config.csv") - ds = prepare_dsg(load=False) - ds.subset(key="organism", values=["human"]) + ds = PrepareData().prepare_dsg(load=False) + ds.subset(key="organism", values=["Homo sapiens"]) ds.subset(key="organ", values=["lung"]) ds.load() ds.write_config(fn=fn) - ds2 = prepare_dsg() + ds2 = PrepareData().prepare_dsg() ds2.load_config(fn=fn) assert np.all(ds.ids == ds2.ids) def test_dsgs_adata(): - ds = prepare_dsg(load=False) - ds.subset(key="organism", values=["human"]) + ds = PrepareData().prepare_dsg(load=False) + ds.subset(key="organism", values=["Homo sapiens"]) ds.subset(key="organ", values=["lung"]) ds.load() _ = ds.adata_ls def test_dsgs_load(): - ds = prepare_dsg(load=False) - ds.subset(key="organism", values=["human"]) + ds = PrepareData().prepare_dsg(load=False) + ds.subset(key="organism", values=["Homo sapiens"]) ds.subset(key="organ", values=["lung"]) ds.load() @@ -76,46 +78,73 @@ def test_dsgs_subset_cell_wise(celltype: str): Tests if sub-setting results only in datasets of the desired characteristics. """ organ = "lung" - ds = prepare_dsg(load=False) - ds.subset(key="organism", values=["human"]) + ds = PrepareData().prepare_dsg(load=False) + ds.subset(key="organism", values=["Homo sapiens"]) ds.subset(key="organ", values=[organ]) ds.load() ds.subset_cells(key="cell_type", values=celltype) for x in ds.dataset_groups: for k, v in x.datasets.items(): - assert v.organism == "human", v.id + assert v.organism == "Homo sapiens", v.id assert v.ontology_container_sfaira.organ.is_a(query=v.organ, reference=organ), v.organ for y in np.unique(v.adata.obs[v._adata_ids.cell_type].values): assert v.ontology_container_sfaira.cell_type.is_a(query=y, reference=celltype), y -@pytest.mark.parametrize("match_to_reference", [ASSEMBLY_HUMAN, {"human": ASSEMBLY_HUMAN}]) +@pytest.mark.parametrize("match_to_release", [RELEASE_HUMAN, {"Homo sapiens": RELEASE_HUMAN}]) @pytest.mark.parametrize("remove_gene_version", [False, True]) @pytest.mark.parametrize("subset_genes_to_type", [None, "protein_coding"]) -def test_dsgs_streamline_features(match_to_reference: str, remove_gene_version: bool, subset_genes_to_type: str): - ds = prepare_dsg(load=False) - ds.subset(key="organism", values=["human"]) +def test_dsgs_streamline_features(match_to_release: str, remove_gene_version: bool, subset_genes_to_type: str): + ds = PrepareData().prepare_dsg(load=False) + ds.subset(key="organism", values=["Homo sapiens"]) ds.subset(key="organ", values=["lung"]) ds.load() - ds.streamline_features(remove_gene_version=remove_gene_version, match_to_reference=match_to_reference, + ds.streamline_features(remove_gene_version=remove_gene_version, match_to_release=match_to_release, subset_genes_to_type=subset_genes_to_type) - gc = ds.get_gc(match_to_reference["human"] if isinstance(match_to_reference, dict) else match_to_reference) - gc.subset(biotype=subset_genes_to_type) + gc = sfaira.versions.genomes.GenomeContainer( + organism="Homo Sapiens", + release=match_to_release["Homo sapiens"] if 
isinstance(match_to_release, dict) else match_to_release) + gc.set(biotype=subset_genes_to_type) for x in ds.datasets.values(): assert x.adata.var["gene_symbol"].tolist() == gc.symbols def test_dsg_load(): - ds = prepare_dsg(load=False) - ds.subset(key="organism", values=["human"]) + ds = PrepareData().prepare_dsg(load=False) + ds.subset(key="organism", values=["Homo sapiens"]) ds.subset(key="organ", values=["lung"]) ds = DatasetSuperGroup(dataset_groups=[ds]) ds.load() def test_dsg_adata(): - ds = prepare_dsg(load=False) - ds.subset(key="organism", values=["human"]) + ds = PrepareData().prepare_dsg(load=False) + ds.subset(key="organism", values=["Homo sapiens"]) ds.subset(key="organ", values=["lung"]) ds = DatasetSuperGroup(dataset_groups=[ds]) _ = ds.adata + + +def test_ds_interactive(): + adata_ids = AdataIdsSfaira() + # Prepare object: + ds = PrepareData().prepare_dsg(load=False) + ds.subset(key="doi_journal", values=["no_doi_mock1"]) + ds.load() + adata = ds.adata_ls[0] + di = DatasetInteractive(data=adata, gene_ens_col="index") + di.organism = "Homo sapiens" + di.organ = "lung" + di.cell_type_obs_key = "free_annotation" + # Test that adata is accessible in non-streamlined object: + _ = di.adata + # Test streamlining: + di.streamline_features(match_to_release=RELEASE_HUMAN) + di.streamline_metadata(schema="sfaira") + # Test entries in streamlined object: + adata_di = di.adata + assert adata_ids.cell_type in adata_di.obs.columns + assert adata_ids.cell_type + adata_ids.onto_id_suffix in adata_di.obs.columns + assert adata_ids.organ in adata_di.uns.keys() + assert np.all(adata_di.obs[adata_ids.cell_type].values == adata.obs["free_annotation"].values) + assert adata_di.uns[adata_ids.organ] == "lung" diff --git a/sfaira/unit_tests/tests_by_submodule/data/dataset/test_meta_data_streamlining.py b/sfaira/unit_tests/tests_by_submodule/data/dataset/test_meta_data_streamlining.py index 6e0c396ba..b3d84c7aa 100644 --- a/sfaira/unit_tests/tests_by_submodule/data/dataset/test_meta_data_streamlining.py +++ b/sfaira/unit_tests/tests_by_submodule/data/dataset/test_meta_data_streamlining.py @@ -1,11 +1,9 @@ import os import anndata -from cellxgene_schema import validate -import numpy as np import pytest -from sfaira.unit_tests.data_for_tests.loaders import ASSEMBLY_HUMAN, ASSEMBLY_MOUSE, prepare_dsg +from sfaira.unit_tests.data_for_tests.loaders import RELEASE_HUMAN, PrepareData from sfaira.unit_tests.directories import DIR_TEMP @@ -20,52 +18,53 @@ def test_dsgs_streamline_metadata(out_format: str, clean_obs: bool, clean_var: bool, clean_uns: bool, clean_obs_names: bool, keep_id_obs: bool, keep_orginal_obs: bool, keep_symbol_obs: bool): - ds = prepare_dsg(load=False) - ds.subset(key="organism", values=["human"]) + ds = PrepareData().prepare_dsg(load=False) + ds.subset(key="organism", values=["Homo sapiens"]) ds.subset(key="organ", values=["lung"]) if out_format == "cellxgene": # Other data data sets do not have complete enough annotation ds.subset(key="doi_journal", values=["no_doi_mock1", "no_doi_mock3"]) ds.load() - ds.streamline_features(remove_gene_version=False, match_to_reference=ASSEMBLY_HUMAN, + ds.streamline_features(remove_gene_version=False, match_to_release=RELEASE_HUMAN, subset_genes_to_type=None) ds.streamline_metadata(schema=out_format, clean_obs=clean_obs, clean_var=clean_var, clean_uns=clean_uns, clean_obs_names=clean_obs_names, keep_id_obs=keep_id_obs, keep_orginal_obs=keep_orginal_obs, keep_symbol_obs=keep_symbol_obs) -class ValidatorInMemory(validate.Validator): 
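+# The cellxgene-schema import is deferred into the test body below so that the package becomes an
+# optional test dependency. A sketch of an alternative guard (assumes pytest as the runner):
+#
+#     validate = pytest.importorskip("cellxgene_schema.validate")
+#
+# which would skip the test rather than fail it when the package is not installed.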
+@pytest.mark.parametrize("schema_version", ["2_0_0"]) +@pytest.mark.parametrize("organism", ["Homo sapiens", "Mus musculus"]) +def test_cellxgene_export(schema_version: str, organism: str): """ - Helper class to validate adata in memory and raise errors as in error stream rather than outstream. - The switch in log stream allows this test to be used as a unit test. + This test can be extended by future versions. """ + from cellxgene_schema import validate - def validate_adata_inmemory(self, adata: anndata.AnnData): - self.errors = [] - self.adata = adata - self._set_schema_def() - if not self.errors: - self._deep_check() - if self.warnings: - self.warnings = ["WARNING: " + i for i in self.warnings] - if self.errors: - self.errors = ["ERROR: " + i for i in self.errors] - if self.warnings or self.errors: - print(self.warnings[:20]) - print(self.errors[:20]) - assert False + class ValidatorInMemory(validate.Validator): + """ + Helper class to validate adata in memory and raise errors as in error stream rather than outstream. + The switch in log stream allows this test to be used as a unit test. + """ -@pytest.mark.parametrize("schema_version", ["2_0_0"]) -@pytest.mark.parametrize("organism", ["human", "mouse"]) -def test_cellxgene_export(schema_version: str, organism: str): - """ + def validate_adata_inmemory(self, adata: anndata.AnnData): + self.errors = [] + self.adata = adata + self._set_schema_def() + if not self.errors: + self._deep_check() + if self.warnings: + self.warnings = ["WARNING: " + i for i in self.warnings] + if self.errors: + self.errors = ["ERROR: " + i for i in self.errors] + if self.warnings or self.errors: + print(self.warnings[:20]) + print(self.errors[:20]) + assert False - This test can be extended by future versions. - """ - ds = prepare_dsg(load=False) - if organism == "human": + ds = PrepareData().prepare_dsg(load=False) + if organism == "Homo sapiens": ds.subset(key="doi_journal", values=["no_doi_mock1"]) else: ds.subset(key="doi_journal", values=["no_doi_mock2"]) diff --git a/sfaira/unit_tests/tests_by_submodule/data/test_data_utils.py b/sfaira/unit_tests/tests_by_submodule/data/test_data_utils.py index 5e766fa01..3e6b5732d 100644 --- a/sfaira/unit_tests/tests_by_submodule/data/test_data_utils.py +++ b/sfaira/unit_tests/tests_by_submodule/data/test_data_utils.py @@ -22,7 +22,7 @@ def test_map_celltype_to_ontology( perfectly_matched_query = ["type B pancreatic cell" == x for x in trial_cell_type_labels] matches = map_celltype_to_ontology( queries=trial_cell_type_labels, - organism="human", + organism="Homo sapiens", include_synonyms=True, anatomical_constraint=anatomical_constraint, choices_for_perfect_match=choices_for_perfect_match, diff --git a/sfaira/unit_tests/tests_by_submodule/data/test_store.py b/sfaira/unit_tests/tests_by_submodule/data/test_store.py index e95b410d6..c04bcb3b7 100644 --- a/sfaira/unit_tests/tests_by_submodule/data/test_store.py +++ b/sfaira/unit_tests/tests_by_submodule/data/test_store.py @@ -7,29 +7,33 @@ import scipy.sparse from typing import List +from sfaira.consts import AdataIdsSfaira from sfaira.data import load_store from sfaira.versions.genomes.genomes import GenomeContainer -from sfaira.unit_tests.data_for_tests.loaders import ASSEMBLY_MOUSE, prepare_dsg, prepare_store +from sfaira.unit_tests.data_for_tests.loaders import RELEASE_MOUSE, RELEASE_HUMAN, PrepareData def _get_single_store(store_format: str): - store_path = prepare_store(store_format=store_format) + store_path = PrepareData().prepare_store(store_format=store_format) 
stores = load_store(cache_path=store_path, store_format=store_format) - stores.subset(attr_key="organism", values=["mouse"]) - store = stores.stores["mouse"] + stores.subset(attr_key="organism", values=["Mus musculus"]) + store = stores.stores["Mus musculus"] return store -@pytest.mark.parametrize("store_format", ["h5ad", "dao"]) +@pytest.mark.parametrize("store_format", ["h5ad", "dao", "anndata"]) def test_fatal(store_format: str): """ - Test if basic methods abort. + Test if basic methods of stores abort. """ - store_path = prepare_store(store_format=store_format) - stores = load_store(cache_path=store_path, store_format=store_format) - stores.subset(attr_key="organism", values=["mouse"]) - store = stores.stores["mouse"] + if store_format == "anndata": + stores = PrepareData().prepare_store_anndata() + else: + store_path = PrepareData().prepare_store(store_format=store_format) + stores = load_store(cache_path=store_path, store_format=store_format) + stores.subset(attr_key="organism", values=["Mus musculus"]) + store = stores.stores["Mus musculus"] # Test both single and multi-store: for x in [store, stores]: _ = x.n_obs @@ -73,16 +77,35 @@ def test_adata_slice(store_format: str, as_sparse: bool): @pytest.mark.parametrize("store_format", ["h5ad", "dao"]) -def test_data(store_format: str): +def test_config(store_format: str): + """ + Test that data set config files can be set, written and recovered. + """ + store_path = PrepareData().prepare_store(store_format=store_format) + config_path = os.path.join(store_path, "config_lung") + store = load_store(cache_path=store_path, store_format=store_format) + store.subset(attr_key="organism", values=["Mus musculus"]) + store.subset(attr_key="assay_sc", values=["10x technology"]) + store.write_config(fn=config_path) + store2 = load_store(cache_path=store_path, store_format=store_format) + store2.load_config(fn=config_path + ".pickle") + assert np.all(store.indices.keys() == store2.indices.keys()) + assert np.all([np.all(store.indices[k] == store2.indices[k]) + for k in store.indices.keys()]) + + +@pytest.mark.parametrize("store_format", ["h5ad", "dao"]) +def test_store_data(store_format: str): """ Test if the data exposed by the store are the same as in the original Dataset instance after streamlining. """ + data = PrepareData() # Run standard streamlining workflow on dsg and compare to object relayed via store. # Prepare dsg. - dsg = prepare_dsg(load=True) + dsg = data.prepare_dsg(load=True) # Prepare store. # Rewriting store to avoid mismatch of randomly generated data in cache and store. - store_path = prepare_store(store_format=store_format, rewrite=False, rewrite_store=True) + store_path = data.prepare_store(store_format=store_format, rewrite=False, rewrite_store=True) store = load_store(cache_path=store_path, store_format=store_format) store.subset(attr_key="doi_journal", values=["no_doi_mock1"]) dataset_id = store.adata_by_key[list(store.indices.keys())[0]].uns["id"] @@ -134,46 +157,29 @@ def test_data(store_format: str): assert np.all(v == uns_ds[k]) -@pytest.mark.parametrize("store_format", ["h5ad", "dao"]) -def test_config(store_format: str): - """ - Test that data set config files can be set, written and recovered. 
- """ - store_path = prepare_store(store_format=store_format) - config_path = os.path.join(store_path, "config_lung") - store = load_store(cache_path=store_path, store_format=store_format) - store.subset(attr_key="organism", values=["mouse"]) - store.subset(attr_key="assay_sc", values=["10x technology"]) - store.write_config(fn=config_path) - store2 = load_store(cache_path=store_path, store_format=store_format) - store2.load_config(fn=config_path + ".pickle") - assert np.all(store.indices.keys() == store2.indices.keys()) - assert np.all([np.all(store.indices[k] == store2.indices[k]) - for k in store.indices.keys()]) - - @pytest.mark.parametrize("store_format", ["h5ad", "dao"]) @pytest.mark.parametrize("idx", [np.arange(1, 10), np.concatenate([np.arange(30, 50), np.array([1, 4, 98])])]) @pytest.mark.parametrize("batch_size", [1, ]) @pytest.mark.parametrize("obs_keys", [["cell_type"]]) @pytest.mark.parametrize("randomized_batch_access", [True, False]) -def test_generator_shapes(store_format: str, idx, batch_size: int, obs_keys: List[str], randomized_batch_access: bool): +def test_generator_basic_data(store_format: str, idx, batch_size: int, obs_keys: List[str], + randomized_batch_access: bool): """ Test generators queries do not throw errors and that output shapes are correct. """ # Need to re-write because specific obs_keys are required: - store_path = prepare_store(store_format=store_format) + store_path = PrepareData().prepare_store(store_format=store_format) store = load_store(cache_path=store_path, store_format=store_format) - store.subset(attr_key="organism", values=["mouse"]) - gc = GenomeContainer(assembly=ASSEMBLY_MOUSE) - gc.subset(**{"biotype": "protein_coding"}) + store.subset(attr_key="organism", values=["Mus musculus"]) + gc = GenomeContainer(release=RELEASE_MOUSE, organism="Mus musculus") + gc.set(**{"biotype": "protein_coding"}) store.genome_container = gc def map_fn(x, obs): return (x, ), - g = store.generator(idx={"mouse": idx}, batch_size=batch_size, map_fn=map_fn, obs_keys=obs_keys, + g = store.generator(idx={"Mus musculus": idx}, batch_size=batch_size, map_fn=map_fn, obs_keys=obs_keys, randomized_batch_access=randomized_batch_access) g = g.iterator nobs = len(idx) if idx is not None else store.n_obs @@ -191,6 +197,50 @@ def map_fn(x, obs): x = x_i batch_sizes.append(x_i.shape[0]) assert counter > 0 - assert x.shape[1] == store.n_vars["mouse"], (x.shape, store.n_vars["mouse"]) + assert x.shape[1] == store.n_vars["Mus musculus"], (x.shape, store.n_vars["Mus musculus"]) assert np.sum(batch_sizes) == nobs, (batch_sizes, nobs) assert x.shape[1] == gc.n_var, (x.shape, gc.n_var) + + +@pytest.mark.parametrize("store_format", ["h5ad", "dao"]) +@pytest.mark.parametrize("idx", [None, np.array([1, 4, 98])]) +@pytest.mark.parametrize("randomized_batch_access", [True, False]) +def test_generator_blocked_data(store_format: str, idx, randomized_batch_access: bool): + """ + Test generators queries do not throw errors and that output shapes are correct. 
+ """ + block_col = AdataIdsSfaira().cell_type + obs_keys = [block_col] + # Need to re-write because specific obs_keys are required: + store_path = PrepareData().prepare_store(store_format=store_format) + store = load_store(cache_path=store_path, store_format=store_format) + store.subset(attr_key="organism", values=["Homo sapiens"]) + gc = GenomeContainer(release=RELEASE_HUMAN, organism="Homo sapiens") + gc.set(**{"biotype": "protein_coding"}) + store.genome_container = gc + + def map_fn(x, obs): + return (obs, ), + + block_vals = store.obs["Homo sapiens"][block_col].values + g = store.generator(idx={"Homo sapiens": idx}, batch_size=0, map_fn=map_fn, obs_keys=obs_keys, + randomized_batch_access=randomized_batch_access, + batch_schedule="blocks", grouping=block_vals) + g = g.iterator + batch_sizes = [] + for i, z in enumerate(g()): + obs_i, = z[0] + # Check that this batch is one single block: + assert len(np.unique(obs_i[block_col].values)) == 1 + batch_sizes.append(obs_i.shape[0]) + assert len(batch_sizes) > 0 + # Check that one batch was emitted per block: + if idx is None: + assert len(np.unique(block_vals)) == len(batch_sizes) + else: + assert len(np.unique(block_vals[idx])) == len(batch_sizes) + # Check that the total number of observations across blocks is correct: + if idx is None: + assert np.sum(batch_sizes) == store.n_obs + else: + assert np.sum(batch_sizes) == len(idx) diff --git a/sfaira/unit_tests/tests_by_submodule/estimators/test_estimator.py b/sfaira/unit_tests/tests_by_submodule/estimators/test_estimator.py index af1168005..d2bea960a 100644 --- a/sfaira/unit_tests/tests_by_submodule/estimators/test_estimator.py +++ b/sfaira/unit_tests/tests_by_submodule/estimators/test_estimator.py @@ -6,24 +6,26 @@ import pytest from typing import Union -from sfaira.consts import AdataIdsSfaira, CACHE_DIR +from sfaira import settings +from sfaira.consts import AdataIdsSfaira from sfaira.data import DistributedStoreSingleFeatureSpace, DistributedStoreMultipleFeatureSpaceBase, load_store from sfaira.estimators import EstimatorKeras, EstimatorKerasCelltype, EstimatorKerasEmbedding from sfaira.versions.genomes.genomes import CustomFeatureContainer from sfaira.versions.metadata import OntologyOboCustom from sfaira.versions.topologies import TopologyContainer -from sfaira.unit_tests.data_for_tests.loaders.consts import CELLTYPES, CL_VERSION -from sfaira.unit_tests.data_for_tests.loaders.utils import prepare_dsg, prepare_store +from sfaira.unit_tests.data_for_tests.loaders.consts import CELLTYPES, CL_VERSION, ASSEMBLY_HUMAN, ASSEMBLY_MOUSE +from sfaira.unit_tests.data_for_tests.loaders.utils import PrepareData from sfaira.unit_tests.directories import DIR_TEMP -CACHE_DIR_GENOMES = os.path.join(CACHE_DIR, "genomes") - ADATA_IDS = AdataIdsSfaira() -ASSEMBLY = ADATA_IDS.feature_kwargs["match_to_reference"] +ASSEMBLY = { + "Homo sapiens": ASSEMBLY_HUMAN, + "Mus musculus": ASSEMBLY_MOUSE, +} GENES = { - "mouse": ["ENSMUSG00000000003", "ENSMUSG00000000028"], - "human": ["ENSG00000000003", "ENSG00000000005"], + "Homo sapiens": ["ENSG00000000003", "ENSG00000000005"], + "Mus musculus": ["ENSMUSG00000000003", "ENSMUSG00000000028"], } TARGETS = CELLTYPES TARGET_UNIVERSE = CELLTYPES @@ -69,8 +71,9 @@ class HelperEstimatorBase: data: Union[anndata.AnnData, DistributedStoreSingleFeatureSpace, DistributedStoreMultipleFeatureSpaceBase] tc: TopologyContainer - def load_adata(self, organism="human", organ=None, match_to_reference=None): - dsg = prepare_dsg(load=True, match_to_reference=match_to_reference) + def 
load_adata(self, organism="Homo sapiens", organ=None, match_to_reference=None): + data = PrepareData() + dsg = data.prepare_dsg(load=True, match_to_release=match_to_reference) dsg.subset(key="doi_journal", values=["no_doi_mock1", "no_doi_mock2", "no_doi_mock3"]) if organism is not None: dsg.subset(key="organism", values=organism) @@ -79,8 +82,9 @@ def load_adata(self, organism="human", organ=None, match_to_reference=None): self.adata_ids = dsg.dataset_groups[0]._adata_ids self.data = dsg.adata_ls - def load_store(self, organism="human", organ=None, match_to_reference=None): - store_path = prepare_store(store_format="dao", match_to_reference=match_to_reference) + def load_store(self, organism="Homo sapiens", organ=None, match_to_reference=None): + data = PrepareData() + store_path = data.prepare_store(store_format="dao", match_to_reference=match_to_reference) store = load_store(cache_path=store_path, store_format="dao") store.subset(attr_key="doi_journal", values=["no_doi_mock1", "no_doi_mock2", "no_doi_mock3"]) if organism is not None: @@ -91,12 +95,13 @@ def load_store(self, organism="human", organ=None, match_to_reference=None): self.data = store.stores[organism] def load_multistore(self): - store_path = prepare_store(store_format="dao") + data = PrepareData() + store_path = data.prepare_store(store_format="dao") store = load_store(cache_path=store_path, store_format="dao") store.subset(attr_key="doi_journal", values=["no_doi_mock1", "no_doi_mock2", "no_doi_mock3"]) self.adata_ids = store._adata_ids_sfaira - assert "mouse" in store.stores.keys(), store.stores.keys() - assert "human" in store.stores.keys(), store.stores.keys() + assert "Mus musculus" in store.stores.keys(), store.stores.keys() + assert "Homo sapiens" in store.stores.keys(), store.stores.keys() self.data = store @@ -144,7 +149,7 @@ def estimator_train(self, test_split, randomized_batch_access): def basic_estimator_test(self): pass - def load_estimator(self, model_type, data_type, feature_space, test_split, organism="human"): + def load_estimator(self, model_type, data_type, feature_space, test_split, organism="Homo sapiens"): self.init_topology(model_type=model_type, feature_space=feature_space, organism=organism) np.random.seed(1) if data_type == "adata": @@ -171,7 +176,8 @@ def init_topology(self, model_type: str, feature_space: str, organism: str): if feature_space == "full": # Read 500 genes (not full protein coding) to compromise between being able to distinguish observations # and reducing run time of unit tests. 
- tab = pd.read_csv(os.path.join(CACHE_DIR_GENOMES, ASSEMBLY[organism] + ".csv")) + tab = pd.read_csv(os.path.join(settings.cachedir_genomes, "_".join(organism.split(" ")).lower(), + ASSEMBLY[organism] + ".csv")) genes_full = tab.loc[tab["gene_biotype"].values == "protein_coding", "gene_id"].values[:500].tolist() topology["input"]["genes"] = ["ensg", genes_full] else: @@ -283,11 +289,12 @@ def init_genome_custom(self, n_features) -> CustomFeatureContainer: "gene_id": ["dim_" + str(i) for i in range(n_features)], "gene_biotype": ["embedding" for _ in range(n_features)], }), - organism="homo_sapiens", + organism="Homo sapiens", ) - def load_adata(self, organism="human", organ=None): - dsg = prepare_dsg(load=True) + def load_adata(self, organism="Homo sapiens", organ=None): + data = PrepareData() + dsg = data.prepare_dsg(load=True) dsg.subset(key="doi_journal", values=["no_doi_mock1", "no_doi_mock3", "no_doi_mock3"]) if organism is not None: dsg.subset(key="organism", values=organism) @@ -400,7 +407,7 @@ def test_dataset_size(batch_size: int, randomized_batch_access: bool): # Need full feature space here because observations are not necessarily different in small model testing feature # space with only two genes: test_estim.load_estimator(model_type="linear", data_type="store", feature_space="reduced", test_split=0.2, - organism="human") + organism="Homo sapiens") idx_train = test_estim.estimator.idx_train ds_train = test_estim.estimator.get_one_time_tf_dataset(idx=idx_train, batch_size=batch_size, mode='eval') x_train_shape = 0 @@ -425,7 +432,7 @@ def map_fn(x, obs): @pytest.mark.parametrize("data_type", ["adata", "store"]) -@pytest.mark.parametrize("test_split", [0.3, {"id": "human_lung_2021_10xtechnology_mock1_001_no_doi_mock1"}]) +@pytest.mark.parametrize("test_split", [0.3, {"id": "homosapiens_lung_2021_10xtechnology_mock1_001_no_doi_mock1"}]) def test_split_index_sets(data_type: str, test_split): """ Test that train, val, test split index sets are correct: @@ -438,8 +445,8 @@ def test_split_index_sets(data_type: str, test_split): test_estim = HelperEstimatorKerasEmbedding() # Need full feature space here because observations are not necessarily different in small model testing feature # space with only two genes: - test_estim.load_estimator(model_type="linear", data_type=data_type, feature_space="full", organism="human", - test_split=test_split) + test_estim.load_estimator(model_type="linear", data_type=data_type, feature_space="full", + organism="Homo sapiens", test_split=test_split) idx_train = test_estim.estimator.idx_train idx_eval = test_estim.estimator.idx_eval idx_test = test_estim.estimator.idx_test diff --git a/sfaira/unit_tests/tests_by_submodule/trainer/test_trainer.py b/sfaira/unit_tests/tests_by_submodule/trainer/test_trainer.py index 5f9b401f2..4cf37edc5 100644 --- a/sfaira/unit_tests/tests_by_submodule/trainer/test_trainer.py +++ b/sfaira/unit_tests/tests_by_submodule/trainer/test_trainer.py @@ -1,6 +1,9 @@ +import abc + import anndata import numpy as np import os +import pathlib from typing import Union from sfaira.consts.ontologies import DEFAULT_UBERON, DEFAULT_CL @@ -18,6 +21,8 @@ def get_cu(): Get file name of a target universe for loading by trainer. """ # Create temporary cell type universe to give to trainer. 
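+ # pathlib's mkdir(parents=True, exist_ok=True) creates missing parent directories and is a
+ # no-op if the target already exists, so the existence check below is merely defensive.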
+    if not os.path.exists(DIR_TEMP):
+        pathlib.Path(DIR_TEMP).mkdir(parents=True, exist_ok=True)
     fn = os.path.join(DIR_TEMP, "universe_temp.csv")
     cl = OntologyCl(branch=DEFAULT_CL)
     uberon = OntologyUberon(branch=DEFAULT_UBERON)
@@ -27,7 +32,7 @@ def get_cu():
     return fn

-class HelperTrainerBase(HelperEstimatorBase):
+class HelperTrainerBase:

     data: Union[anndata.AnnData, load_store]
     trainer: Union[TrainModelCelltype, TrainModelEmbedding]
@@ -36,6 +41,18 @@ def __init__(self, zoo: ModelZoo):
         self.model_id = zoo.model_id
         self.tc = zoo.topology_container

+    def load_adata(self, **kwargs):
+        """
+        This is inherited from estimator test helper.
+        """
+        pass
+
+    def load_store(self, **kwargs):
+        """
+        This is inherited from estimator test helper.
+        """
+        pass
+
     def load_data(self, data_type):
         """
         Builds training data according to reference used in model definition.
@@ -45,11 +62,11 @@ def load_data(self, data_type):
         """
         np.random.seed(1)
         if data_type == "adata":
-            self.load_adata(organism="human", match_to_reference=self.tc.gc.assembly)
+            self.load_adata(organism="Homo sapiens", match_to_reference=self.tc.gc.release)
         else:
-            self.load_store(organism="human", match_to_reference=self.tc.gc.assembly)
+            self.load_store(organism="Homo sapiens", match_to_reference=self.tc.gc.release)

-    def test_init(self, cls, **kwargs):
+    def test_init(self, cls, estimator_kwargs: dict = {}, **kwargs):
         if not os.path.exists(DIR_TEMP):
             os.mkdir(DIR_TEMP)
         self.load_data(data_type="adata")
@@ -59,30 +76,42 @@ def test_init(self, cls, **kwargs):
             **kwargs
         )
         self.trainer.zoo.model_id = self.model_id
-        self.trainer.init_estim(override_hyperpar={})
+        self.trainer.init_estim(override_hyperpar={}, **estimator_kwargs)

     def test_save(self):
         if not os.path.exists(DIR_TEMP):
             os.mkdir(DIR_TEMP)
-        self.trainer.estimator.train(epochs=1, max_steps_per_epoch=1, test_split=0.1, validation_split=0.1,
-                                     optimizer="adam", lr=0.005)
-        self.trainer.save(fn=os.path.join(DIR_TEMP, "trainer"), model=True, specific=True)
+        self.trainer.estimator.train(
+            epochs=1,
+            max_steps_per_epoch=1,
+            test_split=0.1,
+            validation_split=0.1,
+            optimizer="adam",
+            lr=0.005,
+        )
+        self.trainer.save(
+            fn=os.path.join(DIR_TEMP, "trainer"), model=True, specific=True
+        )
+
+
+class HelperTrainer(HelperEstimatorBase, HelperTrainerBase):
+    pass

 def test_save_embedding():
-    model_id = "embedding_human-lung-linear-0.1-0.1_mylab"
+    model_id = "embedding_homosapiens-lung-linear-0.1-0.1_mylab"
     zoo = ModelZoo()
     zoo.model_id = model_id
-    test_trainer = HelperTrainerBase(zoo=zoo)
+    test_trainer = HelperTrainer(zoo=zoo)
     test_trainer.test_init(cls=TrainModelEmbedding)
     test_trainer.test_save()

 def test_save_celltypes():
     tmp_fn = get_cu()
-    model_id = "celltype_human-lung-mlp-0.0.1-0.1_mylab"
+    model_id = "celltype_homosapiens-lung-mlp-0.0.1-0.1_mylab"
     zoo = ModelZoo()
     zoo.model_id = model_id
-    test_trainer = HelperTrainerBase(zoo=zoo)
+    test_trainer = HelperTrainer(zoo=zoo)
     test_trainer.test_init(cls=TrainModelCelltype, fn_target_universe=tmp_fn)
     test_trainer.test_save()
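Note on the HelperTrainer shim above: Python's method resolution order places HelperEstimatorBase before HelperTrainerBase, so the real load_adata/load_store implementations shadow the stubs defined in HelperTrainerBase. A minimal self-contained sketch of this pattern with illustrative names (not sfaira code):

```python
class RealLoader:
    def load(self):
        return "real implementation"


class Stub:
    def load(self):
        """Placeholder, shadowed when combined with RealLoader."""
        pass


class Combined(RealLoader, Stub):
    pass


# RealLoader precedes Stub in the method resolution order, so its
# implementation wins:
assert Combined().load() == "real implementation"
assert Combined.__mro__.index(RealLoader) < Combined.__mro__.index(Stub)
```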
diff --git a/sfaira/unit_tests/tests_by_submodule/ui/test_userinterface.py b/sfaira/unit_tests/tests_by_submodule/ui/test_userinterface.py
index 5fefc2b56..b5898c1b5 100644
--- a/sfaira/unit_tests/tests_by_submodule/ui/test_userinterface.py
+++ b/sfaira/unit_tests/tests_by_submodule/ui/test_userinterface.py
@@ -4,10 +4,10 @@
 import pandas as pd
 import urllib.request

+from sfaira import settings
 from sfaira.ui import UserInterface
-from sfaira.unit_tests.data_for_tests.loaders.utils import prepare_dsg
+from sfaira.unit_tests.data_for_tests.loaders.utils import PrepareData
 from sfaira.unit_tests import DIR_TEMP
-from sfaira.consts import SFAIRA_REPO_URL

 class HelperUi:
@@ -29,7 +33,11 @@ def prepare_local_tempfiles(self):
         if not os.path.exists(self.temp_fn):
             os.makedirs(self.temp_fn)
         # download an example weight from sfaira repo
-        lookuptable = pd.read_csv(os.path.join(SFAIRA_REPO_URL, 'model_lookuptable.csv'), header=0, index_col=0)
+        lookuptable = pd.read_csv(
+            os.path.join(settings.sfaira_repo_url, 'model_lookuptable.csv'),
+            header=0,
+            index_col=0
+        )
         url = lookuptable.loc[0, "model_file_path"]
         if os.path.basename(url) not in os.listdir(self.temp_fn):
             urllib.request.urlretrieve(url, os.path.join(self.temp_fn, os.path.basename(url)))
@@ -40,8 +44,8 @@ def _get_adata(self):
         :return:
         """
-        dsg = prepare_dsg(rewrite=True, load=False)
-        dsg.subset(key="id", values=["human_lung_2021_None_mock4_001_no_doi_mock4"])
+        dsg = PrepareData().prepare_dsg(rewrite=True, load=False)
+        dsg.subset(key="id", values=["homosapiens_lung_2021_None_mock4_001_no_doi_mock4"])
         dsg.load()
         return dsg.adata
@@ -59,10 +63,10 @@ def test_public_repo_ui_init(self):
         :return:
         """
-        self.ui = UserInterface(custom_repo=None, sfaira_repo=True)
+        self.ui = UserInterface(custom_repo=None, sfaira_repo=True, cache_path=self.temp_fn)

     def test_data_and_model_loading(self):
-        self.ui = UserInterface(custom_repo=None, sfaira_repo=True)
+        self.ui = UserInterface(custom_repo=None, sfaira_repo=True, cache_path=self.temp_fn)
         self.ui.zoo_embedding.model_id = 'embedding_human-blood-ae-0.2-0.1_theislab'
         self.ui.zoo_celltype.model_id = 'celltype_human-blood-mlp-0.1.3-0.1_theislab'
         test_data = self._get_adata()
diff --git a/sfaira/unit_tests/tests_by_submodule/ui/test_zoo.py b/sfaira/unit_tests/tests_by_submodule/ui/test_zoo.py
index 988374128..038547d38 100644
--- a/sfaira/unit_tests/tests_by_submodule/ui/test_zoo.py
+++ b/sfaira/unit_tests/tests_by_submodule/ui/test_zoo.py
@@ -2,12 +2,12 @@

 def test_for_fatal_embedding():
-    model_id = "embedding_human-lung-linear-0.1-0.1_mylab"
+    model_id = "embedding_homosapiens-lung-linear-0.1-0.1_mylab"
     zoo = ModelZoo()
     zoo.model_id = model_id
     assert zoo.model_id == model_id
     assert zoo.model_class == "embedding"
-    assert zoo.model_name == "human-lung-linear-0.1-0.1"
+    assert zoo.model_name == "homosapiens-lung-linear-0.1-0.1"
     assert zoo.organisation == "mylab"
     _ = zoo.topology_container
     _ = zoo.topology_container.topology
@@ -15,12 +15,12 @@

 def test_for_fatal_celltype():
-    model_id = "celltype_human-lung-mlp-0.0.1-0.1_mylab"
+    model_id = "celltype_homosapiens-lung-mlp-0.0.1-0.1_mylab"
     zoo = ModelZoo()
     zoo.model_id = model_id
     assert zoo.model_id == model_id
     assert zoo.model_class == "celltype"
-    assert zoo.model_name == "human-lung-mlp-0.0.1-0.1"
+    assert zoo.model_name == "homosapiens-lung-mlp-0.0.1-0.1"
     assert zoo.organisation == "mylab"
     _ = zoo.topology_container
     _ = zoo.topology_container.topology
diff --git a/sfaira/unit_tests/tests_by_submodule/versions/test_genomes.py b/sfaira/unit_tests/tests_by_submodule/versions/test_genomes.py
index 3403fa45f..eaba6c267 100644
--- a/sfaira/unit_tests/tests_by_submodule/versions/test_genomes.py
+++ b/sfaira/unit_tests/tests_by_submodule/versions/test_genomes.py
@@ -4,20 +4,20 @@
 from sfaira.versions.genomes import GenomeContainer, translate_id_to_symbols, translate_symbols_to_id

-ASSEMBLY = "Mus_musculus.GRCm38.102"
+RELEASE = "102"
+ORGANISM = "Mus musculus"

 """
 GenomeContainer.
 """

-@pytest.mark.parametrize("assembly", [ASSEMBLY])
-def test_gc_init(assembly: Union[str]):
+def test_gc_init():
     """
     Tests different modes of initialisation for fatal errors.
     """
-    gc = GenomeContainer(assembly=assembly)
-    assert gc.organism == "mus_musculus"
+    gc = GenomeContainer(release=RELEASE, organism=ORGANISM)
+    assert gc.organism == "Mus musculus"

 @pytest.mark.parametrize("subset", [
@@ -31,8 +31,8 @@ def test_gc_subsetting(subset: Tuple[dict, int]):
     """
     Tests if genome container is subsetted correctly.
     """
-    gc = GenomeContainer(assembly="Mus_musculus.GRCm38.102")
-    gc.subset(**subset[0])
+    gc = GenomeContainer(release=RELEASE, organism=ORGANISM)
+    gc.set(**subset[0])
     assert gc.n_var == subset[1]
     assert len(gc.ensembl) == subset[1]
     assert len(gc.symbols) == subset[1]
@@ -56,12 +56,12 @@ def test_translate_id_to_symbols(genes):
     Tests translate_id_to_symbols and translate_symbols_to_id for translation errors.
     """
     x, y = genes
-    y_hat = translate_symbols_to_id(x=x, assembly="Mus_musculus.GRCm38.102")
+    y_hat = translate_symbols_to_id(x=x, release=RELEASE, organism=ORGANISM)
     # Correct target spelling of y:
     y = [z.upper() for z in y] if isinstance(y, list) else y.upper()
     assert np.all(y_hat == y)
     y, x = genes
-    y_hat = translate_id_to_symbols(x=x, assembly="Mus_musculus.GRCm38.102")
+    y_hat = translate_id_to_symbols(x=x, release=RELEASE, organism=ORGANISM)
     # Correct target spelling of y:
     y = [z[0].upper() + z[1:].lower() for z in y] if isinstance(y, list) else y[0].upper() + y[1:].lower()
     assert np.all(y_hat == y)
diff --git a/sfaira/unit_tests/tests_by_submodule/versions/test_ontologies.py b/sfaira/unit_tests/tests_by_submodule/versions/test_ontologies.py
index 4b8840b1f..0638d2e45 100644
--- a/sfaira/unit_tests/tests_by_submodule/versions/test_ontologies.py
+++ b/sfaira/unit_tests/tests_by_submodule/versions/test_ontologies.py
@@ -1,8 +1,8 @@
 import numpy as np
 from sfaira.consts.ontologies import DEFAULT_CL, DEFAULT_HSAPDV, DEFAULT_MONDO, DEFAULT_MMUSDV, DEFAULT_PATO, \
-    DEFAULT_UBERON
+    DEFAULT_NCBITAXON, DEFAULT_UBERON
 from sfaira.versions.metadata import OntologyUberon, OntologyCl, OntologyHancestro, OntologyHsapdv, OntologyMondo, \
-    OntologyMmusdv, OntologyEfo, OntologySex
+    OntologyMmusdv, OntologyEfo, OntologyTaxon, OntologySex, OntologyUberonLifecyclestage

 """
 OntologyCelltypes
@@ -27,7 +27,7 @@ def test_cl_is_a():
     assert not oc.is_a(query="lymphocyte", reference="T cell")

-def test_cl_effective_leaves():
+def test_effective_leaves():
     """
     Tests if node sets can be mapped to effective leaf sets via `OntologyCelltypes.get_effective_leaves()`
     """
@@ -40,7 +40,7 @@ def test_effective_leaves():
     assert set(x) == {"stromal cell", "T-helper 1 cell", "T-helper 17 cell"}, x

-def test_cl_map_leaves():
+def test_map_leaves():
     """
     Tests if nodes can be mapped to leave nodes in ontology.
     """
@@ -51,7 +51,7 @@ def test_map_leaves():
     assert np.all(leaf_map_2 == np.sort([oc.convert_to_name(oc.leaves).index(x) for x in list(leaf_map_1)]))

-def test_cl_set_leaves():
+def test_set_leaves():
     """
     Tests if ontology behaves correctly if leaf nodes were reset.
     """
@@ -72,6 +72,17 @@ def test_set_leaves():
     assert np.all(leaf_map_4 == np.sort([oc.convert_to_name(oc.leaves).index(x) for x in list(leaf_map_3)]))

+def test_reset_root():
+    """
+    Tests if root can be reset correctly.
+    """
+    oc = OntologyCl(branch=DEFAULT_CL, use_developmental_relationships=False)
+    oc.reset_root(root="T cell")
+    assert "T-helper 1 cell" in oc.node_names
+    assert "T cell" in oc.node_names
+    assert "lymphocyte" not in oc.node_names
+
+
 """
 OntologyEfo
 """
@@ -151,6 +162,16 @@ def test_mmusdv_loading():
     _ = OntologyMmusdv(branch=DEFAULT_MMUSDV, recache=False)

+"""
+NCBI Taxon
+"""
+
+
+def test_taxon_loading():
+    _ = OntologyTaxon(branch=DEFAULT_NCBITAXON, recache=True)
+    _ = OntologyTaxon(branch=DEFAULT_NCBITAXON, recache=False)
+
+
 """
 Sex
 """
@@ -171,6 +192,10 @@ def test_uberon_loading():
     _ = OntologyUberon(branch=DEFAULT_UBERON, recache=False)

+def test_uberon_lcs_loading():
+    _ = OntologyUberonLifecyclestage(branch=DEFAULT_UBERON, recache=False)
+
+
 def test_uberon_subsetting():
     ou = OntologyUberon(branch=DEFAULT_UBERON)
     assert ou.is_a(query="lobe of lung", reference="lung")
diff --git a/sfaira/versions/genomes/__init__.py b/sfaira/versions/genomes/__init__.py
index afc17716d..a91511877 100644
--- a/sfaira/versions/genomes/__init__.py
+++ b/sfaira/versions/genomes/__init__.py
@@ -1,2 +1,2 @@
-from .genomes import GenomeContainer, GtfInterface
+from .genomes import ReactiveFeatureContainer, CustomFeatureContainer, GenomeContainer, GtfInterface
 from .utils import translate_id_to_symbols, translate_symbols_to_id
diff --git a/sfaira/versions/genomes/genomes.py b/sfaira/versions/genomes/genomes.py
index 728bb276a..f50ca0fd1 100644
--- a/sfaira/versions/genomes/genomes.py
+++ b/sfaira/versions/genomes/genomes.py
@@ -1,7 +1,8 @@
 """
-Functionalities to interact with gene sets defined in an assembly and gene-annotation (such as protein-coding).
+Functionalities to interact with feature sets defined in an assembly or interactively by the user.
 """
-
+import abc
+import ftplib
 import gzip
 import numpy as np
 import os
@@ -11,7 +12,7 @@
 import urllib.error
 import urllib.request

-from sfaira.consts.directories import CACHE_DIR_GENOMES
+from sfaira import settings

 KEY_SYMBOL = "gene_name"
 KEY_ID = "gene_id"
@@ -26,33 +27,56 @@ class GtfInterface:

-    def __init__(self, assembly: str):
-        self.assembly = assembly
+    release: str
+    organism: str
+
+    def __init__(self, release: str, organism: str):
+        self.release = release
+        self.organism = organism

     @property
     def cache_dir(self):
         """
-        The cache dir is in a cache directory in the sfaira installation that is excempt from git versioning.
+        The cache dir is in a cache directory in the user's home directory by default and can be modified by the user.
         """
-        cache_dir_path = pathlib.Path(CACHE_DIR_GENOMES)
+        cache_dir_path = os.path.join(settings.cachedir_genomes, self.ensembl_organism)
+        cache_dir_path = pathlib.Path(cache_dir_path)
         cache_dir_path.mkdir(parents=True, exist_ok=True)
-        return CACHE_DIR_GENOMES
+        return cache_dir_path

     @property
     def cache_fn(self):
         return os.path.join(self.cache_dir, self.assembly + ".csv")

     @property
-    def release(self) -> str:
-        return self.assembly.split(".")[-1]
+    def assembly(self) -> str:
+        # Get the variable middle string of the assembly name by looking up files on the ftp server:
+        ftp = ftplib.FTP("ftp.ensembl.org")
+        ftp.login()
+        ftp.cwd(self.url_ensembl_dir)
+        data = []
+        ftp.dir(data.append)
+        ftp.quit()
+        target_file = [line.split(' ')[-1] for line in data]
+        # Filter assembly files starting with the organism name:
+        target_file = [x for x in target_file if x.split(".")[0].lower() == self.ensembl_organism]
+        # Filter target assembly:
+        target_file = [x for x in target_file if len(x.split(".")) == 5]
+        assert len(target_file) == 1, target_file  # There should only be one file left if filters work correctly.
+        assembly = target_file[0].split(".gtf.gz")[0]
+        return assembly
+
+    @property
+    def ensembl_organism(self):
+        return "_".join([x.lower() for x in self.organism.split(" ")])

     @property
-    def organism(self) -> str:
-        return self.assembly.split(".")[0].lower()
+    def url_ensembl_dir(self):
+        return f"pub/release-{self.release}/gtf/{self.ensembl_organism}"

     @property
-    def url_ensembl_ftp(self):
-        return f"ftp://ftp.ensembl.org/pub/release-{self.release}/gtf/{self.organism}/{self.assembly}.gtf.gz"
+    def url_ensembl_gtf(self):
+        return f"ftp://ftp.ensembl.org/{self.url_ensembl_dir}/{self.assembly}.gtf.gz"

     def download_gtf_ensembl(self):
         """
@@ -60,13 +84,16 @@ def download_gtf_ensembl(self):
         """
         temp_file = os.path.join(self.cache_dir, self.assembly + ".gtf.gz")
         try:
-            _ = urllib.request.urlretrieve(url=self.url_ensembl_ftp, filename=temp_file)
+            _ = urllib.request.urlretrieve(url=self.url_ensembl_gtf, filename=temp_file)
         except urllib.error.URLError as e:
-            raise ValueError(f"Could not download gtf from {self.url_ensembl_ftp} with urllib.error.URLError: {e}, "
+            raise ValueError(f"Could not download gtf from {self.url_ensembl_gtf} with urllib.error.URLError: {e}, "
                              f"check if assembly name '{self.assembly}' corresponds to an actual assembly.")
         with gzip.open(temp_file) as f:
             tab = pandas.read_csv(f, sep="\t", comment="#", header=None)
-        os.remove(temp_file)  # Delete temporary file .gtf.gz.
+        # Delete temporary file .gtf.gz if it still exists (sometimes already deleted by a parallel process in a grid
+        # search).
+        if os.path.exists(temp_file):
+            os.remove(temp_file)
         tab = tab.loc[tab[KEY_GTF_REGION_TYPE].values == VALUE_GTF_GENE, :]
         conversion_tab = pandas.DataFrame({
             KEY_ID: [
@@ -88,7 +115,37 @@ def cache(self) -> pandas.DataFrame:
         return pandas.read_csv(self.cache_fn)

-class GenomeContainer:
+class GenomeContainerBase:
+    """
+    Base container class for a gene annotation.
+    """
+
+    @abc.abstractmethod
+    def organism(self):
+        pass
+
+    @abc.abstractmethod
+    def set(self, **kwargs):
+        pass
+
+    @abc.abstractmethod
+    def symbols(self) -> List[str]:
+        pass
+
+    @abc.abstractmethod
+    def ensembl(self) -> List[str]:
+        pass
+
+    @abc.abstractmethod
+    def biotype(self) -> List[str]:
+        pass
+
+    @abc.abstractmethod
+    def n_var(self):
+        pass
+
+
+class GenomeContainer(GenomeContainerBase):
     """
     Container class for a genome annotation for a specific release.
@@ -97,11 +154,13 @@ class GenomeContainer:
     """

     genome_tab: pandas.DataFrame
-    assembly: str
+    organism: str
+    release: str

     def __init__(
             self,
-            assembly: str = None,
+            organism: str = None,
+            release: str = None,
     ):
         """
         Are you not sure which assembly to use?
@@ -111,22 +170,21 @@ def __init__(
         - You could use one used by a specific aligner, the assemblies used by 10x cellranger are described here
             for example: https://support.10xgenomics.com/single-cell-gene-expression/software/release-notes/build

-        :param assembly: The full name of the genome assembly, e.g. Homo_sapiens.GRCh38.102.
+        :param release: The Ensembl release of the genome assembly, e.g. "102" for "Homo_sapiens.GRCh38.102".
         """
-        if not isinstance(assembly, str):
-            raise ValueError(f"supplied assembly {assembly} was not a string")
-        self.assembly = assembly
-        self.gtfi = GtfInterface(assembly=self.assembly)
+        if not isinstance(organism, str):
+            raise ValueError(f"supplied organism {organism} was not a string")
+        if not isinstance(release, str):
+            raise ValueError(f"supplied release {release} was not a string")
+        self.organism = organism
+        self.release = release
+        self.gtfi = GtfInterface(organism=self.organism, release=self.release)
         self.load_genome()

-    @property
-    def organism(self):
-        return self.gtfi.organism
-
     def load_genome(self):
         self.genome_tab = self.gtfi.cache

-    def subset(
+    def set(
             self,
             biotype: Union[None, str, List[str]] = None,
             symbols: Union[None, str, List[str]] = None,
@@ -312,3 +370,53 @@ def __init__(
     @property
     def organism(self):
         return self._organism
+
+
+class ReactiveFeatureContainer(GenomeContainerBase):
+
+    """
+    Container class for features that can be loaded reactively based on the features present in the data.
+
+    The symbols are added by the store that uses this container.
+    """
+
+    def __init__(self, **kwargs):
+        self._symbols = None
+
+    @property
+    def organism(self):
+        return None
+
+    def set(
+            self,
+            symbols: Union[None, str, List[str]] = None,
+    ):
+        """
+        Set feature space to identifiers (symbol or Ensembl ID).
+
+        Note that there is no background (assembly) feature space to subset in this class.
+
+        :param symbols: Feature identifier(s) to set the feature space to.
+            Separate in a string via "," if choosing multiple, or supply as a list of strings.
+        """
+        self._symbols = symbols
+
+    @property
+    def symbols(self) -> List[str]:
+        return self._symbols
+
+    @symbols.setter
+    def symbols(self, x):
+        self._symbols = x
+
+    @property
+    def ensembl(self) -> List[str]:
+        return self._symbols
+
+    @property
+    def biotype(self) -> List[str]:
+        return ["custom" for x in self._symbols]
+
+    @property
+    def n_var(self) -> int:
+        return len(self.symbols) if self.symbols is not None else None
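With the constructor split into organism and release, and subset renamed to set, typical usage of both container classes looks as follows. A hedged sketch with illustrative values (the reactive container is normally populated by a store rather than by hand):

```python
from sfaira.versions.genomes import GenomeContainer, ReactiveFeatureContainer

# Annotation-backed container: the full assembly name is resolved internally.
gc = GenomeContainer(organism="Mus musculus", release="102")
gc.set(biotype="protein_coding")     # was gc.subset(...) before this change
print(gc.n_var, len(gc.ensembl), len(gc.symbols))

# Annotation-free container: the feature space is whatever the data provides.
rfc = ReactiveFeatureContainer()
rfc.set(symbols=["GeneA", "GeneB"])  # illustrative feature names
assert rfc.n_var == 2 and rfc.biotype == ["custom", "custom"]
```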
"Homo_sapiens.GRCh38.102". + :param release: The name of the release of genome assembly, e.g. "102" for assembly "Homo_sapiens.GRCh38.102". + :param organism: The name of the organism of genome assembly, e.g. "Homo sapiens" for assembly + "Homo_sapiens.GRCh38.102". :return: ENSEMBL IDs """ - return GenomeContainer(assembly=assembly).translate_symbols_to_id(x=x) + return GenomeContainer(release=release, organism=organism).translate_symbols_to_id(x=x) -def translate_id_to_symbols(x: Union[str, Iterable[str]], assembly: str) -> Union[str, List[str]]: +def translate_id_to_symbols(x: Union[str, Iterable[str]], release: str, organism: str) -> Union[str, List[str]]: """ Translate ENSEMBL IDs to gene symbols. @@ -37,7 +39,9 @@ def translate_id_to_symbols(x: Union[str, Iterable[str]], assembly: str) -> Unio for example: https://support.10xgenomics.com/single-cell-gene-expression/software/release-notes/build :param x: ENSEMBL ID(s) to translate. - :param assembly: The full name of the genome assembly, e.g. "Homo_sapiens.GRCh38.102". + :param release: The name of the release of genome assembly, e.g. "102" for assembly "Homo_sapiens.GRCh38.102". + :param organism: The name of the organism of genome assembly, e.g. "Homo sapiens" for assembly + "Homo_sapiens.GRCh38.102". :return: Gene symbols. """ - return GenomeContainer(assembly=assembly).translate_id_to_symbols(x=x) + return GenomeContainer(release=release, organism=organism).translate_id_to_symbols(x=x) diff --git a/sfaira/versions/metadata/__init__.py b/sfaira/versions/metadata/__init__.py index e0b6579d6..86bcfbdcd 100644 --- a/sfaira/versions/metadata/__init__.py +++ b/sfaira/versions/metadata/__init__.py @@ -1,4 +1,4 @@ from sfaira.versions.metadata.base import Ontology, OntologyList, OntologyHierarchical, OntologyObo, \ OntologyOboCustom, OntologyCl, OntologyHancestro, OntologyUberon, OntologyHsapdv, OntologyMondo, \ - OntologyMmusdv, OntologySex, OntologyEfo, OntologyCellosaurus + OntologyMmusdv, OntologySex, OntologyEfo, OntologyCellosaurus, OntologyTaxon, OntologyUberonLifecyclestage from sfaira.versions.metadata.universe import CelltypeUniverse diff --git a/sfaira/versions/metadata/base.py b/sfaira/versions/metadata/base.py index dacf85591..d62c7e86a 100644 --- a/sfaira/versions/metadata/base.py +++ b/sfaira/versions/metadata/base.py @@ -6,10 +6,11 @@ import owlready2 import pickle import requests +from functools import lru_cache from typing import Dict, List, Tuple, Union -from sfaira.consts.directories import CACHE_DIR_ONTOLOGIES +from sfaira import settings """ Ontology managament classes. @@ -33,7 +34,7 @@ def cached_load_file(url, ontology_cache_dir, ontology_cache_fn, recache: bool = # TODO add caching option. 
diff --git a/sfaira/versions/metadata/__init__.py b/sfaira/versions/metadata/__init__.py
index e0b6579d6..86bcfbdcd 100644
--- a/sfaira/versions/metadata/__init__.py
+++ b/sfaira/versions/metadata/__init__.py
@@ -1,4 +1,4 @@
 from sfaira.versions.metadata.base import Ontology, OntologyList, OntologyHierarchical, OntologyObo, \
     OntologyOboCustom, OntologyCl, OntologyHancestro, OntologyUberon, OntologyHsapdv, OntologyMondo, \
-    OntologyMmusdv, OntologySex, OntologyEfo, OntologyCellosaurus
+    OntologyMmusdv, OntologySex, OntologyEfo, OntologyCellosaurus, OntologyTaxon, OntologyUberonLifecyclestage
 from sfaira.versions.metadata.universe import CelltypeUniverse
diff --git a/sfaira/versions/metadata/base.py b/sfaira/versions/metadata/base.py
index dacf85591..d62c7e86a 100644
--- a/sfaira/versions/metadata/base.py
+++ b/sfaira/versions/metadata/base.py
@@ -6,10 +6,11 @@
 import owlready2
 import pickle
 import requests
+from functools import lru_cache
 from typing import Dict, List, Tuple, Union

-from sfaira.consts.directories import CACHE_DIR_ONTOLOGIES
+from sfaira import settings

 """
 Ontology managament classes.
 """
@@ -33,7 +34,7 @@ def cached_load_file(url, ontology_cache_dir, ontology_cache_fn, recache: bool =
         # TODO add caching option.
         obofile = url
     else:
-        ontology_cache_dir = os.path.join(CACHE_DIR_ONTOLOGIES, ontology_cache_dir)
+        ontology_cache_dir = os.path.join(settings.cachedir_ontologies, ontology_cache_dir)
         obofile = os.path.join(ontology_cache_dir, ontology_cache_fn)
         # Download if necessary:
         if not os.path.isfile(obofile) or recache:
@@ -62,7 +63,7 @@ def cached_load_ebi(ontology_cache_dir, ontology_cache_fn, recache: bool = False
     :param recache:
     :return:
     """
-    ontology_cache_dir = os.path.join(CACHE_DIR_ONTOLOGIES, ontology_cache_dir)
+    ontology_cache_dir = os.path.join(settings.cachedir_ontologies, ontology_cache_dir)
     picklefile = os.path.join(ontology_cache_dir, ontology_cache_fn)
     if os.path.isfile(picklefile) and not recache:
         with open(picklefile, 'rb') as f:
@@ -119,6 +120,35 @@ def __init__(
     def node_names(self) -> List[str]:
         return self.nodes

+    @property
+    def node_ids(self) -> List[str]:
+        return self.nodes
+
+    @property
+    def leaves(self) -> List[str]:
+        return self.nodes
+
+    @property
+    def n_leaves(self) -> int:
+        return len(self.nodes)
+
+    def prepare_maps_to_leaves(
+            self,
+            include_self: bool = True
+    ) -> Dict[str, np.ndarray]:
+        """
+        Precomputes all maps of nodes to their leaf nodes.
+
+        Note that for a list ontology, this maps each node to itself.
+
+        :param include_self: whether to include the node itself
+        :return: Dictionary of index vectors of leaf node matches for each node (key).
+        """
+        if include_self:
+            return dict([(x, np.array([self.leaves.index(x)])) for x in self.leaves])
+        else:
+            return dict([(x, np.array([])) for x in self.leaves])
+
     def is_a_node_id(self, x: str) -> bool:
         return x in self.node_names
@@ -167,12 +197,25 @@ def is_a(self, query: str, reference: str) -> bool:
         """
         return query == reference

+    def get_ancestors(self, node: str) -> List[str]:
+        return []
+

 class OntologyHierarchical(Ontology, abc.ABC):
     """
     Basic ordered ontology container
     """
-    graph: networkx.MultiDiGraph
+    _graph: networkx.MultiDiGraph
+
+    @property
+    def graph(self) -> networkx.MultiDiGraph:
+        return self._graph
+
+    @graph.setter
+    def graph(self, graph: networkx.MultiDiGraph):
+        self._graph = graph
+        self.get_ancestors.cache_clear()
+        self.get_descendants.cache_clear()

     def _check_graph(self):
         if not networkx.is_directed_acyclic_graph(self.graph):
@@ -259,6 +302,10 @@ def convert_to_id(self, x: Union[str, List[str]]) -> Union[str, List[str]]:
         else:
             return x

+    @lru_cache(maxsize=None)
+    def __convert_to_id_cached(self, x: str) -> str:
+        return self.convert_to_id(x)
+
     @property
     def leaves(self) -> List[str]:
         return [x for x in self.graph.nodes() if self.graph.in_degree(x) == 0]
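The __convert_to_id_cached helper and the new graph property above implement a memoize-and-invalidate pattern: expensive graph traversals are cached with functools.lru_cache, and the caches are dropped whenever the graph is replaced, since stale entries would refer to the old graph. A minimal self-contained sketch of the pattern (illustrative class, not sfaira code):

```python
from functools import lru_cache

import networkx


class CachedDag:
    """Illustrative stand-in for OntologyHierarchical's caching scheme."""

    def __init__(self, graph: networkx.MultiDiGraph):
        self._graph = graph

    @property
    def graph(self) -> networkx.MultiDiGraph:
        return self._graph

    @graph.setter
    def graph(self, graph: networkx.MultiDiGraph):
        self._graph = graph
        # Cached results refer to the old graph, so drop them on reassignment:
        self.get_ancestors.cache_clear()

    @lru_cache(maxsize=None)
    def get_ancestors(self, node: str):
        # networkx.ancestors is a full traversal, hence worth memoizing:
        return list(networkx.ancestors(self.graph, node))


dag = CachedDag(graph=networkx.MultiDiGraph([("child", "parent")]))
assert dag.get_ancestors("parent") == ["child"]
```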
""" - x = self.convert_to_id(x=x) + x = self.convert_to_id(x) nodes_to_remove = [] for y in self.graph.nodes(): - if not np.any([self.is_a(query=z, reference=y) for z in x]): + if not np.any([self.is_a(query=z, reference=y, convert_to_id=False) for z in x]): nodes_to_remove.append(y) self.graph.remove_nodes_from(nodes_to_remove) @@ -299,32 +346,36 @@ def get_effective_leaves(self, x: List[str]) -> List[str]: assert isinstance(x, list), "supply either list or str to get_effective_leaves" if len(x) == 0: raise ValueError("x was empty list, get_effective_leaves cannot be called on empty list") - x = np.unique(x).tolist() + x = list(np.unique(x)) x = self.convert_to_id(x=x) leaves = [] for y in x: - if not np.any([self.is_a(query=z, reference=y) for z in list(set(x) - {y})]): + if not np.any([self.is_a(query=z, reference=y, convert_to_id=False) for z in list(set(x) - {y})]): leaves.append(y) return leaves + @lru_cache(maxsize=None) def get_ancestors(self, node: str) -> List[str]: - node = self.convert_to_id(node) + node = self.__convert_to_id_cached(node) return list(networkx.ancestors(self.graph, node)) + @lru_cache(maxsize=None) def get_descendants(self, node: str) -> List[str]: - node = self.convert_to_id(node) + node = self.__convert_to_id_cached(node) return list(networkx.descendants(self.graph, node)) - def is_a(self, query: str, reference: str) -> bool: + def is_a(self, query: str, reference: str, convert_to_id: bool = True) -> bool: """ Checks if query node is reference node or an ancestor thereof. :param query: Query node name. Node ID or name. :param reference: Reference node name. Node ID or name. + :param convert_to_id: Whether to call self.convert_to_id on `query` and `reference` input arguments :return: If query node is reference node or an ancestor thereof. """ - query = self.convert_to_id(query) - reference = self.convert_to_id(reference) + if convert_to_id: + query = self.__convert_to_id_cached(query) + reference = self.__convert_to_id_cached(reference) return query in self.get_ancestors(node=reference) or query == reference def map_to_leaves( @@ -344,13 +395,11 @@ def map_to_leaves( :param include_self: DEPRECEATED. :return: """ - node = self.convert_to_id(node) + node = self.__convert_to_id_cached(node) ancestors = self.get_ancestors(node) # Add node itself to list of ancestors. ancestors = ancestors + [node] - if len(ancestors) > 0: - ancestors = self.convert_to_id(ancestors) - leaves = self.convert_to_id(self.leaves) + leaves = self.leaves if return_type == "ids": return [x for x in leaves if x in ancestors] elif return_type == "idx": @@ -377,6 +426,10 @@ def prepare_maps_to_leaves( print(f"time for precomputing ancestors: {time.time()-t0}") return maps + def reset_root(self, root: str): + new_nodes = [self.convert_to_id(x=root)] + self.get_ancestors(node=root) + self.graph = self.graph.subgraph(nodes=new_nodes) + @abc.abstractmethod def synonym_node_properties(self) -> List[str]: pass @@ -800,6 +853,23 @@ def synonym_node_properties(self) -> List[str]: return ["synonym", "latin term", "has relational adjective"] +class OntologyUberonLifecyclestage(OntologyUberon): + + """ + Subset of UBERON for generic life cycle stages that can be used for organism not covered by specific developmental + ontologies. 
+ """ + + def __init__( + self, + branch: str, + recache: bool = False, + **kwargs + ): + super().__init__(branch=branch, recache=recache, **kwargs) + self.reset_root(root="UBERON:0000105") + + class OntologyCl(OntologyExtendedObo): def __init__( @@ -922,7 +992,7 @@ def __init__( recache: bool = False, **kwargs ): - # URL for releases: + # URL for releases, not used here yet because versioning with respect to releases below is not consistent yet. # url=f"https://raw.githubusercontent.com/obophenotype/developmental-stage-ontologies/{branch}/src/mmusdv/mmusdv.obo" obofile = cached_load_file( url="http://ontologies.berkeleybop.org/mmusdv.obo", @@ -1032,6 +1102,42 @@ def __init__(self, recache: bool = False): ) +class OntologyTaxon(OntologyExtendedObo): + + """ + Note on ontology: The same repo also contains ncbitaxon.obs, the full ontology which is ~500MB large and + takes multiple minutes to load. We are using a reduced version, taxslim, here. + + See also https://github.com/obophenotype/ncbitaxon/releases/download/{branch}/ncbitaxon.obo. + """ + + def __init__( + self, + branch: str, + recache: bool = False, + **kwargs + ): + obofile = cached_load_file( + url=f"https://github.com/obophenotype/ncbitaxon/releases/download/{branch}/taxslim.obo", + ontology_cache_dir="ncbitaxon", + ontology_cache_fn=f"ncbitaxon_{branch}.obo", + recache=recache, + ) + super().__init__(obo=obofile) + + # Clean up nodes: + nodes_to_delete = [] + for k, v in self.graph.nodes.items(): + if "name" not in v.keys(): + nodes_to_delete.append(k) + for k in nodes_to_delete: + self.graph.remove_node(k) + + @property + def synonym_node_properties(self) -> List[str]: + return ["synonym"] + + class OntologyEfo(OntologyExtendedObo): def __init__( diff --git a/sfaira/versions/metadata/extensions/__init__.py b/sfaira/versions/metadata/extensions/__init__.py deleted file mode 100644 index 6fe32c7de..000000000 --- a/sfaira/versions/metadata/extensions/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .obo_extension import ONTOLOGIY_EXTENSION diff --git a/sfaira/versions/metadata/extensions/obo_extension.py b/sfaira/versions/metadata/extensions/obo_extension.py deleted file mode 100644 index 2a54299ae..000000000 --- a/sfaira/versions/metadata/extensions/obo_extension.py +++ /dev/null @@ -1 +0,0 @@ -ONTOLOGIY_EXTENSION = {} diff --git a/sfaira/versions/metadata/universe.py b/sfaira/versions/metadata/universe.py index f929c8214..9cc19a2dc 100644 --- a/sfaira/versions/metadata/universe.py +++ b/sfaira/versions/metadata/universe.py @@ -2,8 +2,7 @@ import pandas as pd from typing import Dict, List, Tuple, Union -from sfaira.versions.metadata import OntologyCl, OntologyUberon -from sfaira.versions.metadata.extensions import ONTOLOGIY_EXTENSION +from sfaira.versions.metadata import OntologyCl, OntologyList, OntologyUberon TARGET_UNIVERSE_KEY_NAME = "name" TARGET_UNIVERSE_KEY_ID = "id" @@ -20,14 +19,10 @@ class CelltypeUniverse: onto_uberon: OntologyUberon _target_universe: Union[List[str], None] - def __init__(self, cl: OntologyCl, uberon: OntologyUberon, **kwargs): + def __init__(self, cl: Union[OntologyCl, OntologyList], uberon: OntologyUberon, **kwargs): self.onto_cl = cl self.onto_uberon = uberon self._target_universe = None - self._set_extension() - - def _set_extension(self): - self.onto_cl.add_extension(ONTOLOGIY_EXTENSION) def __validate_target_universe_table(self, tab: pd.DataFrame): assert len(tab.columns) == 2 diff --git a/sfaira/versions/topologies/__init__.py b/sfaira/versions/topologies/__init__.py index 16095b0e5..79eb0529e 
diff --git a/sfaira/versions/topologies/__init__.py b/sfaira/versions/topologies/__init__.py
index 16095b0e5..79eb0529e 100644
--- a/sfaira/versions/topologies/__init__.py
+++ b/sfaira/versions/topologies/__init__.py
@@ -1,34 +1,34 @@
-from . import human
-from . import mouse
+from . import homosapiens
+from . import musmusculus
 from .class_interface import TopologyContainer

 TOPOLOGIES = {
-    "mouse": {
+    "Mus musculus": {
         "celltype": {
-            "marker": mouse.celltype.celltypemarker.CELLTYPEMARKER_TOPOLOGIES,
-            "mlp": mouse.celltype.celltypemlp.CELLTYPEMLP_TOPOLOGIES
+            "marker": musmusculus.celltype.celltypemarker.CELLTYPEMARKER_TOPOLOGIES,
+            "mlp": musmusculus.celltype.celltypemlp.CELLTYPEMLP_TOPOLOGIES
         },
         "embedding": {
-            "ae": mouse.embedding.ae.AE_TOPOLOGIES,
-            "linear": mouse.embedding.linear.LINEAR_TOPOLOGIES,
-            "nmf": mouse.embedding.nmf.NMF_TOPOLOGIES,
-            "vae": mouse.embedding.vae.VAE_TOPOLOGIES,
-            "vaeiaf": mouse.embedding.vaeiaf.VAEIAF_TOPOLOGIES,
-            "vaevamp": mouse.embedding.vaevamp.VAEVAMP_TOPOLOGIES
+            "ae": musmusculus.embedding.ae.AE_TOPOLOGIES,
+            "linear": musmusculus.embedding.linear.LINEAR_TOPOLOGIES,
+            "nmf": musmusculus.embedding.nmf.NMF_TOPOLOGIES,
+            "vae": musmusculus.embedding.vae.VAE_TOPOLOGIES,
+            "vaeiaf": musmusculus.embedding.vaeiaf.VAEIAF_TOPOLOGIES,
+            "vaevamp": musmusculus.embedding.vaevamp.VAEVAMP_TOPOLOGIES
         }
     },
-    "human": {
+    "Homo sapiens": {
         "celltype": {
-            "marker": human.celltype.celltypemarker.CELLTYPEMARKER_TOPOLOGIES,
-            "mlp": human.celltype.celltypemlp.CELLTYPEMLP_TOPOLOGIES
+            "marker": homosapiens.celltype.celltypemarker.CELLTYPEMARKER_TOPOLOGIES,
+            "mlp": homosapiens.celltype.celltypemlp.CELLTYPEMLP_TOPOLOGIES
         },
         "embedding": {
-            "ae": human.embedding.ae.AE_TOPOLOGIES,
-            "linear": human.embedding.linear.LINEAR_TOPOLOGIES,
-            "nmf": human.embedding.nmf.NMF_TOPOLOGIES,
-            "vae": human.embedding.vae.VAE_TOPOLOGIES,
-            "vaeiaf": human.embedding.vaeiaf.VAEIAF_TOPOLOGIES,
-            "vaevamp": human.embedding.vaevamp.VAEVAMP_TOPOLOGIES
+            "ae": homosapiens.embedding.ae.AE_TOPOLOGIES,
+            "linear": homosapiens.embedding.linear.LINEAR_TOPOLOGIES,
+            "nmf": homosapiens.embedding.nmf.NMF_TOPOLOGIES,
+            "vae": homosapiens.embedding.vae.VAE_TOPOLOGIES,
+            "vaeiaf": homosapiens.embedding.vaeiaf.VAEIAF_TOPOLOGIES,
+            "vaevamp": homosapiens.embedding.vaevamp.VAEVAMP_TOPOLOGIES
         }
     }
 }
diff --git a/sfaira/versions/topologies/class_interface.py b/sfaira/versions/topologies/class_interface.py
index b7e1025f5..eac9f9f89 100644
--- a/sfaira/versions/topologies/class_interface.py
+++ b/sfaira/versions/topologies/class_interface.py
@@ -1,6 +1,6 @@
 from typing import Union

-from sfaira.versions.genomes.genomes import GenomeContainer
+from sfaira.versions.genomes.genomes import GenomeContainer, ReactiveFeatureContainer

 class TopologyContainer:
@@ -17,11 +17,17 @@ def __init__(
     ):
         self.topology = topology
         if custom_genome_constainer is None:
-            self.gc = GenomeContainer(assembly=self.topology["input"]["genome"])
+            if self.topology["input"]["genome"] is not None:
+                self.gc = GenomeContainer(
+                    organism=" ".join(self.topology["input"]["genome"].split(".")[0].split("_")),
+                    release=self.topology["input"]["genome"].split(".")[-1],
+                )
+            else:
+                self.gc = ReactiveFeatureContainer()
         else:
             assert isinstance(custom_genome_constainer, GenomeContainer)
             self.gc = custom_genome_constainer
-        self.gc.subset(**dict([tuple(self.topology["input"]["genes"])]))
+        self.gc.set(**dict([tuple(self.topology["input"]["genes"])]))
         self.topology_id = topology_id

     @property
diff --git a/sfaira/versions/topologies/homosapiens/__init__.py b/sfaira/versions/topologies/homosapiens/__init__.py
new file mode 100644
index 000000000..b8d49f1e2
--- /dev/null
+++ b/sfaira/versions/topologies/homosapiens/__init__.py
@@ -0,0 +1,2 @@
+from sfaira.versions.topologies.homosapiens import celltype
+from sfaira.versions.topologies.homosapiens import embedding
diff --git a/sfaira/versions/topologies/homosapiens/celltype/__init__.py b/sfaira/versions/topologies/homosapiens/celltype/__init__.py
new file mode 100644
index 000000000..6cb9c1214
--- /dev/null
+++ b/sfaira/versions/topologies/homosapiens/celltype/__init__.py
@@ -0,0 +1,2 @@
+from sfaira.versions.topologies.homosapiens.celltype.celltypemarker import CELLTYPEMARKER_TOPOLOGIES
+from sfaira.versions.topologies.homosapiens.celltype.celltypemlp import CELLTYPEMLP_TOPOLOGIES
diff --git a/sfaira/versions/topologies/human/celltype/celltypemarker.py b/sfaira/versions/topologies/homosapiens/celltype/celltypemarker.py
similarity index 100%
rename from sfaira/versions/topologies/human/celltype/celltypemarker.py
rename to sfaira/versions/topologies/homosapiens/celltype/celltypemarker.py
diff --git a/sfaira/versions/topologies/human/celltype/celltypemlp.py b/sfaira/versions/topologies/homosapiens/celltype/celltypemlp.py
similarity index 100%
rename from sfaira/versions/topologies/human/celltype/celltypemlp.py
rename to sfaira/versions/topologies/homosapiens/celltype/celltypemlp.py
diff --git a/sfaira/versions/topologies/homosapiens/embedding/__init__.py b/sfaira/versions/topologies/homosapiens/embedding/__init__.py
new file mode 100644
index 000000000..fa646a353
--- /dev/null
+++ b/sfaira/versions/topologies/homosapiens/embedding/__init__.py
@@ -0,0 +1,6 @@
+from sfaira.versions.topologies.homosapiens.embedding.ae import AE_TOPOLOGIES
+from sfaira.versions.topologies.homosapiens.embedding.linear import LINEAR_TOPOLOGIES
+from sfaira.versions.topologies.homosapiens.embedding.nmf import NMF_TOPOLOGIES
+from sfaira.versions.topologies.homosapiens.embedding.vae import VAE_TOPOLOGIES
+from sfaira.versions.topologies.homosapiens.embedding.vaeiaf import VAEIAF_TOPOLOGIES
+from sfaira.versions.topologies.homosapiens.embedding.vaevamp import VAEVAMP_TOPOLOGIES
diff --git a/sfaira/versions/topologies/human/embedding/ae.py b/sfaira/versions/topologies/homosapiens/embedding/ae.py
similarity index 100%
rename from sfaira/versions/topologies/human/embedding/ae.py
rename to sfaira/versions/topologies/homosapiens/embedding/ae.py
diff --git a/sfaira/versions/topologies/human/embedding/linear.py b/sfaira/versions/topologies/homosapiens/embedding/linear.py
similarity index 100%
rename from sfaira/versions/topologies/human/embedding/linear.py
rename to sfaira/versions/topologies/homosapiens/embedding/linear.py
diff --git a/sfaira/versions/topologies/human/embedding/nmf.py b/sfaira/versions/topologies/homosapiens/embedding/nmf.py
similarity index 100%
rename from sfaira/versions/topologies/human/embedding/nmf.py
rename to sfaira/versions/topologies/homosapiens/embedding/nmf.py
diff --git a/sfaira/versions/topologies/human/embedding/vae.py b/sfaira/versions/topologies/homosapiens/embedding/vae.py
similarity index 100%
rename from sfaira/versions/topologies/human/embedding/vae.py
rename to sfaira/versions/topologies/homosapiens/embedding/vae.py
diff --git a/sfaira/versions/topologies/human/embedding/vaeiaf.py b/sfaira/versions/topologies/homosapiens/embedding/vaeiaf.py
similarity index 100%
rename from sfaira/versions/topologies/human/embedding/vaeiaf.py
rename to sfaira/versions/topologies/homosapiens/embedding/vaeiaf.py
diff --git a/sfaira/versions/topologies/human/embedding/vaevamp.py b/sfaira/versions/topologies/homosapiens/embedding/vaevamp.py
similarity index 100%
rename from sfaira/versions/topologies/human/embedding/vaevamp.py
rename to sfaira/versions/topologies/homosapiens/embedding/vaevamp.py
diff --git a/sfaira/versions/topologies/human/__init__.py b/sfaira/versions/topologies/human/__init__.py
deleted file mode 100644
index 6630987ef..000000000
--- a/sfaira/versions/topologies/human/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from sfaira.versions.topologies.human import celltype
-from sfaira.versions.topologies.human import embedding
diff --git a/sfaira/versions/topologies/human/celltype/__init__.py b/sfaira/versions/topologies/human/celltype/__init__.py
deleted file mode 100644
index e657e4e78..000000000
--- a/sfaira/versions/topologies/human/celltype/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from sfaira.versions.topologies.human.celltype.celltypemarker import CELLTYPEMARKER_TOPOLOGIES
-from sfaira.versions.topologies.human.celltype.celltypemlp import CELLTYPEMLP_TOPOLOGIES
diff --git a/sfaira/versions/topologies/human/embedding/__init__.py b/sfaira/versions/topologies/human/embedding/__init__.py
deleted file mode 100644
index 829c1623e..000000000
--- a/sfaira/versions/topologies/human/embedding/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from sfaira.versions.topologies.human.embedding.ae import AE_TOPOLOGIES
-from sfaira.versions.topologies.human.embedding.linear import LINEAR_TOPOLOGIES
-from sfaira.versions.topologies.human.embedding.nmf import NMF_TOPOLOGIES
-from sfaira.versions.topologies.human.embedding.vae import VAE_TOPOLOGIES
-from sfaira.versions.topologies.human.embedding.vaeiaf import VAEIAF_TOPOLOGIES
-from sfaira.versions.topologies.human.embedding.vaevamp import VAEVAMP_TOPOLOGIES
diff --git a/sfaira/versions/topologies/mouse/__init__.py b/sfaira/versions/topologies/mouse/__init__.py
deleted file mode 100644
index 4105b813a..000000000
--- a/sfaira/versions/topologies/mouse/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from sfaira.versions.topologies.mouse import celltype
-from sfaira.versions.topologies.mouse import embedding
diff --git a/sfaira/versions/topologies/mouse/celltype/__init__.py b/sfaira/versions/topologies/mouse/celltype/__init__.py
deleted file mode 100644
index d98e272bf..000000000
--- a/sfaira/versions/topologies/mouse/celltype/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from sfaira.versions.topologies.mouse.celltype.celltypemarker import CELLTYPEMARKER_TOPOLOGIES
-from sfaira.versions.topologies.mouse.celltype.celltypemlp import CELLTYPEMLP_TOPOLOGIES
diff --git a/sfaira/versions/topologies/mouse/embedding/__init__.py b/sfaira/versions/topologies/mouse/embedding/__init__.py
deleted file mode 100644
index d36c96479..000000000
--- a/sfaira/versions/topologies/mouse/embedding/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from sfaira.versions.topologies.mouse.embedding.ae import AE_TOPOLOGIES
-from sfaira.versions.topologies.mouse.embedding.linear import LINEAR_TOPOLOGIES
-from sfaira.versions.topologies.mouse.embedding.nmf import NMF_TOPOLOGIES
-from sfaira.versions.topologies.mouse.embedding.vae import VAE_TOPOLOGIES
-from sfaira.versions.topologies.mouse.embedding.vaeiaf import VAEIAF_TOPOLOGIES
-from sfaira.versions.topologies.mouse.embedding.vaevamp import VAEVAMP_TOPOLOGIES
diff --git a/sfaira/versions/topologies/musmusculus/__init__.py b/sfaira/versions/topologies/musmusculus/__init__.py
new file mode 100644
index 000000000..d527b6a94
--- /dev/null
+++ b/sfaira/versions/topologies/musmusculus/__init__.py
@@ -0,0 +1,2 @@
+from sfaira.versions.topologies.musmusculus import celltype
+from sfaira.versions.topologies.musmusculus import embedding
diff --git a/sfaira/versions/topologies/musmusculus/celltype/__init__.py b/sfaira/versions/topologies/musmusculus/celltype/__init__.py
new file mode 100644
index 000000000..3b9799907
--- /dev/null
+++ b/sfaira/versions/topologies/musmusculus/celltype/__init__.py
@@ -0,0 +1,2 @@
+from sfaira.versions.topologies.musmusculus.celltype.celltypemarker import CELLTYPEMARKER_TOPOLOGIES
+from sfaira.versions.topologies.musmusculus.celltype.celltypemlp import CELLTYPEMLP_TOPOLOGIES
diff --git a/sfaira/versions/topologies/mouse/celltype/celltypemarker.py b/sfaira/versions/topologies/musmusculus/celltype/celltypemarker.py
similarity index 100%
rename from sfaira/versions/topologies/mouse/celltype/celltypemarker.py
rename to sfaira/versions/topologies/musmusculus/celltype/celltypemarker.py
diff --git a/sfaira/versions/topologies/mouse/celltype/celltypemlp.py b/sfaira/versions/topologies/musmusculus/celltype/celltypemlp.py
similarity index 100%
rename from sfaira/versions/topologies/mouse/celltype/celltypemlp.py
rename to sfaira/versions/topologies/musmusculus/celltype/celltypemlp.py
diff --git a/sfaira/versions/topologies/musmusculus/embedding/__init__.py b/sfaira/versions/topologies/musmusculus/embedding/__init__.py
new file mode 100644
index 000000000..9c0aebac1
--- /dev/null
+++ b/sfaira/versions/topologies/musmusculus/embedding/__init__.py
@@ -0,0 +1,6 @@
+from sfaira.versions.topologies.musmusculus.embedding.ae import AE_TOPOLOGIES
+from sfaira.versions.topologies.musmusculus.embedding.linear import LINEAR_TOPOLOGIES
+from sfaira.versions.topologies.musmusculus.embedding.nmf import NMF_TOPOLOGIES
+from sfaira.versions.topologies.musmusculus.embedding.vae import VAE_TOPOLOGIES
+from sfaira.versions.topologies.musmusculus.embedding.vaeiaf import VAEIAF_TOPOLOGIES
+from sfaira.versions.topologies.musmusculus.embedding.vaevamp import VAEVAMP_TOPOLOGIES
diff --git a/sfaira/versions/topologies/mouse/embedding/ae.py b/sfaira/versions/topologies/musmusculus/embedding/ae.py
similarity index 100%
rename from sfaira/versions/topologies/mouse/embedding/ae.py
rename to sfaira/versions/topologies/musmusculus/embedding/ae.py
diff --git a/sfaira/versions/topologies/mouse/embedding/linear.py b/sfaira/versions/topologies/musmusculus/embedding/linear.py
similarity index 100%
rename from sfaira/versions/topologies/mouse/embedding/linear.py
rename to sfaira/versions/topologies/musmusculus/embedding/linear.py
diff --git a/sfaira/versions/topologies/mouse/embedding/nmf.py b/sfaira/versions/topologies/musmusculus/embedding/nmf.py
similarity index 100%
rename from sfaira/versions/topologies/mouse/embedding/nmf.py
rename to sfaira/versions/topologies/musmusculus/embedding/nmf.py
diff --git a/sfaira/versions/topologies/mouse/embedding/vae.py b/sfaira/versions/topologies/musmusculus/embedding/vae.py
similarity index 100%
rename from sfaira/versions/topologies/mouse/embedding/vae.py
rename to sfaira/versions/topologies/musmusculus/embedding/vae.py
diff --git a/sfaira/versions/topologies/mouse/embedding/vaeiaf.py b/sfaira/versions/topologies/musmusculus/embedding/vaeiaf.py
similarity index 100%
rename from sfaira/versions/topologies/mouse/embedding/vaeiaf.py
rename to sfaira/versions/topologies/musmusculus/embedding/vaeiaf.py
diff --git a/sfaira/versions/topologies/mouse/embedding/vaevamp.py b/sfaira/versions/topologies/musmusculus/embedding/vaevamp.py
similarity index 100%
rename from sfaira/versions/topologies/mouse/embedding/vaevamp.py
rename to sfaira/versions/topologies/musmusculus/embedding/vaevamp.py
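Taken together, the renames above settle on two organism spellings: model ids use the concatenated lowercase token, while topology and store dictionaries are keyed by the full species name. A closing usage sketch of both conventions (assumes ModelZoo is importable from sfaira.ui, as in the unit tests above):

```python
from sfaira.ui import ModelZoo
from sfaira.versions.topologies import TOPOLOGIES

# Model ids use the concatenated lowercase token:
zoo = ModelZoo()
zoo.model_id = "embedding_homosapiens-lung-linear-0.1-0.1_mylab"
assert zoo.model_name == "homosapiens-lung-linear-0.1-0.1"

# Topology dictionaries are keyed by the full species name:
topologies = TOPOLOGIES["Homo sapiens"]["embedding"]["linear"]  # was TOPOLOGIES["human"][...]
```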