Commit b0df842

Dev (#286)
* split CLI template test into annotate and test (#272)
* correct assay_sc (#272)
* mitigate merge conflict between dev and release

Signed-off-by: zethson <lukas.heumos@posteo.net>
davidsebfischer authored May 31, 2021
1 parent 27ac19a commit b0df842
Showing 7 changed files with 226 additions and 141 deletions.
4 changes: 2 additions & 2 deletions README.rst
@@ -30,7 +30,7 @@ sfaira - data and model repository for single-cell data
    :align: center
 
 sfaira_ is a model and a data repository in a single python package (preprint_).
-We provide an interactive overview of the current state of the zoos on sfaira-site_.
+We provide an interactive overview of the current state of the zoos on sfaira-portal_.
 
 Its data zoo gives users access to streamlined data loaders that allow reproducible use of published and private data sets for model training and exploration.
 Its model zoo gives users streamlined access to pre-trained models and to common model architectures to ease usage of neural networks in common single-cell analysis workflows:
@@ -43,4 +43,4 @@ sfaira integrates into scanpy_ workflows.
 .. _preprint: https://www.biorxiv.org/content/10.1101/2020.12.16.419036v1
 .. _DCA: https://github.com/theislab/dca
 .. _scArches: https://github.com/theislab/scarches
-.. _sfaira-site: https://theislab.github.io/sfaira-site/index.html
+.. _sfaira-portal: https://theislab.github.io/sfaira-portal/
4 changes: 2 additions & 2 deletions docs/index.rst
@@ -15,10 +15,10 @@ sfaira - data and model repository for single-cell data
    :align: center
 
 sfaira_ is a model and a data repository in a single python package.
-We provide an interactive overview of the current state of the zoos on sfaira-site_.
+We provide an interactive overview of the current state of the zoos on sfaira-portal_.
 
 .. _sfaira: https://www.biorxiv.org/content/10.1101/2020.12.16.419036v1
-.. _sfaira-site: https://theislab.github.io/sfaira-site/index.html
+.. _sfaira-portal: https://theislab.github.io/sfaira-portal/
 
 .. include:: environment_brief.rst
 
16 changes: 16 additions & 0 deletions sfaira/cli.py
@@ -7,6 +7,8 @@
import rich.logging
from rich import traceback
from rich import print

from sfaira.commands.annotate_dataloader import DataloaderAnnotater
from sfaira.commands.test_dataloader import DataloaderTester

from sfaira.commands.clean_dataloader import DataloaderCleaner
@@ -102,6 +104,20 @@ def validate_dataloader(path) -> None:
    dataloader_validator.validate()


@sfaira_cli.command()
@click.argument('path', type=click.Path(exists=True))
@click.option('--doi', type=str, default=None)
@click.option('--test-data', type=click.Path(exists=True))
def annotate_dataloader(path, doi, test_data) -> None:
    """
    Annotates a dataloader.
    PATH is the absolute path of the root of your sfaira clone.
    """
    dataloader_annotater = DataloaderAnnotater()
    dataloader_annotater.annotate(path, doi, test_data)


@sfaira_cli.command()
@click.argument('path', type=click.Path(exists=True))
@click.option('--test-data', type=click.Path(exists=True))
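The new annotate_dataloader subcommand simply wires the CLI to the DataloaderAnnotater class introduced below. For orientation, a minimal sketch of the equivalent direct Python call; the clone path, DOI and raw-data directory here are placeholders, not part of this commit:

from sfaira.commands.annotate_dataloader import DataloaderAnnotater

annotater = DataloaderAnnotater()
annotater.annotate(
    path="/path/to/sfaira",             # root of your sfaira clone (hypothetical path)
    doi="10.1016/j.cmet.2019.01.021",   # DOI of the study the loader belongs to (example value)
    test_data="/path/to/raw/data",      # directory holding the raw data files (hypothetical path)
)
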
179 changes: 179 additions & 0 deletions sfaira/commands/annotate_dataloader.py
@@ -0,0 +1,179 @@
import os
import pydoc
import shutil

from sfaira.data import DatasetGroupDirectoryOriented, DatasetGroup, DatasetBase
from sfaira.data.utils import read_yaml

try:
    import sfaira_extension as sfairae
except ImportError:
    sfairae = None


class DataloaderAnnotater:

    def __init__(self):
        self.WD = os.path.dirname(__file__)
        self.file_path = None
        self.file_path_sfairae = None
        self.meta_path = None
        self.cache_path = None
        self.dir_loader = None
        self.dir_loader_sfairae = None
        self.package_source = None

    def annotate(self, path: str, doi: str, test_data: str):
        """
        Annotates a provided dataloader.

        Moderate the suggestions made here: choose the best-fitting cell ontology label for your cells.
        Sfaira uses multiple mechanisms to find matches; depending on how the free text was generated, these may be
        differentially successful. The proposed ID groups are separated by ":|||:" strings to give you a visual anchor
        when going through these lists. You need to delete all of these division strings and all labels in the second
        column other than the best-fitting label. Do not change the first column.
        (Note that columns are separated by ",".)
        You can also manually check mappings here: https://www.ebi.ac.uk/ols/ontologies/cl
        """
        doi_sfaira_repr = f'd{doi.translate({ord(c): "_" for c in r"!@#$%^&*()[]/{};:,.<>?|`~-=_+"})}'
        self._setup_loader(doi_sfaira_repr)
        self._annotate(test_data, path, doi)

    def _setup_loader(self, doi_sfaira_repr: str):
        """
        Defines the file names, loader paths and base paths of loader collections for sfaira and sfaira_extension.
        """
        dir_loader_sfaira = "sfaira.data.dataloaders.loaders."
        file_path_sfaira = "/" + "/".join(pydoc.locate(dir_loader_sfaira + "FILE_PATH").split("/")[:-1])
        if sfairae is not None:
            dir_loader_sfairae = "sfaira_extension.data.dataloaders.loaders."
            file_path_sfairae = "/" + "/".join(pydoc.locate(dir_loader_sfairae + "FILE_PATH").split("/")[:-1])
        else:
            dir_loader_sfairae = None
            file_path_sfairae = None
        # Check if the loader name is a directory in either the sfaira or the sfaira_extension loader collection:
        if doi_sfaira_repr in os.listdir(file_path_sfaira):
            dir_loader = dir_loader_sfaira + "." + doi_sfaira_repr
            package_source = "sfaira"
        elif file_path_sfairae is not None and doi_sfaira_repr in os.listdir(file_path_sfairae):
            dir_loader = dir_loader_sfairae + "." + doi_sfaira_repr
            package_source = "sfairae"
        else:
            raise ValueError("data loader not found in sfaira and also not in sfaira_extension")
        file_path = pydoc.locate(dir_loader + ".FILE_PATH")
        meta_path = None
        cache_path = None
        # Clear the dataset cache if a cache path is set:
        if cache_path is not None:
            shutil.rmtree(cache_path, ignore_errors=True)

        self.file_path = file_path
        self.file_path_sfairae = file_path_sfairae
        self.meta_path = meta_path
        self.cache_path = cache_path
        self.dir_loader = dir_loader
        self.dir_loader_sfairae = dir_loader_sfairae
        self.package_source = package_source

    def _get_ds(self, test_data: str):
        ds = DatasetGroupDirectoryOriented(
            file_base=self.file_path,
            data_path=test_data,
            meta_path=None,
            cache_path=None
        )

        return ds

    def buffered_load(self, test_data: str):
        ds = self._get_ds(test_data=test_data)
        # TODO try-except with a good error description saying that the data loader is broken here:
        ds.load(
            remove_gene_version=False,
            match_to_reference=None,
            load_raw=True,  # Force raw load so that future tests are not confounded by data loader bugs cached in previous versions.
            allow_caching=True,
        )

        assert len(ds.ids) > 0, f"no data sets loaded, make sure raw data is in {test_data}"
        return ds

    def _annotate(self, test_data: str, path: str, doi: str):
        ds = self.buffered_load(test_data=test_data)
        # Create cell type conversion table:
        cwd = os.path.dirname(self.file_path)
        dataset_module = str(cwd.split("/")[-1])
        # Group data sets by file module:
        # Note that if we were not grouping the cell type map .tsv files by file module, we could directly call
        # write_ontology_class_map on the ds.
        for f in os.listdir(cwd):
            if os.path.isfile(os.path.join(cwd, f)):  # only files
                # Narrow down to data set files:
                if f.split(".")[-1] == "py" and f.split(".")[0] not in ["__init__", "base", "group"]:
                    file_module = ".".join(f.split(".")[:-1])

                    # I) Instantiate a data set group to get all IDs of data sets associated with this .py file.
                    # Note that all data sets in this directory are already loaded in ds, so we just need the IDs.
                    DatasetFound = pydoc.locate(self.dir_loader + "." + file_module + ".Dataset")
                    # Load objects from name space:
                    # - load(): Loading function that returns an anndata instance.
                    # - SAMPLE_FNS: File name list for DatasetBaseGroupLoadingManyFiles.
                    load_func = pydoc.locate(self.dir_loader + "." + file_module + ".load")
                    load_func_annotation = pydoc.locate(self.dir_loader + "." + file_module + ".LOAD_ANNOTATION")
                    # Also check sfaira_extension for an additional load_func_annotation:
                    if self.package_source != "sfairae" and sfairae is not None:
                        load_func_annotation_sfairae = pydoc.locate(self.dir_loader_sfairae + "." + dataset_module +
                                                                    "." + file_module + ".LOAD_ANNOTATION")
                        # LOAD_ANNOTATION is a dictionary, so we can use update to extend it.
                        if load_func_annotation_sfairae is not None and load_func_annotation is not None:
                            load_func_annotation.update(load_func_annotation_sfairae)
                        elif load_func_annotation_sfairae is not None and load_func_annotation is None:
                            load_func_annotation = load_func_annotation_sfairae
                    sample_fns = pydoc.locate(self.dir_loader + "." + file_module + ".SAMPLE_FNS")
                    fn_yaml = os.path.join(cwd, file_module + ".yaml")
                    fn_yaml = fn_yaml if os.path.exists(fn_yaml) else None
                    # Check for sample_fns in yaml:
                    if fn_yaml is not None:
                        assert os.path.exists(fn_yaml), f"did not find yaml {fn_yaml}"
                        yaml_vals = read_yaml(fn=fn_yaml)
                        if sample_fns is None and yaml_vals["meta"]["sample_fns"] is not None:
                            sample_fns = yaml_vals["meta"]["sample_fns"]
                    if sample_fns is None:
                        sample_fns = [None]
                    # Here we distinguish between classes that are already defined and those that are not.
                    # The latter case arises if meta data are defined in YAMLs and _load is given as a function.
                    if DatasetFound is None:
                        datasets_f = [
                            DatasetBase(
                                data_path=test_data,
                                meta_path=self.meta_path,
                                cache_path=self.cache_path,
                                load_func=load_func,
                                dict_load_func_annotation=load_func_annotation,
                                sample_fn=x,
                                sample_fns=sample_fns if sample_fns != [None] else None,
                                yaml_path=fn_yaml,
                            ) for x in sample_fns
                        ]
                    else:
                        datasets_f = [
                            DatasetFound(
                                data_path=test_data,
                                meta_path=self.meta_path,
                                cache_path=self.cache_path,
                                load_func=load_func,
                                load_func_annotation=load_func_annotation,
                                sample_fn=x,
                                sample_fns=sample_fns if sample_fns != [None] else None,
                                yaml_path=fn_yaml,
                            ) for x in sample_fns
                        ]
                    # II) Build a data set group from the already loaded data sets and use the group ontology writing
                    # function.
                    dsg_f = DatasetGroup(datasets=dict([(x.id, ds.datasets[x.id]) for x in datasets_f]))
                    # III) Write this directly into the sfaira clone so that it can be committed via git.
                    # TODO any errors not to be caught here?
                    doi_sfaira_repr = f'd{doi.translate({ord(c): "_" for c in r"!@#$%^&*()[]/{};:,.<>?|`~-=_+"})}'
                    dsg_f.write_ontology_class_map(
                        fn=os.path.join(f"{path}/sfaira/data/dataloaders/loaders/{doi_sfaira_repr}/{file_module}.tsv"),
                        protected_writing=True,
                        n_suggest=4,
                    )
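
Two notes on the helper above, both illustrative rather than part of the commit. First, the doi_sfaira_repr used to locate the loader directory and the output .tsv is just the DOI with punctuation mapped to underscores and a "d" prefix; a small sketch with an example DOI:

doi = "10.1016/j.cmet.2019.01.021"  # example DOI, for illustration only
doi_sfaira_repr = f'd{doi.translate({ord(c): "_" for c in r"!@#$%^&*()[]/{};:,.<>?|`~-=_+"})}'
print(doi_sfaira_repr)  # prints: d10_1016_j_cmet_2019_01_021

Second, the written class map is the file that the annotate() docstring asks you to curate by hand. A hypothetical row before and after curation (labels invented for illustration; the real suggestions come from write_ontology_class_map):

alpha,pancreatic A cell:|||:type B pancreatic cell:|||:pancreatic endocrine cell
alpha,pancreatic A cell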
@@ -33,7 +33,7 @@ def __init__(self, **kwargs):
         self.normalization = "raw"
         self.organ = "pancreas"
         self.organism = "mouse"
-        self.assay_sc = "Drop-seq"
+        self.assay_sc = "10x 3' v2"
         self.state_exact = "diabetic"
         self.year = 2019
         self.sample_source = "primary_tissue"
1 change: 1 addition & 0 deletions sfaira/interface/user_interface.py
@@ -27,6 +27,7 @@ class UserInterface:
    # initialise your sfaira instance with a model lookuptable.
    # instead of setting `custom_repo` when initialising the UI you can also use `sfaira_repo=True` to use public weights
    ui = sfaira.ui.UserInterface(custom_repo="/path/to/local/repo/folder/or/zenodo/repo/URL", sfaira_repo=False)
    ui.load_data(anndata.read("/path/to/file.h5ad"))  # load your dataset into sfaira
    ui.zoo_embedding.model_id = 'embedding_human-blood-ae-0.2-0.1_theislab'  # pick desired model here
    ui.zoo_celltype.model_id = 'celltype_human-blood-mlp-0.1.3-0.1_theislab'  # pick desired model here
    ui.load_data(anndata.read("/path/to/file.h5ad"), gene_symbol_col='index', gene_ens_col='gene_ids')  # load your dataset into sfaira
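
Read end to end, the docstring example amounts to the following minimal sketch; the repository path, h5ad file and model ids are the docstring's own placeholders, not values introduced by this commit:

import anndata
import sfaira

ui = sfaira.ui.UserInterface(custom_repo="/path/to/local/repo/folder/or/zenodo/repo/URL", sfaira_repo=False)
ui.zoo_embedding.model_id = 'embedding_human-blood-ae-0.2-0.1_theislab'    # pick desired embedding model
ui.zoo_celltype.model_id = 'celltype_human-blood-mlp-0.1.3-0.1_theislab'   # pick desired cell type model
ui.load_data(anndata.read("/path/to/file.h5ad"), gene_symbol_col='index', gene_ens_col='gene_ids')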
