split template test into annotate and test (#272)
* split template test into annotate and test

Signed-off-by: zethson <lukas.heumos@posteo.net>

* fix annotation

Signed-off-by: zethson <lukas.heumos@posteo.net>

* fix writing path for annotations

Signed-off-by: zethson <lukas.heumos@posteo.net>

* reverted to old, correct annotation

Signed-off-by: zethson <lukas.heumos@posteo.net>

* fix correct assay_sc

Signed-off-by: zethson <lukas.heumos@posteo.net>
Zethson authored May 28, 2021
1 parent 950abad commit e615869
Showing 4 changed files with 221 additions and 137 deletions.
16 changes: 16 additions & 0 deletions sfaira/cli.py
@@ -7,6 +7,8 @@
import rich.logging
from rich import traceback
from rich import print

from sfaira.commands.annotate_dataloader import DataloaderAnnotater
from sfaira.commands.test_dataloader import DataloaderTester

from sfaira.commands.clean_dataloader import DataloaderCleaner
@@ -102,6 +104,20 @@ def validate_dataloader(path) -> None:
dataloader_validator.validate()


@sfaira_cli.command()
@click.argument('path', type=click.Path(exists=True))
@click.option('--doi', type=str, default=None)
@click.option('--test-data', type=click.Path(exists=True))
def annotate_dataloader(path, doi, test_data) -> None:
"""
Annotates a dataloader.
PATH is the absolute path of the root of your sfaira clone.
"""
dataloader_annotater = DataloaderAnnotater()
dataloader_annotater.annotate(path, doi, test_data)


@sfaira_cli.command()
@click.argument('path', type=click.Path(exists=True))
@click.option('--test-data', type=click.Path(exists=True))
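The new annotate_dataloader command takes the sfaira clone root as PATH plus --doi and --test-data options. A minimal sketch of exercising it in-process via click's test runner, assuming click >= 7 (which derives the command name annotate-dataloader from the function name) and using placeholder paths and DOI:

from click.testing import CliRunner

from sfaira.cli import sfaira_cli

runner = CliRunner()
result = runner.invoke(sfaira_cli, [
    "annotate-dataloader",                  # name derived from annotate_dataloader
    "/path/to/sfaira/clone",                # PATH: root of the sfaira clone (placeholder)
    "--doi", "10.1016/j.cell.2019.06.029",  # placeholder DOI
    "--test-data", "/path/to/raw/data",     # placeholder raw data directory
])
print(result.output)
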
179 changes: 179 additions & 0 deletions sfaira/commands/annotate_dataloader.py
@@ -0,0 +1,179 @@
import os
import pydoc
import shutil

from sfaira.data import DatasetGroupDirectoryOriented, DatasetGroup, DatasetBase
from sfaira.data.utils import read_yaml

try:
import sfaira_extension as sfairae
except ImportError:
sfairae = None


class DataloaderAnnotater:

def __init__(self):
self.WD = os.path.dirname(__file__)
self.file_path = None
self.file_path_sfairae = None
self.meta_path = None
self.cache_path = None
self.dir_loader = None
self.dir_loader_sfairae = None
self.package_source = None

def annotate(self, path: str, doi: str, test_data: str):
"""
Annotates a provided dataloader.
Moderate the suggestions made here: Choose the best fit cell ontology label for your cells.
Sfaira uses multiple mechanisms of finding matches, depending on how the free text was generated, these might be
differentially successful. The proposed IDs groups are separate by ":|||:" strings to give you a visual anchor
when going through these lists. You need to delete all of these division strings and all labels in the second
columns other than the best fit label. Do not change the first column,
(Note that columns are separated by ",")
You can also manually check maps here: https://www.ebi.ac.uk/ols/ontologies/cl
"""
doi_sfaira_repr = f'd{doi.translate({ord(c): "_" for c in r"!@#$%^&*()[]/{};:,.<>?|`~-=_+"})}'
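        # e.g. doi "10.1016/j.cell.2019.06.029" -> "d10_1016_j_cell_2019_06_029"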
self._setup_loader(doi_sfaira_repr)
self._annotate(test_data, path, doi)

def _setup_loader(self, doi_sfaira_repr: str):
"""
        Define the file names, loader paths and base paths of the loader collections for sfaira and sfaira_extension.
"""
dir_loader_sfaira = "sfaira.data.dataloaders.loaders."
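        # FILE_PATH is a module-level constant holding a file path inside the loader
        # collection; dropping the last path component yields the collection directory.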
file_path_sfaira = "/" + "/".join(pydoc.locate(dir_loader_sfaira + "FILE_PATH").split("/")[:-1])
if sfairae is not None:
dir_loader_sfairae = "sfaira_extension.data.dataloaders.loaders."
file_path_sfairae = "/" + "/".join(pydoc.locate(dir_loader_sfairae + "FILE_PATH").split("/")[:-1])
        else:
            dir_loader_sfairae = None
            file_path_sfairae = None
        # Check if the loader name is a directory in either the sfaira or the sfaira_extension loader collection:
        if doi_sfaira_repr in os.listdir(file_path_sfaira):
            dir_loader = dir_loader_sfaira + doi_sfaira_repr
            package_source = "sfaira"
        elif file_path_sfairae is not None and doi_sfaira_repr in os.listdir(file_path_sfairae):
            dir_loader = dir_loader_sfairae + doi_sfaira_repr
            package_source = "sfairae"
        else:
            raise ValueError("data loader not found in sfaira and also not in sfaira_extension")
file_path = pydoc.locate(dir_loader + ".FILE_PATH")
meta_path = None
cache_path = None
        # Clear dataset cache, if any was set (shutil.rmtree cannot take None):
        if cache_path is not None:
            shutil.rmtree(cache_path, ignore_errors=True)

self.file_path = file_path
self.file_path_sfairae = file_path_sfairae
self.meta_path = meta_path
self.cache_path = cache_path
self.dir_loader = dir_loader
self.dir_loader_sfairae = dir_loader_sfairae
self.package_source = package_source

def _get_ds(self, test_data: str):
ds = DatasetGroupDirectoryOriented(
file_base=self.file_path,
data_path=test_data,
meta_path=None,
cache_path=None
)

return ds

def buffered_load(self, test_data: str):
ds = self._get_ds(test_data=test_data)
# TODO try-except with good error description saying that the data loader is broken here:
ds.load(
remove_gene_version=False,
match_to_reference=None,
            load_raw=True,  # Force raw load so that data loader bugs in previous versions do not confound future tests.
allow_caching=True,
)

        assert len(ds.ids) > 0, f"no data sets loaded; make sure raw data is in {test_data}"
return ds

def _annotate(self, test_data: str, path: str, doi: str):
ds = self.buffered_load(test_data=test_data)
# Create cell type conversion table:
cwd = os.path.dirname(self.file_path)
dataset_module = str(cwd.split("/")[-1])
# Group data sets by file module:
# Note that if we were not grouping the cell type map .tsv files by file module, we could directly call
# write_ontology_class_map on the ds.
for f in os.listdir(cwd):
if os.path.isfile(os.path.join(cwd, f)): # only files
# Narrow down to data set files:
if f.split(".")[-1] == "py" and f.split(".")[0] not in ["__init__", "base", "group"]:
file_module = ".".join(f.split(".")[:-1])

# I) Instantiate Data set group to get all IDs of data sets associated with this .py file.
# Note that all data sets in this directory are already loaded in ds, so we just need the IDs.
DatasetFound = pydoc.locate(self.dir_loader + "." + file_module + ".Dataset")
# Load objects from name space:
                    # - load(): Loading function that returns an anndata instance.
# - SAMPLE_FNS: File name list for DatasetBaseGroupLoadingManyFiles
load_func = pydoc.locate(self.dir_loader + "." + file_module + ".load")
load_func_annotation = pydoc.locate(self.dir_loader + "." + file_module + ".LOAD_ANNOTATION")
# Also check sfaira_extension for additional load_func_annotation:
if self.package_source != "sfairae" and sfairae is not None:
load_func_annotation_sfairae = pydoc.locate(self.dir_loader_sfairae + "." + dataset_module +
"." + file_module + ".LOAD_ANNOTATION")
# LOAD_ANNOTATION is a dictionary so we can use update to extend it.
if load_func_annotation_sfairae is not None and load_func_annotation is not None:
load_func_annotation.update(load_func_annotation_sfairae)
elif load_func_annotation_sfairae is not None and load_func_annotation is None:
load_func_annotation = load_func_annotation_sfairae
sample_fns = pydoc.locate(self.dir_loader + "." + file_module + ".SAMPLE_FNS")
fn_yaml = os.path.join(cwd, file_module + ".yaml")
fn_yaml = fn_yaml if os.path.exists(fn_yaml) else None
# Check for sample_fns in yaml:
if fn_yaml is not None:
assert os.path.exists(fn_yaml), f"did not find yaml {fn_yaml}"
yaml_vals = read_yaml(fn=fn_yaml)
if sample_fns is None and yaml_vals["meta"]["sample_fns"] is not None:
sample_fns = yaml_vals["meta"]["sample_fns"]
if sample_fns is None:
sample_fns = [None]
                    # Here we distinguish between classes that are already defined and those that are not.
                    # The latter case arises if metadata are defined in YAMLs and load is given as a function.
if DatasetFound is None:
datasets_f = [
DatasetBase(
data_path=test_data,
meta_path=self.meta_path,
cache_path=self.cache_path,
load_func=load_func,
dict_load_func_annotation=load_func_annotation,
sample_fn=x,
sample_fns=sample_fns if sample_fns != [None] else None,
yaml_path=fn_yaml,
) for x in sample_fns
]
else:
datasets_f = [
DatasetFound(
data_path=test_data,
meta_path=self.meta_path,
cache_path=self.cache_path,
load_func=load_func,
load_func_annotation=load_func_annotation,
sample_fn=x,
sample_fns=sample_fns if sample_fns != [None] else None,
yaml_path=fn_yaml,
) for x in sample_fns
]
# II) Build a data set group from the already loaded data sets and use the group ontology writing
# function.
dsg_f = DatasetGroup(datasets=dict([(x.id, ds.datasets[x.id]) for x in datasets_f]))
# III) Write this directly into the sfaira clone so that it can be committed via git.
# TODO any errors not to be caught here?
doi_sfaira_repr = f'd{doi.translate({ord(c): "_" for c in r"!@#$%^&*()[]/{};:,.<>?|`~-=_+"})}'
dsg_f.write_ontology_class_map(
fn=os.path.join(f"{path}/sfaira/data/dataloaders/loaders/{doi_sfaira_repr}/{file_module}.tsv"),
protected_writing=True,
n_suggest=4,
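                        # n_suggest: number of candidate labels suggested per free-text entry (assumed semantics).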
)
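
For reference, the same flow can be driven without the CLI by using the annotater class directly; a minimal sketch, assuming the same placeholder clone path, DOI and raw-data directory as above:

from sfaira.commands.annotate_dataloader import DataloaderAnnotater

annotater = DataloaderAnnotater()
annotater.annotate(
    path="/path/to/sfaira/clone",       # root of the sfaira clone (placeholder)
    doi="10.1016/j.cell.2019.06.029",   # placeholder DOI
    test_data="/path/to/raw/data",      # placeholder raw data directory
)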
@@ -33,7 +33,7 @@ def __init__(self, **kwargs):
self.normalization = "raw"
self.organ = "pancreas"
self.organism = "mouse"
-        self.assay_sc = "Drop-seq"
+        self.assay_sc = "10x 3' v2"
self.state_exact = "diabetic"
self.year = 2019
self.sample_source = "primary_tissue"