split template test into annotate and test (#272)
* split template test into annotate and test

Signed-off-by: zethson <lukas.heumos@posteo.net>

* fix annotation

Signed-off-by: zethson <lukas.heumos@posteo.net>

* fix writing path for annotations

Signed-off-by: zethson <lukas.heumos@posteo.net>

* reverted to old, correct annotation

Signed-off-by: zethson <lukas.heumos@posteo.net>

* fix correct assay_sc

Signed-off-by: zethson <lukas.heumos@posteo.net>
Zethson authored May 28, 2021
1 parent 950abad commit e615869
Showing 4 changed files with 221 additions and 137 deletions.
16 changes: 16 additions & 0 deletions sfaira/cli.py
@@ -7,6 +7,8 @@
import rich.logging
from rich import traceback
from rich import print

from sfaira.commands.annotate_dataloader import DataloaderAnnotater
from sfaira.commands.test_dataloader import DataloaderTester

from sfaira.commands.clean_dataloader import DataloaderCleaner
@@ -102,6 +104,20 @@ def validate_dataloader(path) -> None:
dataloader_validator.validate()


@sfaira_cli.command()
@click.argument('path', type=click.Path(exists=True))
@click.option('--doi', type=str, default=None)
@click.option('--test-data', type=click.Path(exists=True))
def annotate_dataloader(path, doi, test_data) -> None:
"""
Annotates a dataloader.
PATH is the absolute path of the root of your sfaira clone.
"""
dataloader_annotater = DataloaderAnnotater()
dataloader_annotater.annotate(path, doi, test_data)


@sfaira_cli.command()
@click.argument('path', type=click.Path(exists=True))
@click.option('--test-data', type=click.Path(exists=True))
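The new annotate_dataloader command takes the sfaira clone root as PATH plus --doi and --test-data options. A minimal sketch of exercising it in-process via click's test runner, assuming click >= 7 (which derives the command name annotate-dataloader from the function name) and using placeholder paths and DOI:

from click.testing import CliRunner

from sfaira.cli import sfaira_cli

runner = CliRunner()
result = runner.invoke(sfaira_cli, [
    "annotate-dataloader",                  # name derived from annotate_dataloader
    "/path/to/sfaira/clone",                # PATH: root of the sfaira clone (placeholder)
    "--doi", "10.1016/j.cell.2019.06.029",  # placeholder DOI
    "--test-data", "/path/to/raw/data",     # placeholder raw data directory
])
print(result.output)
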
179 changes: 179 additions & 0 deletions sfaira/commands/annotate_dataloader.py
@@ -0,0 +1,179 @@
import os
import pydoc
import shutil

from sfaira.data import DatasetGroupDirectoryOriented, DatasetGroup, DatasetBase
from sfaira.data.utils import read_yaml

try:
import sfaira_extension as sfairae
except ImportError:
sfairae = None


class DataloaderAnnotater:

def __init__(self):
self.WD = os.path.dirname(__file__)
self.file_path = None
self.file_path_sfairae = None
self.meta_path = None
self.cache_path = None
self.dir_loader = None
self.dir_loader_sfairae = None
self.package_source = None

def annotate(self, path: str, doi: str, test_data: str):
"""
Annotates a provided dataloader.
Moderate the suggestions made here: Choose the best fit cell ontology label for your cells.
Sfaira uses multiple mechanisms of finding matches, depending on how the free text was generated, these might be
differentially successful. The proposed IDs groups are separate by ":|||:" strings to give you a visual anchor
when going through these lists. You need to delete all of these division strings and all labels in the second
columns other than the best fit label. Do not change the first column,
(Note that columns are separated by ",")
You can also manually check maps here: https://www.ebi.ac.uk/ols/ontologies/cl
"""
doi_sfaira_repr = f'd{doi.translate({ord(c): "_" for c in r"!@#$%^&*()[]/{};:,.<>?|`~-=_+"})}'
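        # e.g. doi "10.1016/j.cell.2019.06.029" -> "d10_1016_j_cell_2019_06_029"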
self._setup_loader(doi_sfaira_repr)
self._annotate(test_data, path, doi)

def _setup_loader(self, doi_sfaira_repr: str):
"""
        Define the file names, loader paths and base paths of the loader collections for sfaira and sfaira_extension.
"""
dir_loader_sfaira = "sfaira.data.dataloaders.loaders."
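        # FILE_PATH is a module-level constant holding a file path inside the loader
        # collection; dropping the last path component yields the collection directory.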
file_path_sfaira = "/" + "/".join(pydoc.locate(dir_loader_sfaira + "FILE_PATH").split("/")[:-1])
if sfairae is not None:
dir_loader_sfairae = "sfaira_extension.data.dataloaders.loaders."
file_path_sfairae = "/" + "/".join(pydoc.locate(dir_loader_sfairae + "FILE_PATH").split("/")[:-1])
        else:
            dir_loader_sfairae = None
            file_path_sfairae = None
        # Check if the loader name is a directory in either the sfaira or the sfaira_extension loader collection:
        if doi_sfaira_repr in os.listdir(file_path_sfaira):
            dir_loader = dir_loader_sfaira + doi_sfaira_repr
            package_source = "sfaira"
        elif file_path_sfairae is not None and doi_sfaira_repr in os.listdir(file_path_sfairae):
            dir_loader = dir_loader_sfairae + doi_sfaira_repr
            package_source = "sfairae"
        else:
            raise ValueError("data loader not found in sfaira and also not in sfaira_extension")
file_path = pydoc.locate(dir_loader + ".FILE_PATH")
meta_path = None
cache_path = None
        # Clear dataset cache, if any was set (shutil.rmtree cannot take None):
        if cache_path is not None:
            shutil.rmtree(cache_path, ignore_errors=True)

self.file_path = file_path
self.file_path_sfairae = file_path_sfairae
self.meta_path = meta_path
self.cache_path = cache_path
self.dir_loader = dir_loader
self.dir_loader_sfairae = dir_loader_sfairae
self.package_source = package_source

def _get_ds(self, test_data: str):
ds = DatasetGroupDirectoryOriented(
file_base=self.file_path,
data_path=test_data,
meta_path=None,
cache_path=None
)

return ds

def buffered_load(self, test_data: str):
ds = self._get_ds(test_data=test_data)
# TODO try-except with good error description saying that the data loader is broken here:
ds.load(
remove_gene_version=False,
match_to_reference=None,
            load_raw=True,  # Force raw load so that data loader bugs in previous versions do not confound future tests.
allow_caching=True,
)

        assert len(ds.ids) > 0, f"no data sets loaded; make sure raw data is in {test_data}"
return ds

def _annotate(self, test_data: str, path: str, doi: str):
ds = self.buffered_load(test_data=test_data)
# Create cell type conversion table:
cwd = os.path.dirname(self.file_path)
dataset_module = str(cwd.split("/")[-1])
# Group data sets by file module:
# Note that if we were not grouping the cell type map .tsv files by file module, we could directly call
# write_ontology_class_map on the ds.
for f in os.listdir(cwd):
if os.path.isfile(os.path.join(cwd, f)): # only files
# Narrow down to data set files:
if f.split(".")[-1] == "py" and f.split(".")[0] not in ["__init__", "base", "group"]:
file_module = ".".join(f.split(".")[:-1])

# I) Instantiate Data set group to get all IDs of data sets associated with this .py file.
# Note that all data sets in this directory are already loaded in ds, so we just need the IDs.
DatasetFound = pydoc.locate(self.dir_loader + "." + file_module + ".Dataset")
# Load objects from name space:
                    # - load(): Loading function that returns an anndata instance.
# - SAMPLE_FNS: File name list for DatasetBaseGroupLoadingManyFiles
load_func = pydoc.locate(self.dir_loader + "." + file_module + ".load")
load_func_annotation = pydoc.locate(self.dir_loader + "." + file_module + ".LOAD_ANNOTATION")
# Also check sfaira_extension for additional load_func_annotation:
if self.package_source != "sfairae" and sfairae is not None:
load_func_annotation_sfairae = pydoc.locate(self.dir_loader_sfairae + "." + dataset_module +
"." + file_module + ".LOAD_ANNOTATION")
# LOAD_ANNOTATION is a dictionary so we can use update to extend it.
if load_func_annotation_sfairae is not None and load_func_annotation is not None:
load_func_annotation.update(load_func_annotation_sfairae)
elif load_func_annotation_sfairae is not None and load_func_annotation is None:
load_func_annotation = load_func_annotation_sfairae
sample_fns = pydoc.locate(self.dir_loader + "." + file_module + ".SAMPLE_FNS")
fn_yaml = os.path.join(cwd, file_module + ".yaml")
fn_yaml = fn_yaml if os.path.exists(fn_yaml) else None
# Check for sample_fns in yaml:
if fn_yaml is not None:
assert os.path.exists(fn_yaml), f"did not find yaml {fn_yaml}"
yaml_vals = read_yaml(fn=fn_yaml)
if sample_fns is None and yaml_vals["meta"]["sample_fns"] is not None:
sample_fns = yaml_vals["meta"]["sample_fns"]
if sample_fns is None:
sample_fns = [None]
                    # Here we distinguish between classes that are already defined and those that are not.
                    # The latter case arises if metadata are defined in YAMLs and load is given as a function.
if DatasetFound is None:
datasets_f = [
DatasetBase(
data_path=test_data,
meta_path=self.meta_path,
cache_path=self.cache_path,
load_func=load_func,
dict_load_func_annotation=load_func_annotation,
sample_fn=x,
sample_fns=sample_fns if sample_fns != [None] else None,
yaml_path=fn_yaml,
) for x in sample_fns
]
else:
datasets_f = [
DatasetFound(
data_path=test_data,
meta_path=self.meta_path,
cache_path=self.cache_path,
load_func=load_func,
load_func_annotation=load_func_annotation,
sample_fn=x,
sample_fns=sample_fns if sample_fns != [None] else None,
yaml_path=fn_yaml,
) for x in sample_fns
]
# II) Build a data set group from the already loaded data sets and use the group ontology writing
# function.
dsg_f = DatasetGroup(datasets=dict([(x.id, ds.datasets[x.id]) for x in datasets_f]))
# III) Write this directly into the sfaira clone so that it can be committed via git.
# TODO any errors not to be caught here?
doi_sfaira_repr = f'd{doi.translate({ord(c): "_" for c in r"!@#$%^&*()[]/{};:,.<>?|`~-=_+"})}'
dsg_f.write_ontology_class_map(
fn=os.path.join(f"{path}/sfaira/data/dataloaders/loaders/{doi_sfaira_repr}/{file_module}.tsv"),
protected_writing=True,
n_suggest=4,
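                        # n_suggest: number of candidate labels suggested per free-text entry (assumed semantics).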
)
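
For reference, the same flow can be driven without the CLI by using the annotater class directly; a minimal sketch, assuming the same placeholder clone path, DOI and raw-data directory as above:

from sfaira.commands.annotate_dataloader import DataloaderAnnotater

annotater = DataloaderAnnotater()
annotater.annotate(
    path="/path/to/sfaira/clone",       # root of the sfaira clone (placeholder)
    doi="10.1016/j.cell.2019.06.029",   # placeholder DOI
    test_data="/path/to/raw/data",      # placeholder raw data directory
)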
@@ -33,7 +33,7 @@ def __init__(self, **kwargs):
self.normalization = "raw"
self.organ = "pancreas"
self.organism = "mouse"
-        self.assay_sc = "Drop-seq"
+        self.assay_sc = "10x 3' v2"
self.state_exact = "diabetic"
self.year = 2019
self.sample_source = "primary_tissue"