Commit b0df842

Dev (#286)
* split CLI template test into annotate and test (#272)
* correct assay_sc (#272)
* mitigate merge conflict between dev and release

Signed-off-by: zethson <lukas.heumos@posteo.net>
davidsebfischer authored May 31, 2021
1 parent 27ac19a commit b0df842
Showing 7 changed files with 226 additions and 141 deletions.
4 changes: 2 additions & 2 deletions README.rst
@@ -30,7 +30,7 @@ sfaira - data and model repository for single-cell data
    :align: center
 
 sfaira_ is a model and a data repository in a single python package (preprint_).
-We provide an interactive overview of the current state of the zoos on sfaira-site_.
+We provide an interactive overview of the current state of the zoos on sfaira-portal_.
 
 Its data zoo gives users access to streamlined data loaders that allow reproducible use of published and private data sets for model training and exploration.
 Its model zoo gives users streamlined access to pre-trained models and to common model architectures to ease usage of neural networks in common single-cell analysis workflows:
@@ -43,4 +43,4 @@ sfaira integrates into scanpy_ workflows.
 .. _preprint: https://www.biorxiv.org/content/10.1101/2020.12.16.419036v1
 .. _DCA: https://github.com/theislab/dca
 .. _scArches: https://github.com/theislab/scarches
-.. _sfaira-site: https://theislab.github.io/sfaira-site/index.html
+.. _sfaira-portal: https://theislab.github.io/sfaira-portal/
4 changes: 2 additions & 2 deletions docs/index.rst
@@ -15,10 +15,10 @@ sfaira - data and model repository for single-cell data
    :align: center
 
 sfaira_ is a model and a data repository in a single python package.
-We provide an interactive overview of the current state of the zoos on sfaira-site_.
+We provide an interactive overview of the current state of the zoos on sfaira-portal_.
 
 .. _sfaira: https://www.biorxiv.org/content/10.1101/2020.12.16.419036v1
-.. _sfaira-site: https://theislab.github.io/sfaira-site/index.html
+.. _sfaira-portal: https://theislab.github.io/sfaira-portal/
 
 .. include:: environment_brief.rst
 
16 changes: 16 additions & 0 deletions sfaira/cli.py
@@ -7,6 +7,8 @@
import rich.logging
from rich import traceback
from rich import print

from sfaira.commands.annotate_dataloader import DataloaderAnnotater
from sfaira.commands.test_dataloader import DataloaderTester

from sfaira.commands.clean_dataloader import DataloaderCleaner
@@ -102,6 +104,20 @@ def validate_dataloader(path) -> None:
    dataloader_validator.validate()


@sfaira_cli.command()
@click.argument('path', type=click.Path(exists=True))
@click.option('--doi', type=str, default=None)
@click.option('--test-data', type=click.Path(exists=True))
def annotate_dataloader(path, doi, test_data) -> None:
    """
    Annotates a dataloader.
    PATH is the absolute path of the root of your sfaira clone.
    """
    dataloader_annotater = DataloaderAnnotater()
    dataloader_annotater.annotate(path, doi, test_data)


@sfaira_cli.command()
@click.argument('path', type=click.Path(exists=True))
@click.option('--test-data', type=click.Path(exists=True))
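The new annotate_dataloader subcommand simply wires the CLI to the DataloaderAnnotater class introduced below. For orientation, a minimal sketch of the equivalent direct Python call; the clone path, DOI and raw-data directory here are placeholders, not part of this commit:

from sfaira.commands.annotate_dataloader import DataloaderAnnotater

annotater = DataloaderAnnotater()
annotater.annotate(
    path="/path/to/sfaira",             # root of your sfaira clone (hypothetical path)
    doi="10.1016/j.cmet.2019.01.021",   # DOI of the study the loader belongs to (example value)
    test_data="/path/to/raw/data",      # directory holding the raw data files (hypothetical path)
)
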
179 changes: 179 additions & 0 deletions sfaira/commands/annotate_dataloader.py
@@ -0,0 +1,179 @@
import os
import pydoc
import shutil

from sfaira.data import DatasetGroupDirectoryOriented, DatasetGroup, DatasetBase
from sfaira.data.utils import read_yaml

try:
    import sfaira_extension as sfairae
except ImportError:
    sfairae = None


class DataloaderAnnotater:

    def __init__(self):
        self.WD = os.path.dirname(__file__)
        self.file_path = None
        self.file_path_sfairae = None
        self.meta_path = None
        self.cache_path = None
        self.dir_loader = None
        self.dir_loader_sfairae = None
        self.package_source = None

    def annotate(self, path: str, doi: str, test_data: str):
        """
        Annotates a provided dataloader.

        Moderate the suggestions made here: choose the best-fitting cell ontology label for your cells.
        Sfaira uses multiple mechanisms to find matches; depending on how the free text was generated, these may be
        differentially successful. The proposed ID groups are separated by ":|||:" strings to give you a visual anchor
        when going through these lists. You need to delete all of these division strings and all labels in the second
        column other than the best-fitting label. Do not change the first column.
        (Note that columns are separated by ",".)
        You can also manually check mappings here: https://www.ebi.ac.uk/ols/ontologies/cl
        """
        doi_sfaira_repr = f'd{doi.translate({ord(c): "_" for c in r"!@#$%^&*()[]/{};:,.<>?|`~-=_+"})}'
        self._setup_loader(doi_sfaira_repr)
        self._annotate(test_data, path, doi)

    def _setup_loader(self, doi_sfaira_repr: str):
        """
        Defines the file names, loader paths and base paths of loader collections for sfaira and sfaira_extension.
        """
        dir_loader_sfaira = "sfaira.data.dataloaders.loaders."
        file_path_sfaira = "/" + "/".join(pydoc.locate(dir_loader_sfaira + "FILE_PATH").split("/")[:-1])
        if sfairae is not None:
            dir_loader_sfairae = "sfaira_extension.data.dataloaders.loaders."
            file_path_sfairae = "/" + "/".join(pydoc.locate(dir_loader_sfairae + "FILE_PATH").split("/")[:-1])
        else:
            dir_loader_sfairae = None
            file_path_sfairae = None
        # Check if the loader name is a directory in either the sfaira or the sfaira_extension loader collection:
        if doi_sfaira_repr in os.listdir(file_path_sfaira):
            dir_loader = dir_loader_sfaira + "." + doi_sfaira_repr
            package_source = "sfaira"
        elif file_path_sfairae is not None and doi_sfaira_repr in os.listdir(file_path_sfairae):
            dir_loader = dir_loader_sfairae + "." + doi_sfaira_repr
            package_source = "sfairae"
        else:
            raise ValueError("data loader not found in sfaira and also not in sfaira_extension")
        file_path = pydoc.locate(dir_loader + ".FILE_PATH")
        meta_path = None
        cache_path = None
        # Clear the dataset cache if a cache path is set:
        if cache_path is not None:
            shutil.rmtree(cache_path, ignore_errors=True)

        self.file_path = file_path
        self.file_path_sfairae = file_path_sfairae
        self.meta_path = meta_path
        self.cache_path = cache_path
        self.dir_loader = dir_loader
        self.dir_loader_sfairae = dir_loader_sfairae
        self.package_source = package_source

    def _get_ds(self, test_data: str):
        ds = DatasetGroupDirectoryOriented(
            file_base=self.file_path,
            data_path=test_data,
            meta_path=None,
            cache_path=None
        )

        return ds

    def buffered_load(self, test_data: str):
        ds = self._get_ds(test_data=test_data)
        # TODO try-except with a good error description saying that the data loader is broken here:
        ds.load(
            remove_gene_version=False,
            match_to_reference=None,
            load_raw=True,  # Force raw load so that future tests are not confounded by data loader bugs cached in previous versions.
            allow_caching=True,
        )

        assert len(ds.ids) > 0, f"no data sets loaded, make sure raw data is in {test_data}"
        return ds

    def _annotate(self, test_data: str, path: str, doi: str):
        ds = self.buffered_load(test_data=test_data)
        # Create cell type conversion table:
        cwd = os.path.dirname(self.file_path)
        dataset_module = str(cwd.split("/")[-1])
        # Group data sets by file module:
        # Note that if we were not grouping the cell type map .tsv files by file module, we could directly call
        # write_ontology_class_map on the ds.
        for f in os.listdir(cwd):
            if os.path.isfile(os.path.join(cwd, f)):  # only files
                # Narrow down to data set files:
                if f.split(".")[-1] == "py" and f.split(".")[0] not in ["__init__", "base", "group"]:
                    file_module = ".".join(f.split(".")[:-1])

                    # I) Instantiate a data set group to get all IDs of data sets associated with this .py file.
                    # Note that all data sets in this directory are already loaded in ds, so we just need the IDs.
                    DatasetFound = pydoc.locate(self.dir_loader + "." + file_module + ".Dataset")
                    # Load objects from name space:
                    # - load(): Loading function that returns an anndata instance.
                    # - SAMPLE_FNS: File name list for DatasetBaseGroupLoadingManyFiles.
                    load_func = pydoc.locate(self.dir_loader + "." + file_module + ".load")
                    load_func_annotation = pydoc.locate(self.dir_loader + "." + file_module + ".LOAD_ANNOTATION")
                    # Also check sfaira_extension for an additional load_func_annotation:
                    if self.package_source != "sfairae" and sfairae is not None:
                        load_func_annotation_sfairae = pydoc.locate(self.dir_loader_sfairae + "." + dataset_module +
                                                                    "." + file_module + ".LOAD_ANNOTATION")
                        # LOAD_ANNOTATION is a dictionary, so we can use update to extend it.
                        if load_func_annotation_sfairae is not None and load_func_annotation is not None:
                            load_func_annotation.update(load_func_annotation_sfairae)
                        elif load_func_annotation_sfairae is not None and load_func_annotation is None:
                            load_func_annotation = load_func_annotation_sfairae
                    sample_fns = pydoc.locate(self.dir_loader + "." + file_module + ".SAMPLE_FNS")
                    fn_yaml = os.path.join(cwd, file_module + ".yaml")
                    fn_yaml = fn_yaml if os.path.exists(fn_yaml) else None
                    # Check for sample_fns in yaml:
                    if fn_yaml is not None:
                        assert os.path.exists(fn_yaml), f"did not find yaml {fn_yaml}"
                        yaml_vals = read_yaml(fn=fn_yaml)
                        if sample_fns is None and yaml_vals["meta"]["sample_fns"] is not None:
                            sample_fns = yaml_vals["meta"]["sample_fns"]
                    if sample_fns is None:
                        sample_fns = [None]
                    # Here we distinguish between classes that are already defined and those that are not.
                    # The latter case arises if meta data are defined in YAMLs and _load is given as a function.
                    if DatasetFound is None:
                        datasets_f = [
                            DatasetBase(
                                data_path=test_data,
                                meta_path=self.meta_path,
                                cache_path=self.cache_path,
                                load_func=load_func,
                                dict_load_func_annotation=load_func_annotation,
                                sample_fn=x,
                                sample_fns=sample_fns if sample_fns != [None] else None,
                                yaml_path=fn_yaml,
                            ) for x in sample_fns
                        ]
                    else:
                        datasets_f = [
                            DatasetFound(
                                data_path=test_data,
                                meta_path=self.meta_path,
                                cache_path=self.cache_path,
                                load_func=load_func,
                                load_func_annotation=load_func_annotation,
                                sample_fn=x,
                                sample_fns=sample_fns if sample_fns != [None] else None,
                                yaml_path=fn_yaml,
                            ) for x in sample_fns
                        ]
                    # II) Build a data set group from the already loaded data sets and use the group ontology writing
                    # function.
                    dsg_f = DatasetGroup(datasets=dict([(x.id, ds.datasets[x.id]) for x in datasets_f]))
                    # III) Write this directly into the sfaira clone so that it can be committed via git.
                    # TODO any errors not to be caught here?
                    doi_sfaira_repr = f'd{doi.translate({ord(c): "_" for c in r"!@#$%^&*()[]/{};:,.<>?|`~-=_+"})}'
                    dsg_f.write_ontology_class_map(
                        fn=os.path.join(f"{path}/sfaira/data/dataloaders/loaders/{doi_sfaira_repr}/{file_module}.tsv"),
                        protected_writing=True,
                        n_suggest=4,
                    )
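
Two notes on the helper above, both illustrative rather than part of the commit. First, the doi_sfaira_repr used to locate the loader directory and the output .tsv is just the DOI with punctuation mapped to underscores and a "d" prefix; a small sketch with an example DOI:

doi = "10.1016/j.cmet.2019.01.021"  # example DOI, for illustration only
doi_sfaira_repr = f'd{doi.translate({ord(c): "_" for c in r"!@#$%^&*()[]/{};:,.<>?|`~-=_+"})}'
print(doi_sfaira_repr)  # prints: d10_1016_j_cmet_2019_01_021

Second, the written class map is the file that the annotate() docstring asks you to curate by hand. A hypothetical row before and after curation (labels invented for illustration; the real suggestions come from write_ontology_class_map):

alpha,pancreatic A cell:|||:type B pancreatic cell:|||:pancreatic endocrine cell
alpha,pancreatic A cell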
@@ -33,7 +33,7 @@ def __init__(self, **kwargs):
         self.normalization = "raw"
         self.organ = "pancreas"
         self.organism = "mouse"
-        self.assay_sc = "Drop-seq"
+        self.assay_sc = "10x 3' v2"
         self.state_exact = "diabetic"
         self.year = 2019
         self.sample_source = "primary_tissue"
1 change: 1 addition & 0 deletions sfaira/interface/user_interface.py
@@ -27,6 +27,7 @@ class UserInterface:
    # initialise your sfaira instance with a model lookuptable.
    # instead of setting `custom_repo` when initialising the UI you can also use `sfaira_repo=True` to use public weights
    ui = sfaira.ui.UserInterface(custom_repo="/path/to/local/repo/folder/or/zenodo/repo/URL", sfaira_repo=False)
    ui.load_data(anndata.read("/path/to/file.h5ad"))  # load your dataset into sfaira
    ui.zoo_embedding.model_id = 'embedding_human-blood-ae-0.2-0.1_theislab'  # pick desired model here
    ui.zoo_celltype.model_id = 'celltype_human-blood-mlp-0.1.3-0.1_theislab'  # pick desired model here
    ui.load_data(anndata.read("/path/to/file.h5ad"), gene_symbol_col='index', gene_ens_col='gene_ids')  # load your dataset into sfaira
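
Read end to end, the docstring example amounts to the following minimal sketch; the repository path, h5ad file and model ids are the docstring's own placeholders, not values introduced by this commit:

import anndata
import sfaira

ui = sfaira.ui.UserInterface(custom_repo="/path/to/local/repo/folder/or/zenodo/repo/URL", sfaira_repo=False)
ui.zoo_embedding.model_id = 'embedding_human-blood-ae-0.2-0.1_theislab'    # pick desired embedding model
ui.zoo_celltype.model_id = 'celltype_human-blood-mlp-0.1.3-0.1_theislab'   # pick desired cell type model
ui.load_data(anndata.read("/path/to/file.h5ad"), gene_symbol_col='index', gene_ens_col='gene_ids')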
