split annotation and testing of new data set #270

Closed
davidsebfischer opened this issue May 17, 2021 · 3 comments
@davidsebfischer
Contributor

I am imagining sth like this, so that first `annotate()` and then `test_load()` can be called:

sfaira/unit_tests/data_contribution/test_data_template.py

import os
import pydoc
import shutil

from sfaira.data import DatasetGroupDirectoryOriented, DatasetGroup, DatasetBase
from sfaira.data.utils import read_yaml

# sfaira_extension is optional; fall back to None if it is not installed:
try:
    import sfaira_extension as sfairae
except ImportError:
    sfairae = None


def _get_ds(doi_sfaira_repr: str, test_data: str):
    flattened_doi = doi_sfaira_repr
    # Define base paths of loader collections in sfaira and sfaira_extension:
    dir_loader_sfaira = "sfaira.data.dataloaders.loaders."
    file_path_sfaira = "/" + "/".join(pydoc.locate(dir_loader_sfaira + "FILE_PATH").split("/")[:-1])
    if sfairae is not None:
        dir_loader_sfairae = "sfaira_extension.data.dataloaders.loaders."
        file_path_sfairae = "/" + "/".join(pydoc.locate(dir_loader_sfairae + "FILE_PATH").split("/")[:-1])
    else:
        dir_loader_sfairae = None
        file_path_sfairae = None
    # Check if loader name is a directory either in sfaira or sfaira_extension loader collections:
    if flattened_doi in os.listdir(file_path_sfaira):
        dir_loader = dir_loader_sfaira + flattened_doi
        package_source = "sfaira"
    elif file_path_sfairae is not None and flattened_doi in os.listdir(file_path_sfairae):
        dir_loader = dir_loader_sfairae + flattened_doi
        package_source = "sfairae"
    else:
        raise ValueError("data loader not found in sfaira and also not in sfaira_extension")
    file_path = pydoc.locate(dir_loader + ".FILE_PATH")
    meta_path = None
    cache_path = None
    # Clear dataset cache:
    if cache_path is not None:
        shutil.rmtree(cache_path, ignore_errors=True)

    ds = DatasetGroupDirectoryOriented(
        file_base=file_path,
        data_path=test_data,
        meta_path=meta_path,
        cache_path=cache_path
    )
    # Return the group together with the loader context that annotate() and test_load() need:
    return ds, cache_path, dir_loader, dir_loader_sfairae, file_path, meta_path, package_source


def buffered_load(doi_sfaira_repr: str, test_data: str):
    ds, cache_path, dir_loader, dir_loader_sfairae, file_path, meta_path, package_source = \
        _get_ds(doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)
    # TODO try-except with good error description saying that the data loader is broken here:
    ds.load(
        remove_gene_version=False,
        match_to_reference=None,
        load_raw=True,  # Force raw load so that loader bugs cached by previous versions do not confound future tests.
        allow_caching=True,
    )
    assert len(ds.ids) > 0, f"no data sets loaded, make sure raw data is in {test_data}"
    return ds, cache_path, dir_loader, dir_loader_sfairae, file_path, meta_path, package_source

def annotate(doi_sfaira_repr: str, test_data: str):
    ds, cache_path, dir_loader, dir_loader_sfairae, file_path, meta_path, package_source = \
        buffered_load(doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)
    # Create cell type conversion table:
    cwd = os.path.dirname(file_path)
    dataset_module = str(cwd.split("/")[-1])
    # Group data sets by file module:
    # Note that if we were not grouping the cell type map .tsv files by file module, we could directly call
    # write_ontology_class_map on the ds.
    for f in os.listdir(cwd):
        if os.path.isfile(os.path.join(cwd, f)):  # only files
            # Narrow down to data set files:
            if f.split(".")[-1] == "py" and f.split(".")[0] not in ["__init__", "base", "group"]:
                file_module = ".".join(f.split(".")[:-1])

                # I) Instantiate Data set group to get all IDs of data sets associated with this .py file.
                # Note that all data sets in this directory are already loaded in ds, so we just need the IDs.
                DatasetFound = pydoc.locate(dir_loader + "." + file_module + ".Dataset")
                # Load objects from name space:
                # - load(): Loading function that returns an anndata instance.
                # - SAMPLE_FNS: File name list for DatasetBaseGroupLoadingManyFiles
                load_func = pydoc.locate(dir_loader + "." + file_module + ".load")
                load_func_annotation = pydoc.locate(dir_loader + "." + file_module + ".LOAD_ANNOTATION")
                # Also check sfaira_extension for additional load_func_annotation:
                if package_source != "sfairae" and sfairae is not None:
                    load_func_annotation_sfairae = pydoc.locate(dir_loader_sfairae + dataset_module +
                                                                "." + file_module + ".LOAD_ANNOTATION")
                    # LOAD_ANNOTATION is a dictionary so we can use update to extend it.
                    if load_func_annotation_sfairae is not None and load_func_annotation is not None:
                        load_func_annotation.update(load_func_annotation_sfairae)
                    elif load_func_annotation_sfairae is not None and load_func_annotation is None:
                        load_func_annotation = load_func_annotation_sfairae
                sample_fns = pydoc.locate(dir_loader + "." + file_module + ".SAMPLE_FNS")
                fn_yaml = os.path.join(cwd, file_module + ".yaml")
                fn_yaml = fn_yaml if os.path.exists(fn_yaml) else None
                # Check for sample_fns in yaml:
                if fn_yaml is not None:
                    assert os.path.exists(fn_yaml), f"did not find yaml {fn_yaml}"
                    yaml_vals = read_yaml(fn=fn_yaml)
                    if sample_fns is None and yaml_vals["meta"]["sample_fns"] is not None:
                        sample_fns = yaml_vals["meta"]["sample_fns"]
                if sample_fns is None:
                    sample_fns = [None]
                # Here we distinguish between classes that are already defined and those that are not.
                # The latter case arises if meta data are defined in YAMLs and load is given as a function.
                if DatasetFound is None:
                    datasets_f = [
                        DatasetBase(
                            data_path=test_data,
                            meta_path=meta_path,
                            cache_path=cache_path,
                            load_func=load_func,
                            dict_load_func_annotation=load_func_annotation,
                            sample_fn=x,
                            sample_fns=sample_fns if sample_fns != [None] else None,
                            yaml_path=fn_yaml,
                        ) for x in sample_fns
                    ]
                else:
                    datasets_f = [
                        DatasetFound(
                            data_path=test_data,
                            meta_path=meta_path,
                            cache_path=cache_path,
                            load_func=load_func,
                            load_func_annotation=load_func_annotation,
                            sample_fn=x,
                            sample_fns=sample_fns if sample_fns != [None] else None,
                            yaml_path=fn_yaml,
                        ) for x in sample_fns
                    ]
                # II) Build a data set group from the already loaded data sets and use the group ontology writing
                # function.
                dsg_f = DatasetGroup(datasets=dict([(x.id, ds.datasets[x.id]) for x in datasets_f]))
                # III) Write this directly into sfaira installation so that it can be committed via git.
                # TODO any errors not to be caught here?
                dsg_f.write_ontology_class_map(
                    fn=os.path.join(cwd, file_module + ".tsv"),
                    protected_writing=True,
                    n_suggest=4,
                )
                dsg_f.clean_ontology_class_map()

def test_load(doi_sfaira_repr: str, test_data: str):
    ds, cache_path, *_ = _get_ds(doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)
    # TODO try-except with good error description saying that the data loader is broken here:
    ds.load(
        remove_gene_version=True,
        match_to_reference=None,  # TODO get organism here
        load_raw=True,
        allow_caching=True
    )
    # Try loading from cache:
    ds, cache_path, *_ = _get_ds(doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)
    # TODO try-except with good error description saying that the data loader is broken here:
    ds.load(
        remove_gene_version=True,
        match_to_reference=None,  # TODO get organism here
        load_raw=False,
        allow_caching=True
    )
    if cache_path is not None:
        shutil.rmtree(cache_path, ignore_errors=True)
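
For orientation, the intended call order on a contributor machine would then be sth like this (the DOI representation and the path below are just placeholders):

doi = "d10_1016_j_cell_2021_01_001"  # placeholder: sfaira-style representation of the new data set DOI
raw = "/path/to/raw/data"            # placeholder: directory that contains the downloaded raw files

annotate(doi_sfaira_repr=doi, test_data=raw)   # step 1: writes the cell type map .tsv next to the loader
test_load(doi_sfaira_repr=doi, test_data=raw)  # step 2: checks raw loading and loading from cache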
@davidsebfischer
Contributor Author

davidsebfischer commented May 17, 2021

By the way, if you find time to build a mini mock data set by just generating data randomly in a mock data loader function and coupling it to a YAML, this would be super useful to unit test this pipeline, and we could also use these mock data sets in other unit tests (sth like the sketch below). Right now, contribution is not unit tested, but I think it should be.
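
A minimal sketch of what such a mock loader could look like (the exact load() signature, field names and sizes here are just assumptions mirroring the template above, not an existing loader):

import anndata
import numpy as np
import scipy.sparse


def load(data_dir, sample_fn, **kwargs):
    # Mock loader: ignore data_dir/sample_fn and generate a small random count matrix
    # instead of reading raw files from disk.
    n_obs, n_var = 100, 50
    x = scipy.sparse.csr_matrix(np.random.poisson(lam=1., size=(n_obs, n_var)).astype(np.float32))
    adata = anndata.AnnData(X=x)
    adata.var.index = [f"gene_{i}" for i in range(n_var)]  # placeholder gene identifiers
    adata.obs["free_annotation"] = np.random.choice(["cell_type_a", "cell_type_b"], size=n_obs)  # mock labels
    return adata

The YAML next to it would then carry the (mock) meta data fields just like for a real contribution.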

@Zethson
Member

Zethson commented May 19, 2021

@davidsebfischer is there any reason why you changed

ds.clean_ontology_class_map()

to

dsg_f.clean_ontology_class_map()

? The former should be the correct one, right?

@davidsebfischer
Contributor Author

I did that because I wanted to save one additional cycle of building dataset groups; I think both should work. Feel free to change it, this is borderline pseudocode.
