split annotation and testing of new data set #270

Closed
davidsebfischer opened this issue May 17, 2021 · 3 comments
@davidsebfischer
Contributor

I am imagining sth like this, so that first `annotate()` and then `test_load()` can be called:

sfaira/unit_tests/data_contribution/test_data_template.py

import os
import pydoc
import shutil

from sfaira.data import DatasetGroupDirectoryOriented, DatasetGroup, DatasetBase
from sfaira.data.utils import read_yaml

# sfaira_extension is optional; fall back to None if it is not installed:
try:
    import sfaira_extension as sfairae
except ImportError:
    sfairae = None


def _get_ds(doi_sfaira_repr: str, test_data: str):
    flattened_doi = doi_sfaira_repr
    # Define base paths of loader collections in sfaira and sfaira_extension:
    dir_loader_sfaira = "sfaira.data.dataloaders.loaders."
    file_path_sfaira = "/" + "/".join(pydoc.locate(dir_loader_sfaira + "FILE_PATH").split("/")[:-1])
    if sfairae is not None:
        dir_loader_sfairae = "sfaira_extension.data.dataloaders.loaders."
        file_path_sfairae = "/" + "/".join(pydoc.locate(dir_loader_sfairae + "FILE_PATH").split("/")[:-1])
    else:
        dir_loader_sfairae = None
        file_path_sfairae = None
    # Check if loader name is a directory either in sfaira or sfaira_extension loader collections:
    if flattened_doi in os.listdir(file_path_sfaira):
        dir_loader = dir_loader_sfaira + flattened_doi
        package_source = "sfaira"
    elif file_path_sfairae is not None and flattened_doi in os.listdir(file_path_sfairae):
        dir_loader = dir_loader_sfairae + flattened_doi
        package_source = "sfairae"
    else:
        raise ValueError("data loader not found in sfaira and also not in sfaira_extension")
    file_path = pydoc.locate(dir_loader + ".FILE_PATH")
    meta_path = None
    cache_path = None
    # Clear dataset cache:
    if cache_path is not None:
        shutil.rmtree(cache_path, ignore_errors=True)

    ds = DatasetGroupDirectoryOriented(
        file_base=file_path,
        data_path=test_data,
        meta_path=meta_path,
        cache_path=cache_path
    )
    # Return the group together with the loader context that annotate() and test_load() need:
    return ds, cache_path, dir_loader, dir_loader_sfairae, file_path, meta_path, package_source


def buffered_load(doi_sfaira_repr: str, test_data: str):
    ds, cache_path, dir_loader, dir_loader_sfairae, file_path, meta_path, package_source = \
        _get_ds(doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)
    # TODO try-except with good error description saying that the data loader is broken here:
    ds.load(
        remove_gene_version=False,
        match_to_reference=None,
        load_raw=True,  # Force raw load so that loader bugs cached by previous versions do not confound future tests.
        allow_caching=True,
    )
    assert len(ds.ids) > 0, f"no data sets loaded, make sure raw data is in {test_data}"
    return ds, cache_path, dir_loader, dir_loader_sfairae, file_path, meta_path, package_source

def annotate(doi_sfaira_repr: str, test_data: str):
    ds, cache_path, dir_loader, dir_loader_sfairae, file_path, meta_path, package_source = \
        buffered_load(doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)
    # Create cell type conversion table:
    cwd = os.path.dirname(file_path)
    dataset_module = str(cwd.split("/")[-1])
    # Group data sets by file module:
    # Note that if we were not grouping the cell type map .tsv files by file module, we could directly call
    # write_ontology_class_map on the ds.
    for f in os.listdir(cwd):
        if os.path.isfile(os.path.join(cwd, f)):  # only files
            # Narrow down to data set files:
            if f.split(".")[-1] == "py" and f.split(".")[0] not in ["__init__", "base", "group"]:
                file_module = ".".join(f.split(".")[:-1])

                # I) Instantiate Data set group to get all IDs of data sets associated with this .py file.
                # Note that all data sets in this directory are already loaded in ds, so we just need the IDs.
                DatasetFound = pydoc.locate(dir_loader + "." + file_module + ".Dataset")
                # Load objects from name space:
                # - load(): Loading function that returns an anndata instance.
                # - SAMPLE_FNS: File name list for DatasetBaseGroupLoadingManyFiles
                load_func = pydoc.locate(dir_loader + "." + file_module + ".load")
                load_func_annotation = pydoc.locate(dir_loader + "." + file_module + ".LOAD_ANNOTATION")
                # Also check sfaira_extension for additional load_func_annotation:
                if package_source != "sfairae" and sfairae is not None:
                    load_func_annotation_sfairae = pydoc.locate(dir_loader_sfairae + dataset_module +
                                                                "." + file_module + ".LOAD_ANNOTATION")
                    # LOAD_ANNOTATION is a dictionary so we can use update to extend it.
                    if load_func_annotation_sfairae is not None and load_func_annotation is not None:
                        load_func_annotation.update(load_func_annotation_sfairae)
                    elif load_func_annotation_sfairae is not None and load_func_annotation is None:
                        load_func_annotation = load_func_annotation_sfairae
                sample_fns = pydoc.locate(dir_loader + "." + file_module + ".SAMPLE_FNS")
                fn_yaml = os.path.join(cwd, file_module + ".yaml")
                fn_yaml = fn_yaml if os.path.exists(fn_yaml) else None
                # Check for sample_fns in yaml:
                if fn_yaml is not None:
                    assert os.path.exists(fn_yaml), f"did not find yaml {fn_yaml}"
                    yaml_vals = read_yaml(fn=fn_yaml)
                    if sample_fns is None and yaml_vals["meta"]["sample_fns"] is not None:
                        sample_fns = yaml_vals["meta"]["sample_fns"]
                if sample_fns is None:
                    sample_fns = [None]
                # Here we distinguish between classes that are already defined and those that are not.
                # The latter case arises if meta data are defined in YAMLs and load is given as a function.
                if DatasetFound is None:
                    datasets_f = [
                        DatasetBase(
                            data_path=test_data,
                            meta_path=meta_path,
                            cache_path=cache_path,
                            load_func=load_func,
                            dict_load_func_annotation=load_func_annotation,
                            sample_fn=x,
                            sample_fns=sample_fns if sample_fns != [None] else None,
                            yaml_path=fn_yaml,
                        ) for x in sample_fns
                    ]
                else:
                    datasets_f = [
                        DatasetFound(
                            data_path=test_data,
                            meta_path=meta_path,
                            cache_path=cache_path,
                            load_func=load_func,
                            load_func_annotation=load_func_annotation,
                            sample_fn=x,
                            sample_fns=sample_fns if sample_fns != [None] else None,
                            yaml_path=fn_yaml,
                        ) for x in sample_fns
                    ]
                # II) Build a data set group from the already loaded data sets and use the group ontology writing
                # function.
                dsg_f = DatasetGroup(datasets=dict([(x.id, ds.datasets[x.id]) for x in datasets_f]))
                # III) Write this directly into sfaira installation so that it can be committed via git.
                # TODO any errors not to be caught here?
                dsg_f.write_ontology_class_map(
                    fn=os.path.join(cwd, file_module + ".tsv"),
                    protected_writing=True,
                    n_suggest=4,
                )
                dsg_f.clean_ontology_class_map()

def test_load(doi_sfaira_repr: str, test_data: str):
    ds, cache_path, *_ = _get_ds(doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)
    # TODO try-except with good error description saying that the data loader is broken here:
    ds.load(
        remove_gene_version=True,
        match_to_reference=None,  # TODO get organism here
        load_raw=True,
        allow_caching=True
    )
    # Try loading from cache:
    ds, cache_path, *_ = _get_ds(doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)
    # TODO try-except with good error description saying that the data loader is broken here:
    ds.load(
        remove_gene_version=True,
        match_to_reference=None,  # TODO get organism here
        load_raw=False,
        allow_caching=True
    )
    if cache_path is not None:
        shutil.rmtree(cache_path, ignore_errors=True)
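
For orientation, the intended call order on a contributor machine would then be sth like this (the DOI representation and the path below are just placeholders):

doi = "d10_1016_j_cell_2021_01_001"  # placeholder: sfaira-style representation of the new data set DOI
raw = "/path/to/raw/data"            # placeholder: directory that contains the downloaded raw files

annotate(doi_sfaira_repr=doi, test_data=raw)   # step 1: writes the cell type map .tsv next to the loader
test_load(doi_sfaira_repr=doi, test_data=raw)  # step 2: checks raw loading and loading from cache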
@davidsebfischer
Contributor Author

davidsebfischer commented May 17, 2021

By the way, if you find time to build a mini mock data set by just generating data randomly in a mock data loader function and coupling it to a YAML, this would be super useful to unit test this pipeline, and we could also use these mock data sets in other unit tests (sth like the sketch below). Right now, contribution is not unit tested, but I think it should be.
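
A minimal sketch of what such a mock loader could look like (the exact load() signature, field names and sizes here are just assumptions mirroring the template above, not an existing loader):

import anndata
import numpy as np
import scipy.sparse


def load(data_dir, sample_fn, **kwargs):
    # Mock loader: ignore data_dir/sample_fn and generate a small random count matrix
    # instead of reading raw files from disk.
    n_obs, n_var = 100, 50
    x = scipy.sparse.csr_matrix(np.random.poisson(lam=1., size=(n_obs, n_var)).astype(np.float32))
    adata = anndata.AnnData(X=x)
    adata.var.index = [f"gene_{i}" for i in range(n_var)]  # placeholder gene identifiers
    adata.obs["free_annotation"] = np.random.choice(["cell_type_a", "cell_type_b"], size=n_obs)  # mock labels
    return adata

The YAML next to it would then carry the (mock) meta data fields just like for a real contribution.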

@Zethson
Member

Zethson commented May 19, 2021

@davidsebfischer is there any reason why you changed

ds.clean_ontology_class_map()

to

dsg_f.clean_ontology_class_map()

? The former should be the correct one, right?

@davidsebfischer
Contributor Author

I did that because I wanted to save one additional cycle of building dataset groups; I think both should work. Feel free to change it, this is borderline pseudocode.
