import os
import pydoc
import shutil

from sfaira.data import DatasetGroupDirectoryOriented, DatasetGroup, DatasetBase
from sfaira.data.utils import read_yaml

# sfaira_extension is optional; fall back to None if it is not installed.
try:
    import sfaira_extension as sfairae
except ImportError:
    sfairae = None
def _get_ds(doi_sfaira_repr: str, test_data: str):
    flattened_doi = doi_sfaira_repr
    # Define base paths of the loader collections in sfaira and sfaira_extension:
    dir_loader_sfaira = "sfaira.data.dataloaders.loaders"
    file_path_sfaira = "/" + "/".join(pydoc.locate(dir_loader_sfaira + ".FILE_PATH").split("/")[:-1])
    if sfairae is not None:
        dir_loader_sfairae = "sfaira_extension.data.dataloaders.loaders"
        file_path_sfairae = "/" + "/".join(pydoc.locate(dir_loader_sfairae + ".FILE_PATH").split("/")[:-1])
    else:
        dir_loader_sfairae = None
        file_path_sfairae = None
    # Check if the loader name is a directory in either the sfaira or the sfaira_extension loader collection:
    if flattened_doi in os.listdir(file_path_sfaira):
        dir_loader = dir_loader_sfaira + "." + flattened_doi
        package_source = "sfaira"
    elif file_path_sfairae is not None and flattened_doi in os.listdir(file_path_sfairae):
        dir_loader = dir_loader_sfairae + "." + flattened_doi
        package_source = "sfairae"
    else:
        raise ValueError("data loader not found in sfaira and also not in sfaira_extension")
    file_path = pydoc.locate(dir_loader + ".FILE_PATH")
    meta_path = None
    cache_path = None
    # Clear dataset cache:
    if cache_path is not None:
        shutil.rmtree(cache_path, ignore_errors=True)
    ds = DatasetGroupDirectoryOriented(
        file_base=file_path,
        data_path=test_data,
        meta_path=meta_path,
        cache_path=cache_path,
    )
    # Return the group together with the loader context that the callers below need:
    return ds, dir_loader, package_source, file_path, meta_path, cache_path
def buffered_load(doi_sfaira_repr: str, test_data: str):
    ds, dir_loader, package_source, file_path, meta_path, cache_path = _get_ds(
        doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)
    # TODO try-except with a good error description saying that the data loader is broken here:
    ds.load(
        remove_gene_version=False,
        match_to_reference=None,
        load_raw=True,  # Force raw load so that loader bugs cached by previous versions do not confound future tests.
        allow_caching=True,
    )
    assert len(ds.ids) > 0, f"no data sets loaded, make sure raw data is in {test_data}"
    return ds, dir_loader, package_source, file_path, meta_path, cache_path
def annotate(doi_sfaira_repr: str, test_data: str):
    ds, dir_loader, package_source, file_path, meta_path, cache_path = buffered_load(
        doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)
    # Create cell type conversion table:
    cwd = os.path.dirname(file_path)
    dataset_module = str(cwd.split("/")[-1])
    # Group data sets by file module:
    # Note that if we were not grouping the cell type map .tsv files by file module, we could directly call
    # write_ontology_class_map on ds.
    for f in os.listdir(cwd):
        if os.path.isfile(os.path.join(cwd, f)):  # only files
            # Narrow down to data set files:
            if f.split(".")[-1] == "py" and f.split(".")[0] not in ["__init__", "base", "group"]:
                file_module = ".".join(f.split(".")[:-1])
                # I) Instantiate a data set group to get all IDs of data sets associated with this .py file.
                # Note that all data sets in this directory are already loaded in ds, so we just need the IDs.
                DatasetFound = pydoc.locate(dir_loader + "." + file_module + ".Dataset")
                # Load objects from the module name space:
                # - load(): loading function that returns an anndata instance.
                # - SAMPLE_FNS: file name list for DatasetBaseGroupLoadingManyFiles.
                load_func = pydoc.locate(dir_loader + "." + file_module + ".load")
                load_func_annotation = pydoc.locate(dir_loader + "." + file_module + ".LOAD_ANNOTATION")
                # Also check sfaira_extension for an additional load_func_annotation:
                if package_source != "sfairae" and sfairae is not None:
                    dir_loader_sfairae = "sfaira_extension.data.dataloaders.loaders"
                    load_func_annotation_sfairae = pydoc.locate(
                        dir_loader_sfairae + "." + dataset_module + "." + file_module + ".LOAD_ANNOTATION")
                    # LOAD_ANNOTATION is a dictionary, so we can use update to extend it:
                    if load_func_annotation_sfairae is not None and load_func_annotation is not None:
                        load_func_annotation.update(load_func_annotation_sfairae)
                    elif load_func_annotation_sfairae is not None and load_func_annotation is None:
                        load_func_annotation = load_func_annotation_sfairae
                sample_fns = pydoc.locate(dir_loader + "." + file_module + ".SAMPLE_FNS")
                fn_yaml = os.path.join(cwd, file_module + ".yaml")
                fn_yaml = fn_yaml if os.path.exists(fn_yaml) else None
                # Check for sample_fns in the yaml:
                if fn_yaml is not None:
                    yaml_vals = read_yaml(fn=fn_yaml)
                    if sample_fns is None and yaml_vals["meta"]["sample_fns"] is not None:
                        sample_fns = yaml_vals["meta"]["sample_fns"]
                if sample_fns is None:
                    sample_fns = [None]
                # Here we distinguish between classes that are already defined and those that are not.
                # The latter case arises if meta data are defined in YAMLs and _load is given as a function.
                if DatasetFound is None:
                    datasets_f = [
                        DatasetBase(
                            data_path=test_data,
                            meta_path=meta_path,
                            cache_path=cache_path,
                            load_func=load_func,
                            dict_load_func_annotation=load_func_annotation,
                            sample_fn=x,
                            sample_fns=sample_fns if sample_fns != [None] else None,
                            yaml_path=fn_yaml,
                        ) for x in sample_fns
                    ]
                else:
                    datasets_f = [
                        DatasetFound(
                            data_path=test_data,
                            meta_path=meta_path,
                            cache_path=cache_path,
                            load_func=load_func,
                            load_func_annotation=load_func_annotation,
                            sample_fn=x,
                            sample_fns=sample_fns if sample_fns != [None] else None,
                            yaml_path=fn_yaml,
                        ) for x in sample_fns
                    ]
                # II) Build a data set group from the already loaded data sets and use the group ontology
                # writing function:
                dsg_f = DatasetGroup(datasets={x.id: ds.datasets[x.id] for x in datasets_f})
                # III) Write this directly into the sfaira installation so that it can be committed via git.
                # TODO any errors not to be caught here?
                dsg_f.write_ontology_class_map(
                    fn=os.path.join(cwd, file_module + ".tsv"),
                    protected_writing=True,
                    n_suggest=4,
                )
                dsg_f.clean_ontology_class_map()
def test_load(doi_sfaira_repr: str, test_data: str):
    ds, _, _, _, _, cache_path = _get_ds(doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)
    # TODO try-except with a good error description saying that the data loader is broken here:
    ds.load(
        remove_gene_version=True,
        match_to_reference=None,  # TODO get organism here
        load_raw=True,
        allow_caching=True,
    )
    # Try loading from cache:
    ds, _, _, _, _, cache_path = _get_ds(doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)
    # TODO try-except with a good error description saying that the data loader is broken here:
    ds.load(
        remove_gene_version=True,
        match_to_reference=None,  # TODO get organism here
        load_raw=False,
        allow_caching=True,
    )
    if cache_path is not None:
        shutil.rmtree(cache_path, ignore_errors=True)
By the way, if you find time to build a mini mock data set by generating data randomly in a mock data loader function and coupling it to a YAML, this would be super useful for unit testing this pipeline, and we could also use these mock data sets in other unit tests. Right now, contribution is not unit tested, but I think it should be; see the sketch below.
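A minimal sketch of what such a mock loader could look like, assuming the load function follows the loader convention used above (a module-level load(data_dir, sample_fn, **kwargs) returning an anndata instance, with meta data in a matching .yaml). All specific names here (gene and cell type labels, matrix sizes) are illustrative placeholders:

# Sketch only: a mock data loader that generates a small random AnnData object
# instead of reading raw data from disk. Placed next to a .yaml that carries
# the meta data, it could drive the annotate()/test_load() pipeline above.
import anndata
import numpy as np
import scipy.sparse


def load(data_dir, sample_fn, **kwargs):
    n_obs, n_var = 100, 50  # hypothetical toy dimensions
    rng = np.random.default_rng(0)
    # Random sparse count matrix as a stand-in for raw expression data:
    x = scipy.sparse.csr_matrix(rng.poisson(1.0, size=(n_obs, n_var)).astype(np.float32))
    adata = anndata.AnnData(X=x)
    adata.var.index = [f"gene_{i}" for i in range(n_var)]
    # Free annotation that the matching .tsv cell type map would translate to the ontology:
    adata.obs["free_annotation"] = rng.choice(["type_a", "type_b"], size=n_obs)
    return adata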
I did that because I wanted to save one additional cycle of building DatasetGroups; I think both should work. Feel free to change this, it is borderline pseudocode.
I am imagining something like this, so that first annotate() and then test_load() can be called: sfaira/unit_tests/data_contribution/test_data_template.py
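A minimal usage sketch of that call order, with a placeholder flattened DOI string and test data path (both hypothetical):

# Sketch only: doi_sfaira_repr and test_data are illustrative placeholders.
doi_sfaira_repr = "d10_1000_j_journal_2021_01_001"  # hypothetical flattened DOI of the loader directory
test_data = "/path/to/test/data"  # hypothetical directory holding the raw data

annotate(doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)  # writes the .tsv cell type maps
test_load(doi_sfaira_repr=doi_sfaira_repr, test_data=test_data)  # then validates raw and cached loading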