extended data loader documentation #77

Merged
merged 1 commit into from
Jan 19, 2021
173 changes: 164 additions & 9 deletions docs/data.rst
@@ -32,8 +32,8 @@ Add data sets
~~~~~~~~~~~~~

1. Write a data loader as outlined below.
2. Identify the raw files as indicated in the data loader classes and copy them into your directory structure as required by your data loader.
3. You can contribute the data loader to public sfaira; note that we do not manage data upload. During publication, you would upload this data set to a server like GEO, and the data loader contributed to sfaira would use this download link.

Use data loaders on existing data repository
--------------------------------------------
@@ -63,6 +63,21 @@ This directory contains an `__init__.py` file which makes these data loaders visible.
Next, each data set is represented by one data loader python file in this directory.
See below for more complex set ups with repetitive data loader code.

Check that the data loader was not already implemented
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
We will open issues for all planned data loaders, so you can search both the code_ base and our GitHub issues_ for
matching data loaders before you start writing one.
The core data loader identifier is the directory-compatible DOI,
which is the DOI with all special characters replaced by "_" and prefixed with "d":
"10.1016/j.cell.2019.06.029" becomes "d10_1016_j_cell_2019_06_029".
Searching for this string should yield a match if the data loader is already implemented; take care to look for both
preprint and publication DOIs if both are available.
We will also mention publication names in issues; you will, however, not find these names in the code.

.. _code: https://github.com/theislab/sfaira/tree/dev
.. _issues: https://github.com/theislab/sfaira/issues
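
As a quick orientation, the conversion from DOI to directory-compatible DOI can be sketched as follows
(a minimal standalone sketch that mirrors the `directory_formatted_doi` property of `sfaira/data/base.py`
shown further below in this diff):

.. code-block:: python

    def directory_formatted_doi(doi: str) -> str:
        # Replace the special characters "/", "." and "-" with "_" and prefix with "d".
        return "d" + "_".join("_".join("_".join(doi.split("/")).split(".")).split("-"))

    directory_formatted_doi("10.1016/j.cell.2019.06.029")  # 'd10_1016_j_cell_2019_06_029'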


The data loader python file
~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -80,9 +95,10 @@ before it is loaded into memory:
self,
path: Union[str, None] = None,
meta_path: Union[str, None] = None,
cache_path: Union[str, None] = None,
**kwargs
):
super().__init__(path=path, meta_path=meta_path, cache_path=cache_path, **kwargs)
# Data set meta data: You do not have to include all of these and can simply skip lines corresponding
# to attributes that you do not have access to. These are meta data on a sample level.
# The meta data attributes labeled with (*) may also be supplied per cell, see below,
@@ -93,8 +109,8 @@ before it is loaded into memory:
self.author = x # author (list) who sampled / created the data set
self.doi = x # doi of data set accompanying manuscript

self.download_url_data = x # download website(s) of data files
self.download_url_meta = x # download website(s) of meta data files

self.age = x # (*, optional) age of sample
self.dev_stage = x # (*, optional) developmental stage of organism
Expand Down Expand Up @@ -140,10 +156,18 @@ before it is loaded into memory:


2. A function called to load the data set into memory:
It is important to set an automated path indicating the location of the raw files here.
Our recommendation for this directory set-up is that you define a folder in your directory structure
in which all of these raw files will be placed (self.path) and then add a sub-directory named as
`self.directory_formatted_doi` (i.e. the DOI with all special characters replaced by "_") and place the raw files
directly into this sub-directory.

.. code-block:: python

def _load(self, fn=None):
# assuming that I uploaded an h5ad somewhere (linked in self.download_url_data)
if fn is None:
fn = os.path.join(self.path, self.directory_formatted_doi, "my.h5ad")
self.adata = anndata.read(fn)  # loading instruction into .adata, use other readers if the data is not h5ad
# Sometimes you need to load multiple files (e.g. counts and annotation); all of that code would go here.

@@ -157,12 +181,13 @@ In summary, a simple example data loader for a mouse lung data set could look like this:
self,
path: Union[str, None] = None,
meta_path: Union[str, None] = None,
cache_path: Union[str, None] = None,
**kwargs
):
super().__init__(path=path, meta_path=meta_path, cache_path=cache_path, **kwargs)
self.author = "me"
self.doi = "my preprint"
self.download_url_data = "my GEO upload"
self.normalisation = "raw" # because I uploaded raw counts, which is good practice!
self.organ = "lung"
self.organism = "mouse"
Expand All @@ -174,7 +199,7 @@ In summary, a simply example data loader for a mouse lung data set could look li
def _load(self, fn=None):
# assuming that I uploaded an h5ad somewhere (linked in self.download_url_data)
if fn is None:
fn = os.path.join(self.path, "mouse", "lung", "my.h5ad")
fn = os.path.join(self.path, self.directory_formatted_doi, "my.h5ad")
self.adata = anndata.read(fn)


@@ -244,7 +269,28 @@ Metadata management

We constrain meta data by ontologies where possible. The current restrictions are:

- .age: unconstrained string, try using units of years for human and units of months for mice
- .dev_stage: unconstrained string, this will be constrained to an ontology in the future,
try choosing from HSAPDV (http://www.obofoundry.org/ontology/hsapdv.html) for human
or from MMUSDV (http://www.obofoundry.org/ontology/mmusdv.html) for mouse
- .ethnicity: unconstrained string, this will be constrained to an ontology in the future,
try choosing from HANCESTRO (https://www.ebi.ac.uk/ols/ontologies/hancestro)
- .healthy: bool
- .normalisation: unconstrained string, this will be constrained to an ontology in the future,
try using {"raw", "scaled"}
- .organ: unconstrained string, this will be constrained to an ontology in the future, try to choose a
term from Uberon (http://www.obofoundry.org/ontology/uberon.html)
or from EHDAA2 (http://www.obofoundry.org/ontology/ehdaa2.html) for human
or from EMAPA (http://www.obofoundry.org/ontology/emapa.html) for mouse
- .organism: constrained string, {"mouse", "human"}. In the future, we will use NCBITAXON
(http://www.obofoundry.org/ontology/ncbitaxon.html).
- .protocol: unconstrained string, this will be constrained to an ontology in the future,
try choosing a term from https://www.ebi.ac.uk/ols/ontologies/efo/terms?iri=http%3A%2F%2Fwww.ebi.ac.uk%2Fefo%2FEFO_0010183&viewMode=All&siblings=false
- .sex: constrained string, {"female", "male"}
- .state_exact: unconstrained string, try to be concise and anticipate that this field is queried by automated searches.
If you give treatment concentrations, intervals or similar measurements, use square brackets around the quantity
and use units: `[1g]`
- .year: must be an integer year, e.g. 2020
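
For orientation, a hedged sketch of how a few of these constrained fields might be set inside a data loader
constructor (the values are illustrative only):

.. code-block:: python

    self.organism = "human"            # constrained: {"mouse", "human"}
    self.sex = "female"                # constrained: {"female", "male"}
    self.healthy = True                # bool
    self.normalisation = "raw"         # try using {"raw", "scaled"}
    self.age = "80"                    # unconstrained string, units of years for human
    self.state_exact = "treated [1g]"  # quantities in square brackets with units
    self.year = 2020                   # integer year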

Follow this issue_ for details on upcoming ontology integrations.

@@ -253,10 +299,119 @@
Genome management
-----------------

You do not have to worry about this section unless you are interested;
it is not required reading for writing data loaders.

We streamline feature spaces used by models by defining standardized gene sets that are used as model input.
By default, sfaira currently works with the protein-coding genes of a genome assembly.
A model topology version includes the genome assembly it was trained for, which also defines the features (genes) of this model.
As genome assemblies are updated, model topology versions can be updated and models retrained to reflect these changes.
Note that because protein-coding genes do not change drastically between genome assemblies,
samples can be carried over to assemblies they were not aligned against by matching gene identifiers.
Sfaira automatically tries to overlap gene identifiers with the genome assembly selected through the current model.
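
As a purely illustrative sketch (not sfaira's implementation), matching a data set to a fixed assembly gene set
amounts to something like the following; zero-padding of assembly genes missing from the data would additionally
be needed to obtain a fixed-length model input:

.. code-block:: python

    import anndata

    def subset_to_assembly(adata: anndata.AnnData, assembly_genes: list) -> anndata.AnnData:
        # Keep the genes of the target assembly that are present in the data set, in assembly order.
        present_in_data = set(adata.var_names)
        shared = [g for g in assembly_genes if g in present_in_data]
        return adata[:, shared].copy()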

FAQ
---

How is the dataset’s ID structured?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Organism_Organ_Year_Protocol_FirstAuthorLastname_NumberOfDataset_doi,
for example "human_colon_2019_10x_smilie_001_10.1016/j.cell.2019.06.029".

How do I assemble the data set ID if some of its meta data elements are not unique?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The data set ID is designed to be a unique identifier of a data set.
Therefore, it is not an issue if it does not capture the full complexity of the data.
Simply choose the meta data value that comes first in the alphabet out of the list of corresponding values
(e.g. a data set containing both colon and ileum samples would use "colon" in the ID).

What are cell-wise and sample-wise meta data?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Metadata can be set on a per-sample level or, in some cases, per cell.
Sample-wise meta data can be set directly in the constructor (e.g. self.organism = "human").
Cell-wise meta data can be provided in `.obs` of the loaded data; here,
a Dataset attribute contains the name of the `.obs` column that holds these cell-wise labels
(e.g. self.obs_key_organism).
Note that sample-wise meta data should be supplied as such and not as a column in `.obs`, to simplify loading.
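
A minimal sketch of the two options (attribute names as used above, values illustrative):

.. code-block:: python

    # sample-wise: one value for the whole data set
    self.organism = "human"

    # cell-wise: the values live in a column of the loaded .obs
    self.obs_key_organism = "organism"  # self.adata.obs["organism"] holds per-cell labels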

Which meta data objects are optional?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Mandatory on sample (self.attribute) or cell level (self.obs_key_attribute):

- .id: Dataset ID. This is used to identify the data set uniquely.
Example: self.id = "human_colon_2019_10x_smilie_001_10.1016/j.cell.2019.06.029"
- .download_url_data: Link to data download website.
Example: self.download_url_data = "some URL"
- .download_url_meta: Download link to metadata. Assumes that meta data is defined in .download_url_data if not
specified.
Example: self.download_url_meta = "some URL"
- .var_symbol_col, .var_ensembl_col: Location of gene name as gene symbol and/or ENSEMBL ID in adata.var
(if index of adata.var, set to “index”, otherwise to column name). One of the two must be provided.
Example: self.var_symbol_col = "index", self.var_ensembl_col = "GeneID"
- .author: First author of publication (or list of all authors).
self.author = "Last name, first name" # or ["Last name, first name", "Last name, first name"]
- .doi: DOI of publication.
Example: self.doi = "10.1016/j.cell.2019.06.029"
- .organism (or .obs_key_organism): Organism sampled.
Example: self.organism = "human"

Highly recommended:

- .normalization: Normalization of count data.
Example: self.normalization = "raw"
- .organ (or .obs_key_organ): Organ sampled.
Example: self.organ = "liver"
- .protocol (or .obs_key_protocol): Protocol with which the data was collected.
Example: self.protocol = "10x"

Optional (if available):

- .age (or .obs_key_age): Age of individual sampled.
Example: self.age = 80 # (80 years old for human)
- .dev_stage (or .obs_key_dev_stage): Developmental stage of individual sampled.
Example: self.dev_stage = "mature"
- .ethnicity (or .obs_key_ethnicity): Ethnicity of individual sampled (only for human).
Example: self.ethnicity = "free text"
- .healthy (or .obs_key_healthy): Whether the sampled individual is healthy, i.e. not diseased (bool).
Example: self.healthy = True
- .sex (or .obs_key_sex): Sex of individual sampled.
Example: self.sex = "male"
- .state_exact (or .obs_key_state_exact): Exact disease state.
Example: self.state_exact = "free text"
- .obs_key_cellontology_original: Column in .obs in which free text cell type names are stored.
Example: self.obs_key_cellontology_original = 'CellType'
- .year: Year of publication.
Example: self.year = 2019

How do I cache data sets?
~~~~~~~~~~~~~~~~~~~~~~~~~
When loading a dataset with `Dataset.load()`, you can specify whether the adata object
should be cached (allow_caching=True).
If set to True, the loaded adata object will be cached as an h5ad object for faster reloading.
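
A hedged usage sketch, assuming a Dataset instance `ds`:

.. code-block:: python

    ds.load(allow_caching=True)   # caches the loaded adata as an h5ad for faster reloading
    ds.load(allow_caching=False)  # always parses the raw files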

How do I add cell type annotation?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
We are simplifying this right now; new instructions will be available in the second half of January.

Why are constructor (`__init__`) and loading function (`_load`) split in the template data loader?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Initiation and data set loading are handled separately to allow lazy loading.
All steps that are required to load the count data and
additional metadata should be defined solely in the `_load` section.
Setting of class metadata such as `.doi`, `.id` etc. should be done in the constructor.
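
A hedged sketch of what this split enables (the loader class names below are hypothetical placeholders):

.. code-block:: python

    # Constructing Dataset objects is cheap: only meta data is set, no raw files are read.
    datasets = [Dataset_d10_1016_xxx(path=path), Dataset_d10_1101_yyy(path=path)]

    # Filter on constructor meta data without touching count data.
    lung_datasets = [d for d in datasets if d.organ == "lung"]

    # Only now are the raw files read into memory.
    for d in lung_datasets:
        d.load()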

How do I tell sfaira where the gene names are?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
By setting the attributes `.var_symbol_col` or `.var_ensembl_col` in the constructor.
If the gene names are in the index of this data frame, you can set "index" as the value of these attributes.
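
For example (the column name is illustrative):

.. code-block:: python

    self.var_symbol_col = "index"    # gene symbols are the index of self.adata.var
    self.var_ensembl_col = "GeneID"  # ENSEMBL IDs are in self.adata.var["GeneID"]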

I only have gene symbols (human readable names, often abbreviations), such as HGNC or MGI, but not ENSEMBL identifiers, is that a problem?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
No, that is not a problem. They will automatically be converted to Ensembl IDs.
You can, however, specify the reference genome in `Dataset.load(match_to_reference=ReferenceGenomeName)`
to which the names should be mapped.
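
A hedged usage sketch; the assembly name below is a placeholder, not a verified identifier:

.. code-block:: python

    ds.load(match_to_reference="Homo_sapiens_GRCh38_97")  # map gene symbols to this assembly's ENSEMBL IDs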

I have CITE-seq data, where can I put the protein quantification?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
We will soon provide a structured interface for loading and accessing CITE-seq data;
for now you can add it into `self.adata.obsm["CITE"]`.
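
Following that interim recommendation, inside `_load` this could look like the following
(`protein_counts` is a hypothetical placeholder for your cells x proteins matrix):

.. code-block:: python

    self.adata.obsm["CITE"] = protein_counts  # cells x proteins quantification, e.g. a numpy array
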
2 changes: 1 addition & 1 deletion sfaira/consts/adata_fields.py
@@ -199,7 +199,7 @@ def __init__(self):
self.organ_allowed_entries = None
self.organism_allowed_entries = ["mouse", "human"]
self.protocol_allowed_entries = None
self.sex_allowed_entries = None
self.sex_allowed_entries = ["female", "male"]
self.subtissue_allowed_entries = None
self.year_allowed_entries = list(range(2000, 3000))
# Free fields that are not constrained:
12 changes: 6 additions & 6 deletions sfaira/data/base.py
@@ -152,10 +152,6 @@ def _load(self, fn):
def _download(self, fn):
pass

@property
def _directory_formatted_doi(self) -> str:
return "d" + "_".join("_".join("_".join(self.doi.split("/")).split(".")).split("-"))

@property
def _directory_formatted_id(self) -> str:
return "_".join("_".join(self.id.split("/")).split("."))
@@ -200,11 +196,11 @@ def _load_cached(
raise ValueError("provide either fn in load or path in constructor")

assert self.cache_path is not None, "set self.cache_path first"
assert self._directory_formatted_doi is not None, "set self.doi first"
assert self.directory_formatted_doi is not None, "set self.doi first"
assert self._directory_formatted_id is not None, "set self.id first"
fn_cache = os.path.join(
self.cache_path,
self._directory_formatted_doi,
self.directory_formatted_doi,
self._directory_formatted_id + ".h5ad"
)
# Check if raw loader has to be called:
@@ -872,6 +868,10 @@ def doi(self, x: str):
self.__erasing_protection(attr="doi", val_old=self._doi, val_new=x)
self._doi = x

@property
def directory_formatted_doi(self) -> str:
return "d" + "_".join("_".join("_".join(self.doi.split("/")).split(".")).split("-"))

@property
def download(self) -> Union[Tuple[List[str]], Tuple[List[None]]]:
"""