theislab · davidsebfischer · Oct 22, 2021 · Sep 14, 2021 · Sep 16, 2021 · Sep 16, 2021
diff --git a/.github/workflows/create_templates.yml b/.github/workflows/create_templates.yml
@@ -29,5 +29,5 @@ jobs:
 
       - name: Create single_dataset template
         run: |
-          echo -e "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" | sfaira create-dataloader
+          echo -e "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" | sfaira create-dataloader
           rm -rf d10_1000_j_journal_2021_01_001/
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,7 @@ docs/api/
 sfaira/unit_tests/data_for_testing/mock_data/store*
 **cache
 **temp
+sfaira/unit_tests/template_data/*
 
 # General patterns:
 git abuild

diff --git a/README.rst b/README.rst
@@ -25,11 +25,11 @@
 sfaira - data and model repository for single-cell data
 =======================================================
 
-.. image:: https://github.com/theislab/sfaira/blob/master/resources/images/concept.png
-   :width: 1000px
+.. image:: https://github.com/theislab/sfaira/blob/main/resources/images/concept.png
+   :width: 400px
    :align: center
 
-sfaira_ is a model and a data repository in a single python package (preprint_).
+sfaira_ is a model and a data repository in a single python package (`full paper`_).
 We provide an interactive overview of the current state of the zoos on sfaira-portal_.
 
 Its data zoo gives users access to streamlined data loaders that allow reproducible use of published and private data sets for model training and exploration.
@@ -40,7 +40,7 @@ sfaira integrates into scanpy_ workflows.
 
 .. _scanpy: https://github.com/theislab/scanpy
 .. _sfaira: https://sfaira.readthedocs.io
-.. _preprint: https://www.biorxiv.org/content/10.1101/2020.12.16.419036v1
+.. _full paper: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02452-6
 .. _DCA: https://github.com/theislab/dca
 .. _scArches: https://github.com/theislab/scarches
 .. _sfaira-portal: https://theislab.github.io/sfaira-portal/
diff --git a/docs/adding_datasets.rst b/docs/adding_datasets.rst
@@ -14,6 +14,9 @@ This process requires a couple of steps as outlined in the following sections.
    Overview of contributing dataloaders to sfaira. First, ensure that your data is not yet available as a dataloader.
    Next, create a dataloader. Afterwards, validate/annotate it to finally test it. Finally, submit your dataloader to sfaira.
 
+Step-by-step guide through CLI
+------------------------------
+
 sfaira features an interactive way of creating, formatting and testing dataloaders through a command line interface (CLI).
 The common workflow using the CLI looks as follows:
 

diff --git a/docs/data_life_cycle.rst b/docs/data_life_cycle.rst
@@ -15,6 +15,9 @@ where step 1-3 is often only performed once by the original authors of the data
 while step 4 and 5 are repeated multiple times in the community for different meta studies.
 Sfaira offers the following functionality groups that accelerate steps along this pipeline:
 
+Sfaira tools across life cycle
+------------------------------
+
 I) Data loaders
 ~~~~~~~~~~~~~~~
 We maintain streamlined data loader code that improve **Curation** (step 4) and make this step sharable and iteratively improvable.
@@ -31,3 +34,20 @@ III) Stores
 Using the streamlined data set collections from (II), we built a computationally efficient data interface for machine learning on such large distributed data set collection, thus improving **Usage** (step 5):
 Specifically, this interface is optimised for out-of-core observation-centric indexing in scenarios that are typical to machine learning on single-cell data.
 Read more in our guide to data stores :ref:`distributed_data_rst`.
+
+FAIR data
+---------
+
+The FAIR_ principles are a set of data management guidelines that are designed to improve data reuse and automated access
+(see the original publication of FAIR_ for more details).
+The key data management topics addressed by FAIR_ are findability, accessibility, interoperability and reusability.
+Single-cell data sets are usually public and adhere to the FAIR_ principles to varying degrees.
+We designed sfaira so that it improves FAIR_ attributes of published data sets beyond their state at publication.
+Specifically, sfaira:
+
+- improves **findability** of data sets by serving data sets through complex meta data queries.
+- improves **accessibility** of data sets by serving streamlined data sets.
+- improves **interoperability** of data sets by streamlining data using versioned meta data ontologies.
+- improves **reusability** of data sets by allowing for iterative improvements of meta data annotation and by shipping usage-critical meta data.
+
+.. _FAIR: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4792175/
diff --git a/requirements.txt b/requirements.txt
@@ -1,16 +1,18 @@
 anndata>=0.7.6
 crossref_commons
-cellxgene-schema
+cellxgene-schema>=2.0.3
 dask
 docutils
 fuzzywuzzy
 IPython
 loompy
 matplotlib
 networkx
+numba
 numpy>=1.16.4
 obonet
 openpyxl
+owlready2
 pandas
 pyarrow
 pytest>=6.2.2

diff --git a/sfaira/cli.py b/sfaira/cli.py
@@ -15,7 +15,7 @@
 from sfaira.commands.validate_dataloader import DataloaderValidator
 from sfaira.commands.validate_h5ad import H5adValidator
 
-import sfaira
+from sfaira import __version__
 from sfaira.commands.create_dataloader import DataloaderCreator
 from sfaira.commands.upgrade import UpgradeCommand
 
@@ -43,7 +43,7 @@ def main():
 
 
 @click.group()
-@click.version_option(sfaira.__version__, message=click.style(f'sfaira Version: {sfaira.__version__}', fg='blue'))
+@click.version_option(__version__, message=click.style(f'sfaira Version: {__version__}', fg='blue'))
 @click.option('-v', '--verbose', is_flag=True, default=False, help='Enable verbose output (print debug statements).')
 @click.option("-l", "--log-file", help="Save a verbose log to a file.")
 @click.pass_context

diff --git a/sfaira/commands/create_dataloader.py b/sfaira/commands/create_dataloader.py
@@ -36,10 +36,13 @@ class TemplateAttributes:
     ethnicity: str = ''  # ethnicity of the sample
     sample_source: str = ''  # source of the sample
     state_exact: str = ''  # state of the sample
-    year: str = 2021  # year in which sample was acquired
+    year: int = 2021  # year in which sample was acquired
     number_of_datasets: str = 1  # Required to determine the file names
 
-    cell_types_original_obs_key: str = ''  # Original cell type key in obs
+    cell_type_obs_key: str = ''  # Original cell type key in obs
+
+    gene_id_ensembl_var_key: str = ''  # Gene id ensembl key in var
+    gene_id_symbols_var_key: str = ''  # Gene id symbols key in var
 
 
 class DataloaderCreator:
@@ -117,9 +120,13 @@ def _prompt_dataloader_configuration(self):
         self.template_attributes.primary_data = str(sfaira_questionary(function='confirm',
                                                                        question='Primary data:',
                                                                        default='Yes'))
-        self.template_attributes.default_embedding = sfaira_questionary(function='text',
-                                                                        question='Default embedding:',
-                                                                        default='NA')
+        is_default_embedding = sfaira_questionary(function='confirm',
+                                                  question='Does your dataset have a default embedding?',
+                                                  default='No')
+        if is_default_embedding:
+            self.template_attributes.default_embedding = str(sfaira_questionary(function='text',
+                                                                                question='Default embedding obsm key:',
+                                                                                default='X_umap'))
         self.template_attributes.organism = sfaira_questionary(function='text',
                                                                question='Organism:',
                                                                default='NA')
@@ -145,12 +152,26 @@ def _prompt_dataloader_configuration(self):
                                                      question='Does your dataset have a cell type annotation?',
                                                      default='No')
         if is_cell_type_annotation:
-            self.template_attributes.cell_types_original_obs_key = sfaira_questionary(function='text',
-                                                                                      question='Cell type annotation obs key:',
-                                                                                      default='')
+            self.template_attributes.cell_type_obs_key = sfaira_questionary(function='text',
+                                                                            question='Cell type annotation obs key:',
+                                                                            default='')
+        is_gene_id_symbols = sfaira_questionary(function='confirm',
+                                                question='Does your dataset have gene ID symbols (gene names, e.g. TP53)?',
+                                                default='No')
+        if is_gene_id_symbols:
+            self.template_attributes.gene_id_symbols_var_key = sfaira_questionary(function='text',
+                                                                                  question='Gene id symbols var key:',
+                                                                                  default='index')
+        is_gene_id_ensembl = sfaira_questionary(function='confirm',
+                                                question='Does your dataset have Ensembl gene IDs (e.g. ENSG00000141510)?',
+                                                default='No')
+        if is_gene_id_ensembl:
+            self.template_attributes.gene_id_ensembl_var_key = sfaira_questionary(function='text',
+                                                                                  question='Gene id ensembl var key:',
+                                                                                  default='')
         self.template_attributes.year = sfaira_questionary(function='text',
                                                            question='Year:',
-                                                           default='2021')
+                                                           default=2021)
         first_author = author[0] if isinstance(author, list) else author
         try:
             first_author_lastname = first_author.split(',')[0]
@@ -159,7 +180,7 @@ def _prompt_dataloader_configuration(self):
             first_author_lastname = first_author
         self.template_attributes.id_without_doi = f'{clean_id_str(self.template_attributes.organism)}_' \
                                                   f'{clean_id_str(self.template_attributes.organ)}_' \
-                                                  f'{clean_id_str(self.template_attributes.year)}_' \
+                                                  f'{clean_id_str(str(self.template_attributes.year))}_' \
                                                   f'{clean_id_str(self.template_attributes.assay_sc)}_' \
                                                   f'{clean_id_str(first_author_lastname)}_001'
         self.template_attributes.id = f'{self.template_attributes.id_without_doi}_' \

diff --git a/sfaira/commands/templates/multiple_datasets/cookiecutter.json b/sfaira/commands/templates/multiple_datasets/cookiecutter.json
@@ -21,5 +21,7 @@
     "primary_data": "",
     "default_embedding": "",
     "create_extra_description": "",
-    "cell_types_original_obs_key": ""
+    "cell_type_obs_key": "",
+    "gene_id_ensembl_var_key": "",
+    "gene_id_symbols_var_key": ""
   }
diff --git a/...ltiple_datasets/{{ cookiecutter.doi_sfaira_repr }}/{{ cookiecutter.id_without_doi }}.yaml b/...ltiple_datasets/{{ cookiecutter.doi_sfaira_repr }}/{{ cookiecutter.id_without_doi }}.yaml
@@ -5,7 +5,7 @@ dataset_structure:
 {% endfor %}dataset_wise:
     author: "{{ cookiecutter.author }}"
     default_embedding:
-{% for fn in cookiecutter.sample_fns.fns %}        {{ fn }}: "{{ cookiecutter.default_embedding }}"
+{% for fn in cookiecutter.sample_fns.fns %}        {{ fn }}: {% if cookiecutter.default_embedding %}"{{ cookiecutter.default_embedding }}"{% endif %}
 {% endfor %}    doi_preprint:
     doi_journal: "{{ cookiecutter.doi }}"
     download_url_data:
@@ -14,7 +14,7 @@ dataset_structure:
 {% for fn in cookiecutter.sample_fns.fns %}        {{ fn }}:
 {% endfor %}    primary_data: {{ cookiecutter.primary_data }}
     normalization: "{{ cookiecutter.normalization }}"
-    year: "{{ cookiecutter.year }}"
+    year: {{ cookiecutter.year }}
 dataset_or_observation_wise:
     assay_sc:
 {% for fn in cookiecutter.sample_fns.fns %}        {{ fn }}: "{{ cookiecutter.assay_sc }}"
@@ -62,9 +62,9 @@ dataset_or_observation_wise:
 {% for fn in cookiecutter.sample_fns.fns %}        {{ fn }}:
 {% endfor %}    tech_sample_obs_key:
 observation_wise:
-    cell_types_original_obs_key: "{{ cookiecutter.cell_types_original_obs_key }}"
+    cell_type_obs_key: {% if cookiecutter.cell_type_obs_key %}"{{ cookiecutter.cell_type_obs_key }}"{% endif %}
 feature_wise:
-    gene_id_ensembl_var_key:
-    gene_id_symbols_var_key:
+    gene_id_ensembl_var_key: {% if cookiecutter.gene_id_ensembl_var_key %}"{{ cookiecutter.gene_id_ensembl_var_key }}"{% endif %}
+    gene_id_symbols_var_key: {% if cookiecutter.gene_id_symbols_var_key %}"{{ cookiecutter.gene_id_symbols_var_key }}"{% endif %}
 meta:
     version: "1.0"
diff --git a/sfaira/commands/templates/single_dataset/cookiecutter.json b/sfaira/commands/templates/single_dataset/cookiecutter.json
@@ -21,5 +21,7 @@
     "primary_data": "",
     "default_embedding": "",
     "create_extra_description": "",
-    "cell_types_original_obs_key": ""
-  }
+    "cell_type_obs_key": "",
+    "gene_id_ensembl_var_key": "",
+    "gene_id_symbols_var_key": ""
+}
diff --git a/.../single_dataset/{{ cookiecutter.doi_sfaira_repr }}/{{ cookiecutter.id_without_doi }}.yaml b/.../single_dataset/{{ cookiecutter.doi_sfaira_repr }}/{{ cookiecutter.id_without_doi }}.yaml
@@ -4,14 +4,14 @@ dataset_structure:
         - "{{ cookiecutter.sample_fns }}"
 dataset_wise:
     author: "{{ cookiecutter.author }}"
-    default_embedding: "{{ cookiecutter.default_embedding }}"
+    default_embedding: {% if cookiecutter.default_embedding %}"{{ cookiecutter.default_embedding }}"{% endif %}
     doi_preprint:
     doi_journal: "{{ cookiecutter.doi }}"
     download_url_data: "{{ cookiecutter.download_url_data }}"
     download_url_meta: "{{ cookiecutter.download_url_meta }}"
     primary_data: {{ cookiecutter.primary_data }}
     normalization: "{{ cookiecutter.normalization }}"
-    year: "{{ cookiecutter.year }}"
+    year: {{ cookiecutter.year }}
 dataset_or_observation_wise:
     assay_sc: "{{ cookiecutter.assay_sc }}"
     assay_sc_obs_key:
@@ -44,9 +44,9 @@ dataset_or_observation_wise:
     tech_sample:
     tech_sample_obs_key:
 observation_wise:
-    cell_types_original_obs_key: "{{ cookiecutter.cell_types_original_obs_key }}"
+    cell_type_obs_key: {% if cookiecutter.cell_type_obs_key %}"{{ cookiecutter.cell_type_obs_key }}"{% endif %}
 feature_wise:
-    gene_id_ensembl_var_key:
-    gene_id_symbols_var_key:
+    gene_id_ensembl_var_key: {% if cookiecutter.gene_id_ensembl_var_key %}"{{ cookiecutter.gene_id_ensembl_var_key }}"{% endif %}
+    gene_id_symbols_var_key: {% if cookiecutter.gene_id_symbols_var_key %}"{{ cookiecutter.gene_id_symbols_var_key }}"{% endif %}
 meta:
     version: "1.0"
diff --git a/sfaira/commands/validate_dataloader.py b/sfaira/commands/validate_dataloader.py
@@ -78,7 +78,7 @@ def _validate_required_attributes(self):
                       'dataset_or_observation_wise:organism',
                       'dataset_or_observation_wise:sample_source',
                       ['feature_wise:gene_id_ensembl_var_key',
-                       'feature_wise:gene_id_symbol_var_key']]
+                       'feature_wise:gene_id_symbols_var_key']]
 
         # TODO This is some spaghetti which could be more performant with set look ups.
         flattened_dict = flatten(self.content, reducer=make_reducer(delimiter=':'))

diff --git a/sfaira/consts/__init__.py b/sfaira/consts/__init__.py
@@ -1,6 +1,5 @@
-from sfaira.consts.adata_fields import AdataIds, AdataIdsSfaira, AdataIdsCellxgene, AdataIdsCellxgeneGeneral, \
-    AdataIdsCellxgeneHuman_v1_1_0, AdataIdsCellxgeneMouse_v1_1_0
-from sfaira.consts.directories import CACHE_DIR
+from sfaira.consts.adata_fields import AdataIds, AdataIdsSfaira, AdataIdsCellxgene, AdataIdsCellxgene_v2_0_0
+from sfaira.consts.directories import CACHE_DIR, SFAIRA_REPO_URL
 from sfaira.consts.meta_data_files import META_DATA_FIELDS
 from sfaira.consts.ontologies import OntologyContainerSfaira
 from sfaira.consts.utils import clean_cache