updates to configs

openproblems-bio · Sep 7, 2024 · dc7d140 · dc7d140
1 parent 2261364
commit dc7d140
Show file tree

Hide file tree

Showing 37 changed files with 199 additions and 250 deletions.
diff --git a/_viash.yaml b/_viash.yaml
@@ -1,16 +1,111 @@
 name: task_perturbation_prediction
-version: 1.1.0
-
-# package metadata
+label: Perturbation Prediction
+summary: Predicting how small molecules change gene expression in different cell types.
 description: |
-  Predicting how small molecules change gene expression in different cell types.
+  Human biology can be complex, in part due to the function and interplay of the body's
+  approximately 37 trillion cells, which are organized into tissues, organs, and systems.
+  However, recent advances in single-cell technologies have provided unparalleled insight
+  into the function of cells and tissues at the level of DNA, RNA, and proteins. Yet
+  leveraging single-cell methods to develop medicines requires mapping causal links
+  between chemical perturbations and the downstream impact on cell state. These experiments
+  are costly and labor intensive, and not all cells and tissues are amenable to
+  high-throughput transcriptomic screening. If data science could help accurately predict
+  chemical perturbations in new cell types, it could accelerate and expand the development
+  of new medicines.
+
+  Several methods have been developed for drug perturbation prediction, most of which are
+  variations on the autoencoder architecture (Dr.VAE, scGEN, and ChemCPA). However, these
+  methods lack proper benchmarking datasets with diverse cell types to determine how well
+  they generalize. The largest available training dataset is the NIH-funded Connectivity
+  Map (CMap), which comprises over 1.3M small molecule perturbation measurements. However,
+  the CMap includes observations of only 978 genes, less than 5% of all genes. Furthermore,
+  the CMap data is comprised almost entirely of measurements in cancer cell lines, which
+  may not accurately represent human biology.
+
+  This task aims to predict how small molecules change gene expression in different cell
+  types. This task was a [Kaggle competition](https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/overview)
+  as part of the [NeurIPS 2023 competition track](https://neurips.cc/virtual/2023/competition/66586).
+
+  The task is to predict the gene expression profile of a cell after a small molecule
+  perturbation. For this competition, we designed and generated a novel single-cell
+  perturbational dataset in human peripheral blood mononuclear cells (PBMCs). We
+  selected 144 compounds from the Library of Integrated Network-Based Cellular Signatures
+  (LINCS) Connectivity Map dataset ([PMID: 29195078](https://pubmed.ncbi.nlm.nih.gov/29195078/))
+  and measured single-cell gene
+  expression profiles after 24 hours of treatment. The experiment was repeated in three
+  healthy human donors, and the compounds were selected based on diverse transcriptional
+  signatures observed in CD34+ hematopoietic stem cells (data not released). We performed
+  this experiment in human PBMCs because the cells are commercially available with
+  pre-obtained consent for public release and PBMCs are a primary, disease-relevant tissue
+  that contains multiple mature cell types (including T-cells, B-cells, myeloid cells,
+  and NK cells) with established markers for annotation of cell types. To supplement this
+  dataset, we also measured cells from each donor at baseline with joint scRNA and
+  single-cell chromatin accessibility measurements using the 10x Multiome assay. We hope
+  that the addition of rich multi-omic data for each donor and cell type at baseline will
+  help establish biological priors that explain the susceptibility of particular genes to
+  exhibit perturbation responses in different biological contexts.
+
+version: dev
 license: MIT
 keywords: [single-cell, perturbation prediction, perturbation, openproblems, benchmark]
 links:
   issue_tracker: https://github.com/openproblems-bio/task_perturbation_prediction/issues
   repository: https://github.com/openproblems-bio/task_perturbation_prediction
   docker_registry: ghcr.io
 
+authors:
+  - name: Artur Szałata
+    roles: [ author ]
+    info:
+      github: szalata
+      orcid: "0000-0001-8413-234X"
+  - name: Robrecht Cannoodt
+    roles: [ author ]
+    info:
+      github: rcannood
+      orcid: "0000-0003-3641-729X"
+  - name: Daniel Burkhardt
+    roles: [ author ]
+    info:
+      github: dburkhardt
+      orcid: 0000-0001-7744-1363
+  - name: Malte D. Luecken
+    roles: [ author ]
+    info:
+      github: LuckyMD
+      orcid: 0000-0001-7464-7921
+  - name: Tin M. Tunjic
+    roles: [ contributor ]
+    info:
+      github: ttunja
+      orcid: 0000-0001-8842-6548
+  - name: Mengbo Wang
+    roles: [ contributor ]
+    info:
+      github: wangmengbo
+      orcid: 0000-0002-0266-9993
+  - name: Andrew Benz
+    roles: [ author ]
+    info:
+      github: andrew-benz
+      orcid: 0009-0002-8118-1861
+  - name: Tianyu Liu
+    roles: [ contributor ]
+    info:
+      github: HelloWorldLTY
+      orcid: 0000-0002-9412-6573
+  - name: Jalil Nourisa
+    roles: [ contributor ]
+    info:
+      github: janursa
+      orcid: 0000-0002-7539-4396
+  - name: Rico Meinl
+    roles: [ contributor ]
+    info:
+      github: ricomnl
+      orcid: 0000-0003-4356-6058
+
+
 # technical settings
 organization: openproblems-bio
 viash_version: 0.9.0
@@ -22,4 +117,4 @@ info:
 
 # set default labels
 config_mods: |
-  .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
+  .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
diff --git a/src/api/file_de_test.yaml b/src/api/file_de_test.yaml
@@ -1,10 +1,10 @@
 type: file
 example: resources/datasets/neurips-2023-data/de_test.h5ad
+label: DE test
+summary: "Differential expression results for testing."
 info:
-  label: DE test
-  summary: "Differential expression results for testing."
-  file_type: h5ad
-  slots:
+  format:
+    type: h5ad
     obs:
       - name: cell_type
         type: string

diff --git a/src/api/file_de_train.yaml b/src/api/file_de_train.yaml
@@ -1,10 +1,10 @@
 type: file
 example: resources/datasets/neurips-2023-data/de_train.h5ad
+label: DE train
+summary: "Differential expression results for training."
 info:
-  label: DE train
-  summary: "Differential expression results for training."
-  file_type: h5ad
-  slots:
+  format:
+    type: h5ad
     obs:
       - name: cell_type
         type: string

diff --git a/src/api/file_id_map.yaml b/src/api/file_id_map.yaml
@@ -1,19 +1,20 @@
 type: file
 example: resources/datasets/neurips-2023-data/id_map.csv
+label: ID Map
+summary: "File indicates the order of de_test, the cell types and the small molecule names."
 info:
-  label: ID Map
-  summary: "File indicates the order of de_test, the cell types and the small molecule names."
-  file_type: csv
-  columns:
-    - name: id
-      type: integer
-      description: Index of the test observation
-      required: true
-    - name: cell_type
-      type: string
-      description: "Cell type name"
-      required: true
-    - name: sm_name
-      type: string
-      description: "Small molecule name"
-      required: true
+  format:
+    type: csv
+    columns:
+      - name: id
+        type: integer
+        description: Index of the test observation
+        required: true
+      - name: cell_type
+        type: string
+        description: "Cell type name"
+        required: true
+      - name: sm_name
+        type: string
+        description: "Small molecule name"
+        required: true
diff --git a/src/api/file_model.yaml b/src/api/file_model.yaml
@@ -1,6 +1,4 @@
 type: file
 example: resources/datasets/neurips-2023-data/model/
-info:
-  label: Model
-  summary: "Optional model output. If no value is passed, the model will be removed at the end of the run."
-  file_type: directory
+label: Model
+summary: "Optional model output. If no value is passed, the model will be removed at the end of the run."
diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml
@@ -1,10 +1,10 @@
 type: file
 example: resources/datasets/neurips-2023-data/prediction.h5ad
+label: Prediction
+summary: "Differential Gene Expression prediction"
 info:
-  label: Prediction
-  summary: "Differential Gene Expression prediction"
-  file_type: h5ad
-  slots:
+  format:
+    type: h5ad
     layers:
       - name: prediction
         type: double

diff --git a/src/api/file_sc_counts.yaml b/src/api/file_sc_counts.yaml
@@ -1,10 +1,10 @@
 type: file
 example: resources/neurips-2023-raw/sc_counts.h5ad
+label: Single Cell Counts
+summary: "Anndata with the counts of the whole dataset."
 info:
-  label: Single Cell Counts
-  summary: "Anndata with the counts of the whole dataset."
-  file_type: h5ad
-  slots:
+  format:
+    type: h5ad
     obs:
       - name: dose_uM
         description: "Dose in micromolar."

diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml
@@ -1,10 +1,10 @@
 type: file
 example: resources/datasets/neurips-2023-data/score.h5ad
+label: Score
+summary: "File indicating the score of a metric."
 info:
-  label: Score
-  summary: "File indicating the score of a metric."
-  file_type: h5ad
-  slots:
+  format:
+    type: h5ad
     uns:
       - type: string
         name: dataset_id

diff --git a/src/api/task_info.yaml b/src/api/task_info.yaml