Merge remote-tracking branch 'origin/main' into add_cpa

openproblems-bio · Oct 31, 2024 · 2c95a19 · 2c95a19
2 parents ad287df + 2fa4446
commit 2c95a19
Show file tree

Hide file tree

Showing 75 changed files with 703 additions and 489 deletions.
diff --git a/README.md b/README.md
@@ -139,8 +139,8 @@ perturbation responses in different biological contexts.
 flowchart LR
   file_sc_counts("Single Cell Counts")
   comp_process_dataset[/"Process dataset"/]
-  file_de_train_h5ad("DE train")
-  file_de_test_h5ad("DE test")
+  file_de_train("DE train")
+  file_de_test("DE test")
   file_id_map("ID Map")
   comp_control_method[/"Control Method"/]
   comp_method[/"Method"/]
@@ -149,13 +149,13 @@ flowchart LR
   file_model("Model")
   file_score("Score")
   file_sc_counts---comp_process_dataset
-  comp_process_dataset-->file_de_train_h5ad
-  comp_process_dataset-->file_de_test_h5ad
+  comp_process_dataset-->file_de_train
+  comp_process_dataset-->file_de_test
   comp_process_dataset-->file_id_map
-  file_de_train_h5ad---comp_control_method
-  file_de_train_h5ad---comp_method
-  file_de_test_h5ad---comp_control_method
-  file_de_test_h5ad---comp_metric
+  file_de_train---comp_control_method
+  file_de_train---comp_method
+  file_de_test---comp_control_method
+  file_de_test---comp_metric
   file_id_map---comp_control_method
   file_id_map---comp_method
   comp_control_method-->file_prediction
@@ -224,8 +224,8 @@ Arguments:
 | Name              | Type   | Description                                                                                                         |
 |:------------------|:-------|:--------------------------------------------------------------------------------------------------------------------|
 | `--sc_counts`     | `file` | Anndata with the counts of the whole dataset.                                                                       |
-| `--de_train_h5ad` | `file` | (*Output*) Differential expression results for training. Default: `de_train.h5ad`.                                  |
-| `--de_test_h5ad`  | `file` | (*Output*) Differential expression results for testing. Default: `de_test.h5ad`.                                    |
+| `--de_train` | `file` | (*Output*) Differential expression results for training. Default: `de_train.h5ad`.                                  |
+| `--de_test`  | `file` | (*Output*) Differential expression results for testing. Default: `de_test.h5ad`.                                    |
 | `--id_map`        | `file` | (*Output*) File indicates the order of de_test, the cell types and the small molecule names. Default: `id_map.csv`. |
 
 </div>
@@ -371,8 +371,8 @@ Arguments:
 
 | Name              | Type     | Description                                                                         |
 |:------------------|:---------|:------------------------------------------------------------------------------------|
-| `--de_train_h5ad` | `file`   | (*Optional*) Differential expression results for training.                          |
-| `--de_test_h5ad`  | `file`   | Differential expression results for testing.                                        |
+| `--de_train` | `file`   | (*Optional*) Differential expression results for training.                          |
+| `--de_test`  | `file`   | Differential expression results for testing.                                        |
 | `--id_map`        | `file`   | File indicates the order of de_test, the cell types and the small molecule names.   |
 | `--layer`         | `string` | (*Optional*) Which layer to use for prediction. Default: `clipped_sign_log10_pval`. |
 | `--output`        | `file`   | (*Output*) Differential Gene Expression prediction.                                 |
@@ -392,7 +392,7 @@ Arguments:
 
 | Name              | Type     | Description                                                                                                         |
 |:------------------|:---------|:--------------------------------------------------------------------------------------------------------------------|
-| `--de_train_h5ad` | `file`   | (*Optional*) Differential expression results for training.                                                          |
+| `--de_train` | `file`   | (*Optional*) Differential expression results for training.                                                          |
 | `--id_map`        | `file`   | File indicates the order of de_test, the cell types and the small molecule names.                                   |
 | `--layer`         | `string` | (*Optional*) Which layer to use for prediction. Default: `clipped_sign_log10_pval`.                                 |
 | `--output`        | `file`   | (*Output*) Differential Gene Expression prediction.                                                                 |
@@ -413,7 +413,7 @@ Arguments:
 
 | Name                 | Type     | Description                                                                                   |
 |:---------------------|:---------|:----------------------------------------------------------------------------------------------|
-| `--de_test_h5ad`     | `file`   | Differential expression results for testing.                                                  |
+| `--de_test`     | `file`   | Differential expression results for testing.                                                  |
 | `--de_test_layer`    | `string` | (*Optional*) In which layer to find the DE data. Default: `clipped_sign_log10_pval`.          |
 | `--prediction`       | `file`   | Differential Gene Expression prediction.                                                      |
 | `--prediction_layer` | `string` | (*Optional*) In which layer to find the predicted DE data. Default: `prediction`.             |

diff --git a/_viash.yaml b/_viash.yaml
@@ -47,7 +47,7 @@ description: |
 
 version: dev
 license: MIT
-keywords: [single-cell, perturbation prediction, perturbation, openproblems, benchmark]
+keywords: [single-cell, perturbation prediction, perturbation, benchmark]
 links:
   issue_tracker: https://github.com/openproblems-bio/task_perturbation_prediction/issues
   repository: https://github.com/openproblems-bio/task_perturbation_prediction
@@ -121,7 +121,7 @@ viash_version: 0.9.0
 info:
   test_resources:
     - type: s3
-      path: s3://openproblems-data/resources/perturbation_prediction/datasets
+      path: s3://openproblems-data/resources/task_perturbation_prediction/datasets
       dest: resources/datasets
 
 # set default labels

diff --git a/common b/common
diff --git a/scripts/add_a_method.md b/scripts/add_a_method.md
diff --git a/scripts/build_components.sh b/scripts/build_components.sh
diff --git a/scripts/create_component/.gitignore b/scripts/create_component/.gitignore
@@ -0,0 +1,2 @@
+# if users change the scripts, the changes should not be committed.
+/create_*_*.sh
diff --git a/scripts/create_component/create_python_method.sh b/scripts/create_component/create_python_method.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/create_component \
+  --name my_python_method \
+  --language python \
+  --type method
diff --git a/scripts/create_component/create_python_metric.sh b/scripts/create_component/create_python_metric.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/create_component \
+  --name my_python_metric \
+  --language python \
+  --type metric
diff --git a/scripts/create_component/create_r_method.sh b/scripts/create_component/create_r_method.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/create_component \
+  --name my_r_method \
+  --language r \
+  --type method
diff --git a/scripts/create_component/create_r_metric.sh b/scripts/create_component/create_r_metric.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+
+common/scripts/create_component \
+  --name my_r_metric \
+  --language r \
+  --type metric
diff --git a/scripts/create_readme.sh b/scripts/create_readme.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
 
-common/create_task_readme/create_task_readme \
-  --task_dir src \
-  --output README.md
+set -e
+
+common/scripts/create_task_readme --input src/api
diff --git a/scripts/datasets/neurips-2023-data.sh → scripts/create_resources/neurips-2023-data.sh b/scripts/datasets/neurips-2023-data.sh → scripts/create_resources/neurips-2023-data.sh
@@ -42,19 +42,19 @@ nextflow run \
 
 echo ">> Run method"
 viash run src/control_methods/mean_across_compounds/config.vsh.yaml -- \
-  --de_train_h5ad "$OUT/de_train.h5ad" \
-  --de_test_h5ad "$OUT/de_test.h5ad" \
+  --de_train "$OUT/de_train.h5ad" \
+  --de_test "$OUT/de_test.h5ad" \
   --id_map "$OUT/id_map.csv" \
   --output "$OUT/prediction.h5ad"
 
 echo ">> Run metric"
 viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \
   --prediction "$OUT/prediction.h5ad" \
-  --de_test_h5ad "$OUT/de_test.h5ad" \
+  --de_test "$OUT/de_test.h5ad" \
   --output "$OUT/score.h5ad"
 
 echo ">> Uploading results to S3"
-# aws s3 sync --profile op \
-#   "resources/datasets" \
-#   "s3://openproblems-data/resources/perturbation_prediction/datasets/" \
-#   --delete --dryrun
+aws s3 sync --profile op \
+  "resources/datasets" \
+  "s3://openproblems-data/resources/task_perturbation_prediction/datasets/" \
+  --delete --dryrun
diff --git a/scripts/datasets/neurips-2023-kaggle.sh → scripts/create_resources/neurips-2023-kaggle.sh b/scripts/datasets/neurips-2023-kaggle.sh → scripts/create_resources/neurips-2023-kaggle.sh
@@ -35,21 +35,21 @@ viash run src/process_dataset/convert_kaggle_h5ad_to_parquet/config.vsh.yaml --
 
 echo ">> Run method"
 viash run src/control_methods/mean_across_compounds/config.vsh.yaml -- \
-  --de_train_h5ad "$OUT/de_train.h5ad" \
-  --de_test_h5ad "$OUT/de_test.h5ad" \
+  --de_train "$OUT/de_train.h5ad" \
+  --de_test "$OUT/de_test.h5ad" \
   --id_map "$OUT/id_map.csv" \
   --output "$OUT/prediction.h5ad"
 
 echo ">> Run metric"
 viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \
   --prediction "$OUT/prediction.h5ad" \
-  --de_test_h5ad "$OUT/de_test.h5ad" \
+  --de_test "$OUT/de_test.h5ad" \
   --output "$OUT/score.h5ad"
 
 cat > "$OUT/state.yaml" <<'EOF'
 id: neurips-2023-kaggle
-de_train_h5ad: !file de_train.h5ad
-de_test_h5ad: !file de_test.h5ad
+de_train: !file de_train.h5ad
+de_test: !file de_test.h5ad
 id_map: !file id_map.csv
 EOF
 

diff --git a/scripts/download_resources.sh b/scripts/download_resources.sh
diff --git a/scripts/init_submodule.sh b/scripts/init_submodule.sh
diff --git a/scripts/project/build_all_components.sh b/scripts/project/build_all_components.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+# Build all components in a namespace (refer https://viash.io/reference/cli/ns_build.html)
+viash ns build --parallel
diff --git a/scripts/project/build_all_docker_containers.sh b/scripts/project/build_all_docker_containers.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+set -e
+
+# Build all components in a namespace (refer https://viash.io/reference/cli/ns_build.html)
+# and set up the container via a cached build
+viash ns build --parallel --setup cachedbuild
diff --git a/scripts/project/test_all_components.sh b/scripts/project/test_all_components.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+# Test all components in a namespace (refer https://viash.io/reference/cli/ns_test.html)
+viash ns test --parallel
diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+# NOTE: depending on the datasets and components, you may need to launch this workflow
+# on a different compute platform (e.g. a HPC, AWS Cloud, Azure Cloud, Google Cloud).
+# please refer to the nextflow information for more details:
+# https://www.nextflow.io/docs/latest/
+
+set -e
+
+echo "Running benchmark on test data"
+echo "  Make sure to run 'scripts/project/build_all_docker_containers.sh'!"
+
+# generate a unique id
+resources_dir="resources"
+RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
+publish_dir="resources/results/${RUN_ID}"
+
+# write the parameters to file
+cat > /tmp/params.yaml << HERE
+param_list:
+  - id: neurips-2023-data
+    de_train: "$resources_dir/neurips-2023-data/de_train.h5ad"
+    de_test: "$resources_dir/neurips-2023-data/de_test.h5ad"
+    id_map: "$resources_dir/neurips-2023-data/id_map.csv"
+    layer: clipped_sign_log10_pval
+  - id: neurips-2023-kaggle
+    de_train: "$resources_dir/neurips-2023-kaggle/de_train.h5ad"
+    de_test: "$resources_dir/neurips-2023-kaggle/de_test.h5ad"
+    id_map: "$resources_dir/neurips-2023-kaggle/id_map.csv"
+    layer: sign_log10_pval
+output_state: "state.yaml"
+publish_dir: "$publish_dir"
+HERE
+
+# run the benchmark
+nextflow run . \
+  -main-script target/nextflow/workflows/run_benchmark/main.nf \
+  -profile docker \
+  -resume \
+  -c common/nextflow_helpers/labels_ci.config \
+  -params-file /tmp/params.yaml
diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+set -e
+
+# generate a unique id
+RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
+resources_dir="s3://openproblems-data/resources/task_perturbation_prediction/datasets/"
+publish_dir="s3://openproblems-data/resources/task_perturbation_prediction/results/${RUN_ID}"
+
+# write the parameters to file
+cat > /tmp/params.yaml << HERE
+param_list:
+  - id: neurips-2023-data
+    de_train: "$resources_dir/neurips-2023-data/de_train.h5ad"
+    de_test: "$resources_dir/neurips-2023-data/de_test.h5ad"
+    id_map: "$resources_dir/neurips-2023-data/id_map.csv"
+    layer: clipped_sign_log10_pval
+  # - id: neurips-2023-kaggle
+  #   de_train: "$resources_dir/neurips-2023-kaggle/de_train.h5ad"
+  #   de_test: "$resources_dir/neurips-2023-kaggle/de_test.h5ad"
+  #   id_map: "$resources_dir/neurips-2023-kaggle/id_map.csv"
+  #   layer: sign_log10_pval
+output_state: "state.yaml"
+publish_dir: "$publish_dir"
+HERE
+
+tw launch https://github.com/openproblems-bio/task_perturbation_prediction.git \
+  --revision build/main \
+  --pull-latest \
+  --main-script target/nextflow/workflows/run_benchmark/main.nf \
+  --workspace 53907369739130 \
+  --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
+  --params-file /tmp/params.yaml \
+  --entry-name auto \
+  --config common/nextflow_helpers/labels_tw.config \
+  --labels task_perturbation_prediction,full
diff --git a/scripts/run_stability_test.sh → scripts/run_benchmark/run_stability_test.sh b/scripts/run_stability_test.sh → scripts/run_benchmark/run_stability_test.sh
diff --git a/scripts/run_stability_tw.sh → scripts/run_benchmark/run_stability_tw.sh b/scripts/run_stability_tw.sh → scripts/run_benchmark/run_stability_tw.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 RUN_ID="stability_$(date +%Y-%m-%d_%H-%M-%S)"
-publish_dir="s3://openproblems-data/resources/perturbation_prediction/results/${RUN_ID}"
+publish_dir="s3://openproblems-data/resources/task_perturbation_prediction/results/${RUN_ID}"
 
 cat > /tmp/params.yaml << HERE
 id: neurips-2023-data
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# if users change the scripts, the changes should not be committed.
		/create_*_*.sh