update runnable code
HelloWorldLTY committed Apr 28, 2024
1 parent 75e84b7 commit 3e43c11
Showing 18 changed files with 3,794 additions and 23 deletions.
95 changes: 95 additions & 0 deletions models.py
@@ -0,0 +1,95 @@
# Model Architecture
import torch
import torch.nn as nn


class CustomTransformer(nn.Module):
    def __init__(self, num_features, num_labels, d_model=128, num_heads=8, num_layers=6):
        super(CustomTransformer, self).__init__()
        # Dense embedding of the input features
        self.embedding = nn.Linear(num_features, d_model)
        # Alternative embedding layer for sparse features:
        # self.embedding = nn.Embedding(num_features, d_model)

        # self.norm = nn.BatchNorm1d(d_model, affine=True)
        self.norm = nn.LayerNorm(d_model)
        # Alternative: full encoder-decoder transformer
        # self.transformer = nn.Transformer(d_model=d_model, nhead=num_heads,
        #                                   num_encoder_layers=num_layers, dropout=0.1)
        # Note: avoid hard-coding device='cuda' in the layer; move the whole
        # module to a device with .to(device) so CPU-only runs still work.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=0.3,
                                       activation=nn.GELU(), batch_first=True),
            enable_nested_tensor=True, num_layers=num_layers
        )
        # Dropout layer for regularization
        # self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(d_model, num_labels)

    def forward(self, x):
        x = self.embedding(x)
        # x = self.transformer(x, x)  # for the nn.Transformer variant
        x = self.transformer(x)
        x = self.norm(x)
        # x = self.fc(self.dropout(x))
        x = self.fc(x)
        return x


class CustomTransformer_v3(nn.Module):  # embeds sparse features and target encodings (mean + std) separately
    def __init__(self, num_features, num_labels, d_model=128, num_heads=8, num_layers=6, dropout=0.3):
        super(CustomTransformer_v3, self).__init__()
        # The trailing 18211 * 4 input columns are target-encoding statistics
        self.num_target_encodings = 18211 * 4
        self.num_sparse_features = num_features - self.num_target_encodings

        self.sparse_feature_embedding = nn.Linear(self.num_sparse_features, d_model)
        self.target_encoding_embedding = nn.Linear(self.num_target_encodings, d_model)
        self.norm = nn.LayerNorm(d_model)

        self.concatenation_layer = nn.Linear(2 * d_model, d_model)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout,
                                       activation=nn.GELU(), batch_first=True),
            num_layers=num_layers
        )
        self.fc = nn.Linear(d_model, num_labels)

    def forward(self, x):
        # Split the input into sparse features and target encodings
        sparse_features = x[:, :self.num_sparse_features]
        target_encodings = x[:, self.num_sparse_features:]

        # Embed each part, concatenate, and project back down to d_model
        sparse_features = self.sparse_feature_embedding(sparse_features)
        target_encodings = self.target_encoding_embedding(target_encodings)

        combined_features = torch.cat((sparse_features, target_encodings), dim=1)
        combined_features = self.concatenation_layer(combined_features)
        combined_features = self.norm(combined_features)

        x = self.transformer(combined_features)
        x = self.norm(x)

        x = self.fc(x)
        return x


class CustomMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=6, dropout=0.3, layer_norm=True):
        super(CustomMLP, self).__init__()
        layers = []

        # Stack num_layers blocks of [LayerNorm ->] Linear -> ReLU [-> Dropout]
        for _ in range(num_layers):
            if layer_norm:
                layers.append(nn.LayerNorm(input_dim))
            layers.append(nn.Linear(input_dim, hidden_dim))
            layers.append(nn.ReLU())
            if dropout > 0:
                layers.append(nn.Dropout(p=dropout))
            input_dim = hidden_dim

        self.model = nn.Sequential(*layers)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.model(x)
        x = self.fc(x)
        return x
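A minimal smoke test for these modules, with illustrative sizes assumed here (256 input features, 18,211 targets, batch of 5); note that with 2-D input the batch-first encoder treats the batch as a single unbatched sequence, so rows attend to one another:

```python
# Smoke test (sizes are illustrative assumptions, not fixed by models.py)
import torch
from models import CustomTransformer, CustomMLP

num_features, num_labels = 256, 18211
model = CustomTransformer(num_features, num_labels).eval()
x = torch.randn(5, num_features)      # 5 feature vectors
with torch.no_grad():
    y = model(x)                      # 2-D input -> unbatched sequence of length 5
print(y.shape)                        # torch.Size([5, 18211])

mlp = CustomMLP(input_dim=num_features, hidden_dim=512, output_dim=num_labels).eval()
with torch.no_grad():
    print(mlp(x).shape)               # torch.Size([5, 18211])
```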

47 changes: 24 additions & 23 deletions scripts/add_a_method.sh
@@ -5,38 +5,39 @@
# exit 1

# # sync resources
# scripts/download_resources.sh
scripts/download_resources.sh

# create a new component
method_id="lb2"
method_lang="python" # change this to "r" if need be

# viash run src/common/create_component/config.vsh.yaml -- \
#   --language "$method_lang" \
#   --name "$method_id"
viash run src/common/create_component/config.vsh.yaml -- \
  --language "$method_lang" \
  --name "$method_id"

# TODO: fill in required fields in src/task/methods/foo/config.vsh.yaml
# TODO: edit src/task/methods/foo/script.py/R

# test the component
viash test src/task/methods/$method_id/config.vsh.yaml

# # rebuild the container (only if you change something to the docker platform)
# # You can reduce the memory and cpu allotted to jobs in _viash.yaml by modifying .platforms[.type == "nextflow"].config.labels
# viash run src/task/methods/$method_id/config.vsh.yaml -- \
#   ---setup cachedbuild ---verbose

# # run the method
# viash run src/task/methods/$method_id/config.vsh.yaml -- \
#   --de_train "resources/neurips-2023-kaggle/de_train.parquet" \
#   --id_map "resources/neurips-2023-kaggle/id_map.csv" \
#   --output "output/prediction.parquet"

# # run evaluation metric
# viash run src/task/metrics/mean_rowwise_rmse/config.vsh.yaml -- \
#   --de_test "resources/neurips-2023-kaggle/de_test.parquet" \
#   --prediction "output/prediction.parquet" \
#   --output "output/score.h5ad"

# # print score on kaggle test dataset
# python -c 'import anndata; print(anndata.read_h5ad("output/score.h5ad").uns)'
# rebuild the container (only if you change something to the docker platform)
# You can reduce the memory and cpu allotted to jobs in _viash.yaml by modifying .platforms[.type == "nextflow"].config.labels
viash run src/task/methods/$method_id/config.vsh.yaml -- \
  ---setup cachedbuild ---verbose

# run the method
viash run src/task/methods/$method_id/config.vsh.yaml -- \
  --de_train "resources/neurips-2023-kaggle/de_train.parquet" \
  --de_test "resources/neurips-2023-kaggle/de_test.parquet" \
  --id_map "resources/neurips-2023-kaggle/id_map.csv" \
  --output "output/prediction.parquet"

# run evaluation metric
viash run src/task/metrics/mean_rowwise_rmse/config.vsh.yaml -- \
  --de_test "resources/neurips-2023-kaggle/de_test.parquet" \
  --prediction "output/prediction.parquet" \
  --output "output/score.h5ad"

# print score on kaggle test dataset
python -c 'import anndata; print(anndata.read_h5ad("output/score.h5ad").uns)'
41 changes: 41 additions & 0 deletions src/common/sync_test_resources/config.vsh.yaml
@@ -0,0 +1,41 @@
functionality:
  name: "sync_test_resources"
  namespace: "common"
  version: "dev"
  description: Synchronise the test resources from S3 to resources_test
  usage: |
    sync_test_resources
    sync_test_resources --input s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/ --output resources
  arguments:
    - name: "--input"
      alternatives: ["-i"]
      type: string
      description: "Path to the S3 bucket to sync from."
      default: "s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/"
    - name: "--output"
      alternatives: ["-o"]
      type: file
      default: resources
      direction: output
      description: "Path to the test resource directory."
    - name: "--quiet"
      type: boolean_true
      description: "Does not display the operations performed by the specified command."
    - name: "--dryrun"
      type: boolean_true
      description: "Displays the operations that would be performed using the specified command without actually running them."
    - name: "--delete"
      type: boolean_true
      description: "Files that exist in the destination but not in the source are deleted during sync."
    - name: "--exclude"
      type: "string"
      multiple: true
      description: Exclude all files or objects from the command that match the specified pattern.
  resources:
    - type: bash_script
      path: script.sh
platforms:
  - type: docker
    image: "amazon/aws-cli:2.7.12"
  - type: native
  - type: nextflow
34 changes: 34 additions & 0 deletions src/common/sync_test_resources/script.sh
@@ -0,0 +1,34 @@
#!/bin/bash

## VIASH START
par_input='s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/'
par_output='resources_test'
## VIASH END

extra_params=( )

if [ "$par_quiet" == "true" ]; then
extra_params+=( "--quiet" )
fi
if [ "$par_dryrun" == "true" ]; then
extra_params+=( "--dryrun" )
fi
if [ "$par_delete" == "true" ]; then
extra_params+=( "--delete" )
fi

if [ ! -z ${par_exclude+x} ]; then
IFS=":"
for var in $par_exclude; do
unset IFS
extra_params+=( "--exclude" "$var" )
done
fi


# Disable the use of the Amazon EC2 instance metadata service (IMDS).
# see https://florian.ec/blog/github-actions-awscli-errors/
# or https://github.com/aws/aws-cli/issues/5234#issuecomment-705831465
export AWS_EC2_METADATA_DISABLED=true

aws s3 sync "$par_input" "$par_output" --no-sign-request "${extra_params[@]}"
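A hedged usage sketch for this component (assuming viash's default behavior of joining a `multiple: true` argument's repeated values with `:`, which matches the `IFS=":"` parsing above; the exclude patterns are examples only):

```bash
# Hypothetical invocation: dry-run sync that skips h5ad files and a temp folder
viash run src/common/sync_test_resources/config.vsh.yaml -- \
  --output resources_test \
  --exclude "*.h5ad" --exclude "temp/*" \
  --dryrun
```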
39 changes: 39 additions & 0 deletions src/task/methods/lb2/README.md
@@ -0,0 +1,39 @@
# Open Problems – Single-Cell Perturbations 2nd Place Solution

Solution to [Kaggle's Open Problems – Single-Cell Perturbations competition](https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/overview).

### Setup
CUDA version: cu121; Python: 3.8.
```bash
pip install -r requirements.txt
```


To train the models, modify `config_train.yaml` and run:

```bash
python train.py --config config_train.yaml
```


To run the models, modify `config_test.yaml` and run:

```bash
python predict.py --config config_test.yaml
```

Make sure you have a file named `sample_submission.csv` with the same layout
as the resulting prediction dataframe but filled with zeros; a sketch for creating one follows.
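A hedged sketch for generating that file (the metadata column names and the use of `id_map.csv` are assumptions based on the competition data, not on this repo's code):

```python
# Hypothetical sketch: build a zero-filled sample_submission.csv.
# The "id" column and the metadata column set are assumptions; adapt as needed.
import pandas as pd

id_map = pd.read_csv("id_map.csv")
de_train = pd.read_parquet("de_train.parquet")

# assume every non-metadata column in de_train is a target gene
meta_cols = {"cell_type", "sm_name", "sm_lincs_id", "SMILES", "control"}
gene_cols = [c for c in de_train.columns if c not in meta_cols]

submission = pd.DataFrame(0.0, index=id_map["id"], columns=gene_cols)
submission.to_csv("sample_submission.csv", index_label="id")
```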

### Hardware
- GPU: 1x Nvidia RTX 3080 (mobile)
- CPU: 11th Gen Intel(R) Core(TM) i7-11800H
- OS: Windows 11


## Sources
1. [Single Cell Perturbations Kaggle 2nd place solution](https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458738)
2. [Lion PyTorch by lucidrains](https://github.com/lucidrains/lion-pytorch)
3. [Understanding the Competition - Open Problems](https://www.kaggle.com/code/ayushs9020/understanding-the-competition-open-problems)
4. [OP2 EDA Baseline by alexandervc](https://www.kaggle.com/code/alexandervc/op2-eda-baseline-s)

68 changes: 68 additions & 0 deletions src/task/methods/lb2/config.vsh.yaml
@@ -0,0 +1,68 @@
# The API specifies which type of component this is.
# It contains specifications for:
# - The input/output files
# - Common parameters
# - A unit test
__merge__: ../../api/comp_method.yaml

functionality:
  # A unique identifier for your component (required).
  # Can contain only lowercase letters or underscores.
  name: lb2

  # Metadata for your component
  info:
    # A relatively short label, used when rendering visualisations (required)
    label: lb2
    # A one-sentence summary of how this method works (required). Used when
    # rendering summary tables.
    summary: "FILL IN: A one sentence summary of this method."
    # A multi-line description of how this component works (required). Used
    # when rendering reference documentation.
    description: |
      FILL IN: A (multi-line) description of how this method works.
    # Which normalisation method this component prefers to use (required).
    preferred_normalization: log_cp10k
    # A reference key from the bibtex library at src/common/library.bib (required).
    reference: bibtex_reference_key
    # URL to the documentation for this method (required).
    documentation_url: https://url.to/the/documentation
    # URL to the code repository for this method (required).
    repository_url: https://github.com/organisation/repository

  # Component-specific parameters (optional)
  # arguments:
  #   - type: file
  #     name: "--config"
  #     default: "config_train.yaml"

  # Resources required to run the component
  resources:
    # The script of your component (required)
    - type: python_script
      path: script.py
    - path: models.py
    - path: utils.py
    - path: sample_submission.csv

platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4
    setup:
      - type: python
        packages:
          - anndata
          - torch==2.1.0  # pip's package name for PyTorch is "torch"
          - torchvision==0.16.0
          - torchaudio==2.1.0
          - fastparquet
          - pyarrow
          - pandas~=2.0.3
          - scikit-learn~=1.0.1
          - tqdm~=4.66.1
          - numpy~=1.23
          - matplotlib~=3.5.0
          - PyYAML~=6.0.1
          - lion-pytorch
  - type: nextflow
    directives:
      label: [midtime, lowmem, midcpu]
10 changes: 10 additions & 0 deletions src/task/methods/lb2/config_test.yaml
@@ -0,0 +1,10 @@
n_components_list: # list of target dimensions
- 18211
d_models_list:
- 128
batch_size: 5
data_file: 'de_train.parquet'
id_map_file: 'id_map.csv'
device: cuda
seed: null
models_dir: 'trained_models'
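Given that PyYAML is pinned in the component's dependencies and the README passes `--config`, the test script plausibly consumes this file along these lines (a sketch under those assumptions, not the repo's actual code):

```python
# Hypothetical config loading for predict.py: parse --config, load the YAML,
# and read the keys defined above.
import argparse
import yaml

parser = argparse.ArgumentParser()
parser.add_argument("--config", default="config_test.yaml")
args = parser.parse_args()

with open(args.config) as f:
    cfg = yaml.safe_load(f)

device = cfg["device"]            # "cuda"
batch_size = cfg["batch_size"]    # 5
for n_components, d_model in zip(cfg["n_components_list"], cfg["d_models_list"]):
    print(n_components, d_model)  # 18211 128
```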
13 changes: 13 additions & 0 deletions src/task/methods/lb2/config_train.yaml
@@ -0,0 +1,13 @@
n_components_list: # list of target dimensions
- 18211
d_models_list:
- 128
batch_size: 32
sampling_strategy: k-means # Choose either 'k-means' or 'random'
data_file: "/gpfs/gibbs/pi/zhao/tl688/single_cell_pb/de_train.parquet"
id_map_file: "/gpfs/gibbs/pi/zhao/tl688/single_cell_pb/id_map.csv"
validation_percentage: 0.1
device: cuda
seed: null
num_epochs: 1 # 20000
early_stopping: 5000
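`sampling_strategy: k-means` suggests validation rows are chosen by clustering rather than uniformly at random. A sketch of one plausible interpretation, using scikit-learn (an assumption about the approach, not taken from the author's implementation):

```python
# Hypothetical k-means-based validation split: cluster the rows, then hold out
# the sample nearest each centroid so validation covers the feature space evenly.
import numpy as np
from sklearn.cluster import KMeans

def kmeans_validation_indices(X: np.ndarray, validation_percentage: float, seed=None):
    n_val = max(1, int(len(X) * validation_percentage))
    km = KMeans(n_clusters=n_val, n_init=10, random_state=seed).fit(X)
    # index of the row closest to each cluster centre
    idx = [int(np.argmin(np.linalg.norm(X - c, axis=1))) for c in km.cluster_centers_]
    return np.unique(idx)

X = np.random.rand(100, 8)
print(kmeans_validation_indices(X, validation_percentage=0.1, seed=0))
```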