update runnable code
HelloWorldLTY committed Apr 28, 2024
1 parent 75e84b7 commit 3e43c11
Showing 18 changed files with 3,794 additions and 23 deletions.
95 changes: 95 additions & 0 deletions models.py
@@ -0,0 +1,95 @@
# Model Architecture
import torch
import torch.nn as nn


class CustomTransformer(nn.Module):
    def __init__(self, num_features, num_labels, d_model=128, num_heads=8, num_layers=6):
        super(CustomTransformer, self).__init__()
        # Dense embedding of the input features
        self.embedding = nn.Linear(num_features, d_model)
        # Alternative embedding layer for sparse features:
        # self.embedding = nn.Embedding(num_features, d_model)

        # self.norm = nn.BatchNorm1d(d_model, affine=True)
        self.norm = nn.LayerNorm(d_model)
        # Alternative: full encoder-decoder transformer
        # self.transformer = nn.Transformer(d_model=d_model, nhead=num_heads,
        #                                   num_encoder_layers=num_layers, dropout=0.1)
        # Note: avoid hard-coding device='cuda' in the layer; move the whole
        # module to a device with .to(device) so CPU-only runs still work.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=0.3,
                                       activation=nn.GELU(), batch_first=True),
            enable_nested_tensor=True, num_layers=num_layers
        )
        # Dropout layer for regularization
        # self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(d_model, num_labels)

    def forward(self, x):
        x = self.embedding(x)
        # x = self.transformer(x, x)  # for the nn.Transformer variant
        x = self.transformer(x)
        x = self.norm(x)
        # x = self.fc(self.dropout(x))
        x = self.fc(x)
        return x


class CustomTransformer_v3(nn.Module):  # embeds sparse features and target encodings (mean + std) separately
    def __init__(self, num_features, num_labels, d_model=128, num_heads=8, num_layers=6, dropout=0.3):
        super(CustomTransformer_v3, self).__init__()
        # The trailing 18211 * 4 input columns are target-encoding statistics
        self.num_target_encodings = 18211 * 4
        self.num_sparse_features = num_features - self.num_target_encodings

        self.sparse_feature_embedding = nn.Linear(self.num_sparse_features, d_model)
        self.target_encoding_embedding = nn.Linear(self.num_target_encodings, d_model)
        self.norm = nn.LayerNorm(d_model)

        self.concatenation_layer = nn.Linear(2 * d_model, d_model)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout,
                                       activation=nn.GELU(), batch_first=True),
            num_layers=num_layers
        )
        self.fc = nn.Linear(d_model, num_labels)

    def forward(self, x):
        # Split the input into sparse features and target encodings
        sparse_features = x[:, :self.num_sparse_features]
        target_encodings = x[:, self.num_sparse_features:]

        # Embed each part, concatenate, and project back down to d_model
        sparse_features = self.sparse_feature_embedding(sparse_features)
        target_encodings = self.target_encoding_embedding(target_encodings)

        combined_features = torch.cat((sparse_features, target_encodings), dim=1)
        combined_features = self.concatenation_layer(combined_features)
        combined_features = self.norm(combined_features)

        x = self.transformer(combined_features)
        x = self.norm(x)

        x = self.fc(x)
        return x


class CustomMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=6, dropout=0.3, layer_norm=True):
        super(CustomMLP, self).__init__()
        layers = []

        # Stack num_layers blocks of [LayerNorm ->] Linear -> ReLU [-> Dropout]
        for _ in range(num_layers):
            if layer_norm:
                layers.append(nn.LayerNorm(input_dim))
            layers.append(nn.Linear(input_dim, hidden_dim))
            layers.append(nn.ReLU())
            if dropout > 0:
                layers.append(nn.Dropout(p=dropout))
            input_dim = hidden_dim

        self.model = nn.Sequential(*layers)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.model(x)
        x = self.fc(x)
        return x
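A minimal smoke test for these modules, with illustrative sizes assumed here (256 input features, 18,211 targets, batch of 5); note that with 2-D input the batch-first encoder treats the batch as a single unbatched sequence, so rows attend to one another:

```python
# Smoke test (sizes are illustrative assumptions, not fixed by models.py)
import torch
from models import CustomTransformer, CustomMLP

num_features, num_labels = 256, 18211
model = CustomTransformer(num_features, num_labels).eval()
x = torch.randn(5, num_features)      # 5 feature vectors
with torch.no_grad():
    y = model(x)                      # 2-D input -> unbatched sequence of length 5
print(y.shape)                        # torch.Size([5, 18211])

mlp = CustomMLP(input_dim=num_features, hidden_dim=512, output_dim=num_labels).eval()
with torch.no_grad():
    print(mlp(x).shape)               # torch.Size([5, 18211])
```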

47 changes: 24 additions & 23 deletions scripts/add_a_method.sh
@@ -5,38 +5,39 @@
# exit 1

# # sync resources
# scripts/download_resources.sh
scripts/download_resources.sh

# create a new component
method_id="lb2"
method_lang="python" # change this to "r" if need be

# viash run src/common/create_component/config.vsh.yaml -- \
#   --language "$method_lang" \
#   --name "$method_id"
viash run src/common/create_component/config.vsh.yaml -- \
  --language "$method_lang" \
  --name "$method_id"

# TODO: fill in required fields in src/task/methods/foo/config.vsh.yaml
# TODO: edit src/task/methods/foo/script.py/R

# test the component
viash test src/task/methods/$method_id/config.vsh.yaml

# # rebuild the container (only if you change something to the docker platform)
# # You can reduce the memory and cpu allotted to jobs in _viash.yaml by modifying .platforms[.type == "nextflow"].config.labels
# viash run src/task/methods/$method_id/config.vsh.yaml -- \
#   ---setup cachedbuild ---verbose

# # run the method
# viash run src/task/methods/$method_id/config.vsh.yaml -- \
#   --de_train "resources/neurips-2023-kaggle/de_train.parquet" \
#   --id_map "resources/neurips-2023-kaggle/id_map.csv" \
#   --output "output/prediction.parquet"

# # run evaluation metric
# viash run src/task/metrics/mean_rowwise_rmse/config.vsh.yaml -- \
#   --de_test "resources/neurips-2023-kaggle/de_test.parquet" \
#   --prediction "output/prediction.parquet" \
#   --output "output/score.h5ad"

# # print score on kaggle test dataset
# python -c 'import anndata; print(anndata.read_h5ad("output/score.h5ad").uns)'
# rebuild the container (only if you change something to the docker platform)
# You can reduce the memory and cpu allotted to jobs in _viash.yaml by modifying .platforms[.type == "nextflow"].config.labels
viash run src/task/methods/$method_id/config.vsh.yaml -- \
  ---setup cachedbuild ---verbose

# run the method
viash run src/task/methods/$method_id/config.vsh.yaml -- \
  --de_train "resources/neurips-2023-kaggle/de_train.parquet" \
  --de_test "resources/neurips-2023-kaggle/de_test.parquet" \
  --id_map "resources/neurips-2023-kaggle/id_map.csv" \
  --output "output/prediction.parquet"

# run evaluation metric
viash run src/task/metrics/mean_rowwise_rmse/config.vsh.yaml -- \
  --de_test "resources/neurips-2023-kaggle/de_test.parquet" \
  --prediction "output/prediction.parquet" \
  --output "output/score.h5ad"

# print score on kaggle test dataset
python -c 'import anndata; print(anndata.read_h5ad("output/score.h5ad").uns)'
41 changes: 41 additions & 0 deletions src/common/sync_test_resources/config.vsh.yaml
@@ -0,0 +1,41 @@
functionality:
  name: "sync_test_resources"
  namespace: "common"
  version: "dev"
  description: Synchronise the test resources from S3 to resources_test
  usage: |
    sync_test_resources
    sync_test_resources --input s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/ --output resources
  arguments:
    - name: "--input"
      alternatives: ["-i"]
      type: string
      description: "Path to the S3 bucket to sync from."
      default: "s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/"
    - name: "--output"
      alternatives: ["-o"]
      type: file
      default: resources
      direction: output
      description: "Path to the test resource directory."
    - name: "--quiet"
      type: boolean_true
      description: "Does not display the operations performed by the specified command."
    - name: "--dryrun"
      type: boolean_true
      description: "Displays the operations that would be performed using the specified command without actually running them."
    - name: "--delete"
      type: boolean_true
      description: "Files that exist in the destination but not in the source are deleted during sync."
    - name: "--exclude"
      type: "string"
      multiple: true
      description: Exclude all files or objects from the command that match the specified pattern.
  resources:
    - type: bash_script
      path: script.sh
platforms:
  - type: docker
    image: "amazon/aws-cli:2.7.12"
  - type: native
  - type: nextflow
34 changes: 34 additions & 0 deletions src/common/sync_test_resources/script.sh
@@ -0,0 +1,34 @@
#!/bin/bash

## VIASH START
par_input='s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/'
par_output='resources_test'
## VIASH END

extra_params=( )

if [ "$par_quiet" == "true" ]; then
extra_params+=( "--quiet" )
fi
if [ "$par_dryrun" == "true" ]; then
extra_params+=( "--dryrun" )
fi
if [ "$par_delete" == "true" ]; then
extra_params+=( "--delete" )
fi

if [ ! -z ${par_exclude+x} ]; then
IFS=":"
for var in $par_exclude; do
unset IFS
extra_params+=( "--exclude" "$var" )
done
fi


# Disable the use of the Amazon EC2 instance metadata service (IMDS).
# see https://florian.ec/blog/github-actions-awscli-errors/
# or https://github.com/aws/aws-cli/issues/5234#issuecomment-705831465
export AWS_EC2_METADATA_DISABLED=true

aws s3 sync "$par_input" "$par_output" --no-sign-request "${extra_params[@]}"
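A hedged usage sketch for this component (assuming viash's default behavior of joining a `multiple: true` argument's repeated values with `:`, which matches the `IFS=":"` parsing above; the exclude patterns are examples only):

```bash
# Hypothetical invocation: dry-run sync that skips h5ad files and a temp folder
viash run src/common/sync_test_resources/config.vsh.yaml -- \
  --output resources_test \
  --exclude "*.h5ad" --exclude "temp/*" \
  --dryrun
```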
39 changes: 39 additions & 0 deletions src/task/methods/lb2/README.md
@@ -0,0 +1,39 @@
# Open Problems – Single-Cell Perturbations 2nd Place Solution

Solution to [Kaggle's Open Problems – Single-Cell Perturbations competition](https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/overview).

### Setup
CUDA version: cu121; Python: 3.8.
```bash
pip install -r requirements.txt
```


To train the models, modify `config_train.yaml` and run:

```bash
python train.py --config config_train.yaml
```


To run the models, modify `config_test.yaml` and run:

```bash
python predict.py --config config_test.yaml
```

Make sure you have a file named `sample_submission.csv` with the same layout
as the resulting prediction dataframe but filled with zeros; a sketch for creating one follows.
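A hedged sketch for generating that file (the metadata column names and the use of `id_map.csv` are assumptions based on the competition data, not on this repo's code):

```python
# Hypothetical sketch: build a zero-filled sample_submission.csv.
# The "id" column and the metadata column set are assumptions; adapt as needed.
import pandas as pd

id_map = pd.read_csv("id_map.csv")
de_train = pd.read_parquet("de_train.parquet")

# assume every non-metadata column in de_train is a target gene
meta_cols = {"cell_type", "sm_name", "sm_lincs_id", "SMILES", "control"}
gene_cols = [c for c in de_train.columns if c not in meta_cols]

submission = pd.DataFrame(0.0, index=id_map["id"], columns=gene_cols)
submission.to_csv("sample_submission.csv", index_label="id")
```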

### Hardware
- GPU: 1x Nvidia RTX 3080 (mobile)
- CPU: 11th Gen Intel(R) Core(TM) i7-11800H
- OS: Windows 11


## Sources
1. [Single Cell Perturbations Kaggle 2nd place solution](https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458738)
2. [Lion PyTorch by lucidrains](https://github.com/lucidrains/lion-pytorch)
3. [Understanding the Competition - Open Problems](https://www.kaggle.com/code/ayushs9020/understanding-the-competition-open-problems)
4. [OP2 EDA Baseline by alexandervc](https://www.kaggle.com/code/alexandervc/op2-eda-baseline-s)

68 changes: 68 additions & 0 deletions src/task/methods/lb2/config.vsh.yaml
@@ -0,0 +1,68 @@
# The API specifies which type of component this is.
# It contains specifications for:
# - The input/output files
# - Common parameters
# - A unit test
__merge__: ../../api/comp_method.yaml

functionality:
  # A unique identifier for your component (required).
  # Can contain only lowercase letters or underscores.
  name: lb2

  # Metadata for your component
  info:
    # A relatively short label, used when rendering visualisations (required)
    label: lb2
    # A one-sentence summary of how this method works (required). Used when
    # rendering summary tables.
    summary: "FILL IN: A one sentence summary of this method."
    # A multi-line description of how this component works (required). Used
    # when rendering reference documentation.
    description: |
      FILL IN: A (multi-line) description of how this method works.
    # Which normalisation method this component prefers to use (required).
    preferred_normalization: log_cp10k
    # A reference key from the bibtex library at src/common/library.bib (required).
    reference: bibtex_reference_key
    # URL to the documentation for this method (required).
    documentation_url: https://url.to/the/documentation
    # URL to the code repository for this method (required).
    repository_url: https://github.com/organisation/repository

  # Component-specific parameters (optional)
  # arguments:
  #   - type: file
  #     name: "--config"
  #     default: "config_train.yaml"

  # Resources required to run the component
  resources:
    # The script of your component (required)
    - type: python_script
      path: script.py
    - path: models.py
    - path: utils.py
    - path: sample_submission.csv

platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4
    setup:
      - type: python
        packages:
          - anndata
          - torch==2.1.0  # pip's package name for PyTorch is "torch"
          - torchvision==0.16.0
          - torchaudio==2.1.0
          - fastparquet
          - pyarrow
          - pandas~=2.0.3
          - scikit-learn~=1.0.1
          - tqdm~=4.66.1
          - numpy~=1.23
          - matplotlib~=3.5.0
          - PyYAML~=6.0.1
          - lion-pytorch
  - type: nextflow
    directives:
      label: [midtime, lowmem, midcpu]
10 changes: 10 additions & 0 deletions src/task/methods/lb2/config_test.yaml
@@ -0,0 +1,10 @@
n_components_list: # list of target dimensions
- 18211
d_models_list:
- 128
batch_size: 5
data_file: 'de_train.parquet'
id_map_file: 'id_map.csv'
device: cuda
seed: null
models_dir: 'trained_models'
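Given that PyYAML is pinned in the component's dependencies and the README passes `--config`, the test script plausibly consumes this file along these lines (a sketch under those assumptions, not the repo's actual code):

```python
# Hypothetical config loading for predict.py: parse --config, load the YAML,
# and read the keys defined above.
import argparse
import yaml

parser = argparse.ArgumentParser()
parser.add_argument("--config", default="config_test.yaml")
args = parser.parse_args()

with open(args.config) as f:
    cfg = yaml.safe_load(f)

device = cfg["device"]            # "cuda"
batch_size = cfg["batch_size"]    # 5
for n_components, d_model in zip(cfg["n_components_list"], cfg["d_models_list"]):
    print(n_components, d_model)  # 18211 128
```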
13 changes: 13 additions & 0 deletions src/task/methods/lb2/config_train.yaml
@@ -0,0 +1,13 @@
n_components_list: # list of target dimensions
- 18211
d_models_list:
- 128
batch_size: 32
sampling_strategy: k-means # Choose either 'k-means' or 'random'
data_file: "/gpfs/gibbs/pi/zhao/tl688/single_cell_pb/de_train.parquet"
id_map_file: "/gpfs/gibbs/pi/zhao/tl688/single_cell_pb/id_map.csv"
validation_percentage: 0.1
device: cuda
seed: null
num_epochs: 1 # 20000
early_stopping: 5000
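`sampling_strategy: k-means` suggests validation rows are chosen by clustering rather than uniformly at random. A sketch of one plausible interpretation, using scikit-learn (an assumption about the approach, not taken from the author's implementation):

```python
# Hypothetical k-means-based validation split: cluster the rows, then hold out
# the sample nearest each centroid so validation covers the feature space evenly.
import numpy as np
from sklearn.cluster import KMeans

def kmeans_validation_indices(X: np.ndarray, validation_percentage: float, seed=None):
    n_val = max(1, int(len(X) * validation_percentage))
    km = KMeans(n_clusters=n_val, n_init=10, random_state=seed).fit(X)
    # index of the row closest to each cluster centre
    idx = [int(np.argmin(np.linalg.norm(X - c, axis=1))) for c in km.cluster_centers_]
    return np.unique(idx)

X = np.random.rand(100, 8)
print(kmeans_validation_indices(X, validation_percentage=0.1, seed=0))
```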