diff --git a/scripts/run_benchmark_tw_traens.sh b/scripts/run_benchmark_tw_traens.sh new file mode 100755 index 00000000..908a150f --- /dev/null +++ b/scripts/run_benchmark_tw_traens.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# todo: remove this before merging PR #64 + +RUN_ID="traens_$(date +%Y-%m-%d_%H-%M-%S)" +resources_dir="s3://openproblems-bio/public/neurips-2023-competition/workflow-resources" +publish_dir="s3://openproblems-data/resources/dge_perturbation_prediction/results/${RUN_ID}" + +cat > /tmp/params.yaml << HERE +param_list: + - id: neurips-2023-data + de_train_h5ad: "$resources_dir/neurips-2023-data/de_train.h5ad" + de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad" + id_map: "$resources_dir/neurips-2023-data/id_map.csv" + layer: clipped_sign_log10_pval +method_ids: [transformer_ensemble] +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/task-dge-perturbation-prediction.git \ + --revision fix_trafo_ens_build \ + --pull-latest \ + --main-script target/nextflow/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --config src/common/nextflow_helpers/labels_tw.config \ No newline at end of file diff --git a/src/task/methods/transformer_ensemble/config.vsh.yaml b/src/task/methods/transformer_ensemble/config.vsh.yaml index ed72ac68..057f06fb 100644 --- a/src/task/methods/transformer_ensemble/config.vsh.yaml +++ b/src/task/methods/transformer_ensemble/config.vsh.yaml @@ -26,13 +26,24 @@ functionality: description: "Number of training epochs." info: test_default: 10 + - name: --d_model + type: integer + default: 128 + description: "Dimensionality of the model." + - name: --batch_size + type: integer + default: 32 + description: "Batch size." + - name: --early_stopping + type: integer + default: 5000 + description: "Number of epochs to wait for early stopping." resources: - type: python_script path: script.py - path: models.py - path: utils.py - path: train.py - - path: ../../utils/anndata_to_dataframe.py platforms: - type: docker image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 diff --git a/src/task/methods/transformer_ensemble/script.py b/src/task/methods/transformer_ensemble/script.py index 5f48d5e7..79764953 100644 --- a/src/task/methods/transformer_ensemble/script.py +++ b/src/task/methods/transformer_ensemble/script.py @@ -2,18 +2,21 @@ import anndata as ad import sys import torch -import copy +import os device = torch.device("cuda" if torch.cuda.is_available() else "cpu") ## VIASH START par = { - "de_train_h5ad": "resources/neurips-2023-data/de_train.h5ad", - "de_train": "resources/neurips-2023-data/de_train.h5ad", - "id_map": "resources/neurips-2023-data/id_map.csv", - "output": "output.h5ad", + "de_train_h5ad": "resources/neurips-2023-kaggle/de_train.h5ad", + "id_map": "resources/neurips-2023-kaggle/id_map.csv", + "output": "output/prediction.h5ad", + "output_model": "output/model/", "num_train_epochs": 10, - "layer": "clipped_sign_log10_pval" + "early_stopping": 5000, + "batch_size": 64, + "d_model": 128, + "layer": "sign_log10_pval" } meta = { "resources_dir": "src/task/methods/transformer_ensemble", @@ -22,29 +25,32 @@ sys.path.append(meta["resources_dir"]) -# Fixed training params -d_model = 128 -batch_size = 32 -early_stopping = 5000 - -from anndata_to_dataframe import anndata_to_dataframe from utils import prepare_augmented_data, prepare_augmented_data_mean_only from train import train_k_means_strategy, train_non_k_means_strategy +# create output model directory if need be +if par["output_model"]: + os.makedirs(par["output_model"], exist_ok=True) + # read data de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"]) -de_train = anndata_to_dataframe(de_train_h5ad, par["layer"]) id_map = pd.read_csv(par["id_map"]) +# convert .obs categoricals to string for ease of use +for col in de_train_h5ad.obs.select_dtypes(include=["category"]).columns: + de_train_h5ad.obs[col] = de_train_h5ad.obs[col].astype(str) +# reset index +de_train_h5ad.obs.reset_index(drop=True, inplace=True) + # determine other variables gene_names = list(de_train_h5ad.var_names) n_components = len(gene_names) # train and predict models +# note, the weights intentionally don't add up to one argsets = [ # Note by author - weight_df1: 0.5 (utilizing std, mean, and clustering sampling, yielding 0.551) { - "name": "weight_df1", "mean_std": "mean_std", "uncommon": False, "sampling_strategy": "random", @@ -52,7 +58,6 @@ }, # Note by author - weight_df2: 0.25 (excluding uncommon elements, resulting in 0.559) { - "name": "weight_df2", "mean_std": "mean_std", "uncommon": True, "sampling_strategy": "random", @@ -60,7 +65,6 @@ }, # Note by author - weight_df3: 0.25 (leveraging clustering sampling, achieving 0.575) { - "name": "weight_df3", "mean_std": "mean_std", "uncommon": False, # should this be set to False or True? "sampling_strategy": "k-means", @@ -68,9 +72,8 @@ }, # Note by author - weight_df4: 0.3 (incorporating mean, random sampling, and excluding std, attaining 0.554) { - "name": "weight_df4", "mean_std": "mean", - "uncommon": False, # should this be set to False or True? + "uncommon": False, "sampling_strategy": "random", "weight": 0.3, } @@ -80,19 +83,22 @@ predictions = [] print(f"Train and predict models", flush=True) -for argset in argsets: - print(f"Train and predict model {argset['name']}", flush=True) +for i, argset in enumerate(argsets): + print(f"Train and predict model {i+1}/{len(argsets)}", flush=True) print(f"> Prepare augmented data", flush=True) if argset["mean_std"] == "mean_std": one_hot_encode_features, targets, one_hot_test = prepare_augmented_data( - de_train=copy.deepcopy(de_train), - id_map=copy.deepcopy(id_map), + de_train_h5ad=de_train_h5ad, + id_map=id_map, + layer=par["layer"], uncommon=argset["uncommon"], ) elif argset["mean_std"] == "mean": - one_hot_encode_features, targets, one_hot_test = ( - prepare_augmented_data_mean_only(de_train=de_train, id_map=id_map) + one_hot_encode_features, targets, one_hot_test = prepare_augmented_data_mean_only( + de_train_h5ad=de_train_h5ad, + id_map=id_map, + layer=par["layer"], ) else: raise ValueError("Invalid mean_std argument") @@ -101,24 +107,24 @@ if argset["sampling_strategy"] == "k-means": label_reducer, scaler, transformer_model = train_k_means_strategy( n_components=n_components, - d_model=d_model, + d_model=par["d_model"], one_hot_encode_features=one_hot_encode_features, targets=targets, num_epochs=par["num_train_epochs"], - early_stopping=early_stopping, - batch_size=batch_size, + early_stopping=par["early_stopping"], + batch_size=par["batch_size"], device=device, mean_std=argset["mean_std"], ) elif argset["sampling_strategy"] == "random": label_reducer, scaler, transformer_model = train_non_k_means_strategy( n_components=n_components, - d_model=d_model, + d_model=par["d_model"], one_hot_encode_features=one_hot_encode_features, targets=targets, num_epochs=par["num_train_epochs"], - early_stopping=early_stopping, - batch_size=batch_size, + early_stopping=par["early_stopping"], + batch_size=par["batch_size"], device=device, mean_std=argset["mean_std"], ) @@ -138,8 +144,8 @@ print(f"Predict on test data", flush=True) num_samples = len(unseen_data) transformed_data = [] - for i in range(0, num_samples, batch_size): - batch_result = transformer_model(unseen_data[i : i + batch_size]) + for i in range(0, num_samples, par["batch_size"]): + batch_result = transformer_model(unseen_data[i : i + par["batch_size"]]) transformed_data.append(batch_result) transformed_data = torch.vstack(transformed_data) if scaler: @@ -150,13 +156,20 @@ ).to(device) pred = transformed_data.cpu().detach().numpy() + + if par["output_model"]: + model_path = f"{par['output_model']}/model_{i}.pt" + torch.save(transformer_model.state_dict(), model_path) + pred_path = f"{par['output_model']}/pred_{i}.csv" + pd.DataFrame(pred).to_csv(pred_path, index=False) + predictions.append(pred) print(f"Combine predictions", flush=True) # compute weighted sum -sum_weights = sum([argset["weight"] for argset in argsets]) +# note, the weights intentionally don't add up to one weighted_pred = sum([ - pred * argset["weight"] / sum_weights + pred * argset["weight"] for argset, pred in zip(argsets, predictions) ]) diff --git a/src/task/methods/transformer_ensemble/utils.py b/src/task/methods/transformer_ensemble/utils.py index d5beea93..5b4a7a0b 100644 --- a/src/task/methods/transformer_ensemble/utils.py +++ b/src/task/methods/transformer_ensemble/utils.py @@ -22,29 +22,33 @@ def reduce_labels(Y, n_components): def prepare_augmented_data( - de_train, + de_train_h5ad, id_map, + layer, uncommon=False ): - de_train = de_train.drop(columns = ['split']) xlist = ['cell_type', 'sm_name'] - _ylist = ['cell_type', 'sm_name', 'sm_lincs_id', 'SMILES', 'control'] - y = de_train.drop(columns=_ylist) + y = pd.DataFrame( + de_train_h5ad.layers[layer], + columns=de_train_h5ad.var_names, + index=de_train_h5ad.obs.index + ) # Combine train and test data for one-hot encoding - combined_data = pd.concat([de_train[xlist], id_map[xlist]]) + combined_data = pd.concat([de_train_h5ad.obs[xlist], id_map[xlist]]) dum_data = pd.get_dummies(combined_data, columns=xlist) # Split the combined data back into train and test - train = dum_data.iloc[:len(de_train)] - test = dum_data.iloc[len(de_train):] + train = dum_data.iloc[:de_train_h5ad.n_obs] + test = dum_data.iloc[de_train_h5ad.n_obs:] if uncommon: uncommon = [f for f in train if f not in test] X = train.drop(columns=uncommon) X = train - de_cell_type = de_train.iloc[:, [0] + list(range(5, de_train.shape[1]))] - de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))] + + de_cell_type = pd.concat([de_train_h5ad.obs[['cell_type']], y], axis=1) + de_sm_name = pd.concat([de_train_h5ad.obs[['sm_name']], y], axis=1) mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index() std_cell_type = de_cell_type.groupby('cell_type').std().reset_index().fillna(0) @@ -107,29 +111,30 @@ def prepare_augmented_data( def prepare_augmented_data_mean_only( - de_train, + de_train_h5ad, + layer, id_map ): - de_train = de_train.drop(columns = ['split']) xlist = ['cell_type', 'sm_name'] - _ylist = ['cell_type', 'sm_name', 'sm_lincs_id', 'SMILES', 'control'] - y = de_train.drop(columns=_ylist) - # train = pd.get_dummies(de_train[xlist], columns=xlist) - # test = pd.get_dummies(id_map[xlist], columns=xlist) + y = pd.DataFrame( + de_train_h5ad.layers[layer], + columns=de_train_h5ad.var_names, + index=de_train_h5ad.obs.index + ) # Combine train and test data for one-hot encoding - combined_data = pd.concat([de_train[xlist], id_map[xlist]]) + combined_data = pd.concat([de_train_h5ad.obs[xlist], id_map[xlist]]) dum_data = pd.get_dummies(combined_data, columns=xlist) # Split the combined data back into train and test - train = dum_data.iloc[:len(de_train)] - test = dum_data.iloc[len(de_train):] + train = dum_data.iloc[:de_train_h5ad.n_obs] + test = dum_data.iloc[de_train_h5ad.n_obs:] # uncommon = [f for f in train if f not in test] # X = train.drop(columns=uncommon) X = train - de_cell_type = de_train.iloc[:, [0] + list(range(5, de_train.shape[1]))] - de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))] + de_cell_type = pd.concat([de_train_h5ad.obs[['cell_type']], y], axis=1) + de_sm_name = pd.concat([de_train_h5ad.obs[['sm_name']], y], axis=1) mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index() mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index() rows = []