diff --git a/scripts/run_benchmark_tw_traens.sh b/scripts/run_benchmark_tw_traens.sh
new file mode 100755
index 00000000..908a150f
--- /dev/null
+++ b/scripts/run_benchmark_tw_traens.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# todo: remove this before merging PR #64
+
+RUN_ID="traens_$(date +%Y-%m-%d_%H-%M-%S)"
+resources_dir="s3://openproblems-bio/public/neurips-2023-competition/workflow-resources"
+publish_dir="s3://openproblems-data/resources/dge_perturbation_prediction/results/${RUN_ID}"
+
+cat > /tmp/params.yaml << HERE
+param_list:
+  - id: neurips-2023-data
+    de_train_h5ad: "$resources_dir/neurips-2023-data/de_train.h5ad"
+    de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad"
+    id_map: "$resources_dir/neurips-2023-data/id_map.csv"
+    layer: clipped_sign_log10_pval
+method_ids: [transformer_ensemble]
+output_state: "state.yaml"
+publish_dir: "$publish_dir"
+HERE
+
+tw launch https://github.com/openproblems-bio/task-dge-perturbation-prediction.git \
+  --revision fix_trafo_ens_build \
+  --pull-latest \
+  --main-script target/nextflow/workflows/run_benchmark/main.nf \
+  --workspace 53907369739130 \
+  --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
+  --params-file /tmp/params.yaml \
+  --config src/common/nextflow_helpers/labels_tw.config
\ No newline at end of file
diff --git a/src/task/methods/transformer_ensemble/config.vsh.yaml b/src/task/methods/transformer_ensemble/config.vsh.yaml
index ed72ac68..057f06fb 100644
--- a/src/task/methods/transformer_ensemble/config.vsh.yaml
+++ b/src/task/methods/transformer_ensemble/config.vsh.yaml
@@ -26,13 +26,24 @@ functionality:
       description: "Number of training epochs."
       info:
         test_default: 10
+    - name: --d_model
+      type: integer
+      default: 128
+      description: "Dimensionality of the model."
+    - name: --batch_size
+      type: integer
+      default: 32
+      description: "Batch size."
+    - name: --early_stopping
+      type: integer
+      default: 5000
+      description: "Number of epochs to wait for early stopping."
   resources:
     - type: python_script
       path: script.py
     - path: models.py
     - path: utils.py
     - path: train.py
-    - path: ../../utils/anndata_to_dataframe.py
 platforms:
   - type: docker
     image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4
diff --git a/src/task/methods/transformer_ensemble/script.py b/src/task/methods/transformer_ensemble/script.py
index 5f48d5e7..79764953 100644
--- a/src/task/methods/transformer_ensemble/script.py
+++ b/src/task/methods/transformer_ensemble/script.py
@@ -2,18 +2,21 @@
 import anndata as ad
 import sys
 import torch
-import copy
+import os
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 ## VIASH START
 par = {
-    "de_train_h5ad": "resources/neurips-2023-data/de_train.h5ad",
-    "de_train": "resources/neurips-2023-data/de_train.h5ad",
-    "id_map": "resources/neurips-2023-data/id_map.csv",
-    "output": "output.h5ad",
+    "de_train_h5ad": "resources/neurips-2023-kaggle/de_train.h5ad",
+    "id_map": "resources/neurips-2023-kaggle/id_map.csv",
+    "output": "output/prediction.h5ad",
+    "output_model": "output/model/",
     "num_train_epochs": 10,
-    "layer": "clipped_sign_log10_pval"
+    "early_stopping": 5000,
+    "batch_size": 64,
+    "d_model": 128,
+    "layer": "sign_log10_pval"
 }
 meta = {
     "resources_dir": "src/task/methods/transformer_ensemble",
@@ -22,29 +25,32 @@
 
 sys.path.append(meta["resources_dir"])
 
-# Fixed training params
-d_model = 128
-batch_size = 32
-early_stopping = 5000
-
-from anndata_to_dataframe import anndata_to_dataframe
 from utils import prepare_augmented_data, prepare_augmented_data_mean_only
 from train import train_k_means_strategy, train_non_k_means_strategy
 
+# create output model directory if need be
+if par["output_model"]:
+    os.makedirs(par["output_model"], exist_ok=True)
+
 # read data
 de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"])
-de_train = anndata_to_dataframe(de_train_h5ad, par["layer"])
 id_map = pd.read_csv(par["id_map"])
 
+# convert .obs categoricals to string for ease of use
+for col in de_train_h5ad.obs.select_dtypes(include=["category"]).columns:
+    de_train_h5ad.obs[col] = de_train_h5ad.obs[col].astype(str)
+# reset index
+de_train_h5ad.obs.reset_index(drop=True, inplace=True)
+
 # determine other variables
 gene_names = list(de_train_h5ad.var_names)
 n_components = len(gene_names)
 
 # train and predict models
+# note, the weights intentionally don't add up to one
 argsets = [
     # Note by author - weight_df1: 0.5 (utilizing std, mean, and clustering sampling, yielding 0.551)
     {
-        "name": "weight_df1",
         "mean_std": "mean_std",
         "uncommon": False,
         "sampling_strategy": "random",
@@ -52,7 +58,6 @@
     },
     # Note by author - weight_df2: 0.25 (excluding uncommon elements, resulting in 0.559)
     {
-        "name": "weight_df2",
         "mean_std": "mean_std",
         "uncommon": True,
         "sampling_strategy": "random",
@@ -60,7 +65,6 @@
     },
     # Note by author - weight_df3: 0.25 (leveraging clustering sampling, achieving 0.575)
     {
-        "name": "weight_df3",
         "mean_std": "mean_std",
         "uncommon": False, # should this be set to False or True?
         "sampling_strategy": "k-means",
@@ -68,9 +72,8 @@
     },
     # Note by author - weight_df4: 0.3 (incorporating mean, random sampling, and excluding std, attaining 0.554)
     {
-        "name": "weight_df4",
         "mean_std": "mean",
-        "uncommon": False, # should this be set to False or True?
+        "uncommon": False,
         "sampling_strategy": "random",
         "weight": 0.3,
     }
@@ -80,19 +83,22 @@
 predictions = []
 
 print(f"Train and predict models", flush=True)
-for argset in argsets:
-    print(f"Train and predict model {argset['name']}", flush=True)
+for i, argset in enumerate(argsets):
+    print(f"Train and predict model {i+1}/{len(argsets)}", flush=True)
 
     print(f"> Prepare augmented data", flush=True)
     if argset["mean_std"] == "mean_std":
         one_hot_encode_features, targets, one_hot_test = prepare_augmented_data(
-            de_train=copy.deepcopy(de_train),
-            id_map=copy.deepcopy(id_map),
+            de_train_h5ad=de_train_h5ad,
+            id_map=id_map,
+            layer=par["layer"],
             uncommon=argset["uncommon"],
         )
     elif argset["mean_std"] == "mean":
-        one_hot_encode_features, targets, one_hot_test = (
-            prepare_augmented_data_mean_only(de_train=de_train, id_map=id_map)
+        one_hot_encode_features, targets, one_hot_test = prepare_augmented_data_mean_only(
+            de_train_h5ad=de_train_h5ad,
+            id_map=id_map,
+            layer=par["layer"],
         )
     else:
         raise ValueError("Invalid mean_std argument")
@@ -101,24 +107,24 @@
     if argset["sampling_strategy"] == "k-means":
         label_reducer, scaler, transformer_model = train_k_means_strategy(
             n_components=n_components,
-            d_model=d_model,
+            d_model=par["d_model"],
             one_hot_encode_features=one_hot_encode_features,
             targets=targets,
             num_epochs=par["num_train_epochs"],
-            early_stopping=early_stopping,
-            batch_size=batch_size,
+            early_stopping=par["early_stopping"],
+            batch_size=par["batch_size"],
             device=device,
             mean_std=argset["mean_std"],
         )
     elif argset["sampling_strategy"] == "random":
         label_reducer, scaler, transformer_model = train_non_k_means_strategy(
             n_components=n_components,
-            d_model=d_model,
+            d_model=par["d_model"],
             one_hot_encode_features=one_hot_encode_features,
             targets=targets,
             num_epochs=par["num_train_epochs"],
-            early_stopping=early_stopping,
-            batch_size=batch_size,
+            early_stopping=par["early_stopping"],
+            batch_size=par["batch_size"],
             device=device,
             mean_std=argset["mean_std"],
         )
@@ -138,8 +144,8 @@
     print(f"Predict on test data", flush=True)
     num_samples = len(unseen_data)
     transformed_data = []
-    for i in range(0, num_samples, batch_size):
-        batch_result = transformer_model(unseen_data[i : i + batch_size])
+    for i in range(0, num_samples, par["batch_size"]):
+        batch_result = transformer_model(unseen_data[i : i + par["batch_size"]])
         transformed_data.append(batch_result)
     transformed_data = torch.vstack(transformed_data)
     if scaler:
@@ -150,13 +156,20 @@
         ).to(device)
 
     pred = transformed_data.cpu().detach().numpy()
+
+    if par["output_model"]:
+        model_path = f"{par['output_model']}/model_{i}.pt"
+        torch.save(transformer_model.state_dict(), model_path)
+        pred_path = f"{par['output_model']}/pred_{i}.csv"
+        pd.DataFrame(pred).to_csv(pred_path, index=False)
+
     predictions.append(pred)
 
 print(f"Combine predictions", flush=True)
 # compute weighted sum
-sum_weights = sum([argset["weight"] for argset in argsets])
+# note, the weights intentionally don't add up to one
 weighted_pred = sum([
-    pred * argset["weight"] / sum_weights
+    pred * argset["weight"]
     for argset, pred in zip(argsets, predictions)
 ])
 
diff --git a/src/task/methods/transformer_ensemble/utils.py b/src/task/methods/transformer_ensemble/utils.py
index d5beea93..5b4a7a0b 100644
--- a/src/task/methods/transformer_ensemble/utils.py
+++ b/src/task/methods/transformer_ensemble/utils.py
@@ -22,29 +22,33 @@ def reduce_labels(Y, n_components):
 
 
 def prepare_augmented_data(
-        de_train,
+        de_train_h5ad,
         id_map,
+        layer,
         uncommon=False
     ):
-    de_train = de_train.drop(columns = ['split'])
     xlist = ['cell_type', 'sm_name']
-    _ylist = ['cell_type', 'sm_name', 'sm_lincs_id', 'SMILES', 'control']
-    y = de_train.drop(columns=_ylist)
+    y = pd.DataFrame(
+        de_train_h5ad.layers[layer],
+        columns=de_train_h5ad.var_names,
+        index=de_train_h5ad.obs.index
+    )
 
     # Combine train and test data for one-hot encoding
-    combined_data = pd.concat([de_train[xlist], id_map[xlist]])
+    combined_data = pd.concat([de_train_h5ad.obs[xlist], id_map[xlist]])
 
     dum_data = pd.get_dummies(combined_data, columns=xlist)
 
     # Split the combined data back into train and test
-    train = dum_data.iloc[:len(de_train)]
-    test = dum_data.iloc[len(de_train):]
+    train = dum_data.iloc[:de_train_h5ad.n_obs]
+    test = dum_data.iloc[de_train_h5ad.n_obs:]
     if uncommon:
         uncommon = [f for f in train if f not in test]
         X = train.drop(columns=uncommon)
     X = train
-    de_cell_type = de_train.iloc[:, [0] + list(range(5, de_train.shape[1]))]
-    de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))]
+    
+    de_cell_type = pd.concat([de_train_h5ad.obs[['cell_type']], y], axis=1)
+    de_sm_name = pd.concat([de_train_h5ad.obs[['sm_name']], y], axis=1)
 
     mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index()
     std_cell_type = de_cell_type.groupby('cell_type').std().reset_index().fillna(0)
@@ -107,29 +111,30 @@ def prepare_augmented_data(
 
 
 def prepare_augmented_data_mean_only(
-        de_train,
+        de_train_h5ad,
+        layer,
         id_map
     ):
-    de_train = de_train.drop(columns = ['split'])
     xlist = ['cell_type', 'sm_name']
-    _ylist = ['cell_type', 'sm_name', 'sm_lincs_id', 'SMILES', 'control']
-    y = de_train.drop(columns=_ylist)
-    # train = pd.get_dummies(de_train[xlist], columns=xlist)
-    # test = pd.get_dummies(id_map[xlist], columns=xlist)
+    y = pd.DataFrame(
+        de_train_h5ad.layers[layer],
+        columns=de_train_h5ad.var_names,
+        index=de_train_h5ad.obs.index
+    )
     # Combine train and test data for one-hot encoding
-    combined_data = pd.concat([de_train[xlist], id_map[xlist]])
+    combined_data = pd.concat([de_train_h5ad.obs[xlist], id_map[xlist]])
 
     dum_data = pd.get_dummies(combined_data, columns=xlist)
 
     # Split the combined data back into train and test
-    train = dum_data.iloc[:len(de_train)]
-    test = dum_data.iloc[len(de_train):]
+    train = dum_data.iloc[:de_train_h5ad.n_obs]
+    test = dum_data.iloc[de_train_h5ad.n_obs:]
     # uncommon = [f for f in train if f not in test]
     # X = train.drop(columns=uncommon)
 
     X = train
-    de_cell_type = de_train.iloc[:, [0] + list(range(5, de_train.shape[1]))]
-    de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))]
+    de_cell_type = pd.concat([de_train_h5ad.obs[['cell_type']], y], axis=1)
+    de_sm_name = pd.concat([de_train_h5ad.obs[['sm_name']], y], axis=1)
     mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index()
     mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index()
     rows = []