diff --git a/.env-template b/.env-template
new file mode 100644
index 0000000..a83f166
--- /dev/null
+++ b/.env-template
@@ -0,0 +1,27 @@
+# Settings sourced by the bash scripts (source .env) and loaded by the Python modules via load_dotenv().
+# NOTE(review): python-dotenv only expands ${VAR} references and does not run $(...) command
+# substitution - confirm the Python modules receive usable values for the derived paths below.
+# Named repo_path (not parent_path) so that sourcing this file does not overwrite the caller's parent_path.
+repo_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+
+USERNAME= # name of user, used to name temporary files (e.g. rsterling)
+PROJECT_CODE=re_gecip_cancer_colorectal # project code for submitting jobs (e.g. re_gecip_cancer_colorectal for CRC GeCIP)
+PROJECT_DIR=$repo_path/workdir # path to project directory - this is where log files are dumped so should be a location with plenty of space (e.g. /re_scratch/...)
+
+SAMPLE_LIST= # path to sample list file (e.g. sample_list_2021_06_29.tsv)
+REF_SIGNATURES_DIR= # path to folder containing reference signature tables (i.e. the folder holding COSMIC_v3.3_CN_GRCh37.txt)
+SIG_DIR= # path to folder containing one sub-folder of signature matrices per signature type (scripts read SIG_DIR/<sig_type>, e.g. .../DBS78)
+GENE_LIST= # path to file containing list of genes (e.g. .../human-dna-repair-genes.tsv)
+
+FIGURE_DIR=$repo_path/figures # path to folder where figures will be saved (e.g. .../figures)
+DATA_DIR=$repo_path/data # path to data directory
+RESULT_DIR= # path to results folder read by the plotting and sample curation modules (e.g. .../results)
+
+COMBINED_SIGS_DIR=$repo_path/data/combined_signatures # path to folder containing combined signature matrices (e.g. .../combined_sigs)
+
+CLINVAR_CADD_CMD=/re_gecip/shared_allGeCIPs/bkinnersley/CADD/CADD-scripts-master/CADD.sh # Path to script for running CADD
+
+# Mutation file paths
+GERMLINE_DIR=/gel_data_resources/main_programme/aggregation/aggregate_gVCF_strelka/aggV2/genomic_data
+AGGV2_SAMPLE_LIST=/gel_data_resources/main_programme/aggregation/aggregate_gVCF_strelka/aggV2/additional_data/sample_list/aggV2_sampleIds_mpv10_78195.tsv
+ONCOKB_DIR= # Folder containing oncokb annotations (e.g. .../OncoKB_annotation/output)
diff --git a/.gitignore b/.gitignore
index e73fcd7..102df04 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # Data stored in data directory
 data/
+figures/
 
 # renv packages
 renv/
diff --git a/README.md b/README.md
index f0b96f3..7e8601d 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,22 @@ Code and results for paper:
 
 [Comprehensive repertoire of the chromosomal alteration and mutational signatures across 16 cancer types from 10,983 cancer patients](doi.org/10.1101/2023.06.07.23290970)
 
-Disclaimer: This code was written inside the Genomics England Research Environment without github and it has not been tested outside the research environment. We provide it here to aid the transparency of the publication and make our analysis methods more accessible and reproducible. The pipelines provided will almost certainly not be of significant use outside Genomics England, however, we hope users find pieces of code which help to understand our work and may be useful in their research.
+Disclaimer: This code was written inside the Genomics England Research Environment without GitHub and it has not been tested outside the research environment. We provide it here to aid the transparency of the publication and make our analysis methods more accessible and reproducible. The pipelines provided will almost certainly not be of significant use outside Genomics England; however, we hope users find pieces of the code helpful for understanding our work and useful in their own research.
+
+# Getting started
+
+Clone the repository
+`git clone https://github.com/Wedge-lab/Gel_pan_cancer_signatures.git`
+
+Create .env file
+```
+cd Gel_pan_cancer_signatures
+cp .env-template .env
+```
+In `.env`, set each variable to the corresponding file or directory path on your system.
+
+Run editable install on a fresh conda environment (python=3.9)
+`pip install -e .`
 
 
 # Contained in this repository
diff --git a/scripts/clinical.sh b/scripts/clinical.sh
index eae7867..74b3371 100755
--- a/scripts/clinical.sh
+++ b/scripts/clinical.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-# parameters
-PROJECT_DIR=../workdir
+source ../.env
 
 # directories and filenames
 run_name=clinical_tumour-group_InDelQC_nb
diff --git a/scripts/combine_signatures/combineCNV.sh b/scripts/combine_signatures/combineCNV.sh
index 60cad67..6030f1e 100755
--- a/scripts/combine_signatures/combineCNV.sh
+++ b/scripts/combine_signatures/combineCNV.sh
@@ -4,19 +4,20 @@
 #!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
+source ../../.env
 
 min_stability=1.0
 # Signature type
 sig_type=CNV48
 # Directory containing signatures
-sig_dir=${DATA_DIR}/copy_number_signatures/SigProfilerCNV48
+sig_dir=${SIG_DIR}/${sig_type}
 # COSMIC signatures file - column headered Type with each mutation type
 # Input signatures should be in additional columns
 # If there aren't any COSMIC reference signatures or extracting signatures deNovo, only include the Type column
-reference=${DATA_DIR}/COSMIC_v3.3_CN_GRCh37.txt
-sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv
+reference=${REF_SIGNATURES_DIR}/COSMIC_v3.3_CN_GRCh37.txt
+sample_file=${SAMPLE_LIST}
 
-dir_output=../../data/combinedSignatures_${sig_type}
+dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type}
 cohort_file=$dir_output/cohort_list.tsv
 
 # create input and output data directories
diff --git a/scripts/combine_signatures/combineDBS.sh b/scripts/combine_signatures/combineDBS.sh
index 0671b47..46e5229 100755
--- a/scripts/combine_signatures/combineDBS.sh
+++ b/scripts/combine_signatures/combineDBS.sh
@@ -2,20 +2,23 @@
 # Iteratively add cohort extracted signatures to pan-cancer COSMIC list
 
 #!/bin/bash
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd "$parent_path" || exit 1 # quoted for paths with spaces; abort so the relative .env path below cannot resolve elsewhere
+source ../../.env
 
 min_stability=1.0
 # Signature type
 sig_type=DBS78
 max_sigs=15
 # Directory containing signatures
-sig_dir=${DATA_DIR}/SIGmats/v4/${sig_type}
+sig_dir=${SIG_DIR}/${sig_type}
 
 # COSMIC signatures file - column headered Type with each mutation type
 # Input signatures should be in additional columns
-cosmic=${DATA_DIR}/COSMIC_v3.3_DBS_GRCh38.txt
-sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv
+cosmic=${REF_SIGNATURES_DIR}/COSMIC_v3.3_DBS_GRCh38.txt
+sample_file=${SAMPLE_LIST}
 
-dir_output=../../data/combinedSignatures_${sig_type}
+dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type}
 cohort_file=$dir_output/cohort_list.tsv
 
 # create input and output data directories
diff --git a/scripts/combine_signatures/combineID.sh b/scripts/combine_signatures/combineID.sh
index bfa03fd..0185548 100755
--- a/scripts/combine_signatures/combineID.sh
+++ b/scripts/combine_signatures/combineID.sh
@@ -2,19 +2,22 @@
 # Iteratively add cohort extracted signatures to pan-cancer COSMIC list
 
 #!/bin/bash
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd "$parent_path" || exit 1 # quoted for paths with spaces; abort so the relative .env path below cannot resolve elsewhere
+source ../../.env
 
 min_stability=1.0
 # Signature type
 sig_type=ID83
 max_sigs=25
 # Directory containing signatures
-sig_dir=${DATA_DIR}/SIGmats/v3/${sig_type} #v2_draft/${sig_type}
+sig_dir=${SIG_DIR}/${sig_type}
 # COSMIC signatures file - column headered Type with each mutation type
 # Input signatures should be in additional columns
-reference=${DATA_DIR}/COSMIC_v3.3_ID_GRCh37.txt
-sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv
+reference=${REF_SIGNATURES_DIR}/COSMIC_v3.3_ID_GRCh37.txt
+sample_file=${SAMPLE_LIST}
 
-dir_output=../../data/combinedSignatures_${sig_type}
+dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type}
 cohort_file=$dir_output/cohort_list.tsv
 
 # create input and output data directories
diff --git a/scripts/combine_signatures/combineSBS.sh b/scripts/combine_signatures/combineSBS.sh
index ccd011c..09deb72 100755
--- a/scripts/combine_signatures/combineSBS.sh
+++ b/scripts/combine_signatures/combineSBS.sh
@@ -2,19 +2,22 @@
 # Iteratively add cohort extracted signatures to pan-cancer COSMIC list
 
 #!/bin/bash
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd "$parent_path" || exit 1 # quoted for paths with spaces; abort so the relative .env path below cannot resolve elsewhere
+source ../../.env
 
 min_stability=1.0
 # Signature type
 sig_type=SBS288
 max_sigs=30
 # Directory containing signatures
-sig_dir=/re_gecip/cancer_pan/fprefect/botl/results/SIGmats/v2_draft/${sig_type}
+sig_dir=${SIG_DIR}/${sig_type}
 # COSMIC signatures file - column headered Type with each mutation type
 # Input signatures should be in additional columns
-cosmic=${DATA_DIR}/COSMIC_v3.3.1_SBS_GRCh38.txt
-sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv
+cosmic=${REF_SIGNATURES_DIR}/COSMIC_v3.3.1_SBS_GRCh38.txt
+sample_file=${SAMPLE_LIST}
 
-dir_output=../../data/combinedSignatures_${sig_type}
+dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type}
 cohort_file=$dir_output/cohort_list.tsv
 
 # create input and output data directories
diff --git a/scripts/combine_signatures/combineSV.sh b/scripts/combine_signatures/combineSV.sh
index 2be9f98..fa538f6 100755
--- a/scripts/combine_signatures/combineSV.sh
+++ b/scripts/combine_signatures/combineSV.sh
@@ -2,19 +2,22 @@
 # Iteratively add cohort extracted signatures to pan-cancer Reference list
 
 #!/bin/bash
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd "$parent_path" || exit 1 # quoted for paths with spaces; abort so the relative .env path below cannot resolve elsewhere
+source ../../.env
 
 min_stability=1.0
 # Signature type
 sig_type=SV32
 max_sigs=15
 # Directory containing signatures
-sig_dir=${DATA_DIR}/Results_SigProfiler_v2/
+sig_dir=${SIG_DIR}/${sig_type}
 # Reference signatures file - column headered Type with each mutation type
 # Input signatures should be in additional columns
-reference=${DATA_DIR}/Breast560_rearrangement.signatures.tsv
-sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv
+reference=${REF_SIGNATURES_DIR}/Breast560_rearrangement.signatures.tsv
+sample_file=${SAMPLE_LIST}
 
-dir_output=.../../combinedSignatures_${sig_type}
+dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type}
 cohort_file=$dir_output/cohort_list.tsv
 
 # create input and output data directories
diff --git a/scripts/data_prep/clinvar_cadd.sh b/scripts/data_prep/clinvar_cadd.sh
index ddd31de..45a4305 100755
--- a/scripts/data_prep/clinvar_cadd.sh
+++ b/scripts/data_prep/clinvar_cadd.sh
@@ -3,9 +3,7 @@
 
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-# parameters
-PROJECT_DIR=../workdir
+source ../../.env
 
 # directories and filenames
 dir_analysis=CLINVAR_DIR
diff --git a/scripts/data_prep/germline.sh b/scripts/data_prep/germline.sh
index cbfcc07..cbead33 100755
--- a/scripts/data_prep/germline.sh
+++ b/scripts/data_prep/germline.sh
@@ -1,14 +1,10 @@
 #!/bin/bash
 # run cancGeneHits.nf
 
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
 source ../../.env
 
-# parameters
-PROJECT_DIR=../workdir
-
 # Run parameters
 n_files=10000 # Max number of files to run (input a small number if just testing)
 PHRED_threshold=20
diff --git a/scripts/data_prep/loh.sh b/scripts/data_prep/loh.sh
index 5e9e49c..adf5684 100755
--- a/scripts/data_prep/loh.sh
+++ b/scripts/data_prep/loh.sh
@@ -3,11 +3,7 @@
 
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-source ../.env
-
-# parameters
-PROJECT_DIR=../workdir
+source ../../.env
 
 # Run parameters
 chunk_size=100 # Number of samples to run on a single process
@@ -25,7 +21,7 @@ mkdir -p $dir_analysis/input
 mkdir -p $dir_output
 
 # Battenberg file list
-awk 'NR==1 {for (i=1; i<=NF; i++) {f[$i] = i}}{ print $(f["tumour_sample_platekey"])"\t"$(f["filename_cna"]) }' /re_gecip/cancer_pan/fprefect/botl/results/sample_lists/sample_list_2021_06_29.tsv | sed 's/filename_cna/filename_batt/' > ${filename_battenberg_list}
+awk 'NR==1 {for (i=1; i<=NF; i++) {f[$i] = i}}{ print $(f["tumour_sample_platekey"])"\t"$(f["filename_cna"]) }' "${SAMPLE_LIST}" | sed 's/filename_cna/filename_batt/' > "${filename_battenberg_list}"
 
 # Gene regions from
 echo -e "chrom\tstart\tend\tgene_id" > ${filename_genes}
diff --git a/scripts/data_prep/make_germline_input.py b/scripts/data_prep/make_germline_input.py
index c6959c4..c75485c 100755
--- a/scripts/data_prep/make_germline_input.py
+++ b/scripts/data_prep/make_germline_input.py
@@ -1,26 +1,37 @@
+import os
 import sys
-import pandas as pd, numpy as np, os
+
+import numpy as np
+import pandas as pd
 from dotenv import load_dotenv
 
 load_dotenv()
-AGGV2_SAMPLE_LIST = os.getenv('AGGV2_SAMPLE_LIST')
-
-if __name__=='__main__':
+AGGV2_SAMPLE_LIST = os.getenv("AGGV2_SAMPLE_LIST")
 
+if __name__ == "__main__":
     dir_output, sample_list_file = sys.argv[1:]
 
     # Get all samples from previous projects
-    cancer_analysis_tables = [f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv"
-                              for version in ['V8', 'V11/V11_reheadered', 'v14/v14_reheadered']]
+    cancer_analysis_tables = [
+        f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv"
+        for version in ["V8", "V11/V11_reheadered", "v14/v14_reheadered"]
+    ]
     germline_platekey_list = np.array([])
     for table in cancer_analysis_tables:
-        germline_platekey_list = np.union1d(germline_platekey_list,
-                                            np.array(pd.read_csv(table, sep="\t", usecols=["germline_sample_platekey"])))
+        germline_platekey_list = np.union1d(
+            germline_platekey_list,
+            np.array(
+                pd.read_csv(table, sep="\t", usecols=["germline_sample_platekey"])
+            ),
+        )
 
     # Crossmatch with aggV2
-    aggv2_sample_list = np.array(pd.read_csv(AGGV2_SAMPLE_LIST,
-                                    sep="\t", header=None)[0])
+    aggv2_sample_list = np.array(
+        pd.read_csv(AGGV2_SAMPLE_LIST, sep="\t", header=None)[0]
+    )
 
     germline_platekey_list = np.intersect1d(germline_platekey_list, aggv2_sample_list)
     print(f"{len(germline_platekey_list)} germline samples")
-    pd.DataFrame(germline_platekey_list, columns=['germline_sample_platekey']).to_csv(f"{sample_list_file}", header=False, index=False)
+    pd.DataFrame(germline_platekey_list, columns=["germline_sample_platekey"]).to_csv(
+        f"{sample_list_file}", header=False, index=False
+    )
diff --git a/scripts/data_prep/make_somatic_input.py b/scripts/data_prep/make_somatic_input.py
index b756c45..4820be0 100755
--- a/scripts/data_prep/make_somatic_input.py
+++ b/scripts/data_prep/make_somatic_input.py
@@ -1,18 +1,25 @@
-import sys, os
-import pandas as pd, numpy as np
+import os
+import sys
 
+import numpy as np
+import pandas as pd
 
-if __name__=='__main__':
-
+if __name__ == "__main__":
     dir_output, sample_list_file = sys.argv[1:]
 
     # Get all samples from previous projects
-    cancer_analysis_tables = [f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv"
-                              for version in ['V8', 'V11/V11_reheadered', 'v14/v14_reheadered']]
+    cancer_analysis_tables = [
+        f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv"
+        for version in ["V8", "V11/V11_reheadered", "v14/v14_reheadered"]
+    ]
     tumour_platekey_list = np.array([])
     for table in cancer_analysis_tables:
-        tumour_platekey_list = np.union1d(tumour_platekey_list,
-                                            np.array(pd.read_csv(table, sep="\t", usecols=["tumour_sample_platekey"])))
+        tumour_platekey_list = np.union1d(
+            tumour_platekey_list,
+            np.array(pd.read_csv(table, sep="\t", usecols=["tumour_sample_platekey"])),
+        )
 
     print(f"{len(tumour_platekey_list)} tumour_samples")
-    pd.DataFrame(tumour_platekey_list, columns=['tumour_sample_platekey']).to_csv(f"{sample_list_file}", header=False, index=False)
+    pd.DataFrame(tumour_platekey_list, columns=["tumour_sample_platekey"]).to_csv(
+        f"{sample_list_file}", header=False, index=False
+    )
diff --git a/scripts/data_prep/oncokb.sh b/scripts/data_prep/oncokb.sh
index 18bc2cb..b6acb5e 100755
--- a/scripts/data_prep/oncokb.sh
+++ b/scripts/data_prep/oncokb.sh
@@ -3,11 +3,7 @@
 
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-source ../.env
-
-# parameters
-PROJECT_DIR=../../workdir
+source ../../.env
 
 # directories and filenames
 dir_analysis=../../data/cancGeneHits/somatic
diff --git a/scripts/genotype_assoc.sh b/scripts/genotype_assoc.sh
index 1dcd687..57ec58f 100755
--- a/scripts/genotype_assoc.sh
+++ b/scripts/genotype_assoc.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-# parameters
-PROJECT_DIR=../workdir
+source ../.env
 
 # directories and filenames
 resampling_method=dCRT
diff --git a/scripts/germline_assoc.sh b/scripts/germline_assoc.sh
index 71452ea..bce2d1a 100755
--- a/scripts/germline_assoc.sh
+++ b/scripts/germline_assoc.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-# parameters
-PROJECT_DIR=../workdir
+source ../.env
 
 # directories and filenames
 resampling_method=dCRT
diff --git a/scripts/somatic_assoc.sh b/scripts/somatic_assoc.sh
index 9fa308d..07b3072 100755
--- a/scripts/somatic_assoc.sh
+++ b/scripts/somatic_assoc.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-# parameters
-PROJECT_DIR=../workdir
+source ../.env
 
 # directories and filenames
 resampling_method=dCRT
diff --git a/scripts/treatment_assoc.sh b/scripts/treatment_assoc.sh
index 16ac358..f253eaa 100755
--- a/scripts/treatment_assoc.sh
+++ b/scripts/treatment_assoc.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-# parameters
-PROJECT_DIR=../workdir
+source ../.env
 
 # directories and filenames
 resampling_method=dCRT
diff --git a/scripts/twohit_assoc.sh b/scripts/twohit_assoc.sh
index 690090e..31d323c 100755
--- a/scripts/twohit_assoc.sh
+++ b/scripts/twohit_assoc.sh
@@ -1,10 +1,7 @@
 #!/bin/bash
-#!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-# parameters
-PROJECT_DIR=../workdir
+source ../.env
 
 # directories and filenames
 resampling_method=dCRT
diff --git a/src/signatures/associations/cancGeneHit/targets.py b/src/signatures/associations/cancGeneHit/targets.py
index 9048069..9f25836 100755
--- a/src/signatures/associations/cancGeneHit/targets.py
+++ b/src/signatures/associations/cancGeneHit/targets.py
@@ -8,6 +8,7 @@
 
 load_dotenv()
 DATA_DIR = os.getenv("DATA_DIR")
+GENE_LIST = os.getenv("GENE_LIST")
 
 if __name__ == "__main__":
     samples_file, targets_file, n = sys.argv[1:]
@@ -117,7 +118,7 @@
 
     # DNA types from database
     DNA_repair = pd.merge(
-        pd.read_csv(f"{DATA_DIR}/human-dna-repair-genes.tsv", sep="\t"),
+        pd.read_csv(GENE_LIST, sep="\t"),
         pd.DataFrame(index=np.array(genes)),
         left_on="Gene",
         right_index=True,
diff --git a/src/signatures/associations/cancGeneHit/targets_germline.py b/src/signatures/associations/cancGeneHit/targets_germline.py
index 78be786..27e9d7a 100755
--- a/src/signatures/associations/cancGeneHit/targets_germline.py
+++ b/src/signatures/associations/cancGeneHit/targets_germline.py
@@ -8,6 +8,7 @@
 
 load_dotenv()
 DATA_DIR = os.getenv("DATA_DIR")
+GENE_LIST = os.getenv("GENE_LIST")  # same .env setting as the sibling cancGeneHit modules
 
 if __name__ == "__main__":
     samples_file, targets_file = sys.argv[1:]
@@ -63,7 +64,7 @@
 
     # DNA types from database
     DNA_repair = pd.merge(
-        pd.read_csv(f"{DATA_DIR}/human-dna-repair-genes.tsv", sep="\t"),
+        pd.read_csv(GENE_LIST, sep="\t"),
         pd.DataFrame(index=np.array(genes)),
         left_on="Gene",
         right_index=True,
diff --git a/src/signatures/associations/cancGeneHit/targets_somatic.py b/src/signatures/associations/cancGeneHit/targets_somatic.py
index 5e8e9ea..c362001 100755
--- a/src/signatures/associations/cancGeneHit/targets_somatic.py
+++ b/src/signatures/associations/cancGeneHit/targets_somatic.py
@@ -8,6 +8,7 @@
 
 load_dotenv()
 DATA_DIR = os.getenv("DATA_DIR")
+GENE_LIST = os.getenv("GENE_LIST")
 
 if __name__ == "__main__":
     samples_file, targets_file = sys.argv[1:]
@@ -61,7 +62,7 @@
 
     # DNA types from database
     DNA_repair = pd.merge(
-        pd.read_csv(f"{DATA_DIR}/human-dna-repair-genes.tsv", sep="\t"),
+        pd.read_csv(GENE_LIST, sep="\t"),
         pd.DataFrame(index=np.array(genes)),
         left_on="Gene",
         right_index=True,
diff --git a/src/signatures/associations/cancGeneHit/tests.py b/src/signatures/associations/cancGeneHit/tests.py
index 8dcb239..6e3df78 100755
--- a/src/signatures/associations/cancGeneHit/tests.py
+++ b/src/signatures/associations/cancGeneHit/tests.py
@@ -8,6 +8,7 @@
 
 load_dotenv()
 DATA_DIR = os.getenv("DATA_DIR")
+GENE_LIST = os.getenv("GENE_LIST")
 
 if __name__ == "__main__":
     samples_file, signatures_file, targets_file, tests_file, tests_file_binary, n = (
@@ -22,7 +23,7 @@
     groups = pd.read_csv(samples_file, usecols=["sample_id", "group"], sep="\t")
 
     # Subset for DNA repair genes
-    DNA_repair = pd.read_csv(f"{DATA_DIR}/human-dna-repair-genes.tsv", sep="\t")
+    DNA_repair = pd.read_csv(GENE_LIST, sep="\t")
     if True:
         mock_genes = targets.keys()[
             [bool(re.search("^MOCK[0-9A-Z]+$", tgt)) for tgt in targets.keys()]
diff --git a/src/signatures/associations/cancGeneHit/tests_germline.py b/src/signatures/associations/cancGeneHit/tests_germline.py
index caf47a5..d3a1698 100755
--- a/src/signatures/associations/cancGeneHit/tests_germline.py
+++ b/src/signatures/associations/cancGeneHit/tests_germline.py
@@ -8,6 +8,7 @@
 
 load_dotenv()
 DATA_DIR = os.getenv("DATA_DIR")
+GENE_LIST = os.getenv("GENE_LIST")
 
 if __name__ == "__main__":
     samples_file, signatures_file, targets_file, tests_file, tests_file_binary = (
@@ -22,7 +23,7 @@
     groups = pd.read_csv(samples_file, usecols=["sample_id", "group"], sep="\t")
 
     # Subset for DNA repair genes
-    DNA_repair = pd.read_csv(f"{DATA_DIR}/human-dna-repair-genes.tsv", sep="\t")
+    DNA_repair = pd.read_csv(GENE_LIST, sep="\t")
     if True:
         mock_genes = targets.keys()[
             [bool(re.search("^MOCK[0-9A-Z]+$", tgt)) for tgt in targets.keys()]
diff --git a/src/signatures/associations/clinical/samples_and_tests.py b/src/signatures/associations/clinical/samples_and_tests.py
index 9807d48..9e9de15 100755
--- a/src/signatures/associations/clinical/samples_and_tests.py
+++ b/src/signatures/associations/clinical/samples_and_tests.py
@@ -6,6 +6,7 @@
 
 load_dotenv()
 DATA_DIR = os.getenv("DATA_DIR")
+SAMPLE_LIST = os.getenv("SAMPLE_LIST")
 
 if __name__ == "__main__":
     samples_file, activities_file = sys.argv[1:]
@@ -18,7 +19,7 @@
 
     # Get tumour group
     sample_list_df = pd.read_csv(
-        f"{DATA_DIR}/sample_lists_incl_SEGs/sample_list.tsv",
+        SAMPLE_LIST,
         usecols=[
             "participant_id",
             "tumour_sample_platekey",
diff --git a/src/signatures/plotting/associations/clinicalAssoc.py b/src/signatures/plotting/associations/clinicalAssoc.py
index 7e02bdf..10cefba 100755
--- a/src/signatures/plotting/associations/clinicalAssoc.py
+++ b/src/signatures/plotting/associations/clinicalAssoc.py
@@ -521,7 +521,7 @@ def publish_fig(filename, publish="./"):
     plt.xlim(-2.5, 2.5)
 
     plt.xlabel(r"$\beta$", fontsize=fs)
-    publish_fig("clinical_panel", publish=FIGURE_DIR)
+    publish_fig("clinical_panel", publish=f"{FIGURE_DIR}")
 
     # ## Histology
 
@@ -533,7 +533,9 @@ def publish_fig(filename, publish="./"):
     results_target = pd.read_csv(f"{results_dir}/target_target_assoc.csv")
     fig_dir = f"{results_dir}/figures"
 
-    results_dir_cohort = f"/re_gecip/shared_allGeCIPs/pancancer_signatures/results/associations/clinicalSigs/{run_name}/output_cohort"
+    results_dir_cohort = (
+        f"{RESULT_DIR}/associations/clinicalSigs/{run_name}/output_cohort"
+    )
     results_zinb = pd.concat(
         (
             results_zinb,
@@ -708,4 +710,4 @@ def publish_fig(filename, publish="./"):
         plt.sca(ax)
         plt.axis("off")
 
-    publish_fig("histology_rate_comparison", publish=FIGURE_DIR)
+    publish_fig("histology_rate_comparison", publish=f"{FIGURE_DIR}")
diff --git a/src/signatures/plotting/associations/survivalCoefs.py b/src/signatures/plotting/associations/survivalCoefs.py
index 1563258..8519ec4 100644
--- a/src/signatures/plotting/associations/survivalCoefs.py
+++ b/src/signatures/plotting/associations/survivalCoefs.py
@@ -1,14 +1,20 @@
+import os
+
 import matplotlib as mpl
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import scipy
 import scipy.stats
+from dotenv import load_dotenv
 from lifelines import CoxPHFitter, KaplanMeierFitter
 from lifelines.statistics import logrank_test
 
 from signatures.plotting.combinedSignatures import loadSignatures, signatureRenamer
 
+load_dotenv()
+SAMPLE_LIST = os.getenv("SAMPLE_LIST")
+
 mpl.rcParams["mathtext.fontset"] = "stix"
 mpl.rcParams["font.family"] = "STIXGeneral"
 plt.rc("axes", labelsize=16)
@@ -106,7 +112,7 @@ def result_string(sig, cph_alt):
 
     # Add tumour type
     tumour_type = pd.read_csv(
-        "/re_gecip/shared_allGeCIPs/pancancer_signatures/results/sample_lists_incl_SEGs/sample_list_2021_06_29.tsv",
+        SAMPLE_LIST,
         sep="\t",
         usecols=[
             "participant_id",
diff --git a/src/signatures/plotting/associations/twoHitAssoc.py b/src/signatures/plotting/associations/twoHitAssoc.py
index 99e091f..0a74744 100755
--- a/src/signatures/plotting/associations/twoHitAssoc.py
+++ b/src/signatures/plotting/associations/twoHitAssoc.py
@@ -23,6 +23,7 @@
 RESULT_DIR = os.getenv("RESULT_DIR")
 FIGURE_DIR = os.getenv("FIGURE_DIR")
 DATA_DIR = os.getenv("DATA_DIR")
+GENE_LIST = os.getenv("GENE_LIST")
 
 mpl.rcParams["mathtext.fontset"] = "stix"
 mpl.rcParams["font.family"] = "STIXGeneral"
@@ -361,7 +362,7 @@ def getAssociationResults(results_dir, DNA_repair):
     print(len(results_combined))
 
     DNArepair = pd.read_csv(
-        "/re_gecip/shared_allGeCIPs/pancancer_signatures/data/human-dna-repair-genes.tsv",
+        GENE_LIST,
         sep="\t",
     )
     results_combined = pd.merge(
@@ -2198,7 +2199,7 @@ def plotGermlineSomatic(
     test_df.group = test_df.group.str.replace("Connective", "Sarcoma")
 
     # Subset DNA repair gene list
-    DNA_repair = pd.read_csv(f"{DATA_DIR}/human-dna-repair-genes.tsv", sep="\t")
+    DNA_repair = pd.read_csv(GENE_LIST, sep="\t")
     DNA_repair = DNA_repair[DNA_repair.Gene.map(lambda x: x in target_df.keys())]
     Types = [
         "BER",
diff --git a/src/signatures/plotting/combinedSignatures.py b/src/signatures/plotting/combinedSignatures.py
index 2492c1d..9549f7e 100755
--- a/src/signatures/plotting/combinedSignatures.py
+++ b/src/signatures/plotting/combinedSignatures.py
@@ -24,9 +24,9 @@
 
 load_dotenv()
 COMBINED_SIGS_DIR = os.getenv("COMBINED_SIGS_DIR")
-REF_DIR = os.getenv("REF_DIR")
 FIGURE_DIR = os.getenv("FIGURE_DIR")
 DATA_DIR = os.getenv("DATA_DIR")
+REF_SIGNATURES_DIR = os.getenv("REF_SIGNATURES_DIR")
 
 mpl.rcParams["mathtext.fontset"] = "stix"
 mpl.rcParams["font.family"] = "STIXGeneral"
@@ -40,7 +40,7 @@
 # Signature directories
 sig_dirs = {
     "SBS288": f"{COMBINED_SIGS_DIR}/combinedSignatures_SBS288",
-    "DBS78": f"{COMBINED_SIGS_DIR}/combinedSignatures_DBS78_V4",
+    "DBS78": f"{COMBINED_SIGS_DIR}/combinedSignatures_DBS78",
     "ID83": f"{COMBINED_SIGS_DIR}/combinedSignatures_ID83",
     "CNV48": f"{COMBINED_SIGS_DIR}/combinedSignatures_CNV48",
     "SV32": f"{COMBINED_SIGS_DIR}/combinedSignatures_SV32",
@@ -49,11 +49,11 @@
 
 # COSMIC and other reference files
 references = {
-    "SBS288": f"{REF_DIR}/COSMIC_v3.3.1_SBS_GRCh38.txt",
-    "DBS78": f"{REF_DIR}/COSMIC_v3.3_DBS_GRCh38.txt",
-    "ID83": f"{REF_DIR}/COSMIC_v3.3_ID_GRCh37.txt",
-    "CNV48": f"{REF_DIR}/COSMIC_v3.3_CN_GRCh37.txt",
-    "SV32": f"{REF_DIR}/Breast560_rearrangement.signatures.tsv",
+    "SBS288": f"{REF_SIGNATURES_DIR}/COSMIC_v3.3.1_SBS_GRCh38.txt",
+    "DBS78": f"{REF_SIGNATURES_DIR}/COSMIC_v3.3_DBS_GRCh38.txt",
+    "ID83": f"{REF_SIGNATURES_DIR}/COSMIC_v3.3_ID_GRCh37.txt",
+    "CNV48": f"{REF_SIGNATURES_DIR}/COSMIC_v3.3_CN_GRCh37.txt",
+    "SV32": f"{REF_SIGNATURES_DIR}/Breast560_rearrangement.signatures.tsv",
 }
 
 map_colors = {"degasperi": "darkorange", "novel": "firebrick"}
@@ -134,12 +134,12 @@ def loadSignatures(sv_rename_deg=False):
     # Get reference Degasperi 2022/2020 signatures
     degasperi_sigs = {
         "SBS288": pd.read_excel(
-            f"{DATA_DIR}/science.abl9283_tables_s1_to_s33.xlsx", "Table S21"
+            f"{REF_SIGNATURES_DIR}/science.abl9283_tables_s1_to_s33.xlsx", "Table S21"
         ).set_index("mutationClass"),
         "DBS78": pd.read_excel(
-            f"{DATA_DIR}/science.abl9283_tables_s1_to_s33.xlsx", "Table S22"
+            f"{REF_SIGNATURES_DIR}/science.abl9283_tables_s1_to_s33.xlsx", "Table S22"
         ).set_index("mutationClass"),
-        "SV32": pd.read_csv(f"{DATA_DIR}/RefSigv1_Rearr.tsv", sep="\t"),
+        "SV32": pd.read_csv(f"{REF_SIGNATURES_DIR}/RefSigv1_Rearr.tsv", sep="\t"),
     }
     for sig_type in degasperi_sigs:
         degasperi_sigs[sig_type] = degasperi_sigs[sig_type].loc[
diff --git a/src/signatures/sampleCuration/GEL_NCRAS_XM.py b/src/signatures/sampleCuration/GEL_NCRAS_XM.py
index d28bde3..87a7148 100755
--- a/src/signatures/sampleCuration/GEL_NCRAS_XM.py
+++ b/src/signatures/sampleCuration/GEL_NCRAS_XM.py
@@ -9,6 +9,10 @@
 load_dotenv()
 DATA_DIR = os.getenv("DATA_DIR")
 RESULT_DIR = os.getenv("RESULT_DIR")
+SAMPLE_LIST = os.getenv("SAMPLE_LIST")
+for _var in ("DATA_DIR", "RESULT_DIR", "SAMPLE_LIST"):
+    if os.getenv(_var) is None:  # report which variable is missing
+        raise ValueError(f"Environment variable {_var} is not set")
 
 group_mapping = {
     "BileDuct-AdenoCA": ["adenocarcinoma"],  # cholangiocarcinoma
@@ -123,9 +127,9 @@
 
 def crossmatchGelNcras():
     # Import curated sample list from Alex and Dan
-    sample_df = pd.read_csv(
-        f"{RESULT_DIR}/sample_lists/sample_list_2021_06_29.tsv", delim_whitespace=True
-    ).rename(columns={"age_sampling": "age"})
+    sample_df = pd.read_csv(SAMPLE_LIST, delim_whitespace=True).rename(
+        columns={"age_sampling": "age"}
+    )
 
     # Import NCRAS data and crossmatch on tumour_pseudo_id
     sact = pd.read_csv(
diff --git a/src/signatures/sampleCuration/pan_cancer_sample.py b/src/signatures/sampleCuration/pan_cancer_sample.py
index 17c578c..cfd27f2 100755
--- a/src/signatures/sampleCuration/pan_cancer_sample.py
+++ b/src/signatures/sampleCuration/pan_cancer_sample.py
@@ -9,6 +9,7 @@
 load_dotenv()
 RESULT_DIR = os.getenv("RESULT_DIR")
 DATA_DIR = os.getenv("DATA_DIR")
+SAMPLE_LIST = os.getenv("SAMPLE_LIST")
 
 
 def getSamples(
@@ -37,7 +38,7 @@ def getSamples(
         "day_last_followup",
     ]
     sample_df = pd.read_csv(
-        f"{RESULT_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29.tsv",
+        SAMPLE_LIST,
         usecols=sample_keys,
         delim_whitespace=True,
     ).rename(columns={"age_sampling": "age"})