Merge pull request #20 from Wedge-lab/feat/code-from-gel

env-template
Wedge-lab · Dec 31, 2024 · 1c7995d · 1c7995d
2 parents f5309b8 + 6e419fb
commit 1c7995d
Show file tree

Hide file tree

Showing 32 changed files with 167 additions and 104 deletions.
diff --git a/.env-template b/.env-template
@@ -0,0 +1,22 @@
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+
+USERNAME= # name of user, used to name temporary files (e.g. rsterling)
+PROJECT_CODE=re_gecip_cancer_colorectal # project code for submitting jobs (e.g. re_gecip_cancer_colorectal for CRC GeCIP)
+PROJECT_DIR=$parent_path/workdir # path to project directory - this is where log files are dumped so should be a location with plenty of space (e.g. /re_scratch/...)
+
+SAMPLE_LIST= # path to sample list file (e.g. sample_list_2021_06_29.tsv)
+REF_SIGNATURES_DIR= # path to folder containing reference signature tables (e.g. the folder holding COSMIC_v3.3_CN_GRCh37.txt)
+SIG_DIR= # path to folder containing signature matrices, with one subfolder per signature type (e.g. CNV48, SBS288)
+GENE_LIST= # path to file containing list of genes (e.g. .../human-dna-repair-genes.tsv)
+
+FIGURE_DIR=$parent_path/figures # path to folder where figures will be saved (e.g. .../figures)
+DATA_DIR=$parent_path/data
+
+COMBINED_SIGS_DIR=$parent_path/data/combined_signatures # path to folder containing combined signature matrices (e.g. .../combined_sigs)
+
+CLINVAR_CADD_CMD=/re_gecip/shared_allGeCIPs/bkinnersley/CADD/CADD-scripts-master/CADD.sh # Path to script for running CADD
+
+# Mutation file paths
+GERMLINE_DIR=/gel_data_resources/main_programme/aggregation/aggregate_gVCF_strelka/aggV2/genomic_data
+AGGV2_SAMPLE_LIST=/gel_data_resources/main_programme/aggregation/aggregate_gVCF_strelka/aggV2/additional_data/sample_list/aggV2_sampleIds_mpv10_78195.tsv
+ONCOKB_DIR= # Folder containing oncokb annotations (e.g. .../OncoKB_annotation/output)
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 # Data stored in data directory
 data/
+figures/
 
 # renv packages
 renv/

diff --git a/README.md b/README.md
@@ -3,7 +3,22 @@ Code and results for paper:
 
 [Comprehensive repertoire of the chromosomal alteration and mutational signatures across 16 cancer types from 10,983 cancer patients](https://doi.org/10.1101/2023.06.07.23290970)
 
-Disclaimer: This code was written inside the Genomics England Research Environment without github and it has not been tested outside the research environment. We provide it here to aid the transparency of the publication and make our analysis methods more accessible and reproducible. The pipelines provided will almost certainly not be of significant use outside Genomics England, however, we hope users find pieces of code which help to understand our work and may be useful in their research.
+Disclaimer: This code was written inside the Genomics England Research Environment without GitHub, and it has not been tested outside the research environment. We provide it here to aid the transparency of the publication and make our analysis methods more accessible and reproducible. The pipelines provided will almost certainly not be of significant use outside Genomics England; however, we hope users find pieces of code that help them understand our work and are useful in their own research.
+
+# Getting started
+
+Clone the repository
+`git clone https://github.com/Wedge-lab/Gel_pan_cancer_signatures.git`
+
+Create .env file
+```
+cd Gel_pan_cancer_signatures
+cp .env-template .env
+```
+In `.env`, change the file paths as required to point to your local files.
+
+Run editable install on a fresh conda environment (python=3.9)
+`pip install -e .`
 
 
 # Contained in this repository

diff --git a/scripts/clinical.sh b/scripts/clinical.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-# parameters
-PROJECT_DIR=../workdir
+source ../.env
 
 # directories and filenames
 run_name=clinical_tumour-group_InDelQC_nb

diff --git a/scripts/combine_signatures/combineCNV.sh b/scripts/combine_signatures/combineCNV.sh
@@ -4,19 +4,20 @@
 #!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
+source ../../.env
 
 min_stability=1.0
 # Signature type
 sig_type=CNV48
 # Directory containing signatures
-sig_dir=${DATA_DIR}/copy_number_signatures/SigProfilerCNV48
+sig_dir=${SIG_DIR}/${sig_type}
 # COSMIC signatures file - column headered Type with each mutation type
 # Input signatures should be in additional columns
 # If there aren't any COSMIC reference signatures or extracting signatures deNovo, only include the Type column
-reference=${DATA_DIR}/COSMIC_v3.3_CN_GRCh37.txt
-sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv
+reference=${REF_SIGNATURES_DIR}/COSMIC_v3.3_CN_GRCh37.txt
+sample_file=${SAMPLE_LIST}
 
-dir_output=../../data/combinedSignatures_${sig_type}
+dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type}
 cohort_file=$dir_output/cohort_list.tsv
 
 # create input and output data directories

diff --git a/scripts/combine_signatures/combineDBS.sh b/scripts/combine_signatures/combineDBS.sh
@@ -2,20 +2,23 @@
 # Iteratively add cohort extracted signatures to pan-cancer COSMIC list
 
 #!/bin/bash
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd $parent_path
+source ../../.env
 
 min_stability=1.0
 # Signature type
 sig_type=DBS78
 max_sigs=15
 # Directory containing signatures
-sig_dir=${DATA_DIR}/SIGmats/v4/${sig_type}
+sig_dir=${SIG_DIR}/${sig_type}
 
 # COSMIC signatures file - column headered Type with each mutation type
 # Input signatures should be in additional columns
-cosmic=${DATA_DIR}/COSMIC_v3.3_DBS_GRCh38.txt
-sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv
+cosmic=${REF_SIGNATURES_DIR}/COSMIC_v3.3_DBS_GRCh38.txt
+sample_file=${SAMPLE_LIST}
 
-dir_output=../../data/combinedSignatures_${sig_type}
+dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type}
 cohort_file=$dir_output/cohort_list.tsv
 
 # create input and output data directories

diff --git a/scripts/combine_signatures/combineID.sh b/scripts/combine_signatures/combineID.sh
@@ -2,19 +2,22 @@
 # Iteratively add cohort extracted signatures to pan-cancer COSMIC list
 
 #!/bin/bash
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd $parent_path
+source ../../.env
 
 min_stability=1.0
 # Signature type
 sig_type=ID83
 max_sigs=25
 # Directory containing signatures
-sig_dir=${DATA_DIR}/SIGmats/v3/${sig_type} #v2_draft/${sig_type}
+sig_dir=${SIG_DIR}/${sig_type}
 # COSMIC signatures file - column headered Type with each mutation type
 # Input signatures should be in additional columns
-reference=${DATA_DIR}/COSMIC_v3.3_ID_GRCh37.txt
-sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv
+reference=${REF_SIGNATURES_DIR}/COSMIC_v3.3_ID_GRCh37.txt
+sample_file=${SAMPLE_LIST}
 
-dir_output=../../data/combinedSignatures_${sig_type}
+dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type}
 cohort_file=$dir_output/cohort_list.tsv
 
 # create input and output data directories

diff --git a/scripts/combine_signatures/combineSBS.sh b/scripts/combine_signatures/combineSBS.sh
@@ -2,19 +2,22 @@
 # Iteratively add cohort extracted signatures to pan-cancer COSMIC list
 
 #!/bin/bash
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd $parent_path
+source ../../.env
 
 min_stability=1.0
 # Signature type
 sig_type=SBS288
 max_sigs=30
 # Directory containing signatures
-sig_dir=/re_gecip/cancer_pan/fprefect/botl/results/SIGmats/v2_draft/${sig_type}
+sig_dir=${SIG_DIR}/${sig_type}
 # COSMIC signatures file - column headered Type with each mutation type
 # Input signatures should be in additional columns
-cosmic=${DATA_DIR}/COSMIC_v3.3.1_SBS_GRCh38.txt
-sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv
+cosmic=${REF_SIGNATURES_DIR}/COSMIC_v3.3.1_SBS_GRCh38.txt
+sample_file=${SAMPLE_LIST}
 
-dir_output=../../data/combinedSignatures_${sig_type}
+dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type}
 cohort_file=$dir_output/cohort_list.tsv
 
 # create input and output data directories

diff --git a/scripts/combine_signatures/combineSV.sh b/scripts/combine_signatures/combineSV.sh
@@ -2,19 +2,22 @@
 # Iteratively add cohort extracted signatures to pan-cancer Reference list
 
 #!/bin/bash
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd $parent_path
+source ../../.env
 
 min_stability=1.0
 # Signature type
 sig_type=SV32
 max_sigs=15
 # Directory containing signatures
-sig_dir=${DATA_DIR}/Results_SigProfiler_v2/
+sig_dir=${SIG_DIR}/${sig_type}
 # Reference signatures file - column headered Type with each mutation type
 # Input signatures should be in additional columns
-reference=${DATA_DIR}/Breast560_rearrangement.signatures.tsv
-sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv
+reference=${REF_SIGNATURES_DIR}/Breast560_rearrangement.signatures.tsv
+sample_file=${SAMPLE_LIST}
 
-dir_output=.../../combinedSignatures_${sig_type}
+dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type}
 cohort_file=$dir_output/cohort_list.tsv
 
 # create input and output data directories

diff --git a/scripts/data_prep/clinvar_cadd.sh b/scripts/data_prep/clinvar_cadd.sh
@@ -3,9 +3,7 @@
 
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-# parameters
-PROJECT_DIR=../workdir
+source ../../.env
 
 # directories and filenames
 dir_analysis=CLINVAR_DIR

diff --git a/scripts/data_prep/germline.sh b/scripts/data_prep/germline.sh
@@ -1,14 +1,11 @@
 #!/bin/bash
 # run cancGeneHits.nf
 
+#!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
 source ../../.env
 
-# parameters
-PROJECT_DIR=../workdir
-
 # Run parameters
 n_files=10000 # Max number of files to run (input a small number if just testing)
 PHRED_threshold=20

diff --git a/scripts/data_prep/loh.sh b/scripts/data_prep/loh.sh
@@ -3,11 +3,7 @@
 
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-source ../.env
-
-# parameters
-PROJECT_DIR=../workdir
+source ../../.env
 
 # Run parameters
 chunk_size=100 # Number of samples to run on a single process
@@ -25,7 +21,7 @@ mkdir -p $dir_analysis/input
 mkdir -p $dir_output
 
 # Battenberg file list
-awk 'NR==1 {for (i=1; i<=NF; i++) {f[$i] = i}}{ print $(f["tumour_sample_platekey"])"\t"$(f["filename_cna"]) }' /re_gecip/cancer_pan/fprefect/botl/results/sample_lists/sample_list_2021_06_29.tsv | sed 's/filename_cna/filename_batt/' > ${filename_battenberg_list}
+awk 'NR==1 {for (i=1; i<=NF; i++) {f[$i] = i}}{ print $(f["tumour_sample_platekey"])"\t"$(f["filename_cna"]) }' ${SAMPLE_LIST} | sed 's/filename_cna/filename_batt/' > ${filename_battenberg_list}
 
 # Gene regions from
 echo -e "chrom\tstart\tend\tgene_id" > ${filename_genes}

diff --git a/scripts/data_prep/make_germline_input.py b/scripts/data_prep/make_germline_input.py
@@ -1,26 +1,37 @@
+import os
 import sys
-import pandas as pd, numpy as np, os
+
+import numpy as np
+import pandas as pd
 from dotenv import load_dotenv
 
 load_dotenv()
-AGGV2_SAMPLE_LIST = os.getenv('AGGV2_SAMPLE_LIST')
-
-if __name__=='__main__':
+AGGV2_SAMPLE_LIST = os.getenv("AGGV2_SAMPLE_LIST")
 
+if __name__ == "__main__":
     dir_output, sample_list_file = sys.argv[1:]
 
     # Get all samples from previous projects
-    cancer_analysis_tables = [f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv"
-                              for version in ['V8', 'V11/V11_reheadered', 'v14/v14_reheadered']]
+    cancer_analysis_tables = [
+        f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv"
+        for version in ["V8", "V11/V11_reheadered", "v14/v14_reheadered"]
+    ]
     germline_platekey_list = np.array([])
     for table in cancer_analysis_tables:
-        germline_platekey_list = np.union1d(germline_platekey_list,
-                                            np.array(pd.read_csv(table, sep="\t", usecols=["germline_sample_platekey"])))
+        germline_platekey_list = np.union1d(
+            germline_platekey_list,
+            np.array(
+                pd.read_csv(table, sep="\t", usecols=["germline_sample_platekey"])
+            ),
+        )
 
     # Crossmatch with aggV2
-    aggv2_sample_list = np.array(pd.read_csv(AGGV2_SAMPLE_LIST,
-                                    sep="\t", header=None)[0])
+    aggv2_sample_list = np.array(
+        pd.read_csv(AGGV2_SAMPLE_LIST, sep="\t", header=None)[0]
+    )
 
     germline_platekey_list = np.intersect1d(germline_platekey_list, aggv2_sample_list)
     print(f"{len(germline_platekey_list)} germline samples")
-    pd.DataFrame(germline_platekey_list, columns=['germline_sample_platekey']).to_csv(f"{sample_list_file}", header=False, index=False)
+    pd.DataFrame(germline_platekey_list, columns=["germline_sample_platekey"]).to_csv(
+        f"{sample_list_file}", header=False, index=False
+    )
diff --git a/scripts/data_prep/make_somatic_input.py b/scripts/data_prep/make_somatic_input.py
@@ -1,18 +1,25 @@
-import sys, os
-import pandas as pd, numpy as np
+import os
+import sys
 
+import numpy as np
+import pandas as pd
 
-if __name__=='__main__':
-
+if __name__ == "__main__":
     dir_output, sample_list_file = sys.argv[1:]
 
     # Get all samples from previous projects
-    cancer_analysis_tables = [f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv"
-                              for version in ['V8', 'V11/V11_reheadered', 'v14/v14_reheadered']]
+    cancer_analysis_tables = [
+        f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv"
+        for version in ["V8", "V11/V11_reheadered", "v14/v14_reheadered"]
+    ]
     tumour_platekey_list = np.array([])
     for table in cancer_analysis_tables:
-        tumour_platekey_list = np.union1d(tumour_platekey_list,
-                                            np.array(pd.read_csv(table, sep="\t", usecols=["tumour_sample_platekey"])))
+        tumour_platekey_list = np.union1d(
+            tumour_platekey_list,
+            np.array(pd.read_csv(table, sep="\t", usecols=["tumour_sample_platekey"])),
+        )
 
     print(f"{len(tumour_platekey_list)} tumour_samples")
-    pd.DataFrame(tumour_platekey_list, columns=['tumour_sample_platekey']).to_csv(f"{sample_list_file}", header=False, index=False)
+    pd.DataFrame(tumour_platekey_list, columns=["tumour_sample_platekey"]).to_csv(
+        f"{sample_list_file}", header=False, index=False
+    )
diff --git a/scripts/data_prep/oncokb.sh b/scripts/data_prep/oncokb.sh
@@ -3,11 +3,7 @@
 
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-source ../.env
-
-# parameters
-PROJECT_DIR=../../workdir
+source ../../.env
 
 # directories and filenames
 dir_analysis=../../data/cancGeneHits/somatic

diff --git a/scripts/genotype_assoc.sh b/scripts/genotype_assoc.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-# parameters
-PROJECT_DIR=../workdir
+source ../.env
 
 # directories and filenames
 resampling_method=dCRT

diff --git a/scripts/germline_assoc.sh b/scripts/germline_assoc.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-# parameters
-PROJECT_DIR=../workdir
+source ../.env
 
 # directories and filenames
 resampling_method=dCRT

diff --git a/scripts/somatic_assoc.sh b/scripts/somatic_assoc.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-# parameters
-PROJECT_DIR=../workdir
+source ../.env
 
 # directories and filenames
 resampling_method=dCRT

diff --git a/scripts/treatment_assoc.sh b/scripts/treatment_assoc.sh
@@ -1,9 +1,7 @@
 #!/bin/bash
 parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
 cd $parent_path
-
-# parameters
-PROJECT_DIR=../workdir
+source ../.env
 
 # directories and filenames
 resampling_method=dCRT