diff --git a/.env-template b/.env-template new file mode 100644 index 0000000..a83f166 --- /dev/null +++ b/.env-template @@ -0,0 +1,22 @@ +parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) + +USERNAME= # name of user, used to name temporary files (e.g. rsterling) +PROJECT_CODE=re_gecip_cancer_colorectal # project code for submitting jobs (e.g. re_gecip_cancer_colorectal for CRC GeCIP) +PROJECT_DIR=$parent_path/workdir # path to project directory - this is where log files are dumped so should be a location with plenty of space (e.g. /re_scratch/...) + +SAMPLE_LIST= # path to sample list file (e.g. sample_list_2021_06_29.tsv) +REF_SIGNATURES_DIR= # path to folder containing reference signature tables (e.g. COSMIC_v3.3_CN_GRCh37.txt) +SIG_DIR= # path to folder containing signature matrices (e.g. SigProfilerCNV48) +GENE_LIST= # path to file containing list of genes (e.g. .../human-dna-repair-genes.tsv) + +FIGURE_DIR=$parent_path/figures # path to folder where figures will be saved (e.g. .../figures) +DATA_DIR=$parent_path/data + +COMBINED_SIGS_DIR=$parent_path/data/combined_signatures # path to folder containing combined signature matrices (e.g. .../combined_sigs) + +CLINVAR_CADD_CMD=/re_gecip/shared_allGeCIPs/bkinnersley/CADD/CADD-scripts-master/CADD.sh # Path to script for running CADD + +# Mutation file paths +GERMLINE_DIR=/gel_data_resources/main_programme/aggregation/aggregate_gVCF_strelka/aggV2/genomic_data +AGGV2_SAMPLE_LIST=/gel_data_resources/main_programme/aggregation/aggregate_gVCF_strelka/aggV2/additional_data/sample_list/aggV2_sampleIds_mpv10_78195.tsv +ONCOKB_DIR= # Folder containing oncokb annotations (e.g. 
.../OncoKB_annotation/output) \ No newline at end of file diff --git a/.gitignore b/.gitignore index e73fcd7..102df04 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Data stored in data directory data/ +figures/ # renv packages renv/ diff --git a/README.md b/README.md index f0b96f3..7e8601d 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,22 @@ Code and results for paper: [Comprehensive repertoire of the chromosomal alteration and mutational signatures across 16 cancer types from 10,983 cancer patients](doi.org/10.1101/2023.06.07.23290970) -Disclaimer: This code was written inside the Genomics England Research Environment without github and it has not been tested outside the research environment. We provide it here to aid the transparency of the publication and make our analysis methods more accessible and reproducible. The pipelines provided will almost certainly not be of significant use outside Genomics England, however, we hope users find pieces of code which help to understand our work and may be useful in their research. +Disclaimer: This code was written inside the Genomics England Research Environment without github and it has not been tested outside the research environment. We provide it here to aid the transparency of the publication and make our analysis methods more accessible and reproducible. The pipelines provided will almost certainly not be of significant use outside Genomics England, however, we hope users find pieces of code helpful to understand our work and useful in their research. + +# Getting started + +Clone the repository +`git clone https://github.com/Wedge-lab/Gel_pan_cancer_signatures.git` + +Create .env file +``` +cd Gel_pan_cancer_signatures +cp .env-template .env +``` +in `.env`, change all file names as required to your local files. 
+ +Run editable install on a fresh conda environment (python=3.9) +`pip install -e .` # Contained in this repository diff --git a/scripts/clinical.sh b/scripts/clinical.sh index eae7867..74b3371 100755 --- a/scripts/clinical.sh +++ b/scripts/clinical.sh @@ -1,9 +1,7 @@ #!/bin/bash parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) cd $parent_path - -# parameters -PROJECT_DIR=../workdir +source ../.env # directories and filenames run_name=clinical_tumour-group_InDelQC_nb diff --git a/scripts/combine_signatures/combineCNV.sh b/scripts/combine_signatures/combineCNV.sh index 60cad67..6030f1e 100755 --- a/scripts/combine_signatures/combineCNV.sh +++ b/scripts/combine_signatures/combineCNV.sh @@ -4,19 +4,20 @@ #!/bin/bash parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) cd $parent_path +source ../../.env min_stability=1.0 # Signature type sig_type=CNV48 # Directory containing signatures -sig_dir=${DATA_DIR}/copy_number_signatures/SigProfilerCNV48 +sig_dir=${SIG_DIR}/${sig_type} # COSMIC signatures file - column headered Type with each mutation type # Input signatures should be in additional columns # If there aren't any COSMIC reference signatures or extracting signatures deNovo, only include the Type column -reference=${DATA_DIR}/COSMIC_v3.3_CN_GRCh37.txt -sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv +reference=${REF_SIGNATURES_DIR}/COSMIC_v3.3_CN_GRCh37.txt +sample_file=${SAMPLE_LIST} -dir_output=../../data/combinedSignatures_${sig_type} +dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type} cohort_file=$dir_output/cohort_list.tsv # create input and output data directories diff --git a/scripts/combine_signatures/combineDBS.sh b/scripts/combine_signatures/combineDBS.sh index 0671b47..46e5229 100755 --- a/scripts/combine_signatures/combineDBS.sh +++ b/scripts/combine_signatures/combineDBS.sh @@ -2,20 +2,23 @@ # Iteratively add cohort extracted signatures to pan-cancer COSMIC list #!/bin/bash 
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) +cd $parent_path +source ../../.env min_stability=1.0 # Signature type sig_type=DBS78 max_sigs=15 # Directory containing signatures -sig_dir=${DATA_DIR}/SIGmats/v4/${sig_type} +sig_dir=${SIG_DIR}/${sig_type} # COSMIC signatures file - column headered Type with each mutation type # Input signatures should be in additional columns -cosmic=${DATA_DIR}/COSMIC_v3.3_DBS_GRCh38.txt -sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv +cosmic=${REF_SIGNATURES_DIR}/COSMIC_v3.3_DBS_GRCh38.txt +sample_file=${SAMPLE_LIST} -dir_output=../../data/combinedSignatures_${sig_type} +dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type} cohort_file=$dir_output/cohort_list.tsv # create input and output data directories diff --git a/scripts/combine_signatures/combineID.sh b/scripts/combine_signatures/combineID.sh index bfa03fd..0185548 100755 --- a/scripts/combine_signatures/combineID.sh +++ b/scripts/combine_signatures/combineID.sh @@ -2,19 +2,22 @@ # Iteratively add cohort extracted signatures to pan-cancer COSMIC list #!/bin/bash +parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) +cd $parent_path +source ../../.env min_stability=1.0 # Signature type sig_type=ID83 max_sigs=25 # Directory containing signatures -sig_dir=${DATA_DIR}/SIGmats/v3/${sig_type} #v2_draft/${sig_type} +sig_dir=${SIG_DIR}/${sig_type} # COSMIC signatures file - column headered Type with each mutation type # Input signatures should be in additional columns -reference=${DATA_DIR}/COSMIC_v3.3_ID_GRCh37.txt -sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv +reference=${REF_SIGNATURES_DIR}/COSMIC_v3.3_ID_GRCh37.txt +sample_file=${SAMPLE_LIST} -dir_output=../../data/combinedSignatures_${sig_type} +dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type} cohort_file=$dir_output/cohort_list.tsv # create input and output data directories diff --git 
a/scripts/combine_signatures/combineSBS.sh b/scripts/combine_signatures/combineSBS.sh index ccd011c..09deb72 100755 --- a/scripts/combine_signatures/combineSBS.sh +++ b/scripts/combine_signatures/combineSBS.sh @@ -2,19 +2,22 @@ # Iteratively add cohort extracted signatures to pan-cancer COSMIC list #!/bin/bash +parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) +cd $parent_path +source ../../.env min_stability=1.0 # Signature type sig_type=SBS288 max_sigs=30 # Directory containing signatures -sig_dir=/re_gecip/cancer_pan/fprefect/botl/results/SIGmats/v2_draft/${sig_type} +sig_dir=${SIG_DIR}/${sig_type} # COSMIC signatures file - column headered Type with each mutation type # Input signatures should be in additional columns -cosmic=${DATA_DIR}/COSMIC_v3.3.1_SBS_GRCh38.txt -sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv +cosmic=${REF_SIGNATURES_DIR}/COSMIC_v3.3.1_SBS_GRCh38.txt +sample_file=${SAMPLE_LIST} -dir_output=../../data/combinedSignatures_${sig_type} +dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type} cohort_file=$dir_output/cohort_list.tsv # create input and output data directories diff --git a/scripts/combine_signatures/combineSV.sh b/scripts/combine_signatures/combineSV.sh index 2be9f98..fa538f6 100755 --- a/scripts/combine_signatures/combineSV.sh +++ b/scripts/combine_signatures/combineSV.sh @@ -2,19 +2,22 @@ # Iteratively add cohort extracted signatures to pan-cancer Reference list #!/bin/bash +parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) +cd $parent_path +source ../../.env min_stability=1.0 # Signature type sig_type=SV32 max_sigs=15 # Directory containing signatures -sig_dir=${DATA_DIR}/Results_SigProfiler_v2/ +sig_dir=${SIG_DIR}/${sig_type} # Reference signatures file - column headered Type with each mutation type # Input signatures should be in additional columns -reference=${DATA_DIR}/Breast560_rearrangement.signatures.tsv 
-sample_file=${DATA_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29_incl_SEGs.tsv +reference=${REF_SIGNATURES_DIR}/Breast560_rearrangement.signatures.tsv +sample_file=${SAMPLE_LIST} -dir_output=.../../combinedSignatures_${sig_type} +dir_output=${COMBINED_SIGS_DIR}/combinedSignatures_${sig_type} cohort_file=$dir_output/cohort_list.tsv # create input and output data directories diff --git a/scripts/data_prep/clinvar_cadd.sh b/scripts/data_prep/clinvar_cadd.sh index ddd31de..45a4305 100755 --- a/scripts/data_prep/clinvar_cadd.sh +++ b/scripts/data_prep/clinvar_cadd.sh @@ -3,9 +3,7 @@ parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) cd $parent_path - -# parameters -PROJECT_DIR=../workdir +source ../../.env # directories and filenames dir_analysis=CLINVAR_DIR diff --git a/scripts/data_prep/germline.sh b/scripts/data_prep/germline.sh index cbfcc07..cbead33 100755 --- a/scripts/data_prep/germline.sh +++ b/scripts/data_prep/germline.sh @@ -1,14 +1,11 @@ #!/bin/bash # run cancGeneHits.nf +#!/bin/bash parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) cd $parent_path - source ../../.env -# parameters -PROJECT_DIR=../workdir - # Run parameters n_files=10000 # Max number of files to run (input a small number if just testing) PHRED_threshold=20 diff --git a/scripts/data_prep/loh.sh b/scripts/data_prep/loh.sh index 5e9e49c..adf5684 100755 --- a/scripts/data_prep/loh.sh +++ b/scripts/data_prep/loh.sh @@ -3,11 +3,7 @@ parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) cd $parent_path - -source ../.env - -# parameters -PROJECT_DIR=../workdir +source ../../.env # Run parameters chunk_size=100 # Number of samples to run on a single process @@ -25,7 +21,7 @@ mkdir -p $dir_analysis/input mkdir -p $dir_output # Battenberg file list -awk 'NR==1 {for (i=1; i<=NF; i++) {f[$i] = i}}{ print $(f["tumour_sample_platekey"])"\t"$(f["filename_cna"]) }' /re_gecip/cancer_pan/fprefect/botl/results/sample_lists/sample_list_2021_06_29.tsv | sed 
's/filename_cna/filename_batt/' > ${filename_battenberg_list} +awk 'NR==1 {for (i=1; i<=NF; i++) {f[$i] = i}}{ print $(f["tumour_sample_platekey"])"\t"$(f["filename_cna"]) }' ${SAMPLE_LIST} | sed 's/filename_cna/filename_batt/' > ${filename_battenberg_list} # Gene regions from echo -e "chrom\tstart\tend\tgene_id" > ${filename_genes} diff --git a/scripts/data_prep/make_germline_input.py b/scripts/data_prep/make_germline_input.py index c6959c4..c75485c 100755 --- a/scripts/data_prep/make_germline_input.py +++ b/scripts/data_prep/make_germline_input.py @@ -1,26 +1,37 @@ +import os import sys -import pandas as pd, numpy as np, os + +import numpy as np +import pandas as pd from dotenv import load_dotenv load_dotenv() -AGGV2_SAMPLE_LIST = os.getenv('AGGV2_SAMPLE_LIST') - -if __name__=='__main__': +AGGV2_SAMPLE_LIST = os.getenv("AGGV2_SAMPLE_LIST") +if __name__ == "__main__": dir_output, sample_list_file = sys.argv[1:] # Get all samples from previous projects - cancer_analysis_tables = [f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv" - for version in ['V8', 'V11/V11_reheadered', 'v14/v14_reheadered']] + cancer_analysis_tables = [ + f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv" + for version in ["V8", "V11/V11_reheadered", "v14/v14_reheadered"] + ] germline_platekey_list = np.array([]) for table in cancer_analysis_tables: - germline_platekey_list = np.union1d(germline_platekey_list, - np.array(pd.read_csv(table, sep="\t", usecols=["germline_sample_platekey"]))) + germline_platekey_list = np.union1d( + germline_platekey_list, + np.array( + pd.read_csv(table, sep="\t", usecols=["germline_sample_platekey"]) + ), + ) # Crossmatch with aggV2 - aggv2_sample_list = np.array(pd.read_csv(AGGV2_SAMPLE_LIST, - sep="\t", header=None)[0]) + aggv2_sample_list = np.array( + pd.read_csv(AGGV2_SAMPLE_LIST, sep="\t", header=None)[0] + ) germline_platekey_list = np.intersect1d(germline_platekey_list, aggv2_sample_list) 
print(f"{len(germline_platekey_list)} germline samples") - pd.DataFrame(germline_platekey_list, columns=['germline_sample_platekey']).to_csv(f"{sample_list_file}", header=False, index=False) + pd.DataFrame(germline_platekey_list, columns=["germline_sample_platekey"]).to_csv( + f"{sample_list_file}", header=False, index=False + ) diff --git a/scripts/data_prep/make_somatic_input.py b/scripts/data_prep/make_somatic_input.py index b756c45..4820be0 100755 --- a/scripts/data_prep/make_somatic_input.py +++ b/scripts/data_prep/make_somatic_input.py @@ -1,18 +1,25 @@ -import sys, os -import pandas as pd, numpy as np +import os +import sys +import numpy as np +import pandas as pd -if __name__=='__main__': - +if __name__ == "__main__": dir_output, sample_list_file = sys.argv[1:] # Get all samples from previous projects - cancer_analysis_tables = [f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv" - for version in ['V8', 'V11/V11_reheadered', 'v14/v14_reheadered']] + cancer_analysis_tables = [ + f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv" + for version in ["V8", "V11/V11_reheadered", "v14/v14_reheadered"] + ] tumour_platekey_list = np.array([]) for table in cancer_analysis_tables: - tumour_platekey_list = np.union1d(tumour_platekey_list, - np.array(pd.read_csv(table, sep="\t", usecols=["tumour_sample_platekey"]))) + tumour_platekey_list = np.union1d( + tumour_platekey_list, + np.array(pd.read_csv(table, sep="\t", usecols=["tumour_sample_platekey"])), + ) print(f"{len(tumour_platekey_list)} tumour_samples") - pd.DataFrame(tumour_platekey_list, columns=['tumour_sample_platekey']).to_csv(f"{sample_list_file}", header=False, index=False) + pd.DataFrame(tumour_platekey_list, columns=["tumour_sample_platekey"]).to_csv( + f"{sample_list_file}", header=False, index=False + ) diff --git a/scripts/data_prep/oncokb.sh b/scripts/data_prep/oncokb.sh index 18bc2cb..b6acb5e 100755 --- a/scripts/data_prep/oncokb.sh +++ 
b/scripts/data_prep/oncokb.sh @@ -3,11 +3,7 @@ parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) cd $parent_path - -source ../.env - -# parameters -PROJECT_DIR=../../workdir +source ../../.env # directories and filenames dir_analysis=../../data/cancGeneHits/somatic diff --git a/scripts/genotype_assoc.sh b/scripts/genotype_assoc.sh index 1dcd687..57ec58f 100755 --- a/scripts/genotype_assoc.sh +++ b/scripts/genotype_assoc.sh @@ -1,9 +1,7 @@ #!/bin/bash parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) cd $parent_path - -# parameters -PROJECT_DIR=../workdir +source ../.env # directories and filenames resampling_method=dCRT diff --git a/scripts/germline_assoc.sh b/scripts/germline_assoc.sh index 71452ea..bce2d1a 100755 --- a/scripts/germline_assoc.sh +++ b/scripts/germline_assoc.sh @@ -1,9 +1,7 @@ #!/bin/bash parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) cd $parent_path - -# parameters -PROJECT_DIR=../workdir +source ../.env # directories and filenames resampling_method=dCRT diff --git a/scripts/somatic_assoc.sh b/scripts/somatic_assoc.sh index 9fa308d..07b3072 100755 --- a/scripts/somatic_assoc.sh +++ b/scripts/somatic_assoc.sh @@ -1,9 +1,7 @@ #!/bin/bash parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) cd $parent_path - -# parameters -PROJECT_DIR=../workdir +source ../.env # directories and filenames resampling_method=dCRT diff --git a/scripts/treatment_assoc.sh b/scripts/treatment_assoc.sh index 16ac358..f253eaa 100755 --- a/scripts/treatment_assoc.sh +++ b/scripts/treatment_assoc.sh @@ -1,9 +1,7 @@ #!/bin/bash parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) cd $parent_path - -# parameters -PROJECT_DIR=../workdir +source ../.env # directories and filenames resampling_method=dCRT diff --git a/scripts/twohit_assoc.sh b/scripts/twohit_assoc.sh index 690090e..31d323c 100755 --- a/scripts/twohit_assoc.sh +++ b/scripts/twohit_assoc.sh @@ -1,10 +1,7 @@ #!/bin/bash -#!/bin/bash parent_path=$( cd "$(dirname 
"${BASH_SOURCE[0]}")" ; pwd -P ) cd $parent_path - -# parameters -PROJECT_DIR=../workdir +source ../.env # directories and filenames resampling_method=dCRT diff --git a/src/signatures/associations/cancGeneHit/targets.py b/src/signatures/associations/cancGeneHit/targets.py index 9048069..9f25836 100755 --- a/src/signatures/associations/cancGeneHit/targets.py +++ b/src/signatures/associations/cancGeneHit/targets.py @@ -8,6 +8,7 @@ load_dotenv() DATA_DIR = os.getenv("DATA_DIR") +GENE_LIST = os.getenv("GENE_LIST") if __name__ == "__main__": samples_file, targets_file, n = sys.argv[1:] @@ -117,7 +118,7 @@ # DNA types from database DNA_repair = pd.merge( - pd.read_csv(f"{DATA_DIR}/human-dna-repair-genes.tsv", sep="\t"), + pd.read_csv(GENE_LIST, sep="\t"), pd.DataFrame(index=np.array(genes)), left_on="Gene", right_index=True, diff --git a/src/signatures/associations/cancGeneHit/targets_germline.py b/src/signatures/associations/cancGeneHit/targets_germline.py index 78be786..27e9d7a 100755 --- a/src/signatures/associations/cancGeneHit/targets_germline.py +++ b/src/signatures/associations/cancGeneHit/targets_germline.py @@ -8,6 +8,7 @@ load_dotenv() DATA_DIR = os.getenv("DATA_DIR") +GENE_LIST = os.getenv("GENE_LIST") if __name__ == "__main__": samples_file, targets_file = sys.argv[1:] @@ -63,7 +64,7 @@ # DNA types from database DNA_repair = pd.merge( - pd.read_csv(f"{DATA_DIR}/human-dna-repair-genes.tsv", sep="\t"), + pd.read_csv(GENE_LIST, sep="\t"), pd.DataFrame(index=np.array(genes)), left_on="Gene", right_index=True, diff --git a/src/signatures/associations/cancGeneHit/targets_somatic.py b/src/signatures/associations/cancGeneHit/targets_somatic.py index 5e8e9ea..c362001 100755 --- a/src/signatures/associations/cancGeneHit/targets_somatic.py +++ b/src/signatures/associations/cancGeneHit/targets_somatic.py @@ -8,6 +8,7 @@ load_dotenv() DATA_DIR = os.getenv("DATA_DIR") +GENE_LIST = os.getenv("GENE_LIST") if __name__ == "__main__": samples_file, targets_file = 
sys.argv[1:] @@ -61,7 +62,7 @@ # DNA types from database DNA_repair = pd.merge( - pd.read_csv(f"{DATA_DIR}/human-dna-repair-genes.tsv", sep="\t"), + pd.read_csv(GENE_LIST, sep="\t"), pd.DataFrame(index=np.array(genes)), left_on="Gene", right_index=True, diff --git a/src/signatures/associations/cancGeneHit/tests.py b/src/signatures/associations/cancGeneHit/tests.py index 8dcb239..6e3df78 100755 --- a/src/signatures/associations/cancGeneHit/tests.py +++ b/src/signatures/associations/cancGeneHit/tests.py @@ -8,6 +8,7 @@ load_dotenv() DATA_DIR = os.getenv("DATA_DIR") +GENE_LIST = os.getenv("GENE_LIST") if __name__ == "__main__": samples_file, signatures_file, targets_file, tests_file, tests_file_binary, n = ( @@ -22,7 +23,7 @@ groups = pd.read_csv(samples_file, usecols=["sample_id", "group"], sep="\t") # Subset for DNA repair genes - DNA_repair = pd.read_csv(f"{DATA_DIR}/human-dna-repair-genes.tsv", sep="\t") + DNA_repair = pd.read_csv(GENE_LIST, sep="\t") if True: mock_genes = targets.keys()[ [bool(re.search("^MOCK[0-9A-Z]+$", tgt)) for tgt in targets.keys()] diff --git a/src/signatures/associations/cancGeneHit/tests_germline.py b/src/signatures/associations/cancGeneHit/tests_germline.py index caf47a5..d3a1698 100755 --- a/src/signatures/associations/cancGeneHit/tests_germline.py +++ b/src/signatures/associations/cancGeneHit/tests_germline.py @@ -8,6 +8,7 @@ load_dotenv() DATA_DIR = os.getenv("DATA_DIR") +GENE_LIST = os.getenv("GENE_LIST") if __name__ == "__main__": samples_file, signatures_file, targets_file, tests_file, tests_file_binary = ( @@ -22,7 +23,7 @@ groups = pd.read_csv(samples_file, usecols=["sample_id", "group"], sep="\t") # Subset for DNA repair genes - DNA_repair = pd.read_csv(f"{DATA_DIR}/human-dna-repair-genes.tsv", sep="\t") + DNA_repair = pd.read_csv(GENE_LIST, sep="\t") if True: mock_genes = targets.keys()[ [bool(re.search("^MOCK[0-9A-Z]+$", tgt)) for tgt in targets.keys()] diff --git a/src/signatures/associations/clinical/samples_and_tests.py 
b/src/signatures/associations/clinical/samples_and_tests.py index 9807d48..9e9de15 100755 --- a/src/signatures/associations/clinical/samples_and_tests.py +++ b/src/signatures/associations/clinical/samples_and_tests.py @@ -6,6 +6,7 @@ load_dotenv() DATA_DIR = os.getenv("DATA_DIR") +SAMPLE_LIST = os.getenv("SAMPLE_LIST") if __name__ == "__main__": samples_file, activities_file = sys.argv[1:] @@ -18,7 +19,7 @@ # Get tumour group sample_list_df = pd.read_csv( - f"{DATA_DIR}/sample_lists_incl_SEGs/sample_list.tsv", + SAMPLE_LIST, usecols=[ "participant_id", "tumour_sample_platekey", diff --git a/src/signatures/plotting/associations/clinicalAssoc.py b/src/signatures/plotting/associations/clinicalAssoc.py index 7e02bdf..10cefba 100755 --- a/src/signatures/plotting/associations/clinicalAssoc.py +++ b/src/signatures/plotting/associations/clinicalAssoc.py @@ -521,7 +521,7 @@ def publish_fig(filename, publish="./"): plt.xlim(-2.5, 2.5) plt.xlabel(r"$\beta$", fontsize=fs) - publish_fig("clinical_panel", publish=FIGURE_DIR) + publish_fig("clinical_panel", publish=f"{FIGURE_DIR}") # ## Histology @@ -533,7 +533,9 @@ def publish_fig(filename, publish="./"): results_target = pd.read_csv(f"{results_dir}/target_target_assoc.csv") fig_dir = f"{results_dir}/figures" - results_dir_cohort = f"/re_gecip/shared_allGeCIPs/pancancer_signatures/results/associations/clinicalSigs/{run_name}/output_cohort" + results_dir_cohort = ( + f"{RESULT_DIR}/associations/clinicalSigs/{run_name}/output_cohort" + ) results_zinb = pd.concat( ( results_zinb, @@ -708,4 +710,4 @@ def publish_fig(filename, publish="./"): plt.sca(ax) plt.axis("off") - publish_fig("histology_rate_comparison", publish=FIGURE_DIR) + publish_fig("histology_rate_comparison", publish=f"{FIGURE_DIR}") diff --git a/src/signatures/plotting/associations/survivalCoefs.py b/src/signatures/plotting/associations/survivalCoefs.py index 1563258..8519ec4 100644 --- a/src/signatures/plotting/associations/survivalCoefs.py +++ 
b/src/signatures/plotting/associations/survivalCoefs.py @@ -1,14 +1,20 @@ +import os + import matplotlib as mpl import matplotlib.pyplot as plt import numpy as np import pandas as pd import scipy import scipy.stats +from dotenv import load_dotenv from lifelines import CoxPHFitter, KaplanMeierFitter from lifelines.statistics import logrank_test from signatures.plotting.combinedSignatures import loadSignatures, signatureRenamer +load_dotenv() +SAMPLE_LIST = os.getenv("SAMPLE_LIST") + mpl.rcParams["mathtext.fontset"] = "stix" mpl.rcParams["font.family"] = "STIXGeneral" plt.rc("axes", labelsize=16) @@ -106,7 +112,7 @@ def result_string(sig, cph_alt): # Add tumour type tumour_type = pd.read_csv( - "/re_gecip/shared_allGeCIPs/pancancer_signatures/results/sample_lists_incl_SEGs/sample_list_2021_06_29.tsv", + f"{SAMPLE_LIST}", sep="\t", usecols=[ "participant_id", diff --git a/src/signatures/plotting/associations/twoHitAssoc.py b/src/signatures/plotting/associations/twoHitAssoc.py index 99e091f..0a74744 100755 --- a/src/signatures/plotting/associations/twoHitAssoc.py +++ b/src/signatures/plotting/associations/twoHitAssoc.py @@ -23,6 +23,7 @@ RESULT_DIR = os.getenv("RESULT_DIR") FIGURE_DIR = os.getenv("FIGURE_DIR") DATA_DIR = os.getenv("DATA_DIR") +GENE_LIST = os.getenv("GENE_LIST") mpl.rcParams["mathtext.fontset"] = "stix" mpl.rcParams["font.family"] = "STIXGeneral" @@ -361,7 +362,7 @@ def getAssociationResults(results_dir, DNA_repair): print(len(results_combined)) DNArepair = pd.read_csv( - "/re_gecip/shared_allGeCIPs/pancancer_signatures/data/human-dna-repair-genes.tsv", + GENE_LIST, sep="\t", ) results_combined = pd.merge( @@ -2198,7 +2199,7 @@ def plotGermlineSomatic( test_df.group = test_df.group.str.replace("Connective", "Sarcoma") # Subset DNA repair gene list - DNA_repair = pd.read_csv(f"{DATA_DIR}/human-dna-repair-genes.tsv", sep="\t") + DNA_repair = pd.read_csv(GENE_LIST, sep="\t") DNA_repair = DNA_repair[DNA_repair.Gene.map(lambda x: x in target_df.keys())] 
Types = [ "BER", diff --git a/src/signatures/plotting/combinedSignatures.py b/src/signatures/plotting/combinedSignatures.py index 2492c1d..9549f7e 100755 --- a/src/signatures/plotting/combinedSignatures.py +++ b/src/signatures/plotting/combinedSignatures.py @@ -24,9 +24,9 @@ load_dotenv() COMBINED_SIGS_DIR = os.getenv("COMBINED_SIGS_DIR") -REF_DIR = os.getenv("REF_DIR") FIGURE_DIR = os.getenv("FIGURE_DIR") DATA_DIR = os.getenv("DATA_DIR") +REF_SIGNATURES_DIR = os.getenv("REF_SIGNATURES_DIR") mpl.rcParams["mathtext.fontset"] = "stix" mpl.rcParams["font.family"] = "STIXGeneral" @@ -40,7 +40,7 @@ # Signature directories sig_dirs = { "SBS288": f"{COMBINED_SIGS_DIR}/combinedSignatures_SBS288", - "DBS78": f"{COMBINED_SIGS_DIR}/combinedSignatures_DBS78_V4", + "DBS78": f"{COMBINED_SIGS_DIR}/combinedSignatures_DBS78", "ID83": f"{COMBINED_SIGS_DIR}/combinedSignatures_ID83", "CNV48": f"{COMBINED_SIGS_DIR}/combinedSignatures_CNV48", "SV32": f"{COMBINED_SIGS_DIR}/combinedSignatures_SV32", @@ -49,11 +49,11 @@ # COSMIC and other reference files references = { - "SBS288": f"{REF_DIR}/COSMIC_v3.3.1_SBS_GRCh38.txt", - "DBS78": f"{REF_DIR}/COSMIC_v3.3_DBS_GRCh38.txt", - "ID83": f"{REF_DIR}/COSMIC_v3.3_ID_GRCh37.txt", - "CNV48": f"{REF_DIR}/COSMIC_v3.3_CN_GRCh37.txt", - "SV32": f"{REF_DIR}/Breast560_rearrangement.signatures.tsv", + "SBS288": f"{REF_SIGNATURES_DIR}/COSMIC_v3.3.1_SBS_GRCh38.txt", + "DBS78": f"{REF_SIGNATURES_DIR}/COSMIC_v3.3_DBS_GRCh38.txt", + "ID83": f"{REF_SIGNATURES_DIR}/COSMIC_v3.3_ID_GRCh37.txt", + "CNV48": f"{REF_SIGNATURES_DIR}/COSMIC_v3.3_CN_GRCh37.txt", + "SV32": f"{REF_SIGNATURES_DIR}/Breast560_rearrangement.signatures.tsv", } map_colors = {"degasperi": "darkorange", "novel": "firebrick"} @@ -134,12 +134,12 @@ def loadSignatures(sv_rename_deg=False): # Get reference Degasperi 2022/2020 signatures degasperi_sigs = { "SBS288": pd.read_excel( - f"{DATA_DIR}/science.abl9283_tables_s1_to_s33.xlsx", "Table S21" + 
f"{REF_SIGNATURES_DIR}/science.abl9283_tables_s1_to_s33.xlsx", "Table S21" ).set_index("mutationClass"), "DBS78": pd.read_excel( - f"{DATA_DIR}/science.abl9283_tables_s1_to_s33.xlsx", "Table S22" + f"{REF_SIGNATURES_DIR}/science.abl9283_tables_s1_to_s33.xlsx", "Table S22" ).set_index("mutationClass"), - "SV32": pd.read_csv(f"{DATA_DIR}/RefSigv1_Rearr.tsv", sep="\t"), + "SV32": pd.read_csv(f"{REF_SIGNATURES_DIR}/RefSigv1_Rearr.tsv", sep="\t"), } for sig_type in degasperi_sigs: degasperi_sigs[sig_type] = degasperi_sigs[sig_type].loc[ diff --git a/src/signatures/sampleCuration/GEL_NCRAS_XM.py b/src/signatures/sampleCuration/GEL_NCRAS_XM.py index d28bde3..87a7148 100755 --- a/src/signatures/sampleCuration/GEL_NCRAS_XM.py +++ b/src/signatures/sampleCuration/GEL_NCRAS_XM.py @@ -9,6 +9,10 @@ load_dotenv() DATA_DIR = os.getenv("DATA_DIR") RESULT_DIR = os.getenv("RESULT_DIR") +SAMPLE_LIST = os.getenv("SAMPLE_LIST") +for d in [DATA_DIR, RESULT_DIR, SAMPLE_LIST]: + if d is None: + raise ValueError("Environment variables not set") group_mapping = { "BileDuct-AdenoCA": ["adenocarcinoma"], # cholangiocarcinoma @@ -123,9 +127,9 @@ def crossmatchGelNcras(): # Import curated sample list from Alex and Dan - sample_df = pd.read_csv( - f"{RESULT_DIR}/sample_lists/sample_list_2021_06_29.tsv", delim_whitespace=True - ).rename(columns={"age_sampling": "age"}) + sample_df = pd.read_csv(SAMPLE_LIST, delim_whitespace=True).rename( + columns={"age_sampling": "age"} + ) # Import NCRAS data and crossmatch on tumour_pseudo_id sact = pd.read_csv( diff --git a/src/signatures/sampleCuration/pan_cancer_sample.py b/src/signatures/sampleCuration/pan_cancer_sample.py index 17c578c..cfd27f2 100755 --- a/src/signatures/sampleCuration/pan_cancer_sample.py +++ b/src/signatures/sampleCuration/pan_cancer_sample.py @@ -9,6 +9,7 @@ load_dotenv() RESULT_DIR = os.getenv("RESULT_DIR") DATA_DIR = os.getenv("DATA_DIR") +SAMPLE_LIST = os.getenv("SAMPLE_LIST") def getSamples( @@ -37,7 +38,7 @@ def getSamples( 
"day_last_followup", ] sample_df = pd.read_csv( - f"{RESULT_DIR}/sample_lists_incl_SEGs/sample_list_2021_06_29.tsv", + f"{SAMPLE_LIST}", usecols=sample_keys, delim_whitespace=True, ).rename(columns={"age_sampling": "age"})