Merge pull request #17 from Wedge-lab/feat/code-from-gel

Code from Genomics England including signatureAssociations.nf
Wedge-lab · Dec 22, 2024 · 74b5577 · 74b5577
2 parents bb3c4ac + a15bea6
commit 74b5577
Show file tree

Hide file tree

Showing 39 changed files with 4,698 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -2,3 +2,36 @@
 Code and results for paper:
 
 [Comprehensive repertoire of the chromosomal alteration and mutational signatures across 16 cancer types from 10,983 cancer patients](doi.org/10.1101/2023.06.07.23290970)
+
+Disclaimer: This code was written inside the Genomics England Research Environment without github and it has not been tested outside the research environment. We provide it here to aid the transparency of the publication and make our analysis methods more accessible and reproducible. The pipelines provided will almost certainly not be of significant use outside Genomics England, however, we hope users find pieces of code which help to understand our work and may be useful in their research.
+
+
+# Contained in this repository
+
+
+### Curate gene mutations
+Bash scripts in `./scripts/data_prep` are for collecting and curating genetic mutations in WGS data available in Genomics England including inferring expected impact using CADD scores or OncoKB annotations.
+
+
+### Combine signatures
+
+
+
+
+### Association analysis
+
+A series of scripts in `./scripts` run association ananlysis on mutation rates with various covariates including germline and somatic gene inactivations and treatment exposures:
+```
+germline_assoc.sh
+treatment_assoc.sh
+genotype_assoc.sh
+somatic_assoc.sh
+twohit_assoc.sh
+```
+Each script prepares data and runs the `signatureAssociations.nf` pipeline.
+
+*This is the most interesting thing probably contained in this code*
+Go to `pipelines/signaturessignatureAssociations.nf/README.md` for more details.
+This pipeline contains the GLM association analysis with resampling which reduces the false positive rate of associations.
+
+
diff --git a/pipelines/clinical.sh → scripts/clinical.sh b/pipelines/clinical.sh → scripts/clinical.sh
diff --git a/scripts/data_prep/clinvar_cadd.sh b/scripts/data_prep/clinvar_cadd.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# run cancGeneHits.nf
+
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd $parent_path
+
+# parameters
+PROJECT_DIR=../workdir
+
+# directories and filenames
+dir_analysis=CLINVAR_DIR
+cadd_cmd=../../data/cancGeneHits/clinvar
+filename_genes=../../data/cancer_gene_census.csv # Download from https://cancer.sanger.ac.uk/cosmic/download
+filename_clinvar_vcf=../../data/clinvar/clinvar.vcf.gz
+
+dir_output=${dir_analysis}/output
+
+#### code below this point should not need to be changed ####
+
+BASEDIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+echo $BASEDIR
+
+# create input and output data directories
+mkdir -p $dir_output
+mkdir -p $dir_analysis/input
+
+# Gene regions
+sed -e 's/\t/_/g' ${filename_genes} | awk -F'","' '{print $4"\t"$1}' | sed -e 's/"//g' -e 's/:/\t/' -e 's/-/\t/' | tail -n +2 | sed -e 's/^/chr/' \
+| awk 'BEGIN { FS = OFS = "\t" } { for(i=1; i<=NF; i++) if($i ~ /^ *$/) $i = 0 }; 1'  > ${dir_analysis}/input/gene_loci.tsv
+
+# create working directory and set as current
+dir_wd=${project_dir} # ${dir_output}/nextflow
+dir_wd_old=$( pwd )
+mkdir -p $dir_wd
+cd $dir_wd
+
+# create and set temporary directory
+dir_tmp=${project_dir}/tmp # /re_scratch/${username}/tmp_run_gwas_${project_code}
+mkdir -p $dir_tmp
+export NXF_TEMP=$dir_tmp
+
+# Run pipeline to get function mutations in cancer genes
+module load bio/nextflow
+nextflow run ../../src/pipelines/cancGeneHits/clinvarCadd.nf -with-trace -resume \
+    --filename_genes ${dir_analysis}/input/gene_loci.tsv\
+    --filename_clinvar_vcf ${filename_clinvar_vcf}\
+    --dir_output ${dir_output}\
+    --projectName ${project_code}
+
+# move back to original working directory
+cd $dir_wd_old
diff --git a/scripts/data_prep/germline.sh b/scripts/data_prep/germline.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# run cancGeneHits.nf
+
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd $parent_path
+
+source ../../.env
+
+# parameters
+PROJECT_DIR=../workdir
+
+# Run parameters
+n_files=10000 # Max number of files to run (input a small number if just testing)
+PHRED_threshold=20
+
+# directories and filenames
+dir_analysis=../../data/cancGeneHits/germline_${PHRED_threshold} #test_data
+germline_dir=GERMLINE_DIR
+cadd_cmd=CLINVAR_CADD_CMD
+filename_germline_vcfs=${dir_analysis}/input/germline_vcfs.tsv
+filename_sample_list=${dir_analysis}/input/germline_sample_list.tsv
+filename_genes=../../data/cancer_gene_census.csv # Download from https://cancer.sanger.ac.uk/cosmic/download
+filename_clinvar_vcf=../../data/clinvar/clinvar.vcf.gz
+
+dir_output=${dir_analysis}/output
+
+#### code below this point should not need to be changed ####
+
+BASEDIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+echo $BASEDIR
+
+# create input and output data directories
+mkdir -p $dir_output
+mkdir -p $dir_analysis/input
+
+if true; then
+    # Germline vcf file list
+    ls ${germline_dir} > ${dir_analysis}/germline_file_list.txt
+    cat ${dir_analysis}/germline_file_list.txt | rev | cut -d'/' -f1 | rev | cut -d'_' -f4 > ${dir_analysis}/chr_list.txt
+    echo -e "chromosome\tfilename_vcf" > ${filename_germline_vcfs}
+    paste -d$'\t' ${dir_analysis}/chr_list.txt ${dir_analysis}/germline_file_list.txt >> ${filename_germline_vcfs}
+    rm ${dir_analysis}/germline_file_list.txt ${dir_analysis}/chr_list.txt
+
+    # Gene regions
+    sed -e 's/\t/_/g' ${filename_genes} | awk -F'","' '{print $4"\t"$1}' | sed -e 's/"//g' -e 's/:/\t/' -e 's/-/\t/' | tail -n +2 | sed -e 's/^/chr/' \
+    | awk 'BEGIN { FS = OFS = "\t" } { for(i=1; i<=NF; i++) if($i ~ /^ *$/) $i = 0 }; 1'  > ${dir_analysis}/input/gene_loci.tsv
+
+    # Make sample list
+    python make_germline_input.py $dir_analysis/input $filename_sample_list
+fi
+
+# create working directory and set as current
+dir_wd=${project_dir} # ${dir_output}/nextflow
+dir_wd_old=$( pwd )
+mkdir -p $dir_wd
+cd $dir_wd
+
+# create and set temporary directory
+dir_tmp=${project_dir}/tmp # /re_scratch/${username}/tmp_run_gwas_${project_code}
+mkdir -p $dir_tmp
+export NXF_TEMP=$dir_tmp
+
+
+# Run pipeline to get function mutations in cancer genes
+module load bio/nextflow/21.04.3
+nextflow run ../../src/pipelines/cancGeneHits/clinvarCadd.nf -with-trace -resume \
+    --filename_germline_vcfs ${filename_germline_vcfs}\
+    --filename_sample_list ${filename_sample_list}\
+    --filename_genes ${dir_analysis}/input/gene_loci.tsv\
+    --filename_clinvar_vcf ${filename_clinvar_vcf}\
+    --dir_output ${dir_output}\
+    --projectName ${project_code}\
+    --n_files ${n_files}\
+    --PHRED_threshold ${PHRED_threshold}
+
+# move back to original working directory
+cd $dir_wd_old
diff --git a/scripts/data_prep/loh.sh b/scripts/data_prep/loh.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# run cancGeneSignatures.nf
+
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd $parent_path
+
+source ../.env
+
+# parameters
+PROJECT_DIR=../workdir
+
+# Run parameters
+chunk_size=100 # Number of samples to run on a single process
+
+# directories and filenames
+dir_analysis=../../data/cancGeneHits/somatic
+filename_battenberg_list=${dir_analysis}/input/battenberg_list.tsv #sample_data.tsv
+filename_sample_list=${dir_analysis}/input/loh_sample_list.tsv #sample_data.tsv
+filename_genes=${dir_analysis}/input/gene_loci.tsv # Download from https://cancer.sanger.ac.uk/cosmic/download
+
+dir_output=${dir_analysis}/output
+
+# create output directory
+mkdir -p $dir_analysis/input
+mkdir -p $dir_output
+
+# Battenberg file list
+awk 'NR==1 {for (i=1; i<=NF; i++) {f[$i] = i}}{ print $(f["tumour_sample_platekey"])"\t"$(f["filename_cna"]) }' /re_gecip/cancer_pan/fprefect/botl/results/sample_lists/sample_list_2021_06_29.tsv | sed 's/filename_cna/filename_batt/' > ${filename_battenberg_list}
+
+# Gene regions from
+echo -e "chrom\tstart\tend\tgene_id" > ${filename_genes}
+sed -e 's/\t/_/g' ../../data/cancer_gene_census.csv | awk -F'","' '{print $4"\t"$1}' | sed -e 's/"//g' -e 's/:/\t/' -e 's/-/\t/' | tail -n +2 | sed -e 's/^/chr/'  >> ${filename_genes}
+
+# Make sample list
+python make_somatic_input.py $dir_analysis/input $filename_sample_list
+
+#### code below this point should not need to be changed ####
+
+# create working directory and set as current
+dir_wd=${PROJECT_DIR} # ${dir_output}/nextflow
+dir_wd_old=$( pwd )
+mkdir -p $dir_wd
+cd $dir_wd
+
+# create and set temporary directory
+dir_tmp=${PROJECT_DIR}/tmp # /re_scratch/${username}/tmp_run_gwas_${project_code}
+mkdir -p $dir_tmp
+export NXF_TEMP=$dir_tmp
+
+echo "Run nextflow"
+
+# Run pipeline to get function mutations in cancer genes
+nextflow run ../../src/pipelines/cancGeneHits/cancLOH.nf -with-trace \
+  --run_name "test"\
+  --filename_battenberg_list ${filename_battenberg_list}\
+  --filename_sample_list ${filename_sample_list}\
+  --filename_genes ${filename_genes}\
+  --chunk_size ${chunk_size}\
+  --dir_output ${dir_output}\
+  --projectName ${project_code}
+
+# move back to original working directory
+cd $dir_wd_old
diff --git a/scripts/data_prep/make_germline_input.py b/scripts/data_prep/make_germline_input.py
@@ -0,0 +1,26 @@
+import sys
+import pandas as pd, numpy as np, os
+from dotenv import load_dotenv
+
+load_dotenv()
+AGGV2_SAMPLE_LIST = os.getenv('AGGV2_SAMPLE_LIST')
+
+if __name__=='__main__':
+
+    dir_output, sample_list_file = sys.argv[1:]
+
+    # Get all samples from previous projects
+    cancer_analysis_tables = [f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv"
+                              for version in ['V8', 'V11/V11_reheadered', 'v14/v14_reheadered']]
+    germline_platekey_list = np.array([])
+    for table in cancer_analysis_tables:
+        germline_platekey_list = np.union1d(germline_platekey_list,
+                                            np.array(pd.read_csv(table, sep="\t", usecols=["germline_sample_platekey"])))
+
+    # Crossmatch with aggV2
+    aggv2_sample_list = np.array(pd.read_csv(AGGV2_SAMPLE_LIST,
+                                    sep="\t", header=None)[0])
+
+    germline_platekey_list = np.intersect1d(germline_platekey_list, aggv2_sample_list)
+    print(f"{len(germline_platekey_list)} germline samples")
+    pd.DataFrame(germline_platekey_list, columns=['germline_sample_platekey']).to_csv(f"{sample_list_file}", header=False, index=False)
diff --git a/scripts/data_prep/make_somatic_input.py b/scripts/data_prep/make_somatic_input.py
@@ -0,0 +1,18 @@
+import sys, os
+import pandas as pd, numpy as np
+
+
+if __name__=='__main__':
+
+    dir_output, sample_list_file = sys.argv[1:]
+
+    # Get all samples from previous projects
+    cancer_analysis_tables = [f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv"
+                              for version in ['V8', 'V11/V11_reheadered', 'v14/v14_reheadered']]
+    tumour_platekey_list = np.array([])
+    for table in cancer_analysis_tables:
+        tumour_platekey_list = np.union1d(tumour_platekey_list,
+                                            np.array(pd.read_csv(table, sep="\t", usecols=["tumour_sample_platekey"])))
+
+    print(f"{len(tumour_platekey_list)} tumour_samples")
+    pd.DataFrame(tumour_platekey_list, columns=['tumour_sample_platekey']).to_csv(f"{sample_list_file}", header=False, index=False)
diff --git a/scripts/data_prep/oncokb.sh b/scripts/data_prep/oncokb.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Combine results from OncoKB
+
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd $parent_path
+
+source ../.env
+
+# parameters
+PROJECT_DIR=../../workdir
+
+# directories and filenames
+dir_analysis=../../data/cancGeneHits/somatic
+filename_oncokb_list=${dir_analysis}/input/oncokb_list.tsv
+filename_genes=${dir_analysis}/input/gene_loci.tsv # Download from https://cancer.sanger.ac.uk/cosmic/download
+
+dir_output=${dir_analysis}/output
+
+chunk_size=100
+n_samples=200
+
+# create output directory
+mkdir -p $dir_analysis/input
+mkdir -p $dir_output
+
+# OncoKB file list
+echo "OncoKB file list"
+oncokb_dir=ONCOKB_DIR
+echo -e "tumour_sample_platekey\tfile" > ${filename_oncokb_list}
+ls ${oncokb_dir} | awk -v dir=$oncokb_dir '{ print $1"\t"dir"/"$1"/"$1"_combined_mutations_OncoKB_query_flatfile.tsv" }' >> ${filename_oncokb_list}
+
+# Gene regions
+echo "Gene regions"
+echo -e "chrom\tstart\tend\tgene_id" > ${filename_genes}
+sed -e 's/\t/_/g' ../../data/cancer_gene_census.csv | awk -F'","' '{print $4"\t"$1}' | sed -e 's/"//g' -e 's/:/\t/' -e 's/-/\t/' | tail -n +2 | sed -e 's/^/chr/'  >> ${filename_genes}
+
+#### code below this point should not need to be changed ####
+
+BASEDIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+echo $BASEDIR
+
+# create working directory and set as current
+dir_wd=${project_dir} # ${dir_output}/nextflow
+dir_wd_old=$( pwd )
+mkdir -p $dir_wd
+cd $dir_wd
+
+# create and set temporary directory
+dir_tmp=${project_dir}/tmp
+mkdir -p $dir_tmp
+export NXF_TEMP=$dir_tmp
+
+# Run pipeline to get function mutations in cancer genes
+nextflow run ../../src/pipelines/cancGeneHits/cancOncoKB.nf -with-trace \
+  --projectName ${project_code} \
+  --filename_oncokb_list ${filename_oncokb_list} \
+  --filename_genes ${filename_genes} \
+  --chunk_size ${chunk_size} \
+  --n_samples ${n_samples} \
+  --dir_output ${dir_output}
+
+# move back to original working directory
+cd $dir_wd_old
diff --git a/pipelines/genotype_assoc.sh → scripts/genotype_assoc.sh b/pipelines/genotype_assoc.sh → scripts/genotype_assoc.sh
diff --git a/pipelines/germline_assoc.sh → scripts/germline_assoc.sh b/pipelines/germline_assoc.sh → scripts/germline_assoc.sh
diff --git a/pipelines/somatic_assoc.sh → scripts/somatic_assoc.sh b/pipelines/somatic_assoc.sh → scripts/somatic_assoc.sh
diff --git a/pipelines/treatment_assoc.sh → scripts/treatment_assoc.sh b/pipelines/treatment_assoc.sh → scripts/treatment_assoc.sh
diff --git a/pipelines/twohit_assoc.sh → scripts/twohit_assoc.sh b/pipelines/twohit_assoc.sh → scripts/twohit_assoc.sh