-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #17 from Wedge-lab/feat/code-from-gel
Code from Genomics England including signatureAssociations.nf
- Loading branch information
Showing
39 changed files
with
4,698 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#!/bin/bash | ||
# run cancGeneHits.nf | ||
|
||
parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) | ||
cd $parent_path | ||
|
||
# parameters | ||
PROJECT_DIR=../workdir | ||
|
||
# directories and filenames | ||
dir_analysis=CLINVAR_DIR | ||
cadd_cmd=../../data/cancGeneHits/clinvar | ||
filename_genes=../../data/cancer_gene_census.csv # Download from https://cancer.sanger.ac.uk/cosmic/download | ||
filename_clinvar_vcf=../../data/clinvar/clinvar.vcf.gz | ||
|
||
dir_output=${dir_analysis}/output | ||
|
||
#### code below this point should not need to be changed #### | ||
|
||
BASEDIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) | ||
echo $BASEDIR | ||
|
||
# create input and output data directories | ||
mkdir -p $dir_output | ||
mkdir -p $dir_analysis/input | ||
|
||
# Gene regions | ||
sed -e 's/\t/_/g' ${filename_genes} | awk -F'","' '{print $4"\t"$1}' | sed -e 's/"//g' -e 's/:/\t/' -e 's/-/\t/' | tail -n +2 | sed -e 's/^/chr/' \ | ||
| awk 'BEGIN { FS = OFS = "\t" } { for(i=1; i<=NF; i++) if($i ~ /^ *$/) $i = 0 }; 1' > ${dir_analysis}/input/gene_loci.tsv | ||
|
||
# create working directory and set as current | ||
dir_wd=${project_dir} # ${dir_output}/nextflow | ||
dir_wd_old=$( pwd ) | ||
mkdir -p $dir_wd | ||
cd $dir_wd | ||
|
||
# create and set temporary directory | ||
dir_tmp=${project_dir}/tmp # /re_scratch/${username}/tmp_run_gwas_${project_code} | ||
mkdir -p $dir_tmp | ||
export NXF_TEMP=$dir_tmp | ||
|
||
# Run pipeline to get function mutations in cancer genes | ||
module load bio/nextflow | ||
nextflow run ../../src/pipelines/cancGeneHits/clinvarCadd.nf -with-trace -resume \ | ||
--filename_genes ${dir_analysis}/input/gene_loci.tsv\ | ||
--filename_clinvar_vcf ${filename_clinvar_vcf}\ | ||
--dir_output ${dir_output}\ | ||
--projectName ${project_code} | ||
|
||
# move back to original working directory | ||
cd $dir_wd_old |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#!/bin/bash | ||
# run cancGeneHits.nf | ||
|
||
parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) | ||
cd $parent_path | ||
|
||
source ../../.env | ||
|
||
# parameters | ||
PROJECT_DIR=../workdir | ||
|
||
# Run parameters | ||
n_files=10000 # Max number of files to run (input a small number if just testing) | ||
PHRED_threshold=20 | ||
|
||
# directories and filenames | ||
dir_analysis=../../data/cancGeneHits/germline_${PHRED_threshold} #test_data | ||
germline_dir=GERMLINE_DIR | ||
cadd_cmd=CLINVAR_CADD_CMD | ||
filename_germline_vcfs=${dir_analysis}/input/germline_vcfs.tsv | ||
filename_sample_list=${dir_analysis}/input/germline_sample_list.tsv | ||
filename_genes=../../data/cancer_gene_census.csv # Download from https://cancer.sanger.ac.uk/cosmic/download | ||
filename_clinvar_vcf=../../data/clinvar/clinvar.vcf.gz | ||
|
||
dir_output=${dir_analysis}/output | ||
|
||
#### code below this point should not need to be changed #### | ||
|
||
BASEDIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) | ||
echo $BASEDIR | ||
|
||
# create input and output data directories | ||
mkdir -p $dir_output | ||
mkdir -p $dir_analysis/input | ||
|
||
if true; then | ||
# Germline vcf file list | ||
ls ${germline_dir} > ${dir_analysis}/germline_file_list.txt | ||
cat ${dir_analysis}/germline_file_list.txt | rev | cut -d'/' -f1 | rev | cut -d'_' -f4 > ${dir_analysis}/chr_list.txt | ||
echo -e "chromosome\tfilename_vcf" > ${filename_germline_vcfs} | ||
paste -d$'\t' ${dir_analysis}/chr_list.txt ${dir_analysis}/germline_file_list.txt >> ${filename_germline_vcfs} | ||
rm ${dir_analysis}/germline_file_list.txt ${dir_analysis}/chr_list.txt | ||
|
||
# Gene regions | ||
sed -e 's/\t/_/g' ${filename_genes} | awk -F'","' '{print $4"\t"$1}' | sed -e 's/"//g' -e 's/:/\t/' -e 's/-/\t/' | tail -n +2 | sed -e 's/^/chr/' \ | ||
| awk 'BEGIN { FS = OFS = "\t" } { for(i=1; i<=NF; i++) if($i ~ /^ *$/) $i = 0 }; 1' > ${dir_analysis}/input/gene_loci.tsv | ||
|
||
# Make sample list | ||
python make_germline_input.py $dir_analysis/input $filename_sample_list | ||
fi | ||
|
||
# create working directory and set as current | ||
dir_wd=${project_dir} # ${dir_output}/nextflow | ||
dir_wd_old=$( pwd ) | ||
mkdir -p $dir_wd | ||
cd $dir_wd | ||
|
||
# create and set temporary directory | ||
dir_tmp=${project_dir}/tmp # /re_scratch/${username}/tmp_run_gwas_${project_code} | ||
mkdir -p $dir_tmp | ||
export NXF_TEMP=$dir_tmp | ||
|
||
|
||
# Run pipeline to get function mutations in cancer genes | ||
module load bio/nextflow/21.04.3 | ||
nextflow run ../../src/pipelines/cancGeneHits/clinvarCadd.nf -with-trace -resume \ | ||
--filename_germline_vcfs ${filename_germline_vcfs}\ | ||
--filename_sample_list ${filename_sample_list}\ | ||
--filename_genes ${dir_analysis}/input/gene_loci.tsv\ | ||
--filename_clinvar_vcf ${filename_clinvar_vcf}\ | ||
--dir_output ${dir_output}\ | ||
--projectName ${project_code}\ | ||
--n_files ${n_files}\ | ||
--PHRED_threshold ${PHRED_threshold} | ||
|
||
# move back to original working directory | ||
cd $dir_wd_old |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#!/bin/bash | ||
# run cancGeneSignatures.nf | ||
|
||
parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) | ||
cd $parent_path | ||
|
||
source ../.env | ||
|
||
# parameters | ||
PROJECT_DIR=../workdir | ||
|
||
# Run parameters | ||
chunk_size=100 # Number of samples to run on a single process | ||
|
||
# directories and filenames | ||
dir_analysis=../../data/cancGeneHits/somatic | ||
filename_battenberg_list=${dir_analysis}/input/battenberg_list.tsv #sample_data.tsv | ||
filename_sample_list=${dir_analysis}/input/loh_sample_list.tsv #sample_data.tsv | ||
filename_genes=${dir_analysis}/input/gene_loci.tsv # Download from https://cancer.sanger.ac.uk/cosmic/download | ||
|
||
dir_output=${dir_analysis}/output | ||
|
||
# create output directory | ||
mkdir -p $dir_analysis/input | ||
mkdir -p $dir_output | ||
|
||
# Battenberg file list | ||
awk 'NR==1 {for (i=1; i<=NF; i++) {f[$i] = i}}{ print $(f["tumour_sample_platekey"])"\t"$(f["filename_cna"]) }' /re_gecip/cancer_pan/fprefect/botl/results/sample_lists/sample_list_2021_06_29.tsv | sed 's/filename_cna/filename_batt/' > ${filename_battenberg_list} | ||
|
||
# Gene regions from | ||
echo -e "chrom\tstart\tend\tgene_id" > ${filename_genes} | ||
sed -e 's/\t/_/g' ../../data/cancer_gene_census.csv | awk -F'","' '{print $4"\t"$1}' | sed -e 's/"//g' -e 's/:/\t/' -e 's/-/\t/' | tail -n +2 | sed -e 's/^/chr/' >> ${filename_genes} | ||
|
||
# Make sample list | ||
python make_somatic_input.py $dir_analysis/input $filename_sample_list | ||
|
||
#### code below this point should not need to be changed #### | ||
|
||
# create working directory and set as current | ||
dir_wd=${PROJECT_DIR} # ${dir_output}/nextflow | ||
dir_wd_old=$( pwd ) | ||
mkdir -p $dir_wd | ||
cd $dir_wd | ||
|
||
# create and set temporary directory | ||
dir_tmp=${PROJECT_DIR}/tmp # /re_scratch/${username}/tmp_run_gwas_${project_code} | ||
mkdir -p $dir_tmp | ||
export NXF_TEMP=$dir_tmp | ||
|
||
echo "Run nextflow" | ||
|
||
# Run pipeline to get function mutations in cancer genes | ||
nextflow run ../../src/pipelines/cancGeneHits/cancLOH.nf -with-trace \ | ||
--run_name "test"\ | ||
--filename_battenberg_list ${filename_battenberg_list}\ | ||
--filename_sample_list ${filename_sample_list}\ | ||
--filename_genes ${filename_genes}\ | ||
--chunk_size ${chunk_size}\ | ||
--dir_output ${dir_output}\ | ||
--projectName ${project_code} | ||
|
||
# move back to original working directory | ||
cd $dir_wd_old |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import sys | ||
import pandas as pd, numpy as np, os | ||
from dotenv import load_dotenv | ||
|
||
load_dotenv() | ||
AGGV2_SAMPLE_LIST = os.getenv('AGGV2_SAMPLE_LIST') | ||
|
||
if __name__=='__main__': | ||
|
||
dir_output, sample_list_file = sys.argv[1:] | ||
|
||
# Get all samples from previous projects | ||
cancer_analysis_tables = [f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv" | ||
for version in ['V8', 'V11/V11_reheadered', 'v14/v14_reheadered']] | ||
germline_platekey_list = np.array([]) | ||
for table in cancer_analysis_tables: | ||
germline_platekey_list = np.union1d(germline_platekey_list, | ||
np.array(pd.read_csv(table, sep="\t", usecols=["germline_sample_platekey"]))) | ||
|
||
# Crossmatch with aggV2 | ||
aggv2_sample_list = np.array(pd.read_csv(AGGV2_SAMPLE_LIST, | ||
sep="\t", header=None)[0]) | ||
|
||
germline_platekey_list = np.intersect1d(germline_platekey_list, aggv2_sample_list) | ||
print(f"{len(germline_platekey_list)} germline samples") | ||
pd.DataFrame(germline_platekey_list, columns=['germline_sample_platekey']).to_csv(f"{sample_list_file}", header=False, index=False) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import sys, os | ||
import pandas as pd, numpy as np | ||
|
||
|
||
if __name__=='__main__': | ||
|
||
dir_output, sample_list_file = sys.argv[1:] | ||
|
||
# Get all samples from previous projects | ||
cancer_analysis_tables = [f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv" | ||
for version in ['V8', 'V11/V11_reheadered', 'v14/v14_reheadered']] | ||
tumour_platekey_list = np.array([]) | ||
for table in cancer_analysis_tables: | ||
tumour_platekey_list = np.union1d(tumour_platekey_list, | ||
np.array(pd.read_csv(table, sep="\t", usecols=["tumour_sample_platekey"]))) | ||
|
||
print(f"{len(tumour_platekey_list)} tumour_samples") | ||
pd.DataFrame(tumour_platekey_list, columns=['tumour_sample_platekey']).to_csv(f"{sample_list_file}", header=False, index=False) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#!/bin/bash | ||
# Combine results from OncoKB | ||
|
||
parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) | ||
cd $parent_path | ||
|
||
source ../.env | ||
|
||
# parameters | ||
PROJECT_DIR=../../workdir | ||
|
||
# directories and filenames | ||
dir_analysis=../../data/cancGeneHits/somatic | ||
filename_oncokb_list=${dir_analysis}/input/oncokb_list.tsv | ||
filename_genes=${dir_analysis}/input/gene_loci.tsv # Download from https://cancer.sanger.ac.uk/cosmic/download | ||
|
||
dir_output=${dir_analysis}/output | ||
|
||
chunk_size=100 | ||
n_samples=200 | ||
|
||
# create output directory | ||
mkdir -p $dir_analysis/input | ||
mkdir -p $dir_output | ||
|
||
# OncoKB file list | ||
echo "OncoKB file list" | ||
oncokb_dir=ONCOKB_DIR | ||
echo -e "tumour_sample_platekey\tfile" > ${filename_oncokb_list} | ||
ls ${oncokb_dir} | awk -v dir=$oncokb_dir '{ print $1"\t"dir"/"$1"/"$1"_combined_mutations_OncoKB_query_flatfile.tsv" }' >> ${filename_oncokb_list} | ||
|
||
# Gene regions | ||
echo "Gene regions" | ||
echo -e "chrom\tstart\tend\tgene_id" > ${filename_genes} | ||
sed -e 's/\t/_/g' ../../data/cancer_gene_census.csv | awk -F'","' '{print $4"\t"$1}' | sed -e 's/"//g' -e 's/:/\t/' -e 's/-/\t/' | tail -n +2 | sed -e 's/^/chr/' >> ${filename_genes} | ||
|
||
#### code below this point should not need to be changed #### | ||
|
||
BASEDIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) | ||
echo $BASEDIR | ||
|
||
# create working directory and set as current | ||
dir_wd=${project_dir} # ${dir_output}/nextflow | ||
dir_wd_old=$( pwd ) | ||
mkdir -p $dir_wd | ||
cd $dir_wd | ||
|
||
# create and set temporary directory | ||
dir_tmp=${project_dir}/tmp | ||
mkdir -p $dir_tmp | ||
export NXF_TEMP=$dir_tmp | ||
|
||
# Run pipeline to get function mutations in cancer genes | ||
nextflow run ../../src/pipelines/cancGeneHits/cancOncoKB.nf -with-trace \ | ||
--projectName ${project_code} \ | ||
--filename_oncokb_list ${filename_oncokb_list} \ | ||
--filename_genes ${filename_genes} \ | ||
--chunk_size ${chunk_size} \ | ||
--n_samples ${n_samples} \ | ||
--dir_output ${dir_output} | ||
|
||
# move back to original working directory | ||
cd $dir_wd_old |
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Oops, something went wrong.