diff --git a/README.md b/README.md
index 1ae4636..f0b96f3 100644
--- a/README.md
+++ b/README.md
@@ -2,3 +2,36 @@
 Code and results for paper: [Comprehensive repertoire of the chromosomal alteration and mutational signatures across 16 cancer types from 10,983 cancer patients](doi.org/10.1101/2023.06.07.23290970)
+
+Disclaimer: This code was written inside the Genomics England Research Environment without GitHub and has not been tested outside the research environment. We provide it here to aid the transparency of the publication and to make our analysis methods more accessible and reproducible. The pipelines provided will almost certainly not be directly usable outside Genomics England; however, we hope users find pieces of code which help them understand our work and which may be useful in their own research.
+
+
+# Contained in this repository
+
+
+### Curate gene mutations
+Bash scripts in `./scripts/data_prep` collect and curate genetic mutations in the WGS data available in Genomics England, including inferring expected impact using CADD scores or OncoKB annotations.
+
+
+### Combine signatures
+
+
+
+
+### Association analysis
+
+A series of scripts in `./scripts` run association analysis of mutation rates against various covariates, including germline and somatic gene inactivations and treatment exposures:
+```
+germline_assoc.sh
+treatment_assoc.sh
+genotype_assoc.sh
+somatic_assoc.sh
+twohit_assoc.sh
+```
+Each script prepares data and runs the `signatureAssociations.nf` pipeline.
+
+*This is probably the most interesting thing contained in this code.*
+Go to `src/pipelines/signatures/signatureAssociations.nf/README.md` for more details.
+This pipeline contains the GLM association analysis with resampling, which reduces the false positive rate of associations.
+
+
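To make the README's headline method concrete, here is a minimal, illustrative sketch of the resampling idea behind `signatureAssociations.nf` — assuming a plain Poisson GLM and naive permutation in place of the pipeline's zero-inflated negative binomial models and dCRT resampling; all names and data below are invented:

```python
# Sketch: compare an observed likelihood-ratio statistic for a target covariate
# against a null distribution built by resampling the target and refitting.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
n = 500
covariates = rng.normal(size=(n, 2))        # stand-ins for e.g. age, purity
target = rng.binomial(1, 0.3, size=n)       # stand-in for a gene-hit indicator
mu = np.exp(1.0 + 0.2 * covariates[:, 0] + 0.5 * target)
counts = rng.poisson(mu)                    # stand-in for signature activity

def lr_stat(y, X_null, X_alt):
    """Likelihood-ratio statistic between nested Poisson GLMs."""
    ll0 = sm.GLM(y, X_null, family=sm.families.Poisson()).fit().llf
    ll1 = sm.GLM(y, X_alt, family=sm.families.Poisson()).fit().llf
    return 2 * (ll1 - ll0)

X_null = sm.add_constant(covariates)              # covariates only
X_alt = np.column_stack([X_null, target])         # covariates + target
observed = lr_stat(counts, X_null, X_alt)

# Empirical tail probability replaces the asymptotic (Wilks) p-value,
# which is what keeps the false positive rate under control.
n_resamples = 200
null_stats = np.array([lr_stat(counts, X_null,
                               np.column_stack([X_null, rng.permutation(target)]))
                       for _ in range(n_resamples)])
p_resampled = (1 + np.sum(null_stats >= observed)) / (1 + n_resamples)
print(f"LR = {observed:.2f}, resampled p = {p_resampled:.4f}")
```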
diff --git a/pipelines/clinical.sh b/scripts/clinical.sh
similarity index 100%
rename from pipelines/clinical.sh
rename to scripts/clinical.sh
diff --git a/scripts/data_prep/clinvar_cadd.sh b/scripts/data_prep/clinvar_cadd.sh
new file mode 100755
index 0000000..ddd31de
--- /dev/null
+++ b/scripts/data_prep/clinvar_cadd.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# run clinvarCadd.nf
+
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd $parent_path
+
+source ../../.env
+
+# parameters
+PROJECT_DIR=../workdir
+
+# directories and filenames
+dir_analysis=CLINVAR_DIR
+cadd_cmd=../../data/cancGeneHits/clinvar
+filename_genes=../../data/cancer_gene_census.csv # Download from https://cancer.sanger.ac.uk/cosmic/download
+filename_clinvar_vcf=../../data/clinvar/clinvar.vcf.gz
+
+dir_output=${dir_analysis}/output
+
+#### code below this point should not need to be changed ####
+
+BASEDIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+echo $BASEDIR
+
+# create input and output data directories
+mkdir -p $dir_output
+mkdir -p $dir_analysis/input
+
+# Gene regions
+sed -e 's/\t/_/g' ${filename_genes} | awk -F'","' '{print $4"\t"$1}' | sed -e 's/"//g' -e 's/:/\t/' -e 's/-/\t/' | tail -n +2 | sed -e 's/^/chr/' \
+| awk 'BEGIN { FS = OFS = "\t" } { for(i=1; i<=NF; i++) if($i ~ /^ *$/) $i = 0 }; 1' > ${dir_analysis}/input/gene_loci.tsv
+
+# create working directory and set as current
+dir_wd=${PROJECT_DIR} # ${dir_output}/nextflow
+dir_wd_old=$( pwd )
+mkdir -p $dir_wd
+cd $dir_wd
+
+# create and set temporary directory
+dir_tmp=${PROJECT_DIR}/tmp # /re_scratch/${username}/tmp_run_gwas_${project_code}
+mkdir -p $dir_tmp
+export NXF_TEMP=$dir_tmp
+
+# Run pipeline to get functional mutations in cancer genes
+module load bio/nextflow
+nextflow run ../../src/pipelines/cancGeneHits/clinvarCadd.nf -with-trace -resume \
+    --filename_genes ${dir_analysis}/input/gene_loci.tsv\
+    --filename_clinvar_vcf ${filename_clinvar_vcf}\
+    --dir_output ${dir_output}\
+    --projectName ${project_code}
+
+# move back to original working directory
+cd $dir_wd_old
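The one-line sed/awk pipelines that build `gene_loci.tsv` from the COSMIC Cancer Gene Census export are dense; the sketch below shows the same transformation in pandas, assuming the standard census column names (`Gene Symbol`, and `Genome Location` formatted as `chr:start-end`). It is illustrative only, not part of the pipelines:

```python
# Parse the COSMIC Cancer Gene Census CSV into a chrom/start/end/gene_id table,
# zeroing malformed or missing coordinates (as the awk step above does).
import pandas as pd

census = pd.read_csv("cancer_gene_census.csv")
loc = census["Genome Location"].str.extract(r"(?P<chrom>[^:]+):(?P<start>\d*)-(?P<end>\d*)")
loci = pd.DataFrame({
    "chrom": "chr" + loc["chrom"],
    "start": pd.to_numeric(loc["start"], errors="coerce").fillna(0).astype(int),
    "end": pd.to_numeric(loc["end"], errors="coerce").fillna(0).astype(int),
    "gene_id": census["Gene Symbol"],
})
loci.to_csv("gene_loci.tsv", sep="\t", index=False, header=False)
```

pandas handles the quoted, comma-containing CSV fields directly, which is why no tab/underscore juggling is needed here.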
"chromosome\tfilename_vcf" > ${filename_germline_vcfs} + paste -d$'\t' ${dir_analysis}/chr_list.txt ${dir_analysis}/germline_file_list.txt >> ${filename_germline_vcfs} + rm ${dir_analysis}/germline_file_list.txt ${dir_analysis}/chr_list.txt + + # Gene regions + sed -e 's/\t/_/g' ${filename_genes} | awk -F'","' '{print $4"\t"$1}' | sed -e 's/"//g' -e 's/:/\t/' -e 's/-/\t/' | tail -n +2 | sed -e 's/^/chr/' \ + | awk 'BEGIN { FS = OFS = "\t" } { for(i=1; i<=NF; i++) if($i ~ /^ *$/) $i = 0 }; 1' > ${dir_analysis}/input/gene_loci.tsv + + # Make sample list + python make_germline_input.py $dir_analysis/input $filename_sample_list +fi + +# create working directory and set as current +dir_wd=${project_dir} # ${dir_output}/nextflow +dir_wd_old=$( pwd ) +mkdir -p $dir_wd +cd $dir_wd + +# create and set temporary directory +dir_tmp=${project_dir}/tmp # /re_scratch/${username}/tmp_run_gwas_${project_code} +mkdir -p $dir_tmp +export NXF_TEMP=$dir_tmp + + +# Run pipeline to get function mutations in cancer genes +module load bio/nextflow/21.04.3 +nextflow run ../../src/pipelines/cancGeneHits/clinvarCadd.nf -with-trace -resume \ + --filename_germline_vcfs ${filename_germline_vcfs}\ + --filename_sample_list ${filename_sample_list}\ + --filename_genes ${dir_analysis}/input/gene_loci.tsv\ + --filename_clinvar_vcf ${filename_clinvar_vcf}\ + --dir_output ${dir_output}\ + --projectName ${project_code}\ + --n_files ${n_files}\ + --PHRED_threshold ${PHRED_threshold} + +# move back to original working directory +cd $dir_wd_old diff --git a/scripts/data_prep/loh.sh b/scripts/data_prep/loh.sh new file mode 100755 index 0000000..5e9e49c --- /dev/null +++ b/scripts/data_prep/loh.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# run cancGeneSignatures.nf + +parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) +cd $parent_path + +source ../.env + +# parameters +PROJECT_DIR=../workdir + +# Run parameters +chunk_size=100 # Number of samples to run on a single process + +# directories and filenames +dir_analysis=../../data/cancGeneHits/somatic +filename_battenberg_list=${dir_analysis}/input/battenberg_list.tsv #sample_data.tsv +filename_sample_list=${dir_analysis}/input/loh_sample_list.tsv #sample_data.tsv +filename_genes=${dir_analysis}/input/gene_loci.tsv # Download from https://cancer.sanger.ac.uk/cosmic/download + +dir_output=${dir_analysis}/output + +# create output directory +mkdir -p $dir_analysis/input +mkdir -p $dir_output + +# Battenberg file list +awk 'NR==1 {for (i=1; i<=NF; i++) {f[$i] = i}}{ print $(f["tumour_sample_platekey"])"\t"$(f["filename_cna"]) }' /re_gecip/cancer_pan/fprefect/botl/results/sample_lists/sample_list_2021_06_29.tsv | sed 's/filename_cna/filename_batt/' > ${filename_battenberg_list} + +# Gene regions from +echo -e "chrom\tstart\tend\tgene_id" > ${filename_genes} +sed -e 's/\t/_/g' ../../data/cancer_gene_census.csv | awk -F'","' '{print $4"\t"$1}' | sed -e 's/"//g' -e 's/:/\t/' -e 's/-/\t/' | tail -n +2 | sed -e 's/^/chr/' >> ${filename_genes} + +# Make sample list +python make_somatic_input.py $dir_analysis/input $filename_sample_list + +#### code below this point should not need to be changed #### + +# create working directory and set as current +dir_wd=${PROJECT_DIR} # ${dir_output}/nextflow +dir_wd_old=$( pwd ) +mkdir -p $dir_wd +cd $dir_wd + +# create and set temporary directory +dir_tmp=${PROJECT_DIR}/tmp # /re_scratch/${username}/tmp_run_gwas_${project_code} +mkdir -p $dir_tmp +export NXF_TEMP=$dir_tmp + +echo "Run nextflow" + +# Run pipeline to get function mutations in cancer genes 
+nextflow run ../../src/pipelines/cancGeneHits/cancLOH.nf -with-trace \
+    --run_name "test"\
+    --filename_battenberg_list ${filename_battenberg_list}\
+    --filename_sample_list ${filename_sample_list}\
+    --filename_genes ${filename_genes}\
+    --chunk_size ${chunk_size}\
+    --dir_output ${dir_output}\
+    --projectName ${project_code}
+
+# move back to original working directory
+cd $dir_wd_old
diff --git a/scripts/data_prep/make_germline_input.py b/scripts/data_prep/make_germline_input.py
new file mode 100755
index 0000000..c6959c4
--- /dev/null
+++ b/scripts/data_prep/make_germline_input.py
@@ -0,0 +1,26 @@
+import sys
+import pandas as pd, numpy as np, os
+from dotenv import load_dotenv
+
+load_dotenv()
+AGGV2_SAMPLE_LIST = os.getenv('AGGV2_SAMPLE_LIST')
+
+if __name__=='__main__':
+
+    dir_output, sample_list_file = sys.argv[1:]
+
+    # Get all samples from previous projects
+    cancer_analysis_tables = [f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv"
+                              for version in ['V8', 'V11/V11_reheadered', 'v14/v14_reheadered']]
+    germline_platekey_list = np.array([])
+    for table in cancer_analysis_tables:
+        germline_platekey_list = np.union1d(germline_platekey_list,
+                                            np.array(pd.read_csv(table, sep="\t", usecols=["germline_sample_platekey"])))
+
+    # Crossmatch with aggV2
+    aggv2_sample_list = np.array(pd.read_csv(AGGV2_SAMPLE_LIST,
+                                             sep="\t", header=None)[0])
+
+    germline_platekey_list = np.intersect1d(germline_platekey_list, aggv2_sample_list)
+    print(f"{len(germline_platekey_list)} germline samples")
+    pd.DataFrame(germline_platekey_list, columns=['germline_sample_platekey']).to_csv(f"{sample_list_file}", header=False, index=False)
diff --git a/scripts/data_prep/make_somatic_input.py b/scripts/data_prep/make_somatic_input.py
new file mode 100755
index 0000000..b756c45
--- /dev/null
+++ b/scripts/data_prep/make_somatic_input.py
@@ -0,0 +1,18 @@
+import sys, os
+import pandas as pd, numpy as np
+
+
+if __name__=='__main__':
+
+    dir_output, sample_list_file = sys.argv[1:]
+
+    # Get all samples from previous projects
+    cancer_analysis_tables = [f"/re_gecip/shared_allGeCIPs/labkey_tables/{version}/cancer_analysis.tsv"
+                              for version in ['V8', 'V11/V11_reheadered', 'v14/v14_reheadered']]
+    tumour_platekey_list = np.array([])
+    for table in cancer_analysis_tables:
+        tumour_platekey_list = np.union1d(tumour_platekey_list,
+                                          np.array(pd.read_csv(table, sep="\t", usecols=["tumour_sample_platekey"])))
+
+    print(f"{len(tumour_platekey_list)} tumour_samples")
+    pd.DataFrame(tumour_platekey_list, columns=['tumour_sample_platekey']).to_csv(f"{sample_list_file}", header=False, index=False)
diff --git a/scripts/data_prep/oncokb.sh b/scripts/data_prep/oncokb.sh
new file mode 100755
index 0000000..18bc2cb
--- /dev/null
+++ b/scripts/data_prep/oncokb.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Combine results from OncoKB
+
+parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
+cd $parent_path
+
+source ../../.env
+
+# parameters
+PROJECT_DIR=../../workdir
+
+# directories and filenames
+dir_analysis=../../data/cancGeneHits/somatic
+filename_oncokb_list=${dir_analysis}/input/oncokb_list.tsv
+filename_genes=${dir_analysis}/input/gene_loci.tsv # Built from https://cancer.sanger.ac.uk/cosmic/download
+
+dir_output=${dir_analysis}/output
+
+chunk_size=100
+n_samples=200
+
+# create output directory
+mkdir -p $dir_analysis/input
+mkdir -p $dir_output
+
+# OncoKB file list
+echo "OncoKB file list"
+oncokb_dir=ONCOKB_DIR
+echo -e "tumour_sample_platekey\tfile" > ${filename_oncokb_list}
+ls ${oncokb_dir} | awk -v dir=$oncokb_dir '{ print $1"\t"dir"/"$1"/"$1"_combined_mutations_OncoKB_query_flatfile.tsv" }' >> ${filename_oncokb_list}
+
+# Gene regions
+echo "Gene regions"
+echo -e "chrom\tstart\tend\tgene_id" > ${filename_genes}
+sed -e 's/\t/_/g' ../../data/cancer_gene_census.csv | awk -F'","' '{print $4"\t"$1}' | sed -e 's/"//g' -e 's/:/\t/' -e 's/-/\t/' | tail -n +2 | sed -e 's/^/chr/' >> ${filename_genes}
+
+#### code below this point should not need to be changed ####
+
+BASEDIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+echo $BASEDIR
+
+# create working directory and set as current
+dir_wd=${PROJECT_DIR} # ${dir_output}/nextflow
+dir_wd_old=$( pwd )
+mkdir -p $dir_wd
+cd $dir_wd
+
+# create and set temporary directory
+dir_tmp=${PROJECT_DIR}/tmp
+mkdir -p $dir_tmp
+export NXF_TEMP=$dir_tmp
+
+# Run pipeline to get functional mutations in cancer genes
+nextflow run ../../src/pipelines/cancGeneHits/cancOncoKB.nf -with-trace \
+    --projectName ${project_code} \
+    --filename_oncokb_list ${filename_oncokb_list} \
+    --filename_genes ${filename_genes} \
+    --chunk_size ${chunk_size} \
+    --n_samples ${n_samples} \
+    --dir_output ${dir_output}
+
+# move back to original working directory
+cd $dir_wd_old
diff --git a/pipelines/genotype_assoc.sh b/scripts/genotype_assoc.sh
similarity index 100%
rename from pipelines/genotype_assoc.sh
rename to scripts/genotype_assoc.sh
diff --git a/pipelines/germline_assoc.sh b/scripts/germline_assoc.sh
similarity index 100%
rename from pipelines/germline_assoc.sh
rename to scripts/germline_assoc.sh
diff --git a/pipelines/somatic_assoc.sh b/scripts/somatic_assoc.sh
similarity index 100%
rename from pipelines/somatic_assoc.sh
rename to scripts/somatic_assoc.sh
diff --git a/pipelines/treatment_assoc.sh b/scripts/treatment_assoc.sh
similarity index 100%
rename from pipelines/treatment_assoc.sh
rename to scripts/treatment_assoc.sh
diff --git a/pipelines/twohit_assoc.sh b/scripts/twohit_assoc.sh
similarity index 100%
rename from pipelines/twohit_assoc.sh
rename to scripts/twohit_assoc.sh
diff --git a/src/pipelines/cancGeneHits/cancGermlineHits.nf/main.nf b/src/pipelines/cancGeneHits/cancGermlineHits.nf/main.nf
new file mode 100755
index 0000000..bbb8fe7
--- /dev/null
+++ b/src/pipelines/cancGeneHits/cancGermlineHits.nf/main.nf
@@ -0,0 +1,161 @@
+#!/usr/bin/env nextflow
+// cancGermlineHits.nf pipeline
+// written by Andy Everall using code from Ben Kinnersley
+//
+
+// help message
+def helpMessage() {
+    log.info"""
+    Mandatory arguments:
+      --filename_germline_vcfs    tsv file with chromosome, vcf file path columns
+      --filename_sample_list      List of samples to extract
+      --filename_genes            Gene loci to extract - tsv file with chromosome, start, end, ID
+      --filename_clinvar_vcf      vcf file of clinvar mutations
+      --dir_output                Directory to save results
+
+    Optional arguments:
+      --projectName               LSF project name that jobs are submitted with (e.g. re_gecip_cancer_colorectal for CRC GeCIP). Default: re_gecip_cancer_pan
+      --max_signatures            Max number of signatures to run in the method - use a small number when testing. Default: 1000
+      --n_files                   Number of vcf files to process (put in a small number if doing a trial run)
+    """.stripIndent()
+}
+if (params.help){
+    helpMessage()
+    exit 0
+}
+
+
+// Input channels
+chrom_ch = Channel.from(1..22).concat(Channel.of('X'))
+file_chunks = file(params.filename_germline_vcfs)
+file_genes = file(params.filename_genes)
+
+// run time 1-10 hours with 9500 samples
+Channel.from(file_chunks)
+    .splitCsv(header: true, sep: '\t', limit: params.n_files )
+    .map { row -> [row.chromosome, row.filename_vcf] }
+    .set { chunk_ch }
+
+process clinvar {
+    tag "Filter ClinVar"
+    label 'short_job'
+
+    // errorStrategy { task.exitStatus==123 ? 'ignore':'terminate'}
+
+    input:
+    val(chromosome) from chrom_ch
+
+    output:
+    set env(chr_label), file("clinvar_filtered_*.vcf") into clinvar_filtered_ch
+
+    module 'bio/BCFtools'
+
+    """
+    chr_label='chr${chromosome}'
+    echo -e '${chromosome} chr${chromosome}\n' > rename_chrs.txt
+
+    # Select gene regions on chromosome
+    grep -P '^chr${chromosome}\t' ${params.filename_genes} | sed 's/^chr//g' > chr${chromosome}_gene_regions.txt
+    # Filter by pathogenic or likely pathogenic, or at least one study says pathogenic/likely pathogenic and none say benign
+    bcftools view -i 'CLNSIG="Pathogenic/i"|CLNSIG="Likely_pathogenic"|(CLNSIGCONF~"Pathogenic/i"&CLNSIGCONF!~"Benign/i")' \
+    -R chr${chromosome}_gene_regions.txt ${params.filename_clinvar_vcf} | \
+    bcftools annotate --rename-chrs rename_chrs.txt > clinvar_filtered_chr${chromosome}.vcf
+    """
+}
+
+process cadd {
+    tag "Run CADD"
+    label 'short_job'
+
+    // errorStrategy { task.exitStatus==123 ? 'ignore':'terminate'}
+
+    input:
+    set chromosome, vcf_fname from chunk_ch
+    file(gene_regions) from file_genes
+
+    output:
+    set val(chromosome), file("*_regions.vcf"), file("*.cadd.txt.gz") into mutations_ch //
+
+    module 'bio/BCFtools:tools/snakemake/5.7.1-foss-2019a-Python-3.7.2:lang/Anaconda3/2021.11'
+
+    """
+    # TODO: use conda environment from .env
+
+    # Get unique ID of file chunk
+    uid=\$(echo ${vcf_fname} | awk -F'aggV2_' '{print \$2}' | cut -d'.' -f1)
+
+    CADD_OUT="${params.dir_output}/cadd_output/\${uid}.cadd.txt.gz"
+
+    vcf_prefix=\$(echo ${vcf_fname} | cut -d"." -f1)
+
+    # For renaming chromosomes (CADD only uses numbers without chr)
+    chrom=${chromosome}
+    echo -e "${chromosome} \${chrom#chr}" > rename_chrs.txt
+
+    echo "Filter samples and genes with bcftools"
+    # if [ -f "\$CADD_OUT" ]; then
+    if false; then
+        ln -s \$CADD_OUT
+    else
+        # Find all the variants that fall in the gene regions, keep only pass variants and only alt variants (vcf file contains pathogenic variants but alt genotype may not actually be present)
+        echo "bcftools to filter regions"
+        bcftools view -i 'FILTER="PASS" && GT="alt"' -R ${gene_regions} -S ${params.filename_sample_list} ${vcf_fname} > \${uid}_regions.vcf
+        echo "bcftools annotate to change chromosome label format from chrX to X"
+        bcftools annotate --rename-chrs rename_chrs.txt \${uid}_regions.vcf > \${uid}_reformat.vcf
+
+        # bug in cadd - cannot have "chr" in chromosome name, needs to be tab separated
+        # TODO: set core number equal to number of cpus
+        echo "Run CADD"
+        bash ${params.cadd_cmd} -c 30 -g GRCh38 -o \${uid}.cadd.txt.gz \${uid}_reformat.vcf
+    fi
+    """
+}
+// Group genotypes by chromosome
+mutations_ch.groupTuple().join(clinvar_filtered_ch)
+    .into { germline_grouped_ch; view_ch }
+
+process germline_hits {
+    tag "Extract mutations"
+    label 'short_job'
+
+    // publishDir "${params.dir_output}/mutations", mode: "copy", overwrite: true
+
+    input:
+    tuple val(chromosome), file(list_of_cadds), file(list_of_vcfs), file(clinvar_vcf) from germline_grouped_ch
+
+    output:
+    file("aggv2-germline_cancer-gene_hits_*.tsv") into germline_hits_ch
+    file("CADD_ClinVar_filter_df_*.tsv") into filter_ch
+
+    module 'bio/BCFtools:singularity/3.2.1:lang/Anaconda3/2021.11'
+
+    """
+    echo "${chromosome}, ${list_of_vcfs}"
+
+    # Activate python virtual environment
+    # TODO: use conda environment from .env
+
+    # Make file of vcfs
+    ls *_regions.vcf > vcf_list.txt
+    ls *.cadd.txt.gz > cadd_list.txt
+
+    echo "Get ClinVar data table from vcf"
+    bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%INFO/CLNSIG\t%INFO/CLNSIGCONF\n' ${clinvar_vcf} > clinvar.tsv
+
+    # Import to Hail format
+    python3 ${baseDir}/scripts/germline_hail.py \
+        ${chromosome} \
+        'vcf_list.txt' \
+        ${params.filename_sample_list} \
+        'cadd_list.txt' \
+        ${params.filename_genes} \
+        clinvar.tsv \
+        1 \
+        'GRCh38' \
+        ${params.PHRED_threshold}
+
+    """
+}
+germline_hits_ch.collectFile(name: "aggv2-germline_cancer-gene_hits.tsv", keepHeader: true, skip: 1, storeDir: "${params.dir_output}")
+filter_ch.collectFile(name: "CADD_ClinVar_filter_df.tsv", keepHeader: true, skip: 1, storeDir: "${params.dir_output}")
diff --git a/src/pipelines/cancGeneHits/cancGermlineHits.nf/nextflow.config b/src/pipelines/cancGeneHits/cancGermlineHits.nf/nextflow.config
new file mode 100755
index 0000000..ee9d43b
--- /dev/null
+++ b/src/pipelines/cancGeneHits/cancGermlineHits.nf/nextflow.config
@@ -0,0 +1,67 @@
+// pipeline information
+manifest {
+    author = 'Andy Everall'
+    description = 'Retrieve pathogenic germline variants in cancer genes'
+    mainScript = 'main.nf'
+    name = 'cancGermlineHits.nf'
+    nextflowVersion = '21.04.3'
+    version = '0.0.1'
+}
+
+// environmental variables
+// env {
+// }
+
+// default global parameters
+params {
+    // options: generic
+    help = false
+
+    // options: LSF
+    projectName = 're_gecip_cancer_pan'
+
+    // CADD executor
+    cadd_cmd = '/re_gecip/shared_allGeCIPs/bkinnersley/CADD/CADD-scripts-master/CADD.sh'
+    PHRED_threshold = 20
+
+    // options: singularity
+    bind = '/nas/weka.gel.zone/resources:/resources,/nas/weka.gel.zone/resources/vscaler/ohpc:/opt/ohpc,/home:/home,/re_gecip:/re_gecip,/re_scratch/acornish/hail:/tmp,/gel_data_resources:/gel_data_resources'
+
+    // options: test run
+    max_signatures = 1000
+}
+
+// default process parameters
+process {
+    executor = 'lsf'
+    clusterOptions = "-P ${params.projectName}" // should not specify -R "span[hosts=1]", as this is automatically included when cpus > 1
+    errorStrategy = 'retry'
+    maxRetries = 3
+    beforeScript = { task.attempt <= 1 ? 'sleep 1' : 'sleep 30' } // negate delay in file availability
+    withLabel: 'regular|standard' {
+        executor = 'local'
+        memory = { task.attempt <= 1 ? '16 GB' : '32 GB' }
+        queue = 'short'
+    }
+    withLabel: short_job {
+        time = { task.attempt <= 2 ? '4h' : '24h' }
+        cpus = { task.attempt <= 1 ? '2' : '4' }
+        memory = { task.attempt <= 1 ? '32 GB' : '64 GB' }
+        queue = { task.attempt <= 2 ? 'short' : 'medium' }
+    }
+}
+
+// default job submission parameters
+executor {
+    submitRateLimit = '5 sec'
+    $lsf { queueSize = 50 }
+    $local { queueSize = 50 }
+}
+
+timeline {
+    enabled=true
+}
+
+report {
+    enabled=true
+}
diff --git a/src/pipelines/cancGeneHits/cancGermlineHits.nf/scripts/germline_hail.py b/src/pipelines/cancGeneHits/cancGermlineHits.nf/scripts/germline_hail.py
new file mode 100755
index 0000000..3095f61
--- /dev/null
+++ b/src/pipelines/cancGeneHits/cancGermlineHits.nf/scripts/germline_hail.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+
+## Collect pathogenic germline variants in cancer genes, filtering on CADD scores and ClinVar annotations
+'''
+    Input: chromosome, vcf file list, sample list, CADD file list, gene loci, ClinVar table, n_cpu, reference genome, PHRED threshold
+    Output: aggv2-germline_cancer-gene_hits_<chromosome>.tsv, CADD_ClinVar_filter_df_<chromosome>.tsv
+'''
+import hail as hl
+import pandas as pd, numpy as np
+import sys
+import re
+
+# Input arguments
+chromosome, vcf_list_file, sample_file, cadd_list_file, gene_file, clinvar_file, n_cpu, reference, PHRED_threshold = sys.argv[1:]
+PHRED_threshold = int(PHRED_threshold)
+print(sys.argv[1:])
+
+# Load list of VCF files
+vcf_files = pd.read_csv(vcf_list_file, sep="\t", header=None)[0].to_list()
+print(vcf_files)
+
+# Load list of CADD files
+cadd_files = pd.read_csv(cadd_list_file, sep="\t", header=None)[0].to_list()
+cadd_df = pd.concat([pd.read_csv(cadd_file, sep="\t", compression='gzip',
+                                 comment='#', names=['chrom','pos',
+                                                     'ref','alt',
+                                                     'raw','phred']) for cadd_file in cadd_files])
+cadd_df['chrom'] = "chr"+cadd_df.chrom.astype(str)
+
+# Load clinvar
+clinvar_df = pd.read_csv(clinvar_file, sep="\t", names=['chrom','pos','id','ref','alt','CLNSIG','CLNSIGCONF'])
+
+# Read in samples
+samples = pd.read_csv(sample_file, sep="\t", names=["germline_sample_platekey"])
+
+# Read in genes - chrom,start,end,gene_id
+genes = pd.read_csv(gene_file, sep="\t", names=['chrom','start','end','gene_id']).fillna(0)
+genes['start'] = genes['start'].astype(int)
+genes['end'] = genes['end'].astype(int)
+genes = genes[genes.chrom==chromosome]
+print(genes[genes.start==0])
+genes = genes[genes.start!=0]
+print(f"genes: {len(genes)}")
+
+
+# CADD and ClinVar filters
+filter_df = pd.merge(cadd_df, clinvar_df, on=('chrom','pos','ref','alt'), how='left')
+# Apply filters: high CADD score and no ClinVar benign annotation
+benign_match = np.vectorize(lambda x: bool(re.match('.*benign.*', x, re.IGNORECASE)))
+filter_df = filter_df[(filter_df.phred>PHRED_threshold)&\
+                      (~benign_match(filter_df.CLNSIG.astype(str)))]
+filter_df['alleles'] = filter_df[['ref','alt']].values.tolist()
+
+# initialise Hail - 101 retries to cope with parallel processing
+hl.init(master="local[" + n_cpu + "]", default_reference=reference, spark_conf={"spark.port.maxRetries":"101"})
+
+# Construct matrix_table in Hail
+aggv2_mt = hl.import_vcf(vcf_files, reference_genome=reference)#, force_bgz=True)
+print("All variants: ", aggv2_mt.count())
+
+# Filter on CADD and ClinVar classification
+filter_mt = hl.Table.from_pandas(filter_df[['chrom','pos','alleles']])
+filter_mt = filter_mt.annotate(locus=hl.locus(filter_mt.chrom, filter_mt.pos)).key_by('locus','alleles')
+aggv2_mt = aggv2_mt.semi_join_rows(filter_mt)
+print("CADD & ClinVar filter: ", aggv2_mt.count())
+
+germline_hits_arr = np.zeros((len(genes), len(samples)), dtype=np.int64)
+for idx, row in genes.reset_index(drop=True).iterrows():
+
+    # Get genotype for gene
+    print(row.gene_id, f"{row['chrom']}:{row['start']}-{row['end']}")
+    gene_mt = hl.filter_intervals(aggv2_mt, [hl.parse_locus_interval(f"{row['chrom']}:{row['start']}-{row['end']}",
+                                                                     reference_genome=reference)])
+    if gene_mt.count()[0]==0:
+        print(f"{row['gene_id']} - empty")
+        continue
+
+    print(f"{row['gene_id']} - not empty")
+
+    # Get sum of genotype for each sample within gene
+    agg_sum = gene_mt.annotate_cols(n_variants_sum=hl.agg.sum(gene_mt.GT.n_alt_alleles()))
+    mutated = agg_sum.cols().to_pandas()
+
+    # Save counts for this gene (samples with no qualifying variants get 0)
+    germline_hits_arr[idx] = pd.merge(samples, mutated, how='left',
+                                      left_on='germline_sample_platekey', right_on='s').n_variants_sum.fillna(0)
+
+germline_hits_df = pd.DataFrame(germline_hits_arr, index=genes.gene_id, columns=samples.germline_sample_platekey)
+germline_hits_df.to_csv(f"aggv2-germline_cancer-gene_hits_{chromosome}.tsv", sep="\t", header=True, index=True)
+
+filter_df.to_csv(f"CADD_ClinVar_filter_df_{chromosome}.tsv", sep="\t", header=True, index=False)
\ No newline at end of file
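For readers skimming `germline_hail.py`, this toy fragment shows the CADD/ClinVar pathogenicity filter in isolation — the same rule (CADD PHRED above threshold, no mention of "benign" in CLNSIG), rephrased with pandas string matching and made-up values:

```python
# Keep variants with high CADD PHRED whose ClinVar significance does not
# mention "benign" (case-insensitive). Missing CLNSIG behaves like a non-match,
# mirroring the astype(str) handling in germline_hail.py.
import pandas as pd

variants = pd.DataFrame({
    "phred": [35.0, 12.0, 28.0, 30.0],
    "CLNSIG": ["Pathogenic", "Pathogenic", "Likely_benign", None],
})
keep = (variants.phred > 20) & \
       ~variants.CLNSIG.astype(str).str.contains("benign", case=False)
print(variants[keep])  # rows 0 and 3 survive
```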
diff --git a/src/pipelines/cancGeneHits/cancLOH.nf/main.nf b/src/pipelines/cancGeneHits/cancLOH.nf/main.nf
new file mode 100755
index 0000000..fdcc340
--- /dev/null
+++ b/src/pipelines/cancGeneHits/cancLOH.nf/main.nf
@@ -0,0 +1,75 @@
+#!/usr/bin/env nextflow
+// cancLOH.nf pipeline
+// written by Andy Everall
+//
+
+// help message
+def helpMessage() {
+    log.info"""
+    Mandatory arguments:
+      --filename_battenberg_list  List of battenberg files - tsv file with tumour_sample_platekey, filename_batt
+      --filename_sample_list      List of samples to extract
+      --filename_genes            Gene loci to extract - tsv file with chrom, start, end, ID
+      --dir_output                Directory to save results
+
+    Optional arguments:
+      --projectName               LSF project name that jobs are submitted with (e.g. re_gecip_cancer_colorectal for CRC GeCIP). Default: re_gecip_cancer_pan
+      --run_name                  Identifying name given to the run (usually would explain why it is different from previous runs). Default: testrun
+      --run_meta                  Description of the run and what makes it different to previous runs. Default: ''
+    """.stripIndent()
+}
+if (params.help){
+    helpMessage()
+    exit 0
+}
+
+// Samples channel
+file_samples=file(params.filename_sample_list)
+
+// run time 1-10 hours with 9500 samples
+// Channel.fromPath(params.filename_sample_list).splitCsv(by: params.chunk_size).randomSample(2).set{ chunk_ch }
+process split_samples {
+    tag "Split samples"
+    label 'standard'
+
+    input:
+    file(sample_list) from file_samples
+
+    output:
+    path 'sample_chunk_*' into sample_chunks_ch
+
+    """
+    split -l ${params.chunk_size} --numeric-suffixes ${sample_list} sample_chunk_
+    """
+
+}
+sample_chunks_ch2 = sample_chunks_ch.flatMap()//.randomSample(2)
+
+process loh {
+    tag "Extract LOH mutations"
+    label 'standard' // 'short_job'
+
+    input:
+    file(sample_list) from sample_chunks_ch2
+
+    output:
+    // file("loh_matrix.tsv") into loh_hits_ch
+    file("loh_fraction.tsv") into loh_frac_ch
+    file("n_overlapping_cnv.tsv") into n_cnv_ch
+    file("loh_number_size.tsv") into number_size_ch
+
+    module 'bio/BCFtools:singularity/3.2.1:lang/Anaconda3/2021.11'
+
+    """
+    # Run LOH python script - searches for LOH in the Battenberg output
+    python ${baseDir}/scripts/loh.py \
+        ${sample_list} \
+        ${params.filename_battenberg_list} \
+        ${params.filename_genes}
+    """
+}
+// loh_hits_ch.collectFile(name: "lohclonal_gene_sample_matrix.tsv", keepHeader: true, skip: 1, storeDir: "${params.dir_output}")
+loh_frac_ch.collectFile(name: "lohfrac_gene_sample_matrix.tsv", keepHeader: true, skip: 1, storeDir: "${params.dir_output}")
+n_cnv_ch.collectFile(name: "ncnv_gene_sample_matrix.tsv", keepHeader: true, skip: 1, storeDir: "${params.dir_output}")
+number_size_ch.collectFile(name: "loh_number_size.tsv", keepHeader: true, skip: 1, storeDir: "${params.dir_output}")
diff --git a/src/pipelines/cancGeneHits/cancLOH.nf/nextflow.config b/src/pipelines/cancGeneHits/cancLOH.nf/nextflow.config
new file mode 100755
index 0000000..fe0f26a
--- /dev/null
+++ b/src/pipelines/cancGeneHits/cancLOH.nf/nextflow.config
@@ -0,0 +1,68 @@
+// pipeline information
+manifest {
+    author = 'Andy Everall'
+    description = 'Retrieve loss of heterozygosity in cancer genes'
+    mainScript = 'main.nf'
+    name = 'cancLOH.nf'
+    nextflowVersion = '21.04.3'
+    version = '0.0.1'
+}
+
+// environmental variables
+// env {
+// }
+
+// default global parameters
+params {
+    // options: generic
+    help = false
+
+    // options: LSF
+    projectName = 're_gecip_cancer_pan'
+
+    // CADD executor
+    cadd_cmd = '/re_gecip/shared_allGeCIPs/bkinnersley/CADD/CADD-scripts-master/CADD.sh'
+
+    // options: singularity
+    bind = '/nas/weka.gel.zone/resources:/resources,/nas/weka.gel.zone/resources/vscaler/ohpc:/opt/ohpc,/home:/home,/re_gecip:/re_gecip,/re_scratch/acornish/hail:/tmp,/gel_data_resources:/gel_data_resources'
+
+    // options: test run
+    max_signatures = 1000
+    run_name = "testrun"
+    run_meta = "" //"Age,PC1-3,purity covariates. PGS+GWAS downloaded traits. Zero Inflated Negative Binomial model with Poisson regression backup. Removed hypermutated. Primary tumour except in Skin where Meta in skin also kept."
+}
+
+// default process parameters
+process {
+    executor = 'lsf'
+    clusterOptions = "-P ${params.projectName}" // should not specify -R "span[hosts=1]", as this is automatically included when cpus > 1
+    // errorStrategy = 'retry'
+    // maxRetries = 3
+    beforeScript = { task.attempt <= 1 ? 'sleep 1' : 'sleep 30' } // negate delay in file availability
+    withLabel: 'regular|standard' {
+        executor = 'local'
+        memory = { task.attempt <= 1 ? '16 GB' : '32 GB' }
+        queue = 'short'
+    }
+    withLabel: short_job {
+        time = { task.attempt <= 2 ? '4h' : '24h' }
+        cpus = { task.attempt <= 1 ? '2' : '4' }
+        memory = { task.attempt <= 1 ? '16 GB' : '32 GB' }
+        queue = { task.attempt <= 2 ? 'short' : 'medium' }
+    }
+}
+
+// default job submission parameters
+executor {
+    submitRateLimit = '5 sec'
+    $lsf { queueSize = 50 }
+    $local { queueSize = 50 }
+}
+
+timeline {
+    enabled=true
+}
+
+report {
+    enabled=true
+}
diff --git a/src/pipelines/cancGeneHits/cancLOH.nf/scripts/loh.py b/src/pipelines/cancGeneHits/cancLOH.nf/scripts/loh.py
new file mode 100755
index 0000000..5fdadc0
--- /dev/null
+++ b/src/pipelines/cancGeneHits/cancLOH.nf/scripts/loh.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+
+## Implement the ALFRED analysis in GEL using the Battenberg calls to identify LOH
+'''
+    Input:
+        sample_list
+        gene_list - four columns: chrom, start, end, gene_id
+        battenberg_list - two columns: tumour_sample_platekey, filename_batt
+    Output:
+        File where each row is a gene, column for participant_id, with 1=LoH in this gene, 0=no LoH
+'''
+import pandas as pd, numpy as np
+import sys
+
+#python 1_loh_calc.py $battenberg_id $updated_germline $updated_somatic ${params.ref_gene}
+# participant_id, battenberg_dir, cadd_region_file, gene_db, gene_list = sys.argv[1:]
+sample_list_file, battenberg_list_file, gene_list_file = sys.argv[1:]
+
+# Read in samples
+samples = pd.read_csv(sample_list_file, sep="\t", names=['tumour_sample_platekey'])
+battenberg_list = pd.read_csv(battenberg_list_file, sep="\t")
+battenberg_list = battenberg_list[battenberg_list.filename_batt.astype(str)!='nan']
+samples = pd.merge(samples, battenberg_list, how='inner', on='tumour_sample_platekey')
+print(f"Samples: {len(samples)}")
+
+# Read in genes
+genes = pd.read_csv(gene_list_file, sep="\t", usecols=['chrom', 'start', 'end', 'gene_id'])
+genes = genes[genes.start.astype(str)!='nan']
+genes = genes.astype({'start':int,'end':int})
+genes.chrom = genes.chrom.str[3:]
+genes = genes.reset_index(drop=True)
+
+# Results table
+# loh_results=np.zeros((len(genes), len(samples)), dtype=np.int16)#, columns=genes.gene_id, index=samples.tumour_sample_platekey) # pd.DataFrame(index=genes.gid)
+loh_fraction_arr=np.zeros((len(genes), len(samples)), dtype=np.float64)
+n_cnv=np.zeros((len(genes), len(samples)), dtype=np.float64)
+n_loh = np.zeros(len(samples), dtype=int)
+n_loh50 = np.zeros(len(samples), dtype=int)
+n_loh100 = np.zeros(len(samples), dtype=int)
+size_loh = np.zeros(len(samples), dtype=int)
+
+# Columns to load from battenberg
+battenberg_cols = ['chr','startpos','endpos']\
+                  +[f'nMaj{i}_A' for i in range(1,3)]\
+                  +[f'nMin{i}_A' for i in range(1,3)]\
+                  +[f'frac{i}_A' for i in range(1,3)]
+
+for idx, row in samples.iterrows():
+
+    # Import battenberg data
+    battenberg_df = pd.read_csv(row['filename_batt'], sep="\t", usecols=battenberg_cols).fillna(0.)
+
+    # Get intersections between battenberg mutations and genes
+    batt_chrchr, gene_chrchr = np.meshgrid(battenberg_df.chr, genes.chrom)
+    batt_startstart, gene_startstart = np.meshgrid(battenberg_df.startpos, genes.start)
+    batt_endend, gene_endend = np.meshgrid(battenberg_df.endpos, genes.end)
+    batt_gene_overlap = (batt_chrchr==gene_chrchr)&(batt_startstart<gene_endend)&(batt_endend>gene_startstart)
+
+    # Number of mutations overlapping gene
+    n_cnv[:,idx]=np.sum(batt_gene_overlap.astype(int), axis=1)
+
+    # Fraction of cells with LOH
+    loh_fraction = np.nansum( np.array([battenberg_df[f"frac{i}_A"]*((battenberg_df[f"nMaj{i}_A"]==0)|(battenberg_df[f"nMin{i}_A"]==0)).astype(float) \
+                                        for i in range(1,3)]), axis=0)
+
+    loh_fraction_arr[:,idx]=np.nanmax( batt_gene_overlap.astype(float) * loh_fraction[None,:], axis=1 )
+
+    # Number and total size of LoH events
+    n_loh[idx] = len(battenberg_df)
+    n_loh50[idx] = np.sum(loh_fraction>0.5)
+    n_loh100[idx] = np.sum(loh_fraction==1)
+    size_loh[idx] = np.sum(battenberg_df['endpos']-battenberg_df['startpos'])
+
+# loh_results = pd.DataFrame(loh_results.T, columns=genes.gene_id, index=samples.tumour_sample_platekey)
+# loh_results.to_csv(f"loh_matrix.tsv", sep="\t", header=True, index=True)
+
+loh_number_size = pd.DataFrame({"n_loh":n_loh, "n_loh50":n_loh50, "n_loh100":n_loh100, "size_loh":size_loh},
+                               index=samples.tumour_sample_platekey)
+loh_number_size.to_csv(f"loh_number_size.tsv", sep="\t", header=True, index=True)
+
+loh_fraction_arr = pd.DataFrame(loh_fraction_arr.T, columns=genes.gene_id, index=samples.tumour_sample_platekey)
+loh_fraction_arr.to_csv(f"loh_fraction.tsv", sep="\t", header=True, index=True)
+
+n_cnv = pd.DataFrame(n_cnv.T, columns=genes.gene_id, index=samples.tumour_sample_platekey)
+n_cnv.to_csv(f"n_overlapping_cnv.tsv", sep="\t", header=True, index=True)
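The meshgrid overlap test in `loh.py` is the standard interval rule: a copy-number segment and a gene overlap iff each starts before the other ends. A tiny self-check with toy coordinates:

```python
# Segment [s1, e1) and gene [s2, e2) overlap iff s1 < e2 and e1 > s2.
import numpy as np

seg_start, seg_end = np.array([100, 500]), np.array([300, 900])
gene_start, gene_end = np.array([250, 1000]), np.array([600, 1200])
ss, gs = np.meshgrid(seg_start, gene_start)
se, ge = np.meshgrid(seg_end, gene_end)
overlap = (ss < ge) & (se > gs)
print(overlap)  # gene x segment matrix: [[True, True], [False, False]]
```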
diff --git a/src/pipelines/cancGeneHits/cancOncoKB.nf/main.nf b/src/pipelines/cancGeneHits/cancOncoKB.nf/main.nf
new file mode 100755
index 0000000..dd9a5c0
--- /dev/null
+++ b/src/pipelines/cancGeneHits/cancOncoKB.nf/main.nf
@@ -0,0 +1,63 @@
+#!/usr/bin/env nextflow
+
+// help message
+def helpMessage() {
+    log.info"""
+    Mandatory arguments:
+      --filename_oncokb_list      List of sample OncoKB files to extract
+      --filename_genes            Gene loci to extract - tsv file with chromosome, start, end, ID
+      --dir_output                Directory to save results
+
+    Optional arguments:
+      --projectName               LSF project name that jobs are submitted with (e.g. re_gecip_cancer_colorectal for CRC GeCIP). Default: re_gecip_cancer_pan
+      --max_signatures            Max number of signatures to run in the method - use a small number when testing. Default: 1000
+      --n_samples                 Number of samples to process (put in a small number if doing a trial run)
+      --chunk_size                Number of samples per nextflow process
+    """.stripIndent()
+}
+if (params.help){
+    helpMessage()
+    exit 0
+}
+
+
+// parameters
+file_samples = file(params.filename_oncokb_list)
+
+process split {
+    tag "Split files"
+    label 'standard'//'short_job'
+
+    input:
+    file(samples) from file_samples
+
+    output:
+    path 'sample_chunks_*' into file_chunks
+
+    """
+    HEADER=\$(head -1 ${samples})
+
+    tail -n +2 ${samples} | split -l ${params.chunk_size} - sample_chunks_
+    for i in sample_chunks_*; do
+        sed -i -e "1i\$HEADER" "\$i"
+    done
+    """
+}
+
+process mutations {
+    tag "Extract mutations"
+    label 'standard'
+
+    input:
+    file(oncokb_list) from file_chunks.flatten()
+
+    output:
+    file("OncoKB_somatic_hits.tsv") into somatic_hits_ch
+
+    module 'lang/Anaconda3/2021.11'
+
+    """
+    python ${baseDir}/scripts/collectOncoKB.py ./ ${oncokb_list} ${params.filename_genes}
+    """
+}
+somatic_hits_ch.collectFile(name: "OncoKB_somatic_cancer-gene_hits.tsv", keepHeader: true, skip: 1, storeDir: "${params.dir_output}")
diff --git a/src/pipelines/cancGeneHits/cancOncoKB.nf/nextflow.config b/src/pipelines/cancGeneHits/cancOncoKB.nf/nextflow.config
new file mode 100755
index 0000000..04af00d
--- /dev/null
+++ b/src/pipelines/cancGeneHits/cancOncoKB.nf/nextflow.config
@@ -0,0 +1,60 @@
+// pipeline information
+manifest {
+    author = 'Andy Everall'
+    description = 'Collect OncoKB annotations of somatic mutations in cancer genes'
+    mainScript = 'main.nf'
+    name = 'cancOncoKB.nf'
+    nextflowVersion = '21.04.3'
+    version = '0.0.1'
+}
+
+// environmental variables
+// env {
+// }
+
+// default global parameters
+params {
+    // options: generic
+    help = false
+
+    // options: LSF
+    projectName = 're_gecip_cancer_pan'
+
+    // options: test run
+    run_name = "testrun"
+}
+
+// default process parameters
+process {
+    executor = 'lsf'
+    clusterOptions = "-P ${params.projectName}" // should not specify -R "span[hosts=1]", as this is automatically included when cpus > 1
+    // errorStrategy = 'retry'
+    // maxRetries = 3
+    beforeScript = { task.attempt <= 1 ? 'sleep 1' : 'sleep 30' } // negate delay in file availability
+    withLabel: 'regular|standard' {
+        executor = 'local'
+        memory = { task.attempt <= 1 ? '16 GB' : '32 GB' }
+        queue = 'short'
+    }
+    withLabel: short_job {
+        time = { task.attempt <= 2 ? '4h' : '24h' }
+        cpus = { task.attempt <= 1 ? '2' : '4' }
+        memory = { task.attempt <= 1 ? '32 GB' : '64 GB' }
+        queue = { task.attempt <= 2 ? 'short' : 'medium' }
+    }
+}
+
+// default job submission parameters
+executor {
+    submitRateLimit = '5 sec'
+    $lsf { queueSize = 50 }
+    $local { queueSize = 50 }
+}
+
+timeline {
+    enabled=true
+}
+
+report {
+    enabled=true
+}
diff --git a/src/pipelines/cancGeneHits/cancOncoKB.nf/scripts/PRIMARY_ALL_OncoKB.sh b/src/pipelines/cancGeneHits/cancOncoKB.nf/scripts/PRIMARY_ALL_OncoKB.sh
new file mode 100755
index 0000000..9871a9c
--- /dev/null
+++ b/src/pipelines/cancGeneHits/cancOncoKB.nf/scripts/PRIMARY_ALL_OncoKB.sh
@@ -0,0 +1,55 @@
+
+# Run OncoKB
+# Runs OncoKB for every single individual in the cohort. We later perform additional processing.
+#
+
+
+username=aeverall
+project_dir=/re_scratch/re_gecip/cancer_pan/${username}/projects/cancgene_sigs
+output_dir=/re_gecip/cancer_pan/${username}/data/
+
+module purge
+export PATH=/usr/share/lsf/10.1/linux3.10-glibc2.17-x86_64/etc:/usr/share/lsf/10.1/linux3.10-glibc2.17-x86_64/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/home/rculliford/.local/bin:/home/rculliford/bin
+
+cd ${project_dir}
+# /re_gecip/cancer_renal_cell/analysisResults/6.driverAnalysis/Updated_OncoKB_Annotation/VEP/nextflow_work_dirs
+
+module load singularity/3.2.1
+
+/re_gecip/shared_allGeCIPs/bkinnersley/nextflow/nextflow run /re_gecip/cancer_renal_cell/rculliford/driver_analysis/Scripts/Initiation_Scripts/run_vep_CADD_UTRannotate_pancan_UsingRAWCADD.nf \
+--input_samples /re_gecip/cancer_renal_cell/analysisResults/6.driverAnalysis/Updated_OncoKB_Annotation/SampleLists/RCC_input_VEP.tsv \
+--output_dir /re_gecip/cancer_renal_cell/analysisResults/6.driverAnalysis/VEP_Annotated_VCF/ \
+-c /re_gecip/shared_allGeCIPs/bkinnersley/intogen/rerun_VEP/run_vep.conf -resume -with-report -with-trace
+
+
+#module load lang/R/4.1.0-foss-2019b
+
+mkdir -p /re_gecip/cancer_renal_cell/analysisResults/6.driverAnalysis/VEP_Annotated_VCF/
+mkdir -p /re_gecip/cancer_renal_cell/analysisResults/6.driverAnalysis/Updated_OncoKB_Annotation/VEP/nextflow_work_dirs
+
+module purge
+export PATH=/usr/share/lsf/10.1/linux3.10-glibc2.17-x86_64/etc:/usr/share/lsf/10.1/linux3.10-glibc2.17-x86_64/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/home/rculliford/.local/bin:/home/rculliford/bin
+
+cd /re_gecip/cancer_renal_cell/analysisResults/6.driverAnalysis/Updated_OncoKB_Annotation/VEP/nextflow_work_dirs
+
+module load singularity/3.2.1
+
+/re_gecip/shared_allGeCIPs/bkinnersley/nextflow/nextflow run /re_gecip/cancer_renal_cell/rculliford/driver_analysis/Scripts/Initiation_Scripts/run_vep_CADD_UTRannotate_pancan_UsingRAWCADD.nf \
+--input_samples /re_gecip/cancer_renal_cell/analysisResults/6.driverAnalysis/Updated_OncoKB_Annotation/SampleLists/RCC_input_VEP.tsv \
+--output_dir /re_gecip/cancer_renal_cell/analysisResults/6.driverAnalysis/VEP_Annotated_VCF/ \
+-c /re_gecip/shared_allGeCIPs/bkinnersley/intogen/rerun_VEP/run_vep.conf -resume -with-report -with-trace
+
+## --input_samples /re_gecip/cancer_colorectal/analysisResults/13.driverAnalysis/OncoKB_annotation/CRC_v8_ALL_mut_input_for_nextflow.tsv
+
+mkdir -p /re_gecip/cancer_renal_cell/analysisResults/6.driverAnalysis/Updated_OncoKB_Annotation/output
+mkdir -p /re_gecip/cancer_renal_cell/analysisResults/6.driverAnalysis/Updated_OncoKB_Annotation/nextflow_work_dirs
+
+cd /re_gecip/cancer_renal_cell/analysisResults/6.driverAnalysis/Updated_OncoKB_Annotation/nextflow_work_dirs
+
+module load singularity/3.2.1
+
+/re_gecip/shared_allGeCIPs/bkinnersley/nextflow/nextflow run \
+/re_gecip/shared_allGeCIPs/bkinnersley/intogen/OncoKB/run_OncoKB_mutations.nf -resume -with-report -with-trace \
+--input_samples /re_gecip/cancer_renal_cell/analysisResults/6.driverAnalysis/Updated_OncoKB_Annotation/SampleLists/RCC_input_oncokb.tsv \
+--output_dir /re_gecip/cancer_renal_cell/analysisResults/6.driverAnalysis/Updated_OncoKB_Annotation/output \
+-c /re_gecip/shared_allGeCIPs/bkinnersley/intogen/OncoKB/run_OncoKB.conf
diff --git a/src/pipelines/cancGeneHits/cancOncoKB.nf/scripts/collectOncoKB.py b/src/pipelines/cancGeneHits/cancOncoKB.nf/scripts/collectOncoKB.py
new file mode 100755
index 0000000..e067c4d
--- /dev/null
+++ b/src/pipelines/cancGeneHits/cancOncoKB.nf/scripts/collectOncoKB.py
@@ -0,0 +1,50 @@
+#
+# Collect OncoKB results from pan cancer analysis
+#
+
+import sys
+import numpy as np, pandas as pd, re
+
+
+if __name__=='__main__':
+
+    print(sys.argv[1:])
+    dir_output, oncokb_file, gene_list_file = sys.argv[1:]
+
+    # Sample list
+    sample_list = pd.read_csv(oncokb_file, sep="\t")
+    print(sample_list.head())
+
+    # Gene list
+    genes = np.array(pd.read_csv(gene_list_file, sep="\t", usecols=['gene_id']).gene_id)
+
+    # Iterate through samples and get gene mutations
+    mutation_df = pd.DataFrame(np.zeros((len(genes), len(sample_list)), dtype=int), columns=sample_list.tumour_sample_platekey, index=genes)
+    unique_entries = np.array([])
+    unique_oncogenic = np.array([])
+    oncogenic = np.vectorize(lambda x: bool(re.match('.*oncogenic.*', x, re.IGNORECASE)))
+    for i, row in sample_list.iterrows():
+
+        tumour_sample_platekey = row['tumour_sample_platekey']
+        annotations = pd.read_csv(row['file'], sep="\t",
+                                  usecols=['hugoSymbol','oncogenic','knownEffect'])\
+                        .rename({'hugoSymbol':'gene'}, axis=1)
+        if len(annotations)==0:
+            continue
+        unique_entries = np.unique(np.hstack((unique_entries, annotations.oncogenic)))
+        # Filter to only rows which are oncogenic
+        annotations = annotations[oncogenic(annotations.oncogenic)]
+        unique_oncogenic = np.unique(np.hstack((unique_oncogenic, annotations.oncogenic)))
+
+        # mutation_df[tumour_sample_platekey] = np.zeros(len(genes), dtype=int)
+        unique_genes, unique_counts = np.unique(annotations.gene, return_counts=True)
+        match_genes, unique_indices, _ = np.intersect1d(unique_genes, genes, return_indices=True)
+        mutation_df[tumour_sample_platekey].loc[match_genes] = unique_counts[unique_indices]
+
+        print(f"{int(100*i/len(sample_list))}%", end="\r")
+    print("")
+
+    print("Unique entries: ", unique_entries)
+    print("Oncogenic entries: ", unique_oncogenic)
+
+    mutation_df.T.to_csv(f"{dir_output}/OncoKB_somatic_hits.tsv", header=True, index=True, sep="\t")
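The gene-by-sample tally in `collectOncoKB.py` reduces to counting oncogenic annotation rows per census gene; here is the core step on a toy annotation table (column names follow the script, values are invented):

```python
# Count oncogenic annotations per cancer gene for a single sample.
import numpy as np
import pandas as pd

genes = np.array(["TP53", "KRAS", "BRCA2"])            # census genes
annotations = pd.DataFrame({                           # one sample's OncoKB rows
    "gene": ["TP53", "TP53", "EGFR"],
    "oncogenic": ["Oncogenic", "Likely Oncogenic", "Oncogenic"],
})
hit_genes, counts = np.unique(annotations.gene, return_counts=True)
sample_col = pd.Series(0, index=genes)
matched, idx, _ = np.intersect1d(hit_genes, genes, return_indices=True)
sample_col.loc[matched] = counts[idx]
print(sample_col)  # TP53 -> 2, KRAS -> 0, BRCA2 -> 0 (EGFR not a census gene)
```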
diff --git a/src/pipelines/cancGeneHits/clinvarCadd.nf/main.nf b/src/pipelines/cancGeneHits/clinvarCadd.nf/main.nf
new file mode 100755
index 0000000..06311f3
--- /dev/null
+++ b/src/pipelines/cancGeneHits/clinvarCadd.nf/main.nf
@@ -0,0 +1,117 @@
+// help message
+def helpMessage() {
+    log.info"""
+    Mandatory arguments:
+      --filename_genes            Gene loci to extract - tsv file with chromosome, start, end, ID
+      --filename_clinvar_vcf      vcf file of clinvar mutations
+      --dir_output                Directory to save results
+
+    Optional arguments:
+      --projectName               LSF project name that jobs are submitted with (e.g. re_gecip_cancer_colorectal for CRC GeCIP). Default: re_gecip_cancer_pan
+      --n_files                   Number of vcf files to process (put in a small number if doing a trial run)
+    """.stripIndent()
+}
+if (params.help){
+    helpMessage()
+    exit 0
+}
+
+// Input channels
+chrom_ch = Channel.from(1..22).concat(Channel.of('X'))
+file_genes = file(params.filename_genes)
+
+process clinvar {
+    tag "Filter ClinVar"
+    label 'standard'
+
+    input:
+    val(chromosome) from chrom_ch
+
+    output:
+    set env(chr_label), file("clinvar_filtered_*.vcf") into clinvar_filtered_ch
+
+    module 'bio/BCFtools'
+
+    """
+    chr_label='chr${chromosome}'
+    echo -e '${chromosome} chr${chromosome}\n' > rename_chrs.txt
+
+    # Select gene regions on chromosome
+    grep -P '^chr${chromosome}\t' ${params.filename_genes} | sed 's/^chr//g' > chr${chromosome}_gene_regions.txt
+    # Extract variants in the gene regions (the pathogenic/likely-pathogenic significance filter is disabled here; see the -i expression below)
+    bcftools view -R chr${chromosome}_gene_regions.txt ${params.filename_clinvar_vcf} | \
+    bcftools annotate --rename-chrs rename_chrs.txt > clinvar_filtered_chr${chromosome}.vcf
+    """
+    // -i 'CLNSIG="Pathogenic/i"|CLNSIG="Likely_pathogenic"|(CLNSIGCONF~"Pathogenic/i"&CLNSIGCONF!~"Benign/i")'
+}
+
+process cadd {
+    tag "Run CADD"
+    label 'standard'
+
+    input:
+    set chromosome, vcf_file from clinvar_filtered_ch
+
+    output:
+    set chromosome, vcf_file, file("*.cadd.txt.gz") into caddout_ch
+
+    module 'bio/BCFtools:tools/snakemake/5.7.1-foss-2019a-Python-3.7.2:lang/Anaconda3/2021.11'
+
+    """
+    CADD_OUT="${params.dir_output}/cadd_output/${chromosome}.cadd.txt.gz"
+
+    # For renaming chromosomes (CADD only uses numbers without chr)
+    chrom=${chromosome}
+    echo -e "${chromosome} \${chrom#chr}" > rename_chrs.txt
+
+    echo "bcftools annotate to change chromosome label format from chrX to X"
+    bcftools annotate --rename-chrs rename_chrs.txt ${vcf_file} > clinvar_${chromosome}_reformat.vcf
+
+    # TODO: set core number equal to number of cpus
+    echo "Run CADD"
+    bash ${params.cadd_cmd} -c 30 -g GRCh38 -o clinvar_${chromosome}.cadd.txt.gz clinvar_${chromosome}_reformat.vcf
+    """
+}
+
+// Group genotypes by chromosome
+// caddout_ch.collect().into { cadd_grouped_ch; view_ch }
+
+process germline_hits {
+    tag "Extract mutations"
+    label 'standard'
+
+    input:
+    tuple val(chromosome), file(clinvar_vcf), file(clinvar_cadd) from caddout_ch
+
+    output:
+    file("clinvar.tsv") into clinvar_tsv_ch
+    file("CADD_ClinVar.tsv") into cadd_gene_ch
+
+    module 'bio/BCFtools:singularity/3.2.1:lang/Anaconda3/2021.11'
+
+    """
+    echo "${chromosome}"
+    # Activate python virtual environment
+    # TODO: use conda environment from .env
+
+    echo "Get ClinVar data table from vcf"
+    echo -e 'CHROM\tPOS\tID\tREF\tALT\tAF_ESP\tAF_EXAC\tAF_TGP\tALLELEID\tCLNDN\tCLNDNINCL\tCLNDISDB\t\
+CLNDISDBINCL\tCLNHGVS\tCLNREVSTAT\tCLNSIG\tCLNSIGCONF\tCLNSIGINCL\t\
+CLNVC\tCLNVCSO\tCLNVI\tDBVARID\tGENEINFO\tMC\tORIGIN\tRS\tSSR' > clinvar.tsv
+    bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%INFO/AF_ESP\t%INFO/AF_EXAC\t%INFO/AF_TGP\t%INFO/ALLELEID\t%INFO/CLNDN\t%INFO/CLNDNINCL\t%INFO/CLNDISDB\t%INFO/\
+CLNDISDBINCL\t%INFO/CLNHGVS\t%INFO/CLNREVSTAT\t%INFO/CLNSIG\t%INFO/CLNSIGCONF\t%INFO/CLNSIGINCL\t%INFO/\
+CLNVC\t%INFO/CLNVCSO\t%INFO/CLNVI\t%INFO/DBVARID\t%INFO/GENEINFO\t%INFO/MC\t%INFO/ORIGIN\t%INFO/RS\t%INFO/SSR\n' ${clinvar_vcf} >> clinvar.tsv
+
+    # Import to Hail format
+    # export PYSPARK_SUBMIT_ARGS='--driver-memory ${task.cpus}0G pyspark-shell --conf spark.port.maxRetries=50'
+    # singularity exec -B ${params.bind} ${baseDir}/singularity/hail.img /usr/bin/
+    python3 ${baseDir}/scripts/clinvar_hail.py \
+        ${chromosome} \
+        ${clinvar_cadd} \
+        ${params.filename_genes} \
+        1 \
+        'GRCh38'
+    """
+}
+cadd_gene_ch.collectFile(name: "CADD_ClinVar_all.tsv", keepHeader: true, skip: 1, storeDir: "${params.dir_output}")
+clinvar_tsv_ch.collectFile(name: "clinvar_all.tsv", keepHeader: true, skip: 1, storeDir: "${params.dir_output}")
diff --git a/src/pipelines/cancGeneHits/clinvarCadd.nf/nextflow.config b/src/pipelines/cancGeneHits/clinvarCadd.nf/nextflow.config
new file mode 100755
index 0000000..228536c
--- /dev/null
+++ b/src/pipelines/cancGeneHits/clinvarCadd.nf/nextflow.config
@@ -0,0 +1,65 @@
+// pipeline information
+manifest {
+    author = 'Andy Everall'
+    description = 'Annotate ClinVar variants in cancer genes with CADD scores'
+    mainScript = 'main.nf'
+    name = 'clinvarCadd.nf'
+    nextflowVersion = '21.04.3'
+    version = '0.0.1'
+}
+
+// environmental variables
+// env {
+// }
+
+// default global parameters
+params {
+    // options: generic
+    help = false
+
+    // options: LSF
+    projectName = 're_gecip_cancer_pan'
+
+    // CADD executor
+    cadd_cmd = '/re_gecip/shared_allGeCIPs/bkinnersley/CADD/CADD-scripts-master/CADD.sh'
+    cadd_min = 20
+
+    // options: singularity
+    bind = '/nas/weka.gel.zone/resources:/resources,/nas/weka.gel.zone/resources/vscaler/ohpc:/opt/ohpc,/home:/home,/re_gecip:/re_gecip,/re_scratch/acornish/hail:/tmp,/gel_data_resources:/gel_data_resources'
+
+    // options: test run
+    max_signatures = 1000
+}
+
+// default process parameters
+process {
+    executor = 'lsf'
+    clusterOptions = "-P ${params.projectName}" // should not specify -R "span[hosts=1]", as this is automatically included when cpus > 1
+    beforeScript = { task.attempt <= 1 ? 'sleep 1' : 'sleep 30' } // negate delay in file availability
+    withLabel: 'regular|standard' {
+        executor = 'local'
+        memory = { task.attempt <= 1 ? '16 GB' : '32 GB' }
+        queue = 'short'
+    }
+    withLabel: short_job {
+        time = { task.attempt <= 2 ? '4h' : '24h' }
+        cpus = { task.attempt <= 1 ? '2' : '4' }
+        memory = { task.attempt <= 1 ? '32 GB' : '64 GB' }
+        queue = { task.attempt <= 2 ? 'short' : 'medium' }
+    }
+}
+
+// default job submission parameters
+executor {
+    submitRateLimit = '5 sec'
+    $lsf { queueSize = 50 }
+    $local { queueSize = 50 }
+}
+
+timeline {
+    enabled=true
+}
+
+report {
+    enabled=true
+}
diff --git a/src/pipelines/cancGeneHits/clinvarCadd.nf/scripts/clinvar_hail.py b/src/pipelines/cancGeneHits/clinvarCadd.nf/scripts/clinvar_hail.py
new file mode 100755
index 0000000..60da4f0
--- /dev/null
+++ b/src/pipelines/cancGeneHits/clinvarCadd.nf/scripts/clinvar_hail.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+## Annotate CADD-scored ClinVar variants with the cancer genes they fall in
+import pandas as pd, numpy as np
+import sys
+
+# Input arguments
+chromosome, cadd_file, gene_file, n_cpu, reference = sys.argv[1:]
+print(sys.argv[1:])
+
+# Load list of CADD files
+cadd_df = pd.read_csv(cadd_file, sep="\t", compression='gzip',
+                      comment='#', names=['chrom','pos','ref','alt','raw','phred'])
+cadd_df['chrom'] = "chr"+cadd_df.chrom.astype(str)
+
+# Read in genes - chrom,start,end,gene_id
+genes = pd.read_csv(gene_file, sep="\t", names=['chrom','start','end','gene_id']).fillna(0)
+genes['start'] = genes['start'].astype(int)
+genes['end'] = genes['end'].astype(int)
+genes = genes[genes.chrom==chromosome]
+
+# Get gene labels for CADD tsv file
+cadd_df_gene = pd.DataFrame(index=cadd_df.index)
+for idx,row in genes.iterrows():
+    print(row.gene_id)
+    cadd_df_gene[row.gene_id] = np.where((cadd_df['chrom']==row.chrom)&(cadd_df['pos']>row.start)&(cadd_df['pos']<row.end), 1, 0)
+
+# Save CADD scores with per-gene membership columns; collected by main.nf as CADD_ClinVar.tsv
+pd.concat([cadd_df, cadd_df_gene], axis=1).to_csv("CADD_ClinVar.tsv", sep="\t", header=True, index=False)
diff --git a/src/pipelines/signatures/signatureAssociations.nf/main.nf b/src/pipelines/signatures/signatureAssociations.nf/main.nf
new file mode 100755
--- /dev/null
+++ b/src/pipelines/signatures/signatureAssociations.nf/main.nf
+#!/usr/bin/env nextflow
+// signatureAssociations.nf pipeline
+// written by Andy Everall
+
+// Build (signature, group) test channels from the tests file
+Channel.from(file(params.filename_tests))
+    .splitCsv(by: 1, skip: 1, sep: "\t", limit: params.max_tests )
+    .groupTuple(by: [0,1])
+    .into{ test_glm_ch; test_nb_ch; test_binary_ch }
+Channel.from(file(params.filename_tests)).set{ test_hits_ch }
+
+process glm {
+    tag "Run fits using glm in R"
+    label 'short_job'//'standard'//
+
+    input:
+    tuple val(signature), val(group), val(targets), val(types) from test_glm_ch
+
+    output:
+    file("*_glm_results.csv") into glm_results_ch
+
+    """
+    echo "${group}, ${signature}"
+
+    # Get rows of test file for the given group signature combination
+    echo -e "signature\\tgroup\\ttarget\\ttype" > tests_subset.tsv
+    grep -P "${signature}\\t${group}" ${params.filename_tests} >> tests_subset.tsv
+
+    Rscript --vanilla ${projectDir}/scripts/zinb_regress.r \
+        ./ \
+        ${projectDir}/scripts \
+        tests_subset.tsv \
+        ${params.filename_samples} \
+        ${params.filename_signatures} \
+        ${params.filename_targets} \
+        ${params.resampling_method} \
+        ${params.power_analysis} \
+        true
+    mv glm_results.csv "${group}_${signature}_glm_results.csv"
+    """
+}
+// glm_summary_ch = glm_results_ch.collectFile(name: "signature_target_assoc_zinb.csv", keepHeader: true, skip: 1, storeDir: "${params.output_dir}")
+
+process negbin {
+    tag "Run fits using glm in R"
+    label 'short_job'//'standard'//
+
+    input:
+    tuple val(signature), val(group), val(targets), val(types) from test_nb_ch
+
+    output:
+    file("*_nb_results.csv") into nb_results_ch
+
+    """
+    echo "${group}, ${signature}"
+
+    # Get rows of test file for the given group signature combination
+    echo -e "signature\\tgroup\\ttarget\\ttype" > tests_subset.tsv
+    grep -P "${signature}\\t${group}" ${params.filename_tests} >> tests_subset.tsv
+
+    Rscript --vanilla ${projectDir}/scripts/zinb_regress.r \
+        ./ \
+        ${projectDir}/scripts \
+        tests_subset.tsv \
+        ${params.filename_samples} \
+        ${params.filename_signatures} \
+        ${params.filename_targets} \
+        ${params.resampling_method} \
+        ${params.power_analysis} \
+        false
+    mv glm_results.csv "${group}_${signature}_nb_results.csv"
+    """
+}
+// glm_summary_ch = glm_results_ch.collectFile(name: "signature_target_assoc_zinb.csv", keepHeader: true, skip: 1, storeDir: "${params.output_dir}")
+
+
+// // Get covariates file channel
+// Channel.from(file(params.filename_tests_binary)).set{ tests_binary_ch }
+// tests_binary_ch.splitCsv(by: 1, skip: 1, sep: "\t", limit: 100000 ) //header: ['group', 'signature', 'target'],
+//     .groupTuple(by: [0,1]).set{ test_binary_ch }
+process logistic {
+    tag "Run fits using logistic regression in R"
+    label 'short_job'//'standard'//
+
+    input:
+    tuple val(signature), val(group), val(targets), val(types) from test_binary_ch
+
+    output:
+    file("*_logistic_results.csv") into binary_results_ch
+
+    """
+    echo "${group}, ${signature}"
+
+
+    # Get rows of test file for the given group signature combination
+    echo -e "signature\\tgroup\\ttarget\\ttype" > tests_subset.tsv
+    grep -P "${signature}\\t${group}" ${params.filename_tests} >> tests_subset.tsv
+
+    Rscript --vanilla ${projectDir}/scripts/logistic_regress.r \
+        ./ \
+        ${projectDir}/scripts \
+        tests_subset.tsv \
+        ${params.filename_samples} \
+        ${params.filename_signatures} \
+        ${params.filename_targets} \
+        ZERO50 \
+        NULL
+    mv logistic_results.csv "${group}_${signature}_logistic_results.csv"
+    """
+}
+
+
+
+// Get covariates file channel
+test_hits_ch.splitCsv(by: 1, skip: 1, sep: "\t", limit: params.max_tests ) //header: ['group', 'signature', 'target'],
+    .groupTuple(by: [2,3])
+    .set{ test_hit_ch }
+
+process hithit {
+    tag "Regressing hits against one another in R"
+    label 'short_job'//'standard'//
+
+    input:
+    tuple val(signatures), val(group), val(target), val(type) from test_hit_ch
+
+    output:
+    file("*_target_results.csv") into target_results_ch
+
+    """
+    echo "${group}, ${target}, ${type}"
+
+    # Get rows of test file for the given group signature combination
+    echo -e "group\\ttarget" > tests_subset.tsv
+    cat ${params.filename_tests} | tail -n +2 | cut -d \$'\\t' -f2,3 | sort | uniq >> tests_subset.tsv
+    Rscript --vanilla ${projectDir}/scripts/target_regress.r \
+        ${projectDir}/scripts \
+        tests_subset.tsv \
+        ${params.filename_samples} \
+        ${params.filename_targets} \
+        ${target} ${type}
+    mv target_results.csv "${target}_target_results.csv"
+    """
+}
+
+// Concatenate all results together into single files
+glm_allresults_ch = glm_results_ch.collect()
+nb_allresults_ch = nb_results_ch.collect()
+binary_allresults_ch = binary_results_ch.collect()
+// lognmax_allresults_ch = lognmax_results_ch.collect()
+target_allresults_ch = target_results_ch.collect()
+process concatenate {
+    tag "Concatenate result files"
+    label 'short_job'//'standard'//
+
+    publishDir "${params.output_dir}", mode: "copy"
+
+    input:
+    file(glm_results) from glm_allresults_ch
+    file(nb_results) from nb_allresults_ch
+    file(logistic_results) from binary_allresults_ch
+    // file(lognmax_results) from lognmax_allresults_ch
+    file(target_results) from target_allresults_ch
+
+    output:
+    tuple file("signature_target_assoc_zinb.csv"), file("signature_target_assoc_logistic.csv"), file("signature_target_assoc_nb.csv"), file("target_target_assoc.csv") into summary_ch
+    // file("signature_target_assoc_lognmax.csv"),
+
+    """
+    Rscript --vanilla ${projectDir}/scripts/concatenate.r "." "*_glm_results.csv" "signature_target_assoc_zinb.csv"
+    Rscript --vanilla ${projectDir}/scripts/concatenate.r "." "*_nb_results.csv" "signature_target_assoc_nb.csv"
+    Rscript --vanilla ${projectDir}/scripts/concatenate.r "." "*_logistic_results.csv" "signature_target_assoc_logistic.csv"
+    # Rscript --vanilla ${projectDir}/scripts/concatenate.r "." "*_lognmax_results.csv" "signature_target_assoc_lognmax.csv"
+    Rscript --vanilla ${projectDir}/scripts/concatenate.r "." "*_target_results.csv" "target_target_assoc.csv"
+    """
+}
"*_target_results.csv" "target_target_assoc.csv" + """ +} + +process analysis { + tag "Run fits using glm in R" + label 'short_job'//'standard'// + + publishDir "${params.output_dir}/figures", pattern: "*.png", mode: "copy" + + errorStrategy "terminate" + + input: + tuple file(zinb_file), file(logistic_file), file(nb_file), file(target_file) from summary_ch + // file(lognmax_file), + + output: + file("*.png") into figures_ch + + """ + module load lang/Anaconda3/2021.11 + + mkdir -p "${params.output_dir}/figures" + python3 ${projectDir}/scripts/analysis.py ${zinb_file} ${logistic_file} + # \${lognmax_file} + """ +} diff --git a/src/pipelines/signatures/signatureAssociations.nf/nextflow.config b/src/pipelines/signatures/signatureAssociations.nf/nextflow.config new file mode 100755 index 0000000..d4e4327 --- /dev/null +++ b/src/pipelines/signatures/signatureAssociations.nf/nextflow.config @@ -0,0 +1,77 @@ +/* + * ------------------------------------------------- + * signatureAssociations.nf Nextflow config file + * written by Andy Everall + * ------------------------------------------------- + */ + +// pipeline information +manifest { + author = 'Andy Everall' + description = 'Run Signature Associations pipeline' + mainScript = 'main.nf' + name = 'prsSignatures.nf' + nextflowVersion = '21.04.3' + version = '0.0.1' +} + +// workDir='/re_scratch/re_gecip/cancer_pan/aeverall/prs_signatures/prsSignatures/work' +// tempDir='/re_scratch/re_gecip/cancer_pan/aeverall/prs_signatures/prsSignatures/temp' + +// environmental variables +// env { +// } + +// default global parameters +params { + // options: generic + help = false + executor = 'lsf' + + // options: LSF + projectName = 're_gecip_cancer_pan' + + // options: test run + max_signatures = 1000 + + // options: How to perform resampling - dCRT or bootstrap + resampling_method = 'dCRT' + power_analysis = true +} + +// default process parameters +process { + executor = params.executor + clusterOptions = "-P ${params.projectName}" // should not specify -R "span[hosts=1]", as this is automatically included when cpus > 1 + // errorStrategy = 'retry' // maxRetries = 3 + module = 'lang/R/4.1.0-foss-2019b' + beforeScript = { task.attempt <= 1 ? 'sleep 1' : 'sleep 30' } // negate delay in file availability + withLabel: 'regular|standard' { + executor = 'local' + memory = { task.attempt <= 1 ? '16 GB' : '32 GB' } + queue = 'short' + } + withLabel: short_job { + errorStrategy = { task.attempt <= 2 ? 'retry' : 'ignore' } + maxRetries = 4 + time = { task.attempt <= 1 ? '4h' : '24h' } + // cpus = { task.attempt <= 1 ? '2' : '4' } + memory = { task.attempt <= 1 ? '16 GB' : '32 GB' } + queue = { task.attempt <= 1 ? 
'short' : 'medium' } + } +} + +// default job submission parameters +executor { + submitRateLimit = '5 sec' + $lsf { queueSize = 200 } + $local { queueSize = 50 } +} + +timeline { + enabled=true +} + +report { + enabled=true +} diff --git a/src/pipelines/signatures/signatureAssociations.nf/scripts/analysis.py b/src/pipelines/signatures/signatureAssociations.nf/scripts/analysis.py new file mode 100755 index 0000000..0f72e25 --- /dev/null +++ b/src/pipelines/signatures/signatureAssociations.nf/scripts/analysis.py @@ -0,0 +1,190 @@ +import sys +import scipy, scipy.stats, re +import pandas as pd, numpy as np + +import matplotlib as mpl +mpl.rcParams['mathtext.fontset'] = 'stix' +mpl.rcParams['font.family'] = 'STIXGeneral' +import matplotlib.pyplot as plt + +plt.rc('axes', labelsize=16) +plt.rc('xtick',labelsize=16) +plt.rc('ytick',labelsize=16) +plt.rc('legend',fontsize=16) + + + + +if __name__=='__main__': + + assoc_zinb, assoc_log0 = sys.argv[1:] # , assoc_log + + run_name="twohit_dCRT_binary" + results_zinb = pd.read_csv(assoc_zinb) + # results_log = pd.read_csv(assoc_log) + results_log0 = pd.read_csv(assoc_log0) + + for i, results_table in enumerate([results_zinb]):#, results_log]): + + + # QQ plot + plt.figure(figsize=(8,8)) + + subset = (results_table.resample_pvalue>0)&(results_table.model_alt.isin(['negbin', 'glm.binom', 'binomial']))&(np.abs(results_table.target_means_alt)<1000) + n_test = np.sum(subset) + k = np.arange(1,n_test+1) + + print(results_table.head()) + print("N_test", n_test) + print(np.unique(results_table.resample_pvalue)) + + pv_expected = np.linspace(0,1-1/n_test,n_test)+1/(2*n_test) + pv_expected = np.arange(1/n_test,1+1e-10,1/n_test) + percentiles = scipy.stats.beta.ppf(np.array([0.05,0.5,0.95])[:,None], k[None,:],n_test+1-k[None,:]) + + plt.scatter(-np.log10(percentiles[1][::-1]), + np.sort(-np.log10(results_table[subset]['wilks_pvalue'])), + s=5, label="Wilks", vmax=15, marker='D', + c=np.sort(-np.log10(results_table[subset]['wilks_pvalue'])), + cmap='viridis_r') + plt.scatter(-np.log10(percentiles[1][::-1]), + np.sort(-np.log10(results_table[subset]['resample_pvalue'])), + s=20, label="Resampled", marker='+', + c=-np.log10(np.array(results_table[subset]['wilks_pvalue']))[ + np.argsort(-np.log10(results_table[subset]['resample_pvalue'])) + ], + cmap='viridis_r', vmax=15) + + plt.plot(-np.log10(percentiles[1][::-1]),-np.log10(percentiles[1][::-1])) + plt.fill_between(-np.log10(percentiles[1][::-1]), + -np.log10(percentiles[0][::-1]), + -np.log10(percentiles[2][::-1]), alpha=0.6) + + #plt.ylim(0, np.max(-np.log10(results_table[subset]['resample_pvalue']))+1) + plt.ylim(0, min( np.max(-np.log10(results_table[subset]['resample_pvalue']))*2, + np.max(-np.log10(results_table[subset]['wilks_pvalue'])))*1.1) + plt.gca().set_xlim(left=0) + + plt.legend(loc="upper left") + plt.xlabel(r"$-\log_{10}(\mathrm{pvalue}_\mathrm{expected})$") + plt.ylabel(r"$-\log_{10}(\mathrm{pvalue}_\mathrm{observed})$") + + plt.savefig(f"sig_germline_qqplot_{['zinb','log', 'log0'][i]}.png", dpi=200, bbox_inches='tight', facecolor='w', transparent=False) + + + # Z-score - Q plot + plt.figure(figsize=(8,5)) + + subset = (results_table.resample_pvalue>0)&\ + (results_table.model_alt.isin(['negbin', 'glm.binom', 'binomial']))&\ + (np.abs(results_table.target_means_alt)<1000) + zscore = results_table.target_means_alt/np.sqrt(results_table.target_covs_alt) + plt.scatter(zscore[subset], + -np.log10(results_table.resample_pvalue[subset]), + c=-np.log10(results_table.wilks_pvalue[subset]), + s=4, 
cmap='viridis_r', vmax=15) + + plt.xlim(-np.max(np.abs(zscore[subset])), np.max(np.abs(zscore[subset]))) + plt.gca().set_ylim(bottom=0) + + cbar = plt.colorbar() + cbar.set_label(r"$-\log_{10}(\mathrm{pvalue}_\mathrm{Wilks})$") + + plt.xlabel(r"$z-\mathrm{score}$") + plt.ylabel(r"$-\log_{10}(\mathrm{pvalue}_\mathrm{resampled})$") + + plt.savefig(f"sig_germline_zqplot_{['zinb','log', 'log0'][i]}.png", dpi=200, bbox_inches='tight', facecolor='w', transparent=False) + + + # Combine logistic and ZINB results + keys = ['target', 'signature', 'group', 'wilks_pvalue', 'model_alt', 'target_means_alt', 'target_covs_alt'] + results_combined = pd.merge(results_zinb[keys+['resample_pvalue',]], + results_log0[keys], + on=['target', 'signature', 'group'], + how='inner', suffixes=("_zinb", "_log")) + + plt.figure(figsize=(8,5)) + + subset = (results_combined.resample_pvalue>0)&\ + (results_combined.wilks_pvalue_log>0)&\ + (results_combined.model_alt_zinb=='negbin')&\ + (np.abs(results_combined.target_means_alt_log)<1000) + + plt.scatter(-np.log10(results_combined[subset].wilks_pvalue_log), + -np.log10(results_combined[subset].resample_pvalue), + c=-np.log10(results_combined[subset].wilks_pvalue_zinb), + s=4, cmap='viridis_r', vmax=15) + plt.gca().set_xlim(left=0) + plt.gca().set_ylim(bottom=0) + + cbar = plt.colorbar() + cbar.set_label(r"$-\log_{10}(\mathrm{pvalue}_\mathrm{ZINB Wilks})$") + + plt.xlabel(r"$-\log_{10}(\mathrm{pvalue}_\mathrm{LOG Wilks})$") + plt.ylabel(r"$-\log_{10}(\mathrm{pvalue}_\mathrm{ZINB resampled})$") + + plt.savefig(f"sig_germline_ppplot_zinb-log.png", dpi=200, bbox_inches='tight', facecolor='w', transparent=False) + + + # Negative controls + vmatch = np.vectorize(lambda x:bool(re.match('.*mock.*', x, re.IGNORECASE))) + + default_colours = plt.rcParams['axes.prop_cycle'].by_key()['color'] + markers = ['.', '+', '^', 'x', '>', '<'] + + for j, results_table in enumerate([results_zinb, results_log0]): + + if j==0: pvalue_key = 'resample_pvalue' + else: pvalue_key = 'wilks_pvalue' + + plt.figure(figsize=(8,8)) + + targets = np.unique(results_table.target) + mock_targets = targets[vmatch(targets)] + signatures = np.unique(results_table.signature) + mock_signatures = signatures[vmatch(signatures)] + mock_set = np.hstack((mock_targets, mock_signatures)) + max_n_test = 0 + + for i,mock in enumerate(mock_set): + + subset = (results_table[pvalue_key]>0)&\ + (results_table.model_alt.isin(['negbin', 'glm.binom', 'binomial']))&\ + ((results_table['target']==mock)|(results_table['signature']==mock)) + + n_test = np.sum(subset) + max_n_test = max(n_test, max_n_test) + k = np.arange(1,n_test+1) + + pv_expected = np.linspace(0,1-1/n_test,n_test)+1/(2*n_test) + pv_expected = np.arange(1/n_test,1+1e-10,1/n_test) + percentiles = scipy.stats.beta.ppf(np.array([0.05,0.5,0.95])[:,None], k[None,:],n_test+1-k[None,:]) + + plt.scatter(-np.log10(percentiles[1][::-1]), + np.sort(-np.log10(results_table[subset]['wilks_pvalue'])), + s=10, label=f"{mock} Wilks", + c=default_colours[3], marker=markers[i%len(markers)]) + + if j==0: + plt.scatter(-np.log10(percentiles[1][::-1]), + np.sort(-np.log10(results_table[subset]['resample_pvalue'])), + s=10, label=f"{mock} dCRT", + c=default_colours[0], marker=markers[i%len(markers)]) + + k = np.arange(1,max_n_test+1) + percentiles = scipy.stats.beta.ppf(np.array([0.05,0.5,0.95])[:,None], + k[None,:], + max_n_test+1-k[None,:]) + plt.plot(-np.log10(percentiles[1][::-1]),-np.log10(percentiles[1][::-1]), c='k') + plt.fill_between(-np.log10(percentiles[1][::-1]), + 
-np.log10(percentiles[0][::-1]),
+                     -np.log10(percentiles[2][::-1]), alpha=0.3, color='k')
+
+        plt.gca().set_xlim(left=0)
+        plt.gca().set_ylim(bottom=0)
+
+        plt.legend(loc="upper left")
+        plt.xlabel(r"$-\log_{10}(\mathrm{pvalue}_\mathrm{expected})$")
+        plt.ylabel(r"$-\log_{10}(\mathrm{pvalue}_\mathrm{observed})$")
+
+        plt.savefig(f"sig_germline_mock-qqplot_{['zinb', 'log0'][j]}.png", dpi=200, bbox_inches='tight', facecolor='w', transparent=False)
diff --git a/src/pipelines/signatures/signatureAssociations.nf/scripts/concatenate.r b/src/pipelines/signatures/signatureAssociations.nf/scripts/concatenate.r
new file mode 100755
index 0000000..768b132
--- /dev/null
+++ b/src/pipelines/signatures/signatureAssociations.nf/scripts/concatenate.r
@@ -0,0 +1,21 @@
+suppressMessages(library(tidyverse))
+
+
+args = commandArgs(trailingOnly=TRUE)
+path = args[1]
+pattern = args[2]
+output_file = args[3]
+
+
+input_files <- list.files(path, pattern, full.names=TRUE)
+
+for (file in input_files) {
+
+    results_df <- read_delim(file, delim=",")
+
+    full_results_df <- tryCatch( bind_rows(full_results_df, results_df),
+                                 error=function (e) {results_df} )
+
+}
+
+write_delim(full_results_df, output_file, delim=",")
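+
+# Usage sketch (mirrors the calls made by the concatenate process in main.nf):
+#   Rscript --vanilla concatenate.r "." "*_glm_results.csv" "signature_target_assoc_zinb.csv"
+# stacks every file under `path` matching `pattern` into a single csv at `output_file`.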
diff --git a/src/pipelines/signatures/signatureAssociations.nf/scripts/hyper_regress.r b/src/pipelines/signatures/signatureAssociations.nf/scripts/hyper_regress.r
new file mode 100755
index 0000000..4cb4edb
--- /dev/null
+++ b/src/pipelines/signatures/signatureAssociations.nf/scripts/hyper_regress.r
@@ -0,0 +1,161 @@
+suppressMessages(library(tidyverse))
+suppressMessages(library(ggplot2))
+suppressMessages(library(viridis))
+suppressMessages(library(glmnet))
+library(rhdf5)
+
+# require(pscl)
+require(MASS)
+require(boot)
+library(pscl)
+
+
+
+logistic_fit <- function(data.df, covariates) {
+
+    # Fit logistic regression (binomial GLM) of k on all covariates
+    results <- glm(k ~ . , data=data.df, family='binomial')
+
+    count_means <- c(results$coefficients[c("(Intercept)", covariates)])
+    count_covs <- c(diag(vcov(results))[c("(Intercept)", covariates)])
+
+    loglike <- length(results$coefficients)-results$aic/2
+    # pred = results$coefficients[1] + results$coefficients[2:length(results$coefficients)] %*% t(data.matrix(variables.df[,2:dim(variables.df)[2]]))
+    # pred <- 1/(1+exp(-pred))
+    # loglike <- sum(variables.df$k * log(pred) + (1-variables.df$k) * log(1-pred))
+
+    fit_results = list(count_means=count_means, count_covs=count_covs,
+                       converged=results$converged, loglike=loglike, model='glm.binom',
+                       dof=1)
+
+    return(fit_results)
+
+}
+
+
+model_fit <- function(variables.df, covariates, test_variables) {
+
+
+    # Renormalise covariates
+    print("Renormalising")
+    means = c()
+    stds = c()
+    for ( key in c(covariates, test_variables) ) {
+        means = c(means, mean(variables.df[, key]))
+        stds = c(stds, sd(variables.df[, key]))
+        variables.df[key] = ( variables.df[, key]-mean(variables.df[, key]) )/sd(variables.df[, key])
+    }
+    # print(variables.df %>%top_n(10))
+    # Select covariates with non-zero variance
+    covariates = covariates[stds[1:length(covariates)]>0]
+
+    # print(select_(data.frame(variables.df), .dots = c('k', covariates, 'prs')))
+    # print("Null fit")
+    fit_results_null <- logistic_fit(select_(data.frame(variables.df), .dots = c('k', covariates)), covariates)
+
+    for (test_variable in test_variables) {
+
+        # print(paste("Alternative fit", test_variable, sep=" "))
+        fit_results_alt <- logistic_fit(select_(data.frame(variables.df), .dots = c('k', covariates, test_variable)), c(covariates, test_variable))
+
+        # print("Collecting results")
+        results.df <- tibble(trait_fit=fit_results_alt[["converged"]], null_fit=fit_results_null[["converged"]],
+                             loglike_null=fit_results_null["loglike"][[1]], loglike_alt=fit_results_alt["loglike"][[1]],
+                             model_null=fit_results_null["model"][[1]], model_alt=fit_results_alt["model"][[1]],
+                             dof=fit_results_null["dof"][[1]],
+                             sample_size=nrow(variables.df), nz_size=sum(variables.df$k>0),
+                             trait=test_variable)
+
+        # Count parameters
+        # print("Processing results")
+        names <- c("c", covariates, "prs", "theta")
+        for (i in 1:length(names)){
+            results.df[paste(names[i], "means", "alt", sep="_")] <- fit_results_alt$count_means[i]
+            results.df[paste(names[i], "covs", "alt", sep="_")] <- fit_results_alt$count_covs[i]
+        }
+        names <- c("c", covariates, "theta")
+        for (i in 1:length(names)){
+            results.df[paste(names[i], "means", "null", sep="_")] <- fit_results_null$count_means[i]
+            results.df[paste(names[i], "covs", "null", sep="_")] <- fit_results_null$count_covs[i]
+        }
+
+        # Renormalisation values
+        # print("Processing renormalisation values")
+        for (i in 1:length(covariates)) {
+            results.df[paste("Xmean", covariates[i], sep="_")] <- means[i]
+            results.df[paste("Xsd", covariates[i], sep="_")] <- stds[i]
+        }
+
+        stacked_results.df <- tryCatch( bind_rows(stacked_results.df, results.df),
+                                        error=function (e) {results.df} )
+    }
+
+    return(stacked_results.df)
+}
+
+
+args = commandArgs(trailingOnly=TRUE)
+trait_file = args[1]
+covariate_file = args[2]
+category_col = args[3]
+
+# Load in data
+covariates.df <- data.frame(read_csv(covariate_file))
+categories <- unique(covariates.df$category)
+covariates <- colnames(covariates.df)[-c(1:3)]
+print(c("Covar", nrow(covariates.df)))
+
+# Trait file should be csv with "tumour_sample_platekey" and a column for each trait
+trait.df <- data.frame(read_csv(trait_file))
+traits <- colnames(trait.df)[-c(1)]
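+
+# Illustrative trait file layout (hypothetical column names and values; the first
+# column must be the tumour_sample_platekey key used by the inner_join below):
+#   tumour_sample_platekey,trait_A,trait_B
+#   LP0000000-DNA_A01,0,1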
+
+data.df <- inner_join(covariates.df, trait.df, by=c("tumour_sample_platekey"="tumour_sample_platekey")) %>%mutate_if(is.numeric, ~replace_na(., NA) %>%replace(., is.infinite(.), NA))
+data.df <- mutate(data.df, is_female=as.double(is_female), category=category+1)
+print(c("Data", nrow(data.df)))
+
+print(categories)
+print(covariates)
+print(traits)
+
+
+data.df$k = as.integer(data.df$hypermutation>0)
+k_list = list(data.df$hypermutation>0,
+              data.df$hypermutation>1)
+dependent = c("hyper1", "hyper2")
+
+for ( i_k in c( 1:length(k_list) ) ) {
+    print(dependent[i_k])
+    data.df$k <- as.integer(k_list[[i_k]])
+
+    results.df <- model_fit(data.frame(data.df), covariates, traits)
+    results.df["category"] <- "_all_"
+    results.df["dependent_variable"] <- dependent[i_k]
+    full_results.df <- tryCatch(bind_rows(full_results.df, results.df),
+                                error=function (e) { results.df } )
+
+    for (i_category in 1:length(categories)) {
+
+        category_data.df <- data.frame(data.df) %>% filter(category==i_category)
+
+        print(categories[i_category])
+        print(sum(category_data.df$k>0))
+
+        if ( sum(category_data.df$k>0)<5 ) {
+            next
+        } else {
+            results.df <- model_fit(data.frame(category_data.df), covariates, traits)
+        }
+        results.df["category"] <- categories[i_category]
+        results.df["dependent_variable"] <- dependent[i_k]
+
+        full_results.df <- bind_rows(full_results.df, results.df)
+    }
+}
+
+
+print(full_results.df)
+# if ( !exists(full_results.df) ) { quit("no", 120) }
+
+# Statistical tests
+full_results.df["wilks"] <- pchisq( -2 * (full_results.df$loglike_null - full_results.df$loglike_alt), df=full_results.df$dof, lower.tail=FALSE )
+
+# Save results
+write_delim(full_results.df %>%relocate(where(is.character)), "hyper_results.csv", delim=",")
diff --git a/src/pipelines/signatures/signatureAssociations.nf/scripts/logistic_functions.r b/src/pipelines/signatures/signatureAssociations.nf/scripts/logistic_functions.r
new file mode 100755
index 0000000..b0b483f
--- /dev/null
+++ b/src/pipelines/signatures/signatureAssociations.nf/scripts/logistic_functions.r
@@ -0,0 +1,225 @@
+suppressMessages(library(tidyverse))
+suppressMessages(library(ggplot2))
+suppressMessages(library(viridis))
+suppressMessages(library(glmnet))
+
+# require(pscl)
+require(MASS)
+require(boot)
+library(pscl)
+
+
+
+logistic_fit <- function(data.df, covariates) {
+
+    # Fit logistic regression (binomial GLM); no implicit intercept, since callers pass an intercept column
+    formula <- as.formula( paste(c("k ~ 0", covariates), collapse=" + ") )
+    # print(c("Formula: ", formula))
+    suppressWarnings(results <- glm(formula, data=data.df, family='binomial'))
+
+    count_means <- c(results$coefficients[covariates])
+    count_covs <- c(diag(vcov(results))[covariates])
+
+    loglike <- length(results$coefficients)-results$aic/2
+    # pred = results$coefficients[1] + results$coefficients[2:length(results$coefficients)] %*% t(data.matrix(variables.df[,2:dim(variables.df)[2]]))
+    # pred <- 1/(1+exp(-pred))
+    # loglike <- sum(variables.df$k * log(pred) + (1-variables.df$k) * log(1-pred))
+
+    fit_results = list(count_means=count_means, count_covs=count_covs,
+                       converged=results$converged, loglike=loglike, model='glm.binom',
+                       dof=1)
+
+    return(fit_results)
+
+}
+
+resample_logistic_fit <- function(data.df, covariates, null_prob, nulliter=100) {
+
+    # Alternative fit
+    fit_results_alt <- logistic_fit(data.df, covariates)
+    z_star <- fit_results_alt$count_means[length(covariates)]/sqrt(fit_results_alt$count_covs[length(covariates)])
+    print(data.df$k)
+    print(sum(data.df$k))
+
+    # Resample n times to get null fits
+    z_null <- c()
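+    # Parametric resampling: draw k ~ Binomial(1, null_prob) under the null (no-target)
+    # fit, refit the logistic model, and record the target z-score. The collected z_null
+    # vector approximates the null distribution against which z_star is scored below.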
+    for (i in c(1:nulliter)) {
+        data.df$k <- rbinom(length(null_prob), 1, null_prob)
+
+        # print(paste("Alternative fit", test_variable, sep=" "))
+        fit_resample_null <- logistic_fit(data.df, covariates)
+        z_null <- c(z_null, fit_resample_null$count_means[length(covariates)]/sqrt(fit_resample_null$count_covs[length(covariates)]) )
+
+    }
+
+    # Get p-value target parameter
+    pv_score <- nullSamplePV(z_null, z_star)
+
+    return(append(fit_results_alt, list(z_star=z_star, z_null=z_null, pv_score=pv_score)))
+
+}
+
+
+zero_model_fit <- function(variables.df, covariates, test_variables) {
+
+    # Convert signature to binary 1/0
+    variables.df$k <- variables.df$k>0
+
+    # Renormalise covariates
+    print("Renormalising")
+    means = c()
+    stds = c()
+    for ( key in c(covariates) ) {
+        means = c(means, mean(variables.df[, key]))
+        stds = c(stds, sd(variables.df[, key]))
+        variables.df[key] = ( variables.df[, key]-mean(variables.df[, key]) )/sd(variables.df[, key])
+    }
+    # print(variables.df %>%top_n(10))
+    # Select covariates with non-zero variance
+    variables.df$intercept <- 1.0
+    covariates = c("intercept", covariates[stds[1:length(covariates)]>0])
+
+    # print(select_(data.frame(variables.df), .dots = c('k', covariates, 'target')))
+    # print("Null fit")
+    fit_results_null <- logistic_fit(variables.df, covariates)
+
+    for (test_variable in test_variables) {
+
+        # print(paste("Alternative fit", test_variable, sep=" "))
+        fit_results_alt <- logistic_fit(variables.df, c(covariates, test_variable))
+
+        # print("Collecting results")
+        results.df <- tibble(target=test_variable,
+                             alt_fit=fit_results_alt[["converged"]], null_fit=fit_results_null[["converged"]],
+                             loglike_null=fit_results_null["loglike"][[1]], loglike_alt=fit_results_alt["loglike"][[1]],
+                             model_null=fit_results_null["model"][[1]], model_alt=fit_results_alt["model"][[1]],
+                             dof=fit_results_null["dof"][[1]],
+                             sample_size=nrow(variables.df), nz_size=sum(variables.df$k>0))
+
+        # Count parameters
+        # print("Processing results")
+        names <- c(covariates, "target", "theta")
+        for (i in 1:length(names)){
+            results.df[paste(names[i], "means", "alt", sep="_")] <- fit_results_alt$count_means[i]
+            results.df[paste(names[i], "covs", "alt", sep="_")] <- fit_results_alt$count_covs[i]
+        }
+        names <- c(covariates, "theta")
+        for (i in 1:length(names)){
+            results.df[paste(names[i], "means", "null", sep="_")] <- fit_results_null$count_means[i]
+            results.df[paste(names[i], "covs", "null", sep="_")] <- fit_results_null$count_covs[i]
+        }
+
+        # Renormalisation values
+        # print("Processing renormalisation values")
+        for (i in 1:length(covariates)) {
+            results.df[paste("Xmean", covariates[i], sep="_")] <- means[i]
+            results.df[paste("Xsd", covariates[i], sep="_")] <- stds[i]
+        }
+
+        stacked_results.df <- tryCatch( bind_rows(stacked_results.df, results.df),
+                                        error=function (e) {results.df} )
+
+    }
+
+    return(stacked_results.df)
+
+}
+
+
+nmax_model_fit <- function(variables.df, covariates, test_variables, n_resample=100) {
+
+    # Renormalise covariates
+    print("Renormalising")
+    means = c()
+    stds = c()
+    for ( key in c(covariates) ) {
+        means = c(means, mean(variables.df[, key]))
+        stds = c(stds, sd(variables.df[, key]))
+        variables.df[key] = ( variables.df[, key]-mean(variables.df[, key]) )/sd(variables.df[, key])
+    }
+    # print(variables.df %>%top_n(10))
+    # Select covariates with non-zero variance
+    variables.df$intercept <- 1.0
+    covariates = c("intercept", covariates[stds[1:length(covariates)]>0])
+
+    for (test_variable in test_variables) {
+
+        # Generate new 
variable from max n samples in target + n_hit = min( sum(variables.df[, test_variable]), sum(variables.df$k>0) ) + hit = order(- (variables.df$k + runif(length(variables.df$k))*0.1) )[1:n_hit] + new_k = rep(0,length(variables.df$k)) + new_k[hit] <- 1 + new_variables.df <- variables.df %>% mutate(k = new_k) + + # print(select_(data.frame(variables.df), .dots = c('k', covariates, 'target'))) + # print("Null fit") + fit_results_null <- logistic_fit(new_variables.df, covariates) + nullprob <- 1/( 1+exp( -(as.matrix(new_variables.df[covariates]) %*% as.vector(fit_results_null$count_means))[,1] ) ) + + # print(paste("Alternative fit", test_variable, sep=" ")) + fit_results_alt <- resample_logistic_fit(new_variables.df, c(covariates, test_variable), nullprob) + fit_results_alt <- resampleGLM(new_variables.df %>% dplyr::select(-count), new_variables.df$count, test_variable, + covariates=covariates, nulliter=500, + family=family, target_family=target_families[target_types[i]][[1]], + zeroinfl=F, notarget_result=fit_notarget, resampling=resampling_method) + + results.df <- data.frame(target=test_variable, + resample_pvalue=fit_results_alt$pv_score, + null_fit=fit_results_null$converged, alt_fit=fit_results_alt$converged, + loglike_null=fit_results_null$loglike, loglike_alt=fit_results_alt$loglike, + model_null=fit_results_null$model, model_alt=fit_results_alt$model, + dof=fit_results_null$dof, + sample_size=nrow(new_variables.df), nz_size=sum(new_variables.df$k>0), nz_target=sum(new_variables.df[, test_variable])) + + # Count paramters + # print("Processing results") + names <- c(covariates, "target", "theta") + for (i in 1:length(names)){ + results.df[paste(names[i], "means", "alt", sep="_")] <- fit_results_alt$count_means[i] + results.df[paste(names[i], "covs", "alt", sep="_")] <- fit_results_alt$count_covs[i] + } + names <- c(covariates, "theta") + for (i in 1:length(names)){ + results.df[paste(names[i], "means", "null", sep="_")] <- fit_results_null$count_means[i] + results.df[paste(names[i], "covs", "null", sep="_")] <- fit_results_null$count_covs[i] + } + + # Renormalisation values + # print("Processing renormalisation values") + for (i in 1:length(covariates)) { + results.df[paste("Xmean", covariates[i], sep="_")] <- means[i] + results.df[paste("Xsd", covariates[i], sep="_")] <- stds[i] + } + + stacked_results.df <- tryCatch( bind_rows(stacked_results.df, results.df), + error=function (e) {results.df} ) + + } + + return(stacked_results.df) + +} + + +#' Estimate p-value from null samples and fitted coefficient. +#' +#' Estimate p-value from null samples and fitted coefficient. +#' @param null_samples=vector. Vector of z-scores of null fits. +#' @param fitted_value=float. Value of z-score for true run. +#' @param model = str. Null model to be used. ('chisquare' (D), 'skewt'). Only chisquare currently implemented +#' @return float. P-value of fitted value against null samples under null model. 
+#' @examples +#' null_samples <- rnorm(100) +#' fitted_value <- 5 +#' pvalue <- nullSamplePV(null_samples, fitted_value, model='chisquare') +nullSamplePV <- function(null_samples, fitted_value, model='chisquare') { + + if (model=='chisquare') { + normal_var = sd(null_samples)^2 + normal_mean = mean(null_samples) + pv = pchisq((fitted_value-normal_mean)^2/normal_var, df=1, lower.tail=F) + } else { + stop('Unknown model for null sample passed to nullSamplePV') + } + + return(pv) + +} \ No newline at end of file diff --git a/src/pipelines/signatures/signatureAssociations.nf/scripts/logistic_regress.r b/src/pipelines/signatures/signatureAssociations.nf/scripts/logistic_regress.r new file mode 100755 index 0000000..25881ff --- /dev/null +++ b/src/pipelines/signatures/signatureAssociations.nf/scripts/logistic_regress.r @@ -0,0 +1,321 @@ +suppressMessages(library(tidyverse)) +suppressMessages(library(ggplot2, viridis, glmnet)) + +# require(pscl) +require(MASS) +require(boot) +library(pscl) + +model_fit <- function(variables.df, covariates, targets, family = "binomial", power.analysis = T) { + # Convert signature to binary 1/0 + variables.df$count <- variables.df$count > 0 + + # Renormalise covariates + print("Renormalising") + means <- c() + stds <- c() + for (key in c(covariates)) { + means <- c(means, mean(variables.df[, key])) + stds <- c(stds, sd(variables.df[, key])) + variables.df[key] <- (variables.df[, key] - mean(variables.df[, key])) / sd(variables.df[, key]) + } + # print(variables.df %>%top_n(10)) + # Select covariates with non-zero variance + variables.df$intercept <- 1.0 + print(c("Covariates: ", covariates)) + print(c("SD of covariates: ", stds)) + covariates <- list(count = c("intercept", covariates[stds[1:length(covariates)] > 0])) + print(covariates) + + # print(select_(data.frame(variables.df), .dots = c('k', covariates, 'target'))) + # print("Null fit") + # fit_null <- logistic_fit(variables.df, covariates) + + fit_notarget <- glmFit(variables.df, covariates, family = family, zeroinfl = F) + + for (target in targets) { + print(target) + + print("Removing nan target values - to make null fit fair (glmFit does this automatically when doing alt)") + # print(head(variables.df[,0:5])) + # print(dim(variables.df)[1]) + variables_nona.df <- variables.df %>% filter(!is.na(get(target))) + # print(head(variables_nona.df[,0:5])) + # print(dim(variables_nona.df)[1]) + + if (dim(variables_nona.df)[1] != dim(variables.df)[1]) { + fit_notarget_nona <- glmFit(variables_nona.df, covariates, family = family, zeroinfl = F) + } else { + fit_notarget_nona <- fit_notarget + } + + # print(paste("Alternative fit", target, sep=" ")) + fit_alt <- glmFit(variables_nona.df, list(count = c(covariates$count, target)), family = family, zeroinfl = F) + + # print("Collecting results") + results.df <- data.frame( + target = target, + loglike_null = fit_notarget_nona$loglike, loglike_alt = fit_alt$loglike, + model_null = fit_notarget_nona$model, model_alt = fit_alt$model, + sample_size = nrow(variables_nona.df), nz_size = sum(variables_nona.df$count > 0), nz_target = sum(variables_nona.df[, target]) + ) + + # Count paramters + # print("Processing results") + names <- c(covariates$count, "target") + for (j in 1:length(names)) { + results.df[paste(names[j], "means", "alt", sep = "_")] <- fit_alt$betas[j] + results.df[paste(names[j], "covs", "alt", sep = "_")] <- diag(fit_alt$covar)[j] + } + names <- c(covariates$count) + for (j in 1:length(names)) { + results.df[paste(names[j], "means", "null", sep = "_")] <- 
fit_notarget_nona$betas[j] + results.df[paste(names[j], "covs", "null", sep = "_")] <- diag(fit_notarget_nona$covar)[j] + } + + # Renormalisation values + # print("Processing renormalisation values") + for (j in 1:length(covariates$count)) { + results.df[paste("Xmean", covariates$count[j], sep = "_")] <- means[j] + results.df[paste("Xsd", covariates$count[j], sep = "_")] <- stds[j] + } + + ### Power Analysis + power.percentiles <- c(20, 50, 80) + if (power.analysis) { + # Set parameters to known ones with target effect size of log(2) + # Resample from Zero-Inflated Negative Binomial N times from new parameters + # Apply resampleGLM method to simulated signature + # Fit beta distribution to set of p-values + effect.size <- log(2) + power.ntest <- 10 + + # New count probability + mu <- expit(as.matrix(variables.df[c(covariates$count, target)]) %*% c(fit_alt$betas[1:length(covariates$count)], effect.size)) + + pvalue_list <- c() + for (j in c(1:power.ntest)) { + counts <- rbinom(dim(mu)[1], size = 1, prob = mu) + fit_notarget_power <- glmFit(variables.df %>% mutate(count = counts), covariates, family = family, zeroinfl = F) + fit_alt_power <- glmFit(variables.df %>% mutate(count = counts), list(count = c(covariates$count, target)), family = family, zeroinfl = F) + + pvalue_power <- pchisq(-2 * (fit_notarget_power$loglike - fit_alt_power$loglike), df = 1, lower.tail = FALSE) + pvalue_list <- c(pvalue_list, pvalue_power) + print(c("Logistic fit", fit_notarget_power$loglike, fit_alt_power$loglike)) + print(c("Pass: ", pvalue_power)) + } + + for (percentile in power.percentiles) { + results.df[paste0("power", percentile)] <- quantile(na.omit(pvalue_list), percentile / 100)[[1]] + } + results.df[paste0("powerMean")] <- mean(na.omit(pvalue_list)) + results.df[paste0("powerVar")] <- sd(na.omit(pvalue_list))^2 + results.df[paste0("powerN")] <- length(na.omit(pvalue_list)) + } else { + for (percentile in power.percentiles) { + results.df[paste0("power", percentile)] <- -99 + } + results.df[paste0("powerMean")] <- -99 + results.df[paste0("powerVar")] <- -99 + results.df[paste0("powerN")] <- -99 + } + + stacked_results.df <- tryCatch(bind_rows(stacked_results.df, results.df), + error = function(e) { + results.df + } + ) + } + + return(stacked_results.df) +} + + +nmax_model_fit <- function(variables.df, covariates, targets, target_types, resampling_method) { + # Set up model and backup for when model fails + family <- "binomial" + + # Renormalise covariates + print("Renormalising") + means <- c() + stds <- c() + for (key in c(covariates)) { + means <- c(means, mean(variables.df[, key])) + stds <- c(stds, sd(variables.df[, key])) + variables.df[key] <- (variables.df[, key] - mean(variables.df[, key])) / sd(variables.df[, key]) + } + # print(variables.df %>%top_n(10)) + # Select covariates with non-zero variance + variables.df$intercept <- 1.0 + # covariates = c("intercept", covariates[stds[1:length(covariates)-1]>0]) + covariates <- list( + count = c("intercept", covariates[stds[1:length(covariates) - 1] > 0]), + target = c("intercept", covariates[stds[1:length(covariates) - 1] > 0]) + ) + + for (i in c(1:length(targets))) { + # Generate new variable from max n samples in target + n_hit <- min(sum(variables.df[, targets[i]]), sum(variables.df$count > 0)) + hit <- order(-(variables.df$count + runif(length(variables.df$count)) * 0.1))[1:n_hit] + new_count <- rep(0, length(variables.df$count)) + new_count[hit] <- 1 + new_variables.df <- variables.df %>% mutate(count = new_count) + + # 
print(select_(data.frame(variables.df), .dots = c('k', covariates, 'target'))) + # print("Null fit") + # fit_null <- logistic_fit(new_variables.df, covariates) + # nullprob <- 1/( 1+exp( -(as.matrix(new_variables.df[covariates]) %*% as.vector(fit_null$count_means))[,1] ) ) + fit_notarget <- glmFit(new_variables.df, covariates, family = family, zeroinfl = F) + + # print(paste("Alternative fit", target, sep=" ")) + fit_alt <- resampleGLM(new_variables.df %>% dplyr::select(-count), new_variables.df$count, targets[i], + covariates = covariates, nulliter = 500, + family = family, + target_family = target_types[i], + zeroinfl = F, notarget_result = fit_notarget, resampling = resampling_method + ) + + # print("Collecting results") + results.df <- data.frame( + target = targets[i], resample_pvalue = fit_alt$pv_score, + loglike_null = fit_notarget$loglike, loglike_alt = fit_alt$loglike, + model_null = fit_notarget$model, model_alt = fit_alt$model, + sample_size = nrow(new_variables.df), nz_size = sum(new_variables.df$count > 0), nz_target = sum(new_variables.df[, targets[i]]) + ) + + + # Count paramters + # print("Processing results") + names <- c(covariates$count, "target") + for (j in 1:length(names)) { + results.df[paste(names[j], "means", "alt", sep = "_")] <- fit_alt$betas[j] + results.df[paste(names[j], "covs", "alt", sep = "_")] <- diag(fit_alt$covar)[j] + } + names <- c(covariates$count) + for (j in 1:length(names)) { + results.df[paste(names[j], "means", "null", sep = "_")] <- fit_notarget$betas[j] + results.df[paste(names[j], "covs", "null", sep = "_")] <- diag(fit_notarget$covar)[j] + } + + # Renormalisation values + # print("Processing renormalisation values") + for (j in 1:length(covariates$count)) { + results.df[paste("Xmean", covariates$count[j], sep = "_")] <- means[j] + results.df[paste("Xsd", covariates$count[j], sep = "_")] <- stds[j] + } + + stacked_results.df <- tryCatch(bind_rows(stacked_results.df, results.df), + error = function(e) { + results.df + } + ) + } + + return(stacked_results.df) +} + + +if (sys.nframe() == 0) { + args <- commandArgs(trailingOnly = TRUE) + output_dir <- args[1] + base_dir <- args[2] + test_file <- args[3] + covariate_file <- args[4] + signature_file <- args[5] + target_file <- args[6] + logistic_method <- args[7] + resample_method <- args[8] + + # source(paste(base_dir, "logistic_functions.r", sep="/")) + source(paste(base_dir, "regression_functions.r", sep = "/")) + + # covariates = c("log_age", "logit_purity", "is_female", "pc1", "pc2", "pc3") + + # Set of tests to run + test_df <- read_delim(test_file, delim = "\t") + print(test_df[1:10, ]) + signatures <- unique(test_df$signature) + + # Load in samples + sample_df <- read_delim(covariate_file, delim = "\t") + covariates <- colnames(sample_df)[!colnames(sample_df) %in% c("sample_id", "group")] + print(covariates) + # covariates <- c("log_age", "is_female", "pc1", "pc2", "pc3") + + # Load in signatures + print(c("sample_id", signatures)) + signature_df <- read_delim(signature_file, delim = "\t") %>% dplyr::select(c("sample_id", signatures)) + signature_df[is.na(signature_df)] <- 0 + + # Load in target variables + # Trait file should be csv with "germline_platekey" and a column for each trait + target_df <- read_delim(target_file, delim = "\t") + + # Join tables together + sample_df <- inner_join(sample_df, target_df, by = c("sample_id"), suffix = c("", "_tgt")) %>% mutate_if(is.numeric, ~ replace_na(., NA) %>% replace(., is.infinite(.), NA)) + + + # Iterate through all signatures + for (sig in 
signatures) { + # Iterate through tumour groups for the given signature + groups <- unique(test_df[test_df$signature == sig, ]$group) + for (grp in groups) { + # Get all variable targets for the given group and signature + targets <- test_df[test_df$signature == sig & test_df$group == grp, ]$target + target_types <- test_df[test_df$signature == sig & test_df$group == grp, ]$type + + print(sig) + print(grp) + print(targets) + subsample_df <- sample_df %>% filter(group == grp) + + # Filter out instances where only one target value + nonsingular <- c() + for (target in targets) { + nonsingular <- c(nonsingular, nrow(unique(subsample_df[, target])) > 1) + } + targets <- targets[nonsingular] + print(targets) + if (length(targets) == 0) { + stop("No non-unique targets") + } + + subsample_df <- inner_join(subsample_df, + signature_df %>% dplyr::select(sample_id, sig) %>% rename(count = sig), + by = c("sample_id") + ) %>% dplyr::select(count, targets, covariates) + + if (logistic_method == "ZERO") { + results_df <- model_fit(data.frame(subsample_df), covariates, gsub("-", ".", targets)) + } else if (logistic_method == "ZERO50") { + # Take top 50% of activities if more than 50% non-zero + n_hit <- min(floor(nrow(subsample_df) * 0.5), sum(subsample_df$count > 0)) + hit <- order(-(subsample_df$count + runif(length(subsample_df$count)) * 0.1))[1:n_hit] + new_count <- rep(0, length(subsample_df$count)) + new_count[hit] <- 1 + subsample_df <- subsample_df %>% mutate(count = new_count) + results_df <- model_fit(data.frame(subsample_df), covariates, gsub("-", ".", targets)) + } else if (logistic_method == "NMAX") { + results_df <- nmax_model_fit(data.frame(subsample_df), covariates, gsub("-", ".", targets), target_types, resample_method) + } else { + stop(c("logistic_method ", logistic_method, " not known")) + } + results_df["signature"] <- sig + results_df["group"] <- grp + + full_results_df <- tryCatch(bind_rows(full_results_df, results_df), + error = function(e) { + results_df + } + ) + } + } + + # Statistical tests + full_results_df["wilks_pvalue"] <- pchisq(-2 * (full_results_df$loglike_null - full_results_df$loglike_alt), df = 1, lower.tail = FALSE) + + # print(head(full_results_df)) + + # Save results + write_delim(full_results_df %>% dplyr::select(group, signature, target, wilks_pvalue, everything()), "logistic_results.csv", delim = ",") +} diff --git a/src/pipelines/signatures/signatureAssociations.nf/scripts/regression_fn2.r b/src/pipelines/signatures/signatureAssociations.nf/scripts/regression_fn2.r new file mode 100755 index 0000000..039d9d6 --- /dev/null +++ b/src/pipelines/signatures/signatureAssociations.nf/scripts/regression_fn2.r @@ -0,0 +1,432 @@ +library(tidyverse, MASS) +library(stats) +library(pscl) + +#' Fit negative binomial with MASS::glm.nb +#' +#' Fit negative binomial with MASS::glm.nb +#' @param data.df = tibble. DataFrame including count column and column for each covarites. +#' @param covariates = vector of strings. Columns in data.df to be used as covariates. +#' @param start = vector (of length covariates) or NULL. Starting values for covariate betas. +#' @param maxit = int. Maximum number of iterations when fitting glm. +#' @param offset = NULL,vector of int. Sample offsets to be used (for fixing parameter values - offset=exp(X beta)). +#' @return list. Output from glm. 
Includes betas, residuals, beta covariances, NB size parameter (theta) and offsets +#' @examples +#' data.df <- tibble(intercept = rep(c(1), 10), x1 = c(1:10), count = c(21:30)) +#' covariates <- c("intercept", "x1") +#' results <- negBinGLM(data.df, covariates) +negBinGLM <- function(data.df, covariates, start = NULL, maxit = 100, offset = NULL, sample_weights = NULL) { + # If starting parameters provided + if (!is.null(start)) { + start.theta <- start$theta + start.count <- start$count + } else { + start.theta <- 1 + start.count <- NULL + } + + # If there is an offset column, add to formula + if (!is.null(offset)) { + data.df <- data.df %>% mutate(offset = offset) + formula <- "count ~ 0 + offset(offset) + " + } else { + formula <- "count ~ 0 + " + } + + formula <- as.formula(paste0(formula, paste(covariates$count, collapse = " + "))) + + fit_nb <- MASS::glm.nb(formula, + data = data.df, + start = start.count, init.theta = start.theta, + maxit = maxit, weights = sample_weights + ) + + return(list( + betas = fit_nb$coefficients, + residual = data.df$count - as.numeric(fit_nb$fitted.values), + covar = vcov(fit_nb), + theta = fit_nb$theta, + offset = fit_nb$fitted.values, + loglike = fit_nb$twologlik / 2, + model = "negbin" + )) +} + +#' Fit GLM with stats::glm +#' +#' Fit GLM with stats::glm +#' @param data.df = tibble. DataFrame including count column and column for each covarites. +#' @param covariates = vector of strings. Columns in data.df to be used as covariates. +#' @param family='negbin' = str. GLM family to be used. +#' @param start = vector (of length covariates) or NULL. Starting values for covariate betas. +#' @param maxit = int. Maximum number of iterations when fitting glm. +#' @param offset = NULL,vector of int. Sample offsets to be used (for fixing parameter values - offset=exp(X beta)). +#' @return list. Output from glm. Includes betas, residuals, beta covariances, NB size parameter (theta) and offsets +#' @examples +#' data.df <- tibble(intercept = rep(c(1), 10), x1 = c(1:10), count = c(21:30)) +#' covariates <- c("intercept", "x1") +#' results <- negBinGLM(data.df, covariates) +genLinMod <- function(data.df, covariates, family = "X", start = NULL, maxit = 100, offset = NULL, sample_weights = NULL) { + # If there is an offset column, add to formula + if (!is.null(offset)) { + data.df <- data.df %>% mutate(offset = offset) + formula <- "count ~ 0 + offset(offset)" + } else { + formula <- "count ~ 0" + } + + formula <- as.formula(paste(c(formula, covariates$count), collapse = " + ")) + + fit_glm <- stats::glm(formula, + data = data.df, family = family, + start = start$count, + maxit = maxit, weights = sample_weights + ) + + residual <- data.df$count - as.numeric(fit_glm$fitted.values) + # Estimate standard error from Gaussian residuals + # Only meaningful in Gaussian models? + sigma <- sqrt(sum(residual^2) / (length(residual) - length(covariates$count))) + + return(list( + betas = fit_glm$coefficients, + residual = residual, + sigma = sigma, + covar = vcov(fit_glm), + offset = fit_glm$fitted.values, + loglike = logLik(fit_glm), + model = family + )) +} + +#' Fit zero inflated GLM with pscl::zeroinfl +#' +#' Fit zero inflated GLM with pscl::zeroinfl +#' @param data.df = tibble. DataFrame including count column and column for each covarites. +#' @param covariates = list of vectors with count and zero entries. Columns in data.df to be used as covariates. +#' @param family='negbin' = str. GLM family to be used. +#' @param start = list or NULL. 
Starting values for parameters with count, zero and theta entries. +#' @param maxit = int. Maximum number of iterations when fitting glm. +#' @param offset = NULL or list of count:vector, zero:vector. +#' Sample offsets to be used (for fixing parameter values - offset=exp(X beta)). +#' @return list. Output from glm. Includes betas, residuals, beta covariances, NB size parameter (theta) and offsets +#' @examples +#' data.df <- tibble(intercept = rep(c(1), 10), x1 = c(1:10), count = c(21:30)) +#' covariates <- c("intercept", "x1") +#' results <- negBinGLM(data.df, covariates, head(covariates, 1)) +zeroInflGLM <- function(data.df, covariates, family = "X", + start = NULL, maxit = 100, offset = NULL, sample_weights = NULL) { + print(c("Family", family)) + print(covariates) + + if (length(covariates$zero) == 0) { + if (family == "negbin") { + return(negBinGLM(data.df, covariates, + start = start, + maxit = maxit, offset = offset$count, sample_weights = sample_weights + )) + } else { + return(genLinMod(data.df, covariates, + start = start, family = family, + maxit = maxit, offset = offset$count, sample_weights = sample_weights + )) + } + } + + # If there is an offset column, add to formula + if (!is.null(offset)) { + data.df <- data.df %>% mutate(offset_count = offset$count, offset_zero = offset$zero) + formula_count <- "count ~ 0 + offset(offset_count)" + formula_zero <- "0 + offset(offset_zero)" + } else { + formula_count <- "count ~ 0" + formula_zero <- "0" + } + + formula <- as.formula(paste0( + paste(c(formula_count, covariates$count), collapse = " + "), " | ", + paste(c(formula_zero, covariates$zero), collapse = " + ") + )) + + # print(start) + # print(c('formula', formula)) + # print(sum(sample_weights)) + # print(head(data.df %>% dplyr::select(count, covariates$count))) + # for (cov in c("count", covariates$count)) { + # print(c(cov, sum(data.df[, cov]))) + # } + # print(sample_weights) + # print(maxit) + # print(family) + + fit_nb <- pscl::zeroinfl(formula, + data = data.df, + dist = family, link = "logit", start = start, + maxit = maxit, reltol = 1e-12, weights = sample_weights + ) + + return(list( + betas = fit_nb$coefficients$count, + betas_zero = fit_nb$coefficients$zero, + residual = data.df$count - as.numeric(fit_nb$fitted.values), + covar = vcov(fit_nb), + theta = fit_nb$theta, + offset = fit_nb$fitted.values, + loglike = fit_nb$loglik, + model = family + )) +} + +#' Fit GLM with stats::glm +#' +#' Fit GLM with stats::glm +#' @param data.df = tibble. DataFrame including count column and column for each covarites. +#' @param covariates = vector of strings. Columns in data.df to be used as covariates. +#' @param family='poisson' = str. GLM family to be used. +#' @param zeroinfl=Bool. Use zero inflated GLM. +#' @param start = vector (of length covariates) or NULL. Starting values for covariate betas. +#' @param maxit = int. Maximum number of iterations when fitting glm. +#' @param offset = NULL,vector of int. Sample offsets to be used (for fixing parameter values - offset=exp(X beta)). +#' @return list. Output from glm. Includes betas, offsets and beta covariances. 
+#' @examples
+#' data.df <- tibble(intercept = rep(c(1), 10), x1 = c(1:10), count = c(21:30))
+#' covariates <- c("intercept", "x1")
+#' results <- glmFit(data.df, covariates, family = "poisson")
glmFit <- function(data.df, covariates, family = "X", zeroinfl = F,
+                   start = NULL, maxit = 100, offset = NULL, sample_weights = NULL) {
+  if (family == "negbin" & !zeroinfl) {
+    results <- negBinGLM(data.df, covariates, start = start, offset = offset, sample_weights = sample_weights)
+  } else if (!zeroinfl) {
+    results <- genLinMod(data.df, covariates, family = family, start = start, offset = offset, sample_weights = sample_weights)
+  } else {
+    results <- zeroInflGLM(data.df, covariates, family = family, start = start, offset = offset, sample_weights = sample_weights)
+  }
+
+  return(results)
+}
+
+#' Fit GLM
+#'
+#' Fit GLM to counts and covariates
+#' @param X = tibble. DataFrame of covariates.
+#' @param y = vector of int >= 0. Target variable (count data)
+#' @param covariates = list of vector of strings with entries count, zero (if zeroinfl model).
+#' Columns in data.df to be used as covariates.
+#' @param family='X' = str. GLM family to be used.
+#' @param zeroinfl=Bool. Use zero inflated GLM.
+#' @param start = list of vector (of length covariates) (count=, zero=) or NULL. Starting values for covariate betas.
+#' @param maxit = int. Maximum number of iterations when fitting glm.
+#' @param offset = NULL,vector of int. Sample offsets to be used (for fixing parameter values - offset=exp(X beta)).
+#' @return list. Output from glm. Includes betas, offsets, beta covariances and other parameters.
+#' @examples
+#' X <- tibble(intercept = rep(c(1), 10), x1 = c(1:10))
+#' y <- c(21:30)
+#' results <- runGlmFit(X, y)
+runGlmFit <- function(X, y, covariates = NULL, family = "X", zeroinfl = F, start = NULL,
+                      maxit = 100, offset = NULL, sample_weights = NULL) {
+  if (is.null(covariates)) {
+    covariates <- list(count = colnames(X), zero = colnames(X))
+  }
+  data.df <- X %>% mutate(count = y)
+
+  results <- glmFit(data.df, covariates,
+    family = family, zeroinfl = zeroinfl,
+    start = start, maxit = maxit, offset = offset, sample_weights = sample_weights
+  )
+
+  return(results)
+}
+
+#' Fit GLM with variable resampling
+#'
+#' Fit GLM to counts and covariates with resampling of the target variable
+#' Resampling performed using resample with replacement
+#' @param X = tibble. DataFrame of covariates.
+#' @param y = vector of int >= 0. Target variable (count data)
+#' @param target = str. Column name of variable which is being analysed.
+#' @param covariates=NULL
+#' @param nulliter=100 = int. Number of resamples to perform for null distribution.
+#' @param family='negbin' = str. GLM family to be used.
+#' @param zeroinfl=Bool. Use zero inflated GLM.
+#' @param maxit = int. Maximum number of iterations when fitting glm.
+#' @param use_offsets = Bool. Whether to fix non-target parameters when resampling --> faster but less accurate?
+#' @param notarget_results=NULL
+#' @return list. Output from glm. Includes betas, offsets, beta covariances and other parameters. 
+#' @examples +#' X <- tibble(intercept = rep(c(1), 10), x1 = c(1:10)) +#' y <- c(21:30) +#' results <- resample_glmFit(X, y, "x1", nulliter = 100, family = "negbin") +resampleGLM <- function(X, y, target, covariates = NULL, nulliter = 100, + family = "X", target_family = "X", + zeroinfl = F, maxit = 100, use_offsets = F, sample_weights = NULL, + notarget_result = NULL, + fittarget_result = NULL) { + # Covariates excluding the target parameter + if (is.null(covariates)) { + covariates <- list(count = c(colnames(X)[colnames(X) != target])) + if (zeroinfl) { + covariates$zero <- covariates$count + } + } + data.df <- X %>% mutate(count = y) + + print(head(data.df[c(covariates$count, target)])) + + # Fit resampling model + if ((is.null(fittarget_result)) & (!is.null(covariates$target))) { + print(target_family) + fittarget_result <- glmFit(X %>% rename(count = target), list(count = covariates$target), + family = target_family, zeroinfl = F, maxit = 100 + ) + } + print(c("Fit to target: ", fittarget_result$betas)) + + # Fit model without target variable + if (is.null(notarget_result)) { + notarget_result <- glmFit(data.df, covariates, family = family, zeroinfl = zeroinfl, maxit = maxit) + } + print(c("No target: ", notarget_result$betas)) + + # Get fixed starting parameters from model fit without target + if (zeroinfl) { + start <- list(count = c(notarget_result$betas, rnorm(1)), zero = notarget_result$betas_zero) + offset <- list( + count = as.numeric(as.vector(notarget_result$betas) %*% t(data.matrix(data.df[covariates$count]))), + zero = as.numeric(as.vector(notarget_result$betas_zero) %*% t(data.matrix(data.df[covariates$zero]))) + ) + sample_weights <- 1 - expit(offset$zero) + } else { + start <- list(count = c(notarget_result$betas, rnorm(1))) + offset <- list(count = as.numeric(as.vector(notarget_result$betas) %*% t(data.matrix(data.df[covariates$count])))) # log(notarget_result$offset) + sample_weights <- 1 + } + if (family == "negbin") { + start$theta <- notarget_result$theta + } + + # Fit model with target variable and hot start + covariates$count <- c(covariates$count, target) + print(start) + alt_result <- glmFit(data.df, covariates, family = family, zeroinfl = zeroinfl, maxit = maxit, start = start) + # return(alt_result) + print(c("Alt result: ", alt_result$betas)) + z_star <- scoreTestZ(data.df[target], data.df$count, exp(offset$count), + notarget_result$theta, + sample_weights = sample_weights, family = family + ) + + # Generate resample-with-replacement of target parameter + target_resampled <- paste0(target, "_resampled") + covariates$count <- c(covariates$count[covariates$count != target], target_resampled) + # Run resample and fit lots of times + z_null <- c() + # pb <- txtProgressBar(min = 0, max = nulliter, style = 3) + for (i in c(1:nulliter)) { + # Resampling PERMUTATION or distilled-CRT + if (is.null(covariates$target)) { + data.df[target_resampled] <- sample(X[, target], size = length(y), replace = FALSE) + } else { + data.df[target_resampled] <- drawFromGLM(X, covariates$target, fittarget_result, family = target_family) + } + + # Get null z value from score test method + z_null <- c(z_null, scoreTestZ(data.df[target_resampled], data.df$count, exp(offset$count), + notarget_result$theta, + sample_weights = sample_weights, family = family + )) + } + + # Get p-value target parameter + pv_score <- nullSamplePV(z_null, z_star) + + return(append( + alt_result, + list( + z_star = z_star, z_null = z_null, pv_score = pv_score, + null_loglike = notarget_result$loglike + ) + )) 
+} + +#' Draw sample from GLM +#' +#' Draw sample from GLM with given parameters +#' @param X=data.frame. Dataframe of covariates for model. +#' @param covariates=vector of str. Column headers of covariates used for model. +#' @param parameters=list(betas=vector, sigma?). Parameters of GLM fit. +#' @param family = str. GLM distribution family used. Currently binomial and gaussian implemented. +#' @return vector of same length as X. Samples from the model. +drawFromGLM <- function(X, covariates, parameters, family = "X") { + K <- as.matrix(X[covariates]) %*% as.vector(parameters$betas) + if (family == "binomial") { + mu <- expit(K) + sample <- rbinom(dim(X)[1], 1, mu) + } else if (family == "gaussian") { + sample <- rnorm(dim(X)[1], K, parameters$sigma) + } else { + print(paste0("No family ", family, " for scoreTestZ")) + } + + return(sample) +} + +#' Estimate p-value from null samples and fitted coefficient. +#' +#' Estimate p-value from null samples and fitted coefficient. +#' @param null_samples=vector. Vector of z-scores of null fits. +#' @param fitted_value=float. Value of z-score for true run. +#' @param model = str. Null model to be used. ('chisquare' (D), 'skewt'). Only chisquare currently implemented +#' @return float. P-value of fitted value against null samples under null model. +#' @examples +#' null_samples <- rnorm(100) +#' fitted_value <- 5 +#' pvalue <- nullSamplePV(null_samples, fitted_value, model = "chisquare") +nullSamplePV <- function(null_samples, fitted_value, model = "chisquare") { + if (model == "chisquare") { + var <- sd(null_samples)^2 + pv <- pchisq((fitted_value - mean(null_samples))^2 / var, df = 1, lower.tail = F) + } else { + stop("Unknown model for null sample passed to nullSamplePV") + } + + return(pv) +} + +#' Calculate expit (aka sigmoid) of x +#' +#' Calculate expit (aka sigmoid) of x +#' @param x = numeric type. Value or array of values. +#' @return p. expit(x). 
+#' @examples
+#' x <- c(-10, -0.1, 0., 0.1, 10)
+#' p <- expit(x)
expit <- function(x) {
+  p <- 1 / (1 + exp(-x))
+  return(p)
+}
+
+#' Estimate z-score used in score test
+#'
+#' Estimate z-score used in score test for negative binomial or poisson distributions
+#' @param X: values of the target variable being score-tested
+#' @param count: observed counts (the dependent variable)
+#' @param offset: predicted values from NB fit to covariates
+#' @param theta: theta parameter from NB fit
+#' @param sample_weights: weights for samples in likelihood function
+#' @param family="X": distribution family, 'negbin' or 'poisson'
+#' @return z score estimate
+#' @examples
+#' z <- scoreTestZ(X = c(0, 1, 1, 0), count = c(2, 5, 7, 1), offset = rep(3, 4), theta = 1, family = "negbin")
scoreTestZ <- function(X, count, offset, theta = 1, sample_weights = 1, family = "X") {
+  if (family == "negbin") {
+    dlogL <- sum(sample_weights * X * (count - offset * (count + theta) / (offset + theta)))
+    fisherI <- sum(sample_weights * X^2 * offset / (offset + theta))
+  } else if (family == "poisson") {
+    dlogL <- sum(sample_weights * X * (count - offset))
+    fisherI <- sum(sample_weights * X^2 * offset)
+  } else {
+    print(paste0("No family ", family, " for scoreTestZ"))
+  }
+
+  return(dlogL / sqrt(fisherI))
+}
diff --git a/src/pipelines/signatures/signatureAssociations.nf/scripts/regression_functions.r b/src/pipelines/signatures/signatureAssociations.nf/scripts/regression_functions.r
new file mode 100755
index 0000000..9da3422
--- /dev/null
+++ b/src/pipelines/signatures/signatureAssociations.nf/scripts/regression_functions.r
@@ -0,0 +1,579 @@
+library(tidyverse)
+library(MASS)
+# library(stats)
+
+#' Fit negative binomial with MASS::glm.nb
+#'
+#' Fit negative binomial with MASS::glm.nb
+#' @param data.df = tibble. DataFrame including count column and column for each covariate.
+#' @param covariates = vector of strings. Columns in data.df to be used as covariates.
+#' @param start = vector (of length covariates) or NULL. Starting values for covariate betas.
+#' @param maxit = int. Maximum number of iterations when fitting glm.
+#' @param offset = NULL,vector of int. Sample offsets to be used (for fixing parameter values - offset=exp(X beta)).
+#' @return list. Output from glm. Includes betas, residuals, beta covariances, NB size parameter (theta) and offsets
+#' @examples
+#' data.df <- tibble(intercept=rep(c(1), 10), x1=c(1:10), count=c(21:30))
+#' covariates <- c('intercept','x1')
+#' results <- negBinGLM(data.df, covariates)
+negBinGLM <- function(data.df, covariates, start=NULL, maxit=100, offset=NULL, sample_weights=NULL) {
+
+    # If starting parameters provided
+    if (! is.null(start)) {
+        start.theta <- start$theta
+        start.count <- start$count
+    } else {
+        start.theta <- 1
+        start.count <- NULL
+    }
+
+    # If there is an offset column, add to formula
+    if (! is.null(offset)) {
+        data.df <- data.df %>% mutate(offset=offset)
+        formula <- "count ~ 0 + offset(offset) + "
+    } else {
+        formula <- "count ~ 0 + "
+    }
+
+    formula <- as.formula( paste0( formula, paste(covariates$count, collapse=" + ") ) )
+
+    fit_nb <- MASS::glm.nb( formula, data=data.df,
+                            start=start.count, init.theta=start.theta,
+                            maxit=maxit, weights=sample_weights )
+
+    return(list(betas=fit_nb$coefficients,
+                residual=data.df$count-as.numeric(fit_nb$fitted.values),
+                covar=vcov(fit_nb),
+                theta=fit_nb$theta,
+                offset=fit_nb$fitted.values,
+                loglike=fit_nb$twologlik/2,
+                model='negbin'))
+
+}
+
+#' Fit GLM with stats::glm
+#'
+#' Fit GLM with stats::glm
+#' @param data.df = tibble. DataFrame including count column and column for each covariate.
+#' @param covariates = vector of strings. 
Columns in data.df to be used as covariates. +#' @param family='negbin' = str. GLM family to be used. +#' @param start = vector (of length covariates) or NULL. Starting values for covariate betas. +#' @param maxit = int. Maximum number of iterations when fitting glm. +#' @param offset = NULL,vector of int. Sample offsets to be used (for fixing parameter values - offset=exp(X beta)). +#' @return list. Output from glm. Includes betas, residuals, beta covariances, NB size parameter (theta) and offsets +#' @examples +#' data.df <- tibble(intercept=rep(c(1), 10), x1=c(1:10), count=c(21:30)) +#' covariates <- c('intercept','x1') +#' results <- negBinGLM(data.df, covariates) +genLinMod <- function(data.df, covariates, family='X', start=NULL, maxit=100, offset=NULL, sample_weights=NULL) { + + # For binomial regression with n>1 - need to give a success and fail count column + if (str_detect(family, regex("[a-z]+[0-9]+"))) { + glm_family <- str_match(family, regex("([a-z]+)([0-9]+)"))[[2]] + n <- strtoi(str_match(family, regex("([a-z]+)([0-9]+)"))[[3]]) + data.df$countFail <- n-data.df$count + formula <- "cbind(count,countFail)" + } else { + glm_family <- family + formula <- "count" + } + + # If there is an offset column, add to formula + if (! is.null(offset)) { + data.df <- data.df %>% mutate(offset=offset) + formula <- paste(formula, "0 + offset(offset)", sep=" ~ ") + } else { + formula <- paste(formula, "0", sep=" ~ ") + } + + formula <- as.formula( paste(c(formula, covariates$count), collapse=" + ") ) + + fit_glm <- stats::glm( formula, data=data.df, family=glm_family, + start=start$count, + maxit=maxit, weights=sample_weights ) + + residual <- data.df$count-as.numeric(fit_glm$fitted.values) + # Estimate standard error from Gaussian residuals + # Only meaningful in Gaussian models? + sigma <- sqrt(sum(residual^2)/(length(residual)-length(covariates$count))) + + return(list(betas=fit_glm$coefficients, + residual=residual, + sigma=sigma, + covar=vcov(fit_glm), + offset=fit_glm$fitted.values, + loglike=logLik(fit_glm), + model=family)) + +} + +#' Fit zero inflated GLM with pscl::zeroinfl +#' +#' Fit zero inflated GLM with pscl::zeroinfl +#' @param data.df = tibble. DataFrame including count column and column for each covarites. +#' @param covariates = list of vectors with count and zero entries. Columns in data.df to be used as covariates. +#' @param family='negbin' = str. GLM family to be used. +#' @param start = list or NULL. Starting values for parameters with count, zero and theta entries. +#' @param maxit = int. Maximum number of iterations when fitting glm. +#' @param offset = NULL or list of count:vector, zero:vector. +#' Sample offsets to be used (for fixing parameter values - offset=exp(X beta)). +#' @return list. Output from glm. 
Includes betas, residuals, beta covariances, NB size parameter (theta) and offsets +#' @examples +#' data.df <- tibble(intercept=rep(c(1), 10), x1=c(1:10), count=c(21:30)) +#' covariates <- c('intercept','x1') +#' results <- negBinGLM(data.df, covariates, head(covariates,1)) +zeroInflGLM <- function(data.df, covariates, family='X', + start=NULL, maxit=100, offset=NULL, sample_weights=NULL) { + + if (length(covariates$zero)==0) { + if (family=='negbin') { + return( negBinGLM(data.df, covariates, start=start, + maxit=maxit, offset=offset$count, sample_weights=sample_weights) ) + } else { + return( genLinMod(data.df, covariates, start=start, family=family, + maxit=maxit, offset=offset$count, sample_weights=sample_weights) ) + } + } + + # If there is an offset column, add to formula + if (! is.null(offset)) { + data.df <- data.df %>% mutate(offset_count=offset$count, offset_zero=offset$zero) + formula_count <- "count ~ 0 + offset(offset_count)" + formula_zero <- "0 + offset(offset_zero)" + } else { + formula_count <- "count ~ 0" + formula_zero <- "0" + } + + formula <- as.formula( paste0( paste(c(formula_count, covariates$count), collapse=" + "), " | ", + paste(c(formula_zero, covariates$zero), collapse=" + ") ) ) + + fit_nb <- pscl::zeroinfl( formula, data=data.df, + dist=family, link="logit", start=start, + maxit=maxit, reltol=1e-12, weights=sample_weights) + + return(list(betas=fit_nb$coefficients$count, + betas_zero=fit_nb$coefficients$zero, + residual=data.df$count-as.numeric(fit_nb$fitted.values), + covar=vcov(fit_nb), + theta=fit_nb$theta, + offset=fit_nb$fitted.values, + loglike=fit_nb$loglik, + model=family)) + +} + +#' Fit GLM with stats::glm +#' +#' Fit GLM with stats::glm +#' @param data.df = tibble. DataFrame including count column and column for each covarites. +#' @param covariates = vector of strings. Columns in data.df to be used as covariates. +#' @param family='poisson' = str. GLM family to be used. +#' @param zeroinfl=Bool. Use zero inflated GLM. +#' @param start = vector (of length covariates) or NULL. Starting values for covariate betas. +#' @param maxit = int. Maximum number of iterations when fitting glm. +#' @param offset = NULL,vector of int. Sample offsets to be used (for fixing parameter values - offset=exp(X beta)). +#' @return list. Output from glm. Includes betas, offsets and beta covariances. +#' @examples +#' data.df <- tibble(intercept=rep(c(1), 10), x1=c(1:10), count=c(21:30)) +#' covariates <- c('intercept','x1') +#' results <- glmFit(data.df, covariates, family='poisson') +glmFit <- function(data.df, covariates, family='X', zeroinfl=F, + start=NULL, maxit=100, offset=NULL, sample_weights=NULL) { + + if (family=='negbin' & ! zeroinfl) { + results <- negBinGLM(data.df, covariates, start=start, offset=offset, sample_weights=sample_weights) + } else if (! zeroinfl ) { + results <- genLinMod(data.df, covariates, family=family, start=start, offset=offset, sample_weights=sample_weights) + } else { + results <- zeroInflGLM(data.df, covariates, family=family, start=start, offset=offset, sample_weights=sample_weights) + } + + return(results) + +} + +#' Fit GLM +#' +#' Fit GLM to counts and covariates +#' @param X = tibble. DataFrame of covariates. +#' @param y = vector of int >= 0. Target variable (count data) +#' @param covariates = list of vector of strings with entries count, zero (if zeroinfl model). +#' Columns in data.df to be used as covariates. +#' @param family='X' = str. GLM family to be used. +#' @param zeroinfl=Bool. Use zero inflated GLM. 
+#' @param start = list of vectors (count=, zero=) or NULL. Starting values for covariate betas.
+#' @param maxit = int. Maximum number of iterations when fitting glm.
+#' @param offset = NULL or numeric vector. Sample offsets to be used (for fixing parameter values - offset=exp(X beta)).
+#' @return list. Output from glm. Includes betas, offsets, beta covariances and other parameters.
+#' @examples
+#' X <- tibble(intercept=rep(c(1), 10), x1=c(1:10))
+#' y <- c(21:30)
+#' results <- runGlmFit(X, y)
+runGlmFit <- function(X, y, covariates=NULL, family="X", zeroinfl=F, start=NULL,
+                      maxit=100, offset=NULL, sample_weights=NULL) {

+  # Default: use every column of X for both the count and the zero model
+  if (is.null(covariates)) {
+    covariates <- list(count=colnames(X), zero=colnames(X))
+  }
+  data.df <- X %>% mutate(count=y)
+
+  results <- glmFit(data.df, covariates, family=family, zeroinfl=zeroinfl,
+                    start=start, maxit=maxit, offset=offset, sample_weights=sample_weights)
+
+  return(results)
+
+}
+
+#' Fit GLM(s) to the target variable; a comma-separated family string fits one
+#' GLM per component column (target_0, target_1, ...), with the first entry
+#' describing the combined target.
+targetFit <- function(X, covariates, target,
+                      family=NULL, zeroinfl=F, maxit=100) {
+
+  # Target may be split into multiple components
+  fittarget <- list()
+  # Split family into constituent parts if they exist
+  families <- str_split(family, ",")[[1]]
+  if ( length(families)==1 ) {
+    # Only one target family
+    fittarget <- glmFit(X %>% rename(count=target), covariates,
+                        family=family, zeroinfl=F, maxit=100)
+  } else {
+    # Multiple target families: component i-2 uses column <target>_<i-2> and family families[i]
+    for (i in c(2:length(families))) {
+      fittarget[[i-1]] <- glmFit(X %>% rename(count=paste(target, i-2, sep="_")), covariates,
+                                 family=families[i], zeroinfl=F, maxit=100)
+      print(c(i, fittarget[[i-1]]$betas))
+    }
+  }
+
+  return(fittarget)
+
+}
+
+#' Fit GLM with variable resampling
+#'
+#' Fit GLM to counts and covariates with resampling of the target variable.
+#' Resampling is performed with replacement.
+#' @param X = tibble. DataFrame of covariates.
+#' @param y = vector of int >= 0. Target variable (count data)
+#' @param target = str. Column name of variable which is being analysed.
+#' @param covariates = NULL or list of vectors (count, zero, target entries). Columns of X used in each model component.
+#' @param nulliter=100 = int. Number of resamples to perform for null distribution.
+#' @param family='negbin' = str. GLM family to be used.
+#' @param target_family = str. GLM family used when resampling the target.
+#' @param zeroinfl=Bool. Use zero inflated GLM.
+#' @param maxit = int. Maximum number of iterations when fitting glm.
+#' @param use_offsets = Bool. Whether to fix non-target parameters when resampling --> faster but less accurate?
+#' @param notarget_result=NULL. Optional pre-computed fit without the target variable.
+#' @param fittarget_result=NULL. Optional pre-computed fit of the target on its covariates.
+#' @param alt_result=NULL. Optional pre-computed fit including the target variable.
+#' @param resampling='dCRT', str. 'dCRT' (distilled CRT), 'bootstrap' or 'permutation'
+#'        --> dCRT defaults to permutation if there are no target covariates.
+#' @return list. Output from glm. Includes betas, offsets, beta covariances and other parameters.
+#' @examples
+#' X <- tibble(intercept=rep(c(1), 10), x1=c(1:10))
+#' y <- c(21:30)
+#' results <- resampleGLM(X, y, 'x1', nulliter=100, family='negbin')
+resampleGLM <- function(X, y, target, covariates=NULL, nulliter=100,
+                        family='X', target_family='X',
+                        zeroinfl=F, maxit=100, use_offsets=F, sample_weights=NULL,
+                        notarget_result=NULL, fittarget_result=NULL, alt_result=NULL,
+                        resampling='dCRT') {
+
+  # Covariates excluding the target parameter
+  if (is.null(covariates) ) {
+    covariates <- list( count=c(colnames(X)[colnames(X)!=target]) )
+    if (zeroinfl) { covariates$zero <- covariates$count }
+  }
+  data.df <- X %>% mutate(count=y)
+
+  # Fit resampling model
+  if ( (is.null(fittarget_result)) & (!
is.null(covariates$target)) ) { + fittarget_result <- targetFit(X, list(count=covariates$target), target, + family=target_family, zeroinfl=F, maxit=100) + } + # print(c("Fit to target: ", fittarget_result$betas)) + + # Fit model without target variable + if (is.null(notarget_result) ) { + notarget_result <- glmFit(data.df, covariates, family=family, zeroinfl=zeroinfl, maxit=maxit) + } + # print(c("No target: ", notarget_result$betas)) + + # Get fixed starting parameters from model fit without target + if (zeroinfl) { + start = list(count=c(notarget_result$betas, rnorm(1,0,0)), zero=notarget_result$betas_zero) + offset = list(count=as.numeric( as.vector(notarget_result$betas) %*% t(data.matrix(data.df[covariates$count]))), + zero=as.numeric( as.vector(notarget_result$betas_zero) %*% t(data.matrix(data.df[covariates$zero])) )) + sample_weights <- 1-expit(offset$zero) + } else { + start = list(count=c(notarget_result$betas, rnorm(1,0,0))) + offset = list(count=as.numeric( as.vector(notarget_result$betas) %*% t(data.matrix(data.df[covariates$count]))))#log(notarget_result$offset) + sample_weights <- rep(1,length(offset$count)) + } + if (family=='negbin') { start$theta = notarget_result$theta } + + # Fit model with target variable and hot start + covariates$count <- c(covariates$count, target) + if ( is.null(alt_result) ) { + alt_result <- glmFit(data.df, covariates, family=family, zeroinfl=zeroinfl, maxit=maxit, start=start) + } + + # # Stratified resample where targets not well distributed from model + strat_weights <- 1 + + # print(c("Alt result: ", alt_result$betas)) + z_star <- scoreTestZ(data.df[target], data.df$count, offset$count, + notarget_result$theta, sample_weights=sample_weights*strat_weights, family=family) + # print(table(data.df[target])) + counts <- data.frame(index=0) + + # Generate resample-with-replacement of target parameter + target_resampled <- paste0(target, "_resampled") + covariates$count <- c(covariates$count[covariates$count!=target], target_resampled) + # Run resample and fit lots of times + z_null = c() + # print(head(data.df)) + for (i in c(1:nulliter)) { + + if (resampling=="bootstrap") { + # Bootstrap + resample_idx <- sample(c(1:nrow(data.df)), size=nrow(data.df), replace=T) + # Get null z value from score test method + z_null <- c(z_null, scoreTestZ(data.df[resample_idx,][target], + data.df[resample_idx,]$count, + offset$count[resample_idx], + notarget_result$theta, + sample_weights=sample_weights[resample_idx], family=family)) + next + + } else if ( resampling=='dCRT' & is.null(covariates$target)) { + # Permutation + data.df[target_resampled] <- sample(data.df[,target], size = length(y), replace = FALSE) + } else if ( resampling=='dCRT' ) { + # Distilled CRT + data.df[target_resampled] <- drawTarget(data.df, covariates$target, fittarget_result, family=target_family) + # print(table(data.df[target_resampled])) + # counts <- rbind(counts, pivot_wider(data.frame(table(data.df[target_resampled])), names_from="Var1", values_from="Freq")) + if (target_family!="gaussian") { + # print(counts) + # print(head(data.frame(table(data.df[target_resampled])))) + counts <- merge(counts, data.frame(table(data.df[target_resampled])), all=T, by.x="index", by.y="Var1") + colnames(counts)[i+1] <- i + # print(c("Resampled", table(data.df[target_resampled]))) + } + } else { + stop(paste0("resampling method, ", resampling,", unknown")) + } + + # Get null z value from score test method + z_null <- c(z_null, scoreTestZ(data.df[target_resampled], data.df$count, offset$count, + 
notarget_result$theta, sample_weights=sample_weights, family=family))
+  }
+
+  if ( resampling=='dCRT' & !is.null(covariates$target) & !(target_family=='gaussian') ) {
+    counts[is.na(counts)] <- 0
+    means <- rowMeans(counts[,2:ncol(counts)])
+    stds <- sqrt(rowSums((counts[,2:ncol(counts)]-means)^2)/(ncol(counts)-1))
+    print(c("idx", counts$index))
+    print(c("mu", round(means,2)))
+    print(c("sd", round(stds,2)))
+  }
+
+  # Get p-value of target parameter
+  if (resampling=="bootstrap") {z_null <- z_null-mean(z_null)}
+  pv_score <- nullSamplePV(na.omit(z_null), z_star)
+
+  return(append(alt_result,
+                list(z_star=z_star, z_null=z_null, pv_score=pv_score,
+                     null_loglike=notarget_result$loglike)))
+
+}
+
+
+#' Draw resampled target values from the fitted target model; comma-separated
+#' families draw each component separately and combine them.
+drawTarget <- function(X, covariates, parameter_sets, family="X") {
+
+  if (length(str_split(family, ",")[[1]])==1) {
+    sample <- drawFromGLM(X, covariates, parameter_sets, family=family)
+  } else {
+    sample <- integer(nrow(X))
+    for (i in c(2:length(str_split(family, ",")[[1]]))) {
+      sample <- cbind(sample, drawFromGLM(X, covariates, parameter_sets[[i-1]], family=parameter_sets[[i-1]]$model))
+      # print(c(i, table(sample[,i])))
+    }
+    if (str_detect(str_split(family, ",")[[1]][1], regex("[a-z]+[0-9]+"))) {
+      n <- strtoi(str_match(str_split(family, ",")[[1]][1], regex("[a-z]+([0-9]+)"))[[2]])
+      sample <- rowSums(sample)
+      sample[sample>n] <- n
+    } else {
+      print(paste0("No family ", family, " for multiple resampled GLM"))
+    }
+  }
+
+  return(sample)
+
+}
+
+#' Draw sample from GLM
+#'
+#' Draw sample from GLM with given parameters
+#' @param X=data.frame. Dataframe of covariates for model.
+#' @param covariates=vector of str. Column headers of covariates used for model.
+#' @param parameters=list(betas=vector, sigma?). Parameters of GLM fit.
+#' @param family = str. GLM distribution family used. Currently binomial, binomialN, quasibinomialN, poisson and gaussian implemented.
+#' @return vector of same length as X. Samples from the model.
+drawFromGLM <- function(X, covariates, parameters, family="X") {
+
+  K <- as.matrix(X[covariates]) %*% as.vector(parameters$betas)
+  if (family=='binomial') {
+    mu <- expit(K)
+    sample <- rbinom(dim(X)[1], 1, mu)
+  } else if (str_detect(family, regex("^binomial[0-9]+"))) {
+    # Anchored so that e.g. "quasibinomial12" is not caught by this branch
+    n <- strtoi(str_match(family, regex("binomial([0-9]+)"))[[2]])
+    mu <- expit(K)
+    sample <- rbinom(dim(X)[1], n, mu)
+  } else if (str_detect(family, regex("quasibinomial[0-9]+"))) {
+    n <- strtoi(str_match(family, regex("quasibinomial([0-9]+)"))[[2]])
+    # Overdispersion: draw one beta vector per sample from the fitted beta
+    # covariance, then sample binomially with per-sample probabilities
+    N <- nrow(X)
+    beta_draw <- matrix(parameters$betas, nrow=N, ncol=length(parameters$betas), byrow=TRUE) +
+                 matrix(rnorm(N*length(parameters$betas)), nrow=N) %*% chol(parameters$covar)
+    K <- rowSums(as.matrix(X[covariates]) * beta_draw)
+    mu <- expit(K)
+    sample <- rbinom(dim(X)[1], n, mu)
+  } else if (family=='poisson') {
+    mu <- exp(K)
+    sample <- rpois(dim(X)[1], mu)
+  } else if (family=='gaussian') {
+    sample <- rnorm(dim(X)[1], K, parameters$sigma)
+  } else {
+    print(paste0("No family ", family, " for GLM draw"))
+  }
+
+  return(sample)
+
+}
+
+#' Estimate p-value from null samples and fitted coefficient.
+#'
+#' Estimate p-value from null samples and fitted coefficient.
+#' @param null_samples=vector. Vector of z-scores of null fits.
+#' @param fitted_value=float. Value of z-score for true run.
+#' @param model = str. Null model to be used. ('chisquare' (D), 'skewt'). Only chisquare currently implemented
+#' @return float. P-value of fitted value against null samples under null model.
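+#' @details Worked example of the 'chisquare' model: if the null z-scores have
+#'   mean m and variance v, then pv = P[ chisq(df=1) >= (fitted_value - m)^2 / v ],
+#'   i.e. the null z-scores are treated as Gaussian and the squared standardised
+#'   statistic is referred to a chi-square distribution with one degree of freedom.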
+#' @examples +#' null_samples <- rnorm(100) +#' fitted_value <- 5 +#' pvalue <- nullSamplePV(null_samples, fitted_value, model='chisquare') +nullSamplePV <- function(null_samples, fitted_value, model='chisquare') { + + if (model=='chisquare') { + normal_var = sd(null_samples)^2 + normal_mean = mean(null_samples) + pv = pchisq((fitted_value-normal_mean)^2/normal_var, df=1, lower.tail=F) + } else { + stop('Unknown model for null sample passed to nullSamplePV') + } + + return(pv) + +} + +#' Calculate expit (aka sigmoid) of x +#' +#' Calculate expit (aka sigmoid) of x +#' @param x = numeric type. Value or array of values. +#' @return p. expit(x). +#' @examples +#' x=c(-10,-0.1,0.,0.1,10) +#' p <- expit(x) +expit <- function(x) { + p <- 1/(1+exp(-x)) + return(p) +} + +#' Estimate z-score used in score test +#' +#' Estimate z-score used in score test for negative binomial or poisson distributions +#' @param count: y predicted value from negative binomial draw +#' @param offset: predicted values from NB fit to covariates +#' @param theta: theta parameter from NB fit +#' @param sample_weights: weights for samples in likelihood function +#' @param family="X": +#' @return z score estimate +#' @examples +#' x=c(-10,-0.1,0.,0.1,10) +#' p <- expit(x) +scoreTestZ <- function(X, count, offset, theta=1, sample_weights=1, family="X") { + + if (family=="negbin") { + mu <- exp(offset) + dlogL <- sum( sample_weights * X * ( count - mu*(count+theta)/(mu+theta) ) ) + fisherI <- sum( sample_weights * X^2 * mu/(mu+theta) ) + } else if (family=="poisson") { + mu <- exp(offset) + dlogL <- sum( sample_weights * X * ( count - mu ) ) + fisherI <- sum( sample_weights * X^2 * mu ) + } else if (str_detect(family, regex("binomial[0-9]*"))) { + if (str_detect(family, regex("binomial[0-9]+"))) { + # Binomial with n>1 + n <- strtoi(str_match(family, regex("binomial([0-9]+)"))[[2]]) + } else { n <- 1 } + p <- expit(offset) + dlogL <- sum( sample_weights * X * ( count - n*p ) ) + fisherI <- sum( sample_weights * X^2 * p*(1-p) * n ) + } else { + print(paste0("No family - ", family, " - for scoreTestZ")) + } + + return(dlogL/sqrt(fisherI)) + +} + + +#' Stratified resampling from binomial distribution +#' +#' Perform stratified resampling from binomial distribution +#' @param k = vector_int(N): value of k for all samples +#' @param p = vector_flt(N): binomial model probability for each sample +#' @param n = int: number of binomial trials +#' @param breaks = seq(0,1,0.1): stratification bins used for p +#' @return subsample = vector_int(N): resampled indices +binomialStratifiedResample <- function(k, p, n, breaks=seq(0,1,0.1), output="samples") { + + # Possible values of k + k_unique <- seq(0,n,1) + + # Get k probabilities + p_matrix <- t(replicate(n+1,p)) + p_matrix <- (p_matrix^k_unique * (1-p_matrix)^(n-k_unique)) * choose(n,k_unique) + + # Get number of samples with k=k and p in bin + breaks <- seq(0,1,0.1) + break_idx <- findInterval(p, breaks) + # break_idx <- ecdf(unique(breaks))(break_idx+0.1) + # Get ID of k in bins (equivalent to k+1) + k_idx <- findInterval(k, k_unique-0.5) + k_break_matrix <- table(break_idx, k_idx) + + # Get expected number of samples with k=k and p in bin + expected_matrix <- rowsum(t(p_matrix), break_idx) + + # Get weights for given bin and k + weights <- expected_matrix[,unique(k_idx)]/k_break_matrix + weights[which(!is.finite(weights))]=0 + sample_probs <- weights/max(weights) + # print(c("k_break_matrix", k_break_matrix)) + # print(c("expected_matrix", expected_matrix)) + # 
print(c("sample_probs", sample_probs)) + # Get unique positions of break_idx + break_idx <- match(break_idx, sort(unique(break_idx))) + sample_probs <- sample_probs[cbind(break_idx, k_idx)] + + # Get resamples - with or without replacement? + # print(sample_probs) + if (output=="samples") { + # subsample <- c(1:length(sample_probs))[rbinom(length(sample_probs),1,sample_probs)==1] + subsample <- sample(c(1:length(sample_probs)), prob=sample_probs, replace = T) + # print(c("k", table(k))) + # print(c("new_k", table(k[subsample]))) + return(sort(subsample)) + } else if (output=="weights") { + weights <- sample_probs/mean(sample_probs) + # print(weights[1:10]) + return(weights) + } + +} diff --git a/src/pipelines/signatures/signatureAssociations.nf/scripts/signatures.py b/src/pipelines/signatures/signatureAssociations.nf/scripts/signatures.py new file mode 100755 index 0000000..7834284 --- /dev/null +++ b/src/pipelines/signatures/signatureAssociations.nf/scripts/signatures.py @@ -0,0 +1,55 @@ +# Import standard modules +print("Importing packages") +import sys, os, numpy as np, pandas as pd +from multiprocessing import Pool +import re + +verbose=True +def print_verbose(message): + if verbose: print(message) + +new_path = '/'.join(os.path.abspath(__file__).split('/')[:-2] + ['python']) +print_verbose(new_path) +sys.path.append(new_path) + + +if __name__=='__main__': + + # runname to go in filename + activities_file = sys.argv[1] + + # Import signature activities + sig_df = pd.read_csv(activities_file, sep="\t") + + sig_df['Samples'] = sig_df['Samples'].map(lambda x: "tumo"+x.split("_")[1]+"_"+x.split("_")[2]+"_norm"+x.split("_")[3]+"_"+x.split("_")[4]) + sig_df.set_index('Samples', inplace=True) + + for key in sig_df.keys(): + sig_df[key] = sig_df[key].astype(float) + sig_df[key][np.isnan(sig_df[key])] = 0 + + sig_df[key] = np.round(sig_df[key]).astype(int) + + sig_df['TSMC'] = np.sum(np.array(sig_df), axis=1) + + try: + # Add new signatures + sig_df['SBSapo'] = sig_df.SBS2 + sig_df.SBS13 + + # Add together signatures which have been sub-split + signatures = sig_df.keys() + split_signatures = np.unique([ + re.search("(SBS[0-9]+?)[a-z]+", sig).group(1) for sig in signatures if re.search("SBS([0-9]+)[a-z]", sig) is not None + ]) + for split_sig in split_signatures: + sig_df[split_sig] = np.zeros(len(sig_df)) + for sig in signatures: + if re.search("SBS([0-9]+)[a-z]", sig) is not None: + split_sig = re.search("(SBS[0-9]+?)[a-z]+", sig).group(1) + sig_df[split_sig] += sig_df[sig] + except AttributeError: + sig_df['TSMC'] = np.sum(np.array(sig_df), axis=1) + print(sig_df.head()) + + + sig_df.to_csv("activities_extended.tsv", index_label=False, header=True, sep="\t") diff --git a/src/pipelines/signatures/signatureAssociations.nf/scripts/target_regress.r b/src/pipelines/signatures/signatureAssociations.nf/scripts/target_regress.r new file mode 100755 index 0000000..6d38620 --- /dev/null +++ b/src/pipelines/signatures/signatureAssociations.nf/scripts/target_regress.r @@ -0,0 +1,84 @@ +suppressMessages(library(tidyverse)) +suppressMessages(library(ggplot2, viridis, glmnet)) + +# require(pscl) +require(MASS) +require(boot) +library(pscl) + + +args = commandArgs(trailingOnly=TRUE) +base_dir = args[1] +test_file = args[2] +covariate_file = args[3] +target_file = args[4] +target_variable = args[5] +target_type = args[6] + +# source(paste(base_dir, "logistic_functions.r", sep="/")) +source(paste(base_dir, "logistic_regress.r", sep="/")) +source(paste(base_dir, "regression_functions.r", sep="/")) + +# Set of 
tests to run
+test_df <- read_delim(test_file, delim="\t")
+print(test_df[1:10,])
+
+# Load in samples
+sample_df <- read_delim(covariate_file, delim="\t")
+covariates <- colnames(sample_df)[! colnames(sample_df) %in% c("sample_id", "group")]
+# covariates <- c("log_age", "is_female", "pc1", "pc2", "pc3")
+print(c("Covariates: ", covariates))
+
+
+# Load in target variables
+# Trait file should be tab-separated with a "sample_id" column and a column for each trait
+target_df <- read_delim(target_file, delim="\t")
+
+# Join tables together; convert infinite values to NA
+sample_df <- inner_join(sample_df, target_df, by=c("sample_id"), suffix=c("", "_tgt")) %>%
+  mutate_if(is.numeric, ~ replace(., is.infinite(.), NA))
+
+
+# Iterate through tumour groups
+groups <- unique(test_df$group)
+for (grp in groups) {
+
+  # Get all variable targets for the given group
+  targets <- unique(test_df[test_df$group==grp,]$target)
+
+  print(c("Group: ", grp))
+  print(c("Target variable: ", target_variable))
+  subsample_df <- sample_df %>% filter(group==grp)
+
+  # Filter out targets with only one observed value
+  nonsingular <- c()
+  for (target in targets) { nonsingular <- c(nonsingular, nrow(unique(subsample_df[,target]))>1) }
+  targets <- targets[nonsingular]
+  print(c("Targets: ", targets))
+  if (length(targets)==0) { stop("No non-singular targets") }
+
+  print("Subsample info: ")
+  print(length(subsample_df))
+  subsample_df <- subsample_df %>% mutate(count=.data[[target_variable]]) %>% dplyr::select(count, targets, covariates) %>% filter(!is.na(count))
+  print(dim(subsample_df))
+  print(head(subsample_df))
+
+  if (dim(subsample_df)[1]>2){
+    results_df <- model_fit(data.frame(subsample_df), covariates, gsub("-", ".", targets), family=target_type, power.analysis=F)
+    results_df["group"] <- grp
+    results_df["dependent"] <- target_variable
+
+    full_results_df <- tryCatch( bind_rows(full_results_df, results_df),
+                                 error=function (e) {results_df} )
+  }
+
+}
+
+if (nrow(full_results_df)==0) { stop("Failed all runs!!!") }
+
+# Statistical tests
+full_results_df["wilks_pvalue"] <- pchisq( -2 * (full_results_df$loglike_null - full_results_df$loglike_alt), df=1, lower.tail=FALSE )
+
+print(head(full_results_df))
+
+# Save results
+write_delim(full_results_df %>% dplyr::select(group, dependent, target, wilks_pvalue, everything()), "target_results.csv", delim=",")
diff --git a/src/pipelines/signatures/signatureAssociations.nf/scripts/zinb_regress.r b/src/pipelines/signatures/signatureAssociations.nf/scripts/zinb_regress.r
new file mode 100755
index 0000000..a1faec7
--- /dev/null
+++ b/src/pipelines/signatures/signatureAssociations.nf/scripts/zinb_regress.r
@@ -0,0 +1,346 @@
+suppressMessages(library(tidyverse))
+suppressMessages(library(rhdf5))
+
+
+model_fit <- function(sample.df, covariate_list, targets, target_types, resampling_method, power.analysis = TRUE, zeroinfl = TRUE) {
+  # Set up model and backup for when model fails
+  family <- "negbin"
+  backup <- "poisson"
+  zeroinfl_model <- (sum(sample.df$count == 0) > 0) & (zeroinfl)
+  # target_families <- list(linear="gaussian", logistic="binomial")
+
+  # Renormalise covariate_list (and gaussian targets) to zero mean, unit variance
+  means <- c()
+  stds <- c()
+  for (key in c(covariate_list, targets[target_types == "gaussian"])) {
+    means <- c(means, mean(sample.df[, key]))
+    stds <- c(stds, sd(sample.df[, key]))
+    sample.df[key] <- (sample.df[, key] - mean(sample.df[, key])) / sd(sample.df[, key])
+  }
+
+  # Select covariate_list with non-zero variance
+  sample.df$intercept <- 1.0
+  covariates <- list(
count = c("intercept", covariate_list[stds[1:length(covariate_list)] > 0]), + zero = c("intercept", covariate_list[stds[1:length(covariate_list)] > 0]), + target = c("intercept", covariate_list[stds[1:length(covariate_list)] > 0]) + ) + + # Run GLM regression without target variable + print("No target") + print(covariates) + print(covariate_list) + print(stds[1:length(covariate_list) - 1]) + print(stds) + fit_notarget <- tryCatch(glmFit(sample.df, covariates, family = family, zeroinfl = zeroinfl_model), + error = function(e) { + glmFit(sample.df, covariates, family = backup, zeroinfl = zeroinfl_model) + } + ) + family <- fit_notarget$model + print("Done") + + # For each target, run GLM regression with target variable and null resampling + for (i in c(1:length(targets))) { + print(c(i, length(targets), " target: ", targets[i])) + + print("Removing nan target values - to make null fit fair (glmFit does this automatically when doing alt)") + print(head(sample.df)) + print(dim(sample.df)[1]) + sample_nona.df <- sample.df %>% filter(!is.na(get(targets[i]))) + print(head(sample_nona.df)) + print(dim(sample_nona.df)[1]) + if (dim(sample_nona.df)[1] != dim(sample.df)[1]) { + print("Fit nona") + fit_notarget_nona <- tryCatch(glmFit(sample_nona.df, covariates, family = family, zeroinfl = zeroinfl_model), + error = function(e) { + list( + loglike = -99, model = "NA", theta = -99, + betas = rep(-99, length(covariates$count) + 1), betas_zero = rep(-99, length(covariates$zero)), + covar = matrix(-99, ncol = length(covariates$count) + 1, nrow = length(covariates$count) + 1) + ) + } + ) + family_nona <- fit_notarget$model + } else { + fit_notarget_nona <- fit_notarget + family_nona <- family + } + + print(c("Resampling: ", resampling_method)) + fit_alt <- tryCatch( + resampleGLM(sample_nona.df %>% dplyr::select(-count), sample_nona.df$count, targets[i], + covariates = covariates, nulliter = 500, + family = family_nona, target_family = target_types[i], + zeroinfl = zeroinfl_model, notarget_result = fit_notarget_nona, resampling = resampling_method + ), + error = function(e) { + list( + pv_score = -99, loglike = -99, model = "NA", + betas = rep(-99, length(covariates$count) + 1), betas_zero = rep(-99, length(covariates$zero)), + covar = matrix(-99, ncol = length(covariates$count) + 1, nrow = length(covariates$count) + 1), + z_star = -99, z_null = c(-99, -99) + ) + } + ) + + print(c("Z ", fit_alt$z_star, mean(na.omit(fit_alt$z_null)), sd(na.omit(fit_alt$z_null)))) + + # print("Collecting results") + # print(c('ll', fit_notarget_nona['loglike'])) + # print(c('ll', fit_notarget_nona$loglike)) + wilks <- pchisq(-2 * (fit_notarget_nona$loglike - fit_alt$loglike), df = 1, lower.tail = FALSE) + print(list( + target = targets[i], resample_pvalue = fit_alt$pv_score, + theta_null = fit_notarget_nona$theta, theta_alt = fit_alt$theta, + loglike_null = fit_notarget_nona$loglike, loglike_alt = fit_alt$loglike, wilks = wilks, + model_null = fit_notarget_nona$model, model_alt = fit_alt$model, + sample_size = nrow(sample_nona.df), nz_size = sum(sample_nona.df$count > 0), + alt_params = fit_alt$betas + )) + results.df <- data.frame( + target = targets[i], resample_pvalue = fit_alt$pv_score, + loglike_null = fit_notarget_nona$loglike, loglike_alt = fit_alt$loglike, + model_null = fit_notarget_nona$model, model_alt = fit_alt$model, + sample_size = nrow(sample_nona.df), nz_size = sum(sample_nona.df$count > 0), + target_z_means_alt = fit_alt$z_star, target_z_means_null = mean(na.omit(fit_alt$z_null)), target_z_covs_null = 
sd(na.omit(fit_alt$z_null))^2
+    )
+    results.df$theta_null <- fit_notarget_nona$theta
+    results.df$theta_alt <- fit_alt$theta
+
+
+    # Count-model parameters
+    names <- c(covariates$count, "target")
+    for (j in c(1:length(names))) {
+      results.df[paste(names[j], "means", "alt", sep = "_")] <- fit_alt$betas[j]
+      results.df[paste(names[j], "covs", "alt", sep = "_")] <- diag(fit_alt$covar)[j]
+    }
+    for (j in c(1:length(covariates$count))) {
+      results.df[paste(covariates$count[j], "means", "null", sep = "_")] <- fit_notarget_nona$betas[j]
+      results.df[paste(covariates$count[j], "covs", "null", sep = "_")] <- diag(fit_notarget_nona$covar)[j]
+    }
+    for (j in c(1:length(covariate_list))) {
+      results.df[paste("Xmean", covariate_list[j], sep = "_")] <- means[j]
+      results.df[paste("Xsd", covariate_list[j], sep = "_")] <- stds[j]
+    }
+    for (j in c(1:length(covariates$zero))) {
+      results.df[paste(covariates$zero[j], "zero", "means", "alt", sep = "_")] <- fit_alt$betas_zero[j]
+      # results.df[paste(covariates$zero[j], "zero", "covs", "alt", sep="_")] <- diag(fit_alt$covar)[j]
+      results.df[paste(covariates$zero[j], "zero", "means", "null", sep = "_")] <- fit_notarget_nona$betas_zero[j]
+      # results.df[paste(covariates$zero[j], "zero", "covs", "null", sep="_")] <- diag(fit_notarget_nona$covar)[j]
+    }
+
+
+    ### Power Analysis
+    power.percentiles <- c(20, 50, 80)
+    if ((power.analysis) & !(is.na(fit_alt$pv_score)) & (fit_alt$pv_score != -99)) {
+      # Set parameters to known ones with target effect size of log(2)
+      # Resample from Zero-Inflated Negative Binomial N times from new parameters
+      # Apply resampleGLM method to simulated signature
+      # Fit beta distribution to set of p-values
+      effect.size <- log(2)
+      power.ntest <- 10
+
+      fit_target <- targetFit(sample.df %>% dplyr::select(-count), list(count = covariates$target), targets[i],
+        family = target_types[i], zeroinfl = F, maxit = 100
+      )
+
+
+      mu <- exp(as.matrix(sample.df[c(covariates$count, targets[i])]) %*% c(fit_alt$betas[1:length(covariates$count)], effect.size))
+      print(c("Input betas ", c(fit_alt$betas[1:length(covariates$count)], effect.size)))
+      if (zeroinfl_model) {
+        # mu zero is the expected probability of a sample having activity forced to zero
+        muzero <- 1 - expit(as.matrix(sample.df[covariates$zero]) %*% fit_alt$betas_zero)
+      }
+      pvalue_list <- c()
+      rpv_list <- c()
+      for (j in c(1:power.ntest)) {
+        # Reorder true counts by drawn counts
+        # counts <- sort(sample.df$count)[order(order( drawFromModel(mu, muzero, theta=fit_alt$theta, zeroinfl=zeroinfl_model, family=family) ))]
+        # counts <- drawFromModel(mu, muzero, theta=fit_alt$theta, zeroinfl=zeroinfl_model, family=family)
+        counts <- drawFromModel(mu, muzero, zeroinfl = zeroinfl_model, family = "poisson")
+        fit_power <- tryCatch(
+          resampleGLM(sample.df %>% dplyr::select(-count), counts, targets[i],
+            covariates = covariates, nulliter = 500,
+            family = family_nona, target_family = target_types[i],
+            zeroinfl = zeroinfl_model, resampling = resampling_method,
+            fittarget_result = fit_target
+          ),
+          error = function(e) {
+            "failed"
+          }
+        )
+        # notarget_result=fit_notarget_nona, fittarget_result=fit_target,
+        # identical() is needed here: `==` between the returned list and a string would error
+        if (identical(fit_power, "failed")) {
+          next
+        }
+        # print(fit_power$z_star)
+        # print(fit_power$z_null)
+        print(mean(fit_power$z_null))
+        print(sd(fit_power$z_null))
+        wilks_pv <- pchisq(-2 * (fit_power$null_loglike - fit_power$loglike), df = 1, lower.tail = FALSE)
+        pvalue_list <- c(pvalue_list, wilks_pv)
+        rpv_list <- c(rpv_list, fit_power$pv_score)
+        print(c("GLMft: ", wilks_pv))
+        print(c("Pass: ",
fit_power$pv_score)) + } + + for (percentile in power.percentiles) { + results.df[paste0("power", percentile)] <- quantile(na.omit(pvalue_list), percentile / 100)[[1]] + } + for (percentile in power.percentiles) { + results.df[paste0("resample_power", percentile)] <- quantile(na.omit(rpv_list), percentile / 100)[[1]] + } + results.df[paste0("powerMean")] <- mean(na.omit(pvalue_list)) + results.df[paste0("powerVar")] <- sd(na.omit(pvalue_list))^2 + results.df[paste0("powerN")] <- length(na.omit(pvalue_list)) + } else { + for (percentile in power.percentiles) { + results.df[paste0("power", percentile)] <- -99 + } + for (percentile in power.percentiles) { + results.df[paste0("resample_power", percentile)] <- -99 + } + results.df[paste0("powerMean")] <- -99 + results.df[paste0("powerVar")] <- -99 + results.df[paste0("powerN")] <- -99 + } + + ### Return results + stacked_results.df <- tryCatch(bind_rows(stacked_results.df, results.df), + error = function(e) { + results.df + } + ) + } + + return(stacked_results.df) +} + +drawFromModel <- function(mu, muzero, theta = NULL, zeroinfl = F, family = "X") { + # Draw from count process + if (family == "negbin") { + counts <- rnbinom(dim(mu)[1], mu = mu, size = theta) + } else if (family == "poisson") { + counts <- rpois(dim(mu)[1], lambda = mu) + } + + # Draw zero inflation from binomial distribution + if (zeroinfl) { + nonzero <- rbinom(dim(mu)[1], size = 1, prob = muzero) + } else { + nonzero <- 1 + } + + return(nonzero * counts) +} + +if (sys.nframe() == 0) { + args <- commandArgs(trailingOnly = TRUE) + output_dir <- args[1] + base_dir <- args[2] + test_file <- args[3] + covariate_file <- args[4] + signature_file <- args[5] + target_file <- args[6] + resampling_method <- args[7] # Must be dCRT or bootstrap + power.analysis <- args[8] # true or not + zeroinfl <- args[9] == "true" + print("Zeroinfl? 
") + print(zeroinfl) + + print("Power analysis?") + power.analysis <- power.analysis == "true" + print(power.analysis) + + source(paste(base_dir, "regression_functions.r", sep = "/")) + + # covariates = c("log_age", "logit_purity", "is_female", "pc1", "pc2", "pc3") + + # Set of tests to run + test_df <- read_delim(test_file, delim = "\t") + print(test_df[1:10, ]) + signatures <- unique(test_df$signature) + print(test_df) + print(signatures) + + # Load in samples + sample_df <- read_delim(covariate_file, delim = "\t") + covariates <- colnames(sample_df)[!colnames(sample_df) %in% c("sample_id", "group")] + # covariates <- c("log_age", "is_female", "pc1", "pc2", "pc3") + + # Load in signatures + print(c("sample_id", signatures)) + signature_df <- read_delim(signature_file, delim = "\t") %>% dplyr::select(c("sample_id", signatures)) + signature_df[is.na(signature_df)] <- 0 + + # Load in target variables + # Trait file should be csv with "germline_platekey" and a column for each trait + target_df <- read_delim(target_file, delim = "\t") + + # Join tables together + # sample_df <- inner_join(sample_df, signature_df, by=c("sample_id"), suffix=c("", "_sig")) + sample_df <- inner_join(sample_df, target_df, by = c("sample_id"), suffix = c("", "_tgt")) %>% mutate_if(is.numeric, ~ replace_na(., NA) %>% replace(., is.infinite(.), NA)) + + + # Iterate through all signatures + for (sig in signatures) { + # Iterate through tumour groups for the given signature + groups <- unique(test_df[test_df$signature == sig, ]$group) + for (grp in groups) { + # Get all variable targets for the given group and signature + targets <- test_df[test_df$signature == sig & test_df$group == grp, ]$target + target_types <- test_df[test_df$signature == sig & test_df$group == grp, ]$type + + print(sig) + print(grp) + print(targets) + subsample_df <- sample_df %>% filter(group == grp) + + # Filter out instances where only one target value + nonsingular <- c() + for (target in targets) { + nonsingular <- c(nonsingular, nrow(unique(subsample_df[, target])) > 1) + } + targets <- targets[nonsingular] + target_types <- target_types[nonsingular] + print(targets) + if (length(targets) == 0) { + stop("No non-unique targets") + } + + # Get all cols with are either target or target_i for multicomponent target resampling + target_cols <- c() + for (target in targets) { + target_cols <- c( + target_cols, + colnames(subsample_df)[str_detect(colnames(subsample_df), regex(paste0(target, "[_0-9]*")))] + ) + } + + subsample_df <- inner_join(subsample_df, + signature_df %>% dplyr::select(sample_id, sig) %>% rename(count = sig), + by = c("sample_id") + ) %>% dplyr::select(count, target_cols, covariates) + # subsample_df <- sample_df %>% filter(group==grp) %>% dplyr::select(sig, targets, covariates) %>% rename(count=sig) + + results_df <- model_fit(data.frame(subsample_df), covariates, gsub("-", ".", targets), target_types, resampling_method, power.analysis = power.analysis, zeroinfl = zeroinfl) + results_df["signature"] <- sig + results_df["group"] <- grp + + full_results_df <- tryCatch(bind_rows(full_results_df, results_df), + error = function(e) { + results_df + } + ) + } + } + + # Statistical tests + full_results_df["wilks_pvalue"] <- pchisq(-2 * (full_results_df$loglike_null - full_results_df$loglike_alt), df = 1, lower.tail = FALSE) + + # print(full_results_df) + + # Save results + write_delim(full_results_df %>% dplyr::select(group, signature, target, wilks_pvalue, everything()), paste0(output_dir, "glm_results.csv"), delim = ",") +} diff 
--git a/src/pipelines/signatures/signatureAssociations.nf/scripts/zinb_regress_test.r b/src/pipelines/signatures/signatureAssociations.nf/scripts/zinb_regress_test.r new file mode 100755 index 0000000..c1e449b --- /dev/null +++ b/src/pipelines/signatures/signatureAssociations.nf/scripts/zinb_regress_test.r @@ -0,0 +1,665 @@ +suppressMessages(library(tidyverse, rhdf5)) + +library(tidyverse, MASS) +# , stats) + +#' Fit negative binomial with MASS::glm.nb +#' +#' Fit negative binomial with MASS::glm.nb +#' @param data.df = tibble. DataFrame including count column and column for each covarites. +#' @param covariates = vector of strings. Columns in data.df to be used as covariates. +#' @param start = vector (of length covariates) or NULL. Starting values for covariate betas. +#' @param maxit = int. Maximum number of iterations when fitting glm. +#' @param offset = NULL,vector of int. Sample offsets to be used (for fixing parameter values - offset=exp(X beta)). +#' @return list. Output from glm. Includes betas, residuals, beta covariances, NB size parameter (theta) and offsets +#' @examples +#' data.df <- tibble(intercept = rep(c(1), 10), x1 = c(1:10), count = c(21:30)) +#' covariates <- c("intercept", "x1") +#' results <- negBinGLM(data.df, covariates) +negBinGLM <- function(data.df, covariates, start = NULL, maxit = 100, offset = NULL, sample_weights = NULL) { + # If starting parameters provided + if (!is.null(start)) { + start.theta <- start$theta + start.count <- start$count + } else { + start.theta <- 1 + start.count <- NULL + } + + # If there is an offset column, add to formula + if (!is.null(offset)) { + data.df <- data.df %>% + mutate(offset = offset) %>% + dplyr::select(count, covariates$count, offset) + formula <- "count ~ 0 + offset(offset) + " + } else { + formula <- "count ~ 0 + " + } + + formula <- as.formula(paste0(formula, paste(covariates$count, collapse = " + "))) + # print("NB") + # print(formula) + # print(start.count) + # print(start.theta) + # print(maxit) + # print(sample_weights[50:60]) + # print(data.df[50:60,]) + + fit_nb <- MASS::glm.nb(formula, + data = data.df, + start = start.count, init.theta = start.theta, + maxit = maxit, weights = sample_weights + ) + # print(fit_nb) + + return(list( + betas = fit_nb$coefficients, + residual = data.df$count - as.numeric(fit_nb$fitted.values), + covar = vcov(fit_nb), + theta = fit_nb$theta, + offset = fit_nb$fitted.values, + loglike = fit_nb$twologlik / 2, + model = "negbin" + )) +} + +#' Fit GLM with stats::glm +#' +#' Fit GLM with stats::glm +#' @param data.df = tibble. DataFrame including count column and column for each covarites. +#' @param covariates = vector of strings. Columns in data.df to be used as covariates. +#' @param family='negbin' = str. GLM family to be used. +#' @param start = vector (of length covariates) or NULL. Starting values for covariate betas. +#' @param maxit = int. Maximum number of iterations when fitting glm. +#' @param offset = NULL,vector of int. Sample offsets to be used (for fixing parameter values - offset=exp(X beta)). +#' @return list. Output from glm. 
Includes betas, residuals, beta covariances, NB size parameter (theta) and offsets +#' @examples +#' data.df <- tibble(intercept = rep(c(1), 10), x1 = c(1:10), count = c(21:30)) +#' covariates <- c("intercept", "x1") +#' results <- negBinGLM(data.df, covariates) +genLinMod <- function(data.df, covariates, family = "X", start = NULL, maxit = 100, offset = NULL, sample_weights = NULL) { + # If there is an offset column, add to formula + if (!is.null(offset)) { + data.df <- data.df %>% mutate(offset = offset) + formula <- "count ~ 0 + offset(offset)" + } else { + formula <- "count ~ 0" + } + + formula <- as.formula(paste(c(formula, covariates$count), collapse = " + ")) + + fit_glm <- stats::glm(formula, + data = data.df, family = family, + start = start$count, + maxit = maxit, weights = sample_weights + ) + + residual <- data.df$count - as.numeric(fit_glm$fitted.values) + # Estimate standard error from Gaussian residuals + # Only meaningful in Gaussian models? + sigma <- sqrt(sum(residual^2) / (length(residual) - length(covariates$count))) + + return(list( + betas = fit_glm$coefficients, + residual = residual, + sigma = sigma, + covar = vcov(fit_glm), + offset = fit_glm$fitted.values, + loglike = logLik(fit_glm), + model = family + )) +} + +#' Fit zero inflated GLM with pscl::zeroinfl +#' +#' Fit zero inflated GLM with pscl::zeroinfl +#' @param data.df = tibble. DataFrame including count column and column for each covarites. +#' @param covariates = list of vectors with count and zero entries. Columns in data.df to be used as covariates. +#' @param family='negbin' = str. GLM family to be used. +#' @param start = list or NULL. Starting values for parameters with count, zero and theta entries. +#' @param maxit = int. Maximum number of iterations when fitting glm. +#' @param offset = NULL or list of count:vector, zero:vector. +#' Sample offsets to be used (for fixing parameter values - offset=exp(X beta)). +#' @return list. Output from glm. 
Includes betas, residuals, beta covariances, NB size parameter (theta) and offsets +#' @examples +#' data.df <- tibble(intercept = rep(c(1), 10), x1 = c(1:10), count = c(21:30)) +#' covariates <- c("intercept", "x1") +#' results <- negBinGLM(data.df, covariates, head(covariates, 1)) +zeroInflGLM <- function(data.df, covariates, family = "X", + start = NULL, maxit = 100, offset = NULL, sample_weights = NULL) { + if (length(covariates$zero) == 0) { + if (family == "negbin") { + return(negBinGLM(data.df, covariates, + start = start, + maxit = maxit, offset = offset$count, sample_weights = sample_weights + )) + } else { + return(genLinMod(data.df, covariates, + start = start, family = family, + maxit = maxit, offset = offset$count, sample_weights = sample_weights + )) + } + } + + # If there is an offset column, add to formula + if (!is.null(offset)) { + data.df <- data.df %>% mutate(offset_count = offset$count, offset_zero = offset$zero) + formula_count <- "count ~ 0 + offset(offset_count)" + formula_zero <- "0 + offset(offset_zero)" + } else { + formula_count <- "count ~ 0" + formula_zero <- "0" + } + + formula <- as.formula(paste0( + paste(c(formula_count, covariates$count), collapse = " + "), " | ", + paste(c(formula_zero, covariates$zero), collapse = " + ") + )) + print(formula) + + fit_nb <- pscl::zeroinfl(formula, + data = data.df, + dist = family, link = "logit", start = start, + maxit = maxit, reltol = 1e-12, weights = sample_weights + ) + + return(list( + betas = fit_nb$coefficients$count, + betas_zero = fit_nb$coefficients$zero, + residual = data.df$count - as.numeric(fit_nb$fitted.values), + covar = vcov(fit_nb), + theta = fit_nb$theta, + offset = fit_nb$fitted.values, + loglike = fit_nb$loglik, + model = family + )) +} + +#' Fit GLM with stats::glm +#' +#' Fit GLM with stats::glm +#' @param data.df = tibble. DataFrame including count column and column for each covarites. +#' @param covariates = vector of strings. Columns in data.df to be used as covariates. +#' @param family='poisson' = str. GLM family to be used. +#' @param zeroinfl=Bool. Use zero inflated GLM. +#' @param start = vector (of length covariates) or NULL. Starting values for covariate betas. +#' @param maxit = int. Maximum number of iterations when fitting glm. +#' @param offset = NULL,vector of int. Sample offsets to be used (for fixing parameter values - offset=exp(X beta)). +#' @return list. Output from glm. Includes betas, offsets and beta covariances. +#' @examples +#' data.df <- tibble(intercept = rep(c(1), 10), x1 = c(1:10), count = c(21:30)) +#' covariates <- c("intercept", "x1") +#' results <- glmFit(data.df, covariates, family = "poisson") +glmFit <- function(data.df, covariates, family = "X", zeroinfl = F, + start = NULL, maxit = 100, offset = NULL, sample_weights = NULL) { + if (family == "negbin" & !zeroinfl) { + results <- negBinGLM(data.df, covariates, start = start, offset = offset, sample_weights = sample_weights) + } else if (!zeroinfl) { + results <- genLinMod(data.df, covariates, family = family, start = start, offset = offset, sample_weights = sample_weights) + } else { + results <- zeroInflGLM(data.df, covariates, family = family, start = start, offset = offset, sample_weights = sample_weights) + } + + return(results) +} + +#' Fit GLM +#' +#' Fit GLM to counts and covariates +#' @param X = tibble. DataFrame of covariates. +#' @param y = vector of int >= 0. Target variable (count data) +#' @param covariates = list of vector of strings with entries count, zero (if zeroinfl model). 
+#' Columns in data.df to be used as covariates.
+#' @param family='X' = str. GLM family to be used.
+#' @param zeroinfl=Bool. Use zero inflated GLM.
+#' @param start = list of vectors (count=, zero=) or NULL. Starting values for covariate betas.
+#' @param maxit = int. Maximum number of iterations when fitting glm.
+#' @param offset = NULL or numeric vector. Sample offsets to be used (for fixing parameter values - offset=exp(X beta)).
+#' @return list. Output from glm. Includes betas, offsets, beta covariances and other parameters.
+#' @examples
+#' X <- tibble(intercept = rep(c(1), 10), x1 = c(1:10))
+#' y <- c(21:30)
+#' results <- runGlmFit(X, y)
+runGlmFit <- function(X, y, covariates = NULL, family = "X", zeroinfl = F, start = NULL,
+                      maxit = 100, offset = NULL, sample_weights = NULL) {
+  # Default: use every column of X for both the count and the zero model
+  if (is.null(covariates)) {
+    covariates <- list(count = colnames(X), zero = colnames(X))
+  }
+  data.df <- X %>% mutate(count = y)
+
+  results <- glmFit(data.df, covariates,
+    family = family, zeroinfl = zeroinfl,
+    start = start, maxit = maxit, offset = offset, sample_weights = sample_weights
+  )
+
+  return(results)
+}
+
+#' Fit GLM with variable resampling
+#'
+#' Fit GLM to counts and covariates with resampling of the target variable.
+#' Resampling is performed with replacement.
+#' @param X = tibble. DataFrame of covariates.
+#' @param y = vector of int >= 0. Target variable (count data)
+#' @param target = str. Column name of variable which is being analysed.
+#' @param covariates = NULL or list of vectors (count, zero, target entries). Columns of X used in each model component.
+#' @param nulliter=100 = int. Number of resamples to perform for null distribution.
+#' @param family='negbin' = str. GLM family to be used.
+#' @param target_family = str. GLM family used when resampling the target.
+#' @param zeroinfl=Bool. Use zero inflated GLM.
+#' @param maxit = int. Maximum number of iterations when fitting glm.
+#' @param use_offsets = Bool. Whether to fix non-target parameters when resampling --> faster but less accurate?
+#' @param notarget_result=NULL. Optional pre-computed fit without the target variable.
+#' @param fittarget_result=NULL. Optional pre-computed fit of the target on its covariates.
+#' @param resampling='dCRT', str. 'dCRT' (distilled CRT), 'bootstrap' or 'permutation'
+#'        --> dCRT defaults to permutation if there are no target covariates.
+#' @return list. Output from glm. Includes betas, offsets, beta covariances and other parameters.
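+#' @details With resampling='dCRT' the target column is redrawn from the GLM of
+#'   target ~ covariates$target (distilled CRT); when covariates$target is NULL
+#'   the redraw reduces to a permutation of the observed target values.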
+#' @examples +#' X <- tibble(intercept = rep(c(1), 10), x1 = c(1:10)) +#' y <- c(21:30) +#' results <- resample_glmFit(X, y, "x1", nulliter = 100, family = "negbin") +resampleGLM <- function(X, y, target, covariates = NULL, nulliter = 100, + family = "X", target_family = "X", + zeroinfl = F, maxit = 100, use_offsets = F, sample_weights = NULL, + notarget_result = NULL, + fittarget_result = NULL, + resampling = "dCRT") { + # Covariates excluding the target parameter + if (is.null(covariates)) { + covariates <- list(count = c(colnames(X)[colnames(X) != target])) + if (zeroinfl) { + covariates$zero <- covariates$count + } + } + data.df <- X %>% mutate(count = y) + + # Fit resampling model + if ((is.null(fittarget_result)) & (!is.null(covariates$target))) { + fittarget_result <- glmFit(X %>% rename(count = target), list(count = covariates$target), + family = target_family, zeroinfl = F, maxit = 100 + ) + } + # print(c("Fit to target: ", fittarget_result$betas)) + + # Fit model without target variable + if (is.null(notarget_result)) { + notarget_result <- glmFit(data.df, covariates, family = family, zeroinfl = zeroinfl, maxit = maxit) + } + # print(c("No target: ", notarget_result$betas)) + + # Get fixed starting parameters from model fit without target + if (zeroinfl) { + start <- list(count = c(notarget_result$betas, rnorm(1, 0, 0)), zero = notarget_result$betas_zero) + offset <- list( + count = as.numeric(as.vector(notarget_result$betas) %*% t(data.matrix(data.df[covariates$count]))), + zero = as.numeric(as.vector(notarget_result$betas_zero) %*% t(data.matrix(data.df[covariates$zero]))) + ) + sample_weights <- 1 - expit(offset$zero) + } else { + start <- list(count = c(notarget_result$betas, rnorm(1, 0, 0))) + offset <- list(count = as.numeric(as.vector(notarget_result$betas) %*% t(data.matrix(data.df[covariates$count])))) # log(notarget_result$offset) + sample_weights <- rep(1, length(offset$count)) + } + if (family == "negbin") { + start$theta <- notarget_result$theta + } + + # Fit model with target variable and hot start + covariates$count <- c(covariates$count, target) + alt_result <- glmFit(data.df, covariates, family = family, zeroinfl = zeroinfl, maxit = maxit, start = start) + # print(c("Alt result: ", alt_result$betas)) + print("Star") + z_star <- scoreTestZ(data.df[target], data.df$count, exp(offset$count), + notarget_result$theta, + sample_weights = sample_weights, family = family + ) + + # Generate resample-with-replacement of target parameter + target_resampled <- paste0(target, "_resampled") + covariates$count <- c(covariates$count[covariates$count != target], target_resampled) + # Run resample and fit lots of times + z_null <- c() + # print(head(data.df)) + nulliter <- 2 + print("NT") + print(c(notarget_result$loglike, notarget_result$betas)) + for (i in c(1:nulliter)) { + if (resampling == "bootstrap") { + # Bootstrap + resample_idx <- sample(c(1:nrow(data.df)), size = nrow(data.df), replace = T) + # Get null z value from score test method + z_null <- c(z_null, scoreTestZ(data.df[resample_idx, ][target], + data.df[resample_idx, ]$count, + exp(offset$count)[resample_idx], + notarget_result$theta, + sample_weights = sample_weights[resample_idx], family = family + )) + next + } else if (resampling == "dCRT" & is.null(covariates$target)) { + # Permutation + data.df[target_resampled] <- sample(X[, target], size = length(y), replace = FALSE) + } else if (resampling == "dCRT") { + # Distilled CRT + data.df[target_resampled] <- drawFromGLM(X, covariates$target, fittarget_result, 
family = target_family) + } else { + stop(paste0("resampling method, ", resampling, ", unknown")) + } + + covariates$count <- c(head(covariates$count, -1), target_resampled) + # print(notarget_result$offset[1:50]) + # print(offset$count[1:50]) + null_result <- glmFit(data.df, list(count = c(target_resampled)), + family = family, zeroinfl = zeroinfl, + maxit = maxit, + start = list(count = NULL, theta = start$theta), + offset = offset, sample_weights = sample_weights + ) + print("Null") + print(c(null_result$loglike, null_result$betas)) + tgt_result <- glmFit(data.df, list(count = c(target)), + family = family, zeroinfl = zeroinfl, + maxit = maxit, + start = list(count = NULL, theta = start$theta), + offset = offset, sample_weights = sample_weights + ) + print("TGT") + print(c(tgt_result$loglike, tgt_result$betas)) + + # Get null z value from score test method + print("Null") + z_null <- c(z_null, scoreTestZ(data.df[target_resampled], data.df$count, exp(offset$count), + notarget_result$theta, + sample_weights = sample_weights, family = family + )) + } + stop("Interrupted ") + + # Get p-value target parameter + if (resampling == "bootstrap") { + z_null <- z_null - mean(z_null) + } + pv_score <- nullSamplePV(z_null, z_star) + + return(append( + alt_result, + list( + z_star = z_star, z_null = z_null, pv_score = pv_score, + null_loglike = notarget_result$loglike + ) + )) +} + +#' Draw sample from GLM +#' +#' Draw sample from GLM with given parameters +#' @param X=data.frame. Dataframe of covariates for model. +#' @param covariates=vector of str. Column headers of covariates used for model. +#' @param parameters=list(betas=vector, sigma?). Parameters of GLM fit. +#' @param family = str. GLM distribution family used. Currently binomial and gaussian implemented. +#' @return vector of same length as X. Samples from the model. +drawFromGLM <- function(X, covariates, parameters, family = "X") { + K <- as.matrix(X[covariates]) %*% as.vector(parameters$betas) + if (family == "binomial") { + mu <- expit(K) + sample <- rbinom(dim(X)[1], 1, mu) + } else if (family == "gaussian") { + sample <- rnorm(dim(X)[1], K, parameters$sigma) + } else { + print(paste0("No family ", family, " for scoreTestZ")) + } + + return(sample) +} + +#' Estimate p-value from null samples and fitted coefficient. +#' +#' Estimate p-value from null samples and fitted coefficient. +#' @param null_samples=vector. Vector of z-scores of null fits. +#' @param fitted_value=float. Value of z-score for true run. +#' @param model = str. Null model to be used. ('chisquare' (D), 'skewt'). Only chisquare currently implemented +#' @return float. P-value of fitted value against null samples under null model. +#' @examples +#' null_samples <- rnorm(100) +#' fitted_value <- 5 +#' pvalue <- nullSamplePV(null_samples, fitted_value, model = "chisquare") +nullSamplePV <- function(null_samples, fitted_value, model = "chisquare") { + if (model == "chisquare") { + normal_var <- sd(null_samples)^2 + normal_mean <- mean(null_samples) + pv <- pchisq((fitted_value - normal_mean)^2 / normal_var, df = 1, lower.tail = F) + } else { + stop("Unknown model for null sample passed to nullSamplePV") + } + + return(pv) +} + +#' Calculate expit (aka sigmoid) of x +#' +#' Calculate expit (aka sigmoid) of x +#' @param x = numeric type. Value or array of values. +#' @return p. expit(x). 
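+#' @details In resampleGLM above, 1 - expit(offset_zero) is used as a sample
+#'   weight: it is the model probability that an observation comes from the
+#'   count component rather than the structural-zero component.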
+
+#' Calculate expit (aka sigmoid) of x
+#'
+#' Calculate expit (aka sigmoid) of x
+#' @param x = numeric type. Value or array of values.
+#' @return p. expit(x).
+#' @examples
+#' x <- c(-10, -0.1, 0., 0.1, 10)
+#' p <- expit(x)
+expit <- function(x) {
+  p <- 1 / (1 + exp(-x))
+  return(p)
+}
+
+#' Estimate z-score used in score test
+#'
+#' Estimate the z-score used in the score test for negative binomial or poisson distributions
+#' @param X: covariate (target) values for each sample
+#' @param count: observed counts
+#' @param offset: predicted means from the fit to the other covariates
+#' @param theta: theta parameter from the negative binomial fit
+#' @param sample_weights: weights for samples in the likelihood function
+#' @param family="X": GLM distribution family ('negbin' or 'poisson')
+#' @return z-score estimate
+#' @examples
+#' x <- rnorm(100)
+#' y <- rpois(100, 2)
+#' z <- scoreTestZ(x, y, rep(2, 100), family = "poisson")
+scoreTestZ <- function(X, count, offset, theta = 1, sample_weights = 1, family = "X") {
+  if (family == "negbin") {
+    dlogL <- sum(sample_weights * X * (count - offset * (count + theta) / (offset + theta)))
+    fisherI <- sum(sample_weights * X^2 * offset / (offset + theta))
+  } else if (family == "poisson") {
+    dlogL <- sum(sample_weights * X * (count - offset))
+    fisherI <- sum(sample_weights * X^2 * offset)
+  } else {
+    stop(paste0("No family ", family, " for scoreTestZ"))
+  }
+
+  return(dlogL / sqrt(fisherI))
+}
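+
+# The statistic is z = U / sqrt(I). For poisson, U = sum(w * x * (y - mu)) and
+# I = sum(w * x^2 * mu), with mu the predicted mean (callers pass exp(offset));
+# the negbin branch applies an additional 1 / (mu + theta) damping to both
+# terms. Since z_star and z_null are computed with the same expressions, the
+# empirical-null comparison in nullSamplePV() remains internally consistent.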
+
+
+model_fit <- function(sample.df, covariates, targets, target_types, resampling_method) {
+  # Set up model and backup for when the model fit fails
+  family <- "negbin"
+  backup <- "poisson"
+  maxit <- 100 # iteration cap passed to glmFit
+  zeroinfl_model <- sum(sample.df$count == 0) > 0
+  target_families <- list(linear = "gaussian", logistic = "binomial")
+
+  # Renormalise covariates and continuous targets
+  means <- c()
+  stds <- c()
+  for (key in c(covariates, targets[target_types == "linear"])) {
+    means <- c(means, mean(sample.df[, key]))
+    stds <- c(stds, sd(sample.df[, key]))
+    sample.df[key] <- (sample.df[, key] - mean(sample.df[, key])) / sd(sample.df[, key])
+  }
+  names(means) <- c(covariates, targets[target_types == "linear"])
+  names(stds) <- names(means)
+
+  # Select covariates with non-zero variance
+  sample.df$intercept <- 1.0
+  nonconstant <- covariates[stds[seq_along(covariates)] > 0]
+  covariates <- list(
+    count = c("intercept", nonconstant),
+    zero = c("intercept", nonconstant),
+    target = c("intercept", nonconstant)
+  )
+
+  # Run GLM regression without the target variable, falling back to the
+  # backup family if the fit fails
+  print("No target")
+  fit_notarget <- tryCatch(glmFit(sample.df, covariates, family = family, zeroinfl = zeroinfl_model, maxit = maxit),
+    error = function(e) {
+      glmFit(sample.df, covariates, family = backup, zeroinfl = zeroinfl_model, maxit = maxit)
+    }
+  )
+  family <- fit_notarget$model
+
+  # For each target, run GLM regression with the target variable and null resampling
+  for (i in c(1:length(targets))) {
+    print(c("target", targets[i]))
+    print(c("Resampling: ", resampling_method))
+    fit_alt <- resampleGLM(sample.df %>% dplyr::select(-count), sample.df$count, targets[i],
+      covariates = covariates, nulliter = 10,
+      family = family, target_family = target_families[target_types[i]][[1]],
+      zeroinfl = zeroinfl_model, notarget_result = fit_notarget, resampling = resampling_method
+    )
+    print(
+      list(
+        target = targets[i], resample_pvalue = fit_alt$pv_score,
+        loglike_null = fit_notarget$loglike, loglike_alt = fit_alt$loglike,
+        model_null = fit_notarget$model, model_alt = fit_alt$model,
+        sample_size = nrow(sample.df), nz_size = sum(sample.df$count > 0),
+        z_star = fit_alt$z_star, z_null = fit_alt$z_null
+      )
+    )
+    results.df <- data.frame(
+      target = targets[i], resample_pvalue = fit_alt$pv_score,
+      loglike_null = fit_notarget$loglike, loglike_alt = fit_alt$loglike,
+      model_null = fit_notarget$model, model_alt = fit_alt$model,
+      sample_size = nrow(sample.df), nz_size = sum(sample.df$count > 0)
+    )
+    results.df$theta_null <- fit_notarget$theta
+    results.df$theta_alt <- fit_alt$theta
+
+    # Count-model parameters
+    param_names <- c(covariates$count, "target")
+    for (j in c(1:length(param_names))) {
+      results.df[paste(param_names[j], "means", "alt", sep = "_")] <- fit_alt$betas[j]
+      results.df[paste(param_names[j], "covs", "alt", sep = "_")] <- diag(fit_alt$covar)[j]
+    }
+    for (j in c(1:length(covariates$count))) {
+      results.df[paste(covariates$count[j], "means", "null", sep = "_")] <- fit_notarget$betas[j]
+      results.df[paste(covariates$count[j], "covs", "null", sep = "_")] <- diag(fit_notarget$covar)[j]
+
+      # Normalisation constants, looked up by name (NA for the intercept,
+      # which is not normalised)
+      results.df[paste("Xmean", covariates$count[j], sep = "_")] <- means[covariates$count[j]]
+      results.df[paste("Xsd", covariates$count[j], sep = "_")] <- stds[covariates$count[j]]
+    }
+    for (j in c(1:length(covariates$zero))) {
+      results.df[paste(covariates$zero[j], "zero", "means", "alt", sep = "_")] <- fit_alt$betas_zero[j]
+      results.df[paste(covariates$zero[j], "zero", "means", "null", sep = "_")] <- fit_notarget$betas_zero[j]
+    }
+
+    # bind_rows fails on the first iteration, when stacked_results.df does
+    # not yet exist; the tryCatch initialises it
+    stacked_results.df <- tryCatch(bind_rows(stacked_results.df, results.df),
+      error = function(e) {
+        results.df
+      }
+    )
+  }
+
+  return(stacked_results.df)
+}
+
+
+args <- commandArgs(trailingOnly = TRUE)
+output_dir <- args[1]
+base_dir <- args[2]
+test_file <- args[3]
+covariate_file <- args[4]
+signature_file <- args[5]
+target_file <- args[6]
+resampling_method <- args[7] # Must be dCRT or bootstrap
+
+# source(paste(base_dir, "regression_functions.r", sep="/"))
+
+# Set of tests to run
+test_df <- read_delim(test_file, delim = "\t")
+print(test_df[1:10, ])
+signatures <- unique(test_df$signature)
+
+# Load in samples
+sample_df <- read_delim(covariate_file, delim = "\t")
+covariates <- colnames(sample_df)[!colnames(sample_df) %in% c("sample_id", "group")]
+# e.g. covariates <- c("log_age", "is_female", "pc1", "pc2", "pc3")
+
+# Load in signatures
+print(c("sample_id", signatures))
+signature_df <- read_delim(signature_file, delim = "\t") %>% dplyr::select(c("sample_id", signatures))
+signature_df[is.na(signature_df)] <- 0
+
+# Load in target variables
+# The target file should be tab-delimited with a "sample_id" column and a
+# column for each target
+target_df <- read_delim(target_file, delim = "\t")
+
+# Join tables together, converting infinite values to NA
+sample_df <- inner_join(sample_df, target_df, by = c("sample_id"), suffix = c("", "_tgt")) %>%
+  mutate_if(is.numeric, ~ replace(., is.infinite(.), NA))
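+
+# The test grid lists one row per association test; the loop below expects
+# tab-delimited columns "signature", "group", "target" and "type" ("linear"
+# or "logistic"), e.g. (illustrative row only):
+#   signature  group   target  type
+#   SBS1       Breast  BRCA1   logistic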
+
+# Iterate through all signatures
+for (sig in signatures) {
+  # Iterate through tumour groups for the given signature
+  groups <- unique(test_df[test_df$signature == sig, ]$group)
+  for (grp in groups) {
+    # Get all variable targets for the given group and signature
+    targets <- test_df[test_df$signature == sig & test_df$group == grp, ]$target
+    target_types <- test_df[test_df$signature == sig & test_df$group == grp, ]$type
+
+    print(sig)
+    print(grp)
+    # Restrict sex-specific tumour groups to the corresponding sex
+    if (grp %in% c("Breast", "Ovary", "Uterus")) {
+      subsample_df <- sample_df %>% filter(group == grp, is_female == 1)
+    } else if (grp %in% c("Testis", "Prostate")) {
+      subsample_df <- sample_df %>% filter(group == grp, is_female == 0)
+    } else {
+      subsample_df <- sample_df %>% filter(group == grp)
+    }
+
+    # Filter out targets that take only one value in this group
+    nonsingular <- c()
+    for (target in targets) {
+      nonsingular <- c(nonsingular, nrow(unique(subsample_df[, target])) > 1)
+    }
+    targets <- targets[nonsingular]
+    target_types <- target_types[nonsingular]
+    if (length(targets) == 0) {
+      stop("No non-constant targets for this signature and group")
+    }
+
+    subsample_df <- inner_join(subsample_df,
+      signature_df %>% dplyr::select(sample_id, sig) %>% rename(count = sig),
+      by = c("sample_id")
+    ) %>% dplyr::select(count, targets, covariates)
+
+    results_df <- model_fit(data.frame(subsample_df), covariates, gsub("-", ".", targets), target_types, resampling_method)
+    results_df["signature"] <- sig
+    results_df["group"] <- grp
+
+    # bind_rows fails on the first iteration, when full_results_df does not
+    # yet exist; the tryCatch initialises it
+    full_results_df <- tryCatch(bind_rows(full_results_df, results_df),
+      error = function(e) {
+        results_df
+      }
+    )
+  }
+}
+
+# Wilks' likelihood-ratio test against the no-target fit (df = 1)
+full_results_df["wilks_pvalue"] <- pchisq(-2 * (full_results_df$loglike_null - full_results_df$loglike_alt), df = 1, lower.tail = FALSE)
+
+# Save results
+write_delim(full_results_df, paste0(output_dir, "glm_results.csv"), delim = ",")
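+
+# Example invocation (hypothetical script and file names; the positional
+# argument order is fixed by the parsing above):
+#   Rscript glm_resampling.r output/ . tests.tsv covariates.tsv \
+#     signatures.tsv targets.tsv dCRT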