From 5cbb9dcd4c154f313bdc02fa7e4f08d6ca2f34f1 Mon Sep 17 00:00:00 2001 From: Vadym Date: Wed, 25 Oct 2023 09:22:54 +0200 Subject: [PATCH] refactor: Remove archived/outdated workflows and scripts (#1296) --- BALSAMIC/assets/scout_config_template.yaml | 43 --- BALSAMIC/assets/scripts/CoveragePlot.R | 78 ---- BALSAMIC/assets/scripts/CoverageRep.R | 158 -------- BALSAMIC/assets/scripts/MutationalSig.R | 97 ----- BALSAMIC/assets/scripts/VariantReport.R | 350 ------------------ BALSAMIC/assets/vagrant/Vagrantfile | 85 ----- BALSAMIC/assets/vagrant/setup.sh | 40 -- BALSAMIC/assets/vcfanno/__init__.py | 0 BALSAMIC/assets/vcfanno/vcfanno.toml | 11 - .../VariantCalling_paired_sentieon_wes | 40 -- .../archive/VariantCalling_paired_umi | 49 --- .../archive/VariantCalling_single_umi | 39 -- CHANGELOG.rst | 1 + 13 files changed, 1 insertion(+), 990 deletions(-) delete mode 100644 BALSAMIC/assets/scout_config_template.yaml delete mode 100755 BALSAMIC/assets/scripts/CoveragePlot.R delete mode 100755 BALSAMIC/assets/scripts/CoverageRep.R delete mode 100755 BALSAMIC/assets/scripts/MutationalSig.R delete mode 100755 BALSAMIC/assets/scripts/VariantReport.R delete mode 100644 BALSAMIC/assets/vagrant/Vagrantfile delete mode 100644 BALSAMIC/assets/vagrant/setup.sh delete mode 100644 BALSAMIC/assets/vcfanno/__init__.py delete mode 100644 BALSAMIC/assets/vcfanno/vcfanno.toml delete mode 100644 BALSAMIC/workflows/archive/VariantCalling_paired_sentieon_wes delete mode 100644 BALSAMIC/workflows/archive/VariantCalling_paired_umi delete mode 100644 BALSAMIC/workflows/archive/VariantCalling_single_umi diff --git a/BALSAMIC/assets/scout_config_template.yaml b/BALSAMIC/assets/scout_config_template.yaml deleted file mode 100644 index f8f143e95..000000000 --- a/BALSAMIC/assets/scout_config_template.yaml +++ /dev/null @@ -1,43 +0,0 @@ ---- - -owner: cust000 - -family: 'lovingtiger' -family_name: 'lovingtiger' -samples: - - analysis_type: panel - sample_id: tumor - tumor_type: unknown - capture_kit: capture_kit_name - sample_name: tumor - phenotype: affected - sex: unknown - expected_coverage: unknown - tmb: unknown - msi: unknown - tumor_purity: unknown - bam_path: path_tumor_merged.bam - - - analysis_type: panel - sample_id: normal - capture_kit: capture_kit_name - sample_name: normal - phenotype: unaffected - sex: unknown - expected_coverage: unknown - bam_path: path_tumor_merged.bam - -vcf_cancer: path_to_final_vcf -vcf_cancer_sv: path_to_final_vcf - -multiqc: path_multiqc_report - -default_gene_panels: [panel1] -gene_panels: [panel1] - -# meta data -rank_model_version: '1.1' -rank_score_threshold: -100 -analysis_date: 'N/A' -human_genome_build: 37 -track: cancer diff --git a/BALSAMIC/assets/scripts/CoveragePlot.R b/BALSAMIC/assets/scripts/CoveragePlot.R deleted file mode 100755 index 94e97ddd0..000000000 --- a/BALSAMIC/assets/scripts/CoveragePlot.R +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env Rscript -# Copyright 2018, Hassan Foroughi Asl -# -# This file is free software: you may copy, redistribute and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 2 of the License, or (at your -# option) any later version. -# -# This file is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("data.table")) - -option_list <- list( - make_option(c("-i", "--infile"), type="character", - help="Input coverage analysis of Sambamba for exons.", metavar="character"), - make_option(c("-o", "--outfile"), type="character", - help="In case of PDF, output file name [default infile.Coverage.pdf].", metavar="character"), - make_option(c("--avgcov"), type="integer", - help="Average coverage of sample. If it's not provided, an average will calculated from input file"), - make_option(c("--covline"), type="integer", - help="Plot coverage and normalized coverage plot for bed regions in the input file [default %default]", metavar="character", default=100), - make_option(c("--title"), type="character", help="plot title.", metavar="character", default= "Sample"), - make_option(c("-f", "--fontsize"), type="integer", default=12, - help="Fontsize as an input to pointsize in pdf() for heigh and width [default %default]"), - make_option(c("-r", "--resolution"), type="integer", default=7, - help="Print image resolution in inches, as an input to pdf() for heigh and width [default %default]"), - make_option(c("-v", "--verbose"), action="store_true", default=FALSE, - help="Print some extra output [default %default]") - ) - -' - %prog [options] - Coverage plot for sambamba depth output file. -' -> usage -opt_parser <- OptionParser(usage = usage, option_list=option_list); -arg <- parse_args(opt_parser) -file <- arg$infile -outfile <- arg$outfile - -if (is.null(file)){ - print_help(opt_parser) - stop("An input is required.", call.=FALSE) -} - -if ( arg$verbose ) { - options( warn = 0 ) -} else { - options( warn = -1 ) -} - -if ( arg$verbose ) { - write("Read coverage file.", stderr()) -} - -sample.coverage = fread(file, showProgress=F) -covSample = sample.coverage - -if (is.null(arg$avgcov) ) { - arg$avgcov = mean(covSample$meanCoverage) -} - -pdf(arg$outfile, width = arg$resolution, height = arg$resolution, pointsize = arg$fontsize) - -par(mfrow = c(1,2), pty = "s") - -hist(covSample$meanCoverage, breaks = 100, xlab = "Coverage", main = arg$title) -abline(v=arg$covline,col="red") - -hist(covSample$meanCoverage/arg$avgcov, breaks = 100, xlab = "Normalized Coverage", main = arg$title) -abline(v=arg$covline/arg$avgcov,col="red") - -garbage <- dev.off() diff --git a/BALSAMIC/assets/scripts/CoverageRep.R b/BALSAMIC/assets/scripts/CoverageRep.R deleted file mode 100755 index 5ee951968..000000000 --- a/BALSAMIC/assets/scripts/CoverageRep.R +++ /dev/null @@ -1,158 +0,0 @@ -#!/usr/bin/env Rscript -# Copyright 2018, Hassan Foroughi Asl -# -# This file is free software: you may copy, redistribute and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 2 of the License, or (at your -# option) any later version. -# -# This file is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("data.table")) -suppressPackageStartupMessages(library("stargazer")) -#suppressPackageStartupMessages(library("bit64")) - -option_list <- list( - make_option(c("-i", "--infile"), type="character", - help="Input coverage analysis of Sambamba for exons.", metavar="character"), - make_option(c("--genename"), type="character", - help="List of gene symbols comma separeted.", metavar="character"), - make_option(c("--ensemble"), type="character", - help="List of comma separeted gene ensemble ids, not both.", metavar="character"), - make_option(c("-t", "--type"), type="character", default="text", - help="Output table type format for exon coverage report [default %default].", metavar="character"), - make_option(c("--name"), type="character", help="A name for the output table [default %default].", - default="Coverage report"), - make_option(c("-o", "--outfile"), type="character", - help="In case of PDF, output file name [default infile.Coverage.pdf].", metavar="character"), - make_option(c("-v", "--verbose"), action="store_true", default=FALSE, - help="Print some extra output [default %default]") - ) - -' - %prog [options] - - A script to report the exon coverage summary output from Sambamba with for canonical transcripts (longest -transcript) of each gene. In coverage, the following is taken into account: 1. transcript must be protein coding. 2. -Transcript should not have more than one exon with zero coverage. 3. A canonical transcript is a transcript that is -longest and meets criteria 1 and 2. - -' -> usage -opt_parser <- OptionParser(usage = usage, option_list=option_list); -arg <- parse_args(opt_parser) -file <- arg$infile -outfile <- arg$outfile - -if (is.null(file)){ - print_help(opt_parser) - stop("An input is required.", call.=FALSE) -} - -if (! is.null(arg$genename) & ! is.null(arg$ensemble) ){ - stop("Provide gene or ensemble id, not both", call.=FALSE) -} - -if ( is.null(arg$genename) & is.null(arg$ensemble) ){ - stop("You provide a list of genes or ensemble IDs.", call.=FALSE) -} - -if ( arg$verbose ) { - options( warn = 0 ) -} else { - options( warn = -1 ) -} - -if ( arg$verbose ) { - write("Read coverage file.", stderr()) -} - -sample.coverage = fread(file, showProgress=F) - -fragLength = 100 -if (! is.null(arg$genename)) { - genelist = unlist(strsplit(arg$genename, ",")) - dt.gene = sample.coverage[F10 %in% genelist,] -} - -if (! is.null(arg$ensemble)) { - genelist = unlist(strsplit(arg$ensemble, ",")) - dt.gene = sample.coverage[F11 %in% genelist,] -} - -dt.gene = dt.gene[, - .(exonCount = .N, - readPerExon = sum(readCount)/.N, - meanExonCoverage = mean(readCount*fragLength/(abs(chromStart-chromEnd))), - medianExonCoverage = median(readCount*fragLength/(abs(chromStart-chromEnd))), - readPerbpPerExon = sum(readCount*fragLength)/(F7/.N), - txID = F3, - geneID = F11, - txLength = F7, - geneName = F10, - txType = F6, - txStatus = F9, - totalRead = sum(readCount), - zeroExonCov = sum(readCount==0), - zeroExonCovMid = !(any(which(!readCount)==length(readCount)) - || any(which(!readCount)==1)), - zeroExonCovLastFirst = any(which(!readCount)==length(readCount)) - || any(which(!readCount)==1) - ), - by=.(F3, F6, F7, F9, F10, F11) - ] - -dt.gene = dt.gene[txType=="protein_coding"] - -dt.gene = dt.gene[, - .("Gene" = geneName, - txID, - "tx_exonCount" = paste0(txID, "_", exonCount), - "tx length" = txLength, - txLength, - maxLength = max(txLength), - "tx type" = txType, - "tx status" = txStatus, - exonCount, - "read per exon" = readPerExon, - readPerbpPerExon, - "Median exon cov" = medianExonCoverage, - meanExonCoverage, - totalRead, - zeroExonCov, - zeroExonCovLastFirst, - zeroExonCovMid, - "Exon zero cov" = paste0(zeroExonCov, - " (", - as.integer(zeroExonCovLastFirst), - " / ", - as.integer(zeroExonCovMid), - ")"), - maxTxReadCount = max(totalRead) - ),keyby=geneID] - -dt.gene = dt.gene[maxLength==txLength, - !c("maxLength", - "zeroExonCovLastFirst", - "zeroExonCovMid", - "txLength", - "geneID", - "txID", - "exonCount", - "totalRead", - "readPerbpPerExon", - "meanExonCoverage", - "zeroExonCov", - "Exon zero cov", - "maxTxReadCount") - ] - -stargazer(dt.gene, summary = FALSE, type = arg$type, title = arg$name, - table.placement = "H", - digit.separator = "", rownames = F, style = "io", float = T, - header = F, out.header = F) diff --git a/BALSAMIC/assets/scripts/MutationalSig.R b/BALSAMIC/assets/scripts/MutationalSig.R deleted file mode 100755 index 3f40da923..000000000 --- a/BALSAMIC/assets/scripts/MutationalSig.R +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env Rscript -# Copyright 2018, Hassan Foroughi Asl -# -# This file is free software: you may copy, redistribute and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 2 of the License, or (at your -# option) any later version. -# -# This file is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("deconstructSigs")) - -option_list <- list( - make_option(c("-i", "--infile"), type="character", - help="Input mutation file.", metavar="character"), - make_option(c("-o", "--outfile"), type="character", - help="output file name.", metavar="character"), - make_option(c("-m", "--model"), type="character", default="nature2013", - help="Choose between two models: nature2013 or cosmic. Refer to deconstructSigs documentatation. [default %default]"), - make_option(c("-r", "--resolution"), type="integer", default=7, - help="Print image resolution in inches, as an input to pdf() for heigh and width [default %default]"), - make_option(c("-f", "--fontsize"), type="integer", default=12, - help="Fontsize as an input to pointsize in pdf() for heigh and width [default %default]"), - make_option(c("-s", "--sampleid"), type="integer", default=1, - help="Numeric id for sample in input file. [default %default]"), - make_option(c("-v", "--verbose"), action="store_true", default=FALSE, - help="Print extra output [default %default]") - ) - -' - %prog [options] - - A wrapper for deconstructSig to plot mutational signatures. - -' -> usage -opt_parser <- OptionParser(usage = usage, option_list=option_list); -arg <- parse_args(opt_parser) -file <- arg$infile -outfile <- arg$outfile - -if (is.null(file) || is.null(outfile) ){ - print_help(opt_parser) - stop("An input and output files are required.", call.=FALSE) -} - -if ( arg$verbose ) { - options( warn = 0 ) -} else { - options( warn = -1 ) -} - -if ( arg$verbose ) { - write("Loading input mutation list into data frame.", stderr()) -} - -sample.mut.ref = read.table(file , header = T) - -if ( arg$verbose ) { - write("Converting data frame into mutation", stderr()) -} - -sigs.input <- mut.to.sigs.input(mut.ref = sample.mut.ref, - sample.id = "sample", - chr = "chrom", - pos = "pos", - ref = "ref", - alt = "alt") - -if ( arg$verbose ) { - write("Matching signatures with reference.", stderr()) -} - -if ( arg$model == "nature2013" ) { - sigmodel = signatures.nature2013 -} else if ( arg$model == "cosmic" ) { - sigmodel = signatures.cosmic -} else { - stop("Unknown model paramters. See help") -} - -sample_sig = whichSignatures(tumor.ref = sigs.input, - signatures.ref = sigmodel, - sample.id = arg$sampleid, - contexts.needed = TRUE, - tri.counts.method = 'default') - -pdf(arg$outfile, width = arg$resolution, height = arg$resolution, pointsize = arg$fontsize) -plotSignatures(sample_sig) -garbage <- dev.off() - - diff --git a/BALSAMIC/assets/scripts/VariantReport.R b/BALSAMIC/assets/scripts/VariantReport.R deleted file mode 100755 index 71b2b6e32..000000000 --- a/BALSAMIC/assets/scripts/VariantReport.R +++ /dev/null @@ -1,350 +0,0 @@ -#!/usr/bin/env Rscript -# Copyright 2018, Hassan Foroughi Asl -# -# This file is free software: you may copy, redistribute and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 2 of the License, or (at your -# option) any later version. -# -# This file is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -suppressPackageStartupMessages(library("optparse")) -suppressPackageStartupMessages(library("data.table")) -suppressPackageStartupMessages(library("stargazer")) - -option_list <- list( - make_option(c("-i", "--infile"), type="character", help="Input coverage analysis of Sambamba for exons.", metavar="character"), - make_option(c("--mode"), type="character", help="Run mode. Select one from the list: MVL,TMB,VarClass,VarCaller,VarCallerClass [default %default]", default="MVL"), - make_option(c("--dp"), type="character", help="Total read depth filter [default %default].", default="100"), - make_option(c("-F","--afmax"), type="character", help="Maximum tumor AF filter [default %default].", default="0.05"), - make_option(c("-f","--afmin"), type="character", help="Minimum tumor AF filter [default %default].", default="0.01"), - make_option(c("-a","--tumorad"), type="character", help="Allelic depth for alternative allele in tumor [default %default].", default="10"), - make_option(c("-m","--inMVL"), type="character", help="Flag to filter variant in MVL or not [default %default].", default="FALSE"), - make_option(c("--vartype"), type="character", help="Variant type filter. The value must exist in the TYPE column [default %default]", default="SNP"), - make_option(c("--varcaller"), type="character", help="Variant caller name filter. Choose from: STRELKA, MUTECT2, VARDICT, or ANY. Use multiple variant caller names sepraterd by comma and no space in between. [default %default].", default="VARDICT"), - make_option(c("--ann"), type="character", help="Annotation string to exact match and filter. [default %default].", default="missense_variant"), - make_option(c("--name"), type="character", help="A name for the output table [default %default].", default="Variant filter table"), - make_option(c("--inExon"), type="logical", help="A flag to select variants that are only found in exons [default %default]", default=TRUE), - make_option(c("--inGene"), type="logical", help="A flag to select variants that have a gene symbol annotation [default %default]", default=TRUE), - make_option(c("--genomeSize"), type="integer", help="Genome or panel or exome size to calculate TMB"), - make_option(c("--exclusiveSets"), type="logical", help="A flag to only perform setdiff between different sets of MVL [default %default]", default=FALSE), - make_option(c("--exportGene"), type="logical", help="A flag to not output the table, instead comma separated list of genes [default %default]", default=FALSE), - make_option(c("-t", "--type"), type="character", default="text", help="Output table fortmat type. Choose from: text, latex, html. And output file name is required for html and latex [default %default].", metavar="character"), - make_option(c("-o", "--outfile"), type="character", help="In case of PDF, output file name.", metavar="character"), - make_option(c("-v", "--verbose"), action="store_true", default=FALSE, help="Print some extra output [default %default]") - ) - -' - %prog [options] - - A script to report variants based on series of inputs. Data.table is sometimes faster than Pandas is aggregating -results, thus it was developed in R instead of Python. -' -> usage -opt_parser <- OptionParser(usage = usage, option_list=option_list); -arg <- parse_args(opt_parser) -file <- arg$infile - -if (is.null(file)){ - print_help(opt_parser) - stop("An input is required.", call.=FALSE) -} - -if (! arg$verbose ) { - options( warn = 0 ) -} else { - options( warn = -1 ) -} - -ConcatVarCall <- function(x) { - x = gsub("VARDICT","V",x) - x = gsub("STRELKA","S",x) - x = gsub("MUTECT2","M",x) - return(x) -} - -trimStr <- function(x) { - if (nchar(x) > 10) { - x = paste0(substring(x, 1, 3), "...", substring(x, nchar(x)-3, nchar(x))) - } - return(x) -} - -sample.coverage = fread(arg$infile, showProgress=F) -sample.coverage[,ID:=paste0(CHROM,"_",POS,"_",REF,"_",ALT)] - -dt_excl = data.table() - -if (arg$mode == "MVL") { - var_param = c("afmax","afmin","inMVL","dp","tumorad","name","varcaller","ann","vartype","outfile") - - set_cnt = c() - for (v in var_param) { - arg[[v]] = unlist(strsplit(arg[[v]], split=';', fixed=T)) - set_cnt = c(length(unlist(strsplit(arg[[v]],split=';', fixed=T))), set_cnt) - } - - if (length(unique(set_cnt)) > 1) { - stop("Number of sets is different among inputs.", call.=FALSE) - } - - int_vars = c("afmax","afmin","dp","tumorad") - for (v in int_vars) { - arg[[v]] = as.numeric(arg[[v]]) - } - - #bool_vars = c("inMVL", "inExon", "inGene") - bool_vars = c("inMVL") - for (v in bool_vars) { - arg[[v]] = as.logical(arg[[v]]) - } - for (i in 1:unique(set_cnt)) { - mvl = arg$inMVL[i] - if (mvl) { - mvl = "1" - } else { - mvl = "." - } - dp = arg$dp[i] - tumor_ad_alt = arg$tumorad[i] - af_max = arg$afmax[i] - af_min = arg$afmin[i] - var_type = unlist(strsplit(arg$vartype[i], ",")) - var_caller = unlist(strsplit(arg$varcaller[i], ",")) - table_name = arg$name[i] - table_num = arg$num[i] - outfile = arg$outfile[i] - - var_caller = toupper(var_caller) - if (any(var_caller %in% "ANY")) { - var_caller = c("MUTECT2", "VARDICT", "STRELKA") - } - - dt = sample.coverage[CALLER %in% var_caller - & MSK_MVL == mvl - & (TUMOR_AD_REF + TUMOR_AD_ALT) >= dp - & TUMOR_AD_ALT / (TUMOR_AD_REF + TUMOR_AD_ALT) <= af_max - & TUMOR_AD_ALT / (TUMOR_AD_REF + TUMOR_AD_ALT) >= af_min - & TUMOR_AD_ALT >= tumor_ad_alt - & TYPE %in% var_type] - - if (! is.null(arg$ann)) { - var_ann = unlist(strsplit(arg$ann, ",")) - dt = dt[Consequence %in% var_ann] - } - - if (arg$inExon) { - dt = dt[EXON != "."] - } - - if (arg$inGene) { - dt = dt[SYMBOL != "."] - } - - if (nrow(dt)!=0) { - - dt = dt[, - .("Chr:Pos" = paste0(CHROM,":",POS), - "Ref/Alt" = paste0(unlist(lapply(FUN = trimStr, REF)),"/",unlist(lapply(FUN = trimStr, ALT))), - "Caller" = ConcatVarCall(paste(unique(c(CALLER)), collapse = "/")), - "CallerCount" = length(unique(c(CALLER))), - "DP (Ref/Alt)" = paste0(floor(mean(TUMOR_AD_REF + TUMOR_AD_ALT)), - "(", - paste0(floor(mean(TUMOR_AD_REF)),"/", floor(mean(TUMOR_AD_ALT))), - ")"), - "AF" = mean(TUMOR_AD_ALT/(TUMOR_AD_REF + TUMOR_AD_ALT)), - "Consequence" = paste(unique(c(Consequence)), collapse = ", "), - "Protein"=paste(unlist(strsplit(HGVSp,":"))[2], collapse=", "), - "Gene" = SYMBOL - ) - ,by=.(ID)] - - dt = unique(dt) - dt = dt[,c("Chr:Pos", "Ref/Alt", "Caller", "DP (Ref/Alt)", "AF", "Gene", "Consequence", "Protein")] - - if (nrow(dt_excl)==0) { - dt_excl = dt[0,] - } - } - - - if (arg$exclusiveSets & unique(set_cnt) > 1 & nrow(dt)>0) { - dt = fsetdiff(dt, dt_excl, all = FALSE) - dt_excl = funion(dt, dt_excl) - } - - if (nrow(dt)==0) { - write(paste0("No variant were for found for table ", table_name), "") - write(" ", file = outfile) - } else { - if (arg$exportGene) { - write(paste0("list of genes for table ", table_name, " : ", - paste(unlist(unique(dt[, c("Gene")])), collapse=",")), "") - write(paste(unlist(unique(dt[, c("Gene")])), collapse=","), file = outfile) - } else { - if (is.null(arg$outfile) || arg$type == "text") { - stargazer(dt, summary = FALSE, type = arg$type, title = table_name, - table.placement = "H", digit.separator = "", rownames = F, style = "io", float = T, - notes = c(paste0("1. A summary of results based on \"", - table_name, "\" specification."), - paste0("2. Variant callers included: ", - tolower(paste(var_caller, collapse = ", ")))), - header = F, out.header = F) - } else { - stargazer(dt, summary = FALSE, title = table_name, - table.placement = "H", digit.separator = "", rownames = F, style = "io", float = T, - notes = c(paste0("1. A summary of results based on \"", - table_name, "\" specification."), - paste0("2. Variant callers included: ", - tolower(paste(var_caller, collapse = ", ")))), - header = F, out.header = F, out = outfile) - fwrite(x = dt, file = paste0(outfile, ".csv")) - } - } - } - } -} else if ( arg$mode == "TMB" ) { - if (is.null(arg$genomeSize)) { - stop("Genome/panel size is required.", call.=FALSE) - } - genomeLength = arg$genomeSize / 1e6 - - var_caller = as.list(unique(sample.coverage[,"CALLER"]))$CALLER - annotation = c("stop_gained", "stop_lost", "start_lost", - "missense_variant", "nonsynonymous_variant", - "splice_acceptor_variant", "splice_donor_variant", - "splice_donor_5th_base_variant", "splice_site_variant", - "splicing_variant", "frameshift_variant") - - dt1 = unique(sample.coverage[CALLER %in% var_caller - & Consequence %in% annotation, .(CHROM, POS), - by=.(ID, CALLER)])[,.(.N,"TMB"=.N/genomeLength),by=.(CALLER)] - - dt2 = unique(sample.coverage[CALLER %in% var_caller - & Consequence %in% annotation, .(CALLER), - by=.(ID)][,.(ID)])[,.(.N)] - dt2[,"CALLER":="ALL"] - setcolorder(dt2, neworder = c("CALLER", "N")) - dt = dt2[,.(CALLER, N, "TMB"=N/genomeLength)] - - str_annot = paste(gsub("_", "-", annotation), collapse = ", ") - dt.TMB = rbind(dt1, dt) - stargazer(unique(dt.TMB), summary = FALSE, type = arg$type, - title = "Tumor mutation burden (TMB)", - digit.separator = "", rownames = F, style = "io", - header = F, out.header = F, table.placement = "H", float = T, - notes = c(paste0("1. Variant callers included: ", - tolower(paste(as.list(unique(sample.coverage[,"CALLER"]))$CALLER, - collapse = ", "))), - paste0("2. Variant types: ", - tolower(paste(as.list(unique(sample.coverage[,"VARIANT_CLASS"]))$VARIANT_CLASS, - collapse=", "))), - paste0("3. Only all coding variants (all subchilds of nonsynonymous variants annotation)"))) - - if (!is.null(arg$outfile)){ - for (v in var_caller) { - fwrite(x = unique(sample.coverage[CALLER==v & SYMBOL!=".", - .("Chr:Pos" = paste0(CHROM,":",POS), - "Ref/Alt" = paste0(unlist(lapply(FUN = trimStr, REF)),"/",unlist(lapply(FUN = trimStr, ALT))), - "Caller" = ConcatVarCall(paste(unique(c(CALLER)), collapse = "/")), - "CallerCount" = length(unique(c(CALLER))), - "DP (Ref/Alt)" = paste0(floor(mean(TUMOR_AD_REF + TUMOR_AD_ALT)), - "(", - paste0(floor(mean(TUMOR_AD_REF)),"/", floor(mean(TUMOR_AD_ALT))), - ")"), - "AF" = mean(TUMOR_AD_ALT/(TUMOR_AD_REF + TUMOR_AD_ALT)), - "Consequence" = paste(unique(c(Consequence)), collapse = ", "), - "Protein" = paste(unlist(unique(HGVSp)), collapse=", "), - "Gene" = paste(unlist(unique(SYMBOL)), collapse=", ") - ) - ,by=.(ID)]), file = paste0(arg$outfile, "_", v, ".csv")) - } - } -} else if ( arg$mode == "VarClass" ) { - dt = unique(sample.coverage[,.(ID,CALLER,VARIANT_CLASS)]) - dt.typevars = dt[,.("CALLERCOUNT"=length(unique(c(CALLER))),.N), - by=.("CALLERS"=CALLER,VARIANT_CLASS)][order(CALLERS,-VARIANT_CLASS)] - - dt = unique(sample.coverage[,.(ID,CALLER,VARIANT_CLASS)]) - dt = dt[,.("CALLERS"="ALL","CALLERCOUNT"=length(unique(c(CALLER))),.N), - by=(VARIANT_CLASS)] - - setcolorder(dt, neworder = c("CALLERS", "VARIANT_CLASS", "CALLERCOUNT","N")) - dt.typecensus = rbind(dt, dt.typevars) - - dt = unique(sample.coverage[,.(ID,CALLER)]) - dt.allcallers = dt[,.("VARIANT_CLASS"="All_types","CALLERCOUNT"=1,.N), by=.("CALLERS"=CALLER) ][order(CALLERS)] - - dt.callercensus = rbind(dt.allcallers,dt.typecensus)[order(CALLERS,VARIANT_CLASS,CALLERCOUNT)] - stargazer(unique(dt.callercensus), summary = FALSE, type = arg$type, - title = "Variant class summary", - digit.separator = "", rownames = F, style = "io", - header = F, out.header = F, table.placement = "H", float = T, - notes = c(paste0("1. A summary of variant classes devided by variant class and variant caller"), - paste0("2. Variant callers included: ", - tolower(paste(as.list(unique(sample.coverage[,"CALLER"]))$CALLER, - collapse = ", "))), - paste0("3. Variant types: ", - tolower(paste(as.list(unique(sample.coverage[,"VARIANT_CLASS"]))$VARIANT_CLASS, - collapse=", "))))) -} else if ( arg$mode == "VarCaller" ) { - - var_caller = as.list(unique(sample.coverage[,"CALLER"]))$CALLER - var_caller_combn = do.call("c", lapply(seq_along(var_caller), - function(i) {combn(var_caller, i, simplify = F)})) - - dt = unique(sample.coverage[,.(ID,CALLER,VARIANT_CLASS)]) - dt = dt[,.("CALLERS"=paste(unique(c(CALLER)),collapse = "-"), - "CALLERCOUNT"=length(unique(c(CALLER)))), - by=.(ID)][,.(.N), - by=.(CALLERS,CALLERCOUNT) - ] - - dt.venn = dt[order(CALLERCOUNT,CALLERS)] - stargazer(unique(dt.venn), summary = FALSE, type = arg$type, - title = "Variant caller summary", - digit.separator = "", rownames = F, style = "io", - header = F, out.header = F, table.placement = "H", float = T, - notes = c(paste0("1. A summary of exclusive variant types ", - "devided by variant callers that identified them"), - paste0("2. Variant callers included: ", - tolower(paste(as.list(unique(sample.coverage[,"CALLER"]))$CALLER, - collapse = ", "))), - paste0("3. Variant types: ", - tolower(paste(as.list(unique(sample.coverage[,"VARIANT_CLASS"]))$VARIANT_CLASS, - collapse=", "))))) -} else if ( arg$mode == "VarCallerClass" ){ - - var_caller = as.list(unique(sample.coverage[,"CALLER"]))$CALLER - var_caller_combn = do.call("c", lapply(seq_along(var_caller), - function(i) {combn(var_caller, i, simplify = F)})) - - dt = unique(sample.coverage[,.(ID,CALLER,VARIANT_CLASS)]) - dt = dt[,.("CALLERS"=paste(unique(c(CALLER)),collapse = "-"), - "CALLERCOUNT"=length(unique(c(CALLER))), - VARIANT_CLASS), - by=.(ID)][,.(.N), - by=.(CALLERS,VARIANT_CLASS,CALLERCOUNT) - ] - - dt.venn = dt[order(CALLERCOUNT,CALLERS,VARIANT_CLASS)] - stargazer(unique(dt.venn), summary = FALSE, type = arg$type, - title = "Variant caller summary by class", - digit.separator = "", rownames = F, style = "io", - header = F, out.header = F, table.placement = "H", float = T, - notes = c(paste0("1. A summary of exclusive variant types ", - "devided by variant callers that identified them"), - paste0("2. Variant callers included: ", - tolower(paste(as.list(unique(sample.coverage[,"CALLER"]))$CALLER, - collapse = ", "))), - paste0("3. Variant types: ", - tolower(paste(as.list(unique(sample.coverage[,"VARIANT_CLASS"]))$VARIANT_CLASS, - collapse=", "))))) -} else { - stop("Run mode not recognized", call.=FALSE) -} diff --git a/BALSAMIC/assets/vagrant/Vagrantfile b/BALSAMIC/assets/vagrant/Vagrantfile deleted file mode 100644 index 622f0a791..000000000 --- a/BALSAMIC/assets/vagrant/Vagrantfile +++ /dev/null @@ -1,85 +0,0 @@ -# -*- mode: ruby -*- -# vi: set ft=ruby : - -#################################### -# CentOS 7 Build -# -# Downloads and installs the latest -# CentOS 6 build posted to S3 from -# Jenkins in a CentOS 7 VM. -# -# IP: 10.0.0.68 -# Admin dash username: admin -# Admin dash password: password -# -# NOTE that this image uses a custom box. It's based off of the -# minimal CentOS 7 box on vagrantbox.es, but we bumped to guest -# additions to match 4.3.20. On my Ubuntu 14.04 machine, I needed -# 4.3.20 Virtual box on the host machine and this custom image -# on the guest machine to get shared folders to work. -# -# mostly copied from official shiny_server git repo -# -#################################### - - -# Vagrantfile API/syntax version. Don't touch unless you know what you're doing! -VAGRANTFILE_API_VERSION = "2" - -Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| - # All Vagrant configuration is done here. The most common configuration - # options are documented and commented below. For a complete reference, - # please see the online documentation at vagrantup.com. - - # Every Vagrant virtual environment requires a box to build off of. - config.vm.box = "centos7-20" - - # The url from where the 'config.vm.box' box will be fetched if it - # doesn't already exist on the user's system. - config.vm.box_url = "https://s3-us-west-2.amazonaws.com/rstudio-vagrant-boxes/boxes/centos7.box" - - config.vm.host_name = "sso-centos7-latest" - - config.vm.provision "shell", path: "setup.sh" - - # Create a forwarded port mapping which allows access to a specific port - # within the machine from a port on the host machine. In the example below, - # accessing "localhost:8080" will access port 80 on the guest machine. - # config.vm.network :forwarded_port, guest: 80, host: 8080 - - # Create a private network, which allows host-only access to the machine - # using a specific IP. - #config.vm.network :private_network, ip: "10.0.0.62" - config.vm.network :forwarded_port, guest: 4001, host: 3838 - - # Create a public network, which generally matched to bridged network. - # Bridged networks make the machine appear as another physical device on - # your network. - # config.vm.network :public_network - - # If true, then any SSH connections made will enable agent forwarding. - # Default value: false - # config.ssh.forward_agent = true - - # Share an additional folder to the guest VM. The first argument is - # the path on the host to the actual folder. The second argument is - # the path on the guest to mount the folder. And the optional third - # argument is a set of non-required options. - # config.vm.synced_folder ".", "/vagrant_data" - - # Provider-specific configuration so you can fine-tune various - # backing providers for Vagrant. These expose provider-specific options. - # Example for VirtualBox: - # - config.vm.provider :virtualbox do |vb| - # Don't boot with headless mode - # vb.gui = true - - # Use VBoxManage to customize the VM. For example to change memory: - vb.customize ["modifyvm", :id, "--memory", "1024"] - end - # - # View the documentation for the provider you're using for more - # information on available options. - -end diff --git a/BALSAMIC/assets/vagrant/setup.sh b/BALSAMIC/assets/vagrant/setup.sh deleted file mode 100644 index 9eebdd721..000000000 --- a/BALSAMIC/assets/vagrant/setup.sh +++ /dev/null @@ -1,40 +0,0 @@ -#setup file mostly copied from official shiny_server git repo -yum clean all -y --enablerepo=* -yum install -y epel-release -yum update -y --disablerepo=epel - -# Enable EPEL -rpm -Uvh https://dl.fedoraproject.org/pub/epel/7/x86_64/Packages/e/epel-release-7-11.noarch.rpm - -# On this minimal install, we need wget -yum install -y coreutils -yum install -y yum-utils -yum install -y wget -yum install -y which -yum install -y bzip2 -yum install -y git -yum install -y gcc -yum install -y fontconfig -yum install -y libcurl libcurl-devel -yum install -y openssl-devel - -# Install R -yum install R -y - -wget https://s3.amazonaws.com/rstudio-shiny-server-os-build/centos-6.3/x86_64/VERSION -O "version.txt" -VERSION=`cat version.txt` - -# Install the latest SS build -wget "https://s3.amazonaws.com/rstudio-shiny-server-os-build/centos-6.3/x86_64/shiny-server-$VERSION-rh6-x86_64.rpm" -O ss-latest.rpm -yum install --nogpgcheck ss-latest.rpm -y - -sudo su - \ - -c "R -e \"install.packages(c('shiny', 'httpuv', 'rmarkdown', 'devtools', 'RJDBC'), repos='http://cran.rstudio.com/')\"" - -sudo R -e 'devtools::install_github("tidyverse/ggplot2")' - -systemctl disable firewalld -systemctl stop firewalld -sed -i 's/enforcing/disabled/g' /etc/selinux/config -systemctl enable shiny-server -systemctl start shiny-server diff --git a/BALSAMIC/assets/vcfanno/__init__.py b/BALSAMIC/assets/vcfanno/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/BALSAMIC/assets/vcfanno/vcfanno.toml b/BALSAMIC/assets/vcfanno/vcfanno.toml deleted file mode 100644 index c824c659d..000000000 --- a/BALSAMIC/assets/vcfanno/vcfanno.toml +++ /dev/null @@ -1,11 +0,0 @@ -[[annotation]] -file="gnomad.genomes.r2.1.1.sites.vcf.bgz" -fields = ["AF", "AF_popmax"] -ops=["self", "self"] -names=["GNOMADAF", "GNOMADAF_popmax"] - -[[annotation]] -file="clinvar.vcf.gz" -fields=["CLNACC", "CLNREVSTAT", "CLNSIG", "ORIGIN", "CLNVC", "CLNVCSO"] -ops=["self", "self","self","self","self","self"] -names=["CLNACC", "CLNREVSTAT", "CLNSIG", "ORIGIN", "CLNVC", "CLNVCSO"] diff --git a/BALSAMIC/workflows/archive/VariantCalling_paired_sentieon_wes b/BALSAMIC/workflows/archive/VariantCalling_paired_sentieon_wes deleted file mode 100644 index 90d6d3fd9..000000000 --- a/BALSAMIC/workflows/archive/VariantCalling_paired_sentieon_wes +++ /dev/null @@ -1,40 +0,0 @@ -#!python -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - -import os -from BALSAMIC.utils.rule import get_vcf -from BALSAMIC.utils.rule import get_result_dir - -shell.prefix("set -eo pipefail; ") - -rule_dir = config["rule_directory"] -analysis_fastq_dir = get_result_dir(config) + "/fastq/" -bam_dir = get_result_dir(config) + "/bam/" -cnv_dir = get_result_dir(config) + "/cnv/" -cutadapt_dir = get_result_dir(config) + "/cutadapt/" -fastqc_dir = get_result_dir(config) + "/fastqc/" -result_dir = get_result_dir(config) + "/" -vcf_dir = get_result_dir(config) + "/vcf/" -vep_dir = get_result_dir(config) + "/vep/" - -SENTIEON_LICENSE = "10.10.10.1:8990" -SENTIEON_INSTALL_DIR="/home/proj/development/cancer/sentieon/sentieon-genomics-201808.03" - -include: - rule_dir + "snakemake_rules/sentieon/sentieon.rule" - -rule all: - input: - expand(bam_dir + "{sample}.bam", sample=config["samples"]), - expand(bam_dir + "{sample}.dedup.bam", sample=config["samples"]), - expand(bam_dir + "{sample}.dedup.realign.bam", sample=config["samples"]), - expand(bam_dir + "{sample}.dedup.realign.recal_data.table", sample=config["samples"]), - expand(bam_dir + "{sample}.dedup.realign.recal.csv", sample=config["samples"]), - expand(bam_dir + "{sample}.dedup.realign.recal.pdf", sample=config["samples"]), - bam_dir + config["analysis"]["case_id"] + ".corealign.bam", - expand(vcf_dir + config["analysis"]["case_id"] + ".{algo}.vcf.gz", algo = ["tnsnv", "tnhaplotyper"]) - output: - os.path.join(get_result_dir(config), "analysis_finish") - shell: - "date +'%Y-%M-%d T%T %:z' > {output}" diff --git a/BALSAMIC/workflows/archive/VariantCalling_paired_umi b/BALSAMIC/workflows/archive/VariantCalling_paired_umi deleted file mode 100644 index befdd955e..000000000 --- a/BALSAMIC/workflows/archive/VariantCalling_paired_umi +++ /dev/null @@ -1,49 +0,0 @@ -#!python -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - -import os -from BALSAMIC.utils.rule import get_result_dir, get_vcf - -shell.prefix("set -eo pipefail; ") - -rule_dir = config["rule_directory"] -bam_dir = get_result_dir(config) + "/bam/" -cnv_dir = get_result_dir(config) + "/cnv/" -cutadapt_dir = get_result_dir(config) + "/cutadapt/" -fastqc_dir = get_result_dir(config) + "/fastqc/" -result_dir = get_result_dir(config) + "/" -vcf_dir = get_result_dir(config) + "/vcf/" -vep_dir = get_result_dir(config) + "/vep/" - -include: - -include: - rule_dir + "snakemake_rules/align/bwa_mem_umi.rule" -include: - rule_dir + "snakemake_rules/variant_calling/split_bed.rule" -include: - rule_dir + "snakemake_rules/quality_control/picard.rule" -include: - rule_dir + "snakemake_rules/umi/fgbio.rule" -include: - rule_dir + "snakemake_rules/variant_calling/mergetype_paired_umi.rule" -include: - rule_dir + "snakemake_rules/variant_calling/vardict.rule" -include: - rule_dir + "snakemake_rules/variant_calling/strelka.rule" -include: - rule_dir + "snakemake_rules/variant_calling/manta.rule" - -var_type = ["SNV", "SV"] -var_class = ["somatic", "germline"] - -rule all: - input: - expand(vcf_dir + "{vcf}.vcf.gz", vcf=get_vcf(config, ["vardict", "strelka"], [config["analysis"]["case_id"]])), - expand(bam_dir + "{sample}.unalg.umi.mrkadp.bwa.map.umi.cnsunalg.bwa.map.fltr.clip.bam", sample=config["samples"]), - expand(bam_dir + "{sample}.unalg.umi.mrkadp.bwa.map.umi.metrics", sample=config["samples"]) - output: - os.path.join(get_result_dir(config), "analysis_finish") - shell: - "date +'%Y-%M-%d T%T %:z' > {output}" diff --git a/BALSAMIC/workflows/archive/VariantCalling_single_umi b/BALSAMIC/workflows/archive/VariantCalling_single_umi deleted file mode 100644 index 1d6563b8b..000000000 --- a/BALSAMIC/workflows/archive/VariantCalling_single_umi +++ /dev/null @@ -1,39 +0,0 @@ -#!python -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - -import os -from BALSAMIC.utils.rule import get_result_dir, get_vcf - -shell.prefix("set -eo pipefail; ") - -rule_dir = config["rule_directory"] -bam_dir = get_result_dir(config) + "/bam/" -cnv_dir = get_result_dir(config) + "/cnv/" -cutadapt_dir = get_result_dir(config) + "/cutadapt/" -fastqc_dir = get_result_dir(config) + "/fastqc/" -result_dir = get_result_dir(config) + "/" -vcf_dir = get_result_dir(config) + "/vcf/" -vep_dir = get_result_dir(config) + "/vep/" - -include: - rule_dir + "snakemake_rules/umi/fgbio_v2.rule" -include: - rule_dir + "snakemake_rules/variant_calling/split_bed.rule" -include: - rule_dir + "snakemake_rules/variant_calling/mergetype_single_umi.rule" -include: - rule_dir + "snakemake_rules/umi/vardict_single_umi.rule" - -var_type = ["SNV"] -var_class = ["somatic", "germline"] - -rule all: - input: - expand(vcf_dir + "{vcf}.vcf.gz", vcf=get_vcf(config, ["vardict"], [config["analysis"]["case_id"]])), - expand(bam_dir + "{sample}.unalg.umi.mrkadp.bwa.map.umi.cnsunalg.bwa.map.fltr.clip.bam", sample=config["samples"]), - expand(bam_dir + "{sample}.unalg.umi.mrkadp.bwa.map.umi.bam.duplex_qc.pdf", sample=config["samples"]) - output: - os.path.join(get_result_dir(config), "analysis_finish") - shell: - "date +'%Y-%M-%d T%T %:z' > {output}" diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1f175c524..95abb5de0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -74,6 +74,7 @@ Removed: * Balsamic container https://github.com/Clinical-Genomics/BALSAMIC/pull/1230 * Plugin CLI https://github.com/Clinical-Genomics/BALSAMIC/pull/1245 * Realignment step for TGA workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1272 +* Archived/outdated workflows and scripts https://github.com/Clinical-Genomics/BALSAMIC/pull/1296 [12.0.2] --------