diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 1550915ff..981991c01 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -6,8 +6,6 @@ tag_name = v{new_version}
 
 [bumpversion:file:CITATION.cff]
 
-[bumpversion:file:BALSAMIC/containers/balsamic/Dockerfile]
-
 [bumpversion:file:BALSAMIC/__version__.py]
 
 [bumpversion:file:BALSAMIC/__init__.py]
@@ -21,5 +19,8 @@ tag_name = v{new_version}
 [bumpversion:file:docs/install.rst]
 
 [bumpversion:file:docs/balsamic_methods.rst]
+
+[bumpversion:file:setup.py]
+
 search = {current_version}
 replace = {new_version}
diff --git a/.github/workflows/black_linter.yml b/.github/workflows/black_linter.yml
index 045d267c5..e0d9b14b4 100644
--- a/.github/workflows/black_linter.yml
+++ b/.github/workflows/black_linter.yml
@@ -7,7 +7,7 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
        with:
-          python-version: '3.7'
+          python-version: '3.11'
      - uses: psf/black@stable
        with:
          options: "--check --verbose"
diff --git a/.github/workflows/docker_build_publish_develop.yml b/.github/workflows/docker_build_publish_develop.yml
index b8e92b2a6..1cac42a6f 100644
--- a/.github/workflows/docker_build_publish_develop.yml
+++ b/.github/workflows/docker_build_publish_develop.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py3, balsamic, delly, vcf2cytosure, cnvpytor, somalier, ascatNgs]
+        container-name: [align_qc, annotate, ascatNgs, cadd, cnvkit, cnvpytor, coverage_qc, delly, gatk, htslib, purecn, somalier, varcall_py3, varcall_py27, vcf2cytosure]
     steps:
       - name: Git checkout
         id: git_checkout
@@ -39,6 +39,11 @@ jobs:
           tags: clinicalgenomics/balsamic:develop-${{ matrix.container-name }}
           build-args: CONTAINER_NAME=${{ matrix.container-name }}
           provenance: false
+      - name: Prune containers
+        id: docker_prune_containers
+        shell: bash
+        run: |
+          docker system prune -a -f
       - name: Test container
         id: docker_test_container
         shell: bash
diff --git a/.github/workflows/docker_build_publish_release.yml b/.github/workflows/docker_build_publish_release.yml
index 5d88e24ae..1d8b8e87a 100644
--- a/.github/workflows/docker_build_publish_release.yml
+++ b/.github/workflows/docker_build_publish_release.yml
@@ -11,7 +11,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py3, balsamic, delly, vcf2cytosure, cnvpytor, somalier, ascatNgs]
+        container-name: [align_qc, annotate, ascatNgs, cadd, cnvkit, cnvpytor, coverage_qc, delly, gatk, htslib, purecn, somalier, varcall_py3, varcall_py27, vcf2cytosure]
     steps:
       - name: Git checkout
         id: git_checkout
@@ -39,6 +39,11 @@ jobs:
           tags: clinicalgenomics/balsamic:${{ steps.get_branch_name.outputs.branch }}-${{ matrix.container-name }}
           build-args: CONTAINER_NAME=${{ matrix.container-name }}
           provenance: false
+      - name: Prune containers
+        id: docker_prune_containers
+        shell: bash
+        run: |
+          docker system prune -a -f
       - name: Test container
         id: docker_test_container
         shell: bash
diff --git a/.github/workflows/pytest_and_coveralls.yml b/.github/workflows/pytest_and_coveralls.yml
index 7f74329ec..f6ab9cd3d 100644
--- a/.github/workflows/pytest_and_coveralls.yml
+++ b/.github/workflows/pytest_and_coveralls.yml
@@ -27,6 +27,8 @@ jobs:
         with:
           activate-environment: balsamic
           environment-file: BALSAMIC/conda/balsamic.yaml
+      - name: Install the HTML to PDF renderer
+        run: sudo apt-get update && sudo apt-get install -y wkhtmltopdf
       # Install BALSAMIC
       - name: Install BALSAMIC
         id: install_balsamic
diff --git a/.github/workflows/urls_check.yml b/.github/workflows/urls_check.yml
index b0fdac2b8..fe0569519 100644
--- a/.github/workflows/urls_check.yml
+++ b/.github/workflows/urls_check.yml
@@ -15,5 +15,5 @@ jobs:
       - name: Link Checker
         uses: lycheeverse/lychee-action@v1.8.0
         with:
-          args: --verbose './BALSAMIC/constants/reference.py' './docs/*.rst'
+          args: --verbose './BALSAMIC/constants/cache.py' './docs/*.rst'
           fail: true
diff --git a/.lycheeignore b/.lycheeignore
index f1070cd69..8982f9042 100644
--- a/.lycheeignore
+++ b/.lycheeignore
@@ -1,4 +1,6 @@
 https:\/\/gatk\.broadinstitute\.org\/hc\/.*
 https:\/\/cancer\.sanger\.ac\.uk\/cosmic\/file_download\/.*
 https:\/\/doi.org\/10.1073\/pnas.*
+https:\/\/doi.org\/10.1093\/bioinformatics\/.*
+https:\/\/drive.google.com\/.*
 https://doi.org/10.1002/cpbi.17
diff --git a/.pytest.ini b/.pytest.ini
index e15cdcbb6..fe10a3d3d 100644
--- a/.pytest.ini
+++ b/.pytest.ini
@@ -1,4 +1,4 @@
 [pytest]
-testpaths = tests testpaths = tests
-addopts = -v -x addopts = -v -x
+testpaths = tests
+addopts = -v -x
 norecursedirs = '.*' 'dist' 'CVS' '_darcs' '{arch}' '*.egg' 'venv'
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 1172aa53b..c0c927437 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -5,9 +5,9 @@
 version: 2
 
 build:
-  os: "ubuntu-20.04"
+  os: "ubuntu-22.04"
   tools:
-    python: "3.7"
+    python: "3.11"
 
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
diff --git a/BALSAMIC/assets/__init__.py b/BALSAMIC/assets/__init__.py
index 39979809e..e69de29bb 100644
--- a/BALSAMIC/assets/__init__.py
+++ b/BALSAMIC/assets/__init__.py
@@ -1,9 +0,0 @@
-import pkg_resources
-
-##### FILES #####
-
-cg_logo = "assets/cg.png"
-
-##### Paths #####
-
-cg_logo_path = pkg_resources.resource_filename("BALSAMIC", cg_logo)
diff --git a/BALSAMIC/assets/vcfanno/__init__.py b/BALSAMIC/assets/images/__init__.py
similarity index 100%
rename from BALSAMIC/assets/vcfanno/__init__.py
rename to BALSAMIC/assets/images/__init__.py
diff --git a/BALSAMIC/assets/balsamic_logo.png b/BALSAMIC/assets/images/balsamic_logo.png
similarity index 100%
rename from BALSAMIC/assets/balsamic_logo.png
rename to BALSAMIC/assets/images/balsamic_logo.png
diff --git a/BALSAMIC/assets/cg.png b/BALSAMIC/assets/images/cg.png
similarity index 100%
rename from BALSAMIC/assets/cg.png
rename to BALSAMIC/assets/images/cg.png
diff --git a/BALSAMIC/assets/scout_config_template.yaml b/BALSAMIC/assets/scout_config_template.yaml
deleted file mode 100644
index f8f143e95..000000000
--- a/BALSAMIC/assets/scout_config_template.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
----
-
-owner: cust000
-
-family: 'lovingtiger'
-family_name: 'lovingtiger'
-samples:
-  - analysis_type: panel
-    sample_id: tumor
-    tumor_type: unknown
-    capture_kit: capture_kit_name
-    sample_name: tumor
-    phenotype: affected
-    sex: unknown
-    expected_coverage: unknown
-    tmb: unknown
-    msi: unknown
-    tumor_purity: unknown
-    bam_path: path_tumor_merged.bam
-
-  - analysis_type: panel
-    sample_id: normal
-    capture_kit: capture_kit_name
-    sample_name: normal
-    phenotype: unaffected
-    sex: unknown
-    expected_coverage: unknown
-    bam_path: path_tumor_merged.bam
-
-vcf_cancer: path_to_final_vcf
-vcf_cancer_sv: path_to_final_vcf
-
-multiqc: path_multiqc_report
-
-default_gene_panels: [panel1]
-gene_panels: [panel1]
-
-# meta data
-rank_model_version: '1.1'
-rank_score_threshold: -100
-analysis_date: 'N/A'
-human_genome_build: 37
-track: cancer
diff --git a/BALSAMIC/assets/scripts/CoveragePlot.R b/BALSAMIC/assets/scripts/CoveragePlot.R
deleted file mode 100755
index 94e97ddd0..000000000
--- a/BALSAMIC/assets/scripts/CoveragePlot.R
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/usr/bin/env Rscript
-# Copyright 2018, Hassan Foroughi Asl
-#
-# This file is free software: you may copy, redistribute and/or modify it
-# under the terms of the GNU General Public License as published by the
-# Free Software Foundation, either version 2 of the License, or (at your
-# option) any later version.
-#
-# This file is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-suppressPackageStartupMessages(library("optparse"))
-suppressPackageStartupMessages(library("data.table"))
-
-option_list <- list(
-    make_option(c("-i", "--infile"), type="character",
-        help="Input coverage analysis of Sambamba for exons.", metavar="character"),
-    make_option(c("-o", "--outfile"), type="character",
-        help="In case of PDF, output file name [default infile.Coverage.pdf].", metavar="character"),
-    make_option(c("--avgcov"), type="integer",
-        help="Average coverage of sample. If it's not provided, an average will calculated from input file"),
-    make_option(c("--covline"), type="integer",
-        help="Plot coverage and normalized coverage plot for bed regions in the input file [default %default]", metavar="character", default=100),
-    make_option(c("--title"), type="character", help="plot title.", metavar="character", default= "Sample"),
-    make_option(c("-f", "--fontsize"), type="integer", default=12,
-        help="Fontsize as an input to pointsize in pdf() for heigh and width [default %default]"),
-    make_option(c("-r", "--resolution"), type="integer", default=7,
-        help="Print image resolution in inches, as an input to pdf() for heigh and width [default %default]"),
-    make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
-        help="Print some extra output [default %default]")
-    )
-
-'
-    %prog [options]
-    Coverage plot for sambamba depth output file.
-' -> usage
-opt_parser <- OptionParser(usage = usage, option_list=option_list);
-arg <- parse_args(opt_parser)
-file <- arg$infile
-outfile <- arg$outfile
-
-if (is.null(file)){
-    print_help(opt_parser)
-    stop("An input is required.", call.=FALSE)
-}
-
-if ( arg$verbose ) {
-    options( warn = 0 )
-} else {
-    options( warn = -1 )
-}
-
-if ( arg$verbose ) {
-    write("Read coverage file.", stderr())
-}
-
-sample.coverage = fread(file, showProgress=F)
-covSample = sample.coverage
-
-if (is.null(arg$avgcov) ) {
-    arg$avgcov = mean(covSample$meanCoverage)
-}
-
-pdf(arg$outfile, width = arg$resolution, height = arg$resolution, pointsize = arg$fontsize)
-
-par(mfrow = c(1,2), pty = "s")
-
-hist(covSample$meanCoverage, breaks = 100, xlab = "Coverage", main = arg$title)
-abline(v=arg$covline,col="red")
-
-hist(covSample$meanCoverage/arg$avgcov, breaks = 100, xlab = "Normalized Coverage", main = arg$title)
-abline(v=arg$covline/arg$avgcov,col="red")
-
-garbage <- dev.off()
diff --git a/BALSAMIC/assets/scripts/CoverageRep.R b/BALSAMIC/assets/scripts/CoverageRep.R
deleted file mode 100755
index 5ee951968..000000000
--- a/BALSAMIC/assets/scripts/CoverageRep.R
+++ /dev/null
@@ -1,158 +0,0 @@
-#!/usr/bin/env Rscript
-# Copyright 2018, Hassan Foroughi Asl
-#
-# This file is free software: you may copy, redistribute and/or modify it
-# under the terms of the GNU General Public License as published by the
-# Free Software Foundation, either version 2 of the License, or (at your
-# option) any later version.
-#
-# This file is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-suppressPackageStartupMessages(library("optparse"))
-suppressPackageStartupMessages(library("data.table"))
-suppressPackageStartupMessages(library("stargazer"))
-#suppressPackageStartupMessages(library("bit64"))
-
-option_list <- list(
-    make_option(c("-i", "--infile"), type="character",
-        help="Input coverage analysis of Sambamba for exons.", metavar="character"),
-    make_option(c("--genename"), type="character",
-        help="List of gene symbols comma separeted.", metavar="character"),
-    make_option(c("--ensemble"), type="character",
-        help="List of comma separeted gene ensemble ids, not both.", metavar="character"),
-    make_option(c("-t", "--type"), type="character", default="text",
-        help="Output table type format for exon coverage report [default %default].", metavar="character"),
-    make_option(c("--name"), type="character", help="A name for the output table [default %default].",
-        default="Coverage report"),
-    make_option(c("-o", "--outfile"), type="character",
-        help="In case of PDF, output file name [default infile.Coverage.pdf].", metavar="character"),
-    make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
-        help="Print some extra output [default %default]")
-    )
-
-'
-    %prog [options]
-
-    A script to report the exon coverage summary output from Sambamba with for canonical transcripts (longest
-transcript) of each gene. In coverage, the following is taken into account: 1. transcript must be protein coding. 2.
-Transcript should not have more than one exon with zero coverage. 3. A canonical transcript is a transcript that is
-longest and meets criteria 1 and 2.
-
-' -> usage
-opt_parser <- OptionParser(usage = usage, option_list=option_list);
-arg <- parse_args(opt_parser)
-file <- arg$infile
-outfile <- arg$outfile
-
-if (is.null(file)){
-    print_help(opt_parser)
-    stop("An input is required.", call.=FALSE)
-}
-
-if (! is.null(arg$genename) & ! is.null(arg$ensemble) ){
-    stop("Provide gene or ensemble id, not both", call.=FALSE)
-}
-
-if ( is.null(arg$genename) & is.null(arg$ensemble) ){
-    stop("You provide a list of genes or ensemble IDs.", call.=FALSE)
-}
-
-if ( arg$verbose ) {
-    options( warn = 0 )
-} else {
-    options( warn = -1 )
-}
-
-if ( arg$verbose ) {
-    write("Read coverage file.", stderr())
-}
-
-sample.coverage = fread(file, showProgress=F)
-
-fragLength = 100
-if (! is.null(arg$genename)) {
-    genelist = unlist(strsplit(arg$genename, ","))
-    dt.gene = sample.coverage[F10 %in% genelist,]
-}
-
-if (! is.null(arg$ensemble)) {
-    genelist = unlist(strsplit(arg$ensemble, ","))
-    dt.gene = sample.coverage[F11 %in% genelist,]
-}
-
-dt.gene = dt.gene[,
-    .(exonCount = .N,
-      readPerExon = sum(readCount)/.N,
-      meanExonCoverage = mean(readCount*fragLength/(abs(chromStart-chromEnd))),
-      medianExonCoverage = median(readCount*fragLength/(abs(chromStart-chromEnd))),
-      readPerbpPerExon = sum(readCount*fragLength)/(F7/.N),
-      txID = F3,
-      geneID = F11,
-      txLength = F7,
-      geneName = F10,
-      txType = F6,
-      txStatus = F9,
-      totalRead = sum(readCount),
-      zeroExonCov = sum(readCount==0),
-      zeroExonCovMid = !(any(which(!readCount)==length(readCount))
-                         || any(which(!readCount)==1)),
-      zeroExonCovLastFirst = any(which(!readCount)==length(readCount))
-                             || any(which(!readCount)==1)
-      ),
-    by=.(F3, F6, F7, F9, F10, F11)
-    ]
-
-dt.gene = dt.gene[txType=="protein_coding"]
-
-dt.gene = dt.gene[,
-    .("Gene" = geneName,
-      txID,
-      "tx_exonCount" = paste0(txID, "_", exonCount),
-      "tx length" = txLength,
-      txLength,
-      maxLength = max(txLength),
-      "tx type" = txType,
-      "tx status" = txStatus,
-      exonCount,
-      "read per exon" = readPerExon,
-      readPerbpPerExon,
-      "Median exon cov" = medianExonCoverage,
-      meanExonCoverage,
-      totalRead,
-      zeroExonCov,
-      zeroExonCovLastFirst,
-      zeroExonCovMid,
-      "Exon zero cov" = paste0(zeroExonCov,
-                               " (",
-                               as.integer(zeroExonCovLastFirst),
-                               " / ",
-                               as.integer(zeroExonCovMid),
-                               ")"),
-      maxTxReadCount = max(totalRead)
-      ),keyby=geneID]
-
-dt.gene = dt.gene[maxLength==txLength,
-    !c("maxLength",
-       "zeroExonCovLastFirst",
-       "zeroExonCovMid",
-       "txLength",
-       "geneID",
-       "txID",
-       "exonCount",
-       "totalRead",
-       "readPerbpPerExon",
-       "meanExonCoverage",
-       "zeroExonCov",
-       "Exon zero cov",
-       "maxTxReadCount")
-    ]
-
-stargazer(dt.gene, summary = FALSE, type = arg$type, title = arg$name,
-    table.placement = "H",
-    digit.separator = "", rownames = F, style = "io", float = T,
-    header = F, out.header = F)
diff --git a/BALSAMIC/assets/scripts/MutationalSig.R b/BALSAMIC/assets/scripts/MutationalSig.R
deleted file mode 100755
index 3f40da923..000000000
--- a/BALSAMIC/assets/scripts/MutationalSig.R
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/usr/bin/env Rscript
-# Copyright 2018, Hassan Foroughi Asl
-#
-# This file is free software: you may copy, redistribute and/or modify it
-# under the terms of the GNU General Public License as published by the
-# Free Software Foundation, either version 2 of the License, or (at your
-# option) any later version.
-#
-# This file is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-suppressPackageStartupMessages(library("optparse"))
-suppressPackageStartupMessages(library("deconstructSigs"))
-
-option_list <- list(
-    make_option(c("-i", "--infile"), type="character",
-        help="Input mutation file.", metavar="character"),
-    make_option(c("-o", "--outfile"), type="character",
-        help="output file name.", metavar="character"),
-    make_option(c("-m", "--model"), type="character", default="nature2013",
-        help="Choose between two models: nature2013 or cosmic. Refer to deconstructSigs documentatation. [default %default]"),
-    make_option(c("-r", "--resolution"), type="integer", default=7,
-        help="Print image resolution in inches, as an input to pdf() for heigh and width [default %default]"),
-    make_option(c("-f", "--fontsize"), type="integer", default=12,
-        help="Fontsize as an input to pointsize in pdf() for heigh and width [default %default]"),
-    make_option(c("-s", "--sampleid"), type="integer", default=1,
-        help="Numeric id for sample in input file. [default %default]"),
-    make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
-        help="Print extra output [default %default]")
-    )
-
-'
-    %prog [options]
-
-    A wrapper for deconstructSig to plot mutational signatures.
-
-' -> usage
-opt_parser <- OptionParser(usage = usage, option_list=option_list);
-arg <- parse_args(opt_parser)
-file <- arg$infile
-outfile <- arg$outfile
-
-if (is.null(file) || is.null(outfile) ){
-    print_help(opt_parser)
-    stop("An input and output files are required.", call.=FALSE)
-}
-
-if ( arg$verbose ) {
-    options( warn = 0 )
-} else {
-    options( warn = -1 )
-}
-
-if ( arg$verbose ) {
-    write("Loading input mutation list into data frame.", stderr())
-}
-
-sample.mut.ref = read.table(file , header = T)
-
-if ( arg$verbose ) {
-    write("Converting data frame into mutation", stderr())
-}
-
-sigs.input <- mut.to.sigs.input(mut.ref = sample.mut.ref,
-    sample.id = "sample",
-    chr = "chrom",
-    pos = "pos",
-    ref = "ref",
-    alt = "alt")
-
-if ( arg$verbose ) {
-    write("Matching signatures with reference.", stderr())
-}
-
-if ( arg$model == "nature2013" ) {
-    sigmodel = signatures.nature2013
-} else if ( arg$model == "cosmic" ) {
-    sigmodel = signatures.cosmic
-} else {
-    stop("Unknown model paramters. See help")
-}
-
-sample_sig = whichSignatures(tumor.ref = sigs.input,
-    signatures.ref = sigmodel,
-    sample.id = arg$sampleid,
-    contexts.needed = TRUE,
-    tri.counts.method = 'default')
-
-pdf(arg$outfile, width = arg$resolution, height = arg$resolution, pointsize = arg$fontsize)
-plotSignatures(sample_sig)
-garbage <- dev.off()
-
-
diff --git a/BALSAMIC/assets/scripts/VariantReport.R b/BALSAMIC/assets/scripts/VariantReport.R
deleted file mode 100755
index 71b2b6e32..000000000
--- a/BALSAMIC/assets/scripts/VariantReport.R
+++ /dev/null
@@ -1,350 +0,0 @@
-#!/usr/bin/env Rscript
-# Copyright 2018, Hassan Foroughi Asl
-#
-# This file is free software: you may copy, redistribute and/or modify it
-# under the terms of the GNU General Public License as published by the
-# Free Software Foundation, either version 2 of the License, or (at your
-# option) any later version.
-#
-# This file is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-suppressPackageStartupMessages(library("optparse"))
-suppressPackageStartupMessages(library("data.table"))
-suppressPackageStartupMessages(library("stargazer"))
-
-option_list <- list(
-    make_option(c("-i", "--infile"), type="character", help="Input coverage analysis of Sambamba for exons.", metavar="character"),
-    make_option(c("--mode"), type="character", help="Run mode. Select one from the list: MVL,TMB,VarClass,VarCaller,VarCallerClass [default %default]", default="MVL"),
-    make_option(c("--dp"), type="character", help="Total read depth filter [default %default].", default="100"),
-    make_option(c("-F","--afmax"), type="character", help="Maximum tumor AF filter [default %default].", default="0.05"),
-    make_option(c("-f","--afmin"), type="character", help="Minimum tumor AF filter [default %default].", default="0.01"),
-    make_option(c("-a","--tumorad"), type="character", help="Allelic depth for alternative allele in tumor [default %default].", default="10"),
-    make_option(c("-m","--inMVL"), type="character", help="Flag to filter variant in MVL or not [default %default].", default="FALSE"),
-    make_option(c("--vartype"), type="character", help="Variant type filter. The value must exist in the TYPE column [default %default]", default="SNP"),
-    make_option(c("--varcaller"), type="character", help="Variant caller name filter. Choose from: STRELKA, MUTECT2, VARDICT, or ANY. Use multiple variant caller names sepraterd by comma and no space in between. [default %default].", default="VARDICT"),
-    make_option(c("--ann"), type="character", help="Annotation string to exact match and filter. [default %default].", default="missense_variant"),
-    make_option(c("--name"), type="character", help="A name for the output table [default %default].", default="Variant filter table"),
-    make_option(c("--inExon"), type="logical", help="A flag to select variants that are only found in exons [default %default]", default=TRUE),
-    make_option(c("--inGene"), type="logical", help="A flag to select variants that have a gene symbol annotation [default %default]", default=TRUE),
-    make_option(c("--genomeSize"), type="integer", help="Genome or panel or exome size to calculate TMB"),
-    make_option(c("--exclusiveSets"), type="logical", help="A flag to only perform setdiff between different sets of MVL [default %default]", default=FALSE),
-    make_option(c("--exportGene"), type="logical", help="A flag to not output the table, instead comma separated list of genes [default %default]", default=FALSE),
-    make_option(c("-t", "--type"), type="character", default="text", help="Output table fortmat type. Choose from: text, latex, html. And output file name is required for html and latex [default %default].", metavar="character"),
-    make_option(c("-o", "--outfile"), type="character", help="In case of PDF, output file name.", metavar="character"),
-    make_option(c("-v", "--verbose"), action="store_true", default=FALSE, help="Print some extra output [default %default]")
-    )
-
-'
-    %prog [options]
-
-    A script to report variants based on series of inputs. Data.table is sometimes faster than Pandas is aggregating
-results, thus it was developed in R instead of Python.
-' -> usage
-opt_parser <- OptionParser(usage = usage, option_list=option_list);
-arg <- parse_args(opt_parser)
-file <- arg$infile
-
-if (is.null(file)){
-    print_help(opt_parser)
-    stop("An input is required.", call.=FALSE)
-}
-
-if (! arg$verbose ) {
-    options( warn = 0 )
-} else {
-    options( warn = -1 )
-}
-
-ConcatVarCall <- function(x) {
-    x = gsub("VARDICT","V",x)
-    x = gsub("STRELKA","S",x)
-    x = gsub("MUTECT2","M",x)
-    return(x)
-}
-
-trimStr <- function(x) {
-    if (nchar(x) > 10) {
-        x = paste0(substring(x, 1, 3), "...", substring(x, nchar(x)-3, nchar(x)))
-    }
-    return(x)
-}
-
-sample.coverage = fread(arg$infile, showProgress=F)
-sample.coverage[,ID:=paste0(CHROM,"_",POS,"_",REF,"_",ALT)]
-
-dt_excl = data.table()
-
-if (arg$mode == "MVL") {
-    var_param = c("afmax","afmin","inMVL","dp","tumorad","name","varcaller","ann","vartype","outfile")
-
-    set_cnt = c()
-    for (v in var_param) {
-        arg[[v]] = unlist(strsplit(arg[[v]], split=';', fixed=T))
-        set_cnt = c(length(unlist(strsplit(arg[[v]],split=';', fixed=T))), set_cnt)
-    }
-
-    if (length(unique(set_cnt)) > 1) {
-        stop("Number of sets is different among inputs.", call.=FALSE)
-    }
-
-    int_vars = c("afmax","afmin","dp","tumorad")
-    for (v in int_vars) {
-        arg[[v]] = as.numeric(arg[[v]])
-    }
-
-    #bool_vars = c("inMVL", "inExon", "inGene")
-    bool_vars = c("inMVL")
-    for (v in bool_vars) {
-        arg[[v]] = as.logical(arg[[v]])
-    }
-    for (i in 1:unique(set_cnt)) {
-        mvl = arg$inMVL[i]
-        if (mvl) {
-            mvl = "1"
-        } else {
-            mvl = "."
-        }
-        dp = arg$dp[i]
-        tumor_ad_alt = arg$tumorad[i]
-        af_max = arg$afmax[i]
-        af_min = arg$afmin[i]
-        var_type = unlist(strsplit(arg$vartype[i], ","))
-        var_caller = unlist(strsplit(arg$varcaller[i], ","))
-        table_name = arg$name[i]
-        table_num = arg$num[i]
-        outfile = arg$outfile[i]
-
-        var_caller = toupper(var_caller)
-        if (any(var_caller %in% "ANY")) {
-            var_caller = c("MUTECT2", "VARDICT", "STRELKA")
-        }
-
-        dt = sample.coverage[CALLER %in% var_caller
-                             & MSK_MVL == mvl
-                             & (TUMOR_AD_REF + TUMOR_AD_ALT) >= dp
-                             & TUMOR_AD_ALT / (TUMOR_AD_REF + TUMOR_AD_ALT) <= af_max
-                             & TUMOR_AD_ALT / (TUMOR_AD_REF + TUMOR_AD_ALT) >= af_min
-                             & TUMOR_AD_ALT >= tumor_ad_alt
-                             & TYPE %in% var_type]
-
-        if (! is.null(arg$ann)) {
-            var_ann = unlist(strsplit(arg$ann, ","))
-            dt = dt[Consequence %in% var_ann]
-        }
-
-        if (arg$inExon) {
-            dt = dt[EXON != "."]
-        }
-
-        if (arg$inGene) {
-            dt = dt[SYMBOL != "."]
-        }
-
-        if (nrow(dt)!=0) {
-
-            dt = dt[,
-                .("Chr:Pos" = paste0(CHROM,":",POS),
-                  "Ref/Alt" = paste0(unlist(lapply(FUN = trimStr, REF)),"/",unlist(lapply(FUN = trimStr, ALT))),
-                  "Caller" = ConcatVarCall(paste(unique(c(CALLER)), collapse = "/")),
-                  "CallerCount" = length(unique(c(CALLER))),
-                  "DP (Ref/Alt)" = paste0(floor(mean(TUMOR_AD_REF + TUMOR_AD_ALT)),
-                                          "(",
-                                          paste0(floor(mean(TUMOR_AD_REF)),"/", floor(mean(TUMOR_AD_ALT))),
-                                          ")"),
-                  "AF" = mean(TUMOR_AD_ALT/(TUMOR_AD_REF + TUMOR_AD_ALT)),
-                  "Consequence" = paste(unique(c(Consequence)), collapse = ", "),
-                  "Protein"=paste(unlist(strsplit(HGVSp,":"))[2], collapse=", "),
-                  "Gene" = SYMBOL
-                  )
-                ,by=.(ID)]
-
-            dt = unique(dt)
-            dt = dt[,c("Chr:Pos", "Ref/Alt", "Caller", "DP (Ref/Alt)", "AF", "Gene", "Consequence", "Protein")]
-
-            if (nrow(dt_excl)==0) {
-                dt_excl = dt[0,]
-            }
-        }
-
-
-        if (arg$exclusiveSets & unique(set_cnt) > 1 & nrow(dt)>0) {
-            dt = fsetdiff(dt, dt_excl, all = FALSE)
-            dt_excl = funion(dt, dt_excl)
-        }
-
-        if (nrow(dt)==0) {
-            write(paste0("No variant were for found for table ", table_name), "")
-            write(" ", file = outfile)
-        } else {
-            if (arg$exportGene) {
-                write(paste0("list of genes for table ", table_name, " : ",
-                             paste(unlist(unique(dt[, c("Gene")])), collapse=",")), "")
-                write(paste(unlist(unique(dt[, c("Gene")])), collapse=","), file = outfile)
-            } else {
-                if (is.null(arg$outfile) || arg$type == "text") {
-                    stargazer(dt, summary = FALSE, type = arg$type, title = table_name,
-                        table.placement = "H", digit.separator = "", rownames = F, style = "io", float = T,
-                        notes = c(paste0("1. A summary of results based on \"",
-                                         table_name, "\" specification."),
-                                  paste0("2. Variant callers included: ",
-                                         tolower(paste(var_caller, collapse = ", ")))),
-                        header = F, out.header = F)
-                } else {
-                    stargazer(dt, summary = FALSE, title = table_name,
-                        table.placement = "H", digit.separator = "", rownames = F, style = "io", float = T,
-                        notes = c(paste0("1. A summary of results based on \"",
-                                         table_name, "\" specification."),
-                                  paste0("2. Variant callers included: ",
-                                         tolower(paste(var_caller, collapse = ", ")))),
-                        header = F, out.header = F, out = outfile)
-                    fwrite(x = dt, file = paste0(outfile, ".csv"))
-                }
-            }
-        }
-    }
-} else if ( arg$mode == "TMB" ) {
-    if (is.null(arg$genomeSize)) {
-        stop("Genome/panel size is required.", call.=FALSE)
-    }
-    genomeLength = arg$genomeSize / 1e6
-
-    var_caller = as.list(unique(sample.coverage[,"CALLER"]))$CALLER
-    annotation = c("stop_gained", "stop_lost", "start_lost",
-                   "missense_variant", "nonsynonymous_variant",
-                   "splice_acceptor_variant", "splice_donor_variant",
-                   "splice_donor_5th_base_variant", "splice_site_variant",
-                   "splicing_variant", "frameshift_variant")
-
-    dt1 = unique(sample.coverage[CALLER %in% var_caller
-                                 & Consequence %in% annotation, .(CHROM, POS),
-                                 by=.(ID, CALLER)])[,.(.N,"TMB"=.N/genomeLength),by=.(CALLER)]
-
-    dt2 = unique(sample.coverage[CALLER %in% var_caller
-                                 & Consequence %in% annotation, .(CALLER),
-                                 by=.(ID)][,.(ID)])[,.(.N)]
-    dt2[,"CALLER":="ALL"]
-    setcolorder(dt2, neworder = c("CALLER", "N"))
-    dt = dt2[,.(CALLER, N, "TMB"=N/genomeLength)]
-
-    str_annot = paste(gsub("_", "-", annotation), collapse = ", ")
-    dt.TMB = rbind(dt1, dt)
-    stargazer(unique(dt.TMB), summary = FALSE, type = arg$type,
-        title = "Tumor mutation burden (TMB)",
-        digit.separator = "", rownames = F, style = "io",
-        header = F, out.header = F, table.placement = "H", float = T,
-        notes = c(paste0("1. Variant callers included: ",
-                         tolower(paste(as.list(unique(sample.coverage[,"CALLER"]))$CALLER,
-                                       collapse = ", "))),
-                  paste0("2. Variant types: ",
-                         tolower(paste(as.list(unique(sample.coverage[,"VARIANT_CLASS"]))$VARIANT_CLASS,
-                                       collapse=", "))),
-                  paste0("3. Only all coding variants (all subchilds of nonsynonymous variants annotation)")))
-
-    if (!is.null(arg$outfile)){
-        for (v in var_caller) {
-            fwrite(x = unique(sample.coverage[CALLER==v & SYMBOL!=".",
-                .("Chr:Pos" = paste0(CHROM,":",POS),
-                  "Ref/Alt" = paste0(unlist(lapply(FUN = trimStr, REF)),"/",unlist(lapply(FUN = trimStr, ALT))),
-                  "Caller" = ConcatVarCall(paste(unique(c(CALLER)), collapse = "/")),
-                  "CallerCount" = length(unique(c(CALLER))),
-                  "DP (Ref/Alt)" = paste0(floor(mean(TUMOR_AD_REF + TUMOR_AD_ALT)),
-                                          "(",
-                                          paste0(floor(mean(TUMOR_AD_REF)),"/", floor(mean(TUMOR_AD_ALT))),
-                                          ")"),
-                  "AF" = mean(TUMOR_AD_ALT/(TUMOR_AD_REF + TUMOR_AD_ALT)),
-                  "Consequence" = paste(unique(c(Consequence)), collapse = ", "),
-                  "Protein" = paste(unlist(unique(HGVSp)), collapse=", "),
-                  "Gene" = paste(unlist(unique(SYMBOL)), collapse=", ")
-                  )
-                ,by=.(ID)]), file = paste0(arg$outfile, "_", v, ".csv"))
-        }
-    }
-} else if ( arg$mode == "VarClass" ) {
-    dt = unique(sample.coverage[,.(ID,CALLER,VARIANT_CLASS)])
-    dt.typevars = dt[,.("CALLERCOUNT"=length(unique(c(CALLER))),.N),
-                     by=.("CALLERS"=CALLER,VARIANT_CLASS)][order(CALLERS,-VARIANT_CLASS)]
-
-    dt = unique(sample.coverage[,.(ID,CALLER,VARIANT_CLASS)])
-    dt = dt[,.("CALLERS"="ALL","CALLERCOUNT"=length(unique(c(CALLER))),.N),
-            by=(VARIANT_CLASS)]
-
-    setcolorder(dt, neworder = c("CALLERS", "VARIANT_CLASS", "CALLERCOUNT","N"))
-    dt.typecensus = rbind(dt, dt.typevars)
-
-    dt = unique(sample.coverage[,.(ID,CALLER)])
-    dt.allcallers = dt[,.("VARIANT_CLASS"="All_types","CALLERCOUNT"=1,.N), by=.("CALLERS"=CALLER) ][order(CALLERS)]
-
-    dt.callercensus = rbind(dt.allcallers,dt.typecensus)[order(CALLERS,VARIANT_CLASS,CALLERCOUNT)]
-    stargazer(unique(dt.callercensus), summary = FALSE, type = arg$type,
-        title = "Variant class summary",
-        digit.separator = "", rownames = F, style = "io",
-        header = F, out.header = F, table.placement = "H", float = T,
-        notes = c(paste0("1. A summary of variant classes devided by variant class and variant caller"),
-                  paste0("2. Variant callers included: ",
-                         tolower(paste(as.list(unique(sample.coverage[,"CALLER"]))$CALLER,
-                                       collapse = ", "))),
-                  paste0("3. Variant types: ",
-                         tolower(paste(as.list(unique(sample.coverage[,"VARIANT_CLASS"]))$VARIANT_CLASS,
-                                       collapse=", ")))))
-} else if ( arg$mode == "VarCaller" ) {
-
-    var_caller = as.list(unique(sample.coverage[,"CALLER"]))$CALLER
-    var_caller_combn = do.call("c", lapply(seq_along(var_caller),
-                                           function(i) {combn(var_caller, i, simplify = F)}))
-
-    dt = unique(sample.coverage[,.(ID,CALLER,VARIANT_CLASS)])
-    dt = dt[,.("CALLERS"=paste(unique(c(CALLER)),collapse = "-"),
-               "CALLERCOUNT"=length(unique(c(CALLER)))),
-            by=.(ID)][,.(.N),
-                      by=.(CALLERS,CALLERCOUNT)
-                      ]
-
-    dt.venn = dt[order(CALLERCOUNT,CALLERS)]
-    stargazer(unique(dt.venn), summary = FALSE, type = arg$type,
-        title = "Variant caller summary",
-        digit.separator = "", rownames = F, style = "io",
-        header = F, out.header = F, table.placement = "H", float = T,
-        notes = c(paste0("1. A summary of exclusive variant types ",
-                         "devided by variant callers that identified them"),
-                  paste0("2. Variant callers included: ",
-                         tolower(paste(as.list(unique(sample.coverage[,"CALLER"]))$CALLER,
-                                       collapse = ", "))),
-                  paste0("3. Variant types: ",
-                         tolower(paste(as.list(unique(sample.coverage[,"VARIANT_CLASS"]))$VARIANT_CLASS,
-                                       collapse=", ")))))
-} else if ( arg$mode == "VarCallerClass" ){
-
-    var_caller = as.list(unique(sample.coverage[,"CALLER"]))$CALLER
-    var_caller_combn = do.call("c", lapply(seq_along(var_caller),
-                                           function(i) {combn(var_caller, i, simplify = F)}))
-
-    dt = unique(sample.coverage[,.(ID,CALLER,VARIANT_CLASS)])
-    dt = dt[,.("CALLERS"=paste(unique(c(CALLER)),collapse = "-"),
-               "CALLERCOUNT"=length(unique(c(CALLER))),
-               VARIANT_CLASS),
-            by=.(ID)][,.(.N),
-                      by=.(CALLERS,VARIANT_CLASS,CALLERCOUNT)
-                      ]
-
-    dt.venn = dt[order(CALLERCOUNT,CALLERS,VARIANT_CLASS)]
-    stargazer(unique(dt.venn), summary = FALSE, type = arg$type,
-        title = "Variant caller summary by class",
-        digit.separator = "", rownames = F, style = "io",
-        header = F, out.header = F, table.placement = "H", float = T,
-        notes = c(paste0("1. A summary of exclusive variant types ",
-                         "devided by variant callers that identified them"),
-                  paste0("2. Variant callers included: ",
-                         tolower(paste(as.list(unique(sample.coverage[,"CALLER"]))$CALLER,
-                                       collapse = ", "))),
-                  paste0("3. Variant types: ",
-                         tolower(paste(as.list(unique(sample.coverage[,"VARIANT_CLASS"]))$VARIANT_CLASS,
-                                       collapse=", ")))))
-} else {
-    stop("Run mode not recognized", call.=FALSE)
-}
diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py
index d10d213cf..8c976f49f 100755
--- a/BALSAMIC/assets/scripts/collect_qc_metrics.py
+++ b/BALSAMIC/assets/scripts/collect_qc_metrics.py
@@ -1,20 +1,20 @@
 #!/usr/bin/env python
 import os
+import re
 from pathlib import Path
 from typing import List, Optional
 
 import click
 import yaml
-import re
 
-from BALSAMIC.constants.qc_metrics import METRICS
+from BALSAMIC.constants.metrics import METRICS
+from BALSAMIC.models.metrics import Metric
 from BALSAMIC.utils.io import read_json
-from BALSAMIC.utils.models import MetricModel
 from BALSAMIC.utils.rule import (
+    get_analysis_type,
     get_capture_kit,
+    get_sample_type_from_sample_name,
     get_sequencing_type,
-    get_sample_type_from_prefix,
-    get_analysis_type,
 )
@@ -109,7 +109,7 @@ def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str:
 
 
 def get_relatedness_metrics(multiqc_data: dict) -> list:
-    """Retrieves the relatedness metrics and returns them as a MetricModel list."""
+    """Retrieves the relatedness metrics and returns them as a Metric list."""
     source_tool = "Somalier"
     metric = "relatedness"
     step = "multiqc_somalier"
@@ -124,14 +124,14 @@
         metric_value = multiqc_data["report_saved_raw_data"][step][sample][metric]
         case_id = re.sub(r"_NORMAL.*|_TUMOR.*", "", sample)
 
-        output_metrics = MetricModel(
+        output_metrics = Metric(
            id=case_id,
            input=data_source,
            name=metric.upper(),
            step=step,
            value=metric_value,
            condition=METRICS["paired"]["RELATEDNESS"]["condition"],
-        ).dict()
+        ).model_dump()
 
     return [output_metrics]
@@ -171,11 +171,10 @@ def get_metric_condition(
     """Returns a condition associated to a sample and sequencing type"""
 
     sequencing_type = get_sequencing_type(config)
-    try:
-        sample_type = get_sample_type_from_prefix(config, sample)
-    except KeyError:
+    sample_type = get_sample_type_from_sample_name(config, sample)
+    if not sample_type:
         # Deletes pair orientation information from the sample name (insertSize metrics)
-        sample_type = get_sample_type_from_prefix(config, sample.rsplit("_", 1)[0])
+        sample_type = get_sample_type_from_sample_name(config, sample.split("_")[0])
 
     req_metrics = requested_metrics[metric]["condition"]
     if sequencing_type == "wgs" and (
@@ -200,9 +199,13 @@ def extract(data, output_metrics, sample=None, source=None):
             # Ignore UMI and reverse reads metrics
             if "umi" not in k:
                 if k in requested_metrics:
+                    # example of possible sample-formats below from "report_saved_raw_data":
+                    # tumor.ACCXXXXXX
+                    # tumor.ACCXXXXXX_FR
+                    # extracted below for id to: ACCXXXXXX
                     output_metrics.append(
-                        MetricModel(
-                            id=sample.split("_")[1],
+                        Metric(
+                            id=sample.split(".")[1].split("_")[0],
                             input=get_multiqc_data_source(
                                 multiqc_data, sample, source
                             ),
@@ -212,10 +215,10 @@
                             condition=get_metric_condition(
                                 config,
                                 requested_metrics,
-                                sample,
+                                sample.split(".")[1].split("_")[0],
                                 k,
                             ),
-                        ).dict()
+                        ).model_dump()
                     )
             extract(data[k], output_metrics, k, sample)
@@ -241,7 +244,7 @@
 
 def get_variant_metrics(counts_path: list) -> list:
-    """Retrieves the variant metrics and returns them as a MetricModel list"""
+    """Retrieves the variant metrics and returns them as a Metric list"""
 
     output_metrics = list()
 
@@ -252,14 +255,14 @@ def get_variant_metrics(counts_path: list) -> list:
     requested_metrics = METRICS["variants"]
     for metric in requested_metrics:
         output_metrics.append(
-            MetricModel(
+            Metric(
                 id=os.path.basename(counts_path).split(".")[2],  # case_id
                 input=os.path.basename(counts_path),
                 name=metric,
                 step="collect_custom_qc_metrics",
                 value=variant_metrics[metric],
                 condition=requested_metrics[metric]["condition"],
-            ).dict()
+            ).model_dump()
         )
 
     return output_metrics
diff --git a/BALSAMIC/assets/scripts/csv_to_pdf.py b/BALSAMIC/assets/scripts/csv_to_pdf.py
new file mode 100644
index 000000000..4b40bde07
--- /dev/null
+++ b/BALSAMIC/assets/scripts/csv_to_pdf.py
@@ -0,0 +1,41 @@
+"""Script for converting a CSV file to a PDF."""
+from pathlib import Path
+
+import click
+from pandas import DataFrame, read_csv
+
+from BALSAMIC.utils.pdf_report import get_table_html, html_to_pdf
+
+
+@click.command()
+@click.argument(
+    "csv_path", nargs=1, required=True, type=click.Path(exists=True, resolve_path=True)
+)
+@click.argument("pdf_path", nargs=1, required=True, type=click.Path(resolve_path=True))
+@click.option(
+    "--delimiter",
+    type=click.STRING,
+    default=",",
+    show_default=True,
+    help="CSV file delimiter.",
+)
+@click.option(
+    "-h",
+    "--header",
+    is_flag=True,
+    help="Include this option if the CSV file has a header row.",
+)
+def csv_to_pdf(csv_path: str, pdf_path: str, delimiter: str, header: bool) -> None:
+    """Convert CSV file to a PDF."""
+    df: DataFrame = read_csv(filepath_or_buffer=csv_path, delimiter=delimiter)
+    html_table: str = df.to_html(
+        index=False, na_rep="NA", justify="center", escape=False, header=header
+    )
+    html_page: str = get_table_html(
+        html_table=html_table, table_name=Path(csv_path).stem
+    )
+    html_to_pdf(html_string=html_page, pdf_path=pdf_path)
+
+
+if __name__ == "__main__":
+    csv_to_pdf()
diff --git a/BALSAMIC/assets/scripts/edit_vcf_info.py b/BALSAMIC/assets/scripts/edit_vcf_info.py
index c1f1609dc..25eee242a 100644
--- a/BALSAMIC/assets/scripts/edit_vcf_info.py
+++ b/BALSAMIC/assets/scripts/edit_vcf_info.py
@@ -1,10 +1,6 @@
-# vim: syntax=python tabstop=4 expandtab
-# coding: utf-8
-
-""" Script to include variant caller information to the VCF info field and output to new VCF file """
+""" Script to include variant caller information to the VCF info field and output to new VCF file."""
 
 import click
-import gzip
 from cyvcf2 import VCF, Writer
diff --git a/BALSAMIC/assets/scripts/generate_cnv_report.py b/BALSAMIC/assets/scripts/generate_cnv_report.py
deleted file mode 100644
index 3f1541dca..000000000
--- a/BALSAMIC/assets/scripts/generate_cnv_report.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/usr/bin/env python
-import os
-from pathlib import Path
-from typing import List, Tuple
-
-import click
-
-from fpdf import FPDF
-from PIL import Image
-
-
-@click.command(short_help="Merge statistics and plots into a single CNV report")
-@click.argument("data_paths", nargs=-1, type=click.Path(exists=True), required=False)
-@click.option("--output", type=click.Path(exists=False), required=True)
-def generate_cnv_report(data_paths: List[Path], output: Path) -> None:
-    """Generate a CNV report given a set of statistic files and a list of plots."""
-    pdf: PDF = get_pdf_instance()
-    statistics, plots = get_pdf_data(data_paths)
-    pdf: PDF = add_data_to_pdf(pdf=pdf, data_paths=statistics) if statistics else pdf
-    pdf: PDF = add_plots_to_pdf(pdf=pdf, plot_paths=plots) if plots else pdf
-    pdf.output(output)
-
-
-class PDF(FPDF):
-    """PDF generation subclass."""
-
-    def footer(self):
-        """Overwrite the predetermined method to perform a specific footer processing."""
-        self.set_y(-15)
-        self.set_font("helvetica", "I", 8)
-        self.cell(0, 10, f"Page {self.page_no()}/{{nb}}", 0, 0, "C")
-
-
-def get_pdf_instance() -> PDF:
-    """Return a PDF instance."""
-    pdf: PDF = PDF()
-    pdf.alias_nb_pages(alias="{nb}")
-    return pdf
-
-
-def get_pdf_data(data_paths: List[Path]) -> Tuple[List[Path], List[Path]]:
-    """Return"""
-    statistics = []
-    plots = []
-    for path in data_paths:
-        if Path(path).suffix == ".txt":
-            statistics.append(path)
-        if Path(path).suffix == ".png":
-            plots.append(path)
-    return statistics, plots
-
-
-def add_data_to_pdf(pdf: PDF, data_paths: List[Path]) -> PDF:
-    """Add statistics to a PDF instance."""
-    for data_path in data_paths:
-        with open(data_path) as data:
-            data = data.readlines()
-            pdf.add_page()
-            pdf.set_font("helvetica", "B", 15)
-            # Title layout & styling
-            title: str = os.path.basename(data_path).replace(".txt", "")
-            pdf.cell(25)
-            pdf.cell(140, 10, title, 1, 0, "C")
-            pdf.cell(35, 25, ln=1)  # Post title indentation
-            # Table layout & styling
-            pdf.set_font("Times", size=11)
-            line_height = pdf.font_size * 2.5
-            col_width = pdf.epw / 4  # Even distribution of the content
-            for row in data:
-                pdf.cell(45)
-                for statistic in row.split():
-                    pdf.multi_cell(
-                        col_width,
-                        line_height,
-                        statistic,
-                        align="C",
-                        border=1,
-                        ln=3,
-                        max_line_height=pdf.font_size,
-                    )
-                pdf.ln(line_height)
-
-    return pdf
-
-
-def add_plots_to_pdf(pdf: PDF, plot_paths: List[Path]) -> PDF:
-    """Add plots to a PDF instance."""
-    pdf.set_font("helvetica", "B", 15)
-    portrait_size = {"sunrise": (500, 500), "circular": (700, 700)}
-    portrait_position = {"sunrise": (10, 55), "circular": (-29, 55)}
-    for path in plot_paths:
-        title: str = os.path.basename(path).replace(".png", "")
-        plot_name = title.split(".")[-1]
-        # Image & page layout parameters
-        if plot_name in portrait_size.keys():
-            page_orientation = "portrait"
-            img_size = portrait_size[plot_name]
-            title_w_pos = 25
-            title_wh = 140, 10
-            img_xy = portrait_position[plot_name]
-        else:
-            page_orientation = "landscape"
-            img_size = (800, 800) if "scatter" != plot_name else (650, 650)
-            title_w_pos = 68.5
-            title_wh = 140, 10
-            img_xy = (5, 40) if "scatter" != plot_name else (25, 23)
-
-        pdf.add_page(orientation=page_orientation)
-
-        # Title position & styling
-        pdf.cell(title_w_pos)
-        pdf.cell(title_wh[0], title_wh[1], title, 1, 0, "C")
-
-        # Image position & resizing
-        img = Image.open(path)
-        img.thumbnail(img_size, Image.ANTIALIAS)
-        pdf.image(img, img_xy[0], img_xy[1])
-
-    return pdf
-
-
-if __name__ == "__main__":
-    generate_cnv_report()
diff --git a/BALSAMIC/assets/scripts/image_to_pdf.py b/BALSAMIC/assets/scripts/image_to_pdf.py
new file mode 100644
index 000000000..a6d03e965
--- /dev/null
+++ b/BALSAMIC/assets/scripts/image_to_pdf.py
@@ -0,0 +1,26 @@
+"""Script for converting images to PDF."""
+from pathlib import Path
+
+import click
+
+from BALSAMIC.utils.pdf_report import get_image_html, html_to_pdf
+
+
+@click.command()
+@click.argument(
+    "image_path",
+    nargs=1,
+    required=True,
+    type=click.Path(exists=True, resolve_path=True),
+)
+@click.argument("pdf_path", nargs=1, required=True, type=click.Path(resolve_path=True))
+def image_to_pdf(image_path: str, pdf_path: str) -> None:
+    """Convert image file to a PDF."""
+    html_page: str = get_image_html(
+        image_path=Path(image_path), image_name=Path(image_path).stem
+    )
+    html_to_pdf(html_string=html_page, pdf_path=pdf_path)
+
+
+if __name__ == "__main__":
+    image_to_pdf()
diff --git a/BALSAMIC/assets/scripts/merge_pdfs.py b/BALSAMIC/assets/scripts/merge_pdfs.py
new file mode 100644
index 000000000..eae9d3f0e
--- /dev/null
+++ b/BALSAMIC/assets/scripts/merge_pdfs.py
@@ -0,0 +1,28 @@
+"""Script to merge PDFs."""
+from typing import List
+
+import click
+from pypdf import PdfWriter
+
+
+@click.command()
+@click.argument(
+    "input_pdfs",
+    nargs=-1,
+    required=True,
+    type=click.Path(exists=True, resolve_path=True),
+)
+@click.argument(
+    "output_pdf", nargs=1, required=True, type=click.Path(resolve_path=True)
+)
+def merge_pdfs(input_pdfs: List[str], output_pdf: str) -> None:
+    """Merge PDFs into a single file."""
+    merger: PdfWriter = PdfWriter()
+    for pdf in input_pdfs:
+        merger.append(pdf)
+    merger.write(output_pdf)
+    merger.close()
+
+
+if __name__ == "__main__":
+    merge_pdfs()
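The merge step is a thin wrapper around pypdf. Equivalent library-level usage, with placeholder input names:

    from pypdf import PdfWriter

    writer = PdfWriter()
    for pdf in ("metrics.pdf", "plot.pdf"):
        writer.append(pdf)  # append all pages of each input, in order
    writer.write("report.pdf")
    writer.close()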
diff --git a/BALSAMIC/assets/scripts/preprocess_gens.py b/BALSAMIC/assets/scripts/preprocess_gens.py
new file mode 100755
index 000000000..0e146a3f1
--- /dev/null
+++ b/BALSAMIC/assets/scripts/preprocess_gens.py
@@ -0,0 +1,423 @@
+#!/usr/bin/env python
+import click
+import io
+import logging
+from pathlib import Path
+from typing import Dict, List, Tuple, Union, Optional
+from statistics import mean
+
+from BALSAMIC.constants.analysis import SequencingType
+from BALSAMIC.constants.tools import GENS_PARAMS
+from BALSAMIC.utils.io import read_vcf_file
+
+LOG = logging.getLogger(__name__)
+LOG.setLevel(logging.INFO)
+
+
+@click.group()
+@click.option(
+    "-o",
+    "--output-file",
+    required=True,
+    type=click.Path(exists=False),
+    help="Name of output-file.",
+)
+@click.option(
+    "-s",
+    "--sequencing-type",
+    required=True,
+    type=click.Choice([SequencingType.WGS]),
+    help="Sequencing type used.",
+)
+@click.pass_context
+def cli(ctx: click.Context, output_file: str, sequencing_type: SequencingType):
+    """GENS pre-processing tool."""
+    ctx.ensure_object(dict)
+    ctx.obj["output_file"] = output_file
+    ctx.obj["sequencing_type"] = sequencing_type
+
+
+@cli.command()
+@click.pass_context
+@click.option(
+    "-v",
+    "--vcf-file-path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Input VCF from germline-caller with SNVs & InDels from DNAscope, called with --given gnomad_af_0.05.vcf ",
+)
+def calculate_bafs(ctx: click.Context, vcf_file_path: str):
+    """
+    Processes vcf-file from DNAscope into a bed-file format for GENS, with different number of variants for each zoom-level.
+
+    Args:
+        vcf_file: From DNAscope created with --given argument using gnomad_af_min_0.05.vcf.
+
+    Outputs bed-file in file-name specified in output-file.
+    """
+
+    LOG.info("Calculating BAFs from VCF.")
+    LOG.info("Reading VCF file...")
+    vcf_lines: List = read_vcf_file(vcf_file_path)
+    LOG.info("Extracting and processing variant info...")
+    variants: Dict = get_valid_variants(vcf_lines)
+    LOG.info("Writing variant b-allele-frequencies to output...")
+    output_file: Path = Path(ctx.obj["output_file"])
+    sequencing_type: SequencingType = ctx.obj["sequencing_type"]
+    write_b_allele_output(
+        variants=variants, output_file=output_file, sequencing_type=sequencing_type
+    )
+
+
+def get_valid_variants(vcf_lines: List[str]) -> Dict:
+    """
+    Process VCF lines to extract valid variants.
+
+    Args:
+        vcf_lines (List[str]): List of VCF lines to be processed.
+
+    Returns:
+        dict: A dictionary containing valid variants with variant IDs as keys and variant info as values.
+
+    This function takes a list of VCF lines, processes them, and extracts valid variants.
+    Variants are stored in a dictionary with variant IDs as keys and variant information as values.
+    If any invalid variants are encountered, appropriate warnings are printed.
+
+    Example usage:
+        vcf_lines: List = [...]  # List of VCF lines
+        valid_variants: Dict = get_valid_variants(vcf_lines)
+    """
+    count_invalid_vars: int = 0
+    illegal_chromosomes: dict = {}
+    variants: dict = {}
+    variant_id: int = 0
+    for variant_line in vcf_lines:
+        if variant_line.startswith("#"):
+            continue
+
+        variant: Dict[str, Union[str, float]] = extract_variant_info(variant_line)
+        if not variant:
+            count_invalid_vars += 1
+            continue
+
+        v_chrom = variant["chr"]
+        if v_chrom not in GENS_PARAMS["ALLOWED_CHR_LIST"]:
+            illegal_chromosomes[v_chrom] = illegal_chromosomes.get(v_chrom, 0) + 1
+            continue
+
+        variants[variant_id] = variant
+        variant_id += 1
+
+    if count_invalid_vars:
+        LOG.warning(
+            f"Warning: Can't calc AF for a number of variants: {count_invalid_vars}."
+        )
+    if illegal_chromosomes:
+        LOG.warning(
+            f"Warning: A number of variants have illegal chromosomes and will be skipped: {illegal_chromosomes}."
+        )
+    return variants
+
+
+def extract_variant_info(variant: str) -> Optional[Dict[str, Union[str, float]]]:
+    """
+    Extracts genetic variant information.
+
+    Args:
+        variant (str): Tab-separated string representing a genetic variant.
+
+    Returns:
+        dict or None: Dictionary with variant details ('chr', 'start', 'ref', 'alt', 'sample', 'af').
+        Returns None for uninformative samples or division by zero.
+
+    Raises:
+        ValueError: If variant string lacks expected fields.
+
+    Example:
+        variant_string = "chr1\t1000\t.\tA\tT\t.\t.\t.\tGT:AD:DP:GQ:PL\t0/1:10,5:15:45:45,0,20"
+        extract_variant_info(variant_string)
+        {'chr': 'chr1', 'start': '1000', 'ref': 'A', 'alt': 'T', 'sample': '0/1:10,5:15:45:45,0,20', 'af': 0.333333}
+    """
+    fields: List[str] = variant.split("\t")
+    variant_info = {
+        "chr": fields[0],
+        "start": fields[1],
+        "ref": fields[3],
+        "alt": fields[4],
+        "sample": fields[9],
+    }
+
+    if variant_info["sample"].split(":")[0] == "./.":
+        return None
+
+    try:
+        allele_depths: List[str] = variant_info["sample"].split(":")[1]
+        VD = int(allele_depths.split(",")[1])
+        DP = int(variant_info["sample"].split(":")[2])
+        variant_info["af"] = round(VD / DP, 6)
+    except (ValueError, ZeroDivisionError, IndexError):
+        return None
+
+    return variant_info
+
+
+def write_b_allele_output(
+    variants: Dict, output_file: Path, sequencing_type: SequencingType
+) -> None:
+    """
+    Writes B-allele frequency (BAF) output to a file for each level of GENS zoom specified in prefix of BAF_SKIP_N.
+
+    Args:
+        variants (Dict): A dictionary containing variant information. Each variant should have the following keys:
+            - 'start' (int): The start position of the variant.
+            - 'chr' (str): The chromosome where the variant is located.
+            - 'af' (float): The allele frequency of the variant.
+
+        output_file (Path): The file path where the BAF output will be written.
+
+        sequencing_type (SequencingType): The sequencing type used in analysis.
+
+    Returns:
+        None
+
+    Writes BAF information for each variant in the provided `variants` dictionary to the specified `output_file`.
+    The output is formatted as follows:
+        _
+
+    Note:
+        This function uses a predefined constant BAF_SKIP_N, which determines how many variants to skip before writing.
+    """
+    BAF_SKIP_N: Dict[str, int] = GENS_PARAMS["SEQUENCING_TYPE"][sequencing_type][
+        "BAF_SKIP_N"
+    ]
+
+    with open(output_file.as_posix(), "w") as baf_out:
+        for prefix, req_skip_count in BAF_SKIP_N.items():
+            skip_count: int = req_skip_count
+            for variant_id in variants:
+                variant: Dict = variants[variant_id]
+                if skip_count == req_skip_count:
+                    v_start: int = int(variant["start"])
+                    v_chrom: str = variant["chr"]
+                    v_af: float = variant["af"]
+                    baf_out.write(
+                        f"{prefix}_{v_chrom}\t{v_start}\t{v_start + 1}\t{v_af}\n"
+                    )
+                    skip_count = 0
+                else:
+                    skip_count += 1
+
+
+@cli.command()
+@click.pass_context
+@click.option(
+    "-c",
+    "--normalised-coverage-path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Input normalised coverage from GATK DenoiseReadCounts.",
+)
+def create_coverage_regions(ctx: click.Context, normalised_coverage_path: str) -> None:
+    """
+    Calculate coverage data.
+
+    Args:
+        normalised_coverage_path: Path to normalised coverage file.
+
+    Returns:
+        None
+    """
+    LOG.info("Creating coverage regions for GENS.")
+    normalised_coverage_path = Path(normalised_coverage_path)
+    output_file: Path = Path(ctx.obj["output_file"])
+    sequencing_type: SequencingType = ctx.obj["sequencing_type"]
+    COV_REGION_SIZES: Dict[str, int] = GENS_PARAMS["SEQUENCING_TYPE"][sequencing_type][
+        "COV_REGION_SIZES"
+    ]
+    with open(output_file.as_posix(), "w") as cov_out:
+        for prefix, region_size_requested in COV_REGION_SIZES.items():
+            LOG.info(
+                f"Creating regions for prefix: {prefix}, region_size_requested: {region_size_requested}."
+            )
+            generate_cov_bed(
+                normalised_coverage_path=normalised_coverage_path,
+                region_size_requested=region_size_requested,
+                prefix=prefix,
+                cov_out=cov_out,
+            )
+
+
+def write_coverage_region(
+    prefix: str,
+    region_chrom: str,
+    region_start: int,
+    region_end: int,
+    reg_ratios: List[float],
+    cov_out: io.TextIOWrapper,
+) -> None:
+    """
+    Write coverage region information to an output file.
+
+    Args:
+        prefix (str): Prefix for the output.
+        region_chrom (str): Chromosome for the region.
+        region_start (int): Start position of the region.
+        region_end (int): End position of the region.
+        reg_ratios (List[float]): List of log2 ratios.
+        cov_out (io.TextIOWrapper): Output file.
+
+    Returns:
+        None
+    """
+    mid_point = region_start + (region_end - region_start) // 2
+    cov_out.write(
+        f"{prefix}_{region_chrom}\t{mid_point - 1}\t{mid_point}\t{mean(reg_ratios)}\n"
+    )
+
+
+def extract_coverage_line_values(coverage_line: str) -> Tuple[str, int, int, float]:
+    """
+    Extract coverage region values from a coverage line.
+
+    Args:
+        coverage_line (str): A line containing coverage and genomic position information.
+
+    Returns:
+        Tuple[str, int, int, float]: Extracted values (chr, start, end, log2_ratio).
+    """
+    # Extract coverage region values
+    chr_start_stop_ratio: List = coverage_line.strip().split("\t")
+    chrom: str = chr_start_stop_ratio[0]
+    start, end = int(chr_start_stop_ratio[1]), int(chr_start_stop_ratio[2])
+    log2_ratio: float = float(chr_start_stop_ratio[3])
+    return chrom, start, end, log2_ratio
+
+
+def generate_cov_bed(
+    normalised_coverage_path: Path,
+    region_size_requested: int,
+    prefix: str,
+    cov_out: io.TextIOWrapper,
+) -> None:
+    """
+    Merge coverage data into coverage regions for GENS.
+
+    Args:
+        normalised_coverage_path: Path to normalised coverage file.
+        region_size_requested: Size of the coverage region.
+        prefix: Prefix for the output.
+        cov_out: Output file.
+
+    Returns:
+        None
+    """
+
+    normalised_coverage: List[str] = normalised_coverage_path.read_text().splitlines()
+    minimum_region_size: int = GENS_PARAMS["MINIMUM_REGION_SIZE"]
+
+    region_chrom, chrom, region_start, region_end, end, log2_ratio = [None] * 6
+    first_cov_line: bool = True
+    start_new_region: bool = False
+    for coverage_line in normalised_coverage:
+        if coverage_line.startswith("@") or coverage_line.startswith("CONTIG"):
+            continue
+
+        if first_cov_line or start_new_region:
+            (
+                region_chrom,
+                region_start,
+                region_end,
+                log2_ratio,
+            ) = extract_coverage_line_values(coverage_line)
+            reg_ratios: List = [log2_ratio]
+            first_cov_line: bool = False
+            start_new_region: bool = False
+            if region_size_requested == minimum_region_size:
+                write_coverage_region(
+                    prefix=prefix,
+                    region_chrom=region_chrom,
+                    region_start=region_start,
+                    region_end=region_end,
+                    reg_ratios=reg_ratios,
+                    cov_out=cov_out,
+                )
+                start_new_region: bool = True
+            continue
+        else:
+            chrom, _, end, log2_ratio = extract_coverage_line_values(coverage_line)
+
+        region_size: int = end - region_start + 1
+        if region_size == region_size_requested:
+            # Region size matches requested region size
+            # Step 1: Write region from current line
+            # Step 2: Start new region from new line
+            reg_ratios.append(log2_ratio)
+            write_coverage_region(
+                prefix=prefix,
+                region_chrom=region_chrom,
+                region_start=region_start,
+                region_end=end,
+                reg_ratios=reg_ratios,
+                cov_out=cov_out,
+            )
+            start_new_region: bool = True
+            continue
+
+        if region_size > region_size_requested:
+            # Region size larger due to incomplete genome reference
+            # Step 1: Write region from previous line
+            # Step 2: Start new region from current line
+            # Conceptual example: window_size = 300:
+            # start1 -- 100bases -- end1 + (region_start = start1, region_end = end1)
+            # start2 -- 100bases -- end2 + (region_start = start1, region_end = end2)
+            # -- gap 500 bases --
+            # start3 -- 100bases -- end3
+            write_coverage_region(
+                prefix=prefix,
+                region_chrom=region_chrom,
+                region_start=region_start,
+                region_end=region_end,
+                reg_ratios=reg_ratios,
+                cov_out=cov_out,
+            )
+            start_new_region: bool = True
+
+        if chrom != region_chrom:
+            # New chromosome:
+            # Step 1: Write region from previous line
+            # Step 2: Start new region from current line
+            write_coverage_region(
+                prefix=prefix,
+                region_chrom=region_chrom,
+                region_start=region_start,
+                region_end=region_end,
+                reg_ratios=reg_ratios,
+                cov_out=cov_out,
+            )
+            start_new_region: bool = True
+
+        if start_new_region:
+            (
+                region_chrom,
+                region_start,
+                region_end,
+                log2_ratio,
+            ) = extract_coverage_line_values(coverage_line)
+            reg_ratios: List = [log2_ratio]
+        else:
+            region_end: int = end
+            reg_ratios.append(log2_ratio)
+
+    # Output last line:
+    write_coverage_region(
+        prefix=prefix,
+        region_chrom=region_chrom,
+        region_start=region_start,
+        region_end=region_end,
+        reg_ratios=reg_ratios,
+        cov_out=cov_out,
+    )
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/BALSAMIC/assets/vagrant/Vagrantfile b/BALSAMIC/assets/vagrant/Vagrantfile
deleted file mode 100644
index 622f0a791..000000000
--- a/BALSAMIC/assets/vagrant/Vagrantfile
+++ /dev/null
@@ -1,85 +0,0 @@
-# -*- mode: ruby -*-
-# vi: set ft=ruby :
-
-####################################
-# CentOS 7 Build
-#
-# Downloads and installs the latest
-# CentOS 6 build posted to S3 from
-# Jenkins in a CentOS 7 VM.
-#
-# IP: 10.0.0.68
-# Admin dash username: admin
-# Admin dash password: password
-#
-# NOTE that this image uses a custom box. It's based off of the
-# minimal CentOS 7 box on vagrantbox.es, but we bumped to guest
-# additions to match 4.3.20. On my Ubuntu 14.04 machine, I needed
-# 4.3.20 Virtual box on the host machine and this custom image
-# on the guest machine to get shared folders to work.
-#
-# mostly copied from official shiny_server git repo
-#
-####################################
-
-
-# Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
-VAGRANTFILE_API_VERSION = "2"
-
-Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
-  # All Vagrant configuration is done here. The most common configuration
-  # options are documented and commented below. For a complete reference,
-  # please see the online documentation at vagrantup.com.
-
-  # Every Vagrant virtual environment requires a box to build off of.
-  config.vm.box = "centos7-20"
-
-  # The url from where the 'config.vm.box' box will be fetched if it
-  # doesn't already exist on the user's system.
-  config.vm.box_url = "https://s3-us-west-2.amazonaws.com/rstudio-vagrant-boxes/boxes/centos7.box"
-
-  config.vm.host_name = "sso-centos7-latest"
-
-  config.vm.provision "shell", path: "setup.sh"
-
-  # Create a forwarded port mapping which allows access to a specific port
-  # within the machine from a port on the host machine. In the example below,
-  # accessing "localhost:8080" will access port 80 on the guest machine.
-  # config.vm.network :forwarded_port, guest: 80, host: 8080
-
-  # Create a private network, which allows host-only access to the machine
-  # using a specific IP.
-  #config.vm.network :private_network, ip: "10.0.0.62"
-  config.vm.network :forwarded_port, guest: 4001, host: 3838
-
-  # Create a public network, which generally matched to bridged network.
-  # Bridged networks make the machine appear as another physical device on
-  # your network.
-  # config.vm.network :public_network
-
-  # If true, then any SSH connections made will enable agent forwarding.
-  # Default value: false
-  # config.ssh.forward_agent = true
-
-  # Share an additional folder to the guest VM. The first argument is
-  # the path on the host to the actual folder. The second argument is
-  # the path on the guest to mount the folder. And the optional third
-  # argument is a set of non-required options.
-  # config.vm.synced_folder ".", "/vagrant_data"
-
-  # Provider-specific configuration so you can fine-tune various
-  # backing providers for Vagrant. These expose provider-specific options.
-  # Example for VirtualBox:
-  #
-  config.vm.provider :virtualbox do |vb|
-    # Don't boot with headless mode
-    # vb.gui = true
-
-    # Use VBoxManage to customize the VM. For example to change memory:
-    vb.customize ["modifyvm", :id, "--memory", "1024"]
-  end
-  #
-  # View the documentation for the provider you're using for more
-  # information on available options.
- -end diff --git a/BALSAMIC/assets/vagrant/setup.sh b/BALSAMIC/assets/vagrant/setup.sh deleted file mode 100644 index 9eebdd721..000000000 --- a/BALSAMIC/assets/vagrant/setup.sh +++ /dev/null @@ -1,40 +0,0 @@ -#setup file mostly copied from official shiny_server git repo -yum clean all -y --enablerepo=* -yum install -y epel-release -yum update -y --disablerepo=epel - -# Enable EPEL -rpm -Uvh https://dl.fedoraproject.org/pub/epel/7/x86_64/Packages/e/epel-release-7-11.noarch.rpm - -# On this minimal install, we need wget -yum install -y coreutils -yum install -y yum-utils -yum install -y wget -yum install -y which -yum install -y bzip2 -yum install -y git -yum install -y gcc -yum install -y fontconfig -yum install -y libcurl libcurl-devel -yum install -y openssl-devel - -# Install R -yum install R -y - -wget https://s3.amazonaws.com/rstudio-shiny-server-os-build/centos-6.3/x86_64/VERSION -O "version.txt" -VERSION=`cat version.txt` - -# Install the latest SS build -wget "https://s3.amazonaws.com/rstudio-shiny-server-os-build/centos-6.3/x86_64/shiny-server-$VERSION-rh6-x86_64.rpm" -O ss-latest.rpm -yum install --nogpgcheck ss-latest.rpm -y - -sudo su - \ - -c "R -e \"install.packages(c('shiny', 'httpuv', 'rmarkdown', 'devtools', 'RJDBC'), repos='http://cran.rstudio.com/')\"" - -sudo R -e 'devtools::install_github("tidyverse/ggplot2")' - -systemctl disable firewalld -systemctl stop firewalld -sed -i 's/enforcing/disabled/g' /etc/selinux/config -systemctl enable shiny-server -systemctl start shiny-server diff --git a/BALSAMIC/assets/vcfanno/vcfanno.toml b/BALSAMIC/assets/vcfanno/vcfanno.toml deleted file mode 100644 index c824c659d..000000000 --- a/BALSAMIC/assets/vcfanno/vcfanno.toml +++ /dev/null @@ -1,11 +0,0 @@ -[[annotation]] -file="gnomad.genomes.r2.1.1.sites.vcf.bgz" -fields = ["AF", "AF_popmax"] -ops=["self", "self"] -names=["GNOMADAF", "GNOMADAF_popmax"] - -[[annotation]] -file="clinvar.vcf.gz" -fields=["CLNACC", "CLNREVSTAT", "CLNSIG", "ORIGIN", "CLNVC", "CLNVCSO"] -ops=["self", "self","self","self","self","self"] -names=["CLNACC", "CLNREVSTAT", "CLNSIG", "ORIGIN", "CLNVC", "CLNVCSO"] diff --git a/BALSAMIC/commands/base.py b/BALSAMIC/commands/base.py index 8fd433fc9..0c9044509 100755 --- a/BALSAMIC/commands/base.py +++ b/BALSAMIC/commands/base.py @@ -1,45 +1,29 @@ -""" -Entry cli for balsamic -""" +"""Balsamic CLI.""" import logging + import click import coloredlogs -# Subcommands -from BALSAMIC.commands.run.base import run as run_command +from BALSAMIC import __version__ as balsamic_version +from BALSAMIC.commands.config.base import config as config_command from BALSAMIC.commands.init.base import initialize as init_command +from BALSAMIC.commands.options import OPTION_LOG_LEVEL from BALSAMIC.commands.report.base import report as report_command -from BALSAMIC.commands.config.base import config as config_command -from BALSAMIC.commands.plugins.base import plugins as plugins_command - -# CLI commands and decorators +from BALSAMIC.commands.run.base import run as run_command +from BALSAMIC.constants.constants import LogLevel from BALSAMIC.utils.cli import add_doc as doc -# Get version -from BALSAMIC import __version__ as balsamic_version - LOG = logging.getLogger(__name__) -LOG_LEVELS = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] @click.group() -@click.option( - "--loglevel", - default="INFO", - type=click.Choice(LOG_LEVELS), - help="Set the level of log output.", - show_default=True, -) +@OPTION_LOG_LEVEL @click.version_option(version=balsamic_version) @click.pass_context @doc( - 
"""BALSAMIC {version}: Bioinformatic Analysis pipeLine for - SomAtic MutatIons in Cancer""".format( - version=balsamic_version - ) + f"Balsamic {balsamic_version}: Bioinformatic Analysis Pipeline for Somatic Mutations in Cancer" ) -def cli(context, loglevel): - "BALSAMIC" +def cli(context: click.Context, log_level: LogLevel): coloredlogs.DEFAULT_FIELD_STYLES = { "asctime": {"color": "green"}, "hostname": {"color": "magenta"}, @@ -48,18 +32,14 @@ def cli(context, loglevel): "name": {"color": "blue"}, } coloredlogs.install( - level=loglevel, + level=log_level, fmt="%(programname)s %(hostname)s %(asctime)s %(name)s pid:%(process)d [%(levelname)s] %(message)s", ) - LOG.info("Running BALSAMIC version %s", balsamic_version) - - context.obj = {} - context.obj["loglevel"] = loglevel - # LOG.info(f"BALSAMIC started with log level {loglevel}.") + LOG.info(f"Running BALSAMIC version {balsamic_version}") + context.obj = {"log_level": log_level} cli.add_command(run_command) cli.add_command(report_command) cli.add_command(config_command) -cli.add_command(plugins_command) cli.add_command(init_command) diff --git a/BALSAMIC/commands/config/base.py b/BALSAMIC/commands/config/base.py index 7744c765a..c357afcd3 100644 --- a/BALSAMIC/commands/config/base.py +++ b/BALSAMIC/commands/config/base.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +"""Balsamic configuration file generation base commands.""" import click from BALSAMIC.commands.config.case import case_config as case_command @@ -7,8 +7,8 @@ @click.group() @click.pass_context -def config(context): - "create config files required for running the pipeline." +def config(context: click.Context): + """Create config files required for running the pipeline.""" pass diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py index 57a6786d1..03e934b6d 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -1,256 +1,165 @@ -import os -import json +"""Balsamic config case CLI.""" import logging +from datetime import datetime from pathlib import Path +from typing import Dict import click from BALSAMIC import __version__ as balsamic_version +from BALSAMIC.commands.options import ( + OPTION_ADAPTER_TRIM, + OPTION_ANALYSIS_DIR, + OPTION_ANALYSIS_WORKFLOW, + OPTION_BACKGROUND_VARIANTS, + OPTION_BALSAMIC_CACHE, + OPTION_CACHE_VERSION, + OPTION_CADD_ANNOTATIONS, + OPTION_CANCER_GERMLINE_SNV_OBSERVATIONS, + OPTION_CANCER_SOMATIC_SNV_OBSERVATIONS, + OPTION_CANCER_SOMATIC_SV_OBSERVATIONS, + OPTION_CASE_ID, + OPTION_CLINICAL_SNV_OBSERVATIONS, + OPTION_CLINICAL_SV_OBSERVATIONS, + OPTION_FASTQ_PATH, + OPTION_GENDER, + OPTION_GENOME_INTERVAL, + OPTION_GENOME_VERSION, + OPTION_GENS_COV_PON, + OPTION_GNOMAD_AF5, + OPTION_NORMAL_SAMPLE_NAME, + OPTION_PANEL_BED, + OPTION_PON_CNN, + OPTION_QUALITY_TRIM, + OPTION_SWEGEN_SNV, + OPTION_SWEGEN_SV, + OPTION_TUMOR_SAMPLE_NAME, + OPTION_UMI, + OPTION_UMI_TRIM_LENGTH, +) +from BALSAMIC.constants.analysis import BIOINFO_TOOL_ENV, AnalysisWorkflow, Gender +from BALSAMIC.constants.cache import GenomeVersion +from BALSAMIC.constants.constants import FileType +from BALSAMIC.constants.paths import CONTAINERS_DIR +from BALSAMIC.constants.workflow_params import VCF_DICT +from BALSAMIC.models.config import ConfigModel from BALSAMIC.utils.cli import ( - get_sample_dict, - get_panel_chrom, - get_bioinfo_tools_version, - create_fastq_symlink, generate_graph, + get_analysis_fastq_files_directory, + get_bioinfo_tools_version, + get_panel_chrom, + get_sample_list, ) -from BALSAMIC.constants.common import ( - 
CONTAINERS_CONDA_ENV_PATH, - BIOINFO_TOOL_ENV, - GENDER_OPTIONS, -) -from BALSAMIC.constants.workflow_params import VCF_DICT -from BALSAMIC.utils.models import BalsamicConfigModel +from BALSAMIC.utils.io import read_json, write_json +from BALSAMIC.utils.utils import get_absolute_paths_dict LOG = logging.getLogger(__name__) @click.command("case", short_help="Create a sample config file from input sample data") -@click.option( - "--case-id", - required=True, - help="Sample id that is used for reporting, \ - naming the analysis jobs, and analysis path", -) -@click.option( - "--gender", - required=False, - default="female", - show_default=True, - type=click.Choice(GENDER_OPTIONS), - help="Case associated gender", -) -@click.option( - "--umi/--no-umi", - default=True, - show_default=True, - is_flag=True, - help=( - "UMI processing steps for samples with UMI tags. For WGS cases, UMI is always disabled." - ), -) -@click.option( - "--umi-trim-length", - default=5, - show_default=True, - type=int, - help="Trim N bases from reads in fastq", -) -@click.option( - "--quality-trim/--no-quality-trim", - default=True, - show_default=True, - is_flag=True, - help="Trim low quality reads in fastq", -) -@click.option( - "--adapter-trim/--no-adapter-trim", - default=True, - show_default=True, - is_flag=True, - help="Trim adapters from reads in fastq", -) -@click.option( - "-p", - "--panel-bed", - type=click.Path(exists=True, resolve_path=True), - required=False, - help="Panel bed file for variant calling.", -) -@click.option( - "-b", - "--background-variants", - type=click.Path(exists=True, resolve_path=True), - required=False, - help="Background set of valid variants for UMI", -) -@click.option( - "--pon-cnn", - type=click.Path(exists=True, resolve_path=True), - required=False, - help="Panel of normal reference (.cnn) for cnvkit", -) -@click.option( - "--balsamic-cache", - type=click.Path(exists=True, resolve_path=True), - required=True, - help="Path to BALSAMIC cache", -) -@click.option( - "--container-version", - show_default=True, - default=balsamic_version, - type=click.Choice(["develop", "master", balsamic_version]), - help="Container for BALSAMIC version to download", -) -@click.option( - "--analysis-dir", - type=click.Path(exists=True, resolve_path=True), - required=True, - help="Root analysis path to store analysis logs and results. 
\ - The final path will be analysis-dir/sample-id", -) -@click.option( - "-t", - "--tumor", - type=click.Path(exists=True, resolve_path=True), - required=True, - multiple=True, - help="Fastq files for tumor sample.", -) -@click.option( - "-n", - "--normal", - type=click.Path(exists=True, resolve_path=True), - required=False, - multiple=True, - help="Fastq files for normal sample.", -) -@click.option("--tumor-sample-name", help="Tumor sample name") -@click.option("--normal-sample-name", help="Normal sample name") -@click.option( - "--clinical-snv-observations", - type=click.Path(exists=True, resolve_path=True), - required=False, - help="VCF path of clinical SNV observations (WGS analysis workflow)", -) -@click.option( - "--clinical-sv-observations", - type=click.Path(exists=True, resolve_path=True), - required=False, - help="VCF path of clinical SV observations (WGS analysis workflow)", -) -@click.option( - "--cancer-all-snv-observations", - type=click.Path(exists=True, resolve_path=True), - required=False, - help="VCF path of cancer SNV normal observations (WGS analysis workflow)", -) -@click.option( - "--cancer-somatic-snv-observations", - type=click.Path(exists=True, resolve_path=True), - required=False, - help="VCF path of cancer SNV tumor observations (WGS analysis workflow)", -) -@click.option( - "--cancer-somatic-sv-observations", - type=click.Path(exists=True, resolve_path=True), - required=False, - help="VCF path of cancer SV observations (WGS analysis workflow)", -) -@click.option( - "--swegen-snv", - type=click.Path(exists=True, resolve_path=True), - required=False, - help="VCF path of Swegen SNV frequency database (WGS analysis workflow)", -) -@click.option( - "--swegen-sv", - type=click.Path(exists=True, resolve_path=True), - required=False, - help="VCF path of Swegen SV frequency database (WGS analysis workflow)", -) -@click.option( - "-g", - "--genome-version", - default="hg19", - type=click.Choice(["hg19", "hg38", "canfam3"]), - help=( - "Genome version to prepare reference. Path to genome will be /genome_version" - ), -) -@click.option( - "-w", - "--analysis-workflow", - default="balsamic", - show_default=True, - type=click.Choice(["balsamic", "balsamic-umi", "balsamic-qc"]), - help=( - 'Analysis workflow to run. By default: "balsamic" only ' - "workflow will be running. 
If you want to run both " - "balsamic and UMI workflow together for panel data; " - 'choose "balsamic-umi" option ' - ), -) +@OPTION_ADAPTER_TRIM +@OPTION_ANALYSIS_DIR +@OPTION_ANALYSIS_WORKFLOW +@OPTION_BACKGROUND_VARIANTS +@OPTION_BALSAMIC_CACHE +@OPTION_CACHE_VERSION +@OPTION_CADD_ANNOTATIONS +@OPTION_CANCER_GERMLINE_SNV_OBSERVATIONS +@OPTION_CANCER_SOMATIC_SNV_OBSERVATIONS +@OPTION_CANCER_SOMATIC_SV_OBSERVATIONS +@OPTION_CASE_ID +@OPTION_CLINICAL_SNV_OBSERVATIONS +@OPTION_CLINICAL_SV_OBSERVATIONS +@OPTION_FASTQ_PATH +@OPTION_GENDER +@OPTION_GENOME_VERSION +@OPTION_GENOME_INTERVAL +@OPTION_GENS_COV_PON +@OPTION_GNOMAD_AF5 +@OPTION_NORMAL_SAMPLE_NAME +@OPTION_PANEL_BED +@OPTION_PON_CNN +@OPTION_QUALITY_TRIM +@OPTION_SWEGEN_SNV +@OPTION_SWEGEN_SV +@OPTION_TUMOR_SAMPLE_NAME +@OPTION_UMI +@OPTION_UMI_TRIM_LENGTH @click.pass_context def case_config( - context, - case_id, - gender, - umi, - umi_trim_length, - adapter_trim, - quality_trim, - panel_bed, - background_variants, - pon_cnn, - analysis_dir, - tumor, - normal, - tumor_sample_name, - normal_sample_name, - clinical_snv_observations, - clinical_sv_observations, - cancer_all_snv_observations, - cancer_somatic_snv_observations, - cancer_somatic_sv_observations, - swegen_snv, - swegen_sv, - genome_version, - balsamic_cache, - container_version, - analysis_workflow, + context: click.Context, + adapter_trim: bool, + analysis_dir: Path, + analysis_workflow: AnalysisWorkflow, + background_variants: Path, + balsamic_cache: Path, + cache_version: str, + cadd_annotations: Path, + cancer_germline_snv_observations: Path, + cancer_somatic_snv_observations: Path, + cancer_somatic_sv_observations: Path, + case_id: str, + clinical_snv_observations: Path, + clinical_sv_observations: Path, + fastq_path: Path, + gender: Gender, + genome_version: GenomeVersion, + genome_interval: Path, + gens_coverage_pon: Path, + gnomad_min_af5: Path, + normal_sample_name: str, + panel_bed: Path, + pon_cnn: Path, + quality_trim: bool, + swegen_snv: Path, + swegen_sv: Path, + tumor_sample_name: str, + umi: bool, + umi_trim_length: int, ): + references_path: Path = Path(balsamic_cache, cache_version, genome_version) + references: Dict[str, Path] = get_absolute_paths_dict( + base_path=references_path, + data=read_json(Path(references_path, f"reference.{FileType.JSON}").as_posix()), + ) + cadd_annotations_path = {"cadd_annotations": cadd_annotations} + if cadd_annotations: + references.update(cadd_annotations_path) - try: - samples = get_sample_dict( - tumor=tumor, - normal=normal, - tumor_sample_name=tumor_sample_name, - normal_sample_name=normal_sample_name, - ) - except AttributeError: - LOG.error(f"File name is invalid, use convention [SAMPLE_ID]_R_[1,2].fastq.gz") - raise click.Abort() + if any([genome_interval, gens_coverage_pon, gnomad_min_af5]): + if panel_bed: + raise click.BadParameter( + "GENS is currently not compatible with TGA analysis, only WGS." + ) + if not all([genome_interval, gens_coverage_pon, gnomad_min_af5]): + raise click.BadParameter( + "All three arguments (genome_interval, gens_coverage_pon, gnomad_min_af5) are required for GENS."
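The reference wiring above leans on get_absolute_paths_dict to anchor the relative entries of reference.json onto the versioned cache directory. The helper lives in BALSAMIC.utils.utils; as a rough sketch of the behaviour its call site implies (an assumption about the shape, not the actual source):

from pathlib import Path
from typing import Dict

def get_absolute_paths_dict(base_path: Path, data: Dict[str, str]) -> Dict[str, Path]:
    # Join every relative reference entry onto the cache base directory.
    return {key: Path(base_path, value) for key, value in data.items()}

# Example: {"reference_genome": "genome/hg19.fa"} resolved against
# /balsamic_cache/13.0.0/hg19 becomes /balsamic_cache/13.0.0/hg19/genome/hg19.fa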
+ ) - if container_version: - balsamic_version = container_version + gens_ref_files = { + "genome_interval": genome_interval, + "gens_coverage_pon": gens_coverage_pon, + "gnomad_min_af5": gnomad_min_af5, + } - reference_config = os.path.join( - balsamic_cache, balsamic_version, genome_version, "reference.json" - ) - with open(reference_config, "r") as f: - reference_dict = json.load(f)["reference"] + references.update( + { + gens_file: path + for gens_file, path in gens_ref_files.items() + if path is not None + } + ) variants_observations = { "clinical_snv_observations": clinical_snv_observations, "clinical_sv_observations": clinical_sv_observations, - "cancer_all_snv_observations": cancer_all_snv_observations, + "cancer_germline_snv_observations": cancer_germline_snv_observations, "cancer_somatic_snv_observations": cancer_somatic_snv_observations, "cancer_somatic_sv_observations": cancer_somatic_sv_observations, "swegen_snv_frequency": swegen_snv, "swegen_sv_frequency": swegen_sv, } - reference_dict.update( + references.update( { observations: path for observations, path in variants_observations.items() @@ -258,7 +167,20 @@ def case_config( } ) - config_collection_dict = BalsamicConfigModel( + analysis_fastq_dir: str = get_analysis_fastq_files_directory( + case_dir=Path(analysis_dir, case_id).as_posix(), fastq_path=fastq_path + ) + result_dir: Path = Path(analysis_dir, case_id, "analysis") + log_dir: Path = Path(analysis_dir, case_id, "logs") + script_dir: Path = Path(analysis_dir, case_id, "scripts") + benchmark_dir: Path = Path(analysis_dir, case_id, "benchmarks") + dag_path: Path = Path( + analysis_dir, case_id, f"{case_id}_BALSAMIC_{balsamic_version}_graph.pdf" + ) + for directory in [result_dir, log_dir, script_dir, benchmark_dir]: + directory.mkdir(exist_ok=True) + + config_collection_dict = ConfigModel( QC={ "quality_trim": quality_trim, "adapter_trim": adapter_trim, @@ -269,19 +191,32 @@ def case_config( "case_id": case_id, "gender": gender, "analysis_dir": analysis_dir, - "analysis_type": "paired" if normal else "single", + "fastq_path": analysis_fastq_dir, + "analysis_type": "paired" if normal_sample_name else "single", + "log": log_dir.as_posix(), + "script": script_dir.as_posix(), + "result": result_dir.as_posix(), + "benchmark": benchmark_dir.as_posix(), + "dag": dag_path.as_posix(), "sequencing_type": "targeted" if panel_bed else "wgs", "analysis_workflow": analysis_workflow, + "config_creation_date": datetime.now().strftime("%Y-%m-%d %H:%M"), + }, + reference=references, + singularity={ + "image": Path(balsamic_cache, cache_version, "containers").as_posix() }, - reference=reference_dict, - singularity=os.path.join(balsamic_cache, balsamic_version, "containers"), background_variants=background_variants, - samples=samples, + samples=get_sample_list( + tumor_sample_name=tumor_sample_name, + normal_sample_name=normal_sample_name, + fastq_path=analysis_fastq_dir, + ), vcf=VCF_DICT, bioinfo_tools=BIOINFO_TOOL_ENV, bioinfo_tools_version=get_bioinfo_tools_version( bioinfo_tools=BIOINFO_TOOL_ENV, - container_conda_env_path=CONTAINERS_CONDA_ENV_PATH, + container_conda_env_path=CONTAINERS_DIR, ), panel={ "capture_kit": panel_bed, @@ -290,32 +225,12 @@ def case_config( } if panel_bed else None, - ).dict(by_alias=True, exclude_none=True) - LOG.info("Config file generated successfully") - - Path.mkdir( - Path(config_collection_dict["analysis"]["fastq_path"]), - parents=True, - exist_ok=True, - ) - LOG.info("Directories created successfully") + ).model_dump(by_alias=True, 
exclude_none=True) + LOG.info("Balsamic config model instantiated successfully") - create_fastq_symlink( - casefiles=(tumor + normal), - symlink_dir=Path(config_collection_dict["analysis"]["fastq_path"]), - ) - LOG.info(f"Symlinks generated successfully") - - config_path = Path(analysis_dir) / case_id / (case_id + ".json") - with open(config_path, "w+") as fh: - fh.write(json.dumps(config_collection_dict, indent=4)) + config_path = Path(analysis_dir, case_id, case_id + ".json").as_posix() + write_json(json_obj=config_collection_dict, path=config_path) LOG.info(f"Config file saved successfully - {config_path}") - try: - generate_graph(config_collection_dict, config_path) - LOG.info(f"BALSAMIC Workflow has been configured successfully!") - except ValueError as e: - LOG.error( - f'BALSAMIC dag graph generation failed - {config_collection_dict["analysis"]["dag"]}', - ) - raise click.Abort() + generate_graph(config_collection_dict, config_path) + LOG.info(f"BALSAMIC Workflow has been configured successfully!") diff --git a/BALSAMIC/commands/config/pon.py b/BALSAMIC/commands/config/pon.py index 1d28b1a4c..6d8695f8f 100644 --- a/BALSAMIC/commands/config/pon.py +++ b/BALSAMIC/commands/config/pon.py @@ -1,124 +1,110 @@ -import os -import json +"""Balsamic panel of normals config case CLI.""" import logging +from datetime import datetime from pathlib import Path +from typing import Dict import click from BALSAMIC import __version__ as balsamic_version +from BALSAMIC.commands.options import ( + OPTION_ADAPTER_TRIM, + OPTION_ANALYSIS_DIR, + OPTION_BALSAMIC_CACHE, + OPTION_CACHE_VERSION, + OPTION_CASE_ID, + OPTION_FASTQ_PATH, + OPTION_GENOME_INTERVAL, + OPTION_GENOME_VERSION, + OPTION_PANEL_BED, + OPTION_PON_VERSION, + OPTION_PON_WORKFLOW, + OPTION_QUALITY_TRIM, + OPTION_UMI, + OPTION_UMI_TRIM_LENGTH, +) +from BALSAMIC.constants.analysis import BIOINFO_TOOL_ENV, PONWorkflow +from BALSAMIC.constants.cache import GenomeVersion +from BALSAMIC.constants.constants import FileType +from BALSAMIC.constants.paths import CONTAINERS_DIR +from BALSAMIC.models.config import ConfigModel from BALSAMIC.utils.cli import ( - create_fastq_symlink, generate_graph, + get_analysis_fastq_files_directory, get_bioinfo_tools_version, - create_pon_fastq_symlink, -) -from BALSAMIC.utils.models import PonBalsamicConfigModel - -from BALSAMIC.constants.common import ( - CONTAINERS_CONDA_ENV_PATH, - BIOINFO_TOOL_ENV, + get_pon_sample_list, ) +from BALSAMIC.utils.io import read_json, write_json +from BALSAMIC.utils.utils import get_absolute_paths_dict LOG = logging.getLogger(__name__) @click.command("pon", short_help="Create a sample config file for PON analysis") -@click.option("--case-id", required=True, help="Sample id used for reporting analysis") -@click.option( - "--umi/--no-umi", - default=True, - show_default=True, - is_flag=True, - help=( - "UMI processing steps for samples with UMI tags." - "For WGS cases,by default UMI is disabled." 
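A change that recurs throughout this diff is the Pydantic v2 migration: every .dict(by_alias=True, exclude_none=True) becomes .model_dump(by_alias=True, exclude_none=True) with the same semantics. A self-contained illustration with a hypothetical stand-in model, not the real ConfigModel:

from typing import Optional

from pydantic import BaseModel, ConfigDict, Field

class AnalysisStub(BaseModel):
    # Hypothetical stand-in for ConfigModel (BALSAMIC.models.config).
    model_config = ConfigDict(populate_by_name=True)

    case_id: str
    qc: dict = Field(default_factory=dict, alias="QC")
    panel: Optional[dict] = None

stub = AnalysisStub(case_id="case1", QC={"umi_trim": True})
print(stub.model_dump(by_alias=True, exclude_none=True))
# {'case_id': 'case1', 'QC': {'umi_trim': True}} -- 'panel' is dropped by exclude_none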
- ), -) -@click.option( - "--umi-trim-length", - default=5, - show_default=True, - type=int, - help="Trimming first N bases from reads in fastq file", -) -@click.option( - "--quality-trim/--no-quality-trim", - default=True, - show_default=True, - is_flag=True, - help="Trimming low quality reads in fastq file", -) -@click.option( - "--adapter-trim/--no-adapter-trim", - default=True, - show_default=True, - is_flag=True, - help="Preprocess fastq reads by trimming adapters", -) -@click.option( - "-p", - "--panel-bed", - type=click.Path(exists=True, resolve_path=True), - required=False, - help="Panel bed file for calculating target regions.", -) -@click.option( - "--balsamic-cache", - type=click.Path(exists=True, resolve_path=True), - required=True, - help="Path to BALSAMIC cache", -) -@click.option( - "--analysis-dir", - type=click.Path(exists=True, resolve_path=True), - required=True, - help="Root analysis path directory.", -) -@click.option( - "--fastq-path", - type=click.Path(exists=True, resolve_path=True), - required=True, - help="Path directing to list of PON fastq samples.", -) -@click.option( - "-g", - "--genome-version", - default="hg19", - type=click.Choice(["hg19"]), - help=( - "Genome version to prepare reference. Path to genome" - "will be /genome_version" - ), -) -@click.option( - "-v", - "--version", - default="v1", - type=str, - help="Version of the PON file to be generated", -) +@OPTION_ADAPTER_TRIM +@OPTION_ANALYSIS_DIR +@OPTION_BALSAMIC_CACHE +@OPTION_CACHE_VERSION +@OPTION_CASE_ID +@OPTION_FASTQ_PATH +@OPTION_GENOME_VERSION +@OPTION_GENOME_INTERVAL +@OPTION_PANEL_BED +@OPTION_PON_WORKFLOW +@OPTION_PON_VERSION +@OPTION_QUALITY_TRIM +@OPTION_UMI +@OPTION_UMI_TRIM_LENGTH @click.pass_context def pon_config( - context, - case_id, - analysis_dir, - fastq_path, - panel_bed, - quality_trim, - umi, - umi_trim_length, - adapter_trim, - genome_version, - balsamic_cache, - version, + context: click.Context, + adapter_trim: bool, + analysis_dir: Path, + balsamic_cache: Path, + cache_version: str, + case_id: str, + fastq_path: Path, + genome_version: GenomeVersion, + genome_interval: Path, + panel_bed: Path, + pon_workflow: PONWorkflow, + quality_trim: bool, + umi: bool, + umi_trim_length: int, + version: str, ): - reference_config = os.path.join( - balsamic_cache, balsamic_version, genome_version, "reference.json" + references_path: Path = Path(balsamic_cache, cache_version, genome_version) + references: Dict[str, Path] = get_absolute_paths_dict( + base_path=references_path, + data=read_json(Path(references_path, f"reference.{FileType.JSON}").as_posix()), ) - with open(reference_config, "r") as f: - reference_dict = json.load(f)["reference"] - config_collection_dict = PonBalsamicConfigModel( + if pon_workflow in [PONWorkflow.GENS_MALE, PONWorkflow.GENS_FEMALE]: + if not genome_interval: + raise click.BadParameter( + "Argument: genome_interval is required for GENS PON creation." + ) + references["genome_interval"] = genome_interval + + if pon_workflow == PONWorkflow.CNVKIT and not panel_bed: + raise click.BadParameter( + "Argument: panel_bed is required for CNVkit PON creation."
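Both config commands now route their file handling through the shared read_json and write_json helpers from BALSAMIC.utils.io instead of open-coding json.load and json.dumps. Assuming they are thin wrappers over the standard library (the signatures match the call sites in this diff; the bodies are a guess):

import json
from typing import Any

def read_json(path: str) -> Any:
    # Load a JSON document, e.g. the versioned reference.json from the cache.
    with open(path, "r") as handle:
        return json.load(handle)

def write_json(json_obj: Any, path: str) -> None:
    # Persist a config dict, e.g. <case_id>_PON.json, with stable formatting.
    with open(path, "w") as handle:
        json.dump(json_obj, handle, indent=4)

Centralising the I/O keeps error handling and formatting consistent between the case and PON config commands.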
+ ) + + fastq_path: str = get_analysis_fastq_files_directory( + case_dir=Path(analysis_dir, case_id).as_posix(), fastq_path=fastq_path + ) + result_dir: Path = Path(analysis_dir, case_id, "analysis") + log_dir: Path = Path(analysis_dir, case_id, "logs") + script_dir: Path = Path(analysis_dir, case_id, "scripts") + benchmark_dir: Path = Path(analysis_dir, case_id, "benchmarks") + dag_path: Path = Path( + analysis_dir, case_id, f"{case_id}_BALSAMIC_{balsamic_version}_graph.pdf" + ) + for directory in [result_dir, log_dir, script_dir, benchmark_dir]: + directory.mkdir(exist_ok=True) + + config_collection_dict = ConfigModel( QC={ "adapter_trim": adapter_trim, "quality_trim": quality_trim, @@ -128,45 +114,36 @@ def pon_config( analysis={ "case_id": case_id, "analysis_dir": analysis_dir, + "fastq_path": fastq_path, + "log": log_dir.as_posix(), + "script": script_dir.as_posix(), + "result": result_dir.as_posix(), + "benchmark": benchmark_dir.as_posix(), + "dag": dag_path.as_posix(), "analysis_type": "pon", + "pon_workflow": pon_workflow, "pon_version": version, "analysis_workflow": "balsamic", "sequencing_type": "targeted" if panel_bed else "wgs", + "config_creation_date": datetime.now().strftime("%Y-%m-%d %H:%M"), + }, + samples=get_pon_sample_list(fastq_path), + reference=references, + singularity={ + "image": Path(balsamic_cache, cache_version, "containers").as_posix() }, - reference=reference_dict, - singularity=os.path.join(balsamic_cache, balsamic_version, "containers"), bioinfo_tools=BIOINFO_TOOL_ENV, bioinfo_tools_version=get_bioinfo_tools_version( bioinfo_tools=BIOINFO_TOOL_ENV, - container_conda_env_path=CONTAINERS_CONDA_ENV_PATH, + container_conda_env_path=CONTAINERS_DIR, ), panel={"capture_kit": panel_bed} if panel_bed else None, - ).dict(by_alias=True, exclude_none=True) - LOG.info("PON config file generated successfully") - - Path.mkdir( - Path(config_collection_dict["analysis"]["fastq_path"]), - parents=True, - exist_ok=True, - ) - LOG.info("fastq directories created successfully") + ).model_dump(by_alias=True, exclude_none=True) + LOG.info("PON config model instantiated successfully") - create_pon_fastq_symlink( - pon_fastqs=fastq_path, - symlink_dir=Path(config_collection_dict["analysis"]["fastq_path"]), - ) - LOG.info(f"fastqs symlinks generated successfully") - - config_path = Path(analysis_dir) / case_id / (case_id + "_PON" + ".json") - with open(config_path, "w+") as fh: - fh.write(json.dumps(config_collection_dict, indent=4)) + config_path = Path(analysis_dir, case_id, case_id + "_PON.json").as_posix() + write_json(json_obj=config_collection_dict, path=config_path) LOG.info(f"PON config file saved successfully - {config_path}") - try: - generate_graph(config_collection_dict, config_path) - LOG.info(f"BALSAMIC PON workflow has been configured successfully!") - except ValueError: - LOG.error( - f'BALSAMIC PON dag graph generation failed - {config_collection_dict["analysis"]["dag"]}' - ) - raise click.Abort() + generate_graph(config_collection_dict, config_path) + LOG.info(f"BALSAMIC PON workflow has been configured successfully!") diff --git a/BALSAMIC/commands/init/base.py b/BALSAMIC/commands/init/base.py index 3f74e075c..af1286ec9 100644 --- a/BALSAMIC/commands/init/base.py +++ b/BALSAMIC/commands/init/base.py @@ -1,332 +1,175 @@ -import os -import sys -import re +"""Balsamic init command.""" +import json import logging import subprocess +import sys +from datetime import datetime from pathlib import Path +from typing import Union, List, Optional import click -import graphviz 
-import snakemake -from BALSAMIC.constants.common import ( - BIOINFO_TOOL_ENV, - BALSAMIC_DOCKER_PATH, - VALID_CONTAINER_CONDA_NAME, -) -from BALSAMIC.utils.cli import ( - CaptureStdout, - get_snakefile, - SnakeMake, - get_config, - get_schedulerpy, - # job_id_dump_to_yaml, -) -from BALSAMIC import __version__ as balsamic_version -from BALSAMIC.utils.io import write_json +from BALSAMIC.commands.options import ( + OPTION_RUN_MODE, + OPTION_CLUSTER_PROFILE, + OPTION_CLUSTER_QOS, + OPTION_CLUSTER_ACCOUNT, + OPTION_SNAKEMAKE_OPT, + OPTION_GENOME_VERSION, + OPTION_FORCE_ALL, + OPTION_RUN_ANALYSIS, + OPTION_CLUSTER_MAIL, + OPTION_CLUSTER_MAIL_TYPE, + OPTION_QUIET, + OPTION_CACHE_VERSION, + OPTION_SNAKEFILE, + OPTION_OUT_DIR, + OPTION_CLUSTER_CONFIG, + OPTION_COSMIC_KEY, +) +from BALSAMIC.constants.analysis import BIOINFO_TOOL_ENV, RunMode +from BALSAMIC.constants.cache import GenomeVersion, REFERENCE_FILES +from BALSAMIC.constants.cluster import ( + ClusterMailType, + QOS, + ClusterProfile, + ClusterConfigType, +) +from BALSAMIC.models.cache import CacheConfig, ReferencesHg, ReferencesCanFam +from BALSAMIC.models.snakemake import SnakemakeExecutable +from BALSAMIC.utils.analysis import get_cache_singularity_bind_paths +from BALSAMIC.utils.cache import get_containers +from BALSAMIC.utils.cli import get_snakefile, get_config_path +from BALSAMIC.utils.io import write_json, generate_workflow_graph LOG = logging.getLogger(__name__) @click.command( - "init", short_help="Download matching version for container and build reference" -) -@click.option( - "-o", - "--outdir", - "--out-dir", - required=True, - help=( - "Output directory for ref files." - "This path will be used as base path for files" - ), -) -@click.option( - "-v", - "--container-version", - show_default=True, - default=balsamic_version, - type=click.Choice(["develop", "master", balsamic_version]), - help="Container for BALSAMIC version to download", -) -@click.option( - "-f", - "--force", - show_default=True, - default=False, - is_flag=True, - help="Force re-downloading all containers", -) -@click.option("-c", "--cosmic-key", required=False, help="cosmic db authentication key") -@click.option( - "-s", - "--snakefile", - default=None, - type=click.Path(), - show_default=True, - help="snakefile for reference generation", -) -@click.option( - "-d", - "--dagfile", - default="generate_ref_worflow_graph", - show_default=True, - help="DAG file for overview", -) -@click.option( - "-g", - "--genome-version", - default="hg19", - type=click.Choice(["hg19", "hg38", "canfam3"]), - help=( - "Genome version to prepare reference. Path to genome" - "will be /genome_version" - ), -) -@click.option( - "-r", - "--run-analysis", - show_default=True, - default=False, - is_flag=True, - help=( - "By default balsamic run_analysis will run in dry run mode." - "Raise this flag to make the actual analysis" - ), -) -@click.option( - "--run-mode", - show_default=True, - default="cluster", - type=click.Choice(["local", "cluster"]), - help=( - "Run mode to use. By default SLURM will be used to generate the balsamic_cache" - "Alternatively, option for local computing" - ), -) -@click.option( - "--cluster-config", - show_default=True, - default=get_config("reference_cluster"), - type=click.Path(), - help="cluster config json file. 
(eg- SLURM, QSUB)", -) -@click.option( - "-p", - "--profile", - default="slurm", - type=click.Choice(["slurm", "qsub"]), - help="cluster profile to submit jobs", -) -@click.option( - "--account", - "--slurm-account", - "--qsub-account", - help="cluster account to run jobs, ie: slurm_account", -) -@click.option( - "--qos", - type=click.Choice(["low", "normal", "high", "express"]), - show_default=True, - default="low", - help="QOS for sbatch jobs. Passed to " + get_schedulerpy(), -) -@click.option( - "--mail-user", help="cluster mail user to send out email. e.g.: slurm_mail_user" -) -@click.option( - "--mail-type", - type=click.Choice( - [ - "NONE", - "BEGIN", - "END", - "FAIL", - "REQUEUE", - "ALL", - "TIME_LIMIT", - ] - ), - help=( - "cluster mail type to send out email. This will " - "be applied to all jobs and override snakemake settings." - ), -) -@click.option( - "-f", - "--force-all", - show_default=True, - default=False, - is_flag=True, - help="Force run all analysis. This is same as snakemake --forceall", -) -@click.option( - "--snakemake-opt", multiple=True, help="Pass these options directly to snakemake" -) -@click.option( - "-q", - "--quiet", - default=False, - is_flag=True, - help="Instruct snakemake to be quiet! No output will be printed", -) + "init", short_help="Download singularity containers and build the reference cache" +) +@OPTION_OUT_DIR +@OPTION_CACHE_VERSION +@OPTION_CLUSTER_ACCOUNT +@OPTION_CLUSTER_CONFIG +@OPTION_CLUSTER_MAIL +@OPTION_CLUSTER_MAIL_TYPE +@OPTION_CLUSTER_PROFILE +@OPTION_CLUSTER_QOS +@OPTION_COSMIC_KEY +@OPTION_FORCE_ALL +@OPTION_GENOME_VERSION +@OPTION_QUIET +@OPTION_RUN_ANALYSIS +@OPTION_RUN_MODE +@OPTION_SNAKEFILE +@OPTION_SNAKEMAKE_OPT @click.pass_context def initialize( - context, - outdir, - container_version, - force, - cosmic_key, - snakefile, - dagfile, - genome_version, - run_analysis, - run_mode, - cluster_config, - account, - qos, - profile, - mail_user, - mail_type, - force_all, - quiet, - snakemake_opt, -): - """ - Initialize various resources after first installation. 
- - Pull container(s) for BALSAMIC according to matching version - - Download and build a reference - """ - config_dict = dict() - config_dict["singularity"] = dict() - - LOG.info("BALSAMIC started with log level %s" % context.obj["loglevel"]) - - if run_mode == "cluster" and not run_analysis: + context: click.Context, + account: Optional[str], + cache_version: str, + cluster_config: Path, + cosmic_key: str, + force_all: bool, + genome_version: GenomeVersion, + mail_type: Optional[ClusterMailType], + mail_user: Optional[str], + out_dir: str, + profile: ClusterProfile, + qos: QOS, + quiet: bool, + run_analysis: bool, + run_mode: RunMode, + snakefile: Path, + snakemake_opt: List[str], +) -> None: + """Validate inputs and download reference caches and containers.""" + LOG.info(f"BALSAMIC started with log level {context.obj['log_level']}") + + if run_mode == RunMode.CLUSTER and not run_analysis: LOG.info("Changing run-mode to local on dry-run") - run_mode = "local" - - if run_mode == "cluster" and not account: - LOG.info( - "slurm-account, qsub-account, or account is required for slurm run mode" - ) - raise click.Abort() + run_mode: RunMode = RunMode.LOCAL - if genome_version in ["hg38", "hg19"] and not cosmic_key: - LOG.error("cosmic db authentication key required with hg38 and hg19") + if run_mode == RunMode.CLUSTER and not account: + LOG.error("A cluster account is required for cluster run mode") raise click.Abort() - # resolve outdir to absolute path - outdir = Path(outdir).resolve() - container_outdir = Path(outdir, balsamic_version, "containers") - Path(container_outdir).mkdir(parents=True, exist_ok=True) - config_dict["singularity"]["image_path"] = container_outdir.as_posix() - config_dict["singularity"]["containers"] = dict() - - pattern = re.compile(r"^(\d+\.)?(\d+\.)?(\*|\d+)$") - if pattern.findall(container_version): - docker_image_base_name = "release_v{}".format(container_version) - else: - docker_image_base_name = container_version - - for image_suffix in VALID_CONTAINER_CONDA_NAME: - container_stub_url = "{}:{}-{}".format( - BALSAMIC_DOCKER_PATH, docker_image_base_name, image_suffix + if genome_version in [GenomeVersion.HG19, GenomeVersion.HG38] and not cosmic_key: + LOG.error( + f"No COSMIC authentication key specified. It is required when using {genome_version} reference" ) - config_dict["singularity"]["containers"][image_suffix] = container_stub_url - - config_path = Path(__file__).parents[2] / "config" - config_path = config_path.absolute() - - rule_directory = Path(__file__).parents[2] - - config_dict["bioinfo_tools"] = BIOINFO_TOOL_ENV - config_dict["rule_directory"] = rule_directory.as_posix() + "/" - - reference_outdir = Path(outdir, balsamic_version, genome_version) - Path(reference_outdir).mkdir(parents=True, exist_ok=True) - config_json = Path(reference_outdir, "config.json").as_posix() - dagfile_path = Path(reference_outdir, dagfile).as_posix() - logpath = Path(reference_outdir, "logs") - Path(logpath).mkdir(parents=True, exist_ok=True) - scriptpath = Path(reference_outdir, "scripts") - Path(scriptpath).mkdir(parents=True, exist_ok=True) - - config_dict["output"] = reference_outdir.as_posix() - if cosmic_key: - config_dict["cosmic_key"] = cosmic_key + raise click.Abort() - config_dict["genome_version"] = genome_version - config_dict["analysis"] = {} - config_dict["analysis"]["case_id"] = ( - "reference" + "." 
+ genome_version + ".v" + balsamic_version + out_dir: Path = Path(out_dir, cache_version).absolute() + references_dir: Path = Path(out_dir, genome_version) + genome_dir = Path(references_dir, "genome") + variants_dir = Path(references_dir, "variants") + vep_dir = Path(references_dir, "vep") + containers_dir: Path = Path(out_dir, "containers") + config_path: Path = Path(references_dir, "config.json") + log_dir: Path = Path(references_dir, "logs") + script_dir: Path = Path(references_dir, "scripts") + for dir_path in [references_dir, log_dir, script_dir]: + dir_path.mkdir(parents=True, exist_ok=True) + + references: Union[ReferencesHg, ReferencesCanFam] = REFERENCE_FILES[genome_version] + cache_config: CacheConfig = CacheConfig( + analysis={"case_id": f"reference.{genome_version}.{cache_version}"}, + references_dir=references_dir.as_posix(), + genome_dir=genome_dir.as_posix(), + variants_dir=variants_dir.as_posix(), + vep_dir=vep_dir.as_posix(), + containers_dir=containers_dir.as_posix(), + genome_version=genome_version, + cosmic_key=cosmic_key, + bioinfo_tools=BIOINFO_TOOL_ENV, + containers=get_containers(cache_version), + references=references, + references_date=datetime.now().strftime("%Y-%m-%d %H:%M"), ) + write_json( + json_obj=json.loads(cache_config.model_dump_json(exclude_none=True)), + path=config_path.as_posix(), + ) + LOG.info(f"Reference workflow configured successfully ({config_path.as_posix()})") - write_json(config_dict, config_json) - LOG.info("Reference generation workflow configured successfully - %s" % config_json) - - snakefile = ( - snakefile - if snakefile - else get_snakefile("generate_ref", "balsamic", genome_version) + snakefile: Path = ( + snakefile if snakefile else get_snakefile("generate_ref", "balsamic") ) - with CaptureStdout() as graph_dot: - snakemake.snakemake( - snakefile=snakefile, - dryrun=True, - configfiles=[config_json], - printrulegraph=True, - ) + generate_workflow_graph( + config_path=config_path, + directory_path=references_dir, + snakefile=snakefile, + title="reference", + ) - graph_title = "_".join(["BALSAMIC", balsamic_version, "Generate reference"]) - graph_dot = "".join(graph_dot).replace( - "snakemake_dag {", 'BALSAMIC { label="' + graph_title + '";labelloc="t";' + LOG.info("Starting reference generation workflow...") + snakemake_executable: SnakemakeExecutable = SnakemakeExecutable( + account=account, + case_id=cache_config.analysis.case_id, + cluster_config_path=cluster_config + if cluster_config + else get_config_path(ClusterConfigType.CACHE), + config_path=config_path, + force=force_all, + log_dir=log_dir, + mail_type=mail_type, + mail_user=mail_user, + profile=profile, + qos=qos, + quiet=quiet, + result_dir=references_dir, + run_analysis=run_analysis, + run_mode=run_mode, + script_dir=script_dir, + snakemake_options=snakemake_opt, + singularity_bind_paths=get_cache_singularity_bind_paths(cache_config), + snakefile=snakefile, + working_dir=references_dir, ) - graph_obj = graphviz.Source( - graph_dot, filename=dagfile_path, format="pdf", engine="dot" + subprocess.run( + f"{sys.executable} -m {snakemake_executable.get_command()}", + shell=True, ) - - try: - graph_pdf = graph_obj.render() - LOG.info("Reference workflow graph generated successfully - %s " % graph_pdf) - except Exception: - LOG.error("Reference workflow graph generation failed") - raise click.Abort() - - LOG.info("Reference generation workflow started") - - # Singularity bind path - bind_path = list() - bind_path.append(config_dict["output"]) - 
bind_path.append(config_dict["rule_directory"]) - - # Construct snakemake command to run workflow - balsamic_run = SnakeMake() - balsamic_run.working_dir = config_dict["output"] - balsamic_run.snakefile = snakefile - balsamic_run.configfile = config_json - balsamic_run.run_mode = run_mode - balsamic_run.forceall = force_all - balsamic_run.run_analysis = run_analysis - balsamic_run.cluster_config = cluster_config - balsamic_run.scheduler = get_schedulerpy() - balsamic_run.profile = profile - balsamic_run.account = account - balsamic_run.qos = qos - balsamic_run.log_path = logpath - balsamic_run.script_path = scriptpath - balsamic_run.result_path = reference_outdir - balsamic_run.case_name = config_dict["analysis"]["case_id"] - balsamic_run.quiet = quiet - if mail_type: - balsamic_run.mail_type = mail_type - balsamic_run.mail_user = mail_user - balsamic_run.sm_opt = snakemake_opt - - # Always use singularity - balsamic_run.use_singularity = True - balsamic_run.singularity_bind = bind_path - - cmd = sys.executable + " -m " + balsamic_run.build_cmd() - subprocess.run(cmd, shell=True) diff --git a/BALSAMIC/commands/options.py b/BALSAMIC/commands/options.py new file mode 100644 index 000000000..02194ae7b --- /dev/null +++ b/BALSAMIC/commands/options.py @@ -0,0 +1,428 @@ +"""Balsamic command options.""" +import click + +from BALSAMIC import __version__ as balsamic_version +from BALSAMIC.constants.analysis import ( + RunMode, + RUN_MODES, + ANALYSIS_WORKFLOWS, + AnalysisWorkflow, + Gender, + RULE_DELIVERY_MODES, + RuleDeliveryMode, + PON_WORKFLOWS, + PONWorkflow, +) +from BALSAMIC.constants.cache import GenomeVersion, CacheVersion, GENOME_VERSIONS +from BALSAMIC.constants.cluster import ( + ClusterProfile, + QOS, + CLUSTER_PROFILES, + QOS_OPTIONS, + CLUSTER_MAIL_TYPES, +) +from BALSAMIC.constants.constants import LogLevel, LOG_LEVELS +from BALSAMIC.constants.rules import DELIVERY_RULES +from BALSAMIC.constants.workflow_params import VCF_DICT +from BALSAMIC.utils.cli import validate_cache_version + + +OPTION_ADAPTER_TRIM = click.option( + "--adapter-trim/--no-adapter-trim", + default=True, + show_default=True, + is_flag=True, + help="Trim adapters from reads in FASTQ file", +) + +OPTION_ANALYSIS_DIR = click.option( + "--analysis-dir", + type=click.Path(exists=True, resolve_path=True), + required=True, + help="Path to store the analysis results", +) + +OPTION_ANALYSIS_WORKFLOW = click.option( + "-w", + "--analysis-workflow", + default=AnalysisWorkflow.BALSAMIC, + show_default=True, + type=click.Choice(ANALYSIS_WORKFLOWS), + help="Balsamic analysis workflow to be executed", +) + +OPTION_BACKGROUND_VARIANTS = click.option( + "-b", + "--background-variants", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="Background set of valid variants for UMI", +) + +OPTION_BALSAMIC_CACHE = click.option( + "--balsamic-cache", + type=click.Path(exists=True, resolve_path=True), + required=True, + help="Path to BALSAMIC cache", +) + +OPTION_BENCHMARK = click.option( + "--benchmark", + default=False, + is_flag=True, + help="Profile slurm jobs. Make sure you have slurm profiler enabled in your HPC.", +) + +OPTION_CACHE_VERSION = click.option( + "--cache-version", + show_default=True, + default=balsamic_version, + type=click.STRING, + callback=validate_cache_version, + help=f"Cache version to be used for init or analysis. 
Use '{CacheVersion.DEVELOP}' or 'X.X.X'.", +) + +OPTION_CADD_ANNOTATIONS = click.option( + "--cadd-annotations", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="Path of CADD annotations", +) + +OPTION_CANCER_GERMLINE_SNV_OBSERVATIONS = click.option( + "--cancer-germline-snv-observations", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="VCF path of cancer germline SNV normal observations (WGS analysis workflow)", +) + +OPTION_CANCER_SOMATIC_SNV_OBSERVATIONS = click.option( + "--cancer-somatic-snv-observations", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="VCF path of cancer SNV tumor observations (WGS analysis workflow)", +) + +OPTION_CANCER_SOMATIC_SV_OBSERVATIONS = click.option( + "--cancer-somatic-sv-observations", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="VCF path of cancer SV observations (WGS analysis workflow)", +) + +OPTION_CASE_ID = click.option( + "--case-id", + required=True, + help="Sample ID for reporting, naming the analysis jobs, and analysis path", +) + +OPTION_CLINICAL_SNV_OBSERVATIONS = click.option( + "--clinical-snv-observations", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="VCF path of clinical SNV observations (WGS analysis workflow)", +) + +OPTION_CLINICAL_SV_OBSERVATIONS = click.option( + "--clinical-sv-observations", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="VCF path of clinical SV observations (WGS analysis workflow)", +) + +OPTION_CLUSTER_ACCOUNT = click.option( + "--account", + type=click.STRING, + help="Cluster account to run jobs", +) + +OPTION_CLUSTER_CONFIG = click.option( + "--cluster-config", + type=click.Path(), + help="Cluster configuration JSON file path", +) + +OPTION_CLUSTER_MAIL = click.option( + "--mail-user", + type=click.STRING, + help="User email to receive notifications from the cluster", +) + +OPTION_CLUSTER_MAIL_TYPE = click.option( + "--mail-type", + type=click.Choice(CLUSTER_MAIL_TYPES), + help="The mail type triggering cluster emails", +) + +OPTION_CLUSTER_PROFILE = click.option( + "-p", + "--profile", + show_default=True, + default=ClusterProfile.SLURM, + type=click.Choice(CLUSTER_PROFILES), + help="Cluster profile to submit jobs", +) + +OPTION_CLUSTER_QOS = click.option( + "--qos", + show_default=True, + default=QOS.LOW, + type=click.Choice(QOS_OPTIONS), + help="QOS for cluster jobs", +) + +OPTION_COSMIC_KEY = click.option( + "-c", + "--cosmic-key", + required=False, + type=click.STRING, + help="Cosmic DB authentication key", +) + +OPTION_DELIVERY_MODE = click.option( + "-m", + "--delivery-mode", + type=click.Choice(RULE_DELIVERY_MODES), + default=RuleDeliveryMode.APPEND, + show_default=True, + help=f"Append rules to deliver to the current delivery option ({RuleDeliveryMode.APPEND}) or deliver only " + f"the ones specified ({RuleDeliveryMode.RESET})", +) + +OPTION_DISABLE_VARIANT_CALLER = click.option( + "--disable-variant-caller", + help=f"Run the workflow with the selected variant caller(s) disabled. Use commas to disable multiple variant callers.
Valid " + f"values are: {list(VCF_DICT.keys())}", +) + +OPTION_DRAGEN = click.option( + "--dragen", + is_flag=True, + default=False, + help="Enable dragen variant caller", +) + +OPTION_FASTQ_PATH = click.option( + "--fastq-path", + type=click.Path(exists=True, resolve_path=True), + required=True, + help="Path to directory containing unconcatenated FASTQ files", +) + +OPTION_FORCE_ALL = click.option( + "--force-all", + show_default=True, + default=False, + is_flag=True, + help="Force execution. This is equivalent to Snakemake --forceall.", +) + +OPTION_GENDER = click.option( + "--gender", + required=False, + type=click.Choice([Gender.FEMALE, Gender.MALE]), + default=Gender.FEMALE, + show_default=True, + help="Sample associated gender", +) + +OPTION_GENOME_VERSION = click.option( + "-g", + "--genome-version", + show_default=True, + default=GenomeVersion.HG19, + type=click.Choice(GENOME_VERSIONS), + help="Type and build version of the reference genome", +) + +OPTION_GENOME_INTERVAL = click.option( + "--genome-interval", + required=False, + type=click.Path(exists=True, resolve_path=True), + help="Genome 100 bp interval-file (created with gatk PreprocessIntervals), used for GENS pre-processing.", +) + +OPTION_GENS_COV_PON = click.option( + "--gens-coverage-pon", + required=False, + type=click.Path(exists=True, resolve_path=True), + help="GENS PON file, either male or female (created with gatk CreateReadCountPanelOfNormals), used for GENS pre-processing.", +) + +OPTION_GNOMAD_AF5 = click.option( + "--gnomad-min-af5", + required=False, + type=click.Path(exists=True, resolve_path=True), + help="Gnomad VCF filtered to keep >= 0.05 AF, used for GENS pre-processing.", +) + +OPTION_LOG_LEVEL = click.option( + "--log-level", + default=LogLevel.INFO, + type=click.Choice(LOG_LEVELS), + help="Logging level in terms of urgency", + show_default=True, +) + +OPTION_NORMAL_SAMPLE_NAME = click.option( + "--normal-sample-name", + required=False, + type=click.STRING, + help="Normal sample name", +) + +OPTION_OUT_DIR = click.option( + "-o", + "--out-dir", + required=True, + type=click.Path(exists=True), + help="Output directory for singularity containers and reference files", +) + +OPTION_PANEL_BED = click.option( + "-p", + "--panel-bed", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="Panel bed file of target regions", +) + +OPTION_PON_CNN = click.option( + "--pon-cnn", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="Panel of normal reference (.cnn) for CNVkit", +) + +OPTION_PON_WORKFLOW = click.option( + "--pon-workflow", + type=click.Choice(PON_WORKFLOWS), + default=PONWorkflow.CNVKIT, + required=True, + help="Specify which PON to create.", +) + +OPTION_PON_VERSION = click.option( + "-v", + "--version", + default="v1", + type=click.STRING, + help="Version of the PON file to be generated", +) + +OPTION_PRINT_FILES = click.option( + "-p", + "--print-files", + is_flag=True, + default=False, + show_default=True, + help="Print list of analysis files. 
Otherwise only final count will be printed.", +) + +OPTION_QUALITY_TRIM = click.option( + "--quality-trim/--no-quality-trim", + default=True, + show_default=True, + is_flag=True, + help="Trim low quality reads in FASTQ file", +) + +OPTION_QUIET = click.option( + "-q", + "--quiet", + default=False, + is_flag=True, + help="Instruct Snakemake to not output any progress or rule information", +) + +OPTION_RULES_TO_DELIVER = click.option( + "-r", + "--rules-to-deliver", + multiple=True, + type=click.Choice(DELIVERY_RULES), + help="Specify the rules to deliver. The delivery mode is selected via the --delivery-mode option.", +) + +OPTION_RUN_ANALYSIS = click.option( + "-r", + "--run-analysis", + show_default=True, + default=False, + is_flag=True, + help="Flag to run the actual analysis", +) + +OPTION_RUN_MODE = click.option( + "--run-mode", + show_default=True, + default=RunMode.CLUSTER, + type=click.Choice(RUN_MODES), + help="Run mode to execute Balsamic workflows", +) + +OPTION_SAMPLE_CONFIG = click.option( + "-s", + "--sample-config", + required=True, + type=click.Path(), + help="Sample configuration file", +) + +OPTION_SHOW_ONLY_MISSING_FILES = click.option( + "-m", + "--show-only-missing", + is_flag=True, + default=False, + show_default=True, + help="Only show missing analysis files.", +) + +OPTION_SNAKEFILE = click.option( + "-S", + "--snakefile", + type=click.Path(), + help="Custom Snakefile for internal testing", +) + +OPTION_SNAKEMAKE_OPT = click.option( + "--snakemake-opt", + multiple=True, + help="Options to be passed to Snakemake", +) + +OPTION_SWEGEN_SNV = click.option( + "--swegen-snv", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="VCF path of Swegen SNV frequency database", +) + +OPTION_SWEGEN_SV = click.option( + "--swegen-sv", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="VCF path of Swegen SV frequency database", +) + +OPTION_TUMOR_SAMPLE_NAME = click.option( + "--tumor-sample-name", + required=True, + type=click.STRING, + help="Tumor sample name", +) + +OPTION_UMI = click.option( + "--umi/--no-umi", + default=True, + show_default=True, + is_flag=True, + help="UMI processing steps for samples with UMI tags.
For WGS cases, UMI is always disabled.", +) + +OPTION_UMI_TRIM_LENGTH = click.option( + "--umi-trim-length", + default=5, + show_default=True, + type=click.INT, + help="Trim N bases from reads in FASTQ file", +) diff --git a/BALSAMIC/commands/plugins/base.py b/BALSAMIC/commands/plugins/base.py deleted file mode 100644 index 1d3e069c7..000000000 --- a/BALSAMIC/commands/plugins/base.py +++ /dev/null @@ -1,19 +0,0 @@ -import click - -from BALSAMIC.commands.plugins.scout import scout as scout_command -from BALSAMIC.commands.plugins.cov_plot import ( - target_cov_plot as target_cov_plot_command, -) -from BALSAMIC.commands.plugins.vcfutils import vcfutils as vcfutils_command - - -@click.group() -@click.pass_context -def plugins(context): - """Additional and helper utilities for third party applications""" - pass - - -plugins.add_command(scout_command) -plugins.add_command(target_cov_plot_command) -plugins.add_command(vcfutils_command) diff --git a/BALSAMIC/commands/plugins/cov_plot.py b/BALSAMIC/commands/plugins/cov_plot.py deleted file mode 100644 index b37544115..000000000 --- a/BALSAMIC/commands/plugins/cov_plot.py +++ /dev/null @@ -1,13 +0,0 @@ -import logging -import click - -LOG = logging.getLogger(__name__) - - -@click.command("target-cov-plot", short_help="Plots coverage for target regions.") -@click.pass_context -def target_cov_plot(context): - """ - cli for coverage plot sub-command. - Creates coverage plots in result_directory. - """ diff --git a/BALSAMIC/commands/plugins/scout.py b/BALSAMIC/commands/plugins/scout.py deleted file mode 100644 index fdb1bf567..000000000 --- a/BALSAMIC/commands/plugins/scout.py +++ /dev/null @@ -1,104 +0,0 @@ -import os -import logging -import json -import yaml -import click -import datetime - -from BALSAMIC.utils.rule import get_result_dir - -LOG = logging.getLogger(__name__) - - -@click.command("scout", short_help="Creates a scout config yaml file.") -@click.option( - "--sample-config", - required=True, - help="Sample config file. 
Output of balsamic config sample", -) -@click.option( - "--snv-vcf", default="vcfmerge", help="variant caller to load as vcf_cancer" -) -@click.option("--tumor", default="TUMOR", help="sample name for tumor sample") -@click.option("--normal", default="NORMAL", help="sample name for normal sample") -@click.option( - "--sv-vcf", default="manta", help="variant caller to load as vcf_cancer_sv" -) -@click.option("--customer-id", required=True, help="customer id for scout config") -@click.pass_context -def scout(context, sample_config, snv_vcf, sv_vcf, customer_id, tumor, normal): - """ - Create a scout config.yaml file - """ - LOG.info(f"BALSAMIC started with log level {context.obj['loglevel']}.") - LOG.info("Adding scout cancer template to delivery directory") - - with open(sample_config, "r") as fn: - sample_config = json.load(fn) - case_name = sample_config["analysis"]["case_id"] - capture_kit = os.path.basename(sample_config["panel"]["capture_kit"]) - - result_dir = get_result_dir(sample_config) - dst_directory = os.path.join(result_dir, "scout") - if not os.path.exists(dst_directory): - LOG.debug("Creating delivery_report directory") - os.makedirs(dst_directory) - - scout_config_src = os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "../../assets/scout_config_template.yaml", - ) - with open(scout_config_src, "r") as fn: - scout_config = yaml.load(fn, Loader=yaml.SafeLoader) - - deliver_wildcards = { - "bam": {"tumor": "tumor.merged.bam", "normal": "normal.merged.bam"}, - "vep": { - "vcf_cancer_sv": f"SV.somatic.{case_name}.{sv_vcf}.vcf.gz", - "vcf_cancer": f"SNV.somatic.{case_name}.{snv_vcf}.vcf.gz", - }, - "qc": "multiqc_report.html", - } - - if sample_config["analysis"]["analysis_type"] == "single": - del deliver_wildcards["bam"]["normal"] - scout_config["samples"].pop(1) - - scout_config["owner"] = customer_id - scout_config["family"] = case_name - scout_config["family_name"] = case_name - - scout_config["analysis_date"] = str(datetime.datetime.now()) - - # scout_config['vcf_cancer_sv'] = os.path.join(result_dir, 'vep', deliver_wildcards['vep']['vcf_cancer_sv']) - scout_config.pop("vcf_cancer_sv") - scout_config["vcf_cancer"] = os.path.join( - result_dir, "vep", deliver_wildcards["vep"]["vcf_cancer"] - ) - - scout_config["multiqc"] = os.path.join(result_dir, "qc", deliver_wildcards["qc"]) - - # scout sample info for tumor - scout_config["samples"][0]["bam_path"] = os.path.join( - result_dir, "bam", deliver_wildcards["bam"]["tumor"] - ) - scout_config["samples"][0]["capture_kit"] = capture_kit - scout_config["samples"][0]["sample_id"] = tumor - scout_config["samples"][0]["sample_name"] = tumor - - if sample_config["analysis"]["analysis_type"] == "paired": - scout_config["samples"][1]["bam_path"] = os.path.join( - result_dir, "bam", deliver_wildcards["bam"]["normal"] - ) - scout_config["samples"][1]["capture_kit"] = capture_kit - scout_config["samples"][1]["sample_id"] = normal - scout_config["samples"][1]["sample_name"] = normal - - scout_config_dst = os.path.join( - dst_directory, sample_config["analysis"]["case_id"] + ".scout.yaml" - ) - - LOG.debug("Creating scout config %s", scout_config_dst) - with open(scout_config_dst, "w") as f: - yaml.dump(scout_config, f, default_flow_style=False) - LOG.info("Scout config template is successfully created: %s", scout_config_dst) diff --git a/BALSAMIC/commands/plugins/vcfutils.py b/BALSAMIC/commands/plugins/vcfutils.py deleted file mode 100644 index 391a2ea71..000000000 --- a/BALSAMIC/commands/plugins/vcfutils.py +++ /dev/null @@ 
-1,132 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 -""" Command-line utilities for processing VCF files """ - -import re -from datetime import date -from datetime import datetime -import click -from cyvcf2 import VCF - - -@click.group() -def vcfutils(): - """Commands to process VCF files""" - - -##Tab-delimited input file containing specific header format.Information is stored as dictionary. -def readinput(text_file): - """Input text file processing. Outputs a dictionary""" - input_info = {} - with open(text_file, "r") as input_file: - header = input_file.readline().strip().split("\t") - for lines in iter(input_file): - lines = lines.strip().split("\t") - ref_keys = ":".join( - [ - lines[header.index("Mutation_ID")], - lines[header.index("Gene_ID")], - lines[header.index("AA_Change")], - ] - ) - input_info[ref_keys] = ";".join( - [ - lines[header.index("Average_AF%")], - lines[header.index("Variant_type")], - lines[header.index("AA_HGVS")], - ] - ) - return input_info - - -def vcfheader(): - """Write standard header lines for VCF output""" - file_date = date.today().strftime("%Y%m%d") - vcf_header = "{}".format( - """##fileformat=VCFv4.2 -##fileDate=""" - + file_date - + """ -##source=NA -##reference=NA -##contig= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n""" - ) - return vcf_header - - -def collect_ref_info(variant): - """Collect necessary variant information from input file""" - allele_freq, variant_type, aa_hgvs = variant.split(";") - info = ";".join( - [ - "VARIANT_TYPE=" + str(variant_type), - "AA_HGVS=" + str(aa_hgvs), - "AF=" + str(round(float(allele_freq) / 100, 5)), - ] - ) - return info - - -def collect_vcf_info(variant): - """Clean reference vcf and collect info from required fields""" - info = str(variant).split("\t") - info = [re.sub(r"(.*)(_ENST\d+;)", r"\1;", i) for i in info] - info = [re.sub(r"(.*)(;CNT=\d+\n)", r"\1", i) for i in info] - return info - - -@vcfutils.command() -@click.option( - "-i", - "--input_file", - required=True, - type=click.Path(exists=True), - help="tab-seperated reference text file", -) -@click.option( - "-r", - "--reference_file", - required=True, - type=click.Path(exists=True), - help="cosmic database file", -) -@click.option( - "-o", - "--output_file", - required=True, - type=click.Path(exists=False), - help="Output file name", -) -def createvcf(input_file, reference_file, output_file): - """Filter input variants from reference VCF""" - ## Exact information which is collected as keys in read_input() is retrieved - ## Compare the matching keys in both files and extract additional information - - start_time = datetime.now() - allele_freq = readinput(input_file) - filtered_variants = [] - with open(output_file, "w") as vcf_output: - vcf_output.write(vcfheader()) - for variant in VCF(reference_file): - gene_symbol = re.sub(r"(.*)(_ENST.*)", r"\1", variant.INFO.get("GENE")) - vcf_id = ":".join([variant.ID, gene_symbol, variant.INFO.get("AA")]) - vcf_id_value = allele_freq.get(vcf_id) - if vcf_id_value: - variant_info = collect_vcf_info(str(variant)) - reference_info = collect_ref_info(vcf_id_value) - info = "\t".join(variant_info) + ";" + reference_info - if info not in filtered_variants: - filtered_variants.append(info) - vcf_output.write(info + "\n") - end_time = datetime.now() - click.echo("VCF file created. 
Total Runtime:" + "{}".format(end_time - start_time)) - return filtered_variants diff --git a/BALSAMIC/commands/report/base.py b/BALSAMIC/commands/report/base.py index 39c0ff04c..ecb76e79e 100644 --- a/BALSAMIC/commands/report/base.py +++ b/BALSAMIC/commands/report/base.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +"""Balsamic report CLI.""" import click from BALSAMIC.commands.report.deliver import deliver as deliver_command @@ -7,8 +7,8 @@ @click.group() @click.pass_context -def report(context): - """Various command to create report, check status, and prepare delivery files""" +def report(context: click.Context): + """Command to generate delivery files and check analysis status.""" pass diff --git a/BALSAMIC/commands/report/deliver.py b/BALSAMIC/commands/report/deliver.py index afff77e2d..82d54a37f 100644 --- a/BALSAMIC/commands/report/deliver.py +++ b/BALSAMIC/commands/report/deliver.py @@ -1,69 +1,49 @@ +"""Balsamic report delivery CLI.""" +import json +import logging import os +import subprocess import sys -import logging -import json -import yaml +from pathlib import Path +from typing import List + import click import snakemake -import subprocess -from pathlib import Path +import yaml +from BALSAMIC.commands.options import ( + OPTION_SAMPLE_CONFIG, + OPTION_DISABLE_VARIANT_CALLER, + OPTION_DELIVERY_MODE, + OPTION_RULES_TO_DELIVER, +) +from BALSAMIC.constants.analysis import RunMode, RuleDeliveryMode +from BALSAMIC.constants.rules import DELIVERY_RULES +from BALSAMIC.models.snakemake import SnakemakeExecutable +from BALSAMIC.utils.cli import convert_deliverables_tags from BALSAMIC.utils.cli import get_file_extension from BALSAMIC.utils.cli import get_snakefile -from BALSAMIC.utils.cli import SnakeMake -from BALSAMIC.utils.cli import convert_deliverables_tags from BALSAMIC.utils.io import write_json from BALSAMIC.utils.rule import get_result_dir -from BALSAMIC.constants.workflow_params import VCF_DICT -from BALSAMIC.constants.workflow_rules import DELIVERY_RULES LOG = logging.getLogger(__name__) -@click.command( - "deliver", - short_help="Creates a YAML file with output from variant caller and alignment.", -) -@click.option( - "--sample-config", - "-s", - required=True, - help="Sample config file. Output of balsamic config sample", -) -@click.option( - "-r", - "--rules-to-deliver", - multiple=True, - help=f"Specify a rule to deliver. Delivery mode selected via --delivery-mode option." - f"Current available rules to deliver are: {', '.join(DELIVERY_RULES)} ", -) -@click.option( - "-m", - "--delivery-mode", - type=click.Choice(["a", "r"]), - default="a", - show_default=True, - help="a: append rules-to-deliver to current delivery options. " - "r: reset current rules to delivery to only the ones specified", -) -@click.option( - "--disable-variant-caller", - help=f"Run workflow with selected variant caller(s) disable. Use comma to remove multiple variant callers. Valid " - f"values are: {list(VCF_DICT.keys())}", -) +@click.command("deliver", short_help="Creates a report file with output files") +@OPTION_DELIVERY_MODE +@OPTION_DISABLE_VARIANT_CALLER +@OPTION_RULES_TO_DELIVER +@OPTION_SAMPLE_CONFIG @click.pass_context def deliver( - context, - sample_config, - rules_to_deliver, - delivery_mode, - disable_variant_caller, + context: click.Context, + delivery_mode: RuleDeliveryMode, + disable_variant_caller: str, + rules_to_deliver: List[str], + sample_config: str, ): - """ - cli for deliver sub-command. - Writes .hk in result_directory. 
- """ - LOG.info(f"BALSAMIC started with log level {context.obj['loglevel']}.") + """Deliver command to write .hk with the output analysis files.""" + LOG.info(f"BALSAMIC started with log level {context.obj['log_level']}.") LOG.debug("Reading input sample config") with open(sample_config, "r") as fn: sample_config_dict = json.load(fn) @@ -74,7 +54,7 @@ def deliver( rules_to_deliver = default_rules_to_deliver rules_to_deliver = list(rules_to_deliver) - if delivery_mode == "a": + if delivery_mode == RuleDeliveryMode.APPEND: rules_to_deliver.extend(default_rules_to_deliver) case_name = sample_config_dict["analysis"]["case_id"] @@ -88,34 +68,31 @@ def deliver( analysis_type = sample_config_dict["analysis"]["analysis_type"] analysis_workflow = sample_config_dict["analysis"]["analysis_workflow"] - reference_genome = sample_config_dict["reference"]["reference_genome"] - snakefile = get_snakefile(analysis_type, analysis_workflow, reference_genome) + snakefile = get_snakefile(analysis_type, analysis_workflow) + + report_path = Path(yaml_write_directory, f"{case_name}_report.html") + LOG.info(f"Creating report file {report_path.as_posix()}") - report_file_name = os.path.join( - yaml_write_directory, sample_config_dict["analysis"]["case_id"] + "_report.html" + LOG.info(f"Delivering {analysis_workflow} workflow...") + working_dir = Path( + sample_config_dict["analysis"]["analysis_dir"], case_name, "BALSAMIC_run" + ) + snakemake_executable: SnakemakeExecutable = SnakemakeExecutable( + case_id=case_name, + config_path=sample_config, + disable_variant_caller=disable_variant_caller, + report_path=report_path, + run_analysis=True, + run_mode=RunMode.LOCAL, + snakefile=snakefile, + snakemake_options=["--quiet"], + working_dir=working_dir, ) - LOG.info("Creating report file {}".format(report_file_name)) - - # write report.html file - report = SnakeMake() - report.case_name = case_name - report.working_dir = os.path.join( - sample_config_dict["analysis"]["analysis_dir"], - sample_config_dict["analysis"]["case_id"], - "BALSAMIC_run", + subprocess.check_output( + f"{sys.executable} -m {snakemake_executable.get_command()}".split(), + shell=False, ) - report.report = report_file_name - report.configfile = sample_config - report.snakefile = snakefile - report.run_mode = "local" - report.use_singularity = False - report.run_analysis = True - report.sm_opt = ["--quiet"] - if disable_variant_caller: - report.disable_variant_caller = disable_variant_caller - cmd = sys.executable + " -m " + report.build_cmd() - subprocess.check_output(cmd.split(), shell=False) - LOG.info(f"Workflow report file {report_file_name}") + LOG.info(f"Workflow report file {report_path.as_posix()}") snakemake.snakemake( snakefile=snakefile, @@ -144,9 +121,9 @@ def deliver( # Add Housekeeper file to report delivery_json["files"].append( { - "path": report_file_name, + "path": report_path.as_posix(), "step": "balsamic_delivery", - "format": get_file_extension(report_file_name), + "format": get_file_extension(report_path.as_posix()), "tag": ["balsamic-report"], "id": case_name, } diff --git a/BALSAMIC/commands/report/status.py b/BALSAMIC/commands/report/status.py index b3aae8033..d74f5cd9b 100644 --- a/BALSAMIC/commands/report/status.py +++ b/BALSAMIC/commands/report/status.py @@ -1,52 +1,38 @@ -import os -import logging +"""Balsamic status report CLI.""" import json +import logging +import os + import click import snakemake - -from pathlib import Path from colorclass import Color -from BALSAMIC.utils.cli import get_snakefile +from 
BALSAMIC.commands.options import ( + OPTION_SAMPLE_CONFIG, + OPTION_PRINT_FILES, + OPTION_SHOW_ONLY_MISSING_FILES, +) from BALSAMIC.utils.cli import CaptureStdout from BALSAMIC.utils.cli import get_file_status_string +from BALSAMIC.utils.cli import get_snakefile from BALSAMIC.utils.rule import get_result_dir LOG = logging.getLogger(__name__) -@click.command( - "status", - short_help="Creates a YAML file with output from variant caller and alignment.", -) -@click.option( - "-s", - "--sample-config", - required=True, - help="Sample config file. Output of balsamic config sample", -) -@click.option( - "-m", - "--show-only-missing", - is_flag=True, - default=False, - show_default=True, - help="Only show missing files.", -) -@click.option( - "-p", - "--print-files", - is_flag=True, - default=False, - show_default=True, - help="Print list of files. Otherwise only final count will be printed.", -) +@click.command("status", short_help="Print the analysis file status.") +@OPTION_PRINT_FILES +@OPTION_SAMPLE_CONFIG +@OPTION_SHOW_ONLY_MISSING_FILES @click.pass_context -def status(context, sample_config, show_only_missing, print_files): - """ - cli for status sub-command. - """ - LOG.info(f"BALSAMIC started with log level {context.obj['loglevel']}.") +def status( + context: click.Context, + print_files: bool, + sample_config: str, + show_only_missing: bool, +): + """Analysis status CLI command.""" + LOG.info(f"BALSAMIC started with log level {context.obj['log_level']}.") LOG.debug("Reading input sample config") with open(sample_config, "r") as fn: sample_config_dict = json.load(fn) @@ -54,8 +40,7 @@ def status(context, sample_config, show_only_missing, print_files): result_dir = get_result_dir(sample_config_dict) analysis_type = sample_config_dict["analysis"]["analysis_type"] analysis_workflow = sample_config_dict["analysis"]["analysis_workflow"] - reference_genome = sample_config_dict["reference"]["reference_genome"] - snakefile = get_snakefile(analysis_type, analysis_workflow, reference_genome) + snakefile = get_snakefile(analysis_type, analysis_workflow) if os.path.isfile(os.path.join(result_dir, "analysis_finish")): snakemake.snakemake( @@ -81,7 +66,7 @@ def status(context, sample_config, show_only_missing, print_files): quiet=True, ) summary = [i.split("\t") for i in summary] - summary_dict = [dict(zip(summary[0], value)) for value in summary[1:]] + summary_dict = [dict(zip(summary[1], value)) for value in summary[2:]] existing_files = set() missing_files = set() diff --git a/BALSAMIC/commands/run/analysis.py b/BALSAMIC/commands/run/analysis.py index b5abb8828..1200e43b5 100644 --- a/BALSAMIC/commands/run/analysis.py +++ b/BALSAMIC/commands/run/analysis.py @@ -1,287 +1,174 @@ -import sys -import os +"""Balsamic run analysis CLI.""" +import json import logging +import os import subprocess -import json -import yaml -import click - +import sys from pathlib import Path +from typing import List -# CLI commands and decorators +import click + +from BALSAMIC.commands.options import ( + OPTION_BENCHMARK, + OPTION_CLUSTER_ACCOUNT, + OPTION_CLUSTER_CONFIG, + OPTION_CLUSTER_MAIL, + OPTION_CLUSTER_MAIL_TYPE, + OPTION_CLUSTER_PROFILE, + OPTION_CLUSTER_QOS, + OPTION_DISABLE_VARIANT_CALLER, + OPTION_DRAGEN, + OPTION_FORCE_ALL, + OPTION_QUIET, + OPTION_RUN_ANALYSIS, + OPTION_RUN_MODE, + OPTION_SAMPLE_CONFIG, + OPTION_SNAKEFILE, + OPTION_SNAKEMAKE_OPT, +) +from BALSAMIC.constants.analysis import RunMode +from BALSAMIC.constants.cluster import ( + QOS, + ClusterConfigType, + ClusterMailType, + ClusterProfile, +) 
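# A minimal sketch of how the shared OPTION_* decorators compose onto a click
# command, the pattern the rewritten `deliver`, `status`, and `analysis`
# commands above all follow. The `inspect-files` subcommand and its callback
# are hypothetical, shown only to illustrate that each OPTION_* constant is an
# ordinary decorator returned by click.option:
import click

from BALSAMIC.commands.options import OPTION_PRINT_FILES, OPTION_SAMPLE_CONFIG


@click.command("inspect-files")  # hypothetical subcommand
@OPTION_PRINT_FILES
@OPTION_SAMPLE_CONFIG
@click.pass_context
def inspect_files(context: click.Context, print_files: bool, sample_config: str):
    """Toy callback: parameters map one-to-one to the composed options."""
    click.echo(f"sample_config={sample_config}, print_files={print_files}")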
+from BALSAMIC.models.config import ConfigModel +from BALSAMIC.models.snakemake import SnakemakeExecutable +from BALSAMIC.utils.analysis import get_singularity_bind_paths from BALSAMIC.utils.cli import ( createDir, - get_schedulerpy, + get_config_path, get_snakefile, - SnakeMake, - get_config, - get_fastq_bind_path, job_id_dump_to_yaml, ) -from BALSAMIC.constants.common import ANALYSIS_TYPES, BALSAMIC_SCRIPTS -from BALSAMIC.constants.workflow_params import VCF_DICT LOG = logging.getLogger(__name__) -@click.command( - "analysis", short_help="Run the analysis on a provided sample config-file" -) -@click.option( - "-S", - "--snake-file", - type=click.Path(), - show_default=True, - help=( - "Input for a custom snakefile. WARNING: " - "This is for internal testing, and should " - "not be used. Providing a snakefile supersedes" - "analysis_type option." - ), -) -@click.option( - "-s", - "--sample-config", - required=True, - type=click.Path(), - help="Sample json config file.", -) -@click.option( - "--run-mode", - show_default=True, - default="cluster", - type=click.Choice(["local", "cluster"]), - help=( - "Run mode to use. By default SLURM will be used to " - "run the analysis. But local runner also available " - "for local computing" - ), -) -@click.option( - "-c", - "--cluster-config", - show_default=True, - default=get_config("cluster"), - type=click.Path(), - help="cluster config json file. (eg- SLURM, QSUB)", -) -@click.option( - "--dragen", is_flag=True, default=False, help="Enable dragen variant caller" -) -@click.option( - "-p", - "--profile", - default="slurm", - type=click.Choice(["slurm", "qsub"]), - help="cluster profile to submit jobs", -) -@click.option( - "--benchmark", - default=False, - is_flag=True, - help="Profile slurm jobs using the value of this option. Make sure you have slurm profiler enabled in your HPC.", -) -@click.option( - "-r", - "--run-analysis", - show_default=True, - default=False, - is_flag=True, - help=( - "By default balsamic run_analysis will run in " - "dry run mode. Raise thise flag to make the " - "actual analysis" - ), -) -@click.option( - "--qos", - type=click.Choice(["low", "normal", "high", "express"]), - show_default=True, - default="low", - help="QOS for sbatch jobs. Passed to " + get_schedulerpy(), -) -@click.option( - "-f", - "--force-all", - show_default=True, - default=False, - is_flag=True, - help="Force run all analysis. This is same as snakemake --forceall", -) -@click.option( - "--snakemake-opt", multiple=True, help="Pass these options directly to snakemake" -) -@click.option( - "--account", - "--slurm-account", - "--qsub-account", - help="cluster account to run jobs, ie: slurm_account", -) -@click.option( - "--mail-user", help="cluster mail user to send out email. e.g.: slurm_mail_user" -) -@click.option( - "-q", - "--quiet", - default=False, - is_flag=True, - help="Instruct snakemake to be quiet! No output will be printed", -) -@click.option( - "--mail-type", - type=click.Choice( - [ - "NONE", - "BEGIN", - "END", - "FAIL", - "REQUEUE", - "ALL", - "TIME_LIMIT", - ] - ), - help=( - "cluster mail type to send out email. This will " - "be applied to all jobs and override snakemake settings." - ), -) -@click.option( - "--disable-variant-caller", - help=( - f"Run workflow with selected variant caller(s) disable." - f"Use comma to remove multiple variant callers. 
Valid " - f"values are: {list(VCF_DICT.keys())}" - ), -) +@click.command("analysis", short_help="Run the analysis on a sample config-file") +@OPTION_BENCHMARK +@OPTION_CLUSTER_ACCOUNT +@OPTION_CLUSTER_CONFIG +@OPTION_CLUSTER_MAIL +@OPTION_CLUSTER_MAIL_TYPE +@OPTION_CLUSTER_PROFILE +@OPTION_CLUSTER_QOS +@OPTION_DISABLE_VARIANT_CALLER +@OPTION_DRAGEN +@OPTION_FORCE_ALL +@OPTION_QUIET +@OPTION_RUN_ANALYSIS +@OPTION_RUN_MODE +@OPTION_SAMPLE_CONFIG +@OPTION_SNAKEFILE +@OPTION_SNAKEMAKE_OPT @click.pass_context def analysis( - context, - snake_file, - sample_config, - run_mode, - cluster_config, - run_analysis, - force_all, - snakemake_opt, - mail_type, - mail_user, - account, - qos, - profile, - disable_variant_caller, - quiet, - dragen, - benchmark, + context: click.Context, + snakefile: Path, + sample_config: Path, + run_mode: RunMode, + cluster_config: Path, + benchmark: bool, + dragen: bool, + profile: ClusterProfile, + run_analysis: bool, + qos: QOS, + force_all: bool, + snakemake_opt: List[str], + account: str, + mail_user: str, + mail_type: ClusterMailType, + quiet: bool, + disable_variant_caller: str, ): - """ - Runs BALSAMIC workflow on the provided sample's config file - """ - LOG.info(f"BALSAMIC started with log level {context.obj['loglevel']}.") + """Run BALSAMIC workflow on the provided sample's config file.""" + LOG.info(f"BALSAMIC started with log level {context.obj['log_level']}.") - if run_mode == "cluster" and not run_analysis: + if run_mode == RunMode.CLUSTER and not run_analysis: LOG.info("Changing run-mode to local on dry-run") - run_mode = "local" + run_mode: RunMode = RunMode.LOCAL - if run_mode == "cluster" and not account: + if run_mode == RunMode.CLUSTER and not account: LOG.info( "slurm-account, qsub-account, or account is required for slurm run mode" ) raise click.Abort() - sample_config_path = os.path.abspath(sample_config) - - with open(sample_config, "r") as sample_fh: + sample_config_path: Path = Path(sample_config).absolute() + with open(sample_config_path, "r") as sample_fh: sample_config = json.load(sample_fh) - logpath = sample_config["analysis"]["log"] - scriptpath = sample_config["analysis"]["script"] - resultpath = sample_config["analysis"]["result"] - benchmarkpath = sample_config["analysis"]["benchmark"] - case_name = sample_config["analysis"]["case_id"] + # Initialize balsamic model to run validation tests + config_model = ConfigModel.model_validate(sample_config) - if run_analysis: - # if not dry run, then create (new) log/script directory - for dirpath, dirnames, files in os.walk(logpath): - if files: - logpath = createDir(logpath, []) - scriptpath = createDir(scriptpath, []) - sample_config["analysis"]["benchmark"] = createDir(benchmarkpath, []) + case_name = config_model.analysis.case_id - # Create result directory - os.makedirs(resultpath, exist_ok=True) + # Create directories for results, logs, scripts and benchmark files + result_path: Path = Path(config_model.analysis.result) + log_path: Path = Path(config_model.analysis.log) + script_path: Path = Path(config_model.analysis.script) + benchmark_path: Path = Path(config_model.analysis.benchmark) - if not os.path.exists(logpath): - os.makedirs(logpath, exist_ok=True) - os.makedirs(scriptpath, exist_ok=True) - os.makedirs(benchmarkpath, exist_ok=True) + analysis_directories_list = [result_path, log_path, script_path, benchmark_path] - analysis_type = sample_config["analysis"]["analysis_type"] - analysis_workflow = sample_config["analysis"]["analysis_workflow"] - reference_genome = 
sample_config["reference"]["reference_genome"] + for analysis_sub_dir in analysis_directories_list: + analysis_sub_dir.mkdir(exist_ok=True) - # Singularity bind path - bind_path = list() - bind_path.append(str(Path(__file__).parents[2] / "assets")) - bind_path.append(os.path.commonpath(sample_config["reference"].values())) - if "panel" in sample_config: - bind_path.append(sample_config.get("panel").get("capture_kit")) - if "background_variants" in sample_config: - bind_path.append(sample_config.get("background_variants")) - if "pon_cnn" in sample_config: - bind_path.append(sample_config.get("panel").get("pon_cnn")) - bind_path.append(BALSAMIC_SCRIPTS) - bind_path.append(sample_config["analysis"]["analysis_dir"]) - bind_path.extend(get_fastq_bind_path(sample_config["analysis"]["fastq_path"])) + if run_analysis: + # if not dry run, and current existing log-dir is not empty, then create (new) log/script directory + existing_log_files = os.listdir(log_path.as_posix()) + if existing_log_files: + log_path = Path(createDir(log_path.as_posix(), [])) + script_path = Path(createDir(script_path.as_posix(), [])) + + for analysis_sub_dir in analysis_directories_list: + analysis_sub_dir.mkdir(exist_ok=True) + + analysis_type = config_model.analysis.analysis_type + analysis_workflow = config_model.analysis.analysis_workflow + analysis_dir: Path = Path(config_model.analysis.analysis_dir) + snakefile: Path = ( + snakefile if snakefile else get_snakefile(analysis_type, analysis_workflow) + ) - # Construct snakemake command to run workflow - balsamic_run = SnakeMake() - balsamic_run.case_name = case_name - balsamic_run.working_dir = ( - Path( - sample_config["analysis"]["analysis_dir"], case_name, "BALSAMIC_run" - ).as_posix() - + "/" + LOG.info(f"Starting {analysis_workflow} workflow...") + snakemake_executable: SnakemakeExecutable = SnakemakeExecutable( + account=account, + benchmark=benchmark, + case_id=case_name, + cluster_config_path=cluster_config + if cluster_config + else get_config_path(ClusterConfigType.ANALYSIS), + config_path=sample_config_path, + disable_variant_caller=disable_variant_caller, + dragen=dragen, + force=force_all, + log_dir=log_path.as_posix(), + mail_type=mail_type, + mail_user=mail_user, + profile=profile, + qos=qos, + quiet=quiet, + result_dir=result_path.as_posix(), + run_analysis=run_analysis, + run_mode=run_mode, + script_dir=script_path.as_posix(), + singularity_bind_paths=get_singularity_bind_paths(sample_config), + snakefile=snakefile, + snakemake_options=snakemake_opt, + working_dir=Path(analysis_dir, case_name, "BALSAMIC_run"), ) - balsamic_run.snakefile = ( - snake_file - if snake_file - else get_snakefile(analysis_type, analysis_workflow, reference_genome) + subprocess.run( + f"{sys.executable} -m {snakemake_executable.get_command()}", + shell=True, ) - balsamic_run.configfile = sample_config_path - balsamic_run.run_mode = run_mode - balsamic_run.cluster_config = cluster_config - balsamic_run.scheduler = get_schedulerpy() - balsamic_run.profile = profile - balsamic_run.log_path = logpath - balsamic_run.script_path = scriptpath - balsamic_run.result_path = resultpath - balsamic_run.qos = qos - balsamic_run.account = account - if mail_type: - balsamic_run.mail_type = mail_type - balsamic_run.mail_user = mail_user - balsamic_run.forceall = force_all - balsamic_run.run_analysis = run_analysis - balsamic_run.quiet = quiet - # Always use singularity - balsamic_run.use_singularity = True - balsamic_run.singularity_bind = bind_path - balsamic_run.sm_opt = snakemake_opt - if 
benchmark and profile == "slurm": - balsamic_run.slurm_profiler = "task" - - if disable_variant_caller: - balsamic_run.disable_variant_caller = disable_variant_caller - - if dragen: - balsamic_run.dragen = dragen - - cmd = sys.executable + " -m " + balsamic_run.build_cmd() - subprocess.run(cmd, shell=True) if run_analysis and run_mode == "cluster": - jobid_dump = os.path.join( - logpath, sample_config["analysis"]["case_id"] + ".sacct" + jobid_dump = Path(log_path, f"{case_name}.sacct") + jobid_yaml = Path(result_path, f"{profile}_jobids.yaml") + job_id_dump_to_yaml( + job_id_dump=jobid_dump, job_id_yaml=jobid_yaml, case_name=case_name ) - jobid_yaml = os.path.join(resultpath, profile + "_jobids.yaml") - job_id_dump_to_yaml(jobid_dump, jobid_yaml, case_name) diff --git a/BALSAMIC/commands/run/base.py b/BALSAMIC/commands/run/base.py index c1c999398..d19f07ef9 100644 --- a/BALSAMIC/commands/run/base.py +++ b/BALSAMIC/commands/run/base.py @@ -1,13 +1,14 @@ +"""Balsamic run CLI.""" import click -from BALSAMIC.commands.run.analysis import analysis as run_analysis_cmd +from BALSAMIC.commands.run.analysis import analysis as run_analysis_command @click.group() @click.pass_context -def run(context): - "Run BALSAMIC on a provided config file" +def run(context: click.Context): + """Run Balsamic analysis on a provided configuration file.""" pass -run.add_command(run_analysis_cmd) +run.add_command(run_analysis_command) diff --git a/BALSAMIC/conda/balsamic.yaml b/BALSAMIC/conda/balsamic.yaml index c8c593bda..2758a237f 100644 --- a/BALSAMIC/conda/balsamic.yaml +++ b/BALSAMIC/conda/balsamic.yaml @@ -1,9 +1,8 @@ channels: - conda-forge - - bioconda - defaults dependencies: - - conda-forge::python=3.7 + - conda-forge::python=3.11 - conda-forge::pip - conda-forge::pygraphviz diff --git a/BALSAMIC/config/analysis.json b/BALSAMIC/config/analysis.json deleted file mode 100644 index 0a9a16426..000000000 --- a/BALSAMIC/config/analysis.json +++ /dev/null @@ -1,74 +0,0 @@ -{ - "QC": { - "picard_rmdup": false, - "adapter": - "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT", - "min_seq_length": "25", - "quality_trim": false, - "adapter_trim": false, - "umi_trim": false, - "umi_trim_length": "5" - }, - "vcf": { - "manta": { - "mutation": "somatic", - "type": "SV" - }, - "cnvkit": { - "mutation": "somatic", - "type": "CNV" - }, - "vardict": { - "mutation": "somatic", - "type": "SNV" - }, - "mutect": { - "mutation": "somatic", - "type": "SNV" - }, - "tnscope": { - "mutation": "somatic", - "type": "SNV" - }, - "tnhaplotyper": { - "mutation": "somatic", - "type": "SNV" - }, - "dnascope": { - "mutation": "germline", - "type": "SNV" - }, - "manta_germline": { - "mutation": "germline", - "type": "SV" - }, - "vcfmerge":{ - "mutation": "somatic", - "type": "SNV" - }, - "dellysv":{ - "mutation": "somatic", - "type": "SV" - }, - "tiddit":{ - "mutation": "somatic", - "type": "SV" - }, - "dellycnv":{ - "mutation": "somatic", - "type": "CNV" - }, - "ascat": { - "mutation": "somatic", - "type": "SV" - }, - "cnvpytor": { - "mutation": "somatic", - "type": "CNV" - }, - "svdb": { - "mutation": "somatic", - "type": "SV" - } - } -} diff --git a/BALSAMIC/config/balsamic_env.yaml b/BALSAMIC/config/balsamic_env.yaml deleted file mode 100644 index 5bb818a69..000000000 --- a/BALSAMIC/config/balsamic_env.yaml +++ /dev/null @@ -1,38 +0,0 @@ -align_qc: - - bedtools - - bwa - - fastqc - - samtools - - picard - - multiqc - - fastp - - csvkit -annotate: - - ensembl-vep - - vcfanno -coverage_qc: - - sambamba - - mosdepth -varcall_py3: - - 
bcftools - - tabix - - gatk - - vardict - - libiconv - - svdb - - tiddit -varcall_py27: - - manta -varcall_cnvkit: - - cnvkit - - purecn -vcf_merge: - - vcfmerge -delly: - - delly -ascatNgs: - - ascat -vcf2cytosure: - - vcf2cytosure -cnvpytor: - - cnvpytor diff --git a/BALSAMIC/config/cluster_minimal.json b/BALSAMIC/config/cluster_minimal.json deleted file mode 100644 index 17c3407be..000000000 --- a/BALSAMIC/config/cluster_minimal.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "__default__": { - "name": "BALSAMIC.{rule}.{wildcards}", - "time": "03:00:00", - "n": 6, - "mail_type": "FAIL", - "partition": "core" - }, - "all": { - "mail_type": "END", - "time": "00:15:00", - "n": 1 - }, - "dragen_align_call_tumor_only": { - "time": "10:00:00", - "n": 24, - "partition": "dragen" - } -} diff --git a/BALSAMIC/config/reference_cluster.json b/BALSAMIC/config/reference_cluster.json deleted file mode 100644 index 2a1058600..000000000 --- a/BALSAMIC/config/reference_cluster.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "__default__": { - "name": "BALSAMIC.{rule}.{wildcards}", - "n": 8, - "time": "12:00:00", - "mail_type": "FAIL", - "partition": "core" - }, - "all": { - "n": 1, - "time": "00:15:00", - "mail_type": "END" - }, - "download_reference": { - "n": 24, - "time": "16:00:00", - "mail_type": "FAIL", - "partition": "core" - }, - "download_container": { - "n": 16, - "time": "4:00:00", - "mail_type": "FAIL", - "partition": "core" - } -} diff --git a/BALSAMIC/config/sample.json b/BALSAMIC/config/sample.json deleted file mode 100644 index 5b5a614b7..000000000 --- a/BALSAMIC/config/sample.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "analysis": { - "case_id": "CASE_BASE_NAME", - "analysis_type": "ANALYSIS_TYPE[paired, single]", - "analysis_dir": "BASE_DIR_RESULTS", - "fastq_path": "BASE_PATH_TO_FASTQ", - "script": "scripts/", - "log": "logs/", - "result": "analysis/" - }, - "samples": { - "BASE_NAME_FOR_FIRST_SAMPLE": { - "file_prefix": "{LANE}_{DATE}_{FLOW_CELL}_{IDN}_{BARCODE_SEQ}", - "type": "[normal, tumor]", - "readpair_suffix": ["1", "2"] - }, - "BASE_NAME_FOR_SECOND_SAMPLE": { - "file_prefix": "{LANE}_{DATE}_{FLOW_CELL}_{IDN}_{BARCODE SEQ}", - "type": "[normal, tumor]", - "readpair_suffix": ["1", "2"] - } - } -} diff --git a/BALSAMIC/constants/analysis.py b/BALSAMIC/constants/analysis.py new file mode 100644 index 000000000..11d2ffd41 --- /dev/null +++ b/BALSAMIC/constants/analysis.py @@ -0,0 +1,188 @@ +"""Balsamic analysis workflow constants.""" +from enum import StrEnum +from typing import Dict, List + +from BALSAMIC.constants.cache import DockerContainers + + +class RunMode(StrEnum): + """Balsamic workflow run mode.""" + + CLUSTER: str = "cluster" + LOCAL: str = "local" + + +RUN_MODES: List[RunMode] = [mode for mode in RunMode] + + +class Gender(StrEnum): + """Sex options.""" + + FEMALE: str = "female" + MALE: str = "male" + + +class AnalysisType(StrEnum): + """Supported analysis types.""" + + PAIRED: str = "paired" + PON: str = "pon" + SINGLE: str = "single" + + +class AnalysisWorkflow(StrEnum): + """Available Balsamic workflows.""" + + BALSAMIC: str = "balsamic" + BALSAMIC_QC: str = "balsamic-qc" + BALSAMIC_UMI: str = "balsamic-umi" + + +ANALYSIS_WORKFLOWS: List[AnalysisWorkflow] = [workflow for workflow in AnalysisWorkflow] + + +class SequencingType(StrEnum): + """Sequencing carried out.""" + + TARGETED: str = "targeted" + WGS: str = "wgs" + + +class SampleType(StrEnum): + """Balsamic sample type inputs.""" + + NORMAL: str = "normal" + TUMOR: str = "tumor" + + +class MutationOrigin(StrEnum): + 
"""Variations present in a sample.""" + + GERMLINE: str = "germline" + SOMATIC: str = "somatic" + + +class MutationType(StrEnum): + """Types of variations present in a sample.""" + + CNV: str = "CNV" + SNV: str = "SNV" + SV: str = "SV" + + +class WorkflowSolution(StrEnum): + """Solution applied to a specific part of the analysis.""" + + BALSAMIC: str = "BALSAMIC" + DRAGEN: str = "DRAGEN" + SENTIEON: str = "Sentieon" + SENTIEON_UMI: str = "Sentieon_umi" + + +class PONWorkflow(StrEnum): + """Panel Of Normal creation workflow type.""" + + CNVKIT: str = "CNVkit" + GENS_MALE: str = "GENS_male" + GENS_FEMALE: str = "GENS_female" + + +PON_WORKFLOWS: List[PONWorkflow] = [workflow for workflow in PONWorkflow] + + +class RuleDeliveryMode(StrEnum): + """Rules to deliver mode.""" + + APPEND: str = "append" + RESET: str = "reset" + + +RULE_DELIVERY_MODES: List[RuleDeliveryMode] = [mode for mode in RuleDeliveryMode] + + +class BioinfoTools(StrEnum): + """List of bioinformatics tools in Balsamic.""" + + ASCAT: str = "ascatNgs" + BCFTOOLS: str = "bcftools" + BEDTOOLS: str = "bedtools" + BGZIP: str = "bgzip" + BWA: str = "bwa" + CNVKIT: str = "cnvkit" + CNVPYTOR: str = "cnvpytor" + COMPRESS: str = "compress" + CSVKIT: str = "csvkit" + DELLY: str = "delly" + VEP: str = "ensembl-vep" + FASTP: str = "fastp" + FASTQC: str = "fastqc" + GATK: str = "gatk" + GENMOD: str = "genmod" + MANTA: str = "manta" + MOSDEPTH: str = "mosdepth" + MULTIQC: str = "multiqc" + PICARD: str = "picard" + SAMBAMBA: str = "sambamba" + SAMTOOLS: str = "samtools" + SOMALIER: str = "somalier" + SVDB: str = "svdb" + TABIX: str = "tabix" + TIDDIT: str = "tiddit" + VARDICT: str = "vardict" + VCF2CYTOSURE: str = "vcf2cytosure" + VCFANNO: str = "vcfanno" + CADD: str = "cadd" + PURECN: str = "purecn" + + +class FastqName(StrEnum): + """Fastq name parameters.""" + + FWD: str = "fwd" + REV: str = "rev" + + +FASTQ_SUFFIXES: Dict[str, Dict] = { + "1": {"fwd": "_1.fastq.gz", "rev": "_2.fastq.gz"}, + "2": {"fwd": "_R1_001.fastq.gz", "rev": "_R2_001.fastq.gz"}, +} + + +class PonParams: + """Parameters related to the PON creation workflow.""" + + MIN_PON_SAMPLES: int = 6 + + +BIOINFO_TOOL_ENV: Dict[str, str] = { + BioinfoTools.BEDTOOLS: DockerContainers.ALIGN_QC, + BioinfoTools.BWA: DockerContainers.ALIGN_QC, + BioinfoTools.COMPRESS: DockerContainers.ALIGN_QC, + BioinfoTools.FASTQC: DockerContainers.ALIGN_QC, + BioinfoTools.SAMTOOLS: DockerContainers.ALIGN_QC, + BioinfoTools.PICARD: DockerContainers.ALIGN_QC, + BioinfoTools.MULTIQC: DockerContainers.ALIGN_QC, + BioinfoTools.FASTP: DockerContainers.ALIGN_QC, + BioinfoTools.CSVKIT: DockerContainers.ALIGN_QC, + BioinfoTools.VEP: DockerContainers.ANNOTATE, + BioinfoTools.GENMOD: DockerContainers.ANNOTATE, + BioinfoTools.VCFANNO: DockerContainers.ANNOTATE, + BioinfoTools.SAMBAMBA: DockerContainers.COVERAGE_QC, + BioinfoTools.MOSDEPTH: DockerContainers.COVERAGE_QC, + BioinfoTools.BCFTOOLS: DockerContainers.PYTHON_3, + BioinfoTools.TABIX: DockerContainers.PYTHON_3, + BioinfoTools.BGZIP: DockerContainers.PYTHON_3, + BioinfoTools.VARDICT: DockerContainers.PYTHON_3, + BioinfoTools.SVDB: DockerContainers.PYTHON_3, + BioinfoTools.TIDDIT: DockerContainers.PYTHON_3, + BioinfoTools.CNVPYTOR: DockerContainers.CNVPYTOR, + BioinfoTools.MANTA: DockerContainers.PYTHON_27, + BioinfoTools.CNVKIT: DockerContainers.CNVKIT, + BioinfoTools.DELLY: DockerContainers.DELLY, + BioinfoTools.ASCAT: DockerContainers.ASCAT, + BioinfoTools.VCF2CYTOSURE: DockerContainers.VCF2CYTOSURE, + BioinfoTools.SOMALIER: DockerContainers.SOMALIER, + 
BioinfoTools.CADD: DockerContainers.CADD, + BioinfoTools.PURECN: DockerContainers.PURECN, + BioinfoTools.GATK: DockerContainers.GATK, +} diff --git a/BALSAMIC/constants/cache.py b/BALSAMIC/constants/cache.py new file mode 100644 index 000000000..d09aaaabc --- /dev/null +++ b/BALSAMIC/constants/cache.py @@ -0,0 +1,446 @@ +"""Balsamic cache specific constants.""" +from enum import StrEnum +from typing import Dict, List + +from BALSAMIC.constants.constants import FileType + +DOCKER_URL: str = "docker://clinicalgenomics/balsamic" +VEP_PLUGINS: str = "all" + + +class GenomeVersion(StrEnum): + """Reference genome versions.""" + + HG19: str = "hg19" + HG38: str = "hg38" + CanFam3: str = "canfam3" + + +GENOME_VERSIONS: List[GenomeVersion] = [version for version in GenomeVersion] + + +class GRCHVersion(StrEnum): + """Genome Reference Consortium Human Reference versions.""" + + GRCH37: str = "GRCh37" + GRCH38: str = "GRCh38" + + +class Species(StrEnum): + """A class representing different species.""" + + HOMO_SAPIENS: str = "homo_sapiens_merged" + + +class CacheVersion(StrEnum): + """Balsamic cache versions.""" + + DEVELOP: str = "develop" + + +class DockerContainers(StrEnum): + """Docker containers names.""" + + ALIGN_QC: str = "align_qc" + ANNOTATE: str = "annotate" + ASCAT: str = "ascatNgs" + CADD: str = "cadd" + CNVKIT: str = "cnvkit" + CNVPYTOR: str = "cnvpytor" + COVERAGE_QC: str = "coverage_qc" + DELLY: str = "delly" + GATK: str = "gatk" + HTSLIB: str = "htslib" + PURECN: str = "purecn" + PYTHON_3: str = "varcall_py3" + PYTHON_27: str = "varcall_py27" + SOMALIER: str = "somalier" + VCF2CYTOSURE: str = "vcf2cytosure" + + +REFERENCE_FILES: Dict[GenomeVersion, Dict[str, dict]] = { + GenomeVersion.HG19: { + "reference_genome": { + "url": "gs://gatk-legacy-bundles/b37/human_g1k_v37.fasta.gz", + "file_type": FileType.FASTA, + "gzip": True, + "file_name": "human_g1k_v37.fasta", + "dir_name": "genome", + }, + "dbsnp": { + "url": "gs://gatk-legacy-bundles/b37/dbsnp_138.b37.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "dbsnp_grch37_b138.vcf", + "dir_name": "variants", + }, + "hc_vcf_1kg": { + "url": "gs://gatk-legacy-bundles/b37/1000G_phase1.snps.high_confidence.b37.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "1kg_phase1_snps_high_confidence_b37.vcf", + "dir_name": "variants", + }, + "mills_1kg": { + "url": "gs://gatk-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "mills_1kg_index.vcf", + "dir_name": "variants", + }, + "known_indel_1kg": { + "url": "gs://gatk-legacy-bundles/b37/1000G_phase1.indels.b37.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "1kg_known_indels_b37.vcf", + "dir_name": "variants", + }, + "vcf_1kg": { + "url": "gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "1k_genome_wgs_p1_v3_all_sites.vcf", + "dir_name": "variants", + }, + "gnomad_variant": { + "url": "gs://gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.vcf.bgz", + "file_type": FileType.VCF, + "gzip": False, + "file_name": "gnomad.genomes.r2.1.1.sites.vcf.bgz", + "dir_name": "variants", + }, + "gnomad_variant_index": { + "url": "gs://gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.vcf.bgz.tbi", + "file_type": FileType.VCF, + "gzip": False, + "file_name": 
"gnomad.genomes.r2.1.1.sites.vcf.bgz.tbi", + "dir_name": "variants", + }, + "cosmic": { + "url": "https://cancer.sanger.ac.uk/cosmic/file_download/GRCh37/cosmic/v97/VCF/CosmicCodingMuts.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "cosmic_coding_muts_v97.vcf", + "dir_name": "variants", + }, + "wgs_calling_regions": { + "url": "gs://gatk-legacy-bundles/b37/wgs_calling_regions.v1.interval_list", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "wgs_calling_regions.v1", + "dir_name": "genome", + }, + "genome_chrom_size": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.chrom.sizes", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "hg19.chrom.sizes", + "dir_name": "genome", + }, + "refgene_txt": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz", + "file_type": FileType.TXT, + "gzip": True, + "file_name": "refGene.txt", + "dir_name": "genome", + }, + "refgene_sql": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.sql", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "refGene.sql", + "dir_name": "genome", + }, + "rank_score": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/master/cancer/rank_model/cancer_rank_model_-v0.1-.ini", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "cancer_rank_model_-v0.1-.ini", + "dir_name": "genome", + }, + "access_regions": { + "url": "https://raw.githubusercontent.com/etal/cnvkit/master/data/access-5k-mappable.hg19.bed", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "access_5kb_hg19.txt", + "dir_name": "genome", + }, + "delly_exclusion": { + "url": "https://raw.githubusercontent.com/dellytools/delly/master/excludeTemplates/human.hg19.excl.tsv", + "file_type": FileType.TSV, + "gzip": False, + "file_name": "delly_exclusion.tsv", + "dir_name": "genome", + }, + "delly_mappability": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh37.delly.blacklist.gz", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "delly_mappability.gz", + "dir_name": "genome", + }, + "delly_mappability_gindex": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh37.delly.blacklist.gz.gzi", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "delly_mappability.gz.gzi", + "dir_name": "genome", + }, + "delly_mappability_findex": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh37.delly.blacklist.gz.fai", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "delly_mappability.gz.fai", + "dir_name": "genome", + }, + "ascat_gc_correction": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/12a6c760fd542c02de2cda286b6245e46f4b6a97/cancer/references/GRCh37_SnpGcCorrections.tsv.gz", + "file_type": FileType.TSV, + "gzip": True, + "file_name": "GRCh37_SnpGcCorrections.tsv", + "dir_name": "genome", + }, + "ascat_chr_y_loci": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/12a6c760fd542c02de2cda286b6245e46f4b6a97/cancer/references/GRCh37_d5_Y.loci", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "GRCh37_Y.loci", + "dir_name": "genome", + }, + "clinvar": { + "url": 
"https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "clinvar.vcf", + "dir_name": "variants", + }, + "somalier_sites": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/87f22d3f458569afbcb4d7f1588468d21d1751fb/cancer/references/GRCh37.somalier.sites.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "GRCh37.somalier.sites.vcf", + "dir_name": "variants", + }, + "cadd_snv": { + "url": "https://kircherlab.bihealth.org/download/CADD/v1.6/GRCh37/whole_genome_SNVs.tsv.gz", + "file_type": FileType.TSV, + "gzip": False, + "file_name": "hg19.cadd_snv.tsv.gz", + "dir_name": "variants", + }, + "simple_repeat": { + "url": "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/simpleRepeat.txt.gz", + "file_type": FileType.TSV, + "gzip": False, + "file_name": "hg19.simple_repeat.txt.gz", + "dir_name": "variants", + }, + }, + GenomeVersion.HG38: { + "reference_genome": { + "url": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta", + "file_type": FileType.FASTA, + "gzip": False, + "file_name": "Homo_sapiens_assembly38.fasta", + "dir_name": "genome", + }, + "dbsnp": { + "url": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", + "file_type": FileType.VCF, + "gzip": False, + "file_name": "Homo_sapiens_assembly38.dbsnp138.vcf", + "dir_name": "variants", + }, + "hc_vcf_1kg": { + "url": "gs://genomics-public-data/resources/broad/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "1000G_phase1.snps.high_confidence.hg38.vcf", + "dir_name": "variants", + }, + "mills_1kg": { + "url": "gs://genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "Mills_and_1000G_gold_standard.indels.hg38.vcf", + "dir_name": "variants", + }, + "known_indel_1kg": { + "url": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "Homo_sapiens_assembly38.known_indels.vcf", + "dir_name": "variants", + }, + "vcf_1kg": { + "url": "gs://genomics-public-data/resources/broad/hg38/v0/1000G.phase3.integrated.sites_only.no_MATCHED_REV.hg38.vcf", + "file_type": FileType.VCF, + "gzip": False, + "file_name": "1000G.phase3.integrated.sites_only.no_MATCHED_REV.hg38.vcf", + "dir_name": "variants", + }, + "gnomad_variant": { + "url": "gs://gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz", + "file_type": FileType.VCF, + "gzip": False, + "file_name": "gnomad.genomes.r2.1.1.sites.vcf.bgz", + "dir_name": "variants", + }, + "gnomad_variant_index": { + "url": "gs://gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi", + "file_type": FileType.VCF, + "gzip": False, + "file_name": "gnomad.genomes.r2.1.1.sites.vcf.bgz.tbi", + "dir_name": "variants", + }, + "cosmic": { + "url": "https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/VCF/CosmicCodingMuts.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "cosmic_coding_muts_v97.vcf", + "dir_name": "variants", + }, + "wgs_calling_regions": { + "url": "gs://genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list", + "file_type": FileType.TXT, + "gzip": False, + 
"file_name": "wgs_calling_regions.v1", + "dir_name": "genome", + }, + "genome_chrom_size": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "hg38.chrom.sizes", + "dir_name": "genome", + }, + "refgene_txt": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/refGene.txt.gz", + "file_type": FileType.TXT, + "gzip": True, + "file_name": "refGene.txt", + "dir_name": "genome", + }, + "refgene_sql": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/refGene.sql", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "refGene.sql", + "dir_name": "genome", + }, + "rank_score": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/master/cancer/rank_model/cancer_rank_model_-v0.1-.ini", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "cancer_rank_model_-v0.1-.ini", + "dir_name": "genome", + }, + "access_regions": { + "url": "https://raw.githubusercontent.com/etal/cnvkit/master/data/access-5k-mappable.hg19.bed", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "access_5kb_hg38.txt", + "dir_name": "genome", + }, + "delly_exclusion": { + "url": "https://raw.githubusercontent.com/dellytools/delly/master/excludeTemplates/human.hg38.excl.tsv", + "file_type": FileType.TSV, + "gzip": False, + "file_name": "delly_exclusion.tsv", + "dir_name": "genome", + }, + "delly_mappability": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/ea051b864d18945980f0ded6b16a5d192bd736a5/cancer/references/GRCh38.delly.blacklist.gz", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "delly_mappability.gz", + "dir_name": "genome", + }, + "delly_mappability_gindex": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/ea051b864d18945980f0ded6b16a5d192bd736a5/cancer/references/GRCh38.delly.blacklist.gz.gzi", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "delly_mappability.gz.gzi", + "dir_name": "genome", + }, + "delly_mappability_findex": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/ea051b864d18945980f0ded6b16a5d192bd736a5/cancer/references/GRCh38.delly.blacklist.gz.fai", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "delly_mappability.gz.fai", + "dir_name": "genome", + }, + "ascat_gc_correction": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/35465e2644f76f2d59427a9b379d34ecea71f259/cancer/references/hg38_SnpGcCorrections.tsv.gz", + "file_type": FileType.TSV, + "gzip": True, + "file_name": "hg38_SnpGcCorrections.tsv", + "dir_name": "genome", + }, + "ascat_chr_y_loci": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/ea051b864d18945980f0ded6b16a5d192bd736a5/cancer/references/hg38_Y.loci", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "hg38_Y.loci", + "dir_name": "genome", + }, + "clinvar": { + "url": "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "clinvar.vcf", + "dir_name": "variants", + }, + "somalier_sites": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/87f22d3f458569afbcb4d7f1588468d21d1751fb/cancer/references/hg38.somalier.sites.vcf.gz", + "file_type": FileType.VCF, + "gzip": True, + "file_name": "hg38.somalier.sites.vcf", + "dir_name": "variants", + }, + "cadd_snv": { + "url": 
"https://kircherlab.bihealth.org/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz", + "file_type": FileType.TSV, + "gzip": False, + "file_name": "hg38.cadd_snv.tsv.gz", + "dir_name": "variants", + }, + "simple_repeat": { + "url": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/simpleRepeat.txt.gz", + "file_type": FileType.TSV, + "gzip": False, + "file_name": "hg38.simple_repeat.txt.gz", + "dir_name": "variants", + }, + }, + GenomeVersion.CanFam3: { + "reference_genome": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/bigZips/canFam3.fa.gz", + "file_type": FileType.FASTA, + "gzip": True, + "file_name": "canFam3.fasta", + "dir_name": "genome", + }, + "refgene_txt": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/database/refGene.txt.gz", + "file_type": FileType.TXT, + "gzip": True, + "file_name": "canfam3_refGene.txt", + "dir_name": "genome", + }, + "refgene_sql": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/database/refGene.sql", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "canfam3_refGene.sql", + "dir_name": "genome", + }, + "genome_chrom_size": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/bigZips/canFam3.chrom.sizes", + "file_type": FileType.TXT, + "gzip": False, + "file_name": "canfam3.chrom.sizes", + "dir_name": "genome", + }, + }, +} diff --git a/BALSAMIC/constants/cluster.py b/BALSAMIC/constants/cluster.py new file mode 100644 index 000000000..c5245b04f --- /dev/null +++ b/BALSAMIC/constants/cluster.py @@ -0,0 +1,55 @@ +"""Balsamic cluster and submission specific constants.""" +from enum import StrEnum +from typing import List + +MAX_JOBS: int = 999 + + +class ClusterConfigType(StrEnum): + """Analysis workflow config type.""" + + ANALYSIS: str = "cluster_analysis" + CACHE: str = "cluster_cache" + + +class ClusterProfile(StrEnum): + """Profile to submit jobs to the cluster.""" + + SLURM: str = "slurm" + QSUB: str = "qsub" + + +CLUSTER_PROFILES: List[ClusterProfile] = [profile for profile in ClusterProfile] + + +class ClusterAccount(StrEnum): + """Cluster job submission account.""" + + DEVELOPMENT: str = "development" + + +class QOS(StrEnum): + """Cluster quality of service.""" + + LOW: str = "low" + NORMAL: str = "normal" + HIGH: str = "high" + EXPRESS: str = "express" + + +QOS_OPTIONS: List[QOS] = [qos for qos in QOS] + + +class ClusterMailType(StrEnum): + """Cluster job mail type notification.""" + + ALL: str = "ALL" + BEGIN: str = "BEGIN" + END: str = "END" + FAIL: str = "FAIL" + NONE: str = "NONE" + REQUEUE: str = "REQUEUE" + TIME_LIMIT: str = "TIME_LIMIT" + + +CLUSTER_MAIL_TYPES: List[ClusterMailType] = [mail_type for mail_type in ClusterMailType] diff --git a/BALSAMIC/config/cluster.json b/BALSAMIC/constants/cluster_analysis.json similarity index 75% rename from BALSAMIC/config/cluster.json rename to BALSAMIC/constants/cluster_analysis.json index aa4f220c1..df3b90de4 100644 --- a/BALSAMIC/config/cluster.json +++ b/BALSAMIC/constants/cluster_analysis.json @@ -11,6 +11,18 @@ "time": "00:15:00", "n": 1 }, + "gens_preprocessing": { + "time": "01:00:00", + "n": 4 + }, + "postprocess_bam": { + "time": "03:00:00", + "n": 12 + }, + "finalize_gens_outputfiles": { + "time": "01:00:00", + "n": 2 + }, "CollectAlignmentSummaryMetrics": { "time": "03:30:00", "n": 8 @@ -33,10 +45,30 @@ }, "bwa_mem": { "time": "08:00:00", - "n": 16 + "n": 6 }, - "cnvkit_paired": { - "time": "12:00:00", + "concatenate": { + "time": "00:30:00", + "n": 1 + }, + "bcftools_merge_germlineSNV_research": { + "time": "4:00:00", 
+ "n": 18 + }, + "cnvkit_segment_CNV_research": { + "time": "6:00:00", + "n": 18 + }, + "purecn_call_CNV_research": { + "time": "6:00:00", + "n": 10 + }, + "cnvkit_call_CNV_research": { + "time": "6:00:00", + "n": 10 + }, + "bcftools_sort_cnvkitCNV_research": { + "time": "4:00:00", "n": 10 }, "dragen_align_call_tumor_only": { @@ -44,21 +76,29 @@ "n": 36, "partition": "dragen" }, - "cnvkit_single": { + "fastqc":{ "time": "12:00:00", - "n": 10 + "n": 4 }, - "fastqc":{ + "fastp_quality_trim":{ + "time": "24:00:00", + "n": 4 + }, + "fastp_remove_umi":{ "time": "12:00:00", + "n": 4 + }, + "gatk_collectreadcounts":{ + "time": "10:00:00", "n": 5 }, - "fastp":{ - "time": "12:00:00", - "n": 8 + "gatk_denoisereadcounts":{ + "time": "10:00:00", + "n": 10 }, - "fastp_umi":{ - "time": "24:00:00", - "n": 8 + "gatk_create_readcount_pon":{ + "time": "60:00:00", + "n": 86 }, "manta_germline": { "time": "05:00:00", @@ -72,18 +112,10 @@ "time": "10:00:00", "n": 12 }, - "mergeBam_normal": { - "time": "12:00:00", - "n": 36 - }, "mergeBam_normal_gatk": { "time": "04:30:00", "n": 8 }, - "mergeBam_tumor": { - "time": "30:00:00", - "n": 36 - }, "mergeBam_tumor_gatk": { "time": "04:30:00", "n": 8 @@ -101,13 +133,17 @@ "n": 8 }, "samtools_sort_index": { - "time": "01:30:00", + "time": "02:30:00", "n": 16 }, "sentieon_DNAscope": { "time": "24:00:00", "n": 36 }, + "sentieon_DNAscope_gnomad": { + "time": "24:00:00", + "n": 36 + }, "sentieon_TNhaplotyper": { "time": "24:00:00", "n": 36 @@ -126,7 +162,7 @@ }, "sentieon_align_sort": { "time": "24:00:00", - "n": 36 + "n": 12 }, "sentieon_base_calibration": { "time": "24:00:00", @@ -154,11 +190,11 @@ }, "vardict_tumor_normal": { "time": "12:00:00", - "n": 10 + "n": 18 }, "vardict_tumor_only": { "time": "10:00:00", - "n": 10 + "n": 9 }, "sentieon_bwa_umiextract": { "time": "8:00:00", @@ -180,18 +216,38 @@ "time": "4:00:00", "n": 12 }, - "vep_somatic_research_snv": { + "bcftools_get_somaticINDEL_research": { + "time": "1:00:00", + "n": 36 + }, + "cadd_annotate_somaticINDEL_research": { + "time": "18:00:00", + "n": 36 + }, + "vep_annotate_somaticSNV_research": { "time": "18:00:00", "n": 36 }, - "vep_somatic_clinical_snv": { + "vcfanno_annotate_somaticSNV_clinical": { "time": "18:00:00", "n" : 36 }, - "vep_somatic_sv": { + "vep_somatic_research_sv": { "time": "12:00:00", "n": 36 }, + "svdb_annotate_somatic_research_sv": { + "time": "12:00:00", + "n": 8 + }, + "svdb_annotate_clinical_obs_somatic_clinical_sv": { + "time": "10:00:00", + "n": 8 + }, + "svdb_annotate_somatic_obs_somatic_clinical_sv": { + "time": "10:00:00", + "n": 8 + }, "vep_germline": { "time": "06:00:00", "n": 10 @@ -224,7 +280,19 @@ "time": "12:00:00", "n": 36 }, - "cnv_report": { + "csv_to_pdf": { + "time": "00:15:00", + "n": 1 + }, + "txt_to_pdf": { + "time": "00:15:00", + "n": 1 + }, + "image_to_pdf": { + "time": "00:15:00", + "n": 1 + }, + "merge_cnv_pdf_reports": { "time": "00:15:00", "n": 1 }, @@ -232,6 +300,10 @@ "time": "00:15:00", "n": 1 }, + "create_final_vcf_namemap": { + "time": "00:15:00", + "n": 1 + }, "svdb_merge_tumor_normal": { "time": "01:00:00", "n": 8 @@ -331,5 +403,13 @@ "somalier_relate": { "time": "01:00:00", "n": 1 + }, + "bam_compress": { + "time": "04:00:00", + "n": 20 + }, + "samtools_qc": { + "time": "04:00:00", + "n": 16 } } diff --git a/BALSAMIC/constants/cluster_cache.json b/BALSAMIC/constants/cluster_cache.json new file mode 100644 index 000000000..a1d13e646 --- /dev/null +++ b/BALSAMIC/constants/cluster_cache.json @@ -0,0 +1,86 @@ +{ + "__default__": { + "n": 12, + "time": 
"02:00:00", + "mail_type": "FAIL", + "partition": "core" + }, + "all": { + "n": 1, + "time": "00:15:00", + "mail_type": "END", + "partition": "core" + }, + "download_containers": { + "n": 12, + "time": "00:15:00", + "mail_type": "FAIL", + "partition": "core" + }, + "download_references": { + "n": 24, + "time": "10:00:00", + "mail_type": "FAIL", + "partition": "core" + }, + "compress_vcfs": { + "n": 12, + "time": "00:30:00", + "mail_type": "FAIL", + "partition": "core" + }, + "index_vcfs": { + "n": 12, + "time": "00:30:00", + "mail_type": "FAIL", + "partition": "core" + }, + "index_cadd": { + "n": 12, + "time": "01:00:00", + "mail_type": "FAIL", + "partition": "core" + }, + "picard_dict_reference_genome": { + "n": 12, + "time": "00:30:00", + "mail_type": "FAIL", + "partition": "core" + }, + "convert_delly_exclusion_file": { + "n": 2, + "time": "00:15:00", + "mail_type": "FAIL", + "partition": "core" + }, + "fasta_index_reference_genome": { + "n": 24, + "time": "01:00:00", + "mail_type": "FAIL", + "partition": "core" + }, + "bwa_index_reference_genome": { + "n": 24, + "time": "01:00:00", + "mail_type": "FAIL", + "partition": "core" + }, + "preprocess_refseq": { + "n": 12, + "time": "00:30:00", + "mail_type": "FAIL", + "partition": "core" + }, + "preprocess_refseq_canfam": { + "n": 12, + "time": "00:30:00", + "mail_type": "FAIL", + "partition": "core" + }, + "download_vep": { + "n": 24, + "time": "01:00:00", + "mail_type": "FAIL", + "partition": "core" + } +} diff --git a/BALSAMIC/constants/common.py b/BALSAMIC/constants/common.py deleted file mode 100644 index b7f5182e0..000000000 --- a/BALSAMIC/constants/common.py +++ /dev/null @@ -1,93 +0,0 @@ -"""This file contains constants variables used by BALSAMIC""" -import operator -import sys -from pathlib import Path - -# DOCKER hub path -BALSAMIC_DOCKER_PATH = "docker://clinicalgenomics/balsamic" - -# BALSAMIC base dir -BALSAMIC_BASE_DIR = Path(sys.modules["BALSAMIC"].__file__).parent.resolve() - -# BALSAMIC scripts dir -BALSAMIC_SCRIPTS = Path(BALSAMIC_BASE_DIR, "assets/scripts").as_posix() - -# Path to containers directory containing YAML files for conda installation for each one -CONTAINERS_CONDA_ENV_PATH = Path(BALSAMIC_BASE_DIR / "containers").as_posix() - -# Path to rule files to be accessed by Snakemake -RULE_DIRECTORY = BALSAMIC_BASE_DIR.as_posix() - -# Sentieon specific -SENTIEON_DNASCOPE = Path( - BALSAMIC_BASE_DIR - / "assets/sentieon_models/SentieonDNAscopeModelBeta0.4a-201808.05.model" -).as_posix() -SENTIEON_TNSCOPE = Path( - BALSAMIC_BASE_DIR - / "assets/sentieon_models/SentieonTNscopeModel_GiAB_HighAF_LowFP-201711.05.model" -) - -# Analysis related constants -GENDER_OPTIONS = ["female", "male"] -ANALYSIS_TYPES = ["paired", "single", "pon"] -ANALYSIS_WORKFLOW = ["balsamic", "balsamic-qc", "balsamic-umi"] -SEQUENCING_TYPE = ["wgs", "targeted"] -MUTATION_CLASS = ["somatic", "germline"] -MUTATION_TYPE = ["SNV", "SV", "CNV"] -WORKFLOW_SOLUTION = ["BALSAMIC", "Sentieon", "DRAGEN", "Sentieon_umi"] - -# list of bioinfo tools for each conda env -VALID_CONTAINER_CONDA_NAME = { - "align_qc", - "annotate", - "coverage_qc", - "varcall_py3", - "varcall_py27", - "varcall_cnvkit", - "delly", - "ascatNgs", - "balsamic", - "vcf2cytosure", - "cnvpytor", - "somalier", -} - -BIOINFO_TOOL_ENV = { - "bedtools": "align_qc", - "bwa": "align_qc", - "fastqc": "align_qc", - "samtools": "align_qc", - "picard": "align_qc", - "multiqc": "align_qc", - "fastp": "align_qc", - "csvkit": "align_qc", - "ensembl-vep": "annotate", - "genmod": "annotate", - "vcfanno": 
"annotate", - "sambamba": "coverage_qc", - "mosdepth": "coverage_qc", - "bcftools": "varcall_py3", - "tabix": "varcall_py3", - "gatk": "varcall_py3", - "vardict": "varcall_py3", - "svdb": "varcall_py3", - "tiddit": "varcall_py3", - "cnvpytor": "cnvpytor", - "manta": "varcall_py27", - "cnvkit": "varcall_cnvkit", - "delly": "delly", - "ascatNgs": "ascatNgs", - "sentieon": "sentieon", - "vcf2cytosure": "vcf2cytosure", - "somalier": "somalier", -} - -VALID_OPS = { - "lt": operator.lt, - "le": operator.le, - "eq": operator.eq, - "ne": operator.ne, - "ge": operator.ge, - "gt": operator.gt, -} diff --git a/BALSAMIC/constants/constants.py b/BALSAMIC/constants/constants.py new file mode 100644 index 000000000..13686a202 --- /dev/null +++ b/BALSAMIC/constants/constants.py @@ -0,0 +1,52 @@ +"""General use constants.""" +from enum import StrEnum +from typing import List + +EXIT_SUCCESS: int = 0 +EXIT_FAIL: int = 1 + + +class LogLevel(StrEnum): + NOTSET = "NOTSET" + DEBUG = "DEBUG" + INFO = "INFO" + WARNING = "WARNING" + ERROR = "ERROR" + FATAL = "FATAL" + CRITICAL = "CRITICAL" + + +LOG_LEVELS: List[LogLevel] = [level for level in LogLevel] + + +class FileType(StrEnum): + """Balsamic analysis and reference file extensions.""" + + BED: str = "bed" + CSV: str = "csv" + DICT: str = "dict" + FAI: str = "fai" + FASTA: str = "fasta" + FASTQ: str = "fastq" + FLAT: str = "flat" + GFF: str = "gff" + GTF: str = "gtf" + GZ: str = "gz" + JSON: str = "json" + LOG: str = "log" + PDF: str = "pdf" + SIF: str = "sif" + TBI: str = "tbi" + TSV: str = "tsv" + TXT: str = "txt" + VCF: str = "vcf" + + +class BwaIndexFileType(StrEnum): + """BWA genome index file extensions.""" + + AMB: str = "amb" + ANN: str = "ann" + BWT: str = "bwt" + PAC: str = "pac" + SA: str = "sa" diff --git a/BALSAMIC/constants/qc_metrics.py b/BALSAMIC/constants/metrics.py similarity index 89% rename from BALSAMIC/constants/qc_metrics.py rename to BALSAMIC/constants/metrics.py index 8e3299348..0e4ae6a81 100644 --- a/BALSAMIC/constants/qc_metrics.py +++ b/BALSAMIC/constants/metrics.py @@ -1,4 +1,19 @@ -METRICS = { +"""QC metrics constants.""" +import operator +from typing import Dict, Callable + + +VALID_OPS: Dict[str, Callable] = { + "lt": operator.lt, + "le": operator.le, + "eq": operator.eq, + "ne": operator.ne, + "ge": operator.ge, + "gt": operator.gt, +} + + +METRICS: Dict[str, dict] = { "targeted": { "default": { "MEAN_INSERT_SIZE": {"condition": None}, @@ -58,13 +73,13 @@ "wgs": { "MEAN_INSERT_SIZE": {"condition": None}, "MEDIAN_COVERAGE": {"condition": None}, - "FastQC_mqc-generalstats-fastqc-percent_duplicates": {"condition": None}, + "PERCENT_DUPLICATION": {"condition": None}, "PCT_15X": {"condition": {"norm": "gt", "threshold": 0.95}}, # Normal sample "PCT_30X": {"condition": None}, "PCT_60X": {"condition": {"norm": "gt", "threshold": 0.80}}, # Tumor sample "PCT_100X": {"condition": None}, "FOLD_80_BASE_PENALTY": {"condition": None}, - "PCT_PF_READS_IMPROPER_PAIRS": {"condition": {"norm": "le", "threshold": 0.1}}, + "PCT_PF_READS_IMPROPER_PAIRS": {"condition": {"norm": "le", "threshold": 0.05}}, }, "variants": { "NUMBER_OF_SITES": {"condition": {"norm": "lt", "threshold": 50000}}, diff --git a/BALSAMIC/constants/paths.py b/BALSAMIC/constants/paths.py new file mode 100644 index 000000000..c9ad5d4f5 --- /dev/null +++ b/BALSAMIC/constants/paths.py @@ -0,0 +1,28 @@ +"""Balsamic path constants.""" +import sys +from pathlib import Path + +# Balsamic working directory constants +BALSAMIC_DIR: Path = 
Path(sys.modules["BALSAMIC"].__file__).parent.resolve() +CONSTANTS_DIR: Path = Path(BALSAMIC_DIR, "constants") +CONTAINERS_DIR: Path = Path(BALSAMIC_DIR, "containers") +ASSETS_DIR: Path = Path(BALSAMIC_DIR, "assets") +SCRIPT_DIR: Path = Path(ASSETS_DIR, "scripts") +REFSEQ_SCRIPT_PATH: Path = Path(SCRIPT_DIR, "refseq_sql.awk") +SCHEDULER_PATH: Path = Path(BALSAMIC_DIR, "utils", "scheduler.py") + +# Sentieon specific constants +SENTIEON_MODELS_DIR: Path = Path(BALSAMIC_DIR, "assets", "sentieon_models") +SENTIEON_DNASCOPE_DIR: Path = Path( + SENTIEON_MODELS_DIR, "SentieonDNAscopeModelBeta0.4a-201808.05.model" +) +SENTIEON_TNSCOPE_DIR: Path = Path( + SENTIEON_MODELS_DIR, "SentieonTNscopeModel_GiAB_HighAF_LowFP-201711.05.model" +) + +# Singularity container hardcoded paths +CADD_ANNOTATIONS_CONTAINER_DIR = Path("/opt/conda/share/CADD-scripts/data/annotations") + +# Pytest paths +TEST_DATA_DIR: Path = Path("tests/test_data") +FASTQ_TEST_INFO: Path = Path(TEST_DATA_DIR, "fastq_test_info.json") diff --git a/BALSAMIC/constants/reference.py b/BALSAMIC/constants/reference.py deleted file mode 100644 index aa0f30cd1..000000000 --- a/BALSAMIC/constants/reference.py +++ /dev/null @@ -1,413 +0,0 @@ -# reference related constants -VALID_REF_FORMAT = ["fasta", "vcf", "text", "gtf", "gff"] -VALID_GENOME_VER = ["hg19", "hg38", "canfam3"] - -# reference files -REFERENCE_FILES = { - "hg38": { - "reference_genome": { - "url": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta", - "file_type": "fasta", - "gzip": False, - "genome_version": "hg38", - "output_file": "Homo_sapiens_assembly38.fasta", - "output_path": "genome", - }, - "dbsnp": { - "url": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", - "file_type": "vcf", - "gzip": False, - "genome_version": "hg38", - "output_file": "Homo_sapiens_assembly38.dbsnp138.vcf", - "output_path": "variants", - }, - "hc_vcf_1kg": { - "url": "gs://genomics-public-data/resources/broad/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg38", - "output_file": "1000G_phase1.snps.high_confidence.hg38.vcf", - "output_path": "variants", - }, - "mills_1kg": { - "url": "gs://genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg38", - "output_file": "Mills_and_1000G_gold_standard.indels.hg38.vcf", - "output_path": "variants", - }, - "known_indel_1kg": { - "url": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg38", - "output_file": "Homo_sapiens_assembly38.known_indels.vcf", - "output_path": "variants", - }, - "vcf_1kg": { - "url": "gs://genomics-public-data/resources/broad/hg38/v0/1000G.phase3.integrated.sites_only.no_MATCHED_REV.hg38.vcf", - "file_type": "vcf", - "gzip": False, - "genome_version": "hg38", - "output_file": "1000G.phase3.integrated.sites_only.no_MATCHED_REV.hg38.vcf", - "output_path": "variants", - }, - "gnomad_variant": { - "url": "gs://gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz", - "file_type": "vcf", - "gzip": False, - "genome_version": "hg38", - "output_file": "gnomad.genomes.r2.1.1.sites.vcf.bgz", - "output_path": "variants", - }, - "gnomad_variant_index": { - "url": 
"gs://gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi", - "file_type": "vcf", - "gzip": False, - "genome_version": "hg38", - "output_file": "gnomad.genomes.r2.1.1.sites.vcf.bgz.tbi", - "output_path": "variants", - }, - "cosmicdb": { - "url": "https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/VCF/CosmicCodingMuts.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg38", - "output_file": "cosmic_coding_muts_v97.vcf", - "output_path": "variants", - }, - "wgs_calling": { - "url": "gs://genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list", - "file_type": "text", - "gzip": False, - "genome_version": "hg38", - "output_file": "wgs_calling_regions.v1", - "output_path": "genome", - }, - "genome_chrom_size": { - "url": "https://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes", - "file_type": "text", - "gzip": False, - "genome_version": "hg38", - "output_file": "hg38.chrom.sizes", - "output_path": "genome", - }, - "refgene_txt": { - "url": "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/refGene.txt.gz", - "file_type": "text", - "gzip": True, - "genome_version": "hg38", - "output_file": "refGene.txt", - "output_path": "genome", - }, - "refgene_sql": { - "url": "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/refGene.sql", - "file_type": "text", - "gzip": False, - "genome_version": "hg38", - "output_file": "refGene.sql", - "output_path": "genome", - }, - "rankscore": { - "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/master/cancer/rank_model/cancer_rank_model_-v0.1-.ini", - "file_type": "text", - "gzip": False, - "genome_version": "hg38", - "output_file": "cancer_rank_model_-v0.1-.ini", - "output_path": "genome", - }, - "access_regions": { - "url": "https://raw.githubusercontent.com/etal/cnvkit/master/data/access-5k-mappable.hg19.bed", - "file_type": "text", - "gzip": False, - "genome_version": "hg38", - "output_file": "access_5kb_hg38.txt", - "output_path": "genome", - }, - "delly_exclusion": { - "url": "https://raw.githubusercontent.com/dellytools/delly/master/excludeTemplates/human.hg38.excl.tsv", - "file_type": "text", - "gzip": False, - "genome_version": "hg38", - "output_file": "delly_exclusion.tsv", - "output_path": "genome", - }, - "delly_mappability": { - "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/ea051b864d18945980f0ded6b16a5d192bd736a5/cancer/references/GRCh38.delly.blacklist.gz", - "file_type": "text", - "gzip": False, - "genome_version": "hg38", - "output_file": "delly_mappability.gz", - "output_path": "genome", - }, - "delly_mappability_gindex": { - "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/ea051b864d18945980f0ded6b16a5d192bd736a5/cancer/references/GRCh38.delly.blacklist.gz.gzi", - "file_type": "text", - "gzip": False, - "genome_version": "hg38", - "output_file": "delly_mappability.gz.gzi", - "output_path": "genome", - }, - "delly_mappability_findex": { - "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/ea051b864d18945980f0ded6b16a5d192bd736a5/cancer/references/GRCh38.delly.blacklist.gz.fai", - "file_type": "text", - "gzip": False, - "genome_version": "hg38", - "output_file": "delly_mappability.gz.fai", - "output_path": "genome", - }, - "ascat_gccorrection": { - "url": 
"https://raw.githubusercontent.com/Clinical-Genomics/reference-files/35465e2644f76f2d59427a9b379d34ecea71f259/cancer/references/hg38_SnpGcCorrections.tsv.gz", - "file_type": "text", - "gzip": True, - "genome_version": "hg38", - "output_file": "hg38_SnpGcCorrections.tsv", - "output_path": "genome", - }, - "ascat_chryloci": { - "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/ea051b864d18945980f0ded6b16a5d192bd736a5/cancer/references/hg38_Y.loci", - "file_type": "text", - "gzip": False, - "genome_version": "hg38", - "output_file": "hg38_Y.loci", - "output_path": "genome", - }, - "clinvar": { - "url": "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg38", - "output_file": "clinvar.vcf", - "output_path": "variants", - }, - "somalier_sites": { - "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/87f22d3f458569afbcb4d7f1588468d21d1751fb/cancer/references/hg38.somalier.sites.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg38", - "output_file": "hg38.somalier.sites.vcf", - "output_path": "variants", - }, - }, - "hg19": { - "reference_genome": { - "url": "gs://gatk-legacy-bundles/b37/human_g1k_v37.fasta.gz", - "file_type": "fasta", - "gzip": True, - "genome_version": "hg19", - "output_file": "human_g1k_v37.fasta", - "output_path": "genome", - }, - "dbsnp": { - "url": "gs://gatk-legacy-bundles/b37/dbsnp_138.b37.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg19", - "output_file": "dbsnp_grch37_b138.vcf", - "output_path": "variants", - }, - "hc_vcf_1kg": { - "url": "gs://gatk-legacy-bundles/b37/1000G_phase1.snps.high_confidence.b37.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg19", - "output_file": "1kg_phase1_snps_high_confidence_b37.vcf", - "output_path": "variants", - }, - "mills_1kg": { - "url": "gs://gatk-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg19", - "output_file": "mills_1kg_index.vcf", - "output_path": "variants", - }, - "known_indel_1kg": { - "url": "gs://gatk-legacy-bundles/b37/1000G_phase1.indels.b37.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg19", - "output_file": "1kg_known_indels_b37.vcf", - "output_path": "variants", - }, - "vcf_1kg": { - "url": "gs://genomics-public-data/ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg19", - "output_file": "1k_genome_wgs_p1_v3_all_sites.vcf", - "output_path": "variants", - }, - "gnomad_variant": { - "url": "gs://gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.vcf.bgz", - "file_type": "vcf", - "gzip": False, - "genome_version": "hg19", - "output_file": "gnomad.genomes.r2.1.1.sites.vcf.bgz", - "output_path": "variants", - }, - "gnomad_variant_index": { - "url": "gs://gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.vcf.bgz.tbi", - "file_type": "vcf", - "gzip": False, - "genome_version": "hg19", - "output_file": "gnomad.genomes.r2.1.1.sites.vcf.bgz.tbi", - "output_path": "variants", - }, - "cosmicdb": { - "url": "https://cancer.sanger.ac.uk/cosmic/file_download/GRCh37/cosmic/v97/VCF/CosmicCodingMuts.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg19", - "output_file": "cosmic_coding_muts_v97.vcf", - "output_path": "variants", 
- }, - "wgs_calling": { - "url": "gs://gatk-legacy-bundles/b37/wgs_calling_regions.v1.interval_list", - "file_type": "text", - "gzip": False, - "genome_version": "hg19", - "output_file": "wgs_calling_regions.v1", - "output_path": "genome", - }, - "genome_chrom_size": { - "url": "https://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.chrom.sizes", - "file_type": "text", - "gzip": False, - "genome_version": "hg19", - "output_file": "hg19.chrom.sizes", - "output_path": "genome", - }, - "refgene_txt": { - "url": "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz", - "file_type": "text", - "gzip": True, - "genome_version": "hg19", - "output_file": "refGene.txt", - "output_path": "genome", - }, - "refgene_sql": { - "url": "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.sql", - "file_type": "text", - "gzip": False, - "genome_version": "hg19", - "output_file": "refGene.sql", - "output_path": "genome", - }, - "rankscore": { - "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/master/cancer/rank_model/cancer_rank_model_-v0.1-.ini", - "file_type": "text", - "gzip": False, - "genome_version": "hg19", - "output_file": "cancer_rank_model_-v0.1-.ini", - "output_path": "genome", - }, - "access_regions": { - "url": "https://raw.githubusercontent.com/etal/cnvkit/master/data/access-5k-mappable.hg19.bed", - "file_type": "text", - "gzip": False, - "genome_version": "hg19", - "output_file": "access_5kb_hg19.txt", - "output_path": "genome", - }, - "delly_exclusion": { - "url": "https://raw.githubusercontent.com/dellytools/delly/master/excludeTemplates/human.hg19.excl.tsv", - "file_type": "text", - "gzip": False, - "genome_version": "hg19", - "output_file": "delly_exclusion.tsv", - "output_path": "genome", - }, - "delly_mappability": { - "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh37.delly.blacklist.gz", - "file_type": "text", - "gzip": False, - "genome_version": "hg19", - "output_file": "delly_mappability.gz", - "output_path": "genome", - }, - "delly_mappability_gindex": { - "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh37.delly.blacklist.gz.gzi", - "file_type": "text", - "gzip": False, - "genome_version": "hg19", - "output_file": "delly_mappability.gz.gzi", - "output_path": "genome", - }, - "delly_mappability_findex": { - "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh37.delly.blacklist.gz.fai", - "file_type": "text", - "gzip": False, - "genome_version": "hg19", - "output_file": "delly_mappability.gz.fai", - "output_path": "genome", - }, - "ascat_gccorrection": { - "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/12a6c760fd542c02de2cda286b6245e46f4b6a97/cancer/references/GRCh37_SnpGcCorrections.tsv.gz", - "file_type": "text", - "gzip": True, - "genome_version": "hg19", - "output_file": "GRCh37_SnpGcCorrections.tsv", - "output_path": "genome", - }, - "ascat_chryloci": { - "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/12a6c760fd542c02de2cda286b6245e46f4b6a97/cancer/references/GRCh37_d5_Y.loci", - "file_type": "text", - "gzip": False, - "genome_version": "hg19", - "output_file": "GRCh37_Y.loci", - "output_path": "genome", - }, - "clinvar": { - "url": 
"https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg19", - "output_file": "clinvar.vcf", - "output_path": "variants", - }, - "somalier_sites": { - "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/87f22d3f458569afbcb4d7f1588468d21d1751fb/cancer/references/GRCh37.somalier.sites.vcf.gz", - "file_type": "vcf", - "gzip": True, - "genome_version": "hg19", - "output_file": "GRCh37.somalier.sites.vcf", - "output_path": "variants", - }, - }, - "canfam3": { - "reference_genome": { - "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/bigZips/canFam3.fa.gz", - "file_type": "fasta", - "gzip": True, - "genome_version": "canfam3", - "output_file": "canFam3.fasta", - "output_path": "genome", - }, - "refgene_txt": { - "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/database/refGene.txt.gz", - "file_type": "text", - "gzip": True, - "genome_version": "canfam3", - "output_file": "canfam3_refGene.txt", - "output_path": "genome", - }, - "refgene_sql": { - "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/database/refGene.sql", - "file_type": "text", - "gzip": False, - "genome_version": "canfam3", - "output_file": "canfam3_refGene.sql", - "output_path": "genome", - }, - "genome_chrom_size": { - "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/bigZips/canFam3.chrom.sizes", - "file_type": "text", - "gzip": False, - "genome_version": "canfam3", - "output_file": "canfam3.chrom.sizes", - "output_path": "genome", - }, - }, -} diff --git a/BALSAMIC/constants/workflow_rules.py b/BALSAMIC/constants/rules.py similarity index 68% rename from BALSAMIC/constants/workflow_rules.py rename to BALSAMIC/constants/rules.py index 41ea81d21..9bb961e79 100644 --- a/BALSAMIC/constants/workflow_rules.py +++ b/BALSAMIC/constants/rules.py @@ -1,44 +1,84 @@ -# Define set of rules -SNAKEMAKE_RULES = { +"""Snakemake rules constants.""" +from typing import Dict, List + +from BALSAMIC.constants.cache import GenomeVersion +from BALSAMIC.constants.analysis import ( + AnalysisType, + AnalysisWorkflow, + SequencingType, + WorkflowSolution, +) + +common_cache_rules: List[str] = [ + "snakemake_rules/cache/singularity_containers.rule", + "snakemake_rules/cache/reference_genome_index.rule", + "snakemake_rules/cache/reference_download.rule", +] + +hg_cache_rules: List[str] = common_cache_rules + [ + "snakemake_rules/cache/cadd.rule", + "snakemake_rules/cache/delly.rule", + "snakemake_rules/cache/refseq.rule", + "snakemake_rules/cache/reference_vcf.rule", + "snakemake_rules/cache/vep.rule", +] + +canfam_cache_rules: List[str] = common_cache_rules + [ + "snakemake_rules/cache/refseq_canfam.rule" +] + + +SNAKEMAKE_RULES: Dict[str, Dict[str, list]] = { "common": { "qc": [ - "snakemake_rules/quality_control/fastp.rule", "snakemake_rules/quality_control/fastqc.rule", "snakemake_rules/quality_control/multiqc.rule", "snakemake_rules/quality_control/qc_metrics.rule", - "snakemake_rules/variant_calling/mergetype_tumor.rule", + "snakemake_rules/quality_control/samtools_qc.rule", + ], + "report": [ + "snakemake_rules/report/generate_pdf.rule", + "snakemake_rules/report/merge_pdfs.rule", + ], + "align": [ + "snakemake_rules/align/sentieon_alignment.rule", + "snakemake_rules/align/bam_compress.rule", ], - "align": [], "varcall": [ "snakemake_rules/variant_calling/germline_sv.rule", "snakemake_rules/variant_calling/sentieon_quality_filter.rule", "snakemake_rules/variant_calling/somatic_sv_quality_filter.rule", ], "annotate": 
[ - "snakemake_rules/annotation/vep.rule", + "snakemake_rules/annotation/somatic_snv_annotation.rule", + "snakemake_rules/annotation/somatic_sv_annotation.rule", + "snakemake_rules/annotation/somatic_computations.rule", + "snakemake_rules/annotation/germline_annotation.rule", "snakemake_rules/annotation/varcaller_sv_filter.rule", "snakemake_rules/annotation/vcf2cytosure_convert.rule", + "snakemake_rules/annotation/final_vcf_reheader.rule", ], }, "single_targeted": { "qc": [ - "snakemake_rules/quality_control/GATK.rule", + "snakemake_rules/quality_control/fastp_tga.rule", "snakemake_rules/quality_control/picard.rule", "snakemake_rules/quality_control/sambamba_depth.rule", "snakemake_rules/quality_control/mosdepth.rule", + "snakemake_rules/umi/concatenation_umi.rule", "snakemake_rules/umi/qc_umi.rule", "snakemake_rules/umi/mergetype_tumor_umi.rule", "snakemake_rules/umi/generate_AF_tables.rule", ], "align": [ - "snakemake_rules/align/bwa_mem.rule", "snakemake_rules/umi/sentieon_umiextract.rule", "snakemake_rules/umi/sentieon_consensuscall.rule", + "snakemake_rules/align/postprocess_bam.rule", ], "varcall": [ "snakemake_rules/variant_calling/germline.rule", "snakemake_rules/variant_calling/split_bed.rule", - "snakemake_rules/variant_calling/cnvkit_single.rule", + "snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule", "snakemake_rules/variant_calling/somatic_tumor_only.rule", "snakemake_rules/variant_calling/somatic_sv_tumor_only.rule", "snakemake_rules/umi/sentieon_varcall_tnscope.rule", @@ -50,29 +90,29 @@ }, "paired_targeted": { "qc": [ - "snakemake_rules/quality_control/GATK.rule", + "snakemake_rules/quality_control/fastp_tga.rule", "snakemake_rules/quality_control/picard.rule", "snakemake_rules/quality_control/sambamba_depth.rule", "snakemake_rules/quality_control/mosdepth.rule", "snakemake_rules/umi/qc_umi.rule", - "snakemake_rules/variant_calling/mergetype_normal.rule", "snakemake_rules/quality_control/somalier.rule", + "snakemake_rules/umi/concatenation_umi.rule", "snakemake_rules/umi/mergetype_tumor_umi.rule", "snakemake_rules/umi/mergetype_normal_umi.rule", "snakemake_rules/quality_control/contest.rule", "snakemake_rules/umi/generate_AF_tables.rule", ], "align": [ - "snakemake_rules/align/bwa_mem.rule", "snakemake_rules/umi/sentieon_umiextract.rule", "snakemake_rules/umi/sentieon_consensuscall.rule", + "snakemake_rules/align/postprocess_bam.rule", ], "varcall": [ "snakemake_rules/variant_calling/germline.rule", "snakemake_rules/variant_calling/split_bed.rule", "snakemake_rules/variant_calling/somatic_tumor_normal.rule", "snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule", - "snakemake_rules/variant_calling/cnvkit_paired.rule", + "snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule", "snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule", ], "annotate": [ @@ -83,11 +123,10 @@ }, "single_wgs": { "qc": [ + "snakemake_rules/quality_control/fastp_wgs.rule", "snakemake_rules/quality_control/sentieon_qc_metrics.rule", "snakemake_rules/quality_control/picard_wgs.rule", - "snakemake_rules/quality_control/report.rule", ], - "align": ["snakemake_rules/align/sentieon_alignment.rule"], "varcall": [ "snakemake_rules/variant_calling/sentieon_germline.rule", "snakemake_rules/variant_calling/sentieon_split_snv_sv.rule", @@ -101,13 +140,11 @@ }, "paired_wgs": { "qc": [ + "snakemake_rules/quality_control/fastp_wgs.rule", "snakemake_rules/quality_control/sentieon_qc_metrics.rule", "snakemake_rules/quality_control/picard_wgs.rule", - 
"snakemake_rules/quality_control/report.rule", - "snakemake_rules/variant_calling/mergetype_normal.rule", "snakemake_rules/quality_control/somalier.rule", ], - "align": ["snakemake_rules/align/sentieon_alignment.rule"], "varcall": [ "snakemake_rules/variant_calling/sentieon_germline.rule", "snakemake_rules/variant_calling/sentieon_split_snv_sv.rule", @@ -119,23 +156,26 @@ "snakemake_rules/annotation/vcfheader_rename.rule", ], }, + "cache": { + GenomeVersion.HG19: hg_cache_rules, + GenomeVersion.HG38: hg_cache_rules, + GenomeVersion.CanFam3: canfam_cache_rules, + }, } - -DELIVERY_RULES = [ +DELIVERY_RULES: List[str] = [ # QC "multiqc", "collect_custom_qc_metrics", - "cnv_report", # Alignment - "mergeBam_tumor", - "mergeBam_normal", "mergeBam_tumor_umiconsensus", "mergeBam_normal_umiconsensus", + "bam_compress_tumor", + "bam_compress_normal", # Germline - "vep_germline_tumor", - "vep_germline_normal", "vcfheader_rename_germline", + "vep_annotate_germlineVAR_tumor", + "vep_annotate_germlineVAR_normal", # SNVs "bcftools_view_split_variant", "bcftools_filter_tnscope_research_tumor_only", @@ -164,11 +204,15 @@ "delly_cnv_tumor_only", "delly_cnv_tumor_normal", "ascat_tumor_normal", + "cnvpytor_tumor_only", "vcf2cytosure_convert_tumor_only", "vcf2cytosure_convert_tumor_normal", - "cnvkit_single", - "cnvkit_paired", + "cnvkit_segment_CNV_research", + "cnvkit_call_CNV_research", "vcf2cytosure_convert", + "finalize_gens_outputfiles", # TMB "tmb_calculation", + # CNV report + "merge_cnv_pdf_reports", ] diff --git a/BALSAMIC/constants/tools.py b/BALSAMIC/constants/tools.py new file mode 100644 index 000000000..197650df4 --- /dev/null +++ b/BALSAMIC/constants/tools.py @@ -0,0 +1,45 @@ +from BALSAMIC.constants.analysis import SequencingType + +# GENS parameters +GENS_PARAMS = { + "MINIMUM_REGION_SIZE": 100, + "ALLOWED_CHR_LIST": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "X", + "Y", + "MT", + ], + "SEQUENCING_TYPE": { + SequencingType.WGS: { + "BAF_SKIP_N": {"o": 135, "a": 30, "b": 8, "c": 3, "d": 1}, + "COV_REGION_SIZES": { + "o": 100000, + "a": 25000, + "b": 5000, + "c": 1000, + "d": 100, + }, + } + }, +} diff --git a/BALSAMIC/constants/variant_filters.py b/BALSAMIC/constants/variant_filters.py index d0ef82283..e3ddab449 100644 --- a/BALSAMIC/constants/variant_filters.py +++ b/BALSAMIC/constants/variant_filters.py @@ -19,7 +19,6 @@ "field": "INFO", }, "MQ": {"tag_value": 40, "filter_name": "balsamic_low_mq", "field": "INFO"}, - "AF_max": {"tag_value": 1, "filter_name": "balsamic_af_one", "field": "INFO"}, "AF_min": {"tag_value": 0.007, "filter_name": "balsamic_low_af", "field": "INFO"}, "pop_freq": { "tag_value": 0.005, @@ -50,7 +49,6 @@ "filter_name": "balsamic_low_tumor_dp", "field": "FORMAT", }, - "AF_max": {"tag_value": 1, "filter_name": "balsamic_af_one", "field": "FORMAT"}, "AF_min": {"tag_value": 0.05, "filter_name": "balsamic_low_af", "field": "FORMAT"}, "pop_freq": { "tag_value": 0.001, diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index 3c6d46614..d3a2486b6 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -10,98 +10,98 @@ VCF_DICT = { "tnscope_umi": { "mutation": "somatic", - "type": "SNV", + "mutation_type": "SNV", "analysis_type": ["single", "paired"], "sequencing_type": ["targeted"], "workflow_solution": ["Sentieon_umi"], }, "tnscope": { "mutation": "somatic", - "type": 
"SNV", + "mutation_type": "SNV", "analysis_type": ["paired", "single"], "sequencing_type": ["wgs"], "workflow_solution": ["Sentieon"], }, "dnascope": { "mutation": "germline", - "type": "SNV", + "mutation_type": "SNV", "analysis_type": ["paired", "single"], "sequencing_type": ["targeted", "wgs"], "workflow_solution": ["Sentieon"], }, "manta": { "mutation": "somatic", - "type": "SV", + "mutation_type": "SV", "analysis_type": ["paired", "single"], "sequencing_type": ["targeted", "wgs"], "workflow_solution": ["BALSAMIC"], }, "cnvkit": { "mutation": "somatic", - "type": "CNV", + "mutation_type": "CNV", "analysis_type": ["paired", "single"], "sequencing_type": ["targeted"], "workflow_solution": ["BALSAMIC"], }, "vardict": { "mutation": "somatic", - "type": "SNV", + "mutation_type": "SNV", "analysis_type": ["paired", "single"], "sequencing_type": ["targeted"], "workflow_solution": ["BALSAMIC"], }, "manta_germline": { "mutation": "germline", - "type": "SV", + "mutation_type": "SV", "analysis_type": ["paired", "single"], "sequencing_type": ["targeted", "wgs"], "workflow_solution": ["BALSAMIC"], }, "haplotypecaller": { "mutation": "germline", - "type": "SNV", + "mutation_type": "SNV", "analysis_type": ["paired", "single"], "sequencing_type": ["targeted"], "workflow_solution": ["BALSAMIC"], }, "dellysv": { "mutation": "somatic", - "type": "SV", + "mutation_type": "SV", "analysis_type": ["paired", "single"], "sequencing_type": ["targeted", "wgs"], "workflow_solution": ["BALSAMIC"], }, "tiddit": { "mutation": "somatic", - "type": "SV", + "mutation_type": "SV", "analysis_type": ["single", "paired"], "sequencing_type": ["wgs"], "workflow_solution": ["BALSAMIC"], }, "dellycnv": { "mutation": "somatic", - "type": "CNV", + "mutation_type": "CNV", "analysis_type": ["single", "paired"], "sequencing_type": ["targeted", "wgs"], "workflow_solution": ["BALSAMIC"], }, "ascat": { "mutation": "somatic", - "type": "CNV", + "mutation_type": "CNV", "analysis_type": ["paired"], "sequencing_type": ["wgs"], "workflow_solution": ["BALSAMIC"], }, "cnvpytor": { "mutation": "somatic", - "type": "CNV", + "mutation_type": "CNV", "analysis_type": ["single"], "sequencing_type": ["wgs"], "workflow_solution": ["BALSAMIC"], }, "svdb": { "mutation": "somatic", - "type": "SV", + "mutation_type": "SV", "analysis_type": ["paired", "single"], "sequencing_type": ["targeted", "wgs"], "workflow_solution": ["BALSAMIC"], diff --git a/BALSAMIC/containers/balsamic/Dockerfile b/BALSAMIC/containers/balsamic/Dockerfile deleted file mode 100644 index 9a3f1bf76..000000000 --- a/BALSAMIC/containers/balsamic/Dockerfile +++ /dev/null @@ -1,34 +0,0 @@ -FROM continuumio/miniconda3:4.10.3-alpine - -LABEL base.image="continuumio/miniconda3:4.10.3-alpine" -LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" -LABEL about.documentation="https://balsamic.readthedocs.io/" -LABEL about.license="MIT License (MIT)" -LABEL about.maintainer="Hassan Foroughi hassan dot foroughi at scilifelab dot se" -LABEL about.description="Bioinformatic analysis pipeline for somatic mutations in cancer" -LABEL about.version="12.0.2" - -ENV PATH="/opt/conda/bin/:${PATH}" -ENV MUSL_LOCPATH="/usr/share/i18n/locales/musl" -ENV LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 - -RUN apk add --no-cache bash gcc git zlib-dev musl-dev libintl libffi-dev openssl-dev hdf5-dev jpeg-dev - -# Locale installation -RUN apk add --no-cache --virtual .locale_tmp cmake make gettext-dev && \ - git clone https://gitlab.com/rilian-la-te/musl-locales && \ - cd musl-locales && cmake -DLOCALE_PROFILE=OFF 
-DCMAKE_INSTALL_PREFIX:PATH=/usr . && make && make install && \ - cd .. && rm -r musl-locales && \ - apk del .locale_tmp - -ARG WORK_DIR=project -ARG CONTAINER_NAME - -# Copy all project files -COPY . /${WORK_DIR} - -RUN cd /${WORK_DIR}/BALSAMIC/containers/${CONTAINER_NAME}/ && /bin/sh ${CONTAINER_NAME}.sh ${CONTAINER_NAME} -RUN cd /${WORK_DIR} && pip install --upgrade --no-cache-dir . - -# Clean work environment -RUN rm -rf /${WORK_DIR} && conda clean --all --yes diff --git a/BALSAMIC/containers/balsamic/balsamic.sh b/BALSAMIC/containers/balsamic/balsamic.sh deleted file mode 100644 index 905536f39..000000000 --- a/BALSAMIC/containers/balsamic/balsamic.sh +++ /dev/null @@ -1,2 +0,0 @@ -conda env update -n base --file "${1}".yaml -pip install --no-cache-dir fpdf2==2.4.6 diff --git a/BALSAMIC/containers/balsamic/balsamic.yaml b/BALSAMIC/containers/balsamic/balsamic.yaml deleted file mode 100644 index f4a5c916f..000000000 --- a/BALSAMIC/containers/balsamic/balsamic.yaml +++ /dev/null @@ -1,10 +0,0 @@ -channels: - - conda-forge - - bioconda - - defaults - -dependencies: - - conda-forge::python=3.7 - - conda-forge::pip - - conda-forge::pygraphviz - - conda-forge::pillow diff --git a/BALSAMIC/containers/cadd/Dockerfile b/BALSAMIC/containers/cadd/Dockerfile new file mode 100644 index 000000000..3c1f290a7 --- /dev/null +++ b/BALSAMIC/containers/cadd/Dockerfile @@ -0,0 +1,43 @@ +FROM condaforge/mambaforge:23.1.0-4 + +LABEL base.image="condaforge/mambaforge:23.1.0-4" +LABEL maintainer="Clinical Genomics" +LABEL about.contact="support@clinicalgenomics.se" +LABEL software="cadd" +LABEL software.version="1.6" +LABEL about.summary="Combined Annotation Dependent Depletion (CADD)" +LABEL about.home="https://cadd.gs.washington.edu/" +LABEL about.documentation="https://cadd.gs.washington.edu/info" +LABEL about.license="MIT License (MIT)" + +RUN apt-get update && apt-get -y upgrade && \ + apt-get -y install --no-install-recommends unzip && \ + apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +## Install snakemake +RUN conda update -n base conda && \ + conda install -c conda-forge -c bioconda snakemake=7.32.2 && \ + conda clean -ya + +## Download and extract CADD repository +RUN wget --no-verbose \ + https://github.com/kircherlab/CADD-scripts/archive/CADD1.6.zip \ + -O /opt/conda/share/CADD1.6.zip && \ + unzip -o /opt/conda/share/CADD1.6.zip -d /opt/conda/share/ && \ + rm /opt/conda/share/CADD1.6.zip && \ + mv /opt/conda/share/CADD-scripts-CADD1.6 /opt/conda/share/CADD-scripts + +## Install CADD-scripts +RUN cd /opt/conda/share/CADD-scripts && \ + snakemake test/input.tsv.gz --use-conda --conda-create-envs-only \ + --conda-prefix /opt/conda/share/CADD-scripts/envs \ + --cores 1 --configfile config/config_GRCh38_v1.6.yml --snakefile Snakefile && \ + ln -s /opt/conda/share/CADD-scripts/CADD.sh /opt/conda/bin/CADD.sh && \ + conda clean -ya + +RUN adduser --disabled-password --gecos '' ubuntu &&\ +chsh -s /bin/bash && mkdir -p /home/ubuntu + +USER ubuntu +WORKDIR /home/ubuntu +CMD ["/bin/bash"] diff --git a/BALSAMIC/commands/plugins/__init__.py b/BALSAMIC/containers/cadd/__init__.py similarity index 100% rename from BALSAMIC/commands/plugins/__init__.py rename to BALSAMIC/containers/cadd/__init__.py diff --git a/BALSAMIC/containers/cadd/cadd.yaml b/BALSAMIC/containers/cadd/cadd.yaml new file mode 100644 index 000000000..80d75fab7 --- /dev/null +++ b/BALSAMIC/containers/cadd/cadd.yaml @@ -0,0 +1 @@ +- cadd=1.6 diff --git a/BALSAMIC/containers/cnvkit/Dockerfile
b/BALSAMIC/containers/cnvkit/Dockerfile new file mode 100644 index 000000000..6da45c992 --- /dev/null +++ b/BALSAMIC/containers/cnvkit/Dockerfile @@ -0,0 +1,35 @@ +FROM python:3.10-slim + +LABEL base.image="python:3.10-slim" +LABEL maintainer="Clinical Genomics" +LABEL about.contact="support@clinicalgenomics.se" +LABEL software="cnvkit" +LABEL software.version="0.9.10" +LABEL about.summary="Copy number variant detection from targeted DNA sequencing" +LABEL about.home="https://github.com/etal/cnvkit" +LABEL about.documentation="https://cnvkit.readthedocs.io" +LABEL about.license="MIT License (MIT)" + +ENV DEBIAN_FRONTEND noninteractive +RUN apt-get update && apt-get -y upgrade && \ + apt-get -y install --no-install-recommends tabix liblzma-dev zlib1g-dev \ + r-base-core r-bioc-dnacopy && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV VENV /opt/venv + +RUN python -m pip install --upgrade --no-cache-dir pip +RUN python -m venv ${VENV} +ENV PATH="${VENV}/bin:$PATH" + +RUN pip install --no-cache-dir --upgrade pip + +RUN pip install --no-cache-dir cnvkit==0.9.10 + +RUN adduser --disabled-password --gecos '' ubuntu && \ + chsh -s /bin/bash && mkdir -p /home/ubuntu + +USER ubuntu +WORKDIR /home/ubuntu +CMD ["/bin/bash"] diff --git a/BALSAMIC/config/__init__.py b/BALSAMIC/containers/cnvkit/__init__.py similarity index 100% rename from BALSAMIC/config/__init__.py rename to BALSAMIC/containers/cnvkit/__init__.py diff --git a/BALSAMIC/containers/cnvkit/cnvkit.yaml b/BALSAMIC/containers/cnvkit/cnvkit.yaml new file mode 100644 index 000000000..7ec08edf4 --- /dev/null +++ b/BALSAMIC/containers/cnvkit/cnvkit.yaml @@ -0,0 +1 @@ +- cnvkit=0.9.10 diff --git a/BALSAMIC/containers/cnvpytor/Dockerfile b/BALSAMIC/containers/cnvpytor/Dockerfile index 141861c72..44feebcc5 100644 --- a/BALSAMIC/containers/cnvpytor/Dockerfile +++ b/BALSAMIC/containers/cnvpytor/Dockerfile @@ -1,22 +1,55 @@ -FROM continuumio/miniconda3:4.10.3-alpine +FROM python:3.11-slim AS builder -LABEL base.image="continuumio/miniconda3:4.10.3-alpine" -LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" -LABEL about.documentation="https://balsamic.readthedocs.io/" +ENV WORK_DIR /opt +ENV VENV ${WORK_DIR}/venv + +RUN apt-get update && \ + apt-get -y upgrade && \ + apt-get -y install --no-install-recommends git && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN python -m pip install --upgrade --no-cache-dir pip +RUN python -m venv ${VENV} +ENV PATH="${VENV}/bin:$PATH" + +RUN pip install --no-cache-dir seaborn==0.12.2 numpy==1.23.5 + +RUN cd ${WORK_DIR} && \ + git clone https://github.com/abyzovlab/CNVpytor.git && \ + cd CNVpytor && \ + git checkout v1.3.1 && \ + pip install --no-cache-dir . 
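The per-container `<name>.yaml` files are being reduced from full conda environment dumps to one-line manifests of the pinned tool (for example `- cnvkit=0.9.10` above). Assuming these manifests drive the container tests (the test code is not in this diff), parsing them is trivial:

```python
from pathlib import Path
from typing import Dict

import yaml  # PyYAML, assumed available in the test environment


def pinned_tools(container: str) -> Dict[str, str]:
    """Parse one-line manifests such as '- cnvkit=0.9.10' into {tool: version}."""
    path = Path("BALSAMIC", "containers", container, f"{container}.yaml")
    entries = yaml.safe_load(path.read_text())
    return dict(entry.split("=", maxsplit=1) for entry in entries)


assert pinned_tools("cnvkit") == {"cnvkit": "0.9.10"}
```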
&& \ + cnvpytor -download + +FROM python:3.11-slim + +LABEL base.image="python:3.11-slim" +LABEL maintainer="Clinical Genomics" +LABEL about.contact="support@clinicalgenomics.se" +LABEL software="cnvpytor" +LABEL software.version="1.3.1" +LABEL about.summary="A tool for CNV discovery and genotyping from depth-of-coverage by mapped reads" +LABEL about.home="https://github.com/abyzovlab/CNVpytor" +LABEL about.documentation="https://github.com/abyzovlab/CNVpytor/wiki" LABEL about.license="MIT License (MIT)" -LABEL about.description="Bioinformatic analysis pipeline for somatic mutations in cancer" -ENV PATH="/opt/conda/bin/:${PATH}" +RUN apt-get update && \ + apt-get -y upgrade && \ + apt-get -y install --no-install-recommends tabix && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* -RUN apk add --no-cache bash gcc git +ENV WORK_DIR /opt +ENV VENV ${WORK_DIR}/venv -ARG WORK_DIR=project -ARG CONTAINER_NAME +COPY --from=builder ${WORK_DIR} ${WORK_DIR} -# Copy all project files -COPY . /${WORK_DIR} +ENV PATH="${VENV}/bin:$PATH" -RUN cd /${WORK_DIR}/BALSAMIC/containers/${CONTAINER_NAME}/ && /bin/sh ${CONTAINER_NAME}.sh ${CONTAINER_NAME} +RUN adduser --disabled-password --gecos '' ubuntu && \ + chsh -s /bin/bash && mkdir -p /home/ubuntu -# Clean work environment -RUN rm -rf /${WORK_DIR:?} && conda clean --all --yes +USER ubuntu +WORKDIR /home/ubuntu +CMD ["/bin/bash"] diff --git a/BALSAMIC/containers/cnvpytor/cnvpytor.sh b/BALSAMIC/containers/cnvpytor/cnvpytor.sh deleted file mode 100644 index 92f21aa23..000000000 --- a/BALSAMIC/containers/cnvpytor/cnvpytor.sh +++ /dev/null @@ -1 +0,0 @@ -conda env update -n base --file "${1}".yaml diff --git a/BALSAMIC/containers/cnvpytor/cnvpytor.yaml b/BALSAMIC/containers/cnvpytor/cnvpytor.yaml index c42a937ea..845f4dbe8 100644 --- a/BALSAMIC/containers/cnvpytor/cnvpytor.yaml +++ b/BALSAMIC/containers/cnvpytor/cnvpytor.yaml @@ -1,104 +1 @@ -channels: - - anaconda - - bioconda - - defaults - -dependencies: - - _libgcc_mutex=0.1 - - _openmp_mutex=4.5 - - blas=1.0 - - brotli=1.0.9 - - brotlipy=0.7.0 - - bzip2=1.0.8 - - c-ares=1.18.1 - - ca-certificates=2023.01.10 - - cached-property=1.5.2 - - certifi=2022.12.7 - - cffi=1.15.0 - - chardet=4.0.0 - - cnvpytor=1.2.1 - - conda=23.1.0 - - conda-package-handling=2.0.2 - - conda-package-streaming=0.7.0 - - cryptography=38.0.4 - - curl=7.82.0 - - cycler=0.11.0 - - flit-core=3.6.0 - - fonttools=4.25.0 - - freetype=2.11.0 - - giflib=5.2.1 - - h5py=3.6.0 - - hdf5=1.10.6 - - htslib=1.9 - - idna=2.10 - - importlib-metadata=4.11.3 - - importlib_metadata=4.11.3 - - intel-openmp=2021.4.0 - - jpeg=9e - - kiwisolver=1.4.2 - - krb5=1.19.2 - - lcms2=2.12 - - ld_impl_linux-64=2.35.1 - - libcurl=7.82.0 - - libdeflate=1.0 - - libedit=3.1.20210714 - - libev=4.33 - - libffi=3.3 - - libgcc-ng=9.3.0 - - libgfortran-ng=7.5.0 - - libgfortran4=7.5.0 - - libgomp=9.3.0 - - libnghttp2=1.46.0 - - libpng=1.6.37 - - libssh2=1.10.0 - - libstdcxx-ng=9.3.0 - - libtiff=4.2.0 - - libwebp=1.2.2 - - libwebp-base=1.2.2 - - lz4-c=1.9.3 - - matplotlib-base=3.5.1 - - mkl=2021.4.0 - - mkl-service=2.4.0 - - mkl_fft=1.3.1 - - mkl_random=1.2.2 - - munkres=1.1.4 - - ncurses=6.2 - - numpy=1.21.5 - - numpy-base=1.21.5 - - openssl=1.1.1t - - packaging=22.0 - - pathlib=1.0.1 - - pillow=9.0.1 - - pip=22.3.1 - - pluggy=1.0.0 - - pycosat=0.6.3 - - pycparser=2.20 - - pyopenssl=20.0.1 - - pyparsing=3.0.9 - - pysam=0.15.3 - - pysocks=1.7.1 - - python=3.7.11 - - python-dateutil=2.8.2 - - readline=8.1 - - requests=2.25.1 - - ruamel.yaml=0.16.12 - - ruamel.yaml.clib=0.2.6 - 
- scipy=1.7.3 - - setuptools=65.6.3 - - six=1.16.0 - - sqlite=3.36.0 - - tabix=1.11 - - tk=8.6.10 - - toolz=0.12.0 - - tqdm=4.61.2 - - typing_extensions=4.4.0 - - tzdata=2021a - - urllib3=1.26.6 - - wheel=0.36.2 - - xlsxwriter=3.0.3 - - xz=5.2.5 - - yaml=0.2.5 - - zipp=3.11.0 - - zlib=1.2.11 - - zstandard=0.15.2 - - zstd=1.4.9 +- cnvpytor=1.3.1 diff --git a/BALSAMIC/containers/gatk/Dockerfile b/BALSAMIC/containers/gatk/Dockerfile new file mode 100644 index 000000000..ec33c09f8 --- /dev/null +++ b/BALSAMIC/containers/gatk/Dockerfile @@ -0,0 +1,37 @@ +FROM python:3.11-slim + +LABEL base.image="python:3.11-slim" +LABEL maintainer="Clinical Genomics" +LABEL about.contact="support@clinicalgenomics.se" +LABEL software="GATK" +LABEL software.version="4.4.0.0" +LABEL about.summary="Variant Discovery in High-Throughput Sequencing Data" +LABEL about.home="https://github.com/broadinstitute/gatk" +LABEL about.documentation="https://gatk.broadinstitute.org/hc/en-us" +LABEL about.license="MIT License (MIT)" + +ENV DEBIAN_FRONTEND noninteractive +RUN apt-get update && apt-get -y upgrade && \ + apt-get -y install --no-install-recommends wget unzip openjdk-17-jre-headless apt-utils \ + git r-base-core && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 + +WORKDIR /tmp # install GATK4 +ENV GATK_VERSION="4.4.0.0" +RUN wget --no-verbose https://github.com/broadinstitute/gatk/releases/download/${GATK_VERSION}/gatk-${GATK_VERSION}.zip && \ + unzip gatk-${GATK_VERSION}.zip -d /opt && \ + rm gatk-${GATK_VERSION}.zip + +ENV PATH /opt/gatk-${GATK_VERSION}:$PATH + +RUN adduser --disabled-password --gecos '' ubuntu && \ + chsh -s /bin/bash && mkdir -p /home/ubuntu + +USER ubuntu +WORKDIR /home/ubuntu +CMD ["/bin/bash"] + diff --git a/BALSAMIC/containers/balsamic/__init__.py b/BALSAMIC/containers/gatk/__init__.py similarity index 100% rename from BALSAMIC/containers/balsamic/__init__.py rename to BALSAMIC/containers/gatk/__init__.py diff --git a/BALSAMIC/containers/gatk/gatk.yaml b/BALSAMIC/containers/gatk/gatk.yaml new file mode 100644 index 000000000..16aaabf7f --- /dev/null +++ b/BALSAMIC/containers/gatk/gatk.yaml @@ -0,0 +1 @@ +- gatk=4.4.0.0 diff --git a/BALSAMIC/containers/htslib/Dockerfile b/BALSAMIC/containers/htslib/Dockerfile new file mode 100644 index 000000000..fea12856f --- /dev/null +++ b/BALSAMIC/containers/htslib/Dockerfile @@ -0,0 +1,25 @@ +FROM ubuntu:22.04 + +LABEL base.image="ubuntu:22.04" +LABEL maintainer="Clinical Genomics" +LABEL about.contact="support@clinicalgenomics.se" +LABEL software="htslib, samtools, bcftools, tabix" +LABEL software.version="1.13" +LABEL about.summary="A unified C library for accessing common bioinformatics file formats" +LABEL about.home="http://www.htslib.org/" +LABEL about.documentation="http://www.htslib.org/doc/#manual-pages" +LABEL about.license="MIT License (MIT)" + +RUN apt-get update && \ + apt-get -y upgrade && \ + apt-get -y install --no-install-recommends \ + samtools=1.13-4 bcftools=1.13-1 tabix=1.13+ds-2build1 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +RUN adduser --disabled-password --gecos '' ubuntu &&\ +chsh -s /bin/bash && mkdir -p /home/ubuntu + +USER ubuntu +WORKDIR /home/ubuntu +CMD ["/bin/bash"] diff --git a/BALSAMIC/containers/varcall_cnvkit/__init__.py b/BALSAMIC/containers/htslib/__init__.py similarity index 100% rename from BALSAMIC/containers/varcall_cnvkit/__init__.py rename to BALSAMIC/containers/htslib/__init__.py diff --git
a/BALSAMIC/containers/htslib/htslib.yaml b/BALSAMIC/containers/htslib/htslib.yaml new file mode 100644 index 000000000..9d49a08be --- /dev/null +++ b/BALSAMIC/containers/htslib/htslib.yaml @@ -0,0 +1,4 @@ +- samtools=1.13 +- bcftools=1.13 +- tabix=1.13 +- bgzip=1.13 diff --git a/BALSAMIC/containers/purecn/Dockerfile b/BALSAMIC/containers/purecn/Dockerfile new file mode 100644 index 000000000..3ac55adde --- /dev/null +++ b/BALSAMIC/containers/purecn/Dockerfile @@ -0,0 +1,22 @@ +FROM markusriester/purecn:2.6.4-amd64 + +LABEL base.image="markusriester/purecn:2.6.4-amd64" +LABEL maintainer="Clinical Genomics" +LABEL about.contact="support@clinicalgenomics.se" +LABEL software="PureCN" +LABEL software.version="2.6.4" +LABEL about.summary="Copy number calling and variant classification using targeted short read sequencing" +LABEL about.home="https://github.com/lima1/PureCN" +LABEL about.documentation="https://bioconductor.org/packages/devel/bioc/html/PureCN.html" +LABEL about.license="MIT License (MIT)" + +RUN apt-get update && apt-get -y upgrade && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN adduser --disabled-password --gecos '' ubuntu && \ + chsh -s /bin/bash && mkdir -p /home/ubuntu + +USER ubuntu +WORKDIR /home/ubuntu +CMD ["/bin/bash"] diff --git a/BALSAMIC/containers/purecn/__init__.py b/BALSAMIC/containers/purecn/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/BALSAMIC/containers/purecn/purecn.yaml b/BALSAMIC/containers/purecn/purecn.yaml new file mode 100644 index 000000000..81dcddfb3 --- /dev/null +++ b/BALSAMIC/containers/purecn/purecn.yaml @@ -0,0 +1 @@ +- purecn=2.6.4 diff --git a/BALSAMIC/containers/varcall_cnvkit/Dockerfile b/BALSAMIC/containers/varcall_cnvkit/Dockerfile deleted file mode 100644 index 8c25633ab..000000000 --- a/BALSAMIC/containers/varcall_cnvkit/Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -FROM continuumio/miniconda3:4.10.3-alpine - -LABEL base.image="continuumio/miniconda3:4.10.3-alpine" -LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" -LABEL about.documentation="https://balsamic.readthedocs.io/" -LABEL about.license="MIT License (MIT)" -LABEL about.maintainer="Hassan Foroughi hassan dot foroughi at scilifelab dot se" -LABEL about.description="Bioinformatic analysis pipeline for somatic mutations in cancer" - -ENV PATH="/opt/conda/bin/:${PATH}" - -RUN apk add --no-cache bash gcc git zlib-dev musl-dev - -ARG WORK_DIR=project -ARG CONTAINER_NAME - -# Copy all project files -COPY . 
/${WORK_DIR} - -RUN cd /${WORK_DIR}/BALSAMIC/containers/${CONTAINER_NAME}/ && /bin/sh ${CONTAINER_NAME}.sh ${CONTAINER_NAME} - -# Clean work environment -RUN rm -rf /${WORK_DIR} && conda clean --all --yes diff --git a/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.sh b/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.sh deleted file mode 100644 index 92f21aa23..000000000 --- a/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.sh +++ /dev/null @@ -1 +0,0 @@ -conda env update -n base --file "${1}".yaml diff --git a/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.yaml b/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.yaml deleted file mode 100644 index bb8b53e70..000000000 --- a/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.yaml +++ /dev/null @@ -1,306 +0,0 @@ -channels: - - conda-forge - - bioconda - - defaults - -dependencies: - - _libgcc_mutex=0.1 - - _openmp_mutex=4.5 - - _r-mutex=1.0.1 - - bcftools=1.15.1 - - binutils_impl_linux-64=2.39 - - binutils_linux-64=2.39 - - bioconductor-annotationdbi=1.56.2 - - bioconductor-biobase=2.54.0 - - bioconductor-biocfilecache=2.2.0 - - bioconductor-biocgenerics=0.40.0 - - bioconductor-biocio=1.4.0 - - bioconductor-biocparallel=1.28.3 - - bioconductor-biomart=2.50.0 - - bioconductor-biostrings=2.62.0 - - bioconductor-bsgenome=1.62.0 - - bioconductor-delayedarray=0.20.0 - - bioconductor-dnacopy=1.68.0 - - bioconductor-genomeinfodb=1.30.0 - - bioconductor-genomeinfodbdata=1.2.7 - - bioconductor-genomicalignments=1.30.0 - - bioconductor-genomicfeatures=1.46.1 - - bioconductor-genomicranges=1.46.0 - - bioconductor-iranges=2.28.0 - - bioconductor-keggrest=1.34.0 - - bioconductor-matrixgenerics=1.6.0 - - bioconductor-purecn=2.0.2 - - bioconductor-rhdf5=2.38.1 - - bioconductor-rhdf5filters=1.6.0 - - bioconductor-rhdf5lib=1.16.0 - - bioconductor-rhtslib=1.26.0 - - bioconductor-rsamtools=2.10.0 - - bioconductor-rtracklayer=1.54.0 - - bioconductor-s4vectors=0.32.4 - - bioconductor-summarizedexperiment=1.24.0 - - bioconductor-variantannotation=1.40.0 - - bioconductor-xvector=0.34.0 - - bioconductor-zlibbioc=1.40.0 - - boltons=23.0.0 - - brotlipy=0.7.0 - - bwidget=1.9.14 - - bzip2=1.0.8 - - c-ares=1.19.1 - - ca-certificates=2023.5.7 - - cairo=1.16.0 - - certifi=2023.5.7 - - cffi=1.15.1 - - chardet=4.0.0 - - conda=23.5.2 - - conda-package-handling=2.0.2 - - conda-package-streaming=0.8.0 - - cryptography=39.0.0 - - curl=7.86.0 - - expat=2.5.0 - - font-ttf-dejavu-sans-mono=2.37 - - font-ttf-inconsolata=3.000 - - font-ttf-source-code-pro=2.038 - - font-ttf-ubuntu=0.83 - - fontconfig=2.14.2 - - fonts-conda-ecosystem=1 - - fonts-conda-forge=1 - - freetype=2.12.1 - - fribidi=1.0.10 - - gcc_impl_linux-64=10.4.0 - - gcc_linux-64=10.4.0 - - gettext=0.21.1 - - gfortran_impl_linux-64=10.4.0 - - gfortran_linux-64=10.4.0 - - graphite2=1.3.13 - - gsl=2.7 - - gxx_impl_linux-64=10.4.0 - - gxx_linux-64=10.4.0 - - harfbuzz=6.0.0 - - htslib=1.15.1 - - icu=70.1 - - idna=2.10 - - jpeg=9e - - jsonpatch=1.32 - - jsonpointer=2.0 - - kernel-headers_linux-64=2.6.32 - - keyutils=1.6.1 - - krb5=1.19.3 - - ld_impl_linux-64=2.39 - - lerc=4.0.0 - - libblas=3.9.0 - - libcblas=3.9.0 - - libcurl=7.86.0 - - libdeflate=1.13 - - libedit=3.1.20191231 - - libev=4.33 - - libexpat=2.5.0 - - libffi=3.4.2 - - libgcc-devel_linux-64=10.4.0 - - libgcc-ng=13.1.0 - - libgfortran-ng=13.1.0 - - libgfortran5=13.1.0 - - libglib=2.74.1 - - libgomp=13.1.0 - - libiconv=1.17 - - liblapack=3.9.0 - - libnghttp2=1.51.0 - - libnsl=2.0.0 - - libopenblas=0.3.23 - - libpng=1.6.39 - - libsanitizer=10.4.0 - - 
libssh2=1.10.0 - - libstdcxx-devel_linux-64=10.4.0 - - libstdcxx-ng=13.1.0 - - libtiff=4.4.0 - - libuuid=2.38.1 - - libwebp-base=1.3.1 - - libxcb=1.13 - - libxml2=2.9.14 - - libzlib=1.2.13 - - make=4.3 - - ncurses=6.4 - - openssl=1.1.1u - - packaging=23.1 - - pango=1.50.14 - - pcre2=10.37 - - perl=5.32.1 - - pip=23.2 - - pixman=0.40.0 - - pluggy=1.2.0 - - pthread-stubs=0.4 - - pycosat=0.6.4 - - pycparser=2.20 - - pyopenssl=20.0.1 - - pysocks=1.7.1 - - python=3.8.12 - - python_abi=3.8 - - r-askpass=1.1 - - r-assertthat=0.2.1 - - r-base=4.1.3 - - r-bh=1.81.0_1 - - r-bit=4.0.5 - - r-bit64=4.0.5 - - r-bitops=1.0_7 - - r-blob=1.2.4 - - r-cachem=1.0.8 - - r-cli=3.6.1 - - r-colorspace=2.1_0 - - r-cpp11=0.4.3 - - r-crayon=1.5.2 - - r-curl=4.3.3 - - r-data.table=1.14.2 - - r-dbi=1.1.3 - - r-dbplyr=2.3.0 - - r-digest=0.6.31 - - r-dplyr=1.1.2 - - r-ellipsis=0.3.2 - - r-fansi=1.0.4 - - r-farver=2.1.1 - - r-fastmap=1.1.1 - - r-filelock=1.0.2 - - r-formatr=1.14 - - r-futile.logger=1.4.3 - - r-futile.options=1.0.1 - - r-generics=0.1.3 - - r-getopt=1.20.3 - - r-ggplot2=3.4.0 - - r-glue=1.6.2 - - r-gridextra=2.3 - - r-gtable=0.3.3 - - r-hms=1.1.3 - - r-httr=1.4.6 - - r-isoband=0.2.7 - - r-jsonlite=1.8.5 - - r-labeling=0.4.2 - - r-lambda.r=1.2.4 - - r-lattice=0.21_8 - - r-lifecycle=1.0.2 - - r-magrittr=2.0.3 - - r-mass=7.3_58.3 - - r-matrix=1.5_4.1 - - r-matrixstats=1.0.0 - - r-mclust=6.0.0 - - r-memoise=2.0.1 - - r-mgcv=1.8_42 - - r-mime=0.12 - - r-munsell=0.5.0 - - r-nlme=3.1_162 - - r-openssl=2.0.5 - - r-optparse=1.7.1 - - r-pillar=1.9.0 - - r-pkgconfig=2.0.3 - - r-plogr=0.2.0 - - r-png=0.1_8 - - r-prettyunits=1.1.1 - - r-progress=1.2.2 - - r-purrr=0.3.5 - - r-r6=2.5.1 - - r-rappdirs=0.3.3 - - r-rcolorbrewer=1.1_3 - - r-rcpp=1.0.10 - - r-rcurl=1.98_1.12 - - r-restfulr=0.0.15 - - r-rjson=0.2.21 - - r-rlang=1.0.5 - - r-rsqlite=2.3.1 - - r-scales=1.2.1 - - r-snow=0.4_4 - - r-stringi=1.7.12 - - r-stringr=1.4.1 - - r-sys=3.4.2 - - r-tibble=3.2.1 - - r-tidyselect=1.1.2 - - r-utf8=1.2.3 - - r-vctrs=0.4.2 - - r-vgam=1.1_8 - - r-viridislite=0.4.1 - - r-withr=2.5.0 - - r-xml=3.99_0.11 - - r-xml2=1.3.3 - - r-yaml=2.3.7 - - readline=8.1 - - requests=2.25.1 - - ruamel.yaml=0.17.32 - - ruamel.yaml.clib=0.2.7 - - sed=4.8 - - setuptools=68.0.0 - - six=1.16.0 - - sqlite=3.36.0 - - sysroot_linux-64=2.12 - - tabix=1.11 - - tk=8.6.12 - - tktable=2.10 - - toolz=0.12.0 - - tqdm=4.61.2 - - urllib3=1.26.6 - - wheel=0.36.2 - - xorg-kbproto=1.0.7 - - xorg-libice=1.0.10 - - xorg-libsm=1.2.3 - - xorg-libx11=1.8.4 - - xorg-libxau=1.0.11 - - xorg-libxdmcp=1.1.3 - - xorg-libxext=1.3.4 - - xorg-libxrender=0.9.10 - - xorg-libxt=1.3.0 - - xorg-renderproto=0.11.1 - - xorg-xextproto=7.3.0 - - xorg-xproto=7.0.31 - - xz=5.2.5 - - yaml=0.2.5 - - zlib=1.2.13 - - zstandard=0.19.0 - - zstd=1.5.2 - - pip: - - apricot-select==0.6.1 - - biopython==1.79 - - cmake==3.26.4 - - cnvkit==0.9.9 - - contourpy==1.1.0 - - cycler==0.11.0 - - cython==3.0.0 - - filelock==3.12.2 - - fonttools==4.41.0 - - importlib-metadata==6.8.0 - - importlib-resources==6.0.0 - - jinja2==3.1.2 - - joblib==0.17.0 - - kiwisolver==1.4.4 - - lit==16.0.6 - - llvmlite==0.40.1 - - markupsafe==2.1.3 - - matplotlib==3.7.2 - - mpmath==1.3.0 - - networkx==3.1 - - nose==1.3.7 - - numba==0.57.1 - - numpy==1.24.4 - - nvidia-cublas-cu11==11.10.3.66 - - nvidia-cuda-cupti-cu11==11.7.101 - - nvidia-cuda-nvrtc-cu11==11.7.99 - - nvidia-cuda-runtime-cu11==11.7.99 - - nvidia-cudnn-cu11==8.5.0.96 - - nvidia-cufft-cu11==10.9.0.58 - - nvidia-curand-cu11==10.2.10.91 - - nvidia-cusolver-cu11==11.4.0.1 - 
- nvidia-cusparse-cu11==11.7.4.91 - - nvidia-nccl-cu11==2.14.3 - - nvidia-nvtx-cu11==11.7.91 - - pandas==1.5.3 - - pillow==10.0.0 - - pomegranate==1.0.0 - - pyfaidx==0.7.2.1 - - pyparsing==3.0.9 - - pysam==0.21.0 - - python-dateutil==2.8.2 - - pytz==2023.3 - - reportlab==4.0.4 - - scikit-learn==1.0.2 - - scipy==1.10.1 - - sympy==1.12 - - threadpoolctl==3.2.0 - - torch==2.0.1 - - triton==2.0.0 - - typing-extensions==4.7.1 - - tzdata==2023.3 - - zipp==3.16.2 diff --git a/BALSAMIC/containers/vcf2cytosure/Dockerfile b/BALSAMIC/containers/vcf2cytosure/Dockerfile index dc6f55f27..b446195dc 100644 --- a/BALSAMIC/containers/vcf2cytosure/Dockerfile +++ b/BALSAMIC/containers/vcf2cytosure/Dockerfile @@ -1,28 +1,55 @@ -FROM continuumio/miniconda3:4.10.3-alpine +FROM python:3.11.3-slim AS builder -LABEL base.image="continuumio/miniconda3:4.10.3-alpine" +ARG CONTAINER_NAME +ENV WORK_DIR /opt/ +ENV VENV ${WORK_DIR}/venv + +RUN apt-get update && \ + apt-get -y upgrade && \ + apt-get -y install -y --no-install-recommends git && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN python -m pip install --upgrade --no-cache-dir pip +RUN python -m venv ${VENV} +ENV PATH="${VENV}/bin:$PATH" + +RUN pip install --no-cache-dir \ +cyvcf2==0.30.22 \ +lxml==4.9.2 \ +pandas==2.0.1 + +RUN cd /opt &&\ + git clone https://github.com/NBISweden/vcf2cytosure.git && \ + cd /opt/${CONTAINER_NAME}/ && \ + git checkout v0.8 && \ + pip install --no-cache-dir -e . + +FROM python:3.11-slim + +LABEL base.image="python:3.11-slim" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" -LABEL about.maintainer="Ashwini Jeggari ashwini dot jeggari at scilifelab dot se" -LABEL about.description="Bioinformatic analysis pipeline for somatic mutations in cancer" +LABEL about.description="vcf2cytosure for BALSAMIC" -ARG CONTAINER_NAME -ARG WORK_DIR=project -ENV PATH="/opt/${CONTAINER_NAME}/bin:${PATH}" -ENV PATH="/opt/conda/bin/:${PATH}" -ENV PYTHONPATH="/opt/${CONTAINER_NAME}" +RUN apt-get update && \ + apt-get -y upgrade && \ + apt-get -y install -y --no-install-recommends tabix && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* -COPY . /${WORK_DIR} +ENV WORK_DIR /opt/ +ENV VENV ${WORK_DIR}/venv -RUN apk add --no-cache bash gcc git python3 gzip +COPY --from=builder ${WORK_DIR} ${WORK_DIR} -RUN cd /opt \ - && git clone https://github.com/NBISweden/vcf2cytosure.git \ - && cd /opt/${CONTAINER_NAME}/ \ - && git checkout v0.8 \ - && pip install --no-cache-dir . 
+ENV PATH="${VENV}/bin:$PATH" -RUN cd /${WORK_DIR}/BALSAMIC/containers/${CONTAINER_NAME}/ && /bin/sh ${CONTAINER_NAME}.sh ${CONTAINER_NAME} +RUN adduser --disabled-password --gecos '' ubuntu && \ +chsh -s /bin/bash && \ +mkdir -p /home/ubuntu -RUN rm -rf /${WORK_DIR:?} && conda clean --all --yes +USER ubuntu +WORKDIR /home/ubuntu +CMD ["/bin/bash"] diff --git a/BALSAMIC/containers/vcf2cytosure/vcf2cytosure.sh b/BALSAMIC/containers/vcf2cytosure/vcf2cytosure.sh deleted file mode 100644 index 0270699d1..000000000 --- a/BALSAMIC/containers/vcf2cytosure/vcf2cytosure.sh +++ /dev/null @@ -1 +0,0 @@ -conda install -n base -c bioconda tabix=1.11 diff --git a/BALSAMIC/models/__init__.py b/BALSAMIC/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/BALSAMIC/models/cache.py b/BALSAMIC/models/cache.py new file mode 100644 index 000000000..137806b5e --- /dev/null +++ b/BALSAMIC/models/cache.py @@ -0,0 +1,457 @@ +"""Balsamic reference cache models.""" +import logging +from pathlib import Path +from typing import Dict, Optional, List, Union + +from pydantic import ( + BaseModel, + AnyUrl, + DirectoryPath, + FilePath, + field_validator, + ValidationInfo, +) + +from BALSAMIC.constants.cache import GenomeVersion, GRCHVersion +from BALSAMIC.constants.constants import FileType, BwaIndexFileType +from BALSAMIC.utils.exc import BalsamicError + +LOG = logging.getLogger(__name__) + + +class ReferenceUrl(BaseModel): + """ + Reference model handling URLs and destination paths. + + Attributes: + url (AnyUrl) : Address of the reference to download. + file_type (FileType) : Reference file type. + gzip (bool) : Compression status. + file_name (str) : Reference file name after being downloaded. + dir_name (str) : Destination directory of the downloaded file. + file_path (str, optional) : Full reference file path. + secret (str) : Database key. + """ + + url: AnyUrl + file_type: FileType + gzip: bool + file_name: str + dir_name: str + file_path: Optional[str] = None + secret: Optional[str] = None + + +class References(BaseModel): + """ + Reference files model. + + Attributes: + genome_chrom_size (ReferenceUrl) : Genome chromosome sizes. + reference_genome (ReferenceUrl) : Required field for reference genome FASTA file. + refgene_sql (ReferenceUrl) : RefSeq's gene SQL format from UCSC. + refgene_txt (ReferenceUrl) : RefSeq's gene flat format from UCSC. 
+ """ + + genome_chrom_size: ReferenceUrl + reference_genome: ReferenceUrl + refgene_sql: ReferenceUrl + refgene_txt: ReferenceUrl + + def get_reference_genome_file_paths(self) -> List[str]: + """Return output reference genome files.""" + return [ + self.reference_genome.file_path, + f"{self.reference_genome.file_path}.{FileType.FAI}", + self.reference_genome.file_path.replace(FileType.FASTA, FileType.DICT), + ] + self.get_reference_genome_bwa_index_file_paths() + + def get_reference_genome_bwa_index_file_paths(self) -> List[str]: + """Return output BWA reference genome index files.""" + return [ + f"{self.reference_genome.file_path}.{bwa_index}" + for bwa_index in BwaIndexFileType + ] + + def get_refgene_file_paths(self) -> List[str]: + """Return RefSeq's gene files from UCSC.""" + return [ + self.refgene_txt.file_path, + self.get_refgene_flat_file_path(), + self.get_refgene_bed_file_path(), + ] + + def get_refgene_flat_file_path(self) -> str: + """Return RefSeq's gene flat file from UCSC.""" + return self.refgene_txt.file_path.replace(FileType.TXT, FileType.FLAT) + + def get_refgene_bed_file_path(self) -> str: + """Return RefSeq's gene BED file from UCSC.""" + return f"{self.get_refgene_flat_file_path()}.{FileType.BED}" + + +class ReferencesCanFam(References): + """Canine reference genome files model.""" + + +class ReferencesHg(References): + """ + Human reference genome files model. + + Attributes: + access_regions (ReferenceUrl) : Accessible genome regions. + ascat_chr_y_loci (ReferenceUrl) : Chromosome Y loci. + ascat_gc_correction (ReferenceUrl) : Genome GC correction bins. + cadd_snv (ReferenceUrl) : CADD SNV annotation file. + simple_repeat (ReferenceUrl) : Simple repeats + clinvar (ReferenceUrl) : ClinVar reference. + cosmic (ReferenceUrl) : COSMIC database's variants as VCF. + dbsnp (ReferenceUrl) : dbSNP VCF file. + delly_exclusion (ReferenceUrl) : Genome exclusion regions. + delly_mappability (ReferenceUrl) : Genome mappability. + delly_mappability_findex (ReferenceUrl) : Genome mappability fasta index. + delly_mappability_gindex (ReferenceUrl) : Genome mappability index. + gnomad_variant (ReferenceUrl) : gnomAD variants (non SV) as VCF. + gnomad_variant_index (ReferenceUrl) : gnomAD variants VCF index. + hc_vcf_1kg (ReferenceUrl) : High confidence 1000 Genome VCF. + known_indel_1kg (ReferenceUrl) : 1000 Genome known InDels VCF. + mills_1kg (ReferenceUrl) : Mills' high confidence InDels VCF. + rank_score (ReferenceUrl) : Rank score model. + somalier_sites (ReferenceUrl) : Somalier sites VCF. + vcf_1kg (ReferenceUrl) : 1000 Genome all SNPs. + wgs_calling_regions (ReferenceUrl) : WGS calling intervals. 
+ """ + + access_regions: ReferenceUrl + ascat_chr_y_loci: ReferenceUrl + ascat_gc_correction: ReferenceUrl + cadd_snv: ReferenceUrl + simple_repeat: ReferenceUrl + clinvar: ReferenceUrl + cosmic: ReferenceUrl + dbsnp: ReferenceUrl + delly_exclusion: ReferenceUrl + delly_mappability: ReferenceUrl + delly_mappability_findex: ReferenceUrl + delly_mappability_gindex: ReferenceUrl + gnomad_variant: ReferenceUrl + gnomad_variant_index: ReferenceUrl + hc_vcf_1kg: ReferenceUrl + known_indel_1kg: ReferenceUrl + mills_1kg: ReferenceUrl + rank_score: ReferenceUrl + somalier_sites: ReferenceUrl + vcf_1kg: ReferenceUrl + wgs_calling_regions: ReferenceUrl + + def get_cadd_snv_file_paths(self) -> List[str]: + """Return CADD SNV reference output files.""" + return [self.cadd_snv.file_path, f"{self.cadd_snv.file_path}.{FileType.TBI}"] + + def get_delly_file_paths(self) -> List[str]: + """Return Delly associated output files.""" + return [ + self.delly_exclusion.file_path, + self.get_delly_exclusion_converted_file_path(), + self.delly_mappability.file_path, + self.delly_mappability_findex.file_path, + self.delly_mappability_gindex.file_path, + ] + + def get_delly_exclusion_converted_file_path(self) -> str: + """Return path to Delly exclusion converted file.""" + return self.delly_exclusion.file_path.replace( + f".{FileType.TSV}", f"_converted.{FileType.TSV}" + ) + + def get_gnomad_file_paths(self) -> List[str]: + """Return gnomAD associated output files.""" + return [self.gnomad_variant.file_path, self.gnomad_variant_index.file_path] + + def get_1k_genome_file_paths(self) -> List[str]: + """Return 1000 Genome related files.""" + return [ + f"{self.known_indel_1kg.file_path}.{FileType.GZ}", + f"{self.mills_1kg.file_path}.{FileType.GZ}", + f"{self.hc_vcf_1kg.file_path}.{FileType.GZ}", + f"{self.vcf_1kg.file_path}.{FileType.GZ}", + ] + + +class AnalysisReferences(BaseModel): + """ + Reference files pytest t for a general Balsamic analysis. + + Attributes: + genome_chrom_size (FilePath) : Genome chromosome sizes. + reference_genome (FilePath) : Required field for reference genome FASTA file. + refgene_bed (FilePath) : RefSeq's gene BED format from UCSC. + refgene_flat (FilePath) : RefSeq's gene flat format from UCSC. + refgene_txt (FilePath) : RefSeq's gene txt format from UCSC. + """ + + genome_chrom_size: FilePath + reference_genome: FilePath + refgene_bed: FilePath + refgene_flat: FilePath + refgene_txt: FilePath + + +class AnalysisReferencesCanFam(AnalysisReferences): + """Canine reference genome files model.""" + + +class AnalysisReferencesHg(AnalysisReferences): + """ + Human reference genome files model. + + Attributes: + access_regions (FilePath) : Accessible genome regions. + ascat_chr_y_loci (FilePath) : Chromosome Y loci. + ascat_gc_correction (FilePath) : Genome GC correction bins. + cadd_snv (FilePath) : CADD SNV annotation file. + simple_repeat (FilePath) : Simple repeats. + clinvar (FilePath) : ClinVar reference. + cosmic (FilePath) : COSMIC database's variants as VCF. + dbsnp (FilePath) : dbSNP VCF file. + delly_exclusion (FilePath) : Genome exclusion regions. + delly_exclusion_converted (FilePath) : Genome exclusion regions without "chr" field. + delly_mappability (FilePath) : Genome mappability. + gnomad_variant (FilePath) : gnomAD variants (non SV) as VCF. + hc_vcf_1kg (FilePath) : High confidence 1000 Genome VCF. + known_indel_1kg (FilePath) : 1000 Genome known InDels VCF. + mills_1kg (FilePath) : Mills' high confidence InDels VCF. + rank_score (FilePath) : Rank score model. 
+ somalier_sites (FilePath) : Somalier sites VCF. + vcf_1kg (FilePath) : 1000 Genome all SNPs. + vep_dir (DirectoryPath) : VEP annotations output directory. + wgs_calling_regions (FilePath) : WGS calling intervals. + """ + + access_regions: FilePath + ascat_chr_y_loci: FilePath + ascat_gc_correction: FilePath + cadd_snv: FilePath + simple_repeat: FilePath + clinvar: FilePath + cosmic: FilePath + dbsnp: FilePath + delly_exclusion: FilePath + delly_exclusion_converted: FilePath + delly_mappability: FilePath + gnomad_variant: FilePath + hc_vcf_1kg: FilePath + known_indel_1kg: FilePath + mills_1kg: FilePath + rank_score: FilePath + somalier_sites: FilePath + vcf_1kg: FilePath + vep_dir: DirectoryPath + wgs_calling_regions: FilePath + + +class CacheAnalysis(BaseModel): + """ + Reference analysis configuration model. + + + Attributes: + case_id (str) : Reference case identifier. + """ + + case_id: str + + +class CacheConfig(BaseModel): + """ + Reference build configuration model. + + Attributes: + analysis (CacheAnalysis) : Reference analysis model. + references_dir (DirectoryPath) : Output directory for the downloaded reference. + containers_dir (Path) : Output directory for the downloaded singularity containers. + genome_dir (Path) : Genome references output directory. + variants_dir (Path) : Variant references output directory. + vep_dir (Path) : VEP annotations output directory. + genome_version (GenomeVersion) : Genome version associated with the balsamic cache. + cosmic_key (str, optional) : COSMIC database key. + bioinfo_tools (dict) : Dictionary of bioinformatics software and containers. + containers (Dict[str, str]) : Dictionary linking container names and dockerhub images. + references (Union[ReferencesHg, ReferencesCanFam]) : Reference files model. + references_date (str) : Reference access date. 
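The FilePath fields in the analysis reference models above mean that instantiation doubles as an on-disk check: pydantic rejects any value that does not point to an existing file. A self-contained sketch of that behaviour:

```python
# Sketch: pydantic v2 FilePath validation, the mechanism behind AnalysisReferences.
from pydantic import BaseModel, FilePath, ValidationError

class Demo(BaseModel):
    reference_genome: FilePath

try:
    Demo(reference_genome="/no/such/genome.fasta")  # hypothetical missing file
except ValidationError as error:
    print(error)  # reports that the path does not point to a file
```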
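The validate_references validator defined further down assembles each reference's file_path from the cache layout; the net effect, with hypothetical values, is a three-part path join (only COSMIC-like keys additionally receive the cosmic_key as their secret):

```python
# Sketch of the path assembly performed by CacheConfig.validate_references.
from pathlib import Path

references_dir = "/home/balsamic_cache/hg19"  # hypothetical references_dir
dir_name, file_name = "variants", "cosmic_v97.vcf"  # hypothetical reference entry

file_path = Path(references_dir, dir_name, file_name).as_posix()
assert file_path == "/home/balsamic_cache/hg19/variants/cosmic_v97.vcf"
```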
+ """ + + analysis: CacheAnalysis + references_dir: DirectoryPath + containers_dir: Path + genome_dir: Path + variants_dir: Path + vep_dir: Path + genome_version: GenomeVersion + cosmic_key: Optional[str] = None + bioinfo_tools: dict + containers: Dict[str, str] + references: Union[ReferencesHg, ReferencesCanFam] + references_date: str + + @field_validator("references") + def validate_references( + cls, references: References, info: ValidationInfo + ) -> References: + """Validate the reference output paths.""" + for model in references: + reference_key: str + reference: ReferenceUrl + reference_key, reference = model[0], model[1] + reference.file_path = ( + Path( + info.data.get("references_dir"), + reference.dir_name, + reference.file_name, + ).as_posix() + if reference + else None + ) + reference.secret = ( + info.data.get("cosmic_key") if "cosmic" in reference_key else None + ) + return references + + def get_grch_version(self) -> Optional[GRCHVersion]: + """Return GRCH format version of the genome version if exists.""" + version: Dict[GenomeVersion, GRCHVersion] = { + GenomeVersion.HG19: GRCHVersion.GRCH37, + GenomeVersion.HG38: GRCHVersion.GRCH38, + } + return version.get(self.genome_version) + + def get_reference_file_paths(self) -> List[str]: + """Return a list of reference paths.""" + return [ + Path(reference[1].file_path).as_posix() for reference in self.references + ] + + def get_reference_by_path(self, reference_path: str) -> ReferenceUrl: + """Return a reference given its full path.""" + for model in self.references: + reference: ReferenceUrl = model[1] + if reference.file_path == reference_path: + return reference + LOG.error(f"No reference with the provided reference path {reference_path}") + raise BalsamicError() + + def get_reference_file_paths_by_file_type_and_compression( + self, file_type: FileType, compression: bool + ) -> List[str]: + """Return a list of reference paths given a file type and a compression status.""" + file_type_references: List[str] = self.get_reference_file_paths_by_file_type( + file_type=file_type + ) + compression_references: List[ + str + ] = self.get_reference_file_paths_by_compression(compression=compression) + return list(set(file_type_references).intersection(compression_references)) + + def get_reference_file_paths_by_file_type(self, file_type: FileType) -> List[str]: + """Return a list of reference paths given a file type.""" + return [ + reference[1].file_path + for reference in self.references + if reference[1].file_type == file_type + ] + + def get_reference_file_paths_by_compression(self, compression: bool) -> List[str]: + """Return a list of reference paths given a compression status.""" + return [ + reference[1].file_path + for reference in self.references + if reference[1].gzip == compression + ] + + def get_compressed_indexed_vcf_paths(self) -> List[str]: + """Return an output list of compressed and indexed VCFs.""" + return [ + f"{vcf}.{FileType.GZ}.{FileType.TBI}" + for vcf in self.get_reference_file_paths_by_file_type_and_compression( + file_type=FileType.VCF, compression=True + ) + ] + + def get_container_output_paths(self) -> List[str]: + """Return a complete list of output singularity images.""" + return [ + Path(self.containers_dir, f"{image}.{FileType.SIF}").as_posix() + for image in self.containers.keys() + ] + + def get_reference_output_paths(self) -> List[str]: + """Return a complete list of output reference paths.""" + reference_paths: List[str] = [ + self.references.genome_chrom_size.file_path, + 
*self.references.get_reference_genome_file_paths(), + *self.references.get_refgene_file_paths(), + ] + if self.genome_version == GenomeVersion.CanFam3: + return reference_paths + + reference_paths += [ + self.references.access_regions.file_path, + self.references.ascat_chr_y_loci.file_path, + self.references.ascat_gc_correction.file_path, + self.references.cadd_snv.file_path, + self.references.simple_repeat.file_path, + f"{self.references.clinvar.file_path}.{FileType.GZ}", + f"{self.references.cosmic.file_path}.{FileType.GZ}", + f"{self.references.dbsnp.file_path}.{FileType.GZ}", + self.references.rank_score.file_path, + f"{self.references.somalier_sites.file_path}.{FileType.GZ}", + self.references.wgs_calling_regions.file_path, + *self.get_compressed_indexed_vcf_paths(), + *self.references.get_1k_genome_file_paths(), + *self.references.get_cadd_snv_file_paths(), + *self.references.get_delly_file_paths(), + *self.references.get_gnomad_file_paths(), + self.vep_dir.as_posix(), + ] + return reference_paths + + def get_analysis_references( + self, + ) -> Union[AnalysisReferencesHg, AnalysisReferencesCanFam]: + """Return reference output model for Balsamic analyses.""" + if self.genome_version == GenomeVersion.CanFam3: + return AnalysisReferencesCanFam( + genome_chrom_size=self.references.genome_chrom_size.file_path, + reference_genome=self.references.reference_genome.file_path, + refgene_bed=self.references.get_refgene_bed_file_path(), + refgene_flat=self.references.get_refgene_flat_file_path(), + refgene_txt=self.references.refgene_txt.file_path, + ) + + return AnalysisReferencesHg( + access_regions=self.references.access_regions.file_path, + ascat_chr_y_loci=self.references.ascat_chr_y_loci.file_path, + ascat_gc_correction=self.references.ascat_gc_correction.file_path, + cadd_snv=self.references.cadd_snv.file_path, + simple_repeat=self.references.simple_repeat.file_path, + clinvar=f"{self.references.clinvar.file_path}.{FileType.GZ}", + cosmic=f"{self.references.cosmic.file_path}.{FileType.GZ}", + dbsnp=f"{self.references.dbsnp.file_path}.{FileType.GZ}", + delly_exclusion=self.references.delly_exclusion.file_path, + delly_exclusion_converted=self.references.get_delly_exclusion_converted_file_path(), + delly_mappability=self.references.delly_mappability.file_path, + genome_chrom_size=self.references.genome_chrom_size.file_path, + gnomad_variant=self.references.gnomad_variant.file_path, + hc_vcf_1kg=f"{self.references.hc_vcf_1kg.file_path}.{FileType.GZ}", + known_indel_1kg=f"{self.references.known_indel_1kg.file_path}.{FileType.GZ}", + mills_1kg=f"{self.references.mills_1kg.file_path}.{FileType.GZ}", + rank_score=self.references.rank_score.file_path, + reference_genome=self.references.reference_genome.file_path, + refgene_bed=self.references.get_refgene_bed_file_path(), + refgene_flat=self.references.get_refgene_flat_file_path(), + refgene_txt=self.references.refgene_txt.file_path, + somalier_sites=f"{self.references.somalier_sites.file_path}.{FileType.GZ}", + vcf_1kg=f"{self.references.vcf_1kg.file_path}.{FileType.GZ}", + vep_dir=self.vep_dir.as_posix(), + wgs_calling_regions=self.references.wgs_calling_regions.file_path, + ) diff --git a/BALSAMIC/models/config.py b/BALSAMIC/models/config.py new file mode 100644 index 000000000..7df595883 --- /dev/null +++ b/BALSAMIC/models/config.py @@ -0,0 +1,427 @@ +"""Balsamic analysis config case models.""" +import re +from glob import glob +from pathlib import Path +from typing import Annotated, Dict, List, Optional + +from pydantic import 
AfterValidator, BaseModel, field_validator, model_validator + +from BALSAMIC import __version__ as balsamic_version +from BALSAMIC.constants.analysis import ( + AnalysisType, + AnalysisWorkflow, + FastqName, + Gender, + MutationOrigin, + MutationType, + PONWorkflow, + SampleType, + SequencingType, + WorkflowSolution, +) +from BALSAMIC.models.params import QCModel +from BALSAMIC.models.validators import is_dir, is_file + + +class FastqInfoModel(BaseModel): + """Holds filepaths for forward and reverse reads for a fastq_pattern.""" + + fwd: Annotated[str, AfterValidator(is_file)] + rev: Annotated[str, AfterValidator(is_file)] + fwd_resolved: Annotated[Optional[str], AfterValidator(is_file)] = None + rev_resolved: Annotated[Optional[str], AfterValidator(is_file)] = None + + +class SampleInstanceModel(BaseModel): + """Holds attributes for samples used in analysis. + + Attributes: + type: Field(str): sample type [tumor, normal] + name: Field(str): sample name + fastq_info: Field(dict): fastq patterns: paths to forward and reverse fastqs + """ + + type: SampleType + name: str + fastq_info: Dict[str, FastqInfoModel] + + +class PanelModel(BaseModel): + """Holds attributes of PANEL BED file if provided + Attributes: + capture_kit : Field(str(Path)); string representation of path to PANEL BED file + chrom : Field(list(str)); list of chromosomes in PANEL BED + pon_cnn: Field(optional); Path where PON reference .cnn file is stored + + Raises: + ValueError: + When capture_kit argument is set, but is not a valid path + + """ + + capture_kit: Annotated[Optional[str], AfterValidator(is_file)] = None + chrom: Optional[List[str]] = None + pon_cnn: Annotated[Optional[str], AfterValidator(is_file)] = None + + +class VarcallerAttribute(BaseModel): + """Holds variables for variant caller software + Attributes: + mutation: str of mutation class + mutation_type: str of mutation type + analysis_type: list of str for analysis types + workflow_solution: list of str for workflows + sequencing_type: list of str for workflows + + Raises: + ValueError: + When a variable other than [somatic, germline] is passed in mutation field + When a variable other than [SNV, CNV, SV] is passed in mutation_type field + + """ + + mutation: MutationOrigin + mutation_type: MutationType + analysis_type: Optional[List[AnalysisType]] = None + sequencing_type: Optional[List[SequencingType]] = None + workflow_solution: Optional[List[WorkflowSolution]] = None + + +class VCFModel(BaseModel): + """Contains VCF config""" + + vardict: VarcallerAttribute + tnscope: VarcallerAttribute + dnascope: VarcallerAttribute + tnscope_umi: VarcallerAttribute + manta_germline: VarcallerAttribute + manta: VarcallerAttribute + dellysv: VarcallerAttribute + cnvkit: VarcallerAttribute + ascat: VarcallerAttribute + dellycnv: VarcallerAttribute + tiddit: VarcallerAttribute + cnvpytor: VarcallerAttribute + svdb: VarcallerAttribute + + +class AnalysisModel(BaseModel): + """Pydantic model containing workflow variables + + Attributes: + + case_id : Field(required); string case identifier + gender: Field(required); string case gender + analysis_type : Field(required); string literal [single, paired, pon] + single : if only tumor samples are provided + paired : if both tumor and normal samples are provided + pon : panel of normal analysis + sequencing_type : Field(required); string literal [targeted, wgs] + targeted : if capture kit was used to enrich specific genomic regions + wgs : if whole genome sequencing was performed + analysis_workflow: Field(required); string 
literal [balsamic, balsamic-qc, balsamic-umi]
+            balsamic: execute balsamic workflow
+            balsamic-qc: execute balsamic qc-only workflow
+            balsamic-umi: execute balsamic along with UMI workflow for panels
+        analysis_dir : Field(required); existing path where to save files
+        fastq_path : Field(optional); Path where fastq files will be stored
+        script : Field(optional); Path where snakemake scripts will be stored
+        log : Field(optional); Path where logs will be saved
+        result : Field(optional); Path where BALSAMIC output will be stored
+        benchmark : Field(optional); Path where benchmark report will be stored
+        dag : Field(optional); Path where DAG graph of workflow will be stored
+        BALSAMIC_version : Field(optional); Current version of BALSAMIC
+        config_creation_date : Field(optional); Timestamp when config was created
+
+    Raises:
+        ValueError:
+            When gender is set to any other than [female, male]
+            When analysis_type is set to any value other than [single, paired, pon]
+            When sequencing_type is set to any value other than [wgs, targeted]
+            When analysis_workflow is set to any other than [balsamic, balsamic-qc, balsamic-umi]
+    """
+
+    case_id: str
+    analysis_type: AnalysisType
+    gender: Optional[Gender] = None
+    sequencing_type: SequencingType
+    analysis_workflow: AnalysisWorkflow
+    analysis_dir: Annotated[str, AfterValidator(is_dir)]
+    fastq_path: Annotated[str, AfterValidator(is_dir)]
+    log: Annotated[str, AfterValidator(is_dir)]
+    script: Annotated[str, AfterValidator(is_dir)]
+    result: Annotated[str, AfterValidator(is_dir)]
+    benchmark: Annotated[str, AfterValidator(is_dir)]
+    dag: str
+    BALSAMIC_version: str = balsamic_version
+    config_creation_date: str
+    pon_version: Optional[str] = None
+    pon_workflow: Optional[PONWorkflow] = None
+
+    @field_validator("pon_version")
+    def validate_pon_version(cls, pon_version: Optional[str]):
+        """Checks that the PON version matches the following syntax: v<integer>."""
+        if pon_version and not re.fullmatch(r"^v[1-9]\d*$", pon_version):
+            raise ValueError(
+                f"The provided PON version ({pon_version}) does not follow the defined syntax (v<integer>)"
+            )
+        return pon_version
+
+
+class ConfigModel(BaseModel):
+    """
+    Class providing common functions and variables for different balsamic workflows.
+
+    Attributes:
+        QC : Field(QCmodel); variables relevant for fastq preprocessing and QC
+        samples : Field(List[SampleInstanceModel]); List containing samples submitted for analysis
+        reference : Field(Dict); dictionary containing paths to reference genome files
+        panel : Field(PanelModel(optional)); variables relevant to PANEL BED if capture kit is used
+        bioinfo_tools : Field(dict); dictionary of bioinformatics software and which conda/container they are in
+        bioinfo_tools_version : Field(dict); dictionary of bioinformatics software and their versions used for the analysis
+        singularity : Field(Dict); path to singularity container of BALSAMIC
+        vcf : Field(VCFmodel); variables relevant for variant calling pipeline
+        background_variants: Field(Path(optional)); path to BACKGROUND VARIANTS for UMI
+        analysis: Field(AnalysisModel); Pydantic model containing workflow variables
+
+    This class also contains functions that help retrieve sample and file information,
+    facilitating BALSAMIC run operations in Snakemake.
+
+    Functions:
+        - get_all_sample_names: Return all sample names in the analysis.
+        - get_fastq_patterns_by_sample: Return all fastq patterns for given samples.
+        - get_all_fastqs_for_sample: Return all fastqs for a sample.
+ - get_fastq_by_fastq_pattern: Return fastq file path for requested fastq pattern and type. + - get_sample_name_by_type: Return sample name for requested sample type. + - get_sample_type_by_name: Return sample type for requested sample name. + - get_bam_name_per_lane: Return list of bam file names for all fastq patterns of a sample. + - get_final_bam_name: Return final bam name for downstream analysis. + """ + + QC: QCModel + samples: List[SampleInstanceModel] + reference: Dict[str, Path] + singularity: Dict[str, str] + bioinfo_tools: Dict + bioinfo_tools_version: Dict + panel: Optional[PanelModel] = None + vcf: Optional[VCFModel] = None + background_variants: Optional[str] = None + analysis: AnalysisModel + + @field_validator("reference") + def abspath_as_str(cls, reference: Dict[str, Path]): + for k, v in reference.items(): + reference[k] = Path(v).resolve().as_posix() + return reference + + @field_validator("singularity") + def transform_path_to_dict(cls, singularity: Dict[str, str]): + for k, v in singularity.items(): + singularity[k] = Path(v).resolve().as_posix() + return singularity + + @field_validator("background_variants") + def background_variants_abspath_as_str(cls, background_variants: str): + """Converts FilePath to string.""" + if background_variants: + return Path(background_variants).resolve().as_posix() + return None + + @field_validator("samples") + def no_duplicate_fastq_patterns(cls, samples): + """Validate that no duplicate fastq patterns have been assigned in dict.""" + fastq_pattern_counts = {} + + # Count Fastq pattern occurrence + for sample in samples: + for fastq_pattern in sample.fastq_info.keys(): + if fastq_pattern not in fastq_pattern_counts: + fastq_pattern_counts[fastq_pattern] = 1 + else: + fastq_pattern_counts[fastq_pattern] += 1 + + # Look for duplicates + duplicates = [] + for fastq_pattern in fastq_pattern_counts: + if fastq_pattern_counts[fastq_pattern] > 1: + duplicates.append(fastq_pattern) + + if duplicates: + raise ValueError( + f"Duplicate FastqPattern(s) found: {', '.join(duplicates)} across multiple samples" + ) + + return samples + + @model_validator(mode="before") + def no_unassigned_fastqs_in_fastq_dir(cls, values): + """All fastq files in the supplied fastq-dir must have been assigned to the sample-dict.""" + + def get_all_fwd_rev_values(samples) -> List[str]: + # Return all fastq files in analysis + fwd_rev_values = [] + for sample in samples: + for fastq_pattern in sample["fastq_info"]: + fwd_rev_values.append( + sample["fastq_info"][fastq_pattern][FastqName.FWD] + ) + fwd_rev_values.append( + sample["fastq_info"][fastq_pattern][FastqName.REV] + ) + return fwd_rev_values + + fastq_path = values["analysis"]["fastq_path"] + + # Get a set of all fastq files in fastq-directory + fastqs_in_fastq_path = set(glob(f"{fastq_path}/*fastq.gz")) + + # Look for fastqs in sample dict + fastqs_assigned = set(get_all_fwd_rev_values(values["samples"])) + + unassigned_fastqs = fastqs_in_fastq_path - fastqs_assigned + if unassigned_fastqs: + raise ValueError( + f"Fastqs in fastq-dir not assigned to sample config: {unassigned_fastqs}" + ) + + return values + + def get_all_sample_names(self) -> List[str]: + """Return all sample names in the analysis.""" + return [sample.name for sample in self.samples] + + def get_fastq_patterns_by_sample(self, sample_names: List[str]) -> List[str]: + """Return all fastq_patterns for a given sample.""" + return [ + fastq_pattern + for sample in self.samples + if sample.name in sample_names + for fastq_pattern in 
sample.fastq_info.keys() + ] + + def get_all_fastqs_for_sample( + self, sample_name: str, fastq_types: Optional[List[FastqName]] = None + ) -> List[str]: + """Return all fastqs (optionally only [fwd/rev]) involved in analysis of sample.""" + + if fastq_types is None: + fastq_types = [FastqName.FWD, FastqName.REV] + + fastq_list: List = [] + for sample in self.samples: + if sample.name == sample_name: + for fastq_info in sample.fastq_info.values(): + if FastqName.FWD in fastq_types: + fastq_list.append(fastq_info.fwd) + if FastqName.REV in fastq_types: + fastq_list.append(fastq_info.rev) + return fastq_list + + def get_all_fastq_names(self, remove_suffix: bool = False) -> List[str]: + """Return all fastq_names involved in analysis, optionally remove fastq.gz suffix.""" + fastq_names = [] + for sample in self.samples: + for fastq_pattern, fastqs in sample.fastq_info.items(): + if remove_suffix: + fastq_names.extend( + [ + Path(fastqs.fwd).name.replace(".fastq.gz", ""), + Path(fastqs.rev).name.replace(".fastq.gz", ""), + ] + ) + else: + fastq_names.extend( + [ + Path(fastqs.fwd).name, + Path(fastqs.rev).name, + ] + ) + return fastq_names + + def get_fastq_by_fastq_pattern( + self, fastq_pattern: str, fastq_type: FastqName + ) -> str: + """Return fastq file path for requested fastq pair pattern and fastq type: [fwd/rev].""" + for sample in self.samples: + if fastq_pattern in sample.fastq_info: + return ( + sample.fastq_info[fastq_pattern].fwd + if fastq_type == FastqName.FWD + else sample.fastq_info[fastq_pattern].rev + ) + + def get_sample_name_by_type(self, sample_type: str) -> str: + """Return sample name for requested sample type.""" + for sample in self.samples: + if sample.type == sample_type: + return sample.name + + def get_sample_type_by_name(self, sample_name: str, uppercase: bool = False) -> str: + """Return sample type for requested sample name, optionally return it capitalized""" + for sample in self.samples: + if sample.name == sample_name: + return sample.type.upper() if uppercase else sample.type + + def get_bam_name_per_lane(self, bam_dir: str, sample_name: str) -> List[str]: + """Return list of bam-file names for all fastq_patterns of a sample.""" + bam_names = [] + for sample in self.samples: + if sample.name == sample_name: + bam_names.extend( + [ + f"{bam_dir}{sample_name}_align_sort_{fastq_pattern}.bam" + for fastq_pattern in sample.fastq_info + ] + ) + return bam_names + + def get_final_bam_name( + self, bam_dir: str, sample_name: str = None, sample_type: str = None + ) -> str: + """Return final bam name to be used in downstream analysis.""" + + if not sample_name and not sample_type: + raise ValueError( + "Either sample_name or sample_type must be provided to get the final bam name." 
+ ) + + sample_name = ( + self.get_sample_name_by_type(sample_type) + if not sample_name + else sample_name + ) + + sample_type = ( + self.get_sample_type_by_name(sample_name) + if not sample_type + else sample_type + ) + + if self.analysis.analysis_type == AnalysisType.PON: + # Only dedup is necessary for panel of normals + final_bam_suffix = "dedup" + elif self.analysis.sequencing_type == SequencingType.TARGETED: + # Only dedup is necessary for TGA + final_bam_suffix = "dedup_sorted" + else: + # For WGS the bamfiles are realigned + final_bam_suffix = "dedup.realign" + + return f"{bam_dir}{sample_type}.{sample_name}.{final_bam_suffix}.bam" + + def get_cnv_report_plots(self) -> List[str]: + """Return a list of AscatNgs CNV plot files.""" + if self.analysis.analysis_type == AnalysisType.SINGLE: + return [ + f"CNV.somatic.{self.analysis.case_id}.cnvpytor.circular.png", + f"CNV.somatic.{self.analysis.case_id}.cnvpytor.scatter.png", + ] + return [ + f"CNV.somatic.{self.analysis.case_id}.ascat.ascatprofile.png", + f"CNV.somatic.{self.analysis.case_id}.ascat.rawprofile.png", + f"CNV.somatic.{self.analysis.case_id}.ascat.ASPCF.png", + f"CNV.somatic.{self.analysis.case_id}.ascat.tumor.png", + f"CNV.somatic.{self.analysis.case_id}.ascat.germline.png", + f"CNV.somatic.{self.analysis.case_id}.ascat.sunrise.png", + ] diff --git a/BALSAMIC/models/metrics.py b/BALSAMIC/models/metrics.py new file mode 100644 index 000000000..55f4877a1 --- /dev/null +++ b/BALSAMIC/models/metrics.py @@ -0,0 +1,66 @@ +"""QC validation metrics model.""" +import logging +from typing import Optional, Any, List, Annotated + +from pydantic import BaseModel, AfterValidator + +from BALSAMIC.constants.metrics import VALID_OPS + +LOG = logging.getLogger(__name__) + + +class MetricCondition(BaseModel): + """Defines the metric condition model. + + Attributes: + norm (string, optional) : Validation condition. + threshold (float, optional) : Validation cut off. + """ + + norm: Optional[str] = None + threshold: Optional[float] = None + + +class Metric(BaseModel): + """Defines the metric attributes model. + + Attributes: + header (str, optional) : Data. + id (str, required) : Unique sample identifier (sample_id, case_id or project_id). + input (str, required) : Input file. + name (str, required) : Metric name. + step (str, required) : Step that generated the metric. + value (Any, required) : Metric value. + condition (MetricCondition, required) : Metric validation condition. + """ + + header: Optional[str] = None + id: str + input: str + name: str + step: str + value: Any + condition: Optional[MetricCondition] + + +def validate_metric(metric: Metric): + """Checks if a metric meets its filtering condition.""" + if metric.condition and not VALID_OPS[metric.condition.norm]( + metric.value, metric.condition.threshold + ): + raise ValueError( + f"QC metric {metric.name}: {metric.value} validation has failed. " + f"(Condition: {metric.condition.norm} {metric.condition.threshold}, ID: {metric.id})." + ) + LOG.info(f"QC metric {metric.name}: {metric.value} meets its condition.") + return metric + + +class MetricValidation(BaseModel): + """Defines the metric validation model. + + Attributes: + metrics (List[Metric], required) : Metric model to validate. 
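validate_metric above resolves condition.norm through VALID_OPS, which is imported from the constants module and not shown in this diff; the sketch below assumes it maps comparison names onto functions from the operator module:

```python
# Sketch of the comparison performed by validate_metric (VALID_OPS assumed).
import operator

VALID_OPS = {"lt": operator.lt, "le": operator.le, "gt": operator.gt, "ge": operator.ge}

value, norm, threshold = 250.0, "ge", 100.0  # hypothetical coverage metric
if not VALID_OPS[norm](value, threshold):
    raise ValueError(f"QC metric validation failed: {value} is not {norm} {threshold}")
```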
+ """ + + metrics: List[Annotated[Metric, AfterValidator(validate_metric)]] diff --git a/BALSAMIC/models/params.py b/BALSAMIC/models/params.py new file mode 100644 index 000000000..e79d5d0ad --- /dev/null +++ b/BALSAMIC/models/params.py @@ -0,0 +1,230 @@ +"""Balsamic analysis parameters models.""" +from typing import Optional + +from pydantic import BaseModel, ConfigDict + + +class ParamsCommon(BaseModel): + """This class defines the common params settings used as constants across various rules in balsamic workflow. + + Attributes: + pcr_model: str (required). PCR indel model used to weed out false positive indels. Eg: none- PCR free samples. + align_header: str (required); header line appended to the aligned BAM output + min_mapq: int (required); minimum mapping quality score. Eg: 20- probability of mapping random read at 99% accuracy + picard_fixmate: str (required), fix read mate information in bam file + picard_RG_normal: str (required); replace readgroups in normal bam file + picard_RG_tumor: str (required); replace readgroups in tumor bam file + """ + + align_header: str + pcr_model: str + min_mapq: int + picard_fixmate: str + picard_RG_normal: str + picard_RG_tumor: str + + +class ParamsVardict(BaseModel): + """This class defines the params settings used as constants in vardict rule. + + Attributes: + allelic_frequency: float (required); minimum allelic frequency to detect + max_pval: float (required); the maximum p-value. Vardict default: 0.05 + max_mm: float (required); the maximum mean mismatches allowed. Vardict default: 5.25 + column_info: str (required); set of vardict filters for passing final variants + """ + + allelic_frequency: float + max_pval: float + max_mm: float + column_info: str + + +class ParamsVEP(BaseModel): + """This class defines the params settings used as constants in vep rule. + + Attributes: + vep_filters: str (required); set of choosen options for processing vep annotated vcf file + """ + + vep_filters: str + + +class QCModel(BaseModel): + """Contains settings for quality control and pre-processing + Attributes: + picard_rmdup : Field(bool); whether duplicate removal is to be applied in the workflow + adapter : Field(str(AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT)); adapter sequence to trim + quality_trim : Field(bool); whether quality trimming it to be performed in the workflow + adapter_trim : Field(bool); whether adapter trimming is to be performed in the workflow + umi_trim : Field(bool); whether UMI trimming is to be performed in the workflow + min_seq_length : Field(str(int)); minimum sequence length cutoff for reads + umi_trim_length : Field(str(int)); length of UMI to be trimmed from reads + n_base_limit : Field(str(int)); supports filtering by limiting the N base number + + Raises: + ValueError: + When the input in min_seq_length and umi_trim_length cannot + be interpreted as integer and coerced to string + + """ + + model_config = ConfigDict(coerce_numbers_to_str=True) + picard_rmdup: bool = False + adapter: str = "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT" + quality_trim: bool = True + adapter_trim: bool = False + umi_trim: bool = False + min_seq_length: str = "25" + umi_trim_length: str = "5" + n_base_limit: str = "50" + + +class UMIParamsCommon(BaseModel): + """This class defines the common params settings used as constants across various rules in UMI workflow. + + Attributes: + align_format: str (required); output alignment format. eg. 
'BAM' + align_header: str (required); header line appended to the aligned BAM output + align_intbases: int; input bases in each batch regardless of threads, for reproducibility + filter_tumor_af: float (required); settings to filter minimum allelic frequency + """ + + align_header: str + align_intbases: int + filter_tumor_af: float + + +class UMIParamsUMIextract(BaseModel): + """This class defines the params settings used as constants in UMI workflow-rule umextract. + + Attributes: + read_structure: str (required); settings to define UMI read structure + """ + + read_structure: str = "-d, 'rs1,rs2'" + + +class UMIParamsConsensuscall(BaseModel): + """This class defines the params settings used as constants in UMI workflow-rule consensuscall. + + Attributes: + align_format: str (required); output alignment format. eg. 'BAM' + filter_minreads: str (required); settings to filter consensus tags based on group size + tag: str; Logic UMI tag + """ + + align_format: str = "BAM" + filter_minreads: str = "3,1,1" + tag: str = "XR" + + +class UMIParamsTNscope(BaseModel): + """This class defines the params settings used as constants in UMI workflow- rule tnscope. + + Attributes: + algo: str; choice of sentieon varcall algorithm. eg. 'TNscope' + disable_detect: str; disable variant detector. eg 'sv' or 'snv_indel' + filter_tumor_af: float (required); minimum allelic frequency to detect + min_tumorLOD: int (required); minimum tumor log odds in the final call of variants + init_tumorLOD: float (required); minimum tumor log odds in the initial pass calling variants + error_rate: int (required); allow error-rate to consider in calling + prunefactor: int (required); pruning factor in the kmer graph + padding: int(required); amount to pad bed interval regions + """ + + algo: str + init_tumorLOD: float + min_tumorLOD: int + error_rate: int + prunefactor: int + padding: int + disable_detect: str + + +class BalsamicWorkflowConfig(BaseModel): + """Defines set of rules in balsamic workflow + + Handles attributes for corresponding rules. + + Attributes: + common: global params defined across all rules in balsamic workflow + umicommon: global params defined across specific rules in UMI workflow + vep: global params defined in the rule vep + vardict: params defined in the rule vardict + umiextract : params defined in the rule sentieon_umiextract + umiconsensuscall: params defined in the rule sentieon_consensuscall + tnscope_umi: params defined in the rule sentieon_tnscope_umi + """ + + common: ParamsCommon + vardict: ParamsVardict + vep: ParamsVEP + umicommon: UMIParamsCommon + umiextract: UMIParamsUMIextract + umiconsensuscall: UMIParamsConsensuscall + tnscope_umi: UMIParamsTNscope + + +class VCFAttributes(BaseModel): + """General purpose filter to manage various VCF attributes + + This class handles three parameters for the purpose filtering variants + based on a tag_values, filter_name, and which field in VCF. + + E.g. 
AD=VCFAttributes(tag_value=5, filter_name="balsamic_low_tumor_ad", field="INFO") + A value of 5 from INFO field and filter_name will be balsamic_low_tumor_ad + + Attributes: + tag_value: float + filter_name: str + field: str + """ + + tag_value: float + filter_name: str + field: str + + +class VarCallerFilter(BaseModel): + """General purpose for variant caller filters + + This class handles attributes and filter for variant callers + + Attributes: + AD: VCFAttributes (required); minimum allelic depth + AF_min: VCFAttributes (optional); minimum allelic fraction + MQ: VCFAttributes (optional); minimum mapping quality + DP: VCFAttributes (optional); minimum read depth + pop_freq: VCFAttributes (optional); maximum gnomad allele frequency + pop_freq_umi: VCFAttributes (optional); maximum gnomad_af for UMI workflow + strand_reads: VCFAttributes (optional); minimum strand specific read counts + qss: VCFAttributes (optional); minimum sum of base quality scores + sor: VCFAttributes (optional); minimum symmetrical log-odds ratio + swegen_snv_freq: VCFAttributes (optional); maximum swegen snv allele frequency + swegen_sv_freq: VCFAttributes (optional); maximum swegen sv allele frequency + loqusdb_clinical_snv_freq: VCFAttributes (optional); maximum loqusdb clinical snv allele frequency + loqusdb_clinical_sv_freq: VCFAttributes (optional); maximum loqusdb clinical sv allele frequency + varcaller_name: str (required); variant caller name + filter_type: str (required); filter name for variant caller + analysis_type: str (required); analysis type e.g. tumor_normal or tumor_only + description: str (required); comment section for description + """ + + AD: Optional[VCFAttributes] = None + AF_min: Optional[VCFAttributes] = None + MQ: Optional[VCFAttributes] = None + DP: Optional[VCFAttributes] = None + pop_freq: Optional[VCFAttributes] = None + pop_freq_umi: Optional[VCFAttributes] = None + strand_reads: Optional[VCFAttributes] = None + qss: Optional[VCFAttributes] = None + sor: Optional[VCFAttributes] = None + swegen_snv_freq: Optional[VCFAttributes] = None + swegen_sv_freq: Optional[VCFAttributes] = None + loqusdb_clinical_snv_freq: Optional[VCFAttributes] = None + loqusdb_clinical_sv_freq: Optional[VCFAttributes] = None + varcaller_name: str + filter_type: str + analysis_type: str + description: str diff --git a/BALSAMIC/models/snakemake.py b/BALSAMIC/models/snakemake.py new file mode 100644 index 000000000..f85ade352 --- /dev/null +++ b/BALSAMIC/models/snakemake.py @@ -0,0 +1,212 @@ +"""Snakemake related models.""" +import sys +from pathlib import Path +from typing import Optional, List + +from pydantic import BaseModel, FilePath, DirectoryPath, field_validator, Field + +from BALSAMIC.constants.analysis import RunMode +from BALSAMIC.constants.cluster import ClusterMailType, QOS, ClusterProfile, MAX_JOBS +from BALSAMIC.constants.paths import SCHEDULER_PATH +from BALSAMIC.utils.utils import remove_unnecessary_spaces + + +class SingularityBindPath(BaseModel): + """Singularity binding path model. + + Attributes: + source (Path) : Path to the file or directory on the host system. + destination (Path) : Path inside the container where the source will be mounted. + """ + + source: Path + destination: Path + + +class SnakemakeExecutable(BaseModel): + """Snakemake command building model. + + Attributes: + account (Optional[str]) : Scheduler account. + benchmark (Optional[bool]) : Slurm jobs profiling option. + case_id (str) : Analysis case name. 
+        cluster_config_path (Optional[FilePath]) : Cluster configuration file path.
+        config_path (FilePath) : Sample configuration file.
+        disable_variant_caller (Optional[str]) : Disable variant caller.
+        dragen (Optional[bool]) : Flag for enabling or disabling Dragen suite.
+        force (bool) : Force snakemake execution.
+        log_dir (Optional[DirectoryPath]) : Logging directory.
+        mail_type (Optional[ClusterMailType]) : Email type triggering job status notifications.
+        mail_user (Optional[str]) : User email to receive job status notifications.
+        profile (Optional[ClusterProfile]) : Cluster profile to submit jobs.
+        qos (Optional[QOS]) : QOS for sbatch jobs.
+        quiet (Optional[bool]) : Quiet mode for snakemake.
+        report_path (Optional[Path]) : Snakemake generated report path.
+        result_dir (Optional[DirectoryPath]) : Analysis output directory.
+        run_analysis (bool) : Flag to run the actual analysis.
+        run_mode (RunMode) : Cluster run mode to execute analysis.
+        script_dir (Optional[DirectoryPath]) : Cluster profile scripts directory.
+        singularity_bind_paths (Optional[List[SingularityBindPath]]) : Singularity source and destination bind paths.
+        snakefile (FilePath) : Snakemake rule configuration file.
+        snakemake_options (Optional[List[str]]) : Snakemake command additional options.
+        working_dir (Path) : Snakemake working directory.
+
+    """
+
+    account: Optional[str] = None
+    benchmark: bool = False
+    case_id: str
+    cluster_config_path: Optional[FilePath] = None
+    config_path: FilePath
+    disable_variant_caller: Optional[str] = Field(default=None, validate_default=True)
+    dragen: bool = False
+    force: bool = False
+    log_dir: Optional[DirectoryPath] = None
+    mail_type: Optional[ClusterMailType] = None
+    mail_user: Optional[str] = Field(default=None, validate_default=True)
+    profile: Optional[ClusterProfile] = None
+    qos: Optional[QOS] = None
+    quiet: bool = False
+    report_path: Optional[Path] = None
+    result_dir: Optional[DirectoryPath] = None
+    run_analysis: bool = False
+    run_mode: RunMode
+    script_dir: Optional[DirectoryPath] = None
+    singularity_bind_paths: Optional[List[SingularityBindPath]] = None
+    snakefile: FilePath
+    snakemake_options: Optional[List[str]] = None
+    working_dir: Path
+
+    @field_validator("disable_variant_caller")
+    def get_disable_variant_caller_option(cls, disable_variant_caller: str) -> str:
+        """Return string representation of the disable_variant_caller option."""
+        if disable_variant_caller:
+            return f"disable_variant_caller={disable_variant_caller}"
+        return ""
+
+    @field_validator("mail_user")
+    def get_mail_user_option(cls, mail_user: Optional[str]) -> str:
+        """Return string representation of the mail_user option."""
+        if mail_user:
+            return f"--mail-user {mail_user}"
+        return ""
+
+    def get_config_files_option(self) -> str:
+        """Return string representation of the config files."""
+        config_files_option: str = f"--configfiles {self.config_path.as_posix()}"
+        if self.cluster_config_path:
+            config_files_option += f" {self.cluster_config_path.as_posix()}"
+        return config_files_option
+
+    def get_config_options(self) -> str:
+        """Return Snakemake config options to be submitted."""
+        return remove_unnecessary_spaces(
+            f"--config {self.disable_variant_caller} {self.get_dragen_flag()}"
+        )
+
+    def get_dragen_flag(self) -> str:
+        """Return string representation of the dragen flag."""
+        if self.dragen:
+            return "dragen=True"
+        return ""
+
+    def get_force_flag(self) -> str:
+        """Return string representation of the force flag."""
+        if self.force:
+            return "--forceall"
+        return ""
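Each getter in SnakemakeExecutable returns either a command fragment or an empty string, so the final command is a plain join with the spare whitespace collapsed afterwards. A reduced sketch of the pattern (the behaviour of remove_unnecessary_spaces is assumed here to be collapsing runs of spaces):

```python
# Sketch of the flag-builder pattern used by SnakemakeExecutable.
import re

def remove_unnecessary_spaces(command: str) -> str:
    """Assumed behaviour of BALSAMIC.utils.utils.remove_unnecessary_spaces."""
    return re.sub(r"\s+", " ", command).strip()

quiet, force = True, False
command = " ".join(
    ["snakemake --notemp -p", "--quiet" if quiet else "", "--forceall" if force else ""]
)
print(remove_unnecessary_spaces(command))  # snakemake --notemp -p --quiet
```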
+ + def get_mail_type_option(self) -> str: + """Return string representation of the mail_type option.""" + if self.mail_type: + return f"--mail-type {self.mail_type}" + return "" + + def get_quiet_flag(self) -> str: + """Return string representation of the quiet flag.""" + if self.quiet: + return "--quiet" + return "" + + def get_report_path_option(self) -> str: + """Return string representation of the report_path option.""" + if self.report_path: + return f"--report {self.report_path.as_posix()}" + return "" + + def get_run_analysis_flag(self) -> str: + """Return string representation of the run_analysis flag.""" + if not self.run_analysis: + return "--dryrun" + return "" + + def get_singularity_bind_paths_option(self) -> str: + """Return string representation of the singularity_bind_paths option.""" + if self.singularity_bind_paths: + bind_options: List[str] = [] + for singularity_bind_path in self.singularity_bind_paths: + bind_options.append( + f"--bind {singularity_bind_path.source.as_posix()}:{singularity_bind_path.destination.as_posix()}" + ) + return f"--use-singularity --singularity-args '--cleanenv {' '.join(bind_options)}'" + return "" + + def get_slurm_profiler_option(self) -> str: + """Return string representation of the slurm profiler option.""" + if self.benchmark and self.profile == ClusterProfile.SLURM: + return "--slurm-profiler task" + return "" + + def get_snakemake_options_command(self) -> str: + """Return string representation of the additional Snakemake options.""" + if self.snakemake_options: + return " ".join(self.snakemake_options) + return "" + + def get_command(self) -> str: + """Return Snakemake command to be submitted.""" + snakemake_command: str = ( + f"snakemake --notemp -p --rerun-trigger mtime " + f"--directory {self.working_dir.as_posix()} " + f"--snakefile {self.snakefile.as_posix()} " + f"{self.get_config_files_option()} " + f"{self.get_singularity_bind_paths_option()} " + f"{self.get_quiet_flag()} " + f"{self.get_force_flag()} " + f"{self.get_run_analysis_flag()} " + f"{self.get_snakemake_cluster_options()} " + f"{self.get_report_path_option()} " + f"{self.get_config_options()} " + f"{self.get_snakemake_options_command()}" + ) + return remove_unnecessary_spaces(snakemake_command) + + def get_snakemake_cluster_options(self) -> str: + """Return Snakemake cluster options to be submitted.""" + if self.run_mode == RunMode.CLUSTER: + snakemake_cluster_options: str = ( + f"--immediate-submit -j {MAX_JOBS} " + f"--jobname BALSAMIC.{self.case_id}.{{rulename}}.{{jobid}}.sh " + f"--cluster-config {self.cluster_config_path.as_posix()} " + f"--cluster {self.get_cluster_submit_command()}" + ) + return remove_unnecessary_spaces(snakemake_cluster_options) + return "" + + def get_cluster_submit_command(self) -> str: + """Get cluster command to be submitted by Snakemake.""" + cluster_submit_command: str = ( + f"'{sys.executable} {SCHEDULER_PATH.as_posix()} " + f"--sample-config {self.config_path.as_posix()} " + f"--profile {self.profile} " + f"--account {self.account} " + f"--qos {self.qos} " + f"--log-dir {self.log_dir.as_posix()} " + f"--script-dir {self.script_dir.as_posix()} " + f"--result-dir {self.result_dir.as_posix()} " + f"{self.get_slurm_profiler_option()} " + f"{self.mail_user} " + f"{self.get_mail_type_option()} " + "{dependencies} '" + ) + return remove_unnecessary_spaces(cluster_submit_command) diff --git a/BALSAMIC/models/validators.py b/BALSAMIC/models/validators.py new file mode 100644 index 000000000..e73cbe7bb --- /dev/null +++ 
b/BALSAMIC/models/validators.py @@ -0,0 +1,21 @@ +"""Model class validators.""" +from pathlib import Path +from typing import Optional + + +def is_file(file_path: Optional[str]) -> str: + """Validate file path existence.""" + if file_path: + if Path(file_path).is_file(): + return file_path + raise ValueError(f"The supplied file path {file_path} does not exist") + return file_path + + +def is_dir(dir_path: Optional[str]) -> str: + """Validate directory path existence.""" + if dir_path: + if Path(dir_path).is_dir(): + return dir_path + raise ValueError(f"The supplied directory path {dir_path} does not exist") + return dir_path diff --git a/BALSAMIC/snakemake_rules/align/bam_compress.rule b/BALSAMIC/snakemake_rules/align/bam_compress.rule new file mode 100644 index 000000000..28f793a18 --- /dev/null +++ b/BALSAMIC/snakemake_rules/align/bam_compress.rule @@ -0,0 +1,48 @@ +"""Rule to compress bamfile to the cram format.""" + + +rule bam_compress_tumor: + input: + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), + fasta = config_model.reference["reference_genome"] + output: + cram = Path(bam_dir + "tumor.{sample}.cram").as_posix() + benchmark: + Path(benchmark_dir, "bam_compress_tumor_{sample}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("compress") + ".sif").as_posix() + params: + sample_id = "{sample}", + housekeeper_id= {"id": "{sample}", "tags": "tumor"} + threads: + get_threads(cluster_config, "bam_compress") + message: + "Compressing bam to cram for {params.sample_id}" + shell: + """ +samtools view -h -T {input.fasta} --threads {threads} -C -o {output.cram} {input.bam}; +samtools index {output.cram}; + """ + +rule bam_compress_normal: + input: + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), + fasta = config_model.reference["reference_genome"] + output: + cram = Path(bam_dir, "normal.{sample}.cram").as_posix() + benchmark: + Path(benchmark_dir, "bam_compress_normal_{sample}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("compress") + ".sif").as_posix() + params: + sample_id = "{sample}", + housekeeper_id= {"id": "{sample}", "tags": "normal"} + threads: + get_threads(cluster_config, "bam_compress") + message: + "Compressing bam to cram for {params.sample_id}" + shell: + """ +samtools view -h -T {input.fasta} --threads {threads} -C -o {output.cram} {input.bam}; +samtools index {output.cram}; + """ diff --git a/BALSAMIC/snakemake_rules/align/bwa_mem.rule b/BALSAMIC/snakemake_rules/align/bwa_mem.rule deleted file mode 100644 index 8b441ffc7..000000000 --- a/BALSAMIC/snakemake_rules/align/bwa_mem.rule +++ /dev/null @@ -1,92 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - -# Following rule will take input fastq files, align them using bwa mem, and convert the output to sam format - - -rule bwa_mem: - input: - fa = config["reference"]["reference_genome"], - read1 = Path(fastq_dir, "{sample}_1.fp.fastq.gz").as_posix(), - read2 = Path(fastq_dir, "{sample}_2.fp.fastq.gz").as_posix(), - refidx = expand(config["reference"]["reference_genome"] + ".{prefix}", prefix=["amb","ann","bwt","pac","sa"]) - output: - bamout = temp(Path(bam_dir, "{sample}.sorted.bam").as_posix()) - benchmark: - Path(benchmark_dir, "bwa_mem_{sample}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bwa") + ".sif").as_posix() - params: - bam_header = params.common.align_header, 
- tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - sample_id = "{sample}" - threads: - get_threads(cluster_config, "bwa_mem") - message: - ("Align fastq files with bwa-mem to reference genome and " - "sort using samtools for sample: {params.sample_id}") - shell: - """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; - -bwa mem \ --t {threads} \ --R {params.bam_header} \ --M \ --v 1 \ -{input.fa} {input.read1} {input.read2} \ -| samtools sort -T {params.tmpdir} \ ---threads {threads} \ ---output-fmt BAM \ --o {output.bamout} - ; - -samtools index -@ {threads} {output.bamout}; -rm -rf {params.tmpdir}; - """ - - -rule picard_markduplicates: - input: - Path(bam_dir, "{sample}.sorted.bam").as_posix() - output: - mrkdup = Path(bam_dir, "{sample}.sorted." + picarddup + ".bam").as_posix(), - picard_stats = Path(bam_dir, "{sample}.sorted." + picarddup + ".txt").as_posix(), - flagstats = Path(bam_dir, "{sample}.samtools.flagstats.txt").as_posix(), - idxstats = Path(bam_dir, "{sample}.samtools.idxstats.txt").as_posix(), - stats = Path(bam_dir, "{sample}.samtools.stats.txt").as_posix(), - benchmark: - Path(benchmark_dir,"picard_markduplicates_{sample}.tsv").as_posix() - singularity: - Path(singularity_image,config["bioinfo_tools"].get("picard") + ".sif").as_posix() - params: - mem = "16g", - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - rm_dup = "FALSE" if picarddup == "mrkdup" else "TRUE", - sample_id = "{sample}" - threads: - get_threads(cluster_config, "picard_markduplicates") - message: - "Picard marking duplicates and samtool indexing for sample: {params.sample_id}" - shell: - """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; - -picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ -MarkDuplicates \ -INPUT={input} \ -OUTPUT={output.mrkdup} \ -VALIDATION_STRINGENCY=SILENT \ -MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 \ -REMOVE_DUPLICATES={params.rm_dup} \ -METRICS_FILE='{output.picard_stats}'; - -samtools index {output.mrkdup}; - -samtools flagstats --threads {threads} {output.mrkdup} > {output.flagstats}; -samtools stats --threads {threads} {output.mrkdup} > {output.stats}; -samtools idxstats --threads {threads} {output.mrkdup} > {output.idxstats}; - -rm -rf {params.tmpdir}; - """ diff --git a/BALSAMIC/snakemake_rules/align/postprocess_bam.rule b/BALSAMIC/snakemake_rules/align/postprocess_bam.rule new file mode 100644 index 000000000..8fd40bc48 --- /dev/null +++ b/BALSAMIC/snakemake_rules/align/postprocess_bam.rule @@ -0,0 +1,63 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + + +# NOTE: This rule is only applied to prevent VarDict from failing with error like this: +# Critical exception occurs on region: 20:39794721-39795011, program will be stopped. +# java.util.concurrent.CompletionException: java.lang.InternalError: a fault occurred in a recent unsafe memory access +# It's however unclear how this resolves the issue. 
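+#
+# The two rules below therefore re-sort and index the deduplicated BAM and
+# rebuild its read groups with Picard before VarDict consumes it. Outside of
+# Snakemake, the equivalent post-processing is roughly the following sketch
+# (file names and thread counts are illustrative placeholders):
+#
+#   samtools sort --threads 8 -o tumor.S1.dedup_sorted.bam tumor.S1.dedup.bam
+#   samtools index -@ 8 tumor.S1.dedup_sorted.bam
+#   picard AddOrReplaceReadGroups \
+#       -INPUT tumor.S1.dedup_sorted.bam \
+#       -OUTPUT tumor.S1.dedup_sorted_addRG.bam \
+#       -RGID tumor -RGSM tumor -RGPL ILLUMINA -RGLB ILLUMINA -RGPU ILLUMINA \
+#       -CREATE_INDEX true
+#   samtools index tumor.S1.dedup_sorted_addRG.bam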
+
+rule samtools_sort_index:
+    input:
+        bam = Path(bam_dir,"{sample_type}.{sample}.dedup.bam").as_posix(),
+    output:
+        bam = Path(bam_dir,"{sample_type}.{sample}.dedup_sorted.bam").as_posix(),
+    benchmark:
+        Path(benchmark_dir,"samtools_sort_index_{sample_type}_{sample}.tsv").as_posix()
+    singularity:
+        Path(singularity_image,config["bioinfo_tools"].get("samtools") + ".sif").as_posix()
+    params:
+        sample_id="{sample}"
+    threads:
+        get_threads(cluster_config,"samtools_sort_index")
+    message:
+        "Sorting and indexing the deduplicated BAM file for sample: {params.sample_id}"
+    shell:
+        """
+samtools sort --threads {threads} -o {output.bam} {input.bam};
+samtools index -@ {threads} {output.bam};
+        """
+
+
+rule postprocess_bam:
+    input:
+        bam = Path(bam_dir,"{sample_type}.{sample}.dedup_sorted.bam").as_posix()
+    output:
+        postprocessed_bam = Path(bam_dir, "{sample_type}.{sample}.dedup_sorted_addRG.bam").as_posix(),
+    benchmark:
+        Path(benchmark_dir,"postprocess_bam_{sample_type}.{sample}.tsv").as_posix()
+    singularity:
+        Path(singularity_image,config["bioinfo_tools"].get("picard") + ".sif").as_posix()
+    params:
+        tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
+        sample_id = "{sample}"
+    threads:
+        get_threads(cluster_config, "postprocess_bam")
+    message:
+        "Collapsing read groups for sample: {params.sample_id}"
+    shell:
+        """
+export TMPDIR={params.tmpdir};
+
+picard -Xmx75g AddOrReplaceReadGroups \
+-RGPU ILLUMINA -RGID {wildcards.sample_type} -RGSM {wildcards.sample_type} \
+-RGPL ILLUMINA -RGLB ILLUMINA -MAX_RECORDS_IN_RAM 1000000 \
+-CREATE_INDEX true -CREATE_MD5_FILE true \
+-TMP_DIR {params.tmpdir} \
+-INPUT {input.bam} \
+-OUTPUT {output.postprocessed_bam};
+
+samtools index {output.postprocessed_bam};
+
+rm -rf {params.tmpdir};
+        """
diff --git a/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule b/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule
index ee198a0d4..4a5dcf7bc 100644
--- a/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule
+++ b/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule
@@ -1,29 +1,28 @@
-# vim: syntax=python tabstop=4 expandtab
-# coding: utf-8
-
-# Following rule will take input fastq files, align them using bwa mem, and convert the output to sam format
-
+"""Rules to align, mark duplicates and realign reads with Sentieon tools."""
 rule sentieon_align_sort:
     input:
         ref = config["reference"]["reference_genome"],
-        read1 = Path(fastq_dir, "{sample}_1.fp.fastq.gz").as_posix(),
-        read2 = Path(fastq_dir, "{sample}_2.fp.fastq.gz").as_posix(),
+        fastq_r1 = Path(fastq_dir, "{fastq_pattern}_1.fp.fastq.gz").as_posix(),
+        fastq_r2 = Path(fastq_dir, "{fastq_pattern}_2.fp.fastq.gz").as_posix(),
         refidx = expand(config["reference"]["reference_genome"] + ".{prefix}", prefix=["amb","ann","bwt","pac","sa"])
     output:
-        bamout = Path(bam_dir, "{sample}.bam").as_posix()
+        bam_out = Path(bam_dir, "{sample}_align_sort_{fastq_pattern}.bam").as_posix()
     benchmark:
-        Path(benchmark_dir, "sentieon_align_sort_{sample}.tsv").as_posix()
+        Path(benchmark_dir, "sentieon_align_sort_{sample}_{fastq_pattern}.tsv").as_posix()
    params:
        tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
        header = params.common.align_header,
        sentieon_exec = config["SENTIEON_EXEC"],
        sentieon_lic = config["SENTIEON_LICENSE"],
-        sample_id = "{sample}"
+        sample_id = "{sample}",
+        sample_type = lambda wildcards: config_model.get_sample_type_by_name(wildcards.sample, uppercase=True),
+        fastq_pattern = "{fastq_pattern}"
    threads:
        get_threads(cluster_config, 'sentieon_align_sort')
    message:
-        "Align fastq reads using sentieon bwa-mem and sort
reads using samtools for sample: {params.sample_id}"
+        ("Align fastq reads using sentieon bwa-mem and sort reads using samtools for sample type: "
+        "{params.sample_type} : {params.sample_id}, {params.fastq_pattern}")
    shell:
        """
 mkdir -p {params.tmpdir};
@@ -32,27 +31,26 @@ export SENTIEON_TMPDIR={params.tmpdir};
 export SENTIEON_LICENSE={params.sentieon_lic};
 
 {params.sentieon_exec} bwa mem -M \
--R {params.header} \
+-R '@RG\\tID:{wildcards.fastq_pattern}\\tSM:{params.sample_type}\\tPL:ILLUMINA' \
 -t {threads} \
 -K 50000000 \
-{input.ref} {input.read1} {input.read2} \
+{input.ref} {input.fastq_r1} {input.fastq_r2} \
 | {params.sentieon_exec} util sort \
--o {output.bamout} \
+-o {output.bam_out} \
 -t {threads} \
 --block_size 3G \
 --sam2bam -i -;
    """
-
 rule sentieon_dedup:
    input:
-        bam = Path(bam_dir, "{sample}.bam").as_posix(),
+        bam_files = lambda wildcards: config_model.get_bam_name_per_lane(bam_dir = bam_dir, sample_name = wildcards.sample)
    output:
-        bam = Path(bam_dir, "{sample}.dedup.bam").as_posix(),
-        score = Path(bam_dir, "{sample}.dedup.score").as_posix(),
-        metrics = Path(bam_dir, "{sample}.dedup.metrics").as_posix()
+        bam = Path(bam_dir, "{sample_type}.{sample}.dedup.bam").as_posix(),
+        score = Path(bam_dir, "{sample_type}.{sample}.dedup.score").as_posix(),
+        metrics = Path(qc_dir, "{sample_type}.{sample}.dedup.metrics").as_posix()
    benchmark:
-        Path(benchmark_dir, "sentieon_dedup_{sample}.tsv").as_posix()
+        Path(benchmark_dir, "sentieon_dedup_{sample_type}.{sample}.tsv").as_posix()
    params:
        tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
        sentieon_exec = config["SENTIEON_EXEC"],
@@ -61,7 +59,8 @@ rule sentieon_dedup:
    threads:
        get_threads(cluster_config, 'sentieon_dedup')
    message:
-        "Collects read information using sentieon LocusCollector and remove duplicated reads for sample: {params.sample_id}"
+        ("Collect read information using sentieon LocusCollector and mark duplicated reads.
" + "Current sample: {params.sample_id}") shell: """ mkdir -p {params.tmpdir}; @@ -69,34 +68,37 @@ export TMPDIR={params.tmpdir}; export SENTIEON_TMPDIR={params.tmpdir}; export SENTIEON_LICENSE={params.sentieon_lic}; +shell_bam_files=$(echo {input.bam_files} | sed 's/ / -i /g') ; + {params.sentieon_exec} driver \ -t {threads} \ --i {input.bam} \ +-i $shell_bam_files \ --algo LocusCollector \ --fun score_info \ {output.score}; {params.sentieon_exec} driver \ -t {threads} \ --i {input.bam} \ +-i $shell_bam_files \ --algo Dedup \ ---rmdup \ --score_info {output.score} \ --metrics {output.metrics} \ {output.bam}; - """ +sed 's/^LIBRARY/\\n## METRICS CLASS\tpicard\.sam\.DuplicationMetrics\\nLIBRARY/' -i {output.metrics} + """ + rule sentieon_realign: input: ref = config["reference"]["reference_genome"], mills = config["reference"]["mills_1kg"], - indel_1kg = config["reference"]["1kg_known_indel"], - bam = Path(bam_dir, "{sample}.dedup.bam").as_posix() + bam = Path(bam_dir, "{sample_type}.{sample}.dedup.bam").as_posix(), + indel_1kg = config["reference"]["known_indel_1kg"] output: - bam = Path(bam_dir, "{sample}.dedup.realign.bam").as_posix() + bam = Path(bam_dir, "{sample_type}.{sample}.dedup.realign.bam").as_posix() benchmark: - Path(benchmark_dir, "sentieon_realign_{sample}.tsv").as_posix() + Path(benchmark_dir, "sentieon_realign_{sample_type}.{sample}.tsv").as_posix() params: tmpdir = tempfile.mkdtemp(prefix=tmp_dir), sentieon_exec = config["SENTIEON_EXEC"], @@ -108,17 +110,18 @@ rule sentieon_realign: "INDEL realignment using sentieon realigner for sample: {params.sample_id}" shell: """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; -export SENTIEON_TMPDIR={params.tmpdir}; -export SENTIEON_LICENSE={params.sentieon_lic}; + mkdir -p {params.tmpdir}; + export TMPDIR={params.tmpdir}; + export SENTIEON_TMPDIR={params.tmpdir}; + export SENTIEON_LICENSE={params.sentieon_lic}; + + {params.sentieon_exec} driver \ + -r {input.ref} \ + -t {threads} \ + -i {input.bam} \ + --algo Realigner \ + -k {input.mills} \ + -k {input.indel_1kg} \ + {output}; + """ -{params.sentieon_exec} driver \ --r {input.ref} \ --t {threads} \ --i {input.bam} \ ---algo Realigner \ --k {input.mills} \ --k {input.indel_1kg} \ -{output}; - """ diff --git a/BALSAMIC/snakemake_rules/annotation/final_vcf_reheader.rule b/BALSAMIC/snakemake_rules/annotation/final_vcf_reheader.rule new file mode 100644 index 000000000..0cac24155 --- /dev/null +++ b/BALSAMIC/snakemake_rules/annotation/final_vcf_reheader.rule @@ -0,0 +1,18 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +rule create_final_vcf_namemap: + input: + multiqc_json = qc_dir + "multiqc_data/multiqc_data.json", + output: + namemap = vep_dir + "status_to_sample_id_namemap" + params: + status_to_sample_id = status_to_sample_id + message: + "Creating final vcf namemap." + threads: + get_threads(cluster_config, "create_final_vcf_namemap") + shell: + """ +echo -e {params.status_to_sample_id} > {output.namemap}; + """ diff --git a/BALSAMIC/snakemake_rules/annotation/germline_annotation.rule b/BALSAMIC/snakemake_rules/annotation/germline_annotation.rule new file mode 100644 index 000000000..ccfcebf5b --- /dev/null +++ b/BALSAMIC/snakemake_rules/annotation/germline_annotation.rule @@ -0,0 +1,82 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 +# Germline variants annotation module. 
+ + +rule vep_annotate_germlineVAR_tumor: + input: + vcf = vcf_dir + "{var_type}.germline.tumor.{var_caller}.vcf.gz", + cosmic = config["reference"]["cosmic"] + output: + vcf_tumor = vep_dir + "{var_type}.germline.tumor.{var_caller}.vcf.gz", + benchmark: + Path(benchmark_dir, "vep_germline_{var_type}.tumor.{var_caller}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() + params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "germline"}, + sample = 'tumor', + vep_cache = config["reference"]["vep_dir"], + vep_defaults = params.vep.vep_filters + threads: + get_threads(cluster_config, 'vep_germline') + message: + "Running vep annotation on germline variants for {params.sample} sample" + shell: + """ +vep_path=$(dirname $(readlink -f $(which vep))); +export PERL5LIB=; + +vep \ +--dir $vep_path \ +--dir_cache {params.vep_cache} \ +--dir_plugins $vep_path \ +--input_file {input.vcf} \ +--output_file {output.vcf_tumor} \ +--fork {threads} \ +{params.vep_defaults} \ +--custom {input.cosmic},COSMIC,vcf,exact,0,CDS,GENE,STRAND,CNT,AA; + +tabix -p vcf -f {output.vcf_tumor}; + + """ + + +rule vep_annotate_germlineVAR_normal: + input: + vcf = vcf_dir + "{var_type}.germline.normal.{var_caller}.vcf.gz", + cosmic = config["reference"]["cosmic"] + output: + vcf_normal = vep_dir + "{var_type}.germline.normal.{var_caller}.vcf.gz", + benchmark: + Path(benchmark_dir, "vep_germline_{var_type}.normal.{var_caller}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() + params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "germline"}, + sample = 'normal', + vep_cache = config["reference"]["vep_dir"], + vep_defaults = params.vep.vep_filters + threads: + get_threads(cluster_config, 'vep_germline') + message: + "Running vep annotation on germline variants for {params.sample} sample" + shell: + """ +vep_path=$(dirname $(readlink -f $(which vep))); +export PERL5LIB=; + +vep \ +--dir $vep_path \ +--dir_cache {params.vep_cache} \ +--dir_plugins $vep_path \ +--input_file {input.vcf} \ +--output_file {output.vcf_normal} \ +--fork {threads} \ +{params.vep_defaults} \ +--custom {input.cosmic},COSMIC,vcf,exact,0,CDS,GENE,STRAND,CNT,AA; + +tabix -p vcf -f {output.vcf_normal}; + + """ + diff --git a/BALSAMIC/snakemake_rules/annotation/rankscore.rule b/BALSAMIC/snakemake_rules/annotation/rankscore.rule index 757ba8ede..73b8e2b41 100644 --- a/BALSAMIC/snakemake_rules/annotation/rankscore.rule +++ b/BALSAMIC/snakemake_rules/annotation/rankscore.rule @@ -3,11 +3,10 @@ # Rank variants according to a rankscore model - rule genmod_score_vardict: input: vcf = vep_dir + "{var_type}.somatic.{case_name}.vardict.research.filtered.pass.vcf.gz", - rankscore = config["reference"]["rankscore"] + rank_score = config["reference"]["rank_score"] output: vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.vardict.research.filtered.pass.ranked.vcf.gz", benchmark: @@ -22,7 +21,7 @@ rule genmod_score_vardict: ("Scoring annotated vardict variants using genmod for {params.case_name}") shell: """ -genmod score -r -c {input.rankscore} {input.vcf} | \ +genmod score -r -c {input.rank_score} {input.vcf} | \ bcftools view -o {output.vcf_pass} -O z; diff --git a/BALSAMIC/snakemake_rules/annotation/somatic_computations.rule b/BALSAMIC/snakemake_rules/annotation/somatic_computations.rule new file mode 100644 index 000000000..15b951c94 --- /dev/null +++ 
b/BALSAMIC/snakemake_rules/annotation/somatic_computations.rule
@@ -0,0 +1,61 @@
+# vim: syntax=python tabstop=4 expandtab
+# coding: utf-8
+# Computation of TMB and other somatic/oncology related scores.
+
+
+rule tmb_calculation:
+    input:
+        vep_research = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.research.vcf.gz",
+    output:
+        tmb = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.balsamic_stat"
+    benchmark:
+        Path(benchmark_dir, "tmb_calculation_{var_type}.somatic.{case_name}.{var_caller}.tsv").as_posix()
+    singularity:
+        Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix()
+    params:
+        housekeeper_id={"id": config["analysis"]["case_id"], "tags": "research"},
+        af_cutoff = "0.05",
+        bed = config["panel"]["capture_kit"] if "panel" in config else "",
+        message_text = "{var_type}.somatic.{case_name}.{var_caller}.research",
+        tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
+    threads:
+        get_threads(cluster_config, "vep")
+    message:
+        "Calculating TMB score for {params.message_text}"
+    shell:
+        """
+mkdir -p {params.tmpdir};
+export TMPDIR={params.tmpdir};
+
+if [ \"{params.bed}\" == \"\" ]; then region_size=3101.78817; else region_size=$(awk '{{s+=$3-$2}}END{{print s/1e6}}' {params.bed}); fi;
+
+echo -e '##INFO=<ID=AF_TUMOR,Number=1,Type=Float,Description="Allele frequency in the tumor sample">' > {params.tmpdir}/vcf_header
+
+bcftools query -s TUMOR \
+-f '%CHROM\\t%POS\\t[%AF]\\n' \
+{input.vep_research} \
+| bgzip -c > {params.tmpdir}/tumor.txt.gz;
+
+tabix -f -s1 -b2 -e2 {params.tmpdir}/tumor.txt.gz;
+
+bcftools view -s TUMOR {input.vep_research} \
+| bcftools annotate -s TUMOR \
+-a {params.tmpdir}/tumor.txt.gz \
+-h {params.tmpdir}/vcf_header \
+-c CHROM,POS,INFO/AF_TUMOR \
+-O z -o {params.tmpdir}/temp.vcf.gz;
+
+tabix -f -p vcf {params.tmpdir}/temp.vcf.gz;
+
+bcftools view --types snps,indels --apply-filters PASS {params.tmpdir}/temp.vcf.gz \
+| bcftools filter -i "INFO/AF_TUMOR>={params.af_cutoff}" \
+| filter_vep --filter 'not Existing_variation' \
+| filter_vep --filter 'not COSMIC' \
+| filter_vep --filter 'not non_coding_transcript_exon_variant' \
+| filter_vep --filter 'not non_coding_transcript_variant' \
+| filter_vep --filter 'not feature_truncation' -C \
+| awk -v region=${{region_size}} '{{print $NF/region}}' > {output.tmb};
+
+rm -r {params.tmpdir};
+        """
diff --git a/BALSAMIC/snakemake_rules/annotation/somatic_snv_annotation.rule b/BALSAMIC/snakemake_rules/annotation/somatic_snv_annotation.rule
new file mode 100644
index 000000000..778f64536
--- /dev/null
+++ b/BALSAMIC/snakemake_rules/annotation/somatic_snv_annotation.rule
@@ -0,0 +1,166 @@
+# vim: syntax=python tabstop=4 expandtab
+# coding: utf-8
+# Somatic SNV annotation module.
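+#
+# Annotation flow implemented below (a summary of the rules in this file):
+# 1. bcftools extracts the INDELs from the research SNV VCF.
+# 2. CADD.sh scores those INDELs.
+# 3. bcftools annotate writes the CADD PHRED scores back into the full VCF.
+# 4. VEP plus a vcfanno research TOML produce the research annotation.
+# 5. vcfanno re-annotates the research VCF against the clinical observation
+#    databases (when configured) to produce the clinical VCF.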
+
+rule bcftools_get_somaticINDEL_research:
+    input:
+        vcf_research = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz",
+    output:
+        vcf_indel_research = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.indel.research.vcf.gz",
+    benchmark:
+        Path(benchmark_dir, "bcftools_get_somaticINDEL_research.{case_name}.{var_caller}.tsv").as_posix()
+    singularity:
+        Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix()
+    params:
+        message_text = "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz",
+    threads:
+        get_threads(cluster_config, "bcftools_get_somaticINDEL_research")
+    message:
+        "Running bcftools to get INDELs from {params.message_text}"
+    shell:
+        """
+bcftools view --threads {threads} --output-type z --output-file {output.vcf_indel_research} --types indels {input.vcf_research};
+tabix -p vcf -f {output.vcf_indel_research}
+        """
+
+rule cadd_annotate_somaticINDEL_research:
+    input:
+        vcf_indel_research = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.indel.research.vcf.gz",
+    output:
+        cadd_indel_research = vep_dir + "SNV.somatic.{case_name}.{var_caller}.cadd_indel.research.tsv.gz",
+    benchmark:
+        Path(benchmark_dir, "cadd_annotate_somaticINDEL_research.{case_name}.{var_caller}.tsv").as_posix()
+    singularity:
+        Path(singularity_image, config["bioinfo_tools"].get("cadd") + ".sif").as_posix()
+    params:
+        message_text = "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz",
+    threads:
+        get_threads(cluster_config, "cadd_annotate_somaticINDEL_research")
+    message:
+        "Running cadd annotation for INDELs on {params.message_text}"
+    shell:
+        """
+CADD.sh -g GRCh37 -o {output.cadd_indel_research} {input.vcf_indel_research}
+
+        """
+
+
+rule bcftools_annotate_somaticINDEL_research:
+    input:
+        vcf_research = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz",
+        cadd_indel_research = vep_dir + "SNV.somatic.{case_name}.{var_caller}.cadd_indel.research.tsv.gz",
+    output:
+        vcf_indel_research = vep_dir + "SNV.somatic.{case_name}.{var_caller}.cadd_indel.research.vcf.gz",
+    benchmark:
+        Path(benchmark_dir, "bcftools_annotate_somaticINDEL_research.{case_name}.{var_caller}.tsv").as_posix()
+    singularity:
+        Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix()
+    params:
+        header_line = temp(f"{vep_dir}cadd_header_line.txt"),
+        message_text = "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz",
+    threads:
+        get_threads(cluster_config, "bcftools_annotate_somaticINDEL_research")
+    message:
+        "Running bcftools to annotate INDELs on {params.message_text}"
+    shell:
+        """
+echo '##INFO=<ID=CADD,Number=1,Type=Float,Description="CADD PHRED score">' >\
+ {params.header_line}
+
+tabix -s 1 -b 2 -e 2 -c "##" -f {input.cadd_indel_research}
+
+bcftools annotate --threads {threads} --output-type z \
+--header-lines {params.header_line} \
+--columns Chrom,Pos,Ref,Alt,-,CADD \
+--annotations {input.cadd_indel_research} \
+--output {output.vcf_indel_research} \
+{input.vcf_research}
+
+tabix -p vcf -f {output.vcf_indel_research}
+        """
+
+
+rule vep_annotate_somaticSNV_research:
+    input:
+        vcf_snv_research = vep_dir + "SNV.somatic.{case_name}.{var_caller}.cadd_indel.research.vcf.gz",
+        header = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.sample_name_map",
+        cosmic = config["reference"]["cosmic"]
+    output:
+        vcf_snv_research = temp(vep_dir + "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz"),
+        vcfanno_research_toml = vep_dir + "SNV.somatic.{case_name}.{var_caller}_vcfanno_research.toml"
+    benchmark:
+        Path(benchmark_dir, "vep_somatic_research_snv.{case_name}.{var_caller}.tsv").as_posix()
+    singularity:
+
Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix()
+    params:
+        message_text = "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz",
+        tmp_vcf_research = temp(vep_dir + "SNV.somatic.{case_name}.{var_caller}.tmp.research.vcf.gz"),
+        vcfanno_research_annotations = dump_toml(research_annotations),
+        vep_cache = config["reference"]["vep_dir"],
+        vep_defaults = params.vep.vep_filters
+    threads:
+        get_threads(cluster_config, "vep_annotate_somaticSNV_research")
+    message:
+        "Running vep annotation on SNVs and INDELs in {params.message_text}"
+    shell:
+        """
+vep_path=$(dirname $(readlink -f $(which vep)));
+export PERL5LIB=;
+
+echo \'{params.vcfanno_research_annotations}\' > {output.vcfanno_research_toml};
+
+vcfanno -p {threads} {output.vcfanno_research_toml} {input.vcf_snv_research} \
+| bcftools reheader --threads {threads} -s {input.header} \
+| bcftools view --threads {threads} -O z -o {params.tmp_vcf_research} ;
+
+vep \
+--dir $vep_path \
+--dir_cache {params.vep_cache} \
+--dir_plugins $vep_path \
+--input_file {params.tmp_vcf_research} \
+--output_file {output.vcf_snv_research} \
+--fork {threads} \
+{params.vep_defaults} \
+--custom {input.cosmic},COSMIC,vcf,exact,0,CDS,GENE,STRAND,CNT,AA ;
+
+tabix -p vcf -f {output.vcf_snv_research};
+
+rm {params.tmp_vcf_research};
+        """
+
+rule vcfanno_annotate_somaticSNV_clinical:
+    input:
+        vcf_snv_research = vep_dir + "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz",
+        header = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.sample_name_map",
+    output:
+        vcf_snv_clinical = temp(vep_dir + "SNV.somatic.{case_name}.{var_caller}.clinical.vcf.gz"),
+    benchmark:
+        Path(benchmark_dir, "vcfanno_somatic_clinical_snv.{case_name}.{var_caller}.tsv").as_posix()
+    singularity:
+        Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix()
+    params:
+        message_text = "SNV.somatic.{case_name}.{var_caller}.clinical.vcf.gz",
+        clinical_snv = clinical_snv_obs,
+        cancer_germline_snv = cancer_germline_snv_obs,
+        cancer_somatic_snv = cancer_somatic_snv_obs,
+        vcfanno_clinical_annotations = dump_toml(clinical_annotations),
+        vcfanno_clinical_toml = temp(vep_dir + "SNV.somatic.{case_name}.{var_caller}_vcfanno_clinical.toml"),
+    threads:
+        get_threads(cluster_config, "vcfanno_annotate_somaticSNV_clinical")
+    message:
+        "Running vcfanno annotation for single nucleotide variants on {params.message_text}"
+    shell:
+        """
+if [[ -f "{params.clinical_snv}" || -f "{params.cancer_germline_snv}" || -f "{params.cancer_somatic_snv}" ]]; then
+    echo \'{params.vcfanno_clinical_annotations}\' > {params.vcfanno_clinical_toml};
+    vcfanno -p {threads} {params.vcfanno_clinical_toml} {input.vcf_snv_research} | \
+    bcftools reheader --threads {threads} -s {input.header} | \
+    bcftools view --threads {threads} -O z -o {output.vcf_snv_clinical};
+else
+    cp {input.vcf_snv_research} {output.vcf_snv_clinical};
+fi
+
+tabix -p vcf -f {output.vcf_snv_clinical};
+        """
+
+
diff --git a/BALSAMIC/snakemake_rules/annotation/somatic_sv_annotation.rule b/BALSAMIC/snakemake_rules/annotation/somatic_sv_annotation.rule
new file mode 100644
index 000000000..3f1bfe352
--- /dev/null
+++ b/BALSAMIC/snakemake_rules/annotation/somatic_sv_annotation.rule
@@ -0,0 +1,142 @@
+# vim: syntax=python tabstop=4 expandtab
+# coding: utf-8
+# Somatic SV annotation module.
+
+
+rule vep_somatic_research_sv:
+    input:
+        vcf_research = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.vcf.gz",
+        header = vcf_dir + "SV.somatic."
+ config["analysis"]["case_id"] + ".svdb.sample_name_map", + output: + vcf_research_vep = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.vep.vcf.gz", + benchmark: + Path(benchmark_dir, "vep_somatic_research_sv." + config["analysis"]["case_id"] + ".svdb.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() + params: + message_text = "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.vcf.gz", + vep_cache = config["reference"]["vep_dir"], + vep_defaults = params.vep.vep_filters + threads: + get_threads(cluster_config, "vep_somatic_research_sv") + message: + "Running vep annotation for structural and copy number variants on {params.message_text}" + shell: + """ +vep_path=$(dirname $(readlink -f $(which vep))); +export PERL5LIB=; +bcftools reheader --threads {threads} -s {input.header} {input.vcf_research} | \ +bcftools view --threads {threads} -O v | \ +vep \ +--dir $vep_path \ +--dir_cache {params.vep_cache} \ +--dir_plugins $vep_path \ +--output_file {output.vcf_research_vep} \ +--fork {threads} \ +{params.vep_defaults} \ + +tabix -p vcf -f {output.vcf_research_vep}; + """ + + +rule svdb_annotate_somatic_research_sv: + input: + vcf_sv_research_vep = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.vep.vcf.gz", + header = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.sample_name_map", + output: + vcf_research = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.vcf.gz", + benchmark: + Path(benchmark_dir, "svdb_annotate_somatic_research_sv." + config["analysis"]["case_id"] + ".svdb.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix() + params: + swegen_sv_frequency = swegen_sv, + tmp_vcf = temp(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.tmp.swegen.vcf"), + message_text = "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.vep.vcf.gz", + threads: + get_threads(cluster_config, "svdb_annotate_somatic_research_sv") + message: + "Running SVDB for annotating research SVs with Swegen database on {params.message_text}" + shell: + """ +if [[ -f "{params.swegen_sv_frequency}" ]]; then + svdb --query --bnd_distance 5000 --overlap 0.80 \ + --in_occ OCC --out_occ swegen_obs --in_frq FRQ --out_frq SWEGENAF \ + --db {params.swegen_sv_frequency} --query_vcf {input.vcf_sv_research_vep} > {params.tmp_vcf}; + bgzip -l 9 -c {params.tmp_vcf} > {output.vcf_research}; +else + cp {input.vcf_sv_research_vep} {output.vcf_research}; +fi + +tabix -p vcf -f {output.vcf_research}; + + """ + + +rule svdb_annotate_clinical_obs_somatic_clinical_sv: + input: + vcf_sv_research = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.filtered.pass.vcf.gz", + output: + vcf_sv_clinical_obs = temp(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.clinical_obs.vcf.gz"), + benchmark: + Path(benchmark_dir, 'svdb_annotate_clinical_obs_somatic_clinical_sv.' + config["analysis"]["case_id"] + ".tsv") + singularity: + Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix() + params: + case_name = config["analysis"]["case_id"], + clinical_sv_observations = clinical_sv, + vcf_clinical_obs = temp(vep_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".svdb.clinical_obs.vcf"), + threads: + get_threads(cluster_config, "svdb_annotate_clinical_obs_somatic_clinical_sv") + message: + "Annotating structural and copy number variants with clinical observations using SVDB for {params.case_name}", + shell: + """ +if [[ -f "{params.clinical_sv_observations}" ]]; then + svdb --query --bnd_distance 10000 --overlap 0.80 \ + --in_occ Obs --out_occ clin_obs --in_frq Frq --out_frq Frq \ + --db {params.clinical_sv_observations} --query_vcf {input.vcf_sv_research} > {params.vcf_clinical_obs} + bgzip -l 9 -c {params.vcf_clinical_obs} > {output.vcf_sv_clinical_obs}; +else + cp {input.vcf_sv_research} {output.vcf_sv_clinical_obs}; +fi + +tabix -p vcf -f {output.vcf_sv_clinical_obs}; + +rm {params.vcf_clinical_obs} + """ + + +rule svdb_annotate_somatic_obs_somatic_clinical_sv: + input: + vcf_sv_clinical_obs = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.clinical_obs.vcf.gz", + output: + vcf_sv_clinical = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.clinical.vcf.gz", + benchmark: + Path(benchmark_dir, 'svdb_annotate_somatic_obs_somatic_clinical_sv.' + config["analysis"]["case_id"] + ".tsv") + singularity: + Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix() + params: + case_name = config["analysis"]["case_id"], + somatic_sv_observations = somatic_sv, + vcf_somatic_obs = temp(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.somatic_obs.vcf"), + threads: + get_threads(cluster_config, "svdb_annotate_somatic_obs_somatic_clinical_sv") + message: + "Annotating structural and copy number variants with clinical observations using SVDB for {params.case_name}", + shell: + """ +if [[ -f "{params.somatic_sv_observations}" ]]; then + svdb --query --bnd_distance 10000 --overlap 0.80 \ + --in_occ Obs --out_occ Cancer_Somatic_Obs --in_frq Frq --out_frq Cancer_Somatic_Frq \ + --db {params.somatic_sv_observations} --query_vcf {input.vcf_sv_clinical_obs} > {params.vcf_somatic_obs} + bgzip -l 9 -c {params.vcf_somatic_obs} > {output.vcf_sv_clinical}; + rm {params.vcf_somatic_obs}; +else + cp {input.vcf_sv_clinical_obs} {output.vcf_sv_clinical}; +fi + +tabix -p vcf -f {output.vcf_sv_clinical}; + +rm {input.vcf_sv_clinical_obs}; + """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule index 487cc313a..2172a5aa9 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule @@ -3,7 +3,6 @@ # NGS filters for various scenarios - rule bcftools_filter_vardict_research_tumor_normal: input: vcf_snv_research = vep_dir + "{var_type}.somatic.{case_name}.vardict.research.vcf.gz", @@ -131,6 +130,7 @@ rm {output.vcf_pass_tnscope_umi}.temp2; rule bcftools_filter_vardict_clinical_tumor_normal: input: vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.vardict.clinical.vcf.gz", + namemap = vep_dir + "status_to_sample_id_namemap" output: vcf_pass_vardict = vep_dir + "{var_type}.somatic.{case_name}.vardict.clinical.filtered.pass.vcf.gz", bcftools_counts_clinical = vep_dir + "{var_type}.somatic.{case_name}.vardict.clinical.filtered.pass.stats" @@ -153,11 +153,11 @@ rule bcftools_filter_vardict_clinical_tumor_normal: "adding FOUND_IN tags to the output VCF for {params.case_name} " shell: """ -bcftools view {input.vcf_snv_clinical} | \ +bcftools reheader --threads {threads} 
-s {input.namemap} {input.vcf_snv_clinical} |\ bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp1 -O z; +bcftools view --threads {threads} -f PASS -O z -o {output.vcf_pass_vardict}.temp1; python {params.edit_vcf_script} \ --input_vcf {output.vcf_pass_vardict}.temp1 \ @@ -179,6 +179,7 @@ rm {output.vcf_pass_vardict}.temp2; rule bcftools_filter_TNscope_umi_clinical_tumor_normal: input: vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.vcf.gz", + namemap = vep_dir + "status_to_sample_id_namemap" output: vcf_pass_tnscope_umi = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.filtered.pass.vcf.gz", bcftools_counts_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.filtered.pass.stats" @@ -201,11 +202,12 @@ rule bcftools_filter_TNscope_umi_clinical_tumor_normal: "adding FOUND_IN tags to the output VCF file for {params.case_name} " shell: """ -bcftools view --threads {threads} -f PASS,triallelic_site {input.vcf_snv_clinical} | \ +bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_snv_clinical} |\ +bcftools view --threads {threads} -f PASS,triallelic_site | \ bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_pass_tnscope_umi}.temp1 -O z; +bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -O z -o {output.vcf_pass_tnscope_umi}.temp1; python {params.edit_vcf_script} \ --input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule index 2dfc31b46..52c66d584 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule @@ -3,7 +3,6 @@ # NGS filters for various scenarios - rule bcftools_filter_vardict_research_tumor_only: input: vcf_snv_research = vep_dir + "{var_type}.somatic.{case_name}.vardict.research.vcf.gz", @@ -131,6 +130,7 @@ rm {output.vcf_pass_tnscope_umi}.temp2; rule bcftools_filter_vardict_clinical_tumor_only: input: vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.vardict.clinical.vcf.gz", + namemap = vep_dir + "status_to_sample_id_namemap" output: vcf_pass_vardict = vep_dir + "{var_type}.somatic.{case_name}.vardict.clinical.filtered.pass.vcf.gz", bcftools_counts_clinical = vep_dir + 
"{var_type}.somatic.{case_name}.vardict.clinical.filtered.pass.stats" @@ -153,12 +153,13 @@ rule bcftools_filter_vardict_clinical_tumor_only: "adding FOUND_IN tags to the output VCF for {params.case_name}" shell: """ -bcftools view {input.vcf_snv_clinical} | \ +bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_snv_clinical} |\ bcftools filter --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp1 -O z; + python {params.edit_vcf_script} \ --input_vcf {output.vcf_pass_vardict}.temp1 \ --output_vcf {output.vcf_pass_vardict}.temp2 \ @@ -179,6 +180,7 @@ rm {output.vcf_pass_vardict}.temp2; rule bcftools_filter_TNscope_umi_clinical_tumor_only: input: vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.vcf.gz", + namemap = vep_dir + "status_to_sample_id_namemap" output: vcf_pass_tnscope_umi = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.filtered.pass.vcf.gz", bcftools_counts_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.filtered.pass.stats" @@ -201,11 +203,12 @@ rule bcftools_filter_TNscope_umi_clinical_tumor_only: "adding FOUND_IN tags to the output VCF for {params.case_name}" shell: """ -bcftools view --threads {threads} -f PASS,triallelic_site {input.vcf_snv_clinical} | \ +bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_snv_clinical} |\ +bcftools view --threads {threads} -f PASS,triallelic_site | \ bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_pass_tnscope_umi}.temp1 -O z; +bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -O z -o {output.vcf_pass_tnscope_umi}.temp1; python {params.edit_vcf_script} \ --input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule index 38b949eef..32d0ae2fb 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule @@ -2,9 +2,10 @@ # coding: utf-8 # NGS filters for merged SVs and CNVs + rule bcftools_filter_sv_research: input: - vcf_sv_research = vep_dir + "SV.somatic.{case_name}.svdb.research.swegen.vcf.gz", + vcf_sv_research = vep_dir + "SV.somatic.{case_name}.svdb.research.vcf.gz", output: vcf_pass_svdb = vep_dir + "SV.somatic.{case_name}.svdb.research.filtered.pass.vcf.gz", bcftools_counts = vep_dir + 
"SV.somatic.{case_name}.svdb.research.filtered.pass.stats" @@ -24,7 +25,7 @@ rule bcftools_filter_sv_research: """ bcftools view --threads {threads} -f .,PASS,MaxDepth {input.vcf_sv_research} |\ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f .,PASS,MaxDepth -O z -o {output.vcf_pass_svdb}; +bcftools view --threads {threads} -f .,PASS,MaxDepth -O z -o {output.vcf_pass_svdb}; tabix -p vcf -f {output.vcf_pass_svdb}; @@ -35,6 +36,7 @@ bcftools +counts {output.vcf_pass_svdb} > {output.bcftools_counts}; rule bcftools_filter_sv_clinical: input: vcf_sv_clinical = vep_dir + "SV.somatic.{case_name}.svdb.clinical.vcf.gz", + namemap = vep_dir + "status_to_sample_id_namemap" output: vcf_pass_svdb = vep_dir + "SV.somatic.{case_name}.svdb.clinical.filtered.pass.vcf.gz", bcftools_counts = vep_dir + "SV.somatic.{case_name}.svdb.clinical.filtered.pass.stats" @@ -53,10 +55,11 @@ rule bcftools_filter_sv_clinical: "Filtering merged clinical structural and copy number variants using bcftools for {params.case_name}" shell: """ -bcftools view --threads {threads} -f .,PASS,MaxDepth {input.vcf_sv_clinical} |\ +bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_sv_clinical} |\ +bcftools view --threads {threads} -f .,PASS,MaxDepth |\ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f .,PASS,MaxDepth -O z -o {output.vcf_pass_svdb}; +bcftools view --threads {threads} -f .,PASS,MaxDepth -O z -o {output.vcf_pass_svdb}; tabix -p vcf -f {output.vcf_pass_svdb}; diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule index d0fbce5bc..1c4f1fd85 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule @@ -3,7 +3,6 @@ # NGS filters for various scenarios - rule bcftools_filter_tnscope_research_tumor_normal: input: vcf_snv_research = vep_dir + "{var_type}.somatic.{case_name}.tnscope.research.vcf.gz", @@ -39,6 +38,7 @@ bcftools +counts {output.vcf_pass_tnscope} > {output.bcftools_counts_research}; rule bcftools_filter_tnscope_clinical_tumor_normal: input: vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope.clinical.vcf.gz", + namemap = vep_dir + "status_to_sample_id_namemap" output: vcf_pass_tnscope = vep_dir + "{var_type}.somatic.{case_name}.tnscope.clinical.filtered.pass.vcf.gz", bcftools_counts_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope.clinical.filtered.pass.stats" @@ -51,18 +51,19 @@ rule bcftools_filter_tnscope_clinical_tumor_normal: swegen_freq = [SENTIEON_CALLER.swegen_snv_freq.tag_value, SENTIEON_CALLER.swegen_snv_freq.filter_name], loqusdb_clinical_freq = [SENTIEON_CALLER.loqusdb_clinical_snv_freq.tag_value, SENTIEON_CALLER.loqusdb_clinical_snv_freq.filter_name], housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, - case_name = '{case_name}' + case_name = '{case_name}', threads: get_threads(cluster_config, 
'bcftools_filter_tnscope_clinical_tumor_normal') message: "Filtering WGS tumor-normal tnscope annotated clinical variants using bcftools for {params.case_name}" shell: """ -bcftools view -f PASS,triallelic_site --threads {threads} {input.vcf_snv_clinical} | \ +bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_snv_clinical} | \ +bcftools view -f PASS,triallelic_site --threads {threads} | \ bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_pass_tnscope} -O z; +bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -O z -o {output.vcf_pass_tnscope}; tabix -p vcf -f {output.vcf_pass_tnscope}; diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule index bcaf5c4ed..bc9c4e9ed 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule @@ -3,11 +3,10 @@ # NGS filters for various scenarios - rule bcftools_filter_tnscope_research_tumor_only: input: vcf_snv_research = vep_dir + "{var_type}.somatic.{case_name}.tnscope.research.vcf.gz", - wgs_calling_file = config["reference"]["wgs_calling_interval"] + wgs_calling_file = config["reference"]["wgs_calling_regions"] output: vcf_pass_tnscope = vep_dir + "{var_type}.somatic.{case_name}.tnscope.research.filtered.pass.vcf.gz", bcftools_counts_research = vep_dir + "{var_type}.somatic.{case_name}.tnscope.research.filtered.pass.stats" @@ -41,7 +40,8 @@ bcftools +counts {output.vcf_pass_tnscope} > {output.bcftools_counts_research}; rule bcftools_filter_tnscope_clinical_tumor_only: input: vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope.clinical.vcf.gz", - wgs_calling_file = config["reference"]["wgs_calling_interval"] + wgs_calling_file = config["reference"]["wgs_calling_regions"], + namemap = vep_dir + "status_to_sample_id_namemap" output: vcf_pass_tnscope = vep_dir + "{var_type}.somatic.{case_name}.tnscope.clinical.filtered.pass.vcf.gz", bcftools_counts_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope.clinical.filtered.pass.stats" @@ -61,12 +61,14 @@ rule bcftools_filter_tnscope_clinical_tumor_only: "Filtering WGS tumor-only tnscope annotated clinical variants using bcftools for {params.case_name}" shell: """ -grep -v '^@' {input.wgs_calling_file} > {input.wgs_calling_file}.bed +grep -v '^@' {input.wgs_calling_file} > {input.wgs_calling_file}.bed; -bcftools view -f PASS,triallelic_site --threads {threads} --regions-file {input.wgs_calling_file}.bed {input.vcf_snv_clinical} | \ +bcftools view --regions-file {input.wgs_calling_file}.bed {input.vcf_snv_clinical} | \ +bcftools reheader --threads {threads} -s {input.namemap} | \ +bcftools view -f PASS,triallelic_site --threads {threads} | \ bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == 
\".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ -bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ +bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' | \ +bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' | \ bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_pass_tnscope} -O z; tabix -p vcf -f {output.vcf_pass_tnscope}; @@ -74,3 +76,4 @@ tabix -p vcf -f {output.vcf_pass_tnscope}; bcftools +counts {output.vcf_pass_tnscope} > {output.bcftools_counts_clinical}; """ + diff --git a/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule b/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule index 76625bb03..f1bc991ed 100644 --- a/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule +++ b/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule @@ -1,6 +1,7 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 + if config["analysis"]["sequencing_type"] != 'wgs': rule vcf2cytosure_convert: input: diff --git a/BALSAMIC/snakemake_rules/annotation/vcfheader_rename.rule b/BALSAMIC/snakemake_rules/annotation/vcfheader_rename.rule index 1eb2cfe3a..49f38d19d 100644 --- a/BALSAMIC/snakemake_rules/annotation/vcfheader_rename.rule +++ b/BALSAMIC/snakemake_rules/annotation/vcfheader_rename.rule @@ -15,14 +15,14 @@ rule vcfheader_rename_germline: params: housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "genotype"}, sample_name = "NORMAL", - lims_name = lims_id["normal"] + sample_id = config_model.get_sample_name_by_type(SampleType.NORMAL) threads: get_threads(cluster_config,'vcfheader_rename_germline') message: - "Renaming header in DNAscope germline VCF output from {params.sample_name} to {params.lims_name}" + "Renaming header in DNAscope germline VCF output from {params.sample_name} to {params.sample_id}" shell: """ -echo -e \"{params.sample_name}\\t{params.lims_name}\" > {output.namemap}; +echo -e \"{params.sample_name}\\t{params.sample_id}\" > {output.namemap}; bcftools reheader --threads {threads} \ -s {output.namemap} {input.vcf} \ diff --git a/BALSAMIC/snakemake_rules/annotation/vep.rule b/BALSAMIC/snakemake_rules/annotation/vep.rule deleted file mode 100644 index eb6c66b20..000000000 --- a/BALSAMIC/snakemake_rules/annotation/vep.rule +++ /dev/null @@ -1,335 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 -# VEP annotation module. 
Annotate all VCFs generated through VEP - - -rule vep_somatic_research_snv: - input: - vcf_snv_research = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz", - header = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.sample_name_map", - cosmic = config["reference"]["cosmic"] - output: - vcf_snv_research = temp(vep_dir + "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz"), - vcfanno_research_toml = vep_dir + "SNV.somatic.{case_name}.{var_caller}_vcfanno_research.toml" - benchmark: - Path(benchmark_dir, "vep_somatic_research_SNV.somatic.{case_name}.{var_caller}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() - params: - message_text = "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz", - tmpvcf_research = vep_dir + "SNV.somatic.{case_name}.{var_caller}.tmp.research.vcf.gz", - vcfanno_research_annotations = dump_toml(research_annotations), - vep_cache = config["reference"]["vep"], - vep_defaults = params.vep.vep_filters - threads: - get_threads(cluster_config, "vep_somatic_research_snv") - message: - "Running vep annotation for single nuceotide variants on {params.message_text}" - shell: - """ -vep_path=$(dirname $(readlink -f $(which vep))); -tmpvcf_research={params.tmpvcf_research}; -export PERL5LIB=; - -echo \'{params.vcfanno_research_annotations}\' > {output.vcfanno_research_toml}; - -vcfanno -p {threads} {output.vcfanno_research_toml} {input.vcf_snv_research} \ -| bcftools reheader --threads {threads} -s {input.header} \ -| bcftools view --threads {threads} -O z -o $tmpvcf_research ; - -vep \ ---dir $vep_path \ ---dir_cache {params.vep_cache} \ ---dir_plugins $vep_path \ ---input_file $tmpvcf_research \ ---output_file {output.vcf_snv_research} \ ---fork {threads} \ -{params.vep_defaults} \ ---custom {input.cosmic},COSMIC,vcf,exact,0,CDS,GENE,STRAND,CNT,AA ; - -tabix -p vcf -f {output.vcf_snv_research}; - -rm $tmpvcf_research; - """ - -rule vep_somatic_clinical_snv: - input: - vcf_snv_research = vep_dir + "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz", - header = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.sample_name_map", - output: - vcf_snv_clinical = temp(vep_dir + "SNV.somatic.{case_name}.{var_caller}.clinical.vcf.gz"), - vcfanno_clinical_toml = temp(vep_dir + "SNV.somatic.{case_name}.{var_caller}_vcfanno_clinical.toml"), - benchmark: - Path(benchmark_dir, "vep_somatic_clinical_SNV.somatic.{case_name}.{var_caller}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() - params: - message_text = "SNV.somatic.{case_name}.{var_caller}.clinical.vcf.gz", - clinical_snv = clinical_snv_obs, - vcfanno_clinical_annotations = dump_toml(clinical_annotations), - threads: - get_threads(cluster_config, "vep_somatic_snv") - message: - "Running vep annotation for single nuceotide variants on {params.message_text}" - shell: - """ -if [[ -f "{params.clinical_snv}" ]]; then - -echo \'{params.vcfanno_clinical_annotations}\' > {output.vcfanno_clinical_toml}; - -vcfanno -p {threads} {output.vcfanno_clinical_toml} {input.vcf_snv_research} \ -| bcftools reheader --threads {threads} -s {input.header} \ -| bcftools view --threads {threads} -O z -o {output.vcf_snv_clinical} ; - -else - -cp {input.vcf_snv_research} {output.vcf_snv_clinical}; - -fi - -tabix -p vcf -f {output.vcf_snv_clinical}; - """ - -rule vep_somatic_sv: - input: - vcf_research = vcf_dir + "SV.somatic.{case_name}.svdb.research.vcf.gz", - header = vcf_dir + 
"SV.somatic.{case_name}.svdb.sample_name_map", - output: - vcf_research = temp(vep_dir + "SV.somatic.{case_name}.svdb.research.vcf.gz"), - benchmark: - Path(benchmark_dir, "vep_somatic_SV.somatic.{case_name}.svdb.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() - params: - message_text = "SV.somatic.{case_name}.svdb.research.vcf.gz", - vep_cache = config["reference"]["vep"], - vep_defaults = params.vep.vep_filters - threads: - get_threads(cluster_config, "vep_somatic_sv") - message: - "Running vep annotation for structural and copy number variants on {params.message_text}" - shell: - """ -vep_path=$(dirname $(readlink -f $(which vep))); -export PERL5LIB=; - -bcftools reheader --threads {threads} -s {input.header} {input.vcf_research} | \ -bcftools view --threads {threads} -O v | \ -vep \ ---dir $vep_path \ ---dir_cache {params.vep_cache} \ ---dir_plugins $vep_path \ ---output_file {output.vcf_research} \ ---fork {threads} \ -{params.vep_defaults} \ - -tabix -p vcf -f {output.vcf_research}; - """ - -rule annotate_swegen_frequency_somatic_sv: - input: - vcf_sv_research = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.vcf.gz", - output: - vcf_sv_clinical = temp(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.swegen.vcf.gz"), - benchmark: - Path(benchmark_dir, 'annotate_swegen_frequency_somatic_sv_' + config["analysis"]["case_id"] + ".tsv") - singularity: - Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix() - params: - tumor = get_sample_type(config["samples"], "tumor"), - normal = get_sample_type(config["samples"], "normal"), - case_name = config["analysis"]["case_id"], - swegen_sv_frequency = swegen_sv, - tmp_vcf_clinical = temp(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.tmp.clinical.vcf"), - threads: - get_threads(cluster_config, "annotate_swegen_frequency_somatic_sv") - message: - "Annotating structural and copy number variants with swegen frequency using SVDB for {params.case_name}", - shell: - """ -if [[ -f "{params.swegen_sv_frequency}" ]]; then -svdb --query --bnd_distance 5000 --overlap 0.80 \ ---in_occ OCC --out_occ swegen_obs --in_frq FRQ --out_frq SWEGENAF \ ---db {params.swegen_sv_frequency} --query_vcf {input.vcf_sv_research} > {params.tmp_vcf_clinical}; - -bgzip -l 9 -c {params.tmp_vcf_clinical} > {output.vcf_sv_clinical}; - -else - -cp {input.vcf_sv_research} {output.vcf_sv_clinical}; - -fi - -tabix -p vcf -f {output.vcf_sv_clinical}; - """ - -rule annotate_clinical_observation_somatic_sv: - input: - vcf_sv_research = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.research.swegen.vcf.gz", - output: - vcf_sv_clinical = vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.clinical.vcf.gz", - benchmark: - Path(benchmark_dir, 'annotate_clinical_observation_somatic_sv_' + config["analysis"]["case_id"] + ".tsv") - singularity: - Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix() - params: - tumor = get_sample_type(config["samples"], "tumor"), - normal = get_sample_type(config["samples"], "normal"), - case_name = config["analysis"]["case_id"], - clinical_sv_observations = clinical_sv, - tmp_vcf_clinical = temp(vep_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".svdb.tmp.clinical.vcf"), - threads: - get_threads(cluster_config, "svdb_query_somatic_sv") - message: - "Annotating structural and copy number variants with clinical observations using SVDB for {params.case_name}", - shell: - """ -if [[ -f "{params.clinical_sv_observations}" ]]; then -svdb --query --bnd_distance 10000 --overlap 0.80 \ ---in_occ Obs --out_occ clin_obs --in_frq Frq --out_frq Frq \ ---db {params.clinical_sv_observations} --query_vcf {input.vcf_sv_research} > {params.tmp_vcf_clinical} - -bgzip -l 9 -c {params.tmp_vcf_clinical} > {output.vcf_sv_clinical}; - -else - -cp {input.vcf_sv_research} {output.vcf_sv_clinical}; - -fi - -tabix -p vcf -f {output.vcf_sv_clinical}; - """ - -rule tmb_calculation: - input: - vep_research = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.research.vcf.gz", - output: - tmb = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.balsamic_stat" - benchmark: - Path(benchmark_dir, "vep_stat_{var_type}.somatic.{case_name}.{var_caller}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() - params: - housekeeper_id={"id": config["analysis"]["case_id"], "tags": "research"}, - af_cutoff = "0.05", - bed = config["panel"]["capture_kit"] if "panel" in config else "", - message_text = "{var_type}.somatic.{case_name}.{var_caller}.research", - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - threads: - get_threads(cluster_config, "vep") - message: - "Calculating TMB score for {params.message_text}" - shell: - """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; - -if [ \"{params.bed}\" == \"\" ]; then region_size=3101.78817; else region_size=$(awk '{{s+=$3-$2}}END{{print s/1e6}}' {params.bed}); fi; - -echo -e '##INFO=' > {params.tmpdir}/vcf_header - -bcftools query -s TUMOR \ --f '%CHROM\\t%POS\\t[%AF]\\n' \ -{input.vep_research} \ -| bgzip -c > {params.tmpdir}/tumor.txt.gz; - -tabix -f -s1 -b2 -e2 {params.tmpdir}/tumor.txt.gz; - -bcftools view -s TUMOR {input.vep_research} \ -| bcftools annotate -s TUMOR \ --a {params.tmpdir}/tumor.txt.gz \ --h {params.tmpdir}/vcf_header \ --c CHROM,POS,INFO/AF_TUMOR \ --O z -o {params.tmpdir}/temp.vcf.gz; - -tabix -f -p vcf {params.tmpdir}/temp.vcf.gz; - -bcftools view --types snps,indels --apply-filters PASS {params.tmpdir}/temp.vcf.gz \ -| bcftools filter -i "INFO/AF_TUMOR>={params.af_cutoff}" \ -| filter_vep --filter 'not Existing_variation' \ -| filter_vep --filter 'not COSMIC' \ -| filter_vep --filter 'not non_coding_transcript_exon_variant' \ -| filter_vep --filter 'not non_coding_transcript_variant' \ -| filter_vep --filter 'not feature_truncation' -C \ -| awk -v region=${{region_size}} '{{print $NF/region}}' > {output.tmb}; - -rm -r {params.tmpdir}; - """ - - -rule vep_germline_tumor: - input: - vcf = vcf_dir + "{var_type}.germline.tumor.{var_caller}.vcf.gz", - cosmic = config["reference"]["cosmic"] - output: - vcf_tumor = vep_dir + "{var_type}.germline.tumor.{var_caller}.vcf.gz", - benchmark: - Path(benchmark_dir, "vep_germline_{var_type}.germline.tumor.{var_caller}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() - params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "germline"}, - sample = 'tumor', - vep_cache = config["reference"]["vep"], - vep_defaults = params.vep.vep_filters - threads: - get_threads(cluster_config, 'vep_germline') - message: - "Running vep annotation on germline variants for 
{params.sample} sample" - shell: - """ -vep_path=$(dirname $(readlink -f $(which vep))); -export PERL5LIB=; - -vep \ ---dir $vep_path \ ---dir_cache {params.vep_cache} \ ---dir_plugins $vep_path \ ---input_file {input.vcf} \ ---output_file {output.vcf_tumor} \ ---fork {threads} \ -{params.vep_defaults} \ ---custom {input.cosmic},COSMIC,vcf,exact,0,CDS,GENE,STRAND,CNT,AA; - -tabix -p vcf -f {output.vcf_tumor}; - - """ - - -rule vep_germline_normal: - input: - vcf = vcf_dir + "{var_type}.germline.normal.{var_caller}.vcf.gz", - cosmic = config["reference"]["cosmic"] - output: - vcf_normal = vep_dir + "{var_type}.germline.normal.{var_caller}.vcf.gz", - benchmark: - Path(benchmark_dir, "vep_germline_{var_type}.germline.normal.{var_caller}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() - params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "germline"}, - sample = 'normal', - vep_cache = config["reference"]["vep"], - vep_defaults = params.vep.vep_filters - threads: - get_threads(cluster_config, 'vep_germline') - message: - "Running vep annotation on germline variants for {params.sample} sample" - shell: - """ -vep_path=$(dirname $(readlink -f $(which vep))); -export PERL5LIB=; - -vep \ ---dir $vep_path \ ---dir_cache {params.vep_cache} \ ---dir_plugins $vep_path \ ---input_file {input.vcf} \ ---output_file {output.vcf_normal} \ ---fork {threads} \ -{params.vep_defaults} \ ---custom {input.cosmic},COSMIC,vcf,exact,0,CDS,GENE,STRAND,CNT,AA; - -tabix -p vcf -f {output.vcf_normal}; - - """ diff --git a/BALSAMIC/snakemake_rules/cache/__init__.py b/BALSAMIC/snakemake_rules/cache/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/BALSAMIC/snakemake_rules/cache/cadd.rule b/BALSAMIC/snakemake_rules/cache/cadd.rule new file mode 100644 index 000000000..3b3404a17 --- /dev/null +++ b/BALSAMIC/snakemake_rules/cache/cadd.rule @@ -0,0 +1,23 @@ +"""Rules to process reference files.""" + + +rule index_cadd: + """Index CADD reference files.""" + input: + singularity_image=f"{config['containers_dir']}/{config['bioinfo_tools']['tabix']}.{FileType.SIF}", + cadd_snv=cache_config.references.cadd_snv.file_path + output: + cadd_snv_tbi=f"{cache_config.references.cadd_snv.file_path}.{FileType.TBI}" + singularity: + f"{config['containers_dir']}/{config['bioinfo_tools']['tabix']}.{FileType.SIF}" + threads: get_threads(cluster_config=cluster_config, rule_name="index_cadd") + message: + "Indexing CADD file {input.cadd_snv}" + benchmark: + f"{cache_config.references.cadd_snv.file_path}.{FileType.TBI}.benchmark.{FileType.TSV}" + log: + f"{cache_config.references.cadd_snv.file_path}.{FileType.TBI}.{FileType.LOG}", + shell: + """ + tabix -s 1 -b 2 -e 2 "{input.cadd_snv}" &> "{log}" + """ diff --git a/BALSAMIC/snakemake_rules/cache/delly.rule b/BALSAMIC/snakemake_rules/cache/delly.rule new file mode 100644 index 000000000..da16fffc4 --- /dev/null +++ b/BALSAMIC/snakemake_rules/cache/delly.rule @@ -0,0 +1,20 @@ +"""Delly reference specific rules.""" + + +rule convert_delly_exclusion_file: + """Remove all occurrences of "chr" from the Delly telomeres and centromeres exclusion file.""" + input: + exclusion_file=cache_config.references.delly_exclusion.file_path, + output: + exclusion_converted_file=cache_config.references.get_delly_exclusion_converted_file_path(), + threads: get_threads(cluster_config=cluster_config, rule_name="convert_delly_exclusion_file") + message: + "Converting delly exclusion file 
{input.exclusion_file}" + benchmark: + f"{cache_config.references.get_delly_exclusion_converted_file_path()}.benchmark.{FileType.TSV}" + log: + f"{cache_config.references.get_delly_exclusion_converted_file_path()}.{FileType.LOG}", + shell: + """ + sed 's/chr//g' "{input.exclusion_file}" > "{output.exclusion_converted_file}" 2> "{log}" + """ diff --git a/BALSAMIC/snakemake_rules/cache/reference_download.rule b/BALSAMIC/snakemake_rules/cache/reference_download.rule new file mode 100644 index 000000000..043c39de9 --- /dev/null +++ b/BALSAMIC/snakemake_rules/cache/reference_download.rule @@ -0,0 +1,36 @@ +"""Common reference download rules.""" + + +rule download_references: + """Download Balsamic reference files.""" + output: + reference_path="{reference_path}", + wildcard_constraints: + reference_path="|".join(cache_config.get_reference_file_paths()), + params: + reference=lambda wildcards: cache_config.get_reference_by_path( + Path(config["references_dir"], wildcards.reference_path).as_posix() + ), + threads: get_threads(cluster_config=cluster_config, rule_name="download_references") + message: + "Downloading reference file {output.reference_path}" + benchmark: + f"{{reference_path}}.benchmark.{FileType.TSV}" + log: + f"{{reference_path}}.{FileType.LOG}", + shell: + """ + if [[ -n "{params.reference.secret}" ]] && [[ "{params.reference.secret}" != "None" ]]; then + response=$(curl -s -H "Authorization: Basic {params.reference.secret}" "{params.reference.url}") + download_url=$(echo $response | grep -o 'https://[^"]*') + cmd="curl '$download_url' -o -" + elif [[ "{params.reference.url}" == gs://* ]]; then + cmd="gsutil cp '{params.reference.url}' -" + else + cmd="wget '{params.reference.url}' -O -" + fi + if [[ "{params.reference.gzip}" == "True" ]]; then + cmd+=" | gunzip" + fi + eval "$cmd > '{output.reference_path}'" &> "{log}" + """ diff --git a/BALSAMIC/snakemake_rules/cache/reference_genome_index.rule b/BALSAMIC/snakemake_rules/cache/reference_genome_index.rule new file mode 100644 index 000000000..27ae9caef --- /dev/null +++ b/BALSAMIC/snakemake_rules/cache/reference_genome_index.rule @@ -0,0 +1,70 @@ +"""Reference genome files processing rules.""" + + +rule picard_dict_reference_genome: + """Create a sequence dictionary for a reference using Picard.""" + input: + singularity_image=f"{config['containers_dir']}/{config['bioinfo_tools']['picard']}.{FileType.SIF}", + reference_genome=cache_config.references.reference_genome.file_path, + output: + dict_reference_genome=cache_config.references.reference_genome.file_path.replace( + FileType.FASTA, FileType.DICT + ), + singularity: + f"{config['containers_dir']}/{config['bioinfo_tools']['picard']}.{FileType.SIF}" + threads: get_threads(cluster_config=cluster_config, rule_name="picard_dict_reference_genome") + message: + "Creating a sequence dictionary for a reference file {input.reference_genome}" + benchmark: + f"{cache_config.references.reference_genome.file_path.replace(FileType.FASTA,FileType.DICT)}.benchmark.{FileType.TSV}" + log: + f"{cache_config.references.reference_genome.file_path.replace(FileType.FASTA, FileType.DICT)}.{FileType.LOG}", + shell: + """ + picard CreateSequenceDictionary REFERENCE="{input.reference_genome}" OUTPUT="{output.dict_reference_genome}" \ + &> "{log}" + """ + + +rule fasta_index_reference_genome: + """Create a FASTA index for the reference genome.""" + input: + singularity_image=f"{config['containers_dir']}/{config['bioinfo_tools']['samtools']}.{FileType.SIF}", + 
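# A minimal sketch of the branching performed by the download_references shell block above:
# pick a fetch command from the reference URL scheme, then optionally pipe through gunzip.
# The authenticated (secret-based) curl branch is omitted, and this helper is illustrative
# only; it is not part of the cache-config model's real API.
def build_download_cmd(url: str, gzip: bool) -> str:
    """Compose the streaming download command for a single reference file."""
    if url.startswith("gs://"):
        cmd = f"gsutil cp '{url}' -"  # Google Cloud Storage: stream object to stdout
    else:
        cmd = f"wget '{url}' -O -"  # plain HTTP(S)/FTP: stream body to stdout
    if gzip:
        cmd += " | gunzip"  # decompress on the fly before writing into references_dir
    return cmd

# Example: build_download_cmd("gs://bucket/genome.fa.gz", gzip=True)
# returns "gsutil cp 'gs://bucket/genome.fa.gz' - | gunzip"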
reference_genome=cache_config.references.reference_genome.file_path, + output: + indexed_reference_genome=f"{cache_config.references.reference_genome.file_path}.{FileType.FAI}", + singularity: + f"{config['containers_dir']}/{config['bioinfo_tools']['samtools']}.{FileType.SIF}" + threads: get_threads(cluster_config=cluster_config, rule_name="fasta_index_reference_genome") + message: + "FASTA format indexing of the reference genome file {input.reference_genome}" + benchmark: + f"{cache_config.references.reference_genome.file_path}.{FileType.FAI}.benchmark.{FileType.TSV}" + log: + f"{cache_config.references.reference_genome.file_path}.{FileType.FAI}.{FileType.LOG}", + shell: + """ + samtools faidx "{input.reference_genome}" &> "{log}" + """ + + +rule bwa_index_reference_genome: + """Create BWA indexes for the reference genome.""" + input: + singularity_image=f"{config['containers_dir']}/{config['bioinfo_tools']['bwa']}.{FileType.SIF}", + reference_genome=cache_config.references.reference_genome.file_path, + output: + indexed_reference_genome=cache_config.references.get_reference_genome_bwa_index_file_paths(), + singularity: + f"{config['containers_dir']}/{config['bioinfo_tools']['bwa']}.{FileType.SIF}" + threads: get_threads(cluster_config=cluster_config, rule_name="bwa_index_reference_genome") + message: + "BWA indexing of the reference genome file {input.reference_genome}" + benchmark: + f"{cache_config.references.reference_genome.file_path}.bwa_indexes.benchmark.{FileType.TSV}" + log: + f"{cache_config.references.reference_genome.file_path}.bwa_indexes.{FileType.LOG}", + shell: + """ + bwa index -a bwtsw "{input.reference_genome}" &> "{log}" + """ diff --git a/BALSAMIC/snakemake_rules/cache/reference_vcf.rule b/BALSAMIC/snakemake_rules/cache/reference_vcf.rule new file mode 100644 index 000000000..3ac6083b0 --- /dev/null +++ b/BALSAMIC/snakemake_rules/cache/reference_vcf.rule @@ -0,0 +1,49 @@ +"""Rules to process VCF reference files.""" + + +wildcard_constraints: + vcf="|".join(cache_config.get_reference_file_paths_by_file_type(file_type=FileType.VCF)), + + +rule compress_vcfs: + """Compress VCF reference files.""" + input: + singularity_image=f"{config['containers_dir']}/{config['bioinfo_tools']['bgzip']}.{FileType.SIF}", + vcf="{vcf}", + output: + vcf_gz=f"{{vcf}}.{FileType.GZ}", + singularity: + f"{config['containers_dir']}/{config['bioinfo_tools']['bgzip']}.{FileType.SIF}" + threads: get_threads(cluster_config=cluster_config, rule_name="compress_vcfs") + message: + "Compressing VCF variant file {input.vcf}" + benchmark: + f"{{vcf}}.{FileType.GZ}.benchmark.{FileType.TSV}" + log: + f"{{vcf}}.{FileType.GZ}.{FileType.LOG}", + shell: + """ + bgzip "{input.vcf}" &> "{log}" + """ + + +rule index_vcfs: + """Index VCF reference files.""" + input: + singularity_image=f"{config['containers_dir']}/{config['bioinfo_tools']['tabix']}.{FileType.SIF}", + vcf_gz=f"{{vcf}}.{FileType.GZ}", + output: + vcf_gz_tbi=f"{{vcf}}.{FileType.GZ}.{FileType.TBI}", + singularity: + f"{config['containers_dir']}/{config['bioinfo_tools']['tabix']}.{FileType.SIF}" + threads: get_threads(cluster_config=cluster_config, rule_name="index_vcfs") + message: + "Indexing VCF variant file {input.vcf_gz}" + benchmark: + f"{{vcf}}.{FileType.GZ}.{FileType.TBI}.benchmark.{FileType.TSV}" + log: + f"{{vcf}}.{FileType.GZ}.{FileType.TBI}.{FileType.LOG}", + shell: + """ + tabix -p vcf "{input.vcf_gz}" &> "{log}" + """ diff --git a/BALSAMIC/snakemake_rules/cache/refseq.rule b/BALSAMIC/snakemake_rules/cache/refseq.rule new file mode 100644 
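# A small driver equivalent to the compress_vcfs + index_vcfs pair above, outside Snakemake:
# bgzip rewrites <vcf> as <vcf>.gz, then tabix builds <vcf>.gz.tbi next to it. A sketch only,
# assuming bgzip and tabix are on PATH (in the workflow they come from the tabix/bgzip containers).
import subprocess

def compress_and_index_vcf(vcf_path: str) -> str:
    """bgzip-compress a VCF and create its tabix index; return the .gz path."""
    subprocess.run(["bgzip", vcf_path], check=True)  # writes <vcf>.gz and removes the plain VCF
    subprocess.run(["tabix", "-p", "vcf", f"{vcf_path}.gz"], check=True)  # writes <vcf>.gz.tbi
    return f"{vcf_path}.gz"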
index 000000000..d1c623146 --- /dev/null +++ b/BALSAMIC/snakemake_rules/cache/refseq.rule @@ -0,0 +1,47 @@ +"""Rules to process RefSeq's gene files.""" + + +rule preprocess_refseq: + """Preprocess RefSeq's gene files.""" + input: + singularity_image=f"{config['containers_dir']}/{config['bioinfo_tools']['bedtools']}.{FileType.SIF}", + access_regions=cache_config.references.access_regions.file_path, + refgene_sql=cache_config.references.refgene_sql.file_path, + refgene_txt=cache_config.references.refgene_txt.file_path, + output: + refgene_bed=cache_config.references.get_refgene_bed_file_path(), + refgene_flat=cache_config.references.get_refgene_flat_file_path(), + params: + refseq_script_path=REFSEQ_SCRIPT_PATH.as_posix(), + singularity: + f"{config['containers_dir']}/{config['bioinfo_tools']['bedtools']}.{FileType.SIF}" + threads: get_threads(cluster_config=cluster_config, rule_name="preprocess_refseq") + message: + "Preprocessing RefSeq's gene files {input.refgene_sql} and {input.refgene_txt}" + benchmark: + f"{cache_config.references.get_refgene_bed_file_path()}.benchmark.{FileType.TSV}" + log: + refgene_bed=f"{cache_config.references.get_refgene_bed_file_path()}.{FileType.LOG}", + refgene_flat=f"{cache_config.references.get_refgene_flat_file_path()}.{FileType.LOG}", + shell: + """ + # Generate RefSeq's BED file + ( + header=$(awk -f "{params.refseq_script_path}" "{input.refgene_sql}") + (echo \"$header\"; cat "{input.refgene_txt}") |\ + csvcut -t -c chrom,exonStarts,exonEnds,name,score,strand,exonCount,txStart,txEnd,name2 |\ + csvformat -T |\ + bedtools expand -c 2,3 |\ + awk '$1~/chr[1-9]/ && $1!~/[_]/' |\ + cut -c 4- |\ + sort -k1,1 -k2,2n > "{output.refgene_bed}" + ) &> "{log.refgene_bed}" + + # Generate RefSeq's flat file and remove "chr" prefix from refgene_txt and access_regions files + ( + awk -v OFS=\"\\t\" '$3!~/_/ {{ gsub(\"chr\",\"\",$3); $1=$13; print }}' "{input.refgene_txt}" |\ + cut -f 1-11 > "{output.refgene_flat}" + sed -i 's/chr//g' "{input.refgene_txt}" + sed -i 's/chr//g' "{input.access_regions}" + ) &> "{log.refgene_flat}" + """ diff --git a/BALSAMIC/snakemake_rules/cache/refseq_canfam.rule b/BALSAMIC/snakemake_rules/cache/refseq_canfam.rule new file mode 100644 index 000000000..2d97fb93a --- /dev/null +++ b/BALSAMIC/snakemake_rules/cache/refseq_canfam.rule @@ -0,0 +1,43 @@ +"""Rules to process canine RefSeq's gene files.""" + + +rule preprocess_refseq_canfam: + """Preprocess RefSeq's gene files.""" + input: + singularity_image=f"{config['containers_dir']}/{config['bioinfo_tools']['bedtools']}.{FileType.SIF}", + refgene_sql=cache_config.references.refgene_sql.file_path, + refgene_txt=cache_config.references.refgene_txt.file_path, + output: + refgene_bed=cache_config.references.get_refgene_bed_file_path(), + refgene_flat=cache_config.references.get_refgene_flat_file_path(), + params: + refseq_script_path=REFSEQ_SCRIPT_PATH.as_posix(), + singularity: + f"{config['containers_dir']}/{config['bioinfo_tools']['bedtools']}.{FileType.SIF}" + threads: get_threads(cluster_config=cluster_config, rule_name="preprocess_refseq_canfam") + message: + "Preprocessing RefSeq's gene files {input.refgene_sql} and {input.refgene_txt}" + benchmark: + f"{cache_config.references.get_refgene_bed_file_path()}.benchmark.{FileType.TSV}" + log: + refgene_bed=f"{cache_config.references.get_refgene_bed_file_path()}.{FileType.LOG}", + refgene_flat=f"{cache_config.references.get_refgene_flat_file_path()}.{FileType.LOG}", + shell: + """ + # Generate RefSeq's BED file + ( + header=$(awk -f 
"{params.refseq_script_path}" "{input.refgene_sql}") + (echo \"$header\"; cat "{input.refgene_txt}") |\ + csvcut -t -c chrom,exonStarts,exonEnds,name,score,strand,exonCount,txStart,txEnd,name2 |\ + csvformat -T |\ + bedtools expand -c 2,3 |\ + awk '$1~/chr[1-9]/ && $1!~/[_]/' |\ + sort -k1,1 -k2,2n > "{output.refgene_bed}" + ) &> "{log.refgene_bed}" + + # Generate RefSeq's flat file + ( + awk -v OFS=\"\\t\" '$3!~/_/ {{ gsub(\"chr\",\"chr\",$3); $1=$13; print }}' "{input.refgene_txt}" |\ + cut -f 1-11 > "{output.refgene_flat}" + ) &> "{log.refgene_flat}" + """ diff --git a/BALSAMIC/snakemake_rules/cache/singularity_containers.rule b/BALSAMIC/snakemake_rules/cache/singularity_containers.rule new file mode 100644 index 000000000..5333c6f60 --- /dev/null +++ b/BALSAMIC/snakemake_rules/cache/singularity_containers.rule @@ -0,0 +1,27 @@ +"""Rules to download Singularity containers.""" + + +rule download_containers: + """Download Singularity containers from Docker Hub.""" + output: + container_path=f"{config['containers_dir']}/{{singularity_image}}.sif", + wildcard_constraints: + singularity_image="|".join(cache_config.containers.keys()), + params: + tmp_dir=f"{config['containers_dir']}/tmp", + singularity_image="{singularity_image}", + dockerhub_image=lambda wildcards: config["containers"][ + wildcards.singularity_image + ], + threads: get_threads(cluster_config=cluster_config, rule_name="download_containers") + message: + "Downloading singularity image {output.container_path}" + benchmark: + f"{config['containers_dir']}/benchmarks/{{singularity_image}}.{FileType.SIF}.benchmark.{FileType.TSV}" + log: + f"{config['containers_dir']}/logs/{{singularity_image}}.{FileType.SIF}.{FileType.LOG}", + shell: + """ + export SINGULARITY_CACHEDIR={params.tmp_dir}/{params.singularity_image} + singularity pull {output.container_path} {params.dockerhub_image} &> "{log}" + """ diff --git a/BALSAMIC/snakemake_rules/cache/vep.rule b/BALSAMIC/snakemake_rules/cache/vep.rule new file mode 100644 index 000000000..ac06283a1 --- /dev/null +++ b/BALSAMIC/snakemake_rules/cache/vep.rule @@ -0,0 +1,32 @@ +"""Rules to download VEP package references.""" + + +rule download_vep: + """Download and install VEP package.""" + input: + singularity_image=f"{config['containers_dir']}/{config['bioinfo_tools']['ensembl-vep']}.{FileType.SIF}", + output: + vep_dir=directory(cache_config.vep_dir), + params: + species=Species.HOMO_SAPIENS, + assembly=cache_config.get_grch_version(), + plugins=VEP_PLUGINS, + singularity: + f"{config['containers_dir']}/{config['bioinfo_tools']['ensembl-vep']}.{FileType.SIF}" + threads: get_threads(cluster_config=cluster_config, rule_name="download_vep") + message: + "Downloading and installing VEP package in {output.vep_dir}" + benchmark: + f"{cache_config.vep_dir}/download_vep.benchmark.{FileType.TSV}" + log: + f"{cache_config.vep_dir}/download_vep.{FileType.LOG}", + shell: + """ + vep_install --SPECIES {params.species} \ + --AUTO cfp \ + --ASSEMBLY {params.assembly} \ + --CACHEDIR {output.vep_dir} \ + --PLUGINS {params.plugins} \ + --NO_HTSLIB --CONVERT --NO_UPDATE \ + &> {log} + """ diff --git a/BALSAMIC/snakemake_rules/concatenation/__init__.py b/BALSAMIC/snakemake_rules/concatenation/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/BALSAMIC/snakemake_rules/concatenation/concatenation.rule b/BALSAMIC/snakemake_rules/concatenation/concatenation.rule new file mode 100644 index 000000000..fdd636e23 --- /dev/null +++ b/BALSAMIC/snakemake_rules/concatenation/concatenation.rule @@ -0,0 
+1,29 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +rule concatenate: + """Merge fastq files per lane into a single forward and reverse fastq.""" + input: + fastqs_fwd=lambda wildcards: config_model.get_all_fastqs_for_sample( + sample_name=wildcards.sample, fastq_types=[FastqName.FWD] + ), + fastqs_rev=lambda wildcards: config_model.get_all_fastqs_for_sample( + sample_name=wildcards.sample, fastq_types=[FastqName.REV] + ), + output: + concat_fwd = fastq_dir + "{sample}_concat_R_1.fp.fastq.gz", + concat_rev = fastq_dir + "{sample}_concat_R_2.fp.fastq.gz" + benchmark: + Path(benchmark_dir, "concatenated_{sample}.tsv").as_posix() + params: + fastq_dir = fastq_dir, + sample = "{sample}", + threads: + get_threads(cluster_config, "concatenate") + message: + "Sample {params.sample} FASTQ concatenation" + shell: + """ +cat {input.fastqs_fwd} > {output.concat_fwd} +cat {input.fastqs_rev} > {output.concat_rev} + """ diff --git a/BALSAMIC/snakemake_rules/dragen_suite/dragen_dna.rule b/BALSAMIC/snakemake_rules/dragen_suite/dragen_dna.rule index 76290d03b..d02027ff6 100644 --- a/BALSAMIC/snakemake_rules/dragen_suite/dragen_dna.rule +++ b/BALSAMIC/snakemake_rules/dragen_suite/dragen_dna.rule @@ -6,8 +6,8 @@ rule dragen_align_call_tumor_only: input: reference = config["reference"]["reference_genome"], - read1 = Path(fastq_dir, "{mysample}_1.fp.fastq.gz".format(mysample = tumor_sample)).as_posix(), - read2 = Path(fastq_dir, "{mysample}_2.fp.fastq.gz".format(mysample = tumor_sample)).as_posix(), + fastq_r1 = Path(fastq_dir, "{sample}_concat_R_1.fp.fastq.gz".format(sample=tumor_sample)).as_posix(), + fastq_r2 = Path(fastq_dir, "{sample}_concat_R_2.fp.fastq.gz".format(sample=tumor_sample)).as_posix() output: bam = Path(result_dir, "dragen", "SNV.somatic." + config["analysis"]["case_id"] + ".dragen_tumor.bam").as_posix(), vcf = Path(result_dir, "dragen", "SNV.somatic." 
+ config["analysis"]["case_id"] + ".dragen.vcf.gz").as_posix() @@ -49,8 +49,8 @@ echo "will run the following DRAGEN command" mkdir -p {params.dragen_result_dir} dragen -f \ -r {params.tmp_reference_dir} \ ---tumor-fastq1 {input.read1} \ ---tumor-fastq2 {input.read2} \ +--tumor-fastq1 {input.fastq_r1} \ +--tumor-fastq2 {input.fastq_r2} \ --enable-variant-caller true \ --RGID-tumor {params.sample_name} \ --RGSM-tumor {params.sample_name} \ diff --git a/BALSAMIC/snakemake_rules/pon/cnvkit_create_pon.rule b/BALSAMIC/snakemake_rules/pon/cnvkit_create_pon.rule new file mode 100644 index 000000000..bf91309e1 --- /dev/null +++ b/BALSAMIC/snakemake_rules/pon/cnvkit_create_pon.rule @@ -0,0 +1,52 @@ +"""Rules for creation of CNVkit PON.""" + +rule create_target: + input: + target_bait = target_bed, + refgene_flat = refgene_flat, + access_bed = access_5kb_hg19 + output: + target_bed = cnv_dir + "target.bed", + offtarget_bed = cnv_dir + "antitarget.bed" + singularity: + Path(singularity_image, "varcall_cnvkit.sif").as_posix() + benchmark: + Path(benchmark_dir, "cnvkit.targets.tsv").as_posix() + shell: + """ +cnvkit.py target {input.target_bait} --annotate {input.refgene_flat} --split -o {output.target_bed}; +cnvkit.py antitarget {input.target_bait} -g {input.access_bed} -o {output.offtarget_bed}; + """ + +rule create_coverage: + input: + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), + target_bed = cnv_dir + "target.bed", + antitarget_bed = cnv_dir + "antitarget.bed" + output: + target_cnn = cnv_dir + "{sample}.targetcoverage.cnn", + antitarget_cnn = cnv_dir + "{sample}.antitargetcoverage.cnn" + singularity: + Path(singularity_image, "varcall_cnvkit.sif").as_posix() + benchmark: + Path(benchmark_dir, "cnvkit_{sample}.coverage.tsv").as_posix() + shell: + """ +cnvkit.py coverage {input.bam} {input.target_bed} -o {output.target_cnn}; +cnvkit.py coverage {input.bam} {input.antitarget_bed} -o {output.antitarget_cnn}; + """ + +rule create_reference: + input: + cnn = expand(cnv_dir + "{sample}.{prefix}coverage.cnn", sample=config_model.get_all_sample_names(), prefix=["target", "antitarget"]), + ref = reffasta + output: + ref_cnn = pon_reference + singularity: + Path(singularity_image, "varcall_cnvkit.sif").as_posix() + benchmark: + Path(benchmark_dir, "cnvkit.reference.tsv").as_posix() + shell: + """ +cnvkit.py reference {input.cnn} --fasta {input.ref} -o {output.ref_cnn} ; + """ diff --git a/BALSAMIC/snakemake_rules/pon/gens_create_pon.rule b/BALSAMIC/snakemake_rules/pon/gens_create_pon.rule new file mode 100644 index 000000000..5800216b9 --- /dev/null +++ b/BALSAMIC/snakemake_rules/pon/gens_create_pon.rule @@ -0,0 +1,33 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +rule gatk_create_readcount_pon: + input: + readcounts_hdf5s = expand(cnv_dir + "{sample}.collectreadcounts.hdf5", sample=sample_names) + output: + pon_hdf5 = cnv_dir + "gens_pon_100bp.{gender}.{version}.hdf5" + params: + tmpdir=tempfile.mkdtemp(prefix=tmp_dir), + gender="{gender}" + benchmark: + Path(benchmark_dir, "gatk_create_readcount_pon.{gender}.{version}.tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("gatk") + ".sif").as_posix() + threads: + get_threads(cluster_config, "gatk_create_readcount_pon") + message: + "Running GATK CreateReadCountPanelOfNormals for {params.gender} PON for GENS." 
+ shell: + """ +export TMPDIR={params.tmpdir}; + +shell_readcounts_hdf5s=$(echo {input.readcounts_hdf5s} | sed 's/ / -I /g') ; + +gatk --java-options "-Xmx170000m" CreateReadCountPanelOfNormals \ +--minimum-interval-median-percentile 10.0 \ +--maximum-chunk-size 29349635 \ +-O {output.pon_hdf5} \ +-I $shell_readcounts_hdf5s + +rm -rf {params.tmpdir} + """ diff --git a/BALSAMIC/snakemake_rules/quality_control/GATK.rule b/BALSAMIC/snakemake_rules/quality_control/GATK.rule index 443125ac5..5a11e6151 100644 --- a/BALSAMIC/snakemake_rules/quality_control/GATK.rule +++ b/BALSAMIC/snakemake_rules/quality_control/GATK.rule @@ -5,8 +5,8 @@ rule PreparePopVCF: input: - bam = bam_dir + "tumor.merged.bam", - ref1kg = config["reference"]["1kg_snps_all"], + bam = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + ref1kg = config["reference"]["vcf_1kg"], output: popvcf = result_dir + "popvcf.vcf" benchmark: diff --git a/BALSAMIC/snakemake_rules/quality_control/contest.rule b/BALSAMIC/snakemake_rules/quality_control/contest.rule index 35b2eb1fd..7cb339823 100644 --- a/BALSAMIC/snakemake_rules/quality_control/contest.rule +++ b/BALSAMIC/snakemake_rules/quality_control/contest.rule @@ -5,8 +5,8 @@ rule gatk_contest: input: - bamN = bam_dir + "normal.merged.bam", - bamT = bam_dir + "tumor.merged.bam", + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), fa = config["reference"]["reference_genome"], popvcf = result_dir + "popvcf.vcf", output: diff --git a/BALSAMIC/snakemake_rules/quality_control/fastp.rule b/BALSAMIC/snakemake_rules/quality_control/fastp.rule deleted file mode 100644 index ec0dfa659..000000000 --- a/BALSAMIC/snakemake_rules/quality_control/fastp.rule +++ /dev/null @@ -1,111 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - -if 'quality_trim' in config['QC'].keys(): - fastp_param_qc = list() - fastp_param_adapter = list() - fastp_param_umi = list() - if config['QC']['quality_trim']: - fastp_param_qc.extend(["--trim_tail1", "1", "--n_base_limit", "50", - "--length_required", config["QC"]["min_seq_length"], - "--low_complexity_filter", "--trim_poly_g"]) - else: - fastp_param_qc.extend(["--disable_quality_filtering", - "--disable_length_filtering", - "--disable_trim_poly_g"]) - - if not config['QC']['adapter_trim']: - fastp_param_adapter.extend(["--disable_adapter_trimming"]) - else: - fastp_param_adapter.extend(["--detect_adapter_for_pe"]) - - # UMI trimming will work only if adapter_trim is disabled - if config['QC']['umi_trim']: - fastp_param_umi.extend(["--umi","--umi_loc per_read", - "--umi_len", config['QC']['umi_trim_length'], - "--umi_prefix","UMI"]) - - - -rule fastp_umi: - input: - read1=config["analysis"]["fastq_path"] + "{sample}" + "_1.fastq.gz", - read2=config["analysis"]["fastq_path"] + "{sample}" + "_2.fastq.gz", - output: - read1 = temp(fastq_dir + "{sample}_1.umi_optimized.fastq.gz"), - read2 = temp(fastq_dir + "{sample}_2.umi_optimized.fastq.gz"), - json = qc_dir + "fastp/{sample}_fastp_umi.json", - html = qc_dir + "fastp/{sample}_fastp_umi.html", - benchmark: - Path(benchmark_dir, "fastp_umi" + "{sample}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix() - params: - tmpdir = tmp_dir, - qc = " ".join(fastp_param_qc), - adapter = " ".join(fastp_param_adapter), - sample_name = "{sample}", - threads: - get_threads(cluster_config, 'fastp_umi') - 
message: - "Quality control and trimming input fastq for {params.sample_name}" - shell: - """ -export TMPDIR={params.tmpdir}; - -fastp \ ---thread {threads} \ ---in1 {input.read1} \ ---in2 {input.read2} \ ---out1 {output.read1} \ ---out2 {output.read2} \ ---json {output.json} \ ---html {output.html} \ ---overrepresentation_analysis \ -{params.qc} \ -{params.adapter}; - """ -# Double pass to hard trim adapter and UMIs - - -rule fastp: - input: - read1 = fastq_dir + "{sample}_1.umi_optimized.fastq.gz", - read2 = fastq_dir + "{sample}_2.umi_optimized.fastq.gz" - output: - read1 = temp(fastq_dir + "{sample}_1.fp.fastq.gz"), - read2 = temp(fastq_dir + "{sample}_2.fp.fastq.gz"), - json = qc_dir + "fastp/{sample}_fastp.json", - html = qc_dir + "fastp/{sample}_fastp.html" - benchmark: - Path(benchmark_dir, "fastp_" + "{sample}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix() - params: - tmpdir = tmp_dir, - umi = " ".join(fastp_param_umi), - minimum_length = config["QC"]["min_seq_length"], - sample_name = "{sample}", - threads: - get_threads(cluster_config, 'fastp') - message: - "Quality control and trimming of umi optimized fastq file for {params.sample_name}" - shell: - """ -export TMPDIR={params.tmpdir}; - -fastp \ ---thread {threads} \ ---in1 {input.read1} \ ---in2 {input.read2} \ ---out1 {output.read1} \ ---out2 {output.read2} \ ---json {output.json} \ ---html {output.html} \ ---disable_adapter_trimming \ ---disable_quality_filtering \ ---disable_length_filtering \ ---disable_trim_poly_g \ ---length_required {params.minimum_length} \ -{params.umi}; - """ diff --git a/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule b/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule new file mode 100644 index 000000000..a9abae72f --- /dev/null +++ b/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule @@ -0,0 +1,83 @@ +"""Rules for TGA fastq pre-processing: removal of UMIs, and quality and adapter trimming.""" + +rule fastp_umi_trim: + """Fastq TGA data pre-processing to remove UMIs.""" + input: + fastq_r1 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.FWD), + fastq_r2 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.REV) + output: + fastq_r1 = temp(fastq_dir + "{fastq_pattern}_1.umi_removed.fastq.gz"), + fastq_r2 = temp(fastq_dir + "{fastq_pattern}_2.umi_removed.fastq.gz"), + json = qc_dir + "fastp/{fastq_pattern}_umi_removed_fastp.json", + html = qc_dir + "fastp/{fastq_pattern}_umi_removed_fastp.html", + benchmark: + Path(benchmark_dir, "fastp_remove_umi" + "{fastq_pattern}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix() + params: + tmpdir = tmp_dir, + fastp_trim_umi = " ".join(fastp_parameters["fastp_trim_umi"]), + fastq_pattern = "{fastq_pattern}", + threads: + get_threads(cluster_config, 'fastp_remove_umi') + message: + "Trimming away UMI sequences from fastqs matching fastq pattern: {params.fastq_pattern}" + shell: + """ +export TMPDIR={params.tmpdir}; + +fastp \ +--thread {threads} \ +--in1 {input.fastq_r1} \ +--in2 {input.fastq_r2} \ +--out1 {output.fastq_r1} \ +--out2 {output.fastq_r2} \ +--json {output.json} \ +--html {output.html} \ +--disable_adapter_trimming \ +--disable_quality_filtering \ +--disable_length_filtering \ +--disable_trim_poly_g \ +{params.fastp_trim_umi}; + """ + + +rule fastp_quality_trim_tga: + """Fastq data pre-processing after removal of UMIs."""
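# The TGA branch runs fastp twice: the rule above only relocates UMI bases into the read
# names (all trimming disabled), while the rule below performs the actual quality and
# adapter trimming. The fastp_parameters lists consumed by both rules are defined elsewhere
# in the workflow; a plausible shape, mirroring the flag groups of the removed fastp.rule
# (the exact values here are assumptions, not the workflow's verified defaults):
fastp_parameters = {
    "fastp_trim_umi": ["--umi", "--umi_loc", "per_read", "--umi_len", "3", "--umi_prefix", "UMI"],
    "fastp_trim_qual": ["--trim_tail1", "1", "--n_base_limit", "50", "--low_complexity_filter", "--trim_poly_g"],
    "fastp_trim_adapter": ["--detect_adapter_for_pe"],
}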
input: + fastq_r1 = fastq_dir + "{fastq_pattern}_1.umi_removed.fastq.gz", + fastq_r2 = fastq_dir + "{fastq_pattern}_2.umi_removed.fastq.gz" + output: + fastq_r1 = temp(fastq_dir + "{fastq_pattern}_1.fp.fastq.gz"), + fastq_r2 = temp(fastq_dir + "{fastq_pattern}_2.fp.fastq.gz"), + json = qc_dir + "fastp/{fastq_pattern}_fastp.json", + html = qc_dir + "fastp/{fastq_pattern}_fastp.html" + benchmark: + Path(benchmark_dir, "fastp_quality_trim" + "{fastq_pattern}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix() + params: + tmpdir = tmp_dir, + quality_trim = " ".join(fastp_parameters["fastp_trim_qual"]), + adapter_trim = " ".join(fastp_parameters["fastp_trim_adapter"]), + fastq_pattern = "{fastq_pattern}" + threads: + get_threads(cluster_config, 'fastp_quality_trim') + message: + "Quality and adapter trimming for fastqs for fastq pattern: {params.fastq_pattern}" + shell: + """ +export TMPDIR={params.tmpdir}; + +fastp \ +--thread {threads} \ +--in1 {input.fastq_r1} \ +--in2 {input.fastq_r2} \ +--out1 {output.fastq_r1} \ +--out2 {output.fastq_r2} \ +--json {output.json} \ +--html {output.html} \ +--overrepresentation_analysis \ +{params.quality_trim} \ +{params.adapter_trim}; + """ diff --git a/BALSAMIC/snakemake_rules/quality_control/fastp_wgs.rule b/BALSAMIC/snakemake_rules/quality_control/fastp_wgs.rule new file mode 100644 index 000000000..743d50db3 --- /dev/null +++ b/BALSAMIC/snakemake_rules/quality_control/fastp_wgs.rule @@ -0,0 +1,42 @@ +"""Rules for WGS fastq pre-processing: quality and adapter-trimming.""" + + +rule fastp_quality_trim_wgs: + """Fastq data pre-processing for WGS.""" + input: + fastq_r1 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.FWD), + fastq_r2 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.REV) + output: + fastq_r1 = temp(fastq_dir + "{fastq_pattern}_1.fp.fastq.gz"), + fastq_r2 = temp(fastq_dir + "{fastq_pattern}_2.fp.fastq.gz"), + json = qc_dir + "fastp/{fastq_pattern}_fastp.json", + html = qc_dir + "fastp/{fastq_pattern}_fastp.html" + benchmark: + Path(benchmark_dir, "fastp_quality_trim" + "{fastq_pattern}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix() + params: + tmpdir = tmp_dir, + quality_trim = " ".join(fastp_parameters["fastp_trim_qual"]), + adapter_trim = " ".join(fastp_parameters["fastp_trim_adapter"]), + fastq_pattern = "{fastq_pattern}" + threads: + get_threads(cluster_config, 'fastp') + message: + "Quality control and trimming of fastqs for fastq pattern: {params.fastq_pattern}" + shell: + """ +export TMPDIR={params.tmpdir}; + +fastp \ +--thread {threads} \ +--in1 {input.fastq_r1} \ +--in2 {input.fastq_r2} \ +--out1 {output.fastq_r1} \ +--out2 {output.fastq_r2} \ +--json {output.json} \ +--html {output.html} \ +--overrepresentation_analysis \ +{params.quality_trim} \ +{params.adapter_trim}; + """ diff --git a/BALSAMIC/snakemake_rules/quality_control/fastqc.rule b/BALSAMIC/snakemake_rules/quality_control/fastqc.rule index 20f413cbe..493a892fd 100644 --- a/BALSAMIC/snakemake_rules/quality_control/fastqc.rule +++ b/BALSAMIC/snakemake_rules/quality_control/fastqc.rule @@ -1,37 +1,29 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -# Following rule will take input fastq files, align them using bwa mem, and convert the output to sam format - - rule fastqc: + """Perform quality control checks on raw sequence
data.""" input: - read1 = fastq_dir + "{sample}_1.fastq.gz", - read2 = fastq_dir + "{sample}_2.fastq.gz", + fastq = input_fastq_dir + "{fastq_file_names}.fastq.gz" output: - read1 = fastqc_dir + "{sample}_1_fastqc.zip", - read2 = fastqc_dir + "{sample}_2_fastqc.zip" + fastqc_zip = fastqc_dir + "{fastq_file_names}_fastqc.zip" benchmark: - Path(benchmark_dir, "fastqc_{sample}.tsv").as_posix() + Path(benchmark_dir, "fastqc_{fastq_file_names}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("fastqc") + ".sif").as_posix() params: fastqc_dir = fastqc_dir, - sample = "{sample}", - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + fastq_file_name = "{fastq_file_names}", + tmpdir = tempfile.mkdtemp(prefix=tmp_dir) threads: get_threads(cluster_config, "fastqc") message: - "Running FastQC on {params.sample}" + "Running FastQC on {params.fastq_file_name}" shell: """ mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; -fastqc --threads {threads} {input.read1} \ ---dir {params.tmpdir} \ ---outdir {params.fastqc_dir}; - -fastqc --threads {threads} {input.read2} \ +fastqc --threads {threads} {input.fastq} \ --dir {params.tmpdir} \ --outdir {params.fastqc_dir}; """ diff --git a/BALSAMIC/snakemake_rules/quality_control/mosdepth.rule b/BALSAMIC/snakemake_rules/quality_control/mosdepth.rule index 01ad507a8..c66487c46 100644 --- a/BALSAMIC/snakemake_rules/quality_control/mosdepth.rule +++ b/BALSAMIC/snakemake_rules/quality_control/mosdepth.rule @@ -5,16 +5,16 @@ rule mosdepth_coverage: input: - bam = bam_dir + "{sample}" + ".sorted." + picarddup + ".bam", + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), bed = config["panel"]["capture_kit"] output: - bam_dir + "{sample}.mosdepth.global.dist.txt", - bam_dir + "{sample}.mosdepth.region.dist.txt", - bam_dir + "{sample}.mosdepth.summary.txt", - bam_dir + "{sample}.per-base.bed.gz", - bam_dir + "{sample}.regions.bed.gz" + bam_dir + "{sample}_{sample_type}.mosdepth.global.dist.txt", + bam_dir + "{sample}_{sample_type}.mosdepth.region.dist.txt", + bam_dir + "{sample}_{sample_type}.mosdepth.summary.txt", + bam_dir + "{sample}_{sample_type}.per-base.bed.gz", + bam_dir + "{sample}_{sample_type}.regions.bed.gz" benchmark: - Path(benchmark_dir, "mosdepth_coverage_" + "{sample}.tsv").as_posix() + Path(benchmark_dir, "mosdepth_coverage_" + "{sample}_{sample_type}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("mosdepth") + ".sif").as_posix() params: @@ -40,6 +40,6 @@ mosdepth \ --flag {params.samflag} \ --quantize {params.quantize} \ --threads {threads} \ -{params.output_dir}/{params.sample_name} \ +{params.output_dir}/{params.sample_name}_{wildcards.sample_type} \ {input.bam}; """ diff --git a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule index adf8fedf6..176da7b15 100644 --- a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule +++ b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule @@ -1,65 +1,92 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -multiqc_input = [bam_dir + "tumor.merged.bam"] +multiqc_input = [config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample)] + +multiqc_input.append(qc_dir + "tumor.{}.dedup.metrics".format(tumor_sample)) +multiqc_input.extend(expand(bam_dir + "{sample}.samtools.{stats}.txt", + sample=sample_names, stats=['flagstats', 'idxstats', 'stats'])) + + +multiqc_input.extend(expand(bam_dir + "tumor.{sample}.cram", 
sample=tumor_sample)) + +# fastqc metrics +multiqc_input.extend(expand(fastqc_dir + "{fastq_file_names}_fastqc.zip", fastq_file_names=config_model.get_all_fastq_names(remove_suffix = True))) if config['analysis']['analysis_type'] == "paired": - multiqc_input.append(bam_dir + "normal.merged.bam") + multiqc_input.extend(expand(bam_dir + "normal.{sample}.cram", sample=normal_sample)) + multiqc_input.append(config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample)) multiqc_input.append(qc_dir + "somalier/somalier.pairs.tsv") + multiqc_input.append(qc_dir + "normal.{}.dedup.metrics".format(normal_sample)) + -# Following rule will take input fastq files, align them using bwa mem, and convert the output to sam format if config["analysis"]["sequencing_type"] == 'wgs': picard_metrics_wildcard = ["alignment_summary_metrics", "base_distribution_by_cycle_metrics", "base_distribution_by_cycle.pdf", "insert_size_histogram.pdf", "insert_size_metrics", "quality_by_cycle_metrics", "quality_by_cycle.pdf", "quality_distribution_metrics", "quality_distribution.pdf"] - # fastqc metrics - multiqc_input.extend(expand(fastqc_dir + "{sample}_{read_num}_fastqc.zip", sample=config["samples"], read_num=[1, 2])) # fastp metrics - multiqc_input.extend(expand(qc_dir + "fastp/{sample}_fastp.json", sample=config["samples"])) - multiqc_input.extend(expand(qc_dir + "fastp/{sample}_fastp.html", sample=config["samples"])) + multiqc_input.extend(expand(qc_dir + "fastp/{fastq_pattern}_fastp.json", + fastq_pattern=config_model.get_fastq_patterns_by_sample(sample_names = sample_names))) + multiqc_input.extend(expand(qc_dir + "fastp/{fastq_pattern}_fastp.html", + fastq_pattern=config_model.get_fastq_patterns_by_sample(sample_names = sample_names))) + # Picard metrics - multiqc_input.extend(expand(qc_dir + "{sample}_picard_wgs_metrics.txt", sample=config["samples"])) - multiqc_input.extend(expand(qc_dir + "{sample}.multiple_metrics.{metrics_wc}", sample=config["samples"], metrics_wc=picard_metrics_wildcard)) + multiqc_input.extend(expand(qc_dir + "{sample}_picard_wgs_metrics.txt", + sample=sample_names)) + multiqc_input.extend(expand(qc_dir + "{sample}.multiple_metrics.{metrics_wc}", + sample=sample_names, metrics_wc=picard_metrics_wildcard)) + multiqc_input.extend(expand(qc_dir + "{sample}.dedup.realign.hsmetric.txt", sample=sample_names)) + multiqc_input.extend(expand(qc_dir + "{sample}.dedup.realign.gc_bias_metrics.txt", sample=sample_names)) + multiqc_input.extend(expand(qc_dir + "{sample}.dedup.realign.gc_bias_metrics.sum_metrics.txt", sample=sample_names)) + multiqc_input.extend(expand(qc_dir + "{sample}.dedup.realign.gc_bias_metrics.chart.pdf", sample=sample_names)) # Sentieon metrics - multiqc_input.extend(expand(qc_dir + "{sample}_sentieon_wgs_metrics.txt", sample=config["samples"])) - multiqc_input.extend(expand(qc_dir + "{sample}_coverage.gz", sample=config["samples"])) - multiqc_input.append(bam_dir+"tumor.merged.recal_data.table") - if config['analysis']['analysis_type'] == "paired": - multiqc_input.append(bam_dir+"normal.merged.recal_data.table") + multiqc_input.extend(expand(qc_dir + "{sample}_sentieon_wgs_metrics.txt", sample=sample_names)) + multiqc_input.extend(expand(qc_dir + "{sample}_coverage.gz", sample=sample_names)) -else: - # fastqc metrics - multiqc_input.extend(expand(fastqc_dir + "{sample}_{read_num}_fastqc.zip", sample=config["samples"], read_num=[1, 2])) + if config["analysis"]["analysis_workflow"] != "balsamic-qc": + multiqc_input.append(bam_dir + "tumor.recal_data.table") + if 
config['analysis']['analysis_type'] == "paired": + multiqc_input.append(bam_dir + "normal.recal_data.table") + +else: # fastp metrics - multiqc_input.extend(expand(qc_dir + "fastp/{sample}_fastp.json", sample=config["samples"])) - multiqc_input.extend(expand(qc_dir + "fastp/{sample}_fastp.html", sample=config["samples"])) + multiqc_input.extend(expand(qc_dir + "fastp/{fastq_pattern}_fastp.json", + fastq_pattern=config_model.get_fastq_patterns_by_sample(sample_names = sample_names))) + multiqc_input.extend(expand(qc_dir + "fastp/{fastq_pattern}_fastp.html", + fastq_pattern=config_model.get_fastq_patterns_by_sample(sample_names = sample_names))) # picard metrics - multiqc_input.extend(expand(bam_dir + "{sample}.sorted.insertsizemetric", sample=config["samples"])) - multiqc_input.extend(expand(bam_dir + "{sample}.sorted.alignmetric", sample=config["samples"])) - multiqc_input.extend(expand(bam_dir + "{sample}.sorted."+ picarddup +".hsmetric", sample=config["samples"])) + multiqc_input.extend(expand(bam_dir + "{sample}.dedup.insertsizemetric.txt", sample=tumor_sample)) + multiqc_input.extend(expand(bam_dir + "{sample}.dedup.alignmetric.txt", sample=tumor_sample)) + if config['analysis']['analysis_type'] == "paired": + multiqc_input.extend(expand(bam_dir + "{sample}.dedup.insertsizemetric.txt", sample=normal_sample)) + multiqc_input.extend(expand(bam_dir + "{sample}.dedup.alignmetric.txt", sample=normal_sample)) + + multiqc_input.extend(expand(bam_dir + "{sample}.dedup.hsmetric.txt", sample=sample_names)) # sambamba metrics - multiqc_input.extend(expand(bam_dir + "{sample}.sorted." + picarddup + ".cov.bed", sample=config["samples"])) - multiqc_input.extend(expand(bam_dir + "{sample}.sorted." + picarddup + ".exon.cov.bed", sample=config["samples"])) - - # mosdepth metrics - multiqc_input.extend(expand(bam_dir + "{sample}.mosdepth.global.dist.txt", sample=config["samples"])) - multiqc_input.extend(expand(bam_dir + "{sample}.mosdepth.region.dist.txt", sample=config["samples"])) - multiqc_input.extend(expand(bam_dir + "{sample}.mosdepth.summary.txt", sample=config["samples"])) + multiqc_input.extend(expand(bam_dir + "{sample}.dedup.cov.bed", sample=sample_names)) + multiqc_input.extend(expand(bam_dir + "{sample}.dedup.exon.cov.bed", sample=sample_names)) - # samtools metrics - multiqc_input.extend(expand(bam_dir + "{sample}.samtools.{stats}.txt", sample=config["samples"], stats=['flagstats', 'idxstats', 'stats'])) + # mosdepth metrics + mosdepth_metrics_wildcard = ["mosdepth.global.dist.txt", "mosdepth.region.dist.txt", + "mosdepth.summary.txt", "per-base.bed.gz", "regions.bed.gz"] + multiqc_input.extend(expand(bam_dir + "{sample}_tumor.{mosdepth_wc}", + sample=tumor_sample, mosdepth_wc = mosdepth_metrics_wildcard)) + if config['analysis']['analysis_type'] == "paired": + multiqc_input.extend(expand(bam_dir + "{sample}_normal.{mosdepth_wc}", + sample=normal_sample, mosdepth_wc = mosdepth_metrics_wildcard)) if config["analysis"]["analysis_workflow"]=="balsamic-umi": # UMI picard metrics - multiqc_input.extend(expand(umi_qc_dir + "{sample}.umi.collect_hsmetric", sample=config["samples"])) - multiqc_input.extend(expand(umi_qc_dir + "{sample}.umi.metrics", sample=config["samples"])) + multiqc_input.extend(expand(umi_qc_dir + "{sample}.umi.collect_hsmetric.txt", sample=sample_names)) + multiqc_input.extend(expand(umi_qc_dir + "{sample}.umi.metrics.txt", sample=sample_names)) + rule multiqc: input: diff --git a/BALSAMIC/snakemake_rules/quality_control/picard.rule 
b/BALSAMIC/snakemake_rules/quality_control/picard.rule index 6a159629b..66709ec77 100644 --- a/BALSAMIC/snakemake_rules/quality_control/picard.rule +++ b/BALSAMIC/snakemake_rules/quality_control/picard.rule @@ -10,26 +10,25 @@ rule picard_CollectHsMetrics: input: fadict = (config["reference"]["reference_genome"]).replace(".fasta",".dict"), bed = config["panel"]["capture_kit"], - bam = bam_dir + "{sample}.sorted." + picarddup + ".bam", + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), fa = config["reference"]["reference_genome"], output: - mrkdup = bam_dir + "{sample}.sorted." + picarddup + ".hsmetric" + hs_metrics = bam_dir + "{sample}.dedup.hsmetric.txt" benchmark: - Path(benchmark_dir + "picard_CollectHsMetrics_" + "{sample}.tsv").as_posix() + Path(benchmark_dir + "picard_CollectHsMetrics_{sample}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("picard") + ".sif").as_posix() params: mem = memory, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - baitsetname = os.path.basename(config["panel"]["capture_kit"]), - sample = '{sample}' + baitsetname = Path(config["panel"]["capture_kit"]).name, + sample = "{sample}" threads: - get_threads(cluster_config, 'picard_CollectHsMetrics') + get_threads(cluster_config, "picard_CollectHsMetrics") message: - "Calculating picard HsMetrics for sample '{params.sample}'" + "Calculating picard HsMetrics for sample {params.sample}" shell: """ -mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ @@ -43,7 +42,7 @@ CollectHsMetrics \ BI={input.bam}.picard.bedintervals \ TI={input.bam}.picard.bedintervals \ I={input.bam} \ -O={output.mrkdup} \ +O={output.hs_metrics} \ R={input.fa} \ BAIT_SET_NAME={params.baitsetname} \ COVERAGE_CAP=50000 \ @@ -53,26 +52,25 @@ METRIC_ACCUMULATION_LEVEL=ALL_READS; rule picard_CollectAlignmentSummaryMetrics: input: - bam = bam_dir + "{sample}.sorted.bam", + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), fa = config["reference"]["reference_genome"] output: - bam_dir + "{sample}.sorted.alignmetric" + bam_dir + "{sample}.dedup.alignmetric.txt" benchmark: - Path(benchmark_dir, "CollectAlignmentSummaryMetrics_" + "{sample}.tsv").as_posix() + Path(benchmark_dir, "CollectAlignmentSummaryMetrics_{sample}.tsv").as_posix() singularity: Path(singularity_image,config["bioinfo_tools"].get("picard") + ".sif").as_posix() params: mem = "16g", tmpdir = tempfile.mkdtemp(prefix=tmp_dir), adapter = config["QC"]["adapter"], - sample = '{sample}' + sample = "{sample}" threads: - get_threads(cluster_config,'picard_CollectAlignmentSummaryMetrics') + get_threads(cluster_config,"picard_CollectAlignmentSummaryMetrics") message: - "Calculating picard alignment summary metrics for sample '{params.sample}'" + "Calculating picard alignment summary metrics for sample {params.sample}" shell: """ -mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ @@ -88,25 +86,24 @@ METRIC_ACCUMULATION_LEVEL=LIBRARY; rule picard_CollectInsertSizeMetrics: input: - bam = bam_dir + "{sample}.sorted.bam" + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), output: - pdf = bam_dir + "{sample}.sorted.insertsizemetric.pdf", - txt = bam_dir + "{sample}.sorted.insertsizemetric" + pdf = bam_dir + "{sample}.dedup.insertsizemetric.pdf", + txt = bam_dir + 
"{sample}.dedup.insertsizemetric.txt" benchmark: - Path(benchmark_dir, "picard_CollectInsertSizeMetrics_" + "{sample}.tsv").as_posix() + Path(benchmark_dir, "picard_CollectInsertSizeMetrics_{sample}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("picard") + ".sif").as_posix() params: mem = "16g", tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - sample = '{sample}' + sample = "{sample}" threads: - get_threads(cluster_config,'picard_CollectInsertSizeMetrics') + get_threads(cluster_config,"picard_CollectInsertSizeMetrics") message: - "Calculating picard InsertSize metrics for sample '{params.sample}'" + "Calculating picard InsertSize metrics for sample {params.sample}" shell: """ -mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ diff --git a/BALSAMIC/snakemake_rules/quality_control/picard_wgs.rule b/BALSAMIC/snakemake_rules/quality_control/picard_wgs.rule index e103cc85b..d459fff3e 100644 --- a/BALSAMIC/snakemake_rules/quality_control/picard_wgs.rule +++ b/BALSAMIC/snakemake_rules/quality_control/picard_wgs.rule @@ -7,15 +7,91 @@ picard_metrics_wildcard = ["alignment_summary_metrics", "base_distribution_by_cy "quality_by_cycle.pdf", "quality_distribution_metrics", "quality_distribution.pdf"] +rule picard_CollectHsMetrics_WGS: + input: + fadict = config_model.reference["reference_genome"].replace(".fasta",".dict"), + bed = config_model.reference["refgene_bed"], + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), + fa = config_model.reference["reference_genome"], + output: + hs_metrics = qc_dir + "{sample}.dedup.realign.hsmetric.txt" + benchmark: + Path(benchmark_dir + "picard_CollectHsMetrics_{sample}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("picard") + ".sif").as_posix() + params: + mem = "16g", + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + bed_name = Path(config_model.reference["refgene_bed"]).name, + sample = "{sample}" + threads: + get_threads(cluster_config, "picard_CollectHsMetrics") + message: + "Calculating picard HsMetrics for sample {params.sample}" + shell: + """ +export TMPDIR={params.tmpdir}; + +picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ +BedToIntervalList \ +I={input.bed} \ +O={input.bam}.picard.bedintervals \ +SD={input.fadict}; + +picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ +CollectHsMetrics \ +BI={input.bam}.picard.bedintervals \ +TI={input.bam}.picard.bedintervals \ +I={input.bam} \ +O={output.hs_metrics} \ +R={input.fa} \ +BAIT_SET_NAME={params.bed_name} \ +COVERAGE_CAP=50000 \ +METRIC_ACCUMULATION_LEVEL=ALL_READS; + """ + +rule picard_CollectGcBiasMetrics: + input: + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), + fa = config_model.reference["reference_genome"], + output: + gc_bias_metrics = qc_dir + "{sample}.dedup.realign.gc_bias_metrics.txt", + gc_bias_sum_metrics = qc_dir + "{sample}.dedup.realign.gc_bias_metrics.sum_metrics.txt", + gc_bias_sum_chart = qc_dir + "{sample}.dedup.realign.gc_bias_metrics.chart.pdf" + benchmark: + Path(benchmark_dir + "picard_CollectHsMetrics_{sample}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("picard") + ".sif").as_posix() + params: + mem = "16g", + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + bed_name = Path(config_model.reference["refgene_bed"]).name, + sample = "{sample}" + threads: + 
get_threads(cluster_config, "picard_CollectHsMetrics") + message: + "Calculating picard GC bias metrics for sample {params.sample}" + shell: + """ +export TMPDIR={params.tmpdir}; + +picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ +CollectGcBiasMetrics \ +I={input.bam} \ +O={output.gc_bias_metrics} \ +R={input.fa} \ +S={output.gc_bias_sum_metrics} \ +CHART={output.gc_bias_sum_chart}; + """ rule picard_CollectMultipleMetrics: input: - bam = bam_dir + "{sample}.dedup.bam", + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), reference = config["reference"]["reference_genome"] output: - expand(qc_dir + "{{sample}}.multiple_metrics.{metrics_wc}", sample = config["samples"], metrics_wc = picard_metrics_wildcard) + expand(qc_dir + "{{sample}}.multiple_metrics.{metrics_wc}", metrics_wc = picard_metrics_wildcard) benchmark: - Path(benchmark_dir, "picard_CollectMultipleMetrics_" + "{sample}.tsv").as_posix() + Path(benchmark_dir, "picard_CollectMultipleMetrics_{sample}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("picard") + ".sif").as_posix() params: @@ -24,12 +100,11 @@ rule picard_CollectMultipleMetrics: output_prefix = qc_dir + "{sample}.multiple_metrics", sample = '{sample}' threads: - get_threads(cluster_config,'picard_CollectMultipleMetrics') + get_threads(cluster_config, "picard_CollectMultipleMetrics") message: "Collecting picard multiple quality metrics for wgs sample {params.sample}" shell: """ -mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ @@ -42,25 +117,24 @@ R={input.reference}; rule picard_CollectWgsMetrics: input: - bam = bam_dir + "{sample}.dedup.bam", + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), reference = config["reference"]["reference_genome"] output: qc_dir + "{sample}_picard_wgs_metrics.txt" benchmark: - Path(benchmark_dir + "picard_CollectWgsMetrics_" + "{sample}.tsv").as_posix() + Path(benchmark_dir + "picard_CollectWgsMetrics_{sample}.tsv").as_posix() singularity: Path(singularity_image,config[ "bioinfo_tools" ].get("picard") + ".sif").as_posix() params: mem = "16g", tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - sample = '{sample}' + sample = "{sample}" threads: - get_threads(cluster_config,'picard_CollectWgsMetrics') + get_threads(cluster_config,"picard_CollectWgsMetrics") message: - "Collecting various picard quality metrics for wgs sample '{params.sample}'" + "Collecting various picard quality metrics for wgs sample {params.sample}" shell: """ -mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ diff --git a/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule b/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule index c894f41e3..860746bac 100644 --- a/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule +++ b/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule @@ -21,11 +21,9 @@ rule collect_custom_qc_metrics: output: yaml = qc_dir + config["analysis"]["case_id"] + "_metrics_deliverables.yaml" params: - config_path = analysis_dir + config["analysis"]["case_id"] + ".json", + config_path = f"{analysis_dir_home}/{case_id}/{case_id}.json", collect_qc_metrics_script = get_script_path("collect_qc_metrics.py"), housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "qc-metrics"} - singularity: - Path(singularity_image, "balsamic.sif").as_posix() threads:
get_threads(cluster_config, "collect_custom_qc_metrics") message: diff --git a/BALSAMIC/snakemake_rules/quality_control/report.rule b/BALSAMIC/snakemake_rules/quality_control/report.rule index 57067fc2d..073b5a7f6 100644 --- a/BALSAMIC/snakemake_rules/quality_control/report.rule +++ b/BALSAMIC/snakemake_rules/quality_control/report.rule @@ -3,14 +3,12 @@ rule cnv_report: input: - cnv_data= cnv_data_paths + cnv_data = cnv_report_paths output: - cnv_report_pdf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".report.pdf" + cnv_report_pdf = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".report.pdf" params: housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, cnv_report_script= get_script_path("generate_cnv_report.py"), - singularity: - Path(singularity_image, "balsamic.sif").as_posix() threads: get_threads(cluster_config, "cnv_report") message: diff --git a/BALSAMIC/snakemake_rules/quality_control/sambamba_depth.rule b/BALSAMIC/snakemake_rules/quality_control/sambamba_depth.rule index 922fc4ff1..203d0da3b 100644 --- a/BALSAMIC/snakemake_rules/quality_control/sambamba_depth.rule +++ b/BALSAMIC/snakemake_rules/quality_control/sambamba_depth.rule @@ -5,10 +5,10 @@ rule sambamba_panel_depth: input: - bam = bam_dir + "{sample}" + ".sorted." + picarddup + ".bam", + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), bed = config["panel"]["capture_kit"] output: - bam_dir + "{sample}.sorted." + picarddup + ".cov.bed" + bam_dir + "{sample}.dedup.cov.bed" benchmark: Path(benchmark_dir, "sambamba_panel_depth_" + "{sample}.tsv").as_posix() singularity: @@ -39,10 +39,10 @@ sambamba depth region \ rule sambamba_exon_depth: input: - bam = bam_dir + "{sample}" + ".sorted." + picarddup + ".bam", - bed = config["reference"]["exon_bed"] + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), + bed = config["reference"]["refgene_bed"] output: - bam_dir + "{sample}.sorted." 
+ picarddup + ".exon.cov.bed" + bam_dir + "{sample}.dedup.exon.cov.bed" benchmark: Path(benchmark_dir, "sambamba_exon_depth_" + "{sample}.tsv").as_posix() singularity: diff --git a/BALSAMIC/snakemake_rules/quality_control/samtools_qc.rule b/BALSAMIC/snakemake_rules/quality_control/samtools_qc.rule new file mode 100644 index 000000000..f39ed5ac3 --- /dev/null +++ b/BALSAMIC/snakemake_rules/quality_control/samtools_qc.rule @@ -0,0 +1,26 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +rule samtools_qc: + input: + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample) + output: + flagstats = Path(bam_dir, "{sample}.samtools.flagstats.txt").as_posix(), + idxstats = Path(bam_dir, "{sample}.samtools.idxstats.txt").as_posix(), + stats = Path(bam_dir, "{sample}.samtools.stats.txt").as_posix(), + benchmark: + Path(benchmark_dir,"samtools_qc_{sample}.tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("samtools") + ".sif").as_posix() + params: + sample_id = "{sample}" + threads: + get_threads(cluster_config, "samtools_qc") + message: + "Calculating alignment stats for sample: {params.sample_id}" + shell: + """ +samtools flagstats --threads {threads} {input.bam} > {output.flagstats}; +samtools stats --threads {threads} {input.bam} > {output.stats}; +samtools idxstats --threads {threads} {input.bam} > {output.idxstats}; + """ diff --git a/BALSAMIC/snakemake_rules/quality_control/sentieon_qc_metrics.rule b/BALSAMIC/snakemake_rules/quality_control/sentieon_qc_metrics.rule index a30b6b898..a221e3974 100644 --- a/BALSAMIC/snakemake_rules/quality_control/sentieon_qc_metrics.rule +++ b/BALSAMIC/snakemake_rules/quality_control/sentieon_qc_metrics.rule @@ -3,18 +3,16 @@ def repeat(param, values): param_values = [] - + for value in values: param_values.append(" ".join(map(str, [param, value]))) return " ".join(param_values) - - rule sentieon_wgs_metrics: input: - bam = bam_dir + '{sample}.dedup.bam', + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), reference = config["reference"]["reference_genome"] output: wgs_metrics = qc_dir + "{sample}_sentieon_wgs_metrics.txt", @@ -24,7 +22,7 @@ rule sentieon_wgs_metrics: params: tmpdir = tempfile.mkdtemp(prefix=tmp_dir), min_base_qual = '10', - gene_list = config["reference"]["refGene"], + gene_list = config["reference"]["refgene_txt"], cov_threshold = repeat("--cov_thresh", [50, 100, 150, 200, 250]), sentieon_exec = config["SENTIEON_EXEC"], sentieon_lic = config["SENTIEON_LICENSE"], @@ -35,7 +33,6 @@ rule sentieon_wgs_metrics: "Calculate coverage metrics for wgs cases using sentieon tools for sample {params.sample}" shell: """ -mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; export SENTIEON_TMPDIR={params.tmpdir}; export SENTIEON_LICENSE={params.sentieon_lic}; @@ -54,7 +51,7 @@ export SENTIEON_LICENSE={params.sentieon_lic}; --gene_list {params.gene_list} {params.cov_threshold} \ {output.coverage_metrics}_tmp; -gzip -c {output.coverage_metrics}_tmp > {output.coverage_metrics}; -rm {output.coverage_metrics}_tmp; -rm -rf {params.tmpdir}; +gzip -c {output.coverage_metrics}_tmp > {output.coverage_metrics} ; +rm {output.coverage_metrics}_tmp ; +rm -rf {params.tmpdir} ; """ diff --git a/BALSAMIC/snakemake_rules/quality_control/somalier.rule b/BALSAMIC/snakemake_rules/quality_control/somalier.rule index ec44e675e..530ea1d25 100644 --- a/BALSAMIC/snakemake_rules/quality_control/somalier.rule +++ 
b/BALSAMIC/snakemake_rules/quality_control/somalier.rule @@ -4,7 +4,7 @@ rule somalier_extract_normal: input: - bamN = bam_dir + "normal.merged.bam", + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), fa = config["reference"]["reference_genome"], ref_sites = config["reference"]["somalier_sites"], output: @@ -28,7 +28,7 @@ rule somalier_extract_normal: rule somalier_extract_tumor: input: - bamT=bam_dir + "tumor.merged.bam", + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), fa = config["reference"]["reference_genome"], ref_sites = config["reference"]["somalier_sites"], output: diff --git a/BALSAMIC/snakemake_rules/report/__init__.py b/BALSAMIC/snakemake_rules/report/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/BALSAMIC/snakemake_rules/report/generate_pdf.rule b/BALSAMIC/snakemake_rules/report/generate_pdf.rule new file mode 100644 index 000000000..39aee569d --- /dev/null +++ b/BALSAMIC/snakemake_rules/report/generate_pdf.rule @@ -0,0 +1,62 @@ +"""Rules for generating TGA PDF reports.""" + + +rule csv_to_pdf: + """Converting LOH report CSV files to PDF format.""" + input: + purity_csv=f"{cnv_dir}CNV.somatic.{config['analysis']['case_id']}.purecn.purity.csv", + output: + purity_csv_pdf=f"{cnv_dir}CNV.somatic.{config['analysis']['case_id']}.purecn.purity.csv.pdf", + params: + csv_to_pdf_script=get_script_path("csv_to_pdf.py"), + loh_regions=f"{cnv_dir}CNV.somatic.{config['analysis']['case_id']}.purecn.LOHregions.csv", + loh_genes=f"{cnv_dir}CNV.somatic.{config['analysis']['case_id']}.purecn.LOHgenes.csv", + threads: + get_threads(cluster_config=cluster_config, rule_name="csv_to_pdf") + message: + "Converting CSV files to PDF" + shell: + """ +for file in "{input.purity_csv}" "{params.loh_regions}" "{params.loh_genes}"; do + if [[ -f "$file" ]]; then + python "{params.csv_to_pdf_script}" "$file" "$file.pdf" --header + fi +done + """ + +rule txt_to_pdf: + """Converting AscatNgs statistics TXT file to PDF format.""" + input: + sample_statistics=f"{vcf_dir}CNV.somatic.{config['analysis']['case_id']}.ascat.samplestatistics.txt" + output: + sample_statistics_pdf=f"{vcf_dir}CNV.somatic.{config['analysis']['case_id']}.ascat.samplestatistics.txt.pdf" + params: + csv_to_pdf_script=get_script_path("csv_to_pdf.py"), + threads: + get_threads(cluster_config=cluster_config, rule_name="txt_to_pdf") + message: + "Converting AscatNgs statistics TXT file to PDF" + shell: + """ +python "{params.csv_to_pdf_script}" "{input.sample_statistics}" "{output.sample_statistics_pdf}" --delimiter " " + """ + + +rule image_to_pdf: + """Converting CNV PNG files to PDF format.""" + input: + plot=f"{vcf_dir}{{plot}}" + output: + plot_pdf=f"{vcf_dir}{{plot}}.{FileType.PDF}" + wildcard_constraints: + plot="|".join(config_model.get_cnv_report_plots()), + params: + image_to_pdf_script=get_script_path("image_to_pdf.py"), + threads: + get_threads(cluster_config=cluster_config, rule_name="image_to_pdf") + message: + "Converting CNV PNG files to PDF" + shell: + """ +python "{params.image_to_pdf_script}" "{input.plot}" "{output.plot_pdf}" + """ diff --git a/BALSAMIC/snakemake_rules/report/merge_pdfs.rule b/BALSAMIC/snakemake_rules/report/merge_pdfs.rule new file mode 100644 index 000000000..dd384dd20 --- /dev/null +++ b/BALSAMIC/snakemake_rules/report/merge_pdfs.rule @@ -0,0 +1,28 @@ +"""Rules for merging PDF reports.""" + + +rule merge_cnv_pdf_reports: + """Rule for merging CNV PDF reports.""" + input: + 
report_paths=cnv_report_paths, + output: + cnv_report_pdf=f"{cnv_dir}CNV.somatic.{config['analysis']['case_id']}.report.pdf", + params: + housekeeper_id={"id": config["analysis"]["case_id"], "tags": "clinical"}, + merge_pdfs_script=get_script_path("merge_pdfs.py"), + loh_regions_pdf=f"{cnv_dir}CNV.somatic.{config['analysis']['case_id']}.purecn.LOHregions.csv.pdf", + loh_genes_pdf=f"{cnv_dir}CNV.somatic.{config['analysis']['case_id']}.purecn.LOHgenes.csv.pdf", + threads: + get_threads(cluster_config=cluster_config, rule_name="merge_cnv_pdf_reports") + message: + "Merging CNV PDF reports {output.cnv_report_pdf}" + benchmark: + Path(benchmark_dir, f"merge_cnv_pdf_reports_{config['analysis']['case_id']}.tsv").as_posix() + shell: + """ +IFS=" " read -r -a report_paths <<< "{input.report_paths}" +if [[ -f "{params.loh_regions_pdf}" ]] && [[ -f "{params.loh_genes_pdf}" ]]; then + report_paths+=("{params.loh_regions_pdf}" "{params.loh_genes_pdf}") +fi +python "{params.merge_pdfs_script}" "${{report_paths[@]}}" "{output.cnv_report_pdf}" + """ diff --git a/BALSAMIC/snakemake_rules/umi/concatenation_umi.rule b/BALSAMIC/snakemake_rules/umi/concatenation_umi.rule new file mode 100644 index 000000000..23de2c483 --- /dev/null +++ b/BALSAMIC/snakemake_rules/umi/concatenation_umi.rule @@ -0,0 +1,29 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + + +rule concatenate_umi_reads: + input: + fastqs_fwd=lambda wildcards: config_model.get_all_fastqs_for_sample( + sample_name=wildcards.sample, fastq_types=[FastqName.FWD] + ), + fastqs_rev=lambda wildcards: config_model.get_all_fastqs_for_sample( + sample_name=wildcards.sample, fastq_types=[FastqName.REV] + ), + output: + fastq_fwd=fastq_dir + "concat.{sample}_1.pre_umi.fastq.gz", + fastq_rev=fastq_dir + "concat.{sample}_2.pre_umi.fastq.gz", + benchmark: + Path(benchmark_dir, "concatenate_umi_reads_{sample}.tsv").as_posix() + params: + fastq_dir=config["analysis"]["fastq_path"], + sample="{sample}", + threads: + get_threads(cluster_config, "concatenate") + message: + "Concatenating FASTQ files per read direction for sample {params.sample}, before UMI extraction" + shell: + """ + cat {input.fastqs_fwd} > {output.fastq_fwd} + cat {input.fastqs_rev} > {output.fastq_rev} + """ diff --git a/BALSAMIC/snakemake_rules/umi/mergetype_normal_umi.rule b/BALSAMIC/snakemake_rules/umi/mergetype_normal_umi.rule index f366602d3..4e2dd64d1 100644 --- a/BALSAMIC/snakemake_rules/umi/mergetype_normal_umi.rule +++ b/BALSAMIC/snakemake_rules/umi/mergetype_normal_umi.rule @@ -6,12 +6,12 @@ rule mergeBam_normal_umiconsensus: input: fasta = config["reference"]["reference_genome"], - bam = expand(umi_dir + "{mysample}_consensusfiltered_umi.bam", mysample=normal_sample) + bam = expand(umi_dir + "{sample}_consensusfiltered_umi.bam", sample = normal_sample) output: bam = umi_dir + "normal_umi_consensusfiltered.merged.bam", cram = umi_dir + "normal_umi_consensusfiltered.merged.cram" benchmark: - Path(benchmark_dir, 'mergeBam_normal_umiconsensus_' + "{mysample}.tsv".format(mysample=normal_sample)).as_posix() + Path(benchmark_dir, 'mergeBam_normal_umiconsensus_' + "{sample}.tsv".format(sample=normal_sample)).as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("picard") + ".sif").as_posix() params: diff --git a/BALSAMIC/snakemake_rules/umi/mergetype_tumor_umi.rule b/BALSAMIC/snakemake_rules/umi/mergetype_tumor_umi.rule index 20d158e47..996a0e207 100644 --- a/BALSAMIC/snakemake_rules/umi/mergetype_tumor_umi.rule +++ 
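The merge_cnv_pdf_reports rule above shells out to a bundled merge_pdfs.py whose source is not part of this hunk. For orientation only, a script with that CLI shape (all input PDFs followed by one output path) could be sketched as below; the use of the pypdf package is an assumption, not something the patch states:

    import sys
    from pypdf import PdfWriter

    def merge_pdfs(input_paths, output_path):
        """Concatenate the input PDFs, in order, into a single report."""
        writer = PdfWriter()
        for path in input_paths:
            writer.append(path)  # pulls in every page of this document
        with open(output_path, "wb") as handle:
            writer.write(handle)

    if __name__ == "__main__":
        # mirrors the rule's call: merge_pdfs.py IN1 [IN2 ...] OUT
        merge_pdfs(sys.argv[1:-1], sys.argv[-1])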
b/BALSAMIC/snakemake_rules/umi/mergetype_tumor_umi.rule @@ -6,12 +6,12 @@ rule mergeBam_tumor_umiconsensus: input: fasta = config["reference"]["reference_genome"], - bam = expand(umi_dir + "{mysample}_consensusfiltered_umi.bam", mysample = tumor_sample) + bam = expand(umi_dir + "{sample}_consensusfiltered_umi.bam", sample = tumor_sample) output: bam = umi_dir + "tumor_umi_consensusfiltered.merged.bam", cram = umi_dir + "tumor_umi_consensusfiltered.merged.cram" benchmark: - Path(benchmark_dir,'mergeBam_tumor_umiconsensus_' + "{mysample}.tsv".format(mysample=tumor_sample)).as_posix() + Path(benchmark_dir,'mergeBam_tumor_umiconsensus_' + "{sample}.tsv".format(sample=tumor_sample)).as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("picard") + ".sif").as_posix() params: diff --git a/BALSAMIC/snakemake_rules/umi/qc_umi.rule b/BALSAMIC/snakemake_rules/umi/qc_umi.rule index a4b09f124..c1a43e110 100644 --- a/BALSAMIC/snakemake_rules/umi/qc_umi.rule +++ b/BALSAMIC/snakemake_rules/umi/qc_umi.rule @@ -7,11 +7,11 @@ rule picard_umiaware: bam = umi_dir + "{sample}_consensusfiltered_umi.bam" output: bam = umi_qc_dir + "{sample}.picard.umiaware.bam", - duplicates = umi_qc_dir + "{sample}.umi.duplicatemetrics", - umimetrics = umi_qc_dir + "{sample}.umi.metrics" + duplicates = umi_qc_dir + "{sample}.umi.duplicatemetrics.txt", + umimetrics = umi_qc_dir + "{sample}.umi.metrics.txt" benchmark: Path(benchmark_dir, "picard_umiaware_{sample}.tsv").as_posix() - singularity: + singularity: Path(singularity_image, config["bioinfo_tools"].get("picard") + ".sif").as_posix() params: sample_id = "{sample}" @@ -36,7 +36,7 @@ rule picard_collecthsmetrics_umi: fa = config["reference"]["reference_genome"], fadict = (config["reference"]["reference_genome"]).replace(".fasta",".dict"), output: - mrkdup = umi_qc_dir + "{sample}.umi.collect_hsmetric" + mrkdup = umi_qc_dir + "{sample}.umi.collect_hsmetric.txt" benchmark: Path(benchmark_dir, "picard_collecthsmetrics_umi_{sample}.tsv").as_posix() singularity: @@ -44,7 +44,7 @@ rule picard_collecthsmetrics_umi: params: baitsetname = os.path.basename(config["panel"]["capture_kit"]), sample_id = "{sample}" - threads: + threads: get_threads(cluster_config, "CollectHsMetrics") message: "Collecting HSmetrics using Picardtools for {params.sample_id}" diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule b/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule index 460833320..0041f67c7 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule @@ -92,17 +92,17 @@ export SENTIEON_LICENSE={params.sentieon_lic}; rule sentieon_consensusfilter_umi: input: - umi_dir + "{case_name}_consensuscalled_umi.bam" + umi_dir + "{sample}_consensuscalled_umi.bam" output: - temp(umi_dir + "{case_name}_consensusfiltered_umi.bam") + temp(umi_dir + "{sample}_consensusfiltered_umi.bam") benchmark: - Path(benchmark_dir, "sentieon_consensusfilter_umi_{case_name}.tsv").as_posix() + Path(benchmark_dir, "sentieon_consensusfilter_umi_{sample}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("samtools") + ".sif").as_posix() params: consensusfilter_script = get_script_path("FilterDuplexUMIconsensus.awk"), minreads = params.umiconsensuscall.filter_minreads, - sample_id = '{case_name}', + sample_id = '{sample}', threads: get_threads(cluster_config, "sentieon_consensusfilter_umi") message: diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_umiextract.rule 
b/BALSAMIC/snakemake_rules/umi/sentieon_umiextract.rule index 71e5e2f73..2fec0f642 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_umiextract.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_umiextract.rule @@ -1,13 +1,13 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -# Extract umi tags using the defined read structure. +# Extract umi tags using the defined read structure. rule sentieon_umiextract: input: - read1 = Path(fastq_dir + "{sample}_1.umi_optimized.fastq.gz").as_posix(), - read2 = Path(fastq_dir + "{sample}_2.umi_optimized.fastq.gz").as_posix() + fastq_r1 = fastq_dir + "concat.{sample}_1.pre_umi.fastq.gz", + fastq_r2 = fastq_dir + "concat.{sample}_2.pre_umi.fastq.gz" output: ds_umi = temp(umi_dir + "{sample}_umiextract.fastq.gz") benchmark: @@ -17,12 +17,12 @@ rule sentieon_umiextract: sentieon_install_dir = config["SENTIEON_INSTALL_DIR"], sentieon_exec = config["SENTIEON_EXEC"], sentieon_lic = config["SENTIEON_LICENSE"], - ds_params= params.umiextract.read_structure, - sample_id = "{sample}" + ds_params = params.umiextract.read_structure, + sample = "{sample}" threads: get_threads(cluster_config, "sentieon_umiextract") - message: - "Extracing UMI tags using sentieon for {params.sample_id}" + message: + "Extracting UMI tags using sentieon for {params.sample}" shell: """ export LD_PRELOAD={params.sentieon_install_dir}/lib/libjemalloc.so.1 @@ -34,7 +34,7 @@ export SENTIEON_TMPDIR={params.tmpdir}; export SENTIEON_LICENSE={params.sentieon_lic}; {params.sentieon_exec} umi extract \ -{params.ds_params} {input.read1} {input.read2} \ +{params.ds_params} {input.fastq_r1} {input.fastq_r2} \ -o {output.ds_umi}; """ # Align the UMI-extracted reads @@ -56,9 +56,9 @@ rule sentieon_bwa_umiextract: sample_id = '{sample}', sheader = params.umicommon.align_header, ip_bases = params.umicommon.align_intbases - threads: + threads: get_threads(cluster_config, "sentieon_bwa_umiextract") - message: + message: "Aligning UMI extracted reads and sorting using sentieon bwa-mem for {params.sample_id}" shell: """ diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule index 195e73388..54767f73d 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule @@ -12,7 +12,7 @@ rule sentieon_tnscope_umi: bed = config["panel"]["capture_kit"], dbsnp = config["reference"]["dbsnp"] output: - vcf_tnscope_umi = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", + vcf_tnscope_umi = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.sample_name_map" benchmark: Path(benchmark_dir, "sentieon_tnscope_umi_" + config["analysis"]["case_id"] + ".tsv").as_posix() diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule index adac4ff2c..dda2c7fcb 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule @@ -12,7 +12,7 @@ rule sentieon_tnscope_umi_tn: bed = config["panel"]["capture_kit"], dbsnp = config["reference"]["dbsnp"] output: - vcf_tnscope_umi = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", + vcf_tnscope_umi = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.sample_name_map" benchmark: Path(benchmark_dir, "sentieon_tnscope_umi_" + config["analysis"]["case_id"] + ".tsv").as_posix() diff --git a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_paired.rule b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_paired.rule deleted file mode 100644 index 91cf980dc..000000000 --- a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_paired.rule +++ /dev/null @@ -1,219 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 -# - -def get_pon_cnn(config): - if "pon_cnn" in config["panel"]: - return os.path.abspath(config["panel"]["pon_cnn"]) - else: - return None - -rule cnvkit_paired: - input: - access_bed = config["reference"]["access_regions"], - fasta = config["reference"]["reference_genome"], - bamN = bam_dir + "normal.merged" + ".bam", - bamT = bam_dir + "tumor.merged" + ".bam", - baits_bed = config["panel"]["capture_kit"], - refflat = config["reference"]["refflat"], - snv_vcf_tumor = vcf_dir + "SNV.germline.tumor.dnascope.vcf.gz", - snv_vcf_normal = vcf_dir + "SNV.germline.normal.dnascope.vcf.gz", - output: - cnr = cnv_dir + "tumor.merged" + ".cnr", - cns = cnv_dir + "tumor.merged" + ".cns", - diagram = cnv_dir + "tumor.merged" + "-diagram.pdf", - gene_breaks = cnv_dir + config["analysis"]["case_id"] + ".gene_breaks", - gene_metrics = cnv_dir + config["analysis"]["case_id"] + ".gene_metrics", - namemap = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.sample_name_map"), - scatter = cnv_dir + "tumor.merged" + "-scatter.pdf", - vcf = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf.gz"), - benchmark: - Path(benchmark_dir + "cnvkit_paired_" + config["analysis"]["case_id"] + ".tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("cnvkit") + ".sif").as_posix() - params: - cnv_dir = cnv_dir, - purecn_dir = cnv_dir + "PureCN", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv"}, - name = config["analysis"]["case_id"], - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - tumor_name = "tumor.merged", - min_mapq = params.common.min_mapq, - case_name = config["analysis"]["case_id"], - gender = config["analysis"]["gender"], - tumor_sample_id= "TUMOR", - normal_sample_id= "NORMAL", - genome = config["reference"]["genome_version"], - pon = " " if get_pon_cnn(config) is None else get_pon_cnn(config), - message: - "Calling CNVs using CNVkit and calculating tumor purity/ploidy using PureCN for {params.case_name}" - shell: - """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; -export PURECN='/opt/conda/lib/R/library/PureCN/extdata/PureCN.R' - -# merge the tumor and normal VCF -bcftools merge \ --O z -o {params.tmpdir}/SNV.merged.vcf.gz \ --O z {input.snv_vcf_tumor} {input.snv_vcf_normal}; - -tabix -p vcf -f {params.tmpdir}/SNV.merged.vcf.gz; - -# create target and anti-target bed files -cnvkit.py target {input.baits_bed} \ ---annotate {input.refflat} \ ---split \ ---output {params.cnv_dir}/targets.bed; - -cnvkit.py antitarget {input.baits_bed} \ ---access {input.access_bed} \ ---output {params.cnv_dir}/antitarget_bed; - -# calculate coverage in the given regions from BAM read depths -cnvkit.py coverage {input.bamT} \ -{params.cnv_dir}/targets.bed \ ---min-mapq {params.min_mapq} \ ---processes {threads} \ ---output {params.cnv_dir}/tumor.targetcoverage.cnn; - -cnvkit.py coverage {input.bamT} \ 
-{params.cnv_dir}/antitarget_bed \ ---min-mapq {params.min_mapq} \ ---processes {threads} \ ---output {params.cnv_dir}/tumor.antitargetcoverage.cnn; - -cnvkit.py coverage {input.bamN} \ -{params.cnv_dir}/targets.bed \ ---min-mapq {params.min_mapq} \ ---processes {threads} \ ---output {params.cnv_dir}/normal.targetcoverage.cnn; - -cnvkit.py coverage {input.bamN} \ -{params.cnv_dir}/antitarget_bed \ ---min-mapq {params.min_mapq} \ ---processes {threads} \ ---output {params.cnv_dir}/normal.antitargetcoverage.cnn; - -# Compile a coverage reference from the given list of files -cnvkit.py reference \ -{params.cnv_dir}/normal.targetcoverage.cnn \ -{params.cnv_dir}/normal.antitargetcoverage.cnn \ ---fasta {input.fasta} \ ---output {params.cnv_dir}/normalReference.cnn; - -# Combine the uncorrected target and antitarget coverage tables (.cnn) and -# correct for biases in regional coverage and GC content, according to the given normal or PON reference - -if [[ ! -f "{params.pon}" ]]; then -cnvkit.py reference \ -{params.cnv_dir}/normal.targetcoverage.cnn \ -{params.cnv_dir}/normal.antitargetcoverage.cnn \ ---fasta {input.fasta} \ ---output {params.cnv_dir}/normalReference.cnn; - -cnvkit.py fix {params.cnv_dir}/tumor.targetcoverage.cnn \ -{params.cnv_dir}/tumor.antitargetcoverage.cnn \ -{params.cnv_dir}/normalReference.cnn \ ---output {output.cnr}; -else -echo "PON reference exists- Using it for coverage correction" -cnvkit.py fix {params.cnv_dir}/tumor.targetcoverage.cnn {params.cnv_dir}/tumor.antitargetcoverage.cnn {params.pon} --output {output.cnr}; -fi - - -# Infer copy number segments from the given coverage table -# segmentattion methods (-m): cbs: reccommended for mid-size target panels and exomes -# drop bins with 0 read depth and bins whose log2 values deviating from avg -cnvkit.py segment {output.cnr} \ ---output {params.cnv_dir}/tumor.initial.cns \ ---method cbs \ ---drop-low-coverage \ ---processes {threads}; - -# Convert copy number segments (.cns) to standard SEG format to be used in PureCN -cnvkit.py export seg {params.cnv_dir}/tumor.initial.cns \ ---output {params.cnv_dir}/tumor.seg; - -# Run PureCN to estimate tumor-purity and ploidy -mkdir -p {params.purecn_dir} - -# Set default values to run by cnvkit call -purity="0.02"; ploidy=2; - -# if purecn runs succesfully: update purity and ploidy values -purecn_status="true" - -{{ -Rscript $PURECN \ ---out {params.purecn_dir} \ ---sampleid {params.tumor_sample_id} \ ---tumor {output.cnr} \ ---segfile {params.cnv_dir}/tumor.seg \ ---vcf {params.tmpdir}/SNV.merged.vcf.gz \ ---genome {params.genome} \ ---funsegmentation Hclust \ ---force --postoptimize \ ---seed 124; -}} || purecn_status="false" - -if $purecn_status; then -purity=$(awk -F\\, 'NR>1 {{print $2}}' {params.purecn_dir}/{params.tumor_sample_id}.csv) -ploidy=$(awk -F\\, 'NR>1 {{printf int($3)}}' {params.purecn_dir}/{params.tumor_sample_id}.csv); -fi - - -# Call copy number variants from segmented log2 ratios -cnvkit.py call {params.cnv_dir}/tumor.initial.cns \ ---vcf {params.tmpdir}/SNV.merged.vcf.gz \ ---sample-sex {params.gender} \ ---method clonal \ ---purity $purity \ ---ploidy $ploidy \ ---sample-id {params.tumor_sample_id} \ ---normal-id {params.normal_sample_id} \ ---output {output.cns}; - -# Plot bin-level log2 coverages and segmentation calls -cnvkit.py scatter {output.cnr} \ ---segment {output.cns} \ ---output {output.scatter}; - -# Draw copy number (.cnr or .cns) on chromosomes as an ideogram -# Draw copy number (.cnr or .cns) on chromosomes as an ideogram -cnvkit.py 
diagram {output.cnr} \ ---segment {output.cns} \ ---output {output.diagram}; - -# Identify targeted genes with copy number gain or loss -cnvkit.py genemetrics {output.cnr} \ ---segment {output.cns} \ ---drop-low-coverage \ ---sample-sex {params.gender} \ ---output {output.gene_metrics}; - -# List the genenames that contain a possibe copy number breakpoint. -cnvkit.py breaks {output.cnr} {output.cns} \ -| cut -f1 | sort -u > {output.gene_breaks}; - -# Convert segments to a vcf file -cnvkit.py export vcf {output.cns} \ ---cnr {output.cnr} \ --o {params.cnv_dir}/{params.tumor_name}.vcf \ ---sample-id {params.tumor_sample_id} \ ---sample-sex {params.gender}; - -bgzip -f {params.cnv_dir}/{params.tumor_name}.vcf; - -tabix -p vcf -f {params.cnv_dir}/{params.tumor_name}.vcf.gz; - -bcftools sort \ --o {output.vcf} \ ---temp-dir {params.tmpdir} \ --O z {params.cnv_dir}/{params.tumor_name}.vcf.gz; - -tabix -p vcf -f {output.vcf}; - -echo -e \"TUMOR\\tTUMOR\" > {output.namemap}; -rm -rf {params.tmpdir}; - """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule deleted file mode 100644 index 3eafdc926..000000000 --- a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule +++ /dev/null @@ -1,181 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - - -def get_pon_cnn(config): - if "pon_cnn" in config["panel"]: - return os.path.abspath(config["panel"]["pon_cnn"]) - else: - return None - - -rule cnvkit_single: - input: - access_bed = config["reference"]["access_regions"], - bamT = bam_dir + "tumor.merged.bam", - baits_bed = config["panel"]["capture_kit"], - fasta = config["reference"]["reference_genome"], - refflat = config["reference"]["refflat"], - snv_vcf = vcf_dir + "SNV.germline.tumor.dnascope.vcf.gz", - output: - cns = cnv_dir + "tumor.merged" + ".cns", - cnr = cnv_dir + "tumor.merged" + ".cnr", - diagram = cnv_dir + "tumor.merged" + "-diagram.pdf", - gene_breaks = cnv_dir + config["analysis"]["case_id"] + ".gene_breaks", - gene_metrics = cnv_dir + config["analysis"]["case_id"] + ".gene_metrics", - scatter = cnv_dir + "tumor.merged" + "-scatter.pdf", - namemap = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.sample_name_map"), - vcf = vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".cnvkit.vcf.gz" - benchmark: - Path(benchmark_dir, 'cnvkit_single_' + config["analysis"]["case_id"] + ".cnvkit_single.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("cnvkit") + ".sif").as_posix() - threads: - get_threads(cluster_config, "cnvkit_single") - params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv"}, - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - tumor_name = "tumor.merged", - purecn_dir = cnv_dir + "PureCN", - cnv_dir = cnv_dir, - min_mapq= params.common.min_mapq, - case_name = config["analysis"]["case_id"], - gender = config["analysis"]["gender"], - sample_id = "TUMOR", - genome_version = config["reference"]["genome_version"], - pon = " " if get_pon_cnn(config) is None else get_pon_cnn(config) - message: - ("Calling CNVs using CNVkit and calculating tumor purity/ploidy using PureCN for {params.case_name}") - shell: - """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; -export PURECN='/opt/conda/lib/R/library/PureCN/extdata/PureCN.R' - -# create target and anti-target bed files -cnvkit.py target {input.baits_bed} \ ---annotate {input.refflat} \ ---split \ ---output {params.cnv_dir}/targets.bed; - -cnvkit.py antitarget {input.baits_bed} \ ---access {input.access_bed} \ ---output {params.cnv_dir}/antitarget_bed; - -# calculate coverage in the given regions from BAM read depths -cnvkit.py coverage {input.bamT} \ -{params.cnv_dir}/targets.bed \ ---min-mapq {params.min_mapq} \ ---processes {threads} \ ---output {params.cnv_dir}/tumor.targetcoverage.cnn; - -cnvkit.py coverage {input.bamT} \ -{params.cnv_dir}/antitarget_bed \ ---min-mapq {params.min_mapq} \ ---processes {threads} \ ---output {params.cnv_dir}/tumor.antitargetcoverage.cnn; - - -# Combine the uncorrected target and antitarget coverage tables (.cnn) and -# correct for biases in regional coverage and GC content, according to the given reference -if [[ ! 
-f "{params.pon}" ]]; then -cnvkit.py reference --output {params.cnv_dir}/FlatReference.cnn --fasta {input.fasta} --targets {params.cnv_dir}/targets.bed --antitargets {params.cnv_dir}/antitarget_bed; -cnvkit.py fix {params.cnv_dir}/tumor.targetcoverage.cnn {params.cnv_dir}/tumor.antitargetcoverage.cnn {params.cnv_dir}/FlatReference.cnn --output {output.cnr}; -else -echo "PON reference exists- Using it for coverage correction" -cnvkit.py fix {params.cnv_dir}/tumor.targetcoverage.cnn {params.cnv_dir}/tumor.antitargetcoverage.cnn {params.pon} --output {output.cnr}; -fi - - -# Infer copy number segments from the given coverage table -# segmentattion methods (-m): cbs: reccommended for mid-size target panels and exomes -# drop bins with 0 read depth and bins whose log2 values deviating from avg -cnvkit.py segment {output.cnr} \ ---output {params.cnv_dir}/tumor.initial.cns \ ---method cbs \ ---drop-low-coverage \ ---processes {threads}; - -# Convert copy number segments (.cns) to standard SEG format to be used in PureCN -cnvkit.py export seg {params.cnv_dir}/tumor.initial.cns \ ---output {params.cnv_dir}/tumor.seg; - -# Run PureCN to estimate tumor-purity and ploidy -mkdir -p {params.purecn_dir} - -# Set default values to run by cnvkit call -purity="0.02"; ploidy=2; - -# if purecn runs succesffully, update purity and ploidy values -purecn_status="true" - -{{ -Rscript $PURECN \ ---out {params.purecn_dir} \ ---sampleid {params.sample_id} \ ---tumor {output.cnr} \ ---segfile {params.cnv_dir}/tumor.seg \ ---vcf {input.snv_vcf} \ ---genome {params.genome_version} \ ---funsegmentation Hclust \ ---force --postoptimize \ ---seed 124; -}} || purecn_status="false" - -if $purecn_status; then -purity=$(awk -F\\, 'NR>1 {{print $2}}' {params.purecn_dir}/{params.sample_id}.csv) -ploidy=$(awk -F\\, 'NR>1 {{printf int($3)}}' {params.purecn_dir}/{params.sample_id}.csv); -fi - -# Call copy number variants from segmented log2 ratios -cnvkit.py call {params.cnv_dir}/tumor.initial.cns \ ---vcf {input.snv_vcf} \ ---sample-sex {params.gender} \ ---method clonal \ ---purity $purity \ ---ploidy $ploidy \ ---output {output.cns}; - -# Plot bin-level log2 coverages and segmentation calls -cnvkit.py scatter {output.cnr} \ ---segment {output.cns} \ ---output {output.scatter}; - -# Draw copy number (.cnr or .cns) on chromosomes as an ideogram -cnvkit.py diagram {output.cnr} \ ---segment {output.cns} \ ---output {output.diagram}; - -# Identify targeted genes with copy number gain or loss -cnvkit.py genemetrics {output.cnr} \ ---segment {output.cns} \ ---drop-low-coverage \ ---sample-sex {params.gender} \ ---output {output.gene_metrics}; - -# List the genenames that contain a possibe copy number breakpoint. 
-cnvkit.py breaks {output.cnr} {output.cns} \ -| cut -f1 | sort -u > {output.gene_breaks}; - -# Convert segments to a vcf file -cnvkit.py export vcf {output.cns} \ ---cnr {output.cnr} \ ---output {params.cnv_dir}/{params.tumor_name}.vcf \ ---sample-sex {params.gender} \ ---sample-id {params.sample_id}; - -bgzip -f {params.cnv_dir}/{params.tumor_name}.vcf; - -tabix -p vcf -f {params.cnv_dir}/{params.tumor_name}.vcf.gz; - -bcftools sort \ --o {output.vcf} \ ---temp-dir {params.tmpdir} \ --O z {params.cnv_dir}/{params.tumor_name}.vcf.gz; - -tabix -p vcf -f {output.vcf}; - -echo -e \"TUMOR\\tTUMOR\" > {output.namemap}; - -rm -rf {params.tmpdir}; - """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/gatk_read_counts.rule b/BALSAMIC/snakemake_rules/variant_calling/gatk_read_counts.rule new file mode 100644 index 000000000..9fa5ccd47 --- /dev/null +++ b/BALSAMIC/snakemake_rules/variant_calling/gatk_read_counts.rule @@ -0,0 +1,33 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +rule gatk_collectreadcounts: + input: + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), + genome_interval = config["reference"]["genome_interval"] + output: + readcounts_hdf5 = cnv_dir + "{sample}.collectreadcounts.hdf5" + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + sample = "{sample}" + benchmark: + Path(benchmark_dir, "gatk_collectreadcounts_{sample}.tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("gatk") + ".sif").as_posix() + threads: + get_threads(cluster_config, "gatk_collectreadcounts") + message: + "Running GATK CollectReadCounts on {params.sample} for GENS." + shell: + """ +export TMPDIR={params.tmpdir}; + +gatk --java-options "-XX:-UseLargePages -Xmx20g" CollectReadCounts -I {input.bam} \ + -L {input.genome_interval} \ + --tmp-dir {params.tmpdir} \ + --interval-merging-rule OVERLAPPING_ONLY \ + -O {output.readcounts_hdf5} + +rm -rf {params.tmpdir} + """ + diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule new file mode 100644 index 000000000..e2b32534f --- /dev/null +++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule @@ -0,0 +1,119 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +rule sentieon_DNAscope_gnomad: + input: + ref = config["reference"]["reference_genome"], + gnomad_af5= config["reference"]["gnomad_min_af5"], + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample) + output: + gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + pcr_model = params.common.pcr_model, + sentieon_exec = config["SENTIEON_EXEC"], + sentieon_lic = config["SENTIEON_LICENSE"], + sentieon_ml_dnascope = config["SENTIEON_DNASCOPE"], + sample = "{sample}" + benchmark: + Path(benchmark_dir, "sentieon_DNAscope_gnomad_{sample}.tsv").as_posix() + threads: + get_threads(cluster_config, "sentieon_DNAscope_gnomad") + message: + "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}" + shell: + """ +export TMPDIR={params.tmpdir}; +export SENTIEON_TMPDIR={params.tmpdir}; +export SENTIEON_LICENSE={params.sentieon_lic}; +export SENTIEON_DNASCOPE={params.sentieon_ml_dnascope}; + +{params.sentieon_exec} driver \ +-t {threads} \ +-r {input.ref} \ +-i {input.bam} \ +--algo DNAscope \ +--pcr_indel_mode 
{params.pcr_model} \ +--given {input.gnomad_af5} {output.gnomad_af5_vcf}; + +rm -rf {params.tmpdir}; + """ + +rule gatk_denoisereadcounts: + input: + gens_pon = config["reference"]["gens_coverage_pon"], + readcounts_hdf5 = cnv_dir + "{sample}.collectreadcounts.hdf5" + output: + denoised_cr = cnv_dir + "{sample}.denoisedCR.tsv", + standardized_cr = cnv_dir + "{sample}.standardizedCR.tsv" + params: + tmpdir=tempfile.mkdtemp(prefix=tmp_dir), + sample="{sample}" + benchmark: + Path(benchmark_dir, "gatk_denoisereadcounts_{sample}.tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("gatk") + ".sif").as_posix() + threads: + get_threads(cluster_config,"gatk_denoisereadcounts") + message: + "Running GATK DenoiseReadCounts on {params.sample} for GENS." + shell: + """ +export TMPDIR={params.tmpdir}; + +gatk --java-options "-Xmx60g" DenoiseReadCounts \ +-I {input.readcounts_hdf5} \ +--count-panel-of-normals {input.gens_pon} \ +--tmp-dir {params.tmpdir} \ +--standardized-copy-ratios {output.standardized_cr} \ +--denoised-copy-ratios {output.denoised_cr} + +rm -rf {params.tmpdir} + """ + +rule gens_preprocessing: + input: + denoised_cr = cnv_dir + "{sample}.denoisedCR.tsv", + gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", + output: + gens_baf_bed = cnv_dir + "{sample}.baf.bed", + gens_cov_bed = cnv_dir + "{sample}.cov.bed" + params: + gens_preprocessing = get_script_path("preprocess_gens.py"), + sequencing_type = sequencing_type, + sample="{sample}" + benchmark: + Path(benchmark_dir, "gens_preprocessing_{sample}.tsv").as_posix() + threads: + get_threads(cluster_config, "gens_preprocessing") + message: + "Formatting output for GENS for sample: {params.sample}." + shell: + """ +python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} +python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_cov_bed} create-coverage-regions --normalised-coverage-path {input.denoised_cr} + """ + +rule finalize_gens_outputfiles: + input: + gens_input = cnv_dir + "{sample}.{gens_input}.bed" + output: + gens_bed = cnv_dir + "{sample}.{gens_input}.bed.gz", + params: + sample_id="{sample}", + gens_input="{gens_input}", + housekeeper_id= {"id": "{sample}", "tags": "cnv"} + benchmark: + Path(benchmark_dir, "finalize_gens_outputfiles_{sample}_{gens_input}.tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bgzip") + ".sif").as_posix() + threads: + get_threads(cluster_config, "finalize_gens_outputfiles") + message: + "Bgzip and index GENS output: {params.gens_input} for sample: {params.sample_id}." 
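The gens_preprocessing rule above delegates to a bundled preprocess_gens.py that is not shown in this hunk. Its calculate-bafs step, judging from the inputs, reduces to computing a B-allele frequency per genotyped gnomAD position; a sketch of that core computation, with the biallelic ref,alt layout of FORMAT/AD as an assumption:

    def baf_from_ad(ad_field: str):
        """B-allele frequency from a VCF FORMAT/AD value such as '31,29'
        (ref,alt read depths); None when the site has no coverage."""
        ref_depth, alt_depth = (int(v) for v in ad_field.split(",")[:2])
        total = ref_depth + alt_depth
        return alt_depth / total if total else None

    # e.g. baf_from_ad("31,29") -> 0.483...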
+ shell: + """ +bgzip {input.gens_input} +tabix {input.gens_input}.gz + """ + diff --git a/BALSAMIC/snakemake_rules/variant_calling/germline.rule b/BALSAMIC/snakemake_rules/variant_calling/germline.rule index 97fe45bc0..d2af0ecc7 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/germline.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/germline.rule @@ -3,7 +3,7 @@ rule sentieon_DNAscope: input: - bam = bam_dir + "{sample_type}.merged.bam", + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_type = wildcards.sample_type), ref = config["reference"]["reference_genome"], dbsnp = config["reference"]["dbsnp"], interval = config["panel"]["capture_kit"] diff --git a/BALSAMIC/snakemake_rules/variant_calling/germline_sv.rule b/BALSAMIC/snakemake_rules/variant_calling/germline_sv.rule index 4eed3880e..195a67ee1 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/germline_sv.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/germline_sv.rule @@ -6,7 +6,7 @@ rule manta_germline: input: fa = config["reference"]["reference_genome"], - bam = bam_dir + "{sample_type}.merged.bam", + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_type = wildcards.sample_type), output: final = vcf_dir + "SV.germline.{sample_type}.manta_germline.vcf.gz", benchmark: diff --git a/BALSAMIC/snakemake_rules/variant_calling/mergetype_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/mergetype_normal.rule deleted file mode 100644 index bb5cd66a1..000000000 --- a/BALSAMIC/snakemake_rules/variant_calling/mergetype_normal.rule +++ /dev/null @@ -1,59 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - -if config["analysis"]["sequencing_type"] == 'wgs': - normal_bam = "{normal}.dedup.realign.bam".format(normal = normal_sample) -else: - normal_bam = "{normal}.sorted.{picardstr}.bam".format(normal = normal_sample, picardstr = picarddup) - - - -rule mergeBam_normal: - input: - fasta = config["reference"]["reference_genome"], - bam = bam_dir + normal_bam - output: - bam = bam_dir + "normal.merged.bam", - cram = bam_dir + "normal.merged.cram", - benchmark: - Path(benchmark_dir,'mergeBam_normal_' + "{mysample}.tsv".format(mysample=normal_sample)).as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("picard") + ".sif").as_posix() - params: - housekeeper_id = {"id": normal_sample, "tags": "normal"}, - picard_fixmateinfo = params.common.picard_fixmate, - picard_rg = params.common.picard_RG_normal, - sample = normal_sample, - tmpdir= tempfile.mkdtemp(prefix=tmp_dir), - threads: - get_threads(cluster_config, "mergeBam_normal") - message: - "Replacing bam header using Picardtools for {params.sample}" - shell: - """ -picard -Xmx150g FixMateInformation {params.picard_fixmateinfo} \ --TMP_DIR {params.tmpdir} \ --INPUT {input.bam} \ --OUTPUT {params.tmpdir}/normal.fixed.bam; - -samtools view --threads {threads} -O BAM -f 4 {params.tmpdir}/normal.fixed.bam \ --o {params.tmpdir}/normal.fixed.um.bam; - -samtools index {params.tmpdir}/normal.fixed.um.bam; - -samtools view --threads {threads} -O BAM -F 4 {params.tmpdir}/normal.fixed.bam \ --o {params.tmpdir}/normal.fixed.m.bam; - -samtools index {params.tmpdir}/normal.fixed.m.bam; - -picard -Xmx150g AddOrReplaceReadGroups {params.picard_rg} \ --TMP_DIR {params.tmpdir} \ --INPUT {params.tmpdir}/normal.fixed.m.bam \ --OUTPUT {output.bam}; - -samtools index {output.bam}; - -samtools view -h -T {input.fasta} --threads {threads} -C -o {output.cram} {output.bam}; - -samtools index 
{output.cram}; - """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/mergetype_tumor.rule b/BALSAMIC/snakemake_rules/variant_calling/mergetype_tumor.rule deleted file mode 100644 index a7112c602..000000000 --- a/BALSAMIC/snakemake_rules/variant_calling/mergetype_tumor.rule +++ /dev/null @@ -1,60 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - -if config["analysis"]["sequencing_type"] == 'wgs': - tumor_bam = "{tumor}.dedup.realign.bam".format(tumor = tumor_sample) -else: - tumor_bam = "{tumor}.sorted.{picardstr}.bam".format(tumor = tumor_sample, picardstr = picarddup) - - - -rule mergeBam_tumor: - input: - fasta = config["reference"]["reference_genome"], - bam = bam_dir + tumor_bam - output: - bam = bam_dir + "tumor.merged.bam", - cram = bam_dir + "tumor.merged.cram", - benchmark: - Path(benchmark_dir,'mergeBam_tumor_' + "{mysample}.tsv".format(mysample=tumor_sample)).as_posix() - singularity: - Path(singularity_image, config[ "bioinfo_tools" ].get("picard") + ".sif").as_posix() - params: - housekeeper_id = {"id": tumor_sample, "tags": "tumor"}, - picard_fixmateinfo = params.common.picard_fixmate, - picard_rg = params.common.picard_RG_tumor, - sample = tumor_sample, - tmpdir= tempfile.mkdtemp(prefix=tmp_dir), - threads: - get_threads(cluster_config, "mergeBam_tumor") - message: - "Replacing bam header using Picardtools for {params.sample}" - shell: - """ -picard -Xmx150g FixMateInformation {params.picard_fixmateinfo} \ --TMP_DIR {params.tmpdir} \ --INPUT {input.bam} \ --OUTPUT {params.tmpdir}/tumor.fixed.bam; - -samtools view --threads {threads} -O BAM -f 4 {params.tmpdir}/tumor.fixed.bam \ --o {params.tmpdir}/tumor.fixed.um.bam ; - -samtools index {params.tmpdir}/tumor.fixed.um.bam; - -samtools view --threads {threads} -O BAM -F 4 {params.tmpdir}/tumor.fixed.bam \ --o {params.tmpdir}/tumor.fixed.m.bam; - -samtools index {params.tmpdir}/tumor.fixed.m.bam; - -picard -Xmx150g AddOrReplaceReadGroups {params.picard_rg} \ --TMP_DIR {params.tmpdir} \ --INPUT {params.tmpdir}/tumor.fixed.m.bam \ --TMP_DIR {params.tmpdir} \ --OUTPUT {output.bam}; - -samtools index {output.bam}; - -samtools view -h -T {input.fasta} --threads {threads} -C -o {output.cram} {output.bam}; - -samtools index {output.cram}; - """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_germline.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_germline.rule index 4d39675ae..f7967227e 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_germline.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_germline.rule @@ -2,13 +2,12 @@ # coding: utf-8 - rule sentieon_DNAscope: input: ref = config["reference"]["reference_genome"], dbsnp = config["reference"]["dbsnp"], - bam = bam_dir + "{sample_type}.merged.bam", - recal_table = bam_dir + "{sample_type}.merged.recal_data.table" + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_type = wildcards.sample_type), + recal_table = bam_dir + "{sample_type}.recal_data.table" output: vcf = vcf_dir + "SNV.germline.{sample_type}.dnascope.vcf.gz", params: @@ -26,7 +25,6 @@ rule sentieon_DNAscope: "Calling germline variants using Sentieon DNAscope for {params.sample}" shell: """ -mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; export SENTIEON_TMPDIR={params.tmpdir}; export SENTIEON_LICENSE={params.sentieon_lic}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_quality_filter.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_quality_filter.rule index 0aea9c2fd..96459c0cd 100644 
--- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_quality_filter.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_quality_filter.rule @@ -4,7 +4,7 @@ if config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["analys rule bcftools_quality_filter_tnscope_tumor_only: input: vcf_snv = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", - wgs_calling_file = config["reference"]["wgs_calling_interval"], + wgs_calling_file = config["reference"]["wgs_calling_regions"], output: vcf_snv_research = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", benchmark: @@ -16,12 +16,11 @@ if config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["analys DP = [SENTIEON_CALLER.DP.tag_value, SENTIEON_CALLER.DP.filter_name], AD = [SENTIEON_CALLER.AD.tag_value, SENTIEON_CALLER.AD.filter_name], AF_min = [SENTIEON_CALLER.AF_min.tag_value, SENTIEON_CALLER.AF_min.filter_name], - AF_max = [SENTIEON_CALLER.AF_max.tag_value, SENTIEON_CALLER.AF_max.filter_name], strand_reads = [SENTIEON_CALLER.strand_reads.tag_value, SENTIEON_CALLER.strand_reads.filter_name], qss = [SENTIEON_CALLER.qss.tag_value, SENTIEON_CALLER.qss.filter_name], sor = [SENTIEON_CALLER.sor.tag_value, SENTIEON_CALLER.sor.filter_name], case_name = config["analysis"]["case_id"], - threads: + threads: get_threads(cluster_config, 'bcftools_quality_filter_tnscope_tumor_only') message: "Quality filtering WGS tumor-only tnscope variants using bcftools for {params.case_name}" @@ -36,7 +35,6 @@ bcftools view -f PASS,triallelic_site --threads {threads} --regions-file {input. | bcftools filter --threads {threads} --include 'SUM(FORMAT/AD[0:0]+FORMAT/AD[0:1]) >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' \ | bcftools filter --threads {threads} --include 'FORMAT/AD[0:1] > {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' \ | bcftools filter --threads {threads} --include 'FORMAT/AF > {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' \ -| bcftools filter --threads {threads} --include 'FORMAT/AF < {params.AF_max[0]}' --soft-filter '{params.AF_max[1]}' --mode '+' \ | bcftools filter --threads {threads} --include 'SUM(FORMAT/QSS)/SUM(FORMAT/AD) >= {params.qss[0]}' --soft-filter '{params.qss[1]}' --mode '+' \ | bcftools filter --threads {threads} --include 'FORMAT/ALT_F1R2 > {params.strand_reads[0]} && (FORMAT/ALT_F1R2 > 0 && FORMAT/ALT_F2R1 > {params.strand_reads[0]} && FORMAT/REF_F1R2 > {params.strand_reads[0]} && FORMAT/REF_F2R1 > {params.strand_reads[0]})' --soft-filter '{params.strand_reads[1]}' --mode '+' \ | bcftools filter --threads {threads} --include "INFO/SOR < {params.sor[0]}" --soft-filter '{params.sor[1]}' --mode '+' \ @@ -49,7 +47,7 @@ elif config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["anal rule bcftools_quality_filter_tnscope_tumor_normal: input: vcf_snv = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", - wgs_calling_file = config["reference"]["wgs_calling_interval"], + wgs_calling_file = config["reference"]["wgs_calling_regions"], output: vcf_snv_research = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", benchmark: @@ -60,7 +58,6 @@ elif config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["anal AD = [SENTIEON_CALLER.AD.tag_value, SENTIEON_CALLER.AD.filter_name], DP = [SENTIEON_CALLER.DP.tag_value, SENTIEON_CALLER.DP.filter_name], AF_min = [SENTIEON_CALLER.AF_min.tag_value, SENTIEON_CALLER.AF_min.filter_name], - AF_max = [SENTIEON_CALLER.AF_max.tag_value, SENTIEON_CALLER.AF_max.filter_name], case_name = config["analysis"]["case_id"], threads: get_threads(cluster_config, 'bcftools_quality_filter_tnscope_tumor_normal') @@ -72,7 +69,6 @@ bcftools view -f PASS,triallelic_site {input.vcf_snv} \ | bcftools filter --threads {threads} --include 'SUM(FORMAT/AD[0:0]+FORMAT/AD[0:1]) >= {params.DP[0]} || SUM(FORMAT/AD[1:0]+FORMAT/AD[1:1]) >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' \ | bcftools filter --threads {threads} --include 'FORMAT/AD[0:1] >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' \ | bcftools filter --threads {threads} --include 'FORMAT/AF[0] >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' \ -| bcftools filter --threads {threads} --include 'FORMAT/AF[0] < {params.AF_max[0]}' --soft-filter '{params.AF_max[1]}' --mode '+' \ | bcftools view -f PASS,triallelic_site -O z -o {output.vcf_snv_research}; tabix -p vcf -f {output.vcf_snv_research}; @@ -94,7 +90,6 @@ if config["analysis"]["sequencing_type"] == 'targeted' and config["analysis"]["a AD=[VARDICT.AD.tag_value, VARDICT.AD.filter_name], DP=[VARDICT.DP.tag_value, VARDICT.DP.filter_name], AF_min=[VARDICT.AF_min.tag_value, VARDICT.AF_min.filter_name], - AF_max=[VARDICT.AF_max.tag_value, VARDICT.AF_max.filter_name], case_name = config["analysis"]["case_id"], threads: get_threads(cluster_config,'bcftools_quality_filter_vardict_tumor_only') @@ -107,7 +102,6 @@ bcftools filter --include 'INFO/MQ >= {params.MQ[0]}' --soft-filter '{params.MQ[ bcftools filter --include 'INFO/DP >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' | \ bcftools filter --include 'INFO/VD >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' | \ bcftools filter --include 'INFO/AF >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' | \ -bcftools filter --include 'INFO/AF < {params.AF_max[0]}' --soft-filter '{params.AF_max[1]}' --mode '+' | \ bcftools view -f PASS -o {output.vcf_filtered} -O z; tabix -p vcf -f {output.vcf_filtered}; @@ -154,7 +148,6 @@ elif config["analysis"]["sequencing_type"] == 'targeted' and config["analysis"][ AD=[VARDICT.AD.tag_value, VARDICT.AD.filter_name], DP=[VARDICT.DP.tag_value, VARDICT.DP.filter_name], AF_min=[VARDICT.AF_min.tag_value, VARDICT.AF_min.filter_name], - AF_max=[VARDICT.AF_max.tag_value, VARDICT.AF_max.filter_name], possible_germline="balsamic_possible_germline", case_name = config["analysis"]["case_id"], threads: @@ -168,7 +161,6 @@ bcftools filter --include 'SMPL_MIN(FMT/MQ) >= {params.MQ[0]}' --soft-filter '{p bcftools filter --include 'INFO/DP >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' | \ bcftools filter --include 'INFO/VD >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' | \ bcftools filter --include 'INFO/AF >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' | \ -bcftools filter --include 'INFO/AF < {params.AF_max[0]}' --soft-filter '{params.AF_max[1]}' --mode '+' | \ bcftools filter --exclude 'INFO/STATUS ~ "germline/i"' --soft-filter '{params.possible_germline}' --mode '+' | \ bcftools view -f PASS -o 
{output.vcf_filtered} -O z; diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule index 986beaf37..71a313722 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule @@ -14,12 +14,12 @@ rule sentieon_base_calibration: input: ref = config["reference"]["reference_genome"], mills = config["reference"]["mills_1kg"], - indel_1kg = config["reference"]["1kg_known_indel"], + indel_1kg = config["reference"]["known_indel_1kg"], dbsnp = config["reference"]["dbsnp"], - bam = Path(bam_dir, "{sample_type}.merged.bam").as_posix() + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_type = wildcards.sample_type) output: - recal_data_table = Path(bam_dir, "{sample_type}.merged.recal_data.table").as_posix(), - qual_recal = Path(bam_dir, "{sample_type}.merged.recal.csv").as_posix(), + recal_data_table = Path(bam_dir, "{sample_type}.recal_data.table").as_posix(), + qual_recal = Path(bam_dir, "{sample_type}.recal.csv").as_posix(), qual_recal_plot = Path(bam_dir, "{sample_type}.recal.pdf").as_posix(), benchmark: Path(benchmark_dir, "sentieon_base_calibration_{sample_type}.tsv").as_posix() @@ -75,7 +75,7 @@ rm -rf {params.tmpdir}; rule sentieon_TNhaplotyper_tumor_only: input: - bam = expand(bam_dir + "tumor.merged.bam"), + bam = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), recal_data_table = expand(bam_dir + "tumor.merged.recal_data.table"), ref = config["reference"]["reference_genome"], dbsnp = config["reference"]["dbsnp"], @@ -125,8 +125,8 @@ rule sentieon_TNscope_tumor_only: ref = config["reference"]["reference_genome"], dbsnp = config["reference"]["dbsnp"], cosmic = config["reference"]["cosmic"], - bam = expand(bam_dir + "tumor.merged.bam"), - recal = expand(bam_dir + "tumor.merged.recal_data.table") + bam = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + recal = expand(bam_dir + "tumor.recal_data.table") output: vcf_tnscope = vcf_dir + "sentieon_tnscope" + "/" + "ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", namemap_snv = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope.sample_name_map", diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule index 1cf7a76df..16dca4405 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule @@ -7,12 +7,12 @@ rule sentieon_base_calibration: input: ref = config["reference"]["reference_genome"], mills = config["reference"]["mills_1kg"], - indel_1kg = config["reference"]["1kg_known_indel"], + indel_1kg = config["reference"]["known_indel_1kg"], dbsnp = config["reference"]["dbsnp"], - bam = Path(bam_dir, "{sample_type}.merged.bam").as_posix() + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_type = wildcards.sample_type) output: - recal_data_table = Path(bam_dir, "{sample_type}.merged.recal_data.table").as_posix(), - qual_recal = Path(bam_dir, "{sample_type}.merged.recal.csv").as_posix(), + recal_data_table = Path(bam_dir, "{sample_type}.recal_data.table").as_posix(), + qual_recal = Path(bam_dir, "{sample_type}.recal.csv").as_posix(), qual_recal_plot = Path(bam_dir, "{sample_type}.recal.pdf").as_posix(), benchmark: Path(benchmark_dir, "sentieon_base_calibration_{sample_type}.tsv").as_posix() @@ -27,7 +27,6 @@ rule sentieon_base_calibration: "Base recalibration using Sentieon tools for {params.sample}" shell: """ -mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; export SENTIEON_TMPDIR={params.tmpdir}; export SENTIEON_LICENSE={params.sentieon_lic}; @@ -69,10 +68,10 @@ rule sentieon_TNscope: ref = config["reference"]["reference_genome"], dbsnp = config["reference"]["dbsnp"], cosmic = config["reference"]["cosmic"], - bamT = expand(bam_dir + "tumor.merged.bam"), - bamN = expand(bam_dir + "normal.merged.bam"), - recalT = expand(bam_dir + "tumor.merged.recal_data.table"), - recalN = expand(bam_dir + "normal.merged.recal_data.table"), + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), + recalT = expand(bam_dir + "tumor.recal_data.table"), + recalN = expand(bam_dir + "normal.recal_data.table"), output: vcf_tnscope = vcf_dir + "sentieon_tnscope/ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", namemap_snv = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope.sample_name_map", @@ -97,7 +96,6 @@ rule sentieon_TNscope: "applying machine learning algorithm for sample {params.case_name}") shell: """ -mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; export SENTIEON_TMPDIR={params.tmpdir}; export SENTIEON_LICENSE={params.sentieon_lic}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule new file mode 100644 index 000000000..754997fa8 --- /dev/null +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule @@ -0,0 +1,351 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + + +rule bcftools_merge_germlineSNV_research: + input: + snv_vcf_tumor = vcf_dir + "SNV.germline.tumor.dnascope.vcf.gz", + snv_vcf_normal = vcf_dir + "SNV.germline.normal.dnascope.vcf.gz", + output: + snv_merged = vcf_dir + "SNV.germline.merged.dnascope.vcf.gz", + benchmark: + Path(f"{benchmark_dir}/bcftools_merge_germlineSNV_{config['analysis']['case_id']}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + threads: + get_threads(cluster_config, "bcftools_merge_germlineSNV_research") + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + case_name = config["analysis"]["case_id"], + message: + "Merging germline SNVs using bcftools for {params.case_name}" + shell: + """ +# merge the tumor and normal VCF +bcftools merge \ +--threads {threads} \ +-O z -o {params.tmpdir}/SNV.merged.vcf.gz \ +{input.snv_vcf_tumor} {input.snv_vcf_normal}; + +tabix -p vcf -f {params.tmpdir}/SNV.merged.vcf.gz; + +bcftools sort \ +-O z -o {output.snv_merged} \ +--temp-dir {params.tmpdir} \ +{params.tmpdir}/SNV.merged.vcf.gz; + +tabix -p vcf -f {output.snv_merged}; + +rm -rf {params.tmpdir}; + """ + + +rule cnvkit_segment_CNV_research: + input: + access_bed = config["reference"]["access_regions"], + baits_bed = config["panel"]["capture_kit"], + fasta = config["reference"]["reference_genome"], + refgene_flat = config["reference"]["refgene_flat"], + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + bamN = config_model.get_final_bam_name(bam_dir = bam_dir,sample_name = normal_sample), + output: + cns_initial = cnv_dir + "tumor.initial.cns", + cnr = cnv_dir + "tumor.merged.cnr", + targets = cnv_dir + "targets.bed", + antitargets = cnv_dir + "antitarget.bed", + tumor_target_coverage = cnv_dir + "tumor.targetcoverage.cnn", + tumor_antitarget_coverage = cnv_dir + "tumor.antitargetcoverage.cnn", + normal_target_coverage = cnv_dir + "normal.targetcoverage.cnn", + normal_antitarget_coverage= cnv_dir + "normal.antitargetcoverage.cnn", + segment = cnv_dir + "tumor.seg", + benchmark: + Path(f"{benchmark_dir}/cnvkit_segment_{config['analysis']['case_id']}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("cnvkit") + ".sif").as_posix() + threads: + get_threads(cluster_config, "cnvkit_segment_CNV_research") + params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv"}, + cnv_dir = cnv_dir, + case_name = config["analysis"]["case_id"], + sample_id = "TUMOR", + normal_id = "NORMAL", + min_mapq = params.common.min_mapq, + normal_reference_cnn = cnv_dir + "normal_reference.cnn", + pon = pon_cnn, + message: + "Segmenting genomic regions using CNVkit for {params.case_name}" + shell: + """ +# create target and anti-target bed files +cnvkit.py target {input.baits_bed} \ 
+--annotate {input.refgene_flat} \ +--split \ +--output {output.targets}; + +cnvkit.py antitarget {input.baits_bed} \ +--access {input.access_bed} \ +--output {output.antitargets}; + +# calculate coverage in the given regions from BAM read depths +cnvkit.py coverage {input.bamT} \ +{output.targets} \ +--min-mapq {params.min_mapq} \ +--processes {threads} \ +--output {output.tumor_target_coverage}; + +cnvkit.py coverage {input.bamT} \ +{output.antitargets} \ +--min-mapq {params.min_mapq} \ +--processes {threads} \ +--output {output.tumor_antitarget_coverage}; + +cnvkit.py coverage {input.bamN} \ +{output.targets} \ +--min-mapq {params.min_mapq} \ +--processes {threads} \ +--output {output.normal_target_coverage}; + +cnvkit.py coverage {input.bamN} \ +{output.antitargets} \ +--min-mapq {params.min_mapq} \ +--processes {threads} \ +--output {output.normal_antitarget_coverage}; + +# Combine the uncorrected target and antitarget coverage tables (.cnn) and +# correct for biases in regional coverage and GC content, according to the given normal or PON reference +if [[ ! -f "{params.pon}" ]]; then + +# Compile a coverage reference from the given list of files +cnvkit.py reference \ +{output.normal_target_coverage} \ +{output.normal_antitarget_coverage} \ +--fasta {input.fasta} \ +--output {params.normal_reference_cnn}; + +cnvkit.py fix {output.tumor_target_coverage} \ +{output.tumor_antitarget_coverage} \ +{params.normal_reference_cnn} \ +--output {output.cnr}; + +else + +echo "PON reference exists - using it for coverage correction" +cnvkit.py fix {output.tumor_target_coverage} \ +{output.tumor_antitarget_coverage} \ +{params.pon} \ +--output {output.cnr}; + +fi + +# Infer copy number segments from the given coverage table +# segmentation method (-m): cbs: recommended for mid-size target panels and exomes +# drop bins with 0 read depth and bins whose log2 values deviate from the average +cnvkit.py segment {output.cnr} \ +--sample-id {params.sample_id} \ +--normal-id {params.normal_id} \ +--output {output.cns_initial} \ +--method cbs \ +--drop-low-coverage \ +--processes {threads}; + +# Convert copy number segments (initial.cns) to standard SEG format to be used in PureCN +cnvkit.py export seg {output.cns_initial} \ +--output {output.segment}; + """
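The target/antitarget split above drives CNVkit's on/off-target binning: conceptually, antitarget bins are the accessible genome minus the bait intervals. Below is a minimal, illustrative Python sketch of that interval subtraction with toy coordinates (subtract_intervals is a hypothetical helper, not CNVkit's implementation, which additionally pads baits and bins the leftover regions):

from typing import List, Tuple

Interval = Tuple[int, int]  # (start, end), half-open, one chromosome

def subtract_intervals(access: List[Interval], targets: List[Interval]) -> List[Interval]:
    """Return the parts of each accessible interval not covered by any target."""
    antitargets: List[Interval] = []
    for access_start, access_end in access:
        cursor = access_start
        for target_start, target_end in sorted(targets):
            if target_end <= cursor or target_start >= access_end:
                continue  # target does not overlap the remaining accessible span
            if target_start > cursor:
                antitargets.append((cursor, target_start))
            cursor = max(cursor, target_end)
        if cursor < access_end:
            antitargets.append((cursor, access_end))
    return antitargets

# One accessible block and two baits yield three antitarget spans:
print(subtract_intervals([(0, 1000)], [(100, 200), (400, 500)]))
# [(0, 100), (200, 400), (500, 1000)]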
+ config["analysis"]["case_id"] + ".purecn.cnv.csv", + purecn_vcf= cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.vcf.gz", + message: + "Computing tumor purity and ploidy using PureCN for {params.case_name}" + shell: + """ +export PURECN='/opt/PureCN/PureCN.R' + +# Run PureCN to estimate tumor-purity and ploidy +Rscript $PURECN \ +--parallel \ +--out {params.tmpdir} \ +--out-vcf TRUE \ +--sampleid {params.sample_id} \ +--tumor {input.cnr} \ +--seg-file {input.segment} \ +--vcf {input.snv_merged} \ +--genome {params.genome} \ +--fun-segmentation Hclust \ +--force --post-optimize \ +--seed 124; + +if [[ -f "{params.tmpdir}/{params.sample_id}_dnacopy.seg" ]]; then +mv {params.tmpdir}/{params.sample_id}_dnacopy.seg {params.cnv_csv}; +fi; + +if [[ -f "{params.tmpdir}/{params.sample_id}.pdf" ]]; then +mv {params.tmpdir}/{params.sample_id}.pdf {params.purity_pdf}; +fi; + +if [[ -f "{params.tmpdir}/{params.sample_id}_loh.csv" ]]; then +mv {params.tmpdir}/{params.sample_id}_loh.csv {params.loh_regions}; +fi; + +if [[ -f "{params.tmpdir}/{params.sample_id}_genes.csv" ]]; then +mv {params.tmpdir}/{params.sample_id}_genes.csv {params.loh_genes}; +fi; + +if [[ -f "{params.tmpdir}/{params.sample_id}.vcf.gz" ]]; then +mv {params.tmpdir}/{params.sample_id}.vcf.gz {params.purecn_vcf}; +fi; + +if [[ -f "{params.tmpdir}/{params.sample_id}.csv" ]]; then +mv {params.tmpdir}/{params.sample_id}.csv {output.purecn_purity}; +else +echo '"Sampleid","Purity","Ploidy","Sex","Contamination","Flagged","Failed","Curated","Comment" +"tumor.initial",0.02,2,"?",0,FALSE,FALSE,FALSE,NA' > {output.purecn_purity}; +fi; + +rm -rf {params.tmpdir}; + """ + +rule cnvkit_call_CNV_research: + input: + access_bed = config["reference"]["access_regions"], + fasta = config["reference"]["reference_genome"], + baits_bed = config["panel"]["capture_kit"], + refgene_flat = config["reference"]["refgene_flat"], + purity_ploidy = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.csv", + cns_initial = cnv_dir + "tumor.initial.cns", + cnr = cnv_dir + "tumor.merged.cnr", + snv_merged = vcf_dir + "SNV.germline.merged.dnascope.vcf.gz", + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), + output: + cns = cnv_dir + "tumor.merged.cns", + gene_breaks = cnv_dir + config["analysis"]["case_id"] + ".gene_breaks", + gene_metrics = cnv_dir + config["analysis"]["case_id"] + ".gene_metrics", + vcf = cnv_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".cnvkit.vcf", + diagram = cnv_dir + "tumor.merged-diagram.pdf", + scatter = cnv_dir + "tumor.merged-scatter.pdf", + benchmark: + Path(benchmark_dir + "cnvkit_call_CNV_research" + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("cnvkit") + ".sif").as_posix() + threads: + get_threads(cluster_config, "cnvkit_call_CNV_research") + params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv"}, + cnv_dir = cnv_dir, + cnsr = lambda wc: "tumor.merged.cn{s,r}", + case_name = config["analysis"]["case_id"], + gender = config["analysis"]["gender"], + tumor_sample_id = "TUMOR", + normal_sample_id = "NORMAL", + message: + "Computing tumor purity and ploidy using PureCN for {params.case_name}" + shell: + """ +purity=$(awk -F\\, 'NR>1 {{print $2}}' {input.purity_ploidy}) +ploidy=$(awk -F\\, 'NR>1 {{printf int($3)}}' {input.purity_ploidy}) + +# Call copy number variants from segmented log2 ratios +cnvkit.py call {input.cns_initial} \ +--vcf {input.snv_merged} \ +--sample-sex {params.gender} \ +--method clonal \ +--purity $purity \ +--ploidy $ploidy \ +--sample-id {params.tumor_sample_id} \ +--normal-id {params.normal_sample_id} \ +--output {output.cns}; + +# Plot bin-level log2 coverages and segmentation calls +cnvkit.py scatter {input.cnr} \ +--segment {output.cns} \ +--output {output.scatter}; + +# Draw copy number (.cnr or .cns) on chromosomes as an ideogram +# Draw copy number (.cnr or .cns) on chromosomes as an ideogram +cnvkit.py diagram {input.cnr} \ +--segment {output.cns} \ +--output {output.diagram}; + +# Identify targeted genes with copy number gain or loss +cnvkit.py genemetrics {input.cnr} \ +--segment {output.cns} \ +--drop-low-coverage \ +--sample-sex {params.gender} \ +--output {output.gene_metrics}; + +# List the genenames that contain a possibe copy number breakpoint. +cnvkit.py breaks {input.cnr} {output.cns} \ +| cut -f1 | sort -u > {output.gene_breaks}; + +# Convert segments to a vcf file +cnvkit.py export vcf {output.cns} \ +--cnr {input.cnr} \ +-o {output.vcf} \ +--sample-id {params.tumor_sample_id} \ +--sample-sex {params.gender}; + """ + +rule bcftools_sort_cnvkitCNV_research: + input: + vcf = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf", + output: + namemap = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.sample_name_map"), + vcf = vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".cnvkit.vcf.gz" + benchmark: + Path(f"{benchmark_dir}/bcftools_sort_cnvkitCNV_research_{config['analysis']['case_id']}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + threads: + get_threads(cluster_config, "bcftools_sort_cnvkitCNV_research") + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + cnv_dir = cnv_dir, + case_name = config["analysis"]["case_id"], + tumor = tumor_sample, + normal = normal_sample, + message: + "Sorting CNVs using bcftools for {params.case_name}" + shell: + """ +bgzip -l 9 {input.vcf}; + +tabix -p vcf -f {input.vcf}.gz; + +bcftools sort \ +-O z -o {output.vcf} \ +--temp-dir {params.tmpdir} \ +{input.vcf}.gz; + +tabix -p vcf -f {output.vcf}; + +echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap}; + +rm -rf {params.tmpdir}; + """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule new file mode 100644 index 000000000..1b6ab374c --- /dev/null +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule @@ -0,0 +1,284 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +rule cnvkit_segment_CNV_research: + input: + access_bed = config["reference"]["access_regions"], + baits_bed = config["panel"]["capture_kit"], + fasta = config["reference"]["reference_genome"], + refgene_flat = config["reference"]["refgene_flat"], + bamT = config_model.get_final_bam_name(bam_dir = bam_dir,sample_name=tumor_sample), + output: + cns_initial = cnv_dir + "tumor.initial.cns", + cnr = cnv_dir + "tumor.merged.cnr", + targets = cnv_dir + "targets.bed", + antitargets = cnv_dir + "antitarget.bed", + tumor_target_coverage = cnv_dir + "tumor.targetcoverage.cnn", + tumor_antitarget_coverage = cnv_dir + "tumor.antitargetcoverage.cnn", + segment = cnv_dir + "tumor.seg", + benchmark: + Path(f"{benchmark_dir}/cnvkit_segment_{config['analysis']['case_id']}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("cnvkit") + ".sif").as_posix() + threads: + get_threads(cluster_config,"cnvkit_segment_CNV_research") + params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv"}, + min_mapq = params.common.min_mapq, + case_name = config["analysis"]["case_id"], + sample_id = "TUMOR", + flat_reference_cnn = cnv_dir + "flat_reference.cnn", + pon = pon_cnn, + message: + ("Segmenting genomic regions using CNVkit for {params.case_name}") + shell: + """ +# create target and anti-target bed files +cnvkit.py target {input.baits_bed} \ +--annotate {input.refgene_flat} \ +--split \ +--output {output.targets}; + +cnvkit.py antitarget {input.baits_bed} \ +--access {input.access_bed} \ +--output {output.antitargets}; + +# calculate coverage in the given regions from BAM read depths +cnvkit.py coverage {input.bamT} \ +{output.targets} \ +--min-mapq {params.min_mapq} \ +--processes {threads} \ +--output {output.tumor_target_coverage}; + +cnvkit.py coverage {input.bamT} \ +{output.antitargets} \ +--min-mapq {params.min_mapq} \ +--processes {threads} \ +--output {output.tumor_antitarget_coverage}; + +# Combine the uncorrected target and antitarget coverage tables (.cnn) and +# correct for biases in regional coverage and GC content, according to the given reference +if [[ ! 
-f "{params.pon}" ]]; then +cnvkit.py reference --output {params.flat_reference_cnn} \ +--fasta {input.fasta} \ +--targets {output.targets} \ +--antitargets {output.antitargets}; + +cnvkit.py fix {output.tumor_target_coverage} \ +{output.tumor_antitarget_coverage} \ +{params.flat_reference_cnn} \ +--output {output.cnr}; + +else + +echo "PON reference exists- Using it for coverage correction" +cnvkit.py fix {output.tumor_target_coverage} \ +{output.tumor_antitarget_coverage} \ +{params.pon} \ +--output {output.cnr}; + +fi + +# Infer copy number segments from the given coverage table +# segmentattion methods (-m): cbs: reccommended for mid-size target panels and exomes +# drop bins with 0 read depth and bins whose log2 values deviating from avg +cnvkit.py segment {output.cnr} \ +--sample-id {params.sample_id} \ +--output {output.cns_initial} \ +--method cbs \ +--drop-low-coverage \ +--processes {threads}; + +# Convert copy number segments (initial.cns) to standard SEG format to be used for PureCN +cnvkit.py export seg {output.cns_initial} --output {output.segment}; + """ + +rule purecn_call_CNV_research: + input: + fasta = config["reference"]["reference_genome"], + refgene_flat = config["reference"]["refgene_flat"], + snv_vcf = vcf_dir + "SNV.germline.tumor.dnascope.vcf.gz", + segment = cnv_dir + "tumor.seg", + cnr = cnv_dir + "tumor.merged.cnr", + output: + purecn_purity = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.csv", + benchmark: + Path(f"{benchmark_dir}/purecn_call_{config['analysis']['case_id']}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("purecn") + ".sif").as_posix() + threads: + get_threads(cluster_config, "purecn_call_CNV_research") + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + cnv_dir = cnv_dir, + case_name = config["analysis"]["case_id"], + sample_id = "TUMOR", + genome = config["reference"]["genome_version"], + purity_pdf = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.pdf", + loh_regions = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.LOHregions.csv", + loh_genes = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.LOHgenes.csv", + cnv_csv = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.cnv.csv", + purecn_vcf = cnv_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".purecn.vcf.gz", + message: + ("Computing tumor purity and ploidy values using PureCN for {params.case_name}") + shell: + """ +export PURECN='/opt/PureCN/PureCN.R' + +# Run PureCN to estimate tumor-purity and ploidy +Rscript $PURECN \ +--parallel \ +--out {params.tmpdir} \ +--out-vcf TRUE \ +--sampleid {params.sample_id} \ +--tumor {input.cnr} \ +--seg-file {input.segment} \ +--vcf {input.snv_vcf} \ +--genome {params.genome} \ +--fun-segmentation Hclust \ +--force --post-optimize \ +--seed 124; + +if [[ -f "{params.tmpdir}/{params.sample_id}_dnacopy.seg" ]]; then +mv {params.tmpdir}/{params.sample_id}_dnacopy.seg {params.cnv_csv}; +fi; + +if [[ -f "{params.tmpdir}/{params.sample_id}.pdf" ]]; then +mv {params.tmpdir}/{params.sample_id}.pdf {params.purity_pdf}; +fi; + +if [[ -f "{params.tmpdir}/{params.sample_id}_loh.csv" ]]; then +mv {params.tmpdir}/{params.sample_id}_loh.csv {params.loh_regions}; +fi; + +if [[ -f "{params.tmpdir}/{params.sample_id}_genes.csv" ]]; then +mv {params.tmpdir}/{params.sample_id}_genes.csv {params.loh_genes}; +fi; + +if [[ -f "{params.tmpdir}/{params.sample_id}.vcf.gz" ]]; then +mv {params.tmpdir}/{params.sample_id}.vcf.gz {params.purecn_vcf}; +fi; + +if [[ -f "{params.tmpdir}/{params.sample_id}.csv" ]]; then +mv {params.tmpdir}/{params.sample_id}.csv {output.purecn_purity}; +else +echo '"Sampleid","Purity","Ploidy","Sex","Contamination","Flagged","Failed","Curated","Comment" +"tumor.initial",0.02,2,"?",0,FALSE,FALSE,FALSE,NA' > {output.purecn_purity}; +fi; + +rm -rf {params.tmpdir}; + """ + +rule cnvkit_call_CNV_research: + input: + access_bed = config["reference"]["access_regions"], + fasta = config["reference"]["reference_genome"], + baits_bed = config["panel"]["capture_kit"], + refgene_flat = config["reference"]["refgene_flat"], + purity_ploidy = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.csv", + cns_initial= cnv_dir + "tumor.initial.cns", + cnr = cnv_dir + "tumor.merged.cnr", + snv_vcf = vcf_dir + "SNV.germline.tumor.dnascope.vcf.gz", + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + output: + cns = cnv_dir + "tumor.merged.cns", + gene_breaks = cnv_dir + config["analysis"]["case_id"] + ".gene_breaks", + gene_metrics = cnv_dir + config["analysis"]["case_id"] + ".gene_metrics", + vcf = cnv_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".cnvkit.vcf", + diagram = cnv_dir + "tumor.merged-diagram.pdf", + scatter = cnv_dir + "tumor.merged-scatter.pdf", + benchmark: + Path(f"{benchmark_dir}/cnvkit_call_{config['analysis']['case_id']}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("cnvkit") + ".sif").as_posix() + threads: + get_threads(cluster_config, "cnvkit_call_CNV_research") + params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv"}, + cnv_dir = cnv_dir, + cnsr = lambda wc: "tumor.merged.cn{s,r}", + case_name = config["analysis"]["case_id"], + gender = config["analysis"]["gender"], + sample_id = "TUMOR", + genome_version = config["reference"]["genome_version"], + message: + ("Calling CNVs using CNVkit for {params.case_name}") + shell: + """ +purity=$(awk -F\\, 'NR>1 {{print $2}}' {input.purity_ploidy}) +ploidy=$(awk -F\\, 'NR>1 {{printf int($3)}}' {input.purity_ploidy}) + +# Call copy number variants from segmented log2 ratios +cnvkit.py call {input.cns_initial} \ +--vcf {input.snv_vcf} \ +--sample-sex {params.gender} \ +--method clonal \ +--purity $purity \ +--ploidy $ploidy \ +--output {output.cns}; + +# Plot bin-level log2 coverages and segmentation calls +cnvkit.py scatter {input.cnr} \ +--segment {output.cns} \ +--output {output.scatter}; + +# Draw copy number (.cnr or .cns) on chromosomes as an ideogram +cnvkit.py diagram {input.cnr} \ +--segment {output.cns} \ +--output {output.diagram}; + +# Identify targeted genes with copy number gain or loss +cnvkit.py genemetrics {input.cnr} \ +--segment {output.cns} \ +--drop-low-coverage \ +--sample-sex {params.gender} \ +--output {output.gene_metrics}; + +# List the genenames that contain a possibe copy number breakpoint. +cnvkit.py breaks {input.cnr} {output.cns} \ +| cut -f1 | sort -u > {output.gene_breaks}; + +# Convert segments to a vcf file +cnvkit.py export vcf {output.cns} \ +--cnr {input.cnr} \ +--output {output.vcf} \ +--sample-sex {params.gender} \ +--sample-id {params.sample_id}; + """ + +rule bcftools_sort_cnvkitCNV_research: + input: + vcf = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf", + output: + namemap = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.sample_name_map"), + vcf = vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".cnvkit.vcf.gz" + benchmark: + Path(f"{benchmark_dir}/bcftools_sort_cnvkitCNV_research_{config['analysis']['case_id']}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + threads: + get_threads(cluster_config, "bcftools_sort_cnvkitCNV_research") + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + cnv_dir = cnv_dir, + case_name = config["analysis"]["case_id"], + tumor = tumor_sample, + message: + ("Sorting CNVs using bcftools for {params.case_name}") + shell: + """ +bgzip -l 9 {input.vcf}; + +tabix -p vcf -f {input.vcf}.gz; + +bcftools sort \ +-o {output.vcf} \ +--temp-dir {params.tmpdir} \ +-O z {input.vcf}.gz; + +tabix -p vcf -f {output.vcf}; + +echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; + +rm -rf {params.tmpdir}; + """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule index 612ba583b..ff79d5805 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule @@ -1,14 +1,12 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -normal_bam = "normal.merged.bam" -tumor_bam = "tumor.merged.bam" rule manta_tumor_normal: input: fa = config["reference"]["reference_genome"], - bamN = bam_dir + normal_bam, - bamT = bam_dir + tumor_bam, + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample) output: final = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".manta.vcf.gz", namemap = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".manta.sample_name_map", @@ -19,9 +17,9 @@ rule manta_tumor_normal: params: tmpdir = tempfile.mkdtemp(prefix=tmp_dir), runmode = "local", - tumor = get_sample_type(config["samples"], "tumor"), - normal = get_sample_type(config["samples"], "normal"), - case_name = config["analysis"]["case_id"], + tumor = config_model.get_sample_name_by_type(SampleType.TUMOR), + normal = config_model.get_sample_name_by_type(SampleType.NORMAL), + case_name = case_id, manta_install_path = "/opt/conda/share/manta-1.6.0-2" threads: get_threads(cluster_config, "manta_tumor_normal") @@ -57,8 +55,8 @@ rm -rf {params.tmpdir}; rule delly_sv_tumor_normal: input: fa = config["reference"]["reference_genome"], - bamN = bam_dir + normal_bam, - bamT = bam_dir + tumor_bam, + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), excl = config["reference"]["delly_exclusion_converted"], output: final = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", @@ -94,8 +92,8 @@ if config["analysis"]["sequencing_type"] == 'wgs': rule delly_cnv_tumor_normal: input: fa = config["reference"]["reference_genome"], - bamN = bam_dir + normal_bam, - bamT = bam_dir + tumor_bam, + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), map = config["reference"]["delly_mappability"], output: cnv_delly = vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".delly.bcf", @@ -142,8 +140,8 @@ else: rule delly_cnv_tumor_normal: input: fa = config["reference"]["reference_genome"], - bamN = bam_dir + normal_bam, - bamT = bam_dir + tumor_bam, + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), map = config["reference"]["delly_mappability"], baits_bed = config["panel"]["capture_kit"], output: @@ -190,19 +188,19 @@ rm -rf {params.tmpdir}; rule ascat_tumor_normal: input: fa = config["reference"]["reference_genome"] , - bamN = bam_dir + normal_bam, - bamT = bam_dir + tumor_bam, - gccorrection = config["reference"]["ascat_gccorrection"], + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + gccorrection = config["reference"]["ascat_gc_correction"] output: final_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".raw.ascat.vcf.gz", ascat_copynumber = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.copynumber.txt.gz", - sample_statistics = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.samplestatistics.txt"), - plot_ascat_profile = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.ascatprofile.png"), - plot_raw_profile = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.rawprofile.png"), - plot_aspcf = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.ASPCF.png"), - plot_tumor = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.tumor.png"), - plot_germline = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.germline.png"), - plot_sunrise = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.sunrise.png"), + sample_statistics = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.samplestatistics.txt", + plot_ascat_profile = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.ascatprofile.png", + plot_raw_profile = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.rawprofile.png", + plot_aspcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.ASPCF.png", + plot_tumor = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.tumor.png", + plot_germline = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.germline.png", + plot_sunrise = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.sunrise.png", namemap = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.sample_name_map", benchmark: benchmark_dir + 'ascat_tumor_normal_' + config["analysis"]["case_id"] + "_ascat.tsv" @@ -267,8 +265,8 @@ rm -rf {params.tmpdir}; rule tiddit_sv_tumor_normal: input: fa = config["reference"]["reference_genome"], - bamN = bam_dir + normal_bam, - bamT = bam_dir + tumor_bam, + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample) output: vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".tiddit.vcf.gz", cov_tumor_tiddit = vcf_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".tumor.tiddit_cov.bed", @@ -415,8 +413,8 @@ rule svdb_merge_tumor_normal: Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix() params: housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - tumor = get_sample_type(config["samples"], "tumor"), - normal = get_sample_type(config["samples"], "normal"), + tumor = config_model.get_sample_name_by_type(SampleType.TUMOR), + normal = config_model.get_sample_name_by_type(SampleType.NORMAL), case_name = config["analysis"]["case_id"], vcf= lambda wildcards, input:[input[index] + ":" + svdb_callers_prio[index] for index in range(0,len(input))], svdb_priority= ",".join(svdb_callers_prio) diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule index 25900e30a..cb5b860f7 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule @@ -1,12 +1,11 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -tumor_bam = "tumor.merged.bam" rule manta_tumor_only: input: fa = config["reference"]["reference_genome"], - bamT = bam_dir + tumor_bam + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample) output: final = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".manta.vcf.gz", namemap = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".manta.sample_name_map" @@ -17,7 +16,7 @@ rule manta_tumor_only: params: tmpdir = tempfile.mkdtemp(prefix=tmp_dir), runmode = "local", - tumor = get_sample_type(config["samples"], "tumor"), + tumor = config_model.get_sample_name_by_type(SampleType.TUMOR), case_name = config["analysis"]["case_id"], manta_install_path= "/opt/conda/share/manta-1.6.0-2" threads: @@ -53,7 +52,7 @@ rm -rf {params.tmpdir}; rule delly_sv_tumor_only: input: fa = config["reference"]["reference_genome"], - bamT = bam_dir + tumor_bam, + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), excl = config["reference"]["delly_exclusion_converted"], output: bcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", @@ -83,7 +82,7 @@ if config["analysis"]["sequencing_type"] == 'wgs': rule delly_cnv_tumor_only: input: fa = config["reference"]["reference_genome"], - bamT = bam_dir + tumor_bam, + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), bcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", map = config["reference"]["delly_mappability"], output: @@ -115,10 +114,10 @@ rm -rf {params.tmpdir}; else: rule delly_cnv_tumor_only: input: - fa=config["reference"]["reference_genome"], - bamT=bam_dir + tumor_bam, - bcf=vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", - map=config["reference"]["delly_mappability"], + fa = config["reference"]["reference_genome"], + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + bcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", + map = config["reference"]["delly_mappability"], baits_bed = config["panel"]["capture_kit"], output: cnv_delly=vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".delly.bcf", @@ -150,7 +149,7 @@ rm -rf {params.tmpdir}; rule tiddit_sv_tumor_only: input: fa = config["reference"]["reference_genome"], - bamT = bam_dir + tumor_bam, + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample) output: vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".tiddit.vcf.gz", cov_tumor_tiddit = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".tumor.tiddit_cov.bed", @@ -187,12 +186,12 @@ rm -rf {params.tmpdir}; rule cnvpytor_tumor_only: input: - bamT = bam_dir + tumor_bam, + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), vcfT = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", output: cnv_cnvpytor = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvpytor.vcf.gz", circular_cnvpytor = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvpytor.circular.png", - scattter_cnvpytor = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvpytor.scatter.png", + scatter_cnvpytor = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvpytor.scatter.png", namemap= vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvpytor.sample_name_map", benchmark: benchmark_dir + 'cnvpytor_tumor_only_' + config["analysis"]["case_id"] + ".tsv" @@ -240,7 +239,7 @@ cnvpytor -root {params.tmpdir}/{params.tumor}.pytor \ for binsize in 1000 10000 100000; do cnvpytor -root {params.tmpdir}/{params.tumor}.pytor \ -view $binsize < {params.tmpdir}/{params.tumor}.out.sh;done; -cp {params.tmpdir}/{params.tumor}.global.0000.png {output.scattter_cnvpytor}; +cp {params.tmpdir}/{params.tumor}.global.0000.png {output.scatter_cnvpytor}; cp {params.tmpdir}/{params.tumor}.circular.0001.png {output.circular_cnvpytor}; @@ -301,7 +300,7 @@ rule svdb_merge_tumor_only: Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix() params: housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - tumor = get_sample_type(config["samples"], "tumor"), + tumor = config_model.get_sample_name_by_type(SampleType.TUMOR), case_name = config["analysis"]["case_id"], vcf= lambda wildcards, input:[input[index] + ":" + svdb_callers_prio[index] for index in range(0,len(input))], svdb_priority= ",".join(svdb_callers_prio) diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule index 7daf0988a..f8ba2a2e1 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule @@ -6,8 +6,8 @@ rule vardict_tumor_normal: input: fa = config["reference"]["reference_genome"], - bamN = bam_dir + "normal.merged.bam", - bamT = bam_dir + "tumor.merged.bam", + bamN = expand(bam_dir + "normal.{sample}.dedup_sorted_addRG.bam", sample=normal_sample), + bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_addRG.bam", sample=tumor_sample), bed = vcf_dir + "split_bed/{bedchrom}." 
+ capture_kit, output: temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz") @@ -30,7 +30,7 @@ rule vardict_tumor_normal: """ mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; -export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx32G\"'; +export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx90G\"'; vardict-java -U -u -I 600 -G {input.fa} -f {params.af} -N {params.case_name} \ -b \"{input.bamT}|{input.bamN}\" \ @@ -83,8 +83,8 @@ rm -rf {params.tmpdir}; rule sentieon_TNhaplotyper: input: - bamT = bam_dir + "tumor.merged.bam", - bamN = bam_dir + "normal.merged.bam", + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), interval = config["panel"]["capture_kit"], ref = config["reference"]["reference_genome"], dbsnp = config["reference"]["dbsnp"], diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule index 8ff042d93..cf8ea7148 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule @@ -4,7 +4,7 @@ rule vardict_tumor_only: input: fa = config["reference"]["reference_genome"], - bamT = bam_dir + "tumor.merged.bam", + bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_addRG.bam", sample=tumor_sample), bed = vcf_dir + "split_bed/{bedchrom}." + capture_kit, output: temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz") @@ -29,7 +29,7 @@ export PERL5LIB=; mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; -export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx48G\"'; +export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx45G\"'; vardict-java -u -I 600 \ -G {input.fa} \ @@ -84,7 +84,7 @@ echo '{{ vcf: {{ vardict: {{ name: vardict , path: {output.vcf_vardict} }} }} }} rule sentieon_TNhaplotyper_tumor_only: input: - bam = bam_dir + "tumor.merged.bam", + bam = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), ref = config["reference"]["reference_genome"], dbsnp = config["reference"]["dbsnp"], cosmic = config["reference"]["cosmic"], diff --git a/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule b/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule index 747d7604e..7c19c0b80 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule @@ -7,7 +7,7 @@ rule bedtools_splitbed_by_chrom: input: bed = config["panel"]["capture_kit"], chrom = config["reference"]["genome_chrom_size"], - bam = bam_dir + "tumor.merged.bam" + bam = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample) output: bed = expand(vcf_dir + "split_bed/" + "{chrom}." 
+ capture_kit, chrom=chromlist) benchmark: diff --git a/BALSAMIC/utils/analysis.py b/BALSAMIC/utils/analysis.py new file mode 100644 index 000000000..3596d2f65 --- /dev/null +++ b/BALSAMIC/utils/analysis.py @@ -0,0 +1,69 @@ +"""Utility functions for Balsamic analysis.""" +import os +from pathlib import Path +from typing import Any, Dict, List + +from BALSAMIC.constants.paths import ( + ASSETS_DIR, + BALSAMIC_DIR, + CADD_ANNOTATIONS_CONTAINER_DIR, +) +from BALSAMIC.models.cache import CacheConfig +from BALSAMIC.models.snakemake import SingularityBindPath +from BALSAMIC.utils.cli import get_resolved_fastq_files_directory + + +def get_singularity_bind_paths( + sample_config: Dict[str, Any] +) -> List[SingularityBindPath]: + """Return a list of singularity binding paths for Balsamic analysis.""" + analysis_dir: Path = Path(sample_config["analysis"]["analysis_dir"]) + fastq_dir: Path = Path( + get_resolved_fastq_files_directory(sample_config["analysis"]["fastq_path"]) + ) + cache_dir: Path = Path(os.path.commonpath(sample_config["reference"].values())) + singularity_bind_paths: List[SingularityBindPath] = [ + SingularityBindPath(source=fastq_dir, destination=fastq_dir), + SingularityBindPath(source=ASSETS_DIR, destination=ASSETS_DIR), + SingularityBindPath(source=analysis_dir, destination=analysis_dir), + SingularityBindPath(source=cache_dir, destination=cache_dir), + ] + if sample_config.get("panel"): + capture_kit_path: Path = Path(sample_config.get("panel").get("capture_kit")) + singularity_bind_paths.append( + SingularityBindPath(source=capture_kit_path, destination=capture_kit_path) + ) + if sample_config.get("panel").get("pon_cnn"): + pon_cnn_path: Path = Path(sample_config.get("panel").get("pon_cnn")) + singularity_bind_paths.append( + SingularityBindPath(source=pon_cnn_path, destination=pon_cnn_path) + ) + if sample_config.get("background_variants"): + background_variants_path: Path = Path(sample_config.get("background_variants")) + singularity_bind_paths.append( + SingularityBindPath( + source=background_variants_path, destination=background_variants_path + ) + ) + if sample_config.get("reference").get("cadd_annotations"): + cadd_annotations_path: Path = Path( + sample_config.get("reference").get("cadd_annotations") + ) + singularity_bind_paths.append( + SingularityBindPath( + source=cadd_annotations_path, destination=CADD_ANNOTATIONS_CONTAINER_DIR + ) + ) + return singularity_bind_paths + + +def get_cache_singularity_bind_paths( + cache_config: CacheConfig, +) -> List[SingularityBindPath]: + """Return a list of singularity binding paths for Balsamic init.""" + return [ + SingularityBindPath(source=BALSAMIC_DIR, destination=BALSAMIC_DIR), + SingularityBindPath( + source=cache_config.references_dir, destination=cache_config.references_dir + ), + ] diff --git a/BALSAMIC/utils/cache.py b/BALSAMIC/utils/cache.py new file mode 100644 index 000000000..f382bbec1 --- /dev/null +++ b/BALSAMIC/utils/cache.py @@ -0,0 +1,17 @@ +"""Utility methods for Balsamic init command.""" +from typing import Dict + +from BALSAMIC.constants.cache import DOCKER_URL, DockerContainers, CacheVersion + + +def get_containers(cache_version: str) -> Dict[str, str]: + """Return a dictionary mapping container names to their docker image paths.""" + cache_version: str = ( + cache_version + if cache_version == CacheVersion.DEVELOP + else f"release_v{cache_version}" + ) + return { + container: f"{DOCKER_URL}:{cache_version}-{container}" + for container in set(DockerContainers) + } diff --git a/BALSAMIC/utils/cli.py 
b/BALSAMIC/utils/cli.py index e1866816a..552619c19 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -1,22 +1,26 @@ -import os -import shutil import logging -import sys -import collections +import os import re import subprocess -from pathlib import Path -from io import StringIO +import sys from distutils.spawn import find_executable -import zlib -from typing import List +from io import StringIO +from pathlib import Path +from typing import Dict, List, Optional -import yaml -import snakemake +import click import graphviz +import snakemake +import yaml from colorclass import Color from BALSAMIC import __version__ as balsamic_version +from BALSAMIC.constants.analysis import FASTQ_SUFFIXES, FastqName, PonParams, SampleType +from BALSAMIC.constants.cache import CacheVersion +from BALSAMIC.constants.cluster import ClusterConfigType +from BALSAMIC.constants.constants import FileType +from BALSAMIC.constants.paths import CONSTANTS_DIR +from BALSAMIC.models.config import FastqInfoModel, SampleInstanceModel from BALSAMIC.utils.exc import BalsamicError LOG = logging.getLogger(__name__) @@ -38,155 +42,6 @@ def __exit__(self, *args): sys.stdout = self._stdout -class SnakeMake: - """ - To build a snakemake command using cli options - - Params: - case_name - analysis case name - working_dir - working directory for snakemake - configfile - sample configuration file (json) output of balsamic-config-sample - run_mode - run mode - cluster or local shell run - cluster_config - cluster config json file - scheduler - slurm command constructor - log_path - log file path - script_path - file path for slurm scripts - result_path - result directory - qos - QOS for sbatch jobs - account - scheduler(e.g. slurm) account - mail_user - email to account to send job run status - forceall - To add '--forceall' option for snakemake - run_analysis - To run pipeline - use_singularity - To use singularity - singularity_bind- Singularity bind path - quiet - Quiet mode for snakemake - singularity_arg - Singularity arguments to pass to snakemake - sm_opt - snakemake additional options - disable_variant_caller - Disable variant caller - dragen - enable/disable dragen suite - slurm_profiler - enable slurm profiler - """ - - def __init__(self): - self.case_name = str() - self.working_dir = str() - self.snakefile = str() - self.configfile = str() - self.run_mode = str() - self.profile = str() - self.cluster_config = str() - self.scheduler = str() - self.log_path = str() - self.script_path = str() - self.result_path = str() - self.qos = str() - self.account = str() - self.mail_type = str() - self.mail_user = str() - self.forceall = False - self.run_analysis = False - self.quiet = False - self.report = str() - self.use_singularity = True - self.singularity_bind = str() - self.singularity_arg = str() - self.sm_opt = str() - self.disable_variant_caller = str() - self.dragen = False - self.slurm_profiler = str() - - def build_cmd(self): - forceall = str() - quiet_mode = str() - sm_opt = str() - cluster_cmd = str() - dryrun = str() - report = str() - snakemake_config_key_value = list() - - if self.forceall: - forceall = "--forceall" - - if self.report: - report = "--report {}".format(self.report) - - if self.quiet: - quiet_mode = " --quiet " - - if self.sm_opt: - sm_opt = " ".join(self.sm_opt) - - if not self.run_analysis: - dryrun = "--dryrun" - - if self.disable_variant_caller: - snakemake_config_key_value.append( - f"disable_variant_caller={self.disable_variant_caller}" - ) - - if self.dragen: - 
snakemake_config_key_value.append("dragen=True") - - if snakemake_config_key_value: - snakemake_config_key_value.insert(0, "--config") - - if self.use_singularity: - self.singularity_arg = "--use-singularity --singularity-args ' --cleanenv " - for bind_path in self.singularity_bind: - self.singularity_arg += " --bind {}:{}".format(bind_path, bind_path) - self.singularity_arg += "' " - - if self.run_mode == "cluster": - sbatch_cmd = ( - " '{} {} " - " --sample-config {} --profile {} " - " --account {} --qos {} " - " --log-dir {} --script-dir {} " - " --result-dir {} ".format( - sys.executable, - self.scheduler, - self.configfile, - self.profile, - self.account, - self.qos, - self.log_path, - self.script_path, - self.result_path, - ) - ) - - if self.slurm_profiler: - sbatch_cmd += " --slurm-profiler {}".format(self.slurm_profiler) - - if self.mail_user: - sbatch_cmd += " --mail-user {} ".format(self.mail_user) - - if self.mail_type: - sbatch_cmd += " --mail-type {} ".format(self.mail_type) - - sbatch_cmd += " {dependencies} '" - - cluster_cmd = ( - " --immediate-submit -j 999 " - "--jobname BALSAMIC.{}.{{rulename}}.{{jobid}}.sh " - "--cluster-config {} --cluster {} ".format( - self.case_name, self.cluster_config, sbatch_cmd - ) - ) - - # Merge snakmake config key value list - snakemake_config_key_value = " ".join(snakemake_config_key_value) - - sm_cmd = ( - f" snakemake --notemp -p " - f" --directory {self.working_dir} --snakefile {self.snakefile} --configfiles {self.configfile} " - f" {self.cluster_config} {self.singularity_arg} {quiet_mode} " - f" {forceall} {dryrun} {cluster_cmd} " - f" {report} {snakemake_config_key_value} {sm_opt}" - ) - return sm_cmd - - def add_doc(docstring): """ A decorator for adding docstring. Taken shamelessly from stackexchange. @@ -223,39 +78,14 @@ def createDir(path, interm_path=[]): return os.path.abspath(path) -def iterdict(dic): - """dictionary iteration - returns generator""" - for key, value in dic.items(): - if isinstance(value, dict): - yield from iterdict(value) - else: - yield key, value - - -def get_schedulerpy(): - """ - Returns a string path for scheduler.py - """ - - p = Path(__file__).parents[1] - scheduler = str(Path(p, "utils", "scheduler.py")) - - return scheduler - - -def get_snakefile(analysis_type, analysis_workflow="balsamic", reference_genome="hg19"): - """ - Return a string path for variant calling snakefile. - """ +def get_snakefile(analysis_type, analysis_workflow="balsamic") -> str: + """Return a string path for the specific snakemake file.""" p = Path(__file__).parents[1] snakefile = Path(p, "workflows", "balsamic.smk") if analysis_type == "generate_ref": snakefile = Path(p, "workflows", "reference.smk") - if "canfam3" in reference_genome: - snakefile = Path(p, "workflows", "reference-canfam3.smk") - return str(snakefile) if analysis_type == "pon": snakefile = Path(p, "workflows", "PON.smk") @@ -266,36 +96,9 @@ def get_snakefile(analysis_type, analysis_workflow="balsamic", reference_genome= return str(snakefile) -def get_config(config_name): - """ - Return a string path for config file. - """ - - p = Path(__file__).parents[1] - config_file = str(Path(p, "config", config_name + ".json")) - if Path(config_file).exists(): - return config_file - else: - raise FileNotFoundError(f"Config for {config_name} was not found.") - - -def recursive_default_dict(): - """ - Recursivly create defaultdict. 
- """ - return collections.defaultdict(recursive_default_dict) - - -def convert_defaultdict_to_regular_dict(inputdict: dict): - """ - Recursively convert defaultdict to dict. - """ - if isinstance(inputdict, collections.defaultdict): - inputdict = { - key: convert_defaultdict_to_regular_dict(value) - for key, value in inputdict.items() - } - return inputdict +def get_config_path(config_type: ClusterConfigType) -> Path: + """Return a config path given its type.""" + return Path(CONSTANTS_DIR, f"{config_type}.{FileType.JSON}") def find_file_index(file_path): @@ -304,6 +107,7 @@ def find_file_index(file_path): ".cram": [".cram.crai", ".crai"], ".vcf.gz": [".vcf.gz.tbi"], ".vcf": [".vcf.tbi"], + ".bed.gz": [".bed.gz.tbi"], } file_path_index = set() @@ -331,25 +135,6 @@ def get_file_extension(file_path): return file_extension[1:] -def get_from_two_key(input_dict, from_key, by_key, by_value, default=None): - """ - Given two keys with list of values of same length, find matching index of by_value in from_key from by_key. - - from_key and by_key should both exist - """ - - matching_value = default - if ( - from_key in input_dict - and by_key in input_dict - and by_value in input_dict[from_key] - ): - idx = input_dict[from_key].index(by_value) - matching_value = input_dict[by_key][idx] - - return matching_value - - def get_file_status_string(file_to_check): """ Checks if file exsits. and returns a string with checkmark or redcorss mark @@ -365,50 +150,6 @@ def get_file_status_string(file_to_check): return return_str, file_status -def singularity(sif_path: str, cmd: str, bind_paths: list) -> str: - """Run within container - - Excutes input command string via Singularity container image - - Args: - sif_path: Path to singularity image file (sif) - cmd: A string for series of commands to run - bind_path: a path to bind within container - - Returns: - A sanitized Singularity cmd - - Raises: - BalsamicError: An error occured while creating cmd - """ - - singularity_cmd = shutil.which("singularity") - if not singularity_cmd: - raise BalsamicError("singularity command does not exist") - - if not Path(sif_path).is_file(): - raise BalsamicError("container file does not exist") - - singularity_bind_path = "" - for bind_path in bind_paths: - singularity_bind_path += "--bind {} ".format(bind_path) - - shellcmd = "singularity exec {} {}".format(singularity_bind_path, cmd) - - return " ".join(shellcmd.split()) - - -def validate_fastq_pattern(sample): - """Finds the correct filename prefix from file path, and returns it. - An error is raised if sample name has invalid pattern""" - - fq_pattern = re.compile(r"R_[12]" + ".fastq.gz$") - sample_basename = Path(sample).name - - file_str = sample_basename[0 : (fq_pattern.search(sample_basename).span()[0] + 1)] - return file_str - - def get_panel_chrom(panel_bed) -> list: """Returns a set of chromosomes present in PANEL BED""" @@ -475,7 +216,7 @@ def bioinfo_tool_version_conda( def get_bioinfo_tools_version( - bioinfo_tools: dict, container_conda_env_path: os.PathLike + bioinfo_tools: dict, container_conda_env_path: Path ) -> dict: """Parses the names and versions of bioinfo tools used by BALSAMIC from config YAML into a dict. 
@@ -513,59 +254,125 @@ def get_bioinfo_tools_version( return bioinfo_tools_version -def get_sample_dict( - tumor: str, - normal: str, - tumor_sample_name: str = None, - normal_sample_name: str = None, -) -> dict: - """Concatenates sample dicts for all provided files""" - samples = {} - if normal: - for sample in normal: - key, val = get_sample_names(sample, "normal") - samples[key] = val - samples[key]["sample_name"] = normal_sample_name - - for sample in tumor: - key, val = get_sample_names(sample, "tumor") - samples[key] = val - samples[key]["sample_name"] = tumor_sample_name - return samples - - -def get_sample_names(filename, sample_type): - """Creates a dict with sample prefix, sample type, and readpair suffix""" - file_str = validate_fastq_pattern(filename) - if file_str: - return ( - file_str, +def get_fastq_info(sample_name: str, fastq_path: str) -> Dict[str, FastqInfoModel]: + """Returns a dictionary of fastq-pattern/s and FastqInfoModel instance/s for a sample. + + Args: + sample_name: (str). The name of the sample for which fastq-files will be searched for in the fastq_path. + fastq_path: (str). Path to where the fastq-files should be found for the supplied sample_name. + + Returns: + fastq_dict: (Dict) with format: + "[fastq_patternX]" (str): FastqInfoModel. + """ + fastq_dict: Dict[str, Dict] = {} + + for suffix_id, suffix_values in FASTQ_SUFFIXES.items(): + suffix_fwd = suffix_values[FastqName.FWD] + suffix_rev = suffix_values[FastqName.REV] + + fastq_fwd_regex = re.compile( + r"(^|.*_)" + sample_name + r"_.*" + suffix_fwd + r"$" + ) + + fwd_fastqs = [ + f"{fastq_path}/{fastq}" + for fastq in os.listdir(fastq_path) + if fastq_fwd_regex.match(fastq) + ] + + for fwd_fastq in fwd_fastqs: + fastq_pair_pattern = Path(fwd_fastq).name.replace(suffix_fwd, "") + if fastq_pair_pattern in fastq_dict: + error_message = ( + f"Fastq name conflict. Fastq pair pattern {fastq_pair_pattern}" + f" already assigned to dictionary for sample: {sample_name}" + ) + LOG.error(error_message) + raise BalsamicError(error_message) + + rev_fastq: str = fwd_fastq.replace(suffix_fwd, suffix_rev) + fastq_dict[fastq_pair_pattern] = { + "fwd": fwd_fastq, + "rev": rev_fastq, + } + fastq_dict[fastq_pair_pattern].update( + { + "fwd_resolved": Path(fwd_fastq).resolve().as_posix(), + "rev_resolved": Path(rev_fastq).resolve().as_posix(), + } + ) if Path(fwd_fastq).is_symlink() or Path(rev_fastq).is_symlink() else None + + if not fastq_dict: + error_message = f"No fastqs found for: {sample_name} in {fastq_path}" + LOG.error(error_message) + raise BalsamicError(error_message) + + return fastq_dict + + +def get_sample_list( + tumor_sample_name: str, normal_sample_name: Optional[str], fastq_path: str +) -> List[Dict]: + """Returns a list of SampleInstanceModel/s given the names of the tumor and/or normal samples. + Args: + tumor_sample_name (str). The sample_name of the tumor. + normal_sample_name (str). The sample_name of the normal, if it exists. + fastq_path: (str). The path to the fastq-files for the supplied samples. + + Returns: + sample_list: List containing SampleInstanceModel/s. 
+ """ + sample_list: List[Dict] = [ + { + "name": tumor_sample_name, + "type": SampleType.TUMOR, + "fastq_info": get_fastq_info(tumor_sample_name, fastq_path), + } + ] + + if normal_sample_name: + sample_list.append( { - "file_prefix": file_str, - "type": sample_type, - "readpair_suffix": ["1", "2"], - }, + "name": normal_sample_name, + "type": SampleType.NORMAL, + "fastq_info": get_fastq_info(normal_sample_name, fastq_path), + } ) + return sample_list -def create_fastq_symlink(casefiles, symlink_dir: Path): - """Creates symlinks for provided files in analysis/fastq directory. - Identifies file prefix pattern, and also creates symlinks for the - second read file, if needed""" - for filename in casefiles: - parent_dir = Path(filename).parents[0] - file_str = validate_fastq_pattern(filename) - for f in parent_dir.rglob(f"*{file_str}*.fastq.gz"): - try: - LOG.info(f"Creating symlink {f} -> {Path(symlink_dir, f.name)}") - Path(symlink_dir, f.name).symlink_to(f) - except FileExistsError: - LOG.info(f"File {symlink_dir / f.name} exists, skipping") +def get_pon_sample_list(fastq_path: str) -> List[SampleInstanceModel]: + """Returns a list of SampleInstanceModels to be used in PON generation.""" + sample_list: List[SampleInstanceModel] = [] + sample_names = set() + + for fastq in Path(fastq_path).glob(f"*.{FileType.FASTQ}.{FileType.GZ}"): + sample_names.add(fastq.name.split("_")[-4]) + + if len(sample_names) < PonParams.MIN_PON_SAMPLES: + error_message = ( + f"Number of samples detected in supplied fastq path ({len(sample_names)})," + f"not sufficient for PON generation. Sample names detected: {sample_names}" + ) + LOG.error(error_message) + raise BalsamicError(error_message) + + for sample_name in sample_names: + sample_list.append( + { + "name": sample_name, + "type": SampleType.NORMAL, + "fastq_info": get_fastq_info(sample_name, fastq_path), + } + ) + + return sample_list def generate_graph(config_collection_dict, config_path): - """Generate DAG graph using snakemake stdout output""" + """Generate DAG graph using snakemake stdout output.""" with CaptureStdout() as graph_dot: snakemake.snakemake( @@ -574,9 +381,6 @@ def generate_graph(config_collection_dict, config_path): analysis_workflow=config_collection_dict["analysis"][ "analysis_workflow" ], - reference_genome=config_collection_dict["reference"][ - "reference_genome" - ], ), dryrun=True, configfiles=[config_path], @@ -602,30 +406,16 @@ def generate_graph(config_collection_dict, config_path): graph_obj.render(cleanup=True) -def get_fastq_bind_path(fastq_path: Path) -> list(): - """Takes a path with symlinked fastq files. 
- Returns unique paths to parent directories for singulatiry bind - """ - parents = set() - for fastq_file_path in Path(fastq_path).iterdir(): - parents.add(Path(fastq_file_path).resolve().parent.as_posix()) - return list(parents) - - def convert_deliverables_tags(delivery_json: dict, sample_config_dict: dict) -> dict: - """Replaces values of file_prefix with sample_name in deliverables dict""" + """Replaces values of sample_type with sample_name in deliverables dict.""" for delivery_file in delivery_json["files"]: file_tags = delivery_file["tag"].split(",") - for sample in sample_config_dict["samples"]: - file_prefix = sample_config_dict["samples"][sample]["file_prefix"] - sample_name = sample_config_dict["samples"][sample]["sample_name"] - sample_type = sample_config_dict["samples"][sample]["type"] - if file_prefix == delivery_file["id"]: - delivery_file["id"] = sample_name - for tag_index, tag in enumerate(file_tags): - if tag == file_prefix or tag == file_prefix.replace("_", "-"): - file_tags[tag_index] = sample_name + sample_list = sample_config_dict["samples"] + for sample_dict in sample_list: + sample_type = sample_dict["type"] + sample_name = sample_dict["name"] + if sample_name == delivery_file["id"]: if sample_name not in file_tags: file_tags.append(sample_name) if sample_type == delivery_file["id"]: @@ -667,31 +457,46 @@ def job_id_dump_to_yaml(job_id_dump: Path, job_id_yaml: Path, case_name: str): yaml.dump({case_name: jobid_list}, jobid_out) -def create_pon_fastq_symlink(pon_fastqs, symlink_dir): - for fastq_name in os.listdir(pon_fastqs): - pon_fastq = Path(pon_fastqs, fastq_name).as_posix() - pon_sym_file = Path(symlink_dir, fastq_name).as_posix() - try: - LOG.info(f"Creating symlink {fastq_name} -> {pon_sym_file}") - os.symlink(pon_fastq, pon_sym_file) - except FileExistsError: - LOG.info(f"File {pon_sym_file} exists, skipping") - - -def get_md5(filename): - with open(filename, "rb") as fh: - hashed = 0 - while True: - s = fh.read(65536) - if not s: - break - hashed = zlib.crc32(s, hashed) - return "%08X" % (hashed & 0xFFFFFFFF) - - -def create_md5(reference, check_md5): - """create a md5 file for all reference data""" - with open(check_md5, "w") as fh: - for key, value in reference.items(): - if os.path.isfile(value): - fh.write(get_md5(value) + " " + value + "\n") +def get_resolved_fastq_files_directory(directory: str) -> str: + """Return the absolute path for the directory containing the input fastq files.""" + input_files: List[Path] = [ + file.absolute() + for file in Path(directory).glob(f"*.{FileType.FASTQ}.{FileType.GZ}") + ] + if not input_files or not input_files[0].is_symlink(): + return directory + return os.path.commonpath([file.resolve().as_posix() for file in input_files]) + + +def get_analysis_fastq_files_directory(case_dir: str, fastq_path: str) -> str: + """Return analysis fastq directory, linking the fastq files if necessary.""" + analysis_fastq_path: Path = Path(case_dir, "fastq") + analysis_fastq_path.mkdir(parents=True, exist_ok=True) + if Path(case_dir) not in Path(fastq_path).parents: + for fastq in Path(fastq_path).glob(f"*.{FileType.FASTQ}.{FileType.GZ}"): + try: + Path(analysis_fastq_path, fastq.name).symlink_to(fastq) + LOG.info(f"Created link for {fastq} in {analysis_fastq_path}") + except FileExistsError: + LOG.warning( + f"File {Path(analysis_fastq_path, fastq.name)} exists. Skipping linking." 
+ ) + + return analysis_fastq_path.as_posix() + return Path(fastq_path).as_posix() + + +def validate_cache_version( + _ctx: click.Context, _param: click.Parameter, version: str +) -> str: + """Validate the provided cache version.""" + version_parts: List[str] = version.split(".") + if version == CacheVersion.DEVELOP or ( + len(version_parts) == 3 + and all(part.isdigit() for part in version_parts) + ): + return version + raise click.BadParameter( + f"Invalid cache version format. Use '{CacheVersion.DEVELOP}' or 'X.X.X'." + ) diff --git a/BALSAMIC/utils/io.py b/BALSAMIC/utils/io.py index e747418cb..c990e114b 100644 --- a/BALSAMIC/utils/io.py +++ b/BALSAMIC/utils/io.py @@ -1,32 +1,97 @@ -"""Input/Output utils file""" - +"""Input/Output utility methods.""" +import gzip import json +import logging +from datetime import datetime from pathlib import Path +from typing import List +import snakemake import yaml +from graphviz import Source + +from BALSAMIC import __version__ as balsamic_version +from BALSAMIC.utils.cli import CaptureStdout +from BALSAMIC.utils.exc import BalsamicError + +LOG = logging.getLogger(__name__) + + +def generate_workflow_graph( + config_path: Path, directory_path: Path, snakefile: Path, title: str +) -> None: + """Generate snakemake workflow graph and save it in a PDF file.""" + with CaptureStdout() as graph_dot: + snakemake.snakemake( + snakefile=snakefile, + dryrun=True, + configfiles=[config_path.as_posix()], + printrulegraph=True, + ) + graph_title: str = "_".join(["BALSAMIC", balsamic_version, title]) + graph_dot: str = "".join(graph_dot).replace( + "snakemake_dag {", 'BALSAMIC { label="' + graph_title + '";labelloc="t";' + ) + graph: Source = Source( + graph_dot, + directory=directory_path.as_posix(), + filename=f"{title}_graph", + format="pdf", + engine="dot", + ) + try: + graph_pdf: Path = Path(graph.render()) + LOG.info(f"Workflow graph generated successfully ({graph_pdf.as_posix()})") + except Exception: + LOG.error("Workflow graph generation failed") + raise BalsamicError() -def read_json(json_path) -> dict: +def read_json(json_path: str) -> dict: + """Read JSON file and return a dictionary.""" if Path(json_path).exists(): with open(json_path, "r") as fn: return json.load(fn) else: - raise FileNotFoundError(f"The JSON file {json_path} was not found.") + raise FileNotFoundError(f"The JSON file {json_path} was not found") -def write_json(json_out, output_config): - """Writes JSON format data to an output file""" +def write_json(json_obj: dict, path: str) -> None: + """Write JSON format data to an output file.""" try: - with open(output_config, "w") as fn: - json.dump(json_out, fn, indent=4) + with open(path, "w") as fn: + json.dump(json_obj, fn, indent=4) except OSError as error: - raise OSError(f"Error while writing JSON file: {output_config}, error: {error}") + raise OSError(f"Error while writing JSON file: {path}, error: {error}") -def read_yaml(yaml_path): - """Retrieves data from a yaml file""" +def read_yaml(yaml_path: str) -> dict: + """Read data from a yaml file.""" if Path(yaml_path).exists(): with open(yaml_path, "r") as fn: return yaml.load(fn, Loader=yaml.SafeLoader) else: - raise FileNotFoundError(f"The YAML file {yaml_path} was not found.") + raise FileNotFoundError(f"The YAML file {yaml_path} was not found") + + +def read_vcf_file(vcf_file_path: str) -> List[str]: + """ + Reads a VCF file and returns its contents as a list of lines. + + Args: + vcf_file_path (str): The path to the VCF file to be read.
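# The version rule enforced by validate_cache_version above, restated as a
# standalone sketch: accept the develop cache version or a strict X.X.X triple of
# digits. The helper name and the "develop" literal are illustrative stand-ins
# for CacheVersion.DEVELOP.
def is_valid_cache_version(version: str) -> bool:
    parts = version.split(".")
    return version == "develop" or (len(parts) == 3 and all(p.isdigit() for p in parts))


assert is_valid_cache_version("12.0.2")
assert is_valid_cache_version("develop")
assert not is_valid_cache_version("v12.0.2")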
+ + Returns: + List[str]: A list of strings representing the lines in the VCF file. + """ + vcf_file_path: Path = Path(vcf_file_path) + if vcf_file_path.suffix == ".gz": + with gzip.open(vcf_file_path, "rt") as vcf_file: + return vcf_file.read().splitlines() + return vcf_file_path.read_text().splitlines() + + +def write_finish_file(file_path: str) -> None: + """Write finish file indicating the analysis completion.""" + with open(file_path, mode="w") as finish_file: + finish_file.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M')}\n") diff --git a/BALSAMIC/utils/metrics.py b/BALSAMIC/utils/metrics.py new file mode 100644 index 000000000..a6ad6adb4 --- /dev/null +++ b/BALSAMIC/utils/metrics.py @@ -0,0 +1,7 @@ +"""QC metrics utility methods.""" +from BALSAMIC.models.metrics import MetricValidation + + +def validate_qc_metrics(metrics: dict) -> dict: + """Returns a set of validated QC metrics.""" + return MetricValidation(metrics=metrics).model_dump()["metrics"] diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py deleted file mode 100644 index e08c9e1ce..000000000 --- a/BALSAMIC/utils/models.py +++ /dev/null @@ -1,848 +0,0 @@ -import hashlib -import logging -import os -import re -from datetime import datetime -from pathlib import Path -from typing import Optional, List, Dict, Any - -from pydantic import BaseModel, validator, Field, AnyUrl, root_validator -from pydantic.types import DirectoryPath, FilePath - -from BALSAMIC import __version__ as balsamic_version - -from BALSAMIC.constants.common import ( - BIOINFO_TOOL_ENV, - SEQUENCING_TYPE, - ANALYSIS_TYPES, - ANALYSIS_WORKFLOW, - WORKFLOW_SOLUTION, - MUTATION_CLASS, - MUTATION_TYPE, - VALID_OPS, - GENDER_OPTIONS, -) -from BALSAMIC.constants.reference import VALID_GENOME_VER, VALID_REF_FORMAT - -LOG = logging.getLogger(__name__) - - -class VCFAttributes(BaseModel): - """General purpose filter to manage various VCF attributes - - This class handles three parameters for the purpose filtering variants - based on a tag_values, filter_name, and which field in VCF. - - E.g. 
AD=VCFAttributes(tag_value=5, filter_name="balsamic_low_tumor_ad", field="INFO") - A value of 5 from INFO field and filter_name will be balsamic_low_tumor_ad - - Attributes: - tag_value: float - filter_name: str - field: str - """ - - tag_value: float - filter_name: str - field: str - - -class VarCallerFilter(BaseModel): - """General purpose for variant caller filters - - This class handles attributes and filter for variant callers - - Attributes: - AD: VCFAttributes (required); minimum allelic depth - AF_min: VCFAttributes (optional); minimum allelic fraction - AF_max: VCFAttributes (optional); maximum allelic fraction - MQ: VCFAttributes (optional); minimum mapping quality - DP: VCFAttributes (optional); minimum read depth - pop_freq: VCFAttributes (optional); maximum gnomad allele frequency - pop_freq_umi: VCFAttributes (optional); maximum gnomad_af for UMI workflow - strand_reads: VCFAttributes (optional); minimum strand specific read counts - qss: VCFAttributes (optional); minimum sum of base quality scores - sor: VCFAttributes (optional); minimum symmetrical log-odds ratio - swegen_snv_freq: VCFAttributes (optional); maximum swegen snv allele frequency - swegen_sv_freq: VCFAttributes (optional); maximum swegen sv allele frequency - loqusdb_clinical_snv_freq: VCFAttributes (optional); maximum loqusdb clinical snv allele frequency - loqusdb_clinical_sv_freq: VCFAttributes (optional); maximum loqusdb clinical sv allele frequency - varcaller_name: str (required); variant caller name - filter_type: str (required); filter name for variant caller - analysis_type: str (required); analysis type e.g. tumor_normal or tumor_only - description: str (required); comment section for description - """ - - AD: Optional[VCFAttributes] - AF_min: Optional[VCFAttributes] - AF_max: Optional[VCFAttributes] - MQ: Optional[VCFAttributes] - DP: Optional[VCFAttributes] - pop_freq: Optional[VCFAttributes] - pop_freq_umi: Optional[VCFAttributes] - strand_reads: Optional[VCFAttributes] - qss: Optional[VCFAttributes] - sor: Optional[VCFAttributes] - swegen_snv_freq: Optional[VCFAttributes] - swegen_sv_freq: Optional[VCFAttributes] - loqusdb_clinical_snv_freq: Optional[VCFAttributes] - loqusdb_clinical_sv_freq: Optional[VCFAttributes] - varcaller_name: str - filter_type: str - analysis_type: str - description: str - - -class QCModel(BaseModel): - """Contains settings for quality control and pre-processing - Attributes: - picard_rmdup : Field(bool); whether duplicate removal is to be applied in the workflow - adapter : Field(str(AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT)); adapter sequence to trim - quality_trim : Field(bool); whether quality trimming it to be performed in the workflow - adapter_trim : Field(bool); whether adapter trimming is to be performed in the workflow - umi_trim : Field(bool); whether UMI trimming is to be performed in the workflow - min_seq_length : Field(str(int)); minimum sequence length cutoff for reads - umi_trim_length : Field(str(int)); length of UMI to be trimmed from reads - n_base_limit : Field(str(int)); supports filtering by limiting the N base number - - Raises: - ValueError: - When the input in min_seq_length and umi_trim_length cannot - be interpreted as integer and coerced to string - - """ - - picard_rmdup: bool = False - adapter: str = "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT" - quality_trim: bool = True - adapter_trim: bool = False - umi_trim: bool = False - min_seq_length: int = 25 - umi_trim_length: int = 5 - n_base_limit: int = 50 - - 
@validator("min_seq_length", "umi_trim_length", "n_base_limit") - def coerce_int_as_str(cls, value): - return str(value) - - class Config: - validate_all = True - - -class VarcallerAttribute(BaseModel): - """Holds variables for variant caller software - Attributes: - mutation: str of mutation class - mutation_type: str of mutation type - analysis_type: list of str for analysis types - workflow_solution: list of str for workflows - sequencing_type: list of str for workflows - - Raises: - ValueError: - When a variable other than [somatic, germline] is passed in mutation field - When a variable other than [SNV, CNV, SV] is passed in mutation_type field - - """ - - mutation: str - mutation_type: str = Field(alias="type") - analysis_type: Optional[list] - sequencing_type: Optional[list] - workflow_solution: Optional[list] - - @validator("workflow_solution", check_fields=False) - def workflow_solution_literal(cls, value) -> str: - """Validate workflow solution""" - assert set(value).issubset( - set(WORKFLOW_SOLUTION) - ), f"{value} is not valid workflow solution." - return value - - @validator("analysis_type", check_fields=False) - def annotation_type_literal(cls, value) -> str: - """Validate analysis types""" - assert set(value).issubset( - set(ANALYSIS_TYPES) - ), f"{value} is not a valid analysis type." - return value - - @validator("mutation", check_fields=False) - def mutation_literal(cls, value) -> str: - """Validate mutation class""" - assert value in MUTATION_CLASS, f"{value} is not a valid mutation type." - return value - - @validator("mutation_type", check_fields=False) - def mutation_type_literal(cls, value) -> str: - """Validate mutation type""" - assert value in MUTATION_TYPE, f"{value} is not not a valid mutation class" - return value - - @validator("sequencing_type", check_fields=False) - def sequencing_type_literal(cls, value) -> str: - """Validate sequencing type""" - assert set(value).issubset( - set(SEQUENCING_TYPE) - ), f"{value} is not not a valid sequencing type." 
- return value - - -class VCFModel(BaseModel): - """Contains VCF config""" - - vardict: VarcallerAttribute - tnscope: VarcallerAttribute - dnascope: VarcallerAttribute - tnscope_umi: VarcallerAttribute - manta_germline: VarcallerAttribute - manta: VarcallerAttribute - dellysv: VarcallerAttribute - cnvkit: VarcallerAttribute - ascat: VarcallerAttribute - dellycnv: VarcallerAttribute - tiddit: VarcallerAttribute - cnvpytor: VarcallerAttribute - svdb: VarcallerAttribute - - -class AnalysisModel(BaseModel): - """Pydantic model containing workflow variables - - Attributes: - - case_id : Field(required); string case identifier - gender: Field(required); string case gender - analysis_type : Field(required); string literal [single, paired, pon] - single : if only tumor samples are provided - paired : if both tumor and normal samples are provided - pon : panel of normal analysis - sequencing_type : Field(required); string literal [targeted, wgs] - targeted : if capture kit was used to enrich specific genomic regions - wgs : if whole genome sequencing was performed - analysis_workflow: Field(required); string literal [balsamic, balsamic-qc, balsamic-umi] - balsamic: execute balsamic workflow - balsamic-qc: execute balsamic qc-only workflow - balsamic-umi: execute balsamic along with UMIworkflow for panels - analysis_dir : Field(required); existing path where to save files - fastq_path : Field(optional); Path where fastq files will be stored - script : Field(optional); Path where snakemake scripts will be stored - log : Field(optional); Path where logs will be saved - result : Field(optional); Path where BALSAMIC output will be stored - benchmark : Field(optional); Path where benchmark report will be stored - dag : Field(optional); Path where DAG graph of workflow will be stored - BALSAMIC_version : Field(optional); Current version of BALSAMIC - config_creation_date : Field(optional); Timestamp when config was created - - Raises: - ValueError: - When gender is set to any other than [female, male] - When analysis_type is set to any value other than [single, paired, pon] - When sequencing_type is set to any value other than [wgs, targeted] - When analysis_workflow is set to any other than [balsamic, balsamic-qc, balsamic-umi] - """ - - case_id: str - analysis_type: str - gender: Optional[str] - sequencing_type: str - analysis_workflow: str - analysis_dir: DirectoryPath - fastq_path: Optional[DirectoryPath] - script: Optional[DirectoryPath] - log: Optional[DirectoryPath] - result: Optional[DirectoryPath] - benchmark: Optional[DirectoryPath] - dag: Optional[FilePath] - BALSAMIC_version: str = balsamic_version - config_creation_date: Optional[str] - - class Config: - validate_all = True - - @validator("analysis_type") - def analysis_type_literal(cls, value) -> str: - balsamic_analysis_types = ANALYSIS_TYPES - if value not in balsamic_analysis_types: - raise ValueError( - f"Provided analysis type ({value}) not supported in BALSAMIC!" - ) - return value - - @validator("gender") - def gender_literal(cls, value, values) -> Optional[str]: - if value not in GENDER_OPTIONS and values.get("analysis_type") != "pon": - raise ValueError( - f"Provided gender type ({value}) is not supported in BALSAMIC!" - ) - return value - - @validator("sequencing_type") - def sequencing_type_literal(cls, value) -> str: - balsamic_sequencing_types = SEQUENCING_TYPE - if value not in balsamic_sequencing_types: - raise ValueError( - f"Provided sequencing type ({value}) not supported in BALSAMIC!" 
- ) - return value - - @validator("analysis_workflow", check_fields=True) - def analysis_workflow_literal(cls, value) -> str: - balsamic_analysis_workflow = ANALYSIS_WORKFLOW - if value not in balsamic_analysis_workflow: - raise ValueError( - f"Provided analysis workflow ({value} not supported in BALSAMIC" - ) - return value - - @validator("analysis_dir") - def dirpath_always_abspath(cls, value) -> str: - return Path(value).resolve().as_posix() - - @validator("log") - def parse_analysis_to_log_path(cls, value, values, **kwargs) -> str: - return ( - Path(values.get("analysis_dir"), values.get("case_id"), "logs").as_posix() - + "/" - ) - - @validator("fastq_path") - def parse_analysis_to_fastq_path(cls, value, values, **kwargs) -> str: - return ( - Path( - values.get("analysis_dir"), values.get("case_id"), "analysis", "fastq" - ).as_posix() - + "/" - ) - - @validator("script") - def parse_analysis_to_script_path(cls, value, values, **kwargs) -> str: - return ( - Path( - values.get("analysis_dir"), values.get("case_id"), "scripts" - ).as_posix() - + "/" - ) - - @validator("result") - def parse_analysis_to_result_path(cls, value, values, **kwargs) -> str: - return Path( - values.get("analysis_dir"), values.get("case_id"), "analysis" - ).as_posix() - - @validator("benchmark") - def parse_analysis_to_benchmark_path(cls, value, values, **kwargs) -> str: - return ( - Path( - values.get("analysis_dir"), values.get("case_id"), "benchmarks" - ).as_posix() - + "/" - ) - - @validator("dag") - def parse_analysis_to_dag_path(cls, value, values, **kwargs) -> str: - return ( - Path( - values.get("analysis_dir"), values.get("case_id"), values.get("case_id") - ).as_posix() - + f"_BALSAMIC_{balsamic_version}_graph.pdf" - ) - - @validator("config_creation_date") - def datetime_as_string(cls, value): - return datetime.now().strftime("%Y-%m-%d %H:%M") - - -class AnalysisPonModel(AnalysisModel): - """Pydantic model containing PON workflow variables - - Attributes: - pon_version: Field(str); version of the PON generated file - """ - - pon_version: str - - @validator("pon_version") - def validate_pon_version(cls, value): - """Checks that the version matches the following syntax: v""" - - match = re.fullmatch("^v[1-9]\d*$", value) - if not match: - raise ValueError( - f"The provided version ({value}) does not follow the defined syntax (v)" - ) - - return value - - -class SampleInstanceModel(BaseModel): - """Holds attributes for samples used in analysis - - Attributes: - file_prefix : Field(str); basename of sample pair - sample_type : Field(str; alias=type); type of sample [tumor, normal] - sample_name : Field(str); Internal ID of sample to use in deliverables - readpair_suffix : Field(List); currently always set to [1, 2] - - Raises: - ValueError: - When sample_type is set ot any value other than [tumor, normal] - - """ - - file_prefix: str - sample_name: Optional[str] - sample_type: str = Field(alias="type") - readpair_suffix: List[str] = ["1", "2"] - - @validator("sample_type") - def sample_type_literal(cls, value): - balsamic_sample_types = ["tumor", "normal"] - if value not in balsamic_sample_types: - raise ValueError( - f"Provided sample type ({value}) not supported in BALSAMIC!" 
- ) - return value - - @validator("sample_name") - def set_sample_id_if_missing_value(cls, value, values, **kwargs): - if value: - return value - return values.get("file_prefix") - - -class PanelModel(BaseModel): - """Holds attributes of PANEL BED file if provided - Attributes: - capture_kit : Field(str(Path)); string representation of path to PANEL BED file - chrom : Field(list(str)); list of chromosomes in PANEL BED - pon_cnn: Field(optional); Path where PON reference .cnn file is stored - - Raises: - ValueError: - When capture_kit argument is set, but is not a valid path - - """ - - capture_kit: Optional[FilePath] - chrom: Optional[List[str]] - pon_cnn: Optional[FilePath] - - @validator("capture_kit") - def path_as_abspath_str(cls, value): - return Path(value).resolve().as_posix() - - @validator("pon_cnn") - def pon_abspath_as_str(cls, value): - if value: - return Path(value).resolve().as_posix() - return None - - -class PonBalsamicConfigModel(BaseModel): - """Summarizes config models in preparation for export - - Attributes: - QC : Field(QCmodel); variables relevant for fastq preprocessing and QC - reference : Field(Dict); dictionary containing paths to reference genome files - panel : Field(PanelModel(optional)); variables relevant to PANEL BED if capture kit is used - singularity : Field(Path); path to singularity container of BALSAMIC - rule_directory : Field(Path(RULE_DIRECTORY)); path where snakemake rules can be found - bioinfo_tools : Field(dict); dictionary of bioinformatics software and which conda/container they are in - bioinfo_tools_version : Field(dict); dictionary of bioinformatics software and their versions used for the analysis - """ - - QC: QCModel - analysis: AnalysisPonModel - reference: Dict[str, Path] - singularity: DirectoryPath - bioinfo_tools: dict - bioinfo_tools_version: dict - panel: Optional[PanelModel] - - @validator("reference") - def abspath_as_str(cls, value): - for k, v in value.items(): - value[k] = Path(v).resolve().as_posix() - return value - - @validator("singularity") - def transform_path_to_dict(cls, value): - return {"image": Path(value).resolve().as_posix()} - - -class BalsamicConfigModel(BaseModel): - """Summarizes config models in preparation for export - - Attributes: - QC : Field(QCmodel); variables relevant for fastq preprocessing and QC - vcf : Field(VCFmodel); variables relevant for variant calling pipeline - samples : Field(Dict); dictionary containing samples submitted for analysis - reference : Field(Dict); dictionary containing paths to reference genome files - panel : Field(PanelModel(optional)); variables relevant to PANEL BED if capture kit is used - bioinfo_tools : Field(dict); dictionary of bioinformatics software and which conda/container they are in - bioinfo_tools_version : Field(dict); dictionary of bioinformatics software and their versions used for the analysis - singularity : Field(Path); path to singularity container of BALSAMIC - background_variants: Field(Path(optional)); path to BACKGROUND VARIANTS for UMI - rule_directory : Field(Path(RULE_DIRECTORY)); path where snakemake rules can be found - """ - - QC: QCModel - vcf: Optional[VCFModel] - analysis: AnalysisModel - samples: Dict[str, SampleInstanceModel] - reference: Dict[str, Path] - singularity: DirectoryPath - background_variants: Optional[FilePath] - bioinfo_tools: dict - bioinfo_tools_version: dict - panel: Optional[PanelModel] - - @validator("reference") - def abspath_as_str(cls, value): - for k, v in value.items(): - value[k] = Path(v).resolve().as_posix() - 
return value - - @validator("singularity") - def transform_path_to_dict(cls, value): - return {"image": Path(value).resolve().as_posix()} - - @validator("background_variants") - def fl_abspath_as_str(cls, value): - if value: - return Path(value).resolve().as_posix() - return None - - -class ReferenceUrlsModel(BaseModel): - """Defines a basemodel for reference urls - - This class handles four attributes for each reference url. Each attribute defines url, type of file, and gzip status. - - Attributes: - url: defines the url to access file. Essentially it will be used to download file locally. It should match url_type://... - file_type: describes file type. Accepted values are VALID_REF_FORMAT constant - gzip: gzip status. Binary: True or False - genome_version: genome version matching the content of the file. Accepted values are VALID_GENOME_VER constant - - Raises: - ValidationError: When it can't validate values matching above attributes - - """ - - url: AnyUrl - file_type: str - gzip: bool = True - genome_version: str - output_file: Optional[str] - output_path: Optional[str] - secret: Optional[str] - - @validator("file_type") - def check_file_type(cls, value) -> str: - """Validate file format according to constants""" - assert value in VALID_REF_FORMAT, f"{value} not a valid reference file format." - return value - - @validator("genome_version") - def check_genome_ver(cls, value) -> str: - """Validate genome version according constants""" - assert value in VALID_GENOME_VER, f"{value} not a valid genome version." - return value - - @property - def get_output_file(self): - """return output file full path""" - output_file_path = Path(self.output_path, self.output_file).as_posix() - return output_file_path - - @property - def write_md5(self): - """calculate md5 for first 4kb of file and write to file_name.md5""" - hash_md5 = hashlib.md5() - output_file = Path(self.output_path, self.output_file) - if not output_file.is_file(): - raise FileNotFoundError(f"{output_file.as_posix()} file does not exist") - - with open(output_file.as_posix(), "rb") as fh: - for chunk in iter(lambda: fh.read(4096), b""): - hash_md5.update(chunk) - - with open(output_file.as_posix() + ".md5", "w") as fh: - fh.write("{} {}\n".format(output_file.as_posix(), hash_md5.hexdigest())) - - -class ReferenceMeta(BaseModel): - """Defines a basemodel for all reference file - - This class defines a meta for various reference files. Only reference_genome is mandatory. - - Attributes: - basedir: str for base directory which will be appended to all ReferenceUrlsModel fields - reference_genome: ReferenceUrlsModel. Required field for reference genome fasta file - dbsnp: ReferenceUrlsModel. Optional field for dbSNP vcf file - hc_vcf_1kg: ReferenceUrlsModel. Optional field for high confidence 1000Genome vcf - mills_1kg: ReferenceUrlsModel. Optional field for Mills' high confidence indels vcf - known_indel_1kg: ReferenceUrlsModel. Optional field for 1000Genome known indel vcf - vcf_1kg: ReferenceUrlsModel. Optional field for 1000Genome all SNPs - wgs_calling: ReferenceUrlsModel. Optional field for wgs calling intervals - genome_chrom_size: ReferenceUrlsModel. Optional field for geneome's chromosome sizes - gnomad_variant: ReferenceUrlsModel. Optional gnomad variants (non SV) as vcf - cosmicdb: ReferenceUrlsModel. Optional COSMIC database's variants as vcf - refgene_txt: ReferenceUrlsModel. Optional refseq's gene flat format from UCSC - refgene_sql: ReferenceUrlsModel. 
Optional refseq's gene sql format from UCSC - rankscore: ReferenceUrlsModel. Optional rankscore model - access_regions: ReferenceUrlsModel. Optional field for accessible genome regions - delly_exclusion: ReferenceUrlsModel. Optional field for genome exclusion regions - delly_mappability: ReferenceUrlsModel. Optional field for genome mappability - ascat_gccorrection: ReferenceUrlsModel. Optional field for genome gc correction bins - ascat_chryloci: ReferenceUrlsModel. Optional field for chromosome Y loci - clinvar: ReferenceUrlsModel. Optional field for clinvar reference - somalier_sites: ReferenceUrlsModel. Optional field for somalier sites vcf - """ - - basedir: str = "" - reference_genome: ReferenceUrlsModel - dbsnp: Optional[ReferenceUrlsModel] - hc_vcf_1kg: Optional[ReferenceUrlsModel] - mills_1kg: Optional[ReferenceUrlsModel] - known_indel_1kg: Optional[ReferenceUrlsModel] - vcf_1kg: Optional[ReferenceUrlsModel] - wgs_calling: Optional[ReferenceUrlsModel] - genome_chrom_size: Optional[ReferenceUrlsModel] - gnomad_variant: Optional[ReferenceUrlsModel] - gnomad_variant_index: Optional[ReferenceUrlsModel] - cosmicdb: Optional[ReferenceUrlsModel] - refgene_txt: Optional[ReferenceUrlsModel] - refgene_sql: Optional[ReferenceUrlsModel] - rankscore: Optional[ReferenceUrlsModel] - access_regions: Optional[ReferenceUrlsModel] - delly_exclusion: Optional[ReferenceUrlsModel] - delly_mappability: Optional[ReferenceUrlsModel] - delly_mappability_gindex: Optional[ReferenceUrlsModel] - delly_mappability_findex: Optional[ReferenceUrlsModel] - ascat_gccorrection: Optional[ReferenceUrlsModel] - ascat_chryloci: Optional[ReferenceUrlsModel] - clinvar: Optional[ReferenceUrlsModel] - somalier_sites: Optional[ReferenceUrlsModel] - - @validator("*", pre=True) - def validate_path(cls, value, values, **kwargs): - """validate and append path in ReferenceUrlsModel fields with basedir""" - if isinstance(value, str): - output_value = value - else: - if "output_path" in value: - value["output_path"] = Path( - values["basedir"], value["output_path"] - ).as_posix() - output_value = ReferenceUrlsModel.parse_obj(value) - else: - output_value = value - - return output_value - - -class UMIParamsCommon(BaseModel): - """This class defines the common params settings used as constants across various rules in UMI workflow. - - Attributes: - align_format: str (required); output alignment format. eg. 'BAM' - align_header: str (required); header line appended to the aligned BAM output - align_intbases: int; input bases in each batch regardless of threads, for reproducibility - filter_tumor_af: float (required); settings to filter minimum allelic frequency - """ - - align_header: str - align_intbases: int - filter_tumor_af: float - - -class UMIParamsUMIextract(BaseModel): - """This class defines the params settings used as constants in UMI workflow-rule umextract. - - Attributes: - read_structure: str (required); settings to define UMI read structure - """ - - read_structure: str = "-d, 'rs1,rs2'" - - -class UMIParamsConsensuscall(BaseModel): - """This class defines the params settings used as constants in UMI workflow-rule consensuscall. - - Attributes: - align_format: str (required); output alignment format. eg. 
'BAM' - filter_minreads: str (required); settings to filter consensus tags based on group size - tag: str; Logic UMI tag - """ - - align_format: str = "BAM" - filter_minreads: str = "3,1,1" - tag: str = "XR" - - -class UMIParamsTNscope(BaseModel): - """This class defines the params settings used as constants in UMI workflow- rule tnscope. - - Attributes: - algo: str; choice of sentieon varcall algorithm. eg. 'TNscope' - disable_detect: str; disable variant detector. eg 'sv' or 'snv_indel' - filter_tumor_af: float (required); minimum allelic frequency to detect - min_tumorLOD: int (required); minimum tumor log odds in the final call of variants - init_tumorLOD: float (required); minimum tumor log odds in the initial pass calling variants - error_rate: int (required); allow error-rate to consider in calling - prunefactor: int (required); pruning factor in the kmer graph - padding: int(required); amount to pad bed interval regions - """ - - algo: str - init_tumorLOD: float - min_tumorLOD: int - error_rate: int - prunefactor: int - padding: int - disable_detect: str - - -class ParamsVardict(BaseModel): - """This class defines the params settings used as constants in vardict rule. - - Attributes: - allelic_frequency: float (required); minimum allelic frequency to detect - max_pval: float (required); the maximum p-value. Vardict default: 0.05 - max_mm: float (required); the maximum mean mismatches allowed. Vardict default: 5.25 - column_info: str (required); set of vardict filters for passing final variants - """ - - allelic_frequency: float - max_pval: float - max_mm: float - column_info: str - - -class ParamsCommon(BaseModel): - """This class defines the common params settings used as constants across various rules in balsamic workflow. - - Attributes: - pcr_model: str (required). PCR indel model used to weed out false positive indels. Eg: none- PCR free samples. - align_header: str (required); header line appended to the aligned BAM output - min_mapq: int (required); minimum mapping quality score. Eg: 20- probability of mapping random read at 99% accuracy - picard_fixmate: str (required), fix read mate information in bam file - picard_RG_normal: str (required); replace readgroups in normal bam file - picard_RG_tumor: str (required); replace readgroups in tumor bam file - """ - - align_header: str - pcr_model: str - min_mapq: int - picard_fixmate: str - picard_RG_normal: str - picard_RG_tumor: str - - -class ParamsVEP(BaseModel): - """This class defines the params settings used as constants in vep rule. - - Attributes: - vep_filters: str (required); set of choosen options for processing vep annotated vcf file - """ - - vep_filters: str - - -class BalsamicWorkflowConfig(BaseModel): - """Defines set of rules in balsamic workflow - - Handles attributes for corresponding rules. 
- - Attributes: - common: global params defined across all rules in balsamic workflow - umicommon: global params defined across specific rules in UMI workflow - vep: global params defined in the rule vep - vardict: params defined in the rule vardict - umiextract : params defined in the rule sentieon_umiextract - umiconsensuscall: params defined in the rule sentieon_consensuscall - tnscope_umi: params defined in the rule sentieon_tnscope_umi - """ - - common: ParamsCommon - vardict: ParamsVardict - vep: ParamsVEP - umicommon: UMIParamsCommon - umiextract: UMIParamsUMIextract - umiconsensuscall: UMIParamsConsensuscall - tnscope_umi: UMIParamsTNscope - - -class MetricConditionModel(BaseModel): - """Defines the metric condition model - - Attributes: - norm: string (optional); validation condition - threshold: float (optional); validation cut off - """ - - norm: Optional[str] = None - threshold: Optional[float] = None - - -class MetricModel(BaseModel): - """Defines the metric attributes model - - Attributes: - header: str (optional); data - id: str (required); unique sample identifier (sample_id, case_id or project_id) - input: str (required); input file - name: str (required); metric name - step: str (required); step that generated the metric - value: Any (required and can take None as a value); metric value - condition: MetricConditionModel (required and can take None as a value); metric validation condition - """ - - header: Optional[str] - id: str - input: str - name: str - step: str - value: Any = ... - condition: Optional[MetricConditionModel] = ... - - @validator("name") - def validate_name(cls, name, values): - """Updates the name if the source is FastQC""" - - if "fastqc-percent_duplicates" in name: - return "PERCENT_DUPLICATION_R" + values["input"].split("_")[-2] - - return name - - -class MetricValidationModel(BaseModel): - """Defines the metric validation model - - Attributes: - metrics: List[MetricModel] (required); metric model to validate - - Raises: - ValueError: when a metric does not meet its validation requirements - """ - - metrics: List[MetricModel] - - @validator("metrics", each_item=True) - def validate_metrics(cls, metric): - """Checks if a metric meets its filtering condition""" - - if metric.condition and not VALID_OPS[metric.condition.norm]( - metric.value, metric.condition.threshold - ): - raise ValueError( - f"QC metric {metric.name}: {metric.value} validation has failed. " - f"(Condition: {metric.condition.norm} {metric.condition.threshold}, ID: {metric.id})." - ) - - LOG.info(f"QC metric {metric.name}: {metric.value} meets its condition.") - - return metric diff --git a/BALSAMIC/utils/pdf_report.py b/BALSAMIC/utils/pdf_report.py new file mode 100644 index 000000000..41b96ec73 --- /dev/null +++ b/BALSAMIC/utils/pdf_report.py @@ -0,0 +1,83 @@ +"""PDF report generation utility methods.""" +from pathlib import Path + +import pdfkit + + +def get_table_html(html_table: str, table_name: str) -> str: + """Return HTML-rendered content with the provided HTML table.""" + return f""" + + + + + + +

+ <html> + <head> + <title>{table_name}</title> + </head> + <body> + <h2>{table_name}</h2> + {html_table} + </body> + </html> + """ + + +def get_image_html(image_path: Path, image_name: str) -> str: + """Return HTML-rendered content with the provided image.""" + return f""" + <html> + <head> + <title>{image_name}</title> + </head> + <body> + <h2>{image_name}</h2> + <div> + <img src="{image_path}" alt="{image_name}"> + </div> + </body> + </html>
+ + + """ + + +def html_to_pdf( + html_string: str, + pdf_path: str, + orientation: str = "landscape", + margin_top: str = "1.5cm", + margin_bottom: str = "1cm", + margin_left: str = "1cm", + margin_right: str = "1cm", + zoom: int = 1, +) -> None: + """Create a PDF file from the content of an HTML string.""" + pdfkit.from_string( + input=html_string, + output_path=pdf_path, + options={ + "page-size": "A4", + "encoding": "UTF-8", + "orientation": orientation, + "zoom": zoom, + "margin-top": margin_top, + "margin-bottom": margin_bottom, + "margin-left": margin_left, + "margin-right": margin_right, + "enable-local-file-access": None, + }, + ) diff --git a/BALSAMIC/utils/qc_metrics.py b/BALSAMIC/utils/qc_metrics.py deleted file mode 100644 index 17d4b281f..000000000 --- a/BALSAMIC/utils/qc_metrics.py +++ /dev/null @@ -1,7 +0,0 @@ -from BALSAMIC.utils.models import MetricValidationModel - - -def validate_qc_metrics(metrics: dict) -> dict: - """Returns a set of validated QC metrics""" - - return MetricValidationModel(metrics=metrics).dict()["metrics"] diff --git a/BALSAMIC/utils/rule.py b/BALSAMIC/utils/rule.py index 9897d9980..847409444 100644 --- a/BALSAMIC/utils/rule.py +++ b/BALSAMIC/utils/rule.py @@ -1,34 +1,27 @@ +import logging import os import re -import toml -import logging from pathlib import Path +from typing import Dict + import snakemake -from BALSAMIC.utils.cli import get_file_extension -from BALSAMIC.utils.cli import find_file_index -from BALSAMIC.constants.common import ( - MUTATION_TYPE, - MUTATION_CLASS, - SEQUENCING_TYPE, - WORKFLOW_SOLUTION, - ANALYSIS_TYPES, +import toml +from BALSAMIC.constants.paths import SCRIPT_DIR + +from BALSAMIC.constants.analysis import ( + AnalysisType, + MutationOrigin, + MutationType, + SequencingType, + WorkflowSolution, ) -from BALSAMIC.utils.exc import WorkflowRunError, BalsamicError +from BALSAMIC.models.config import ConfigModel +from BALSAMIC.utils.cli import find_file_index, get_file_extension +from BALSAMIC.utils.exc import WorkflowRunError LOG = logging.getLogger(__name__) -def get_chrom(panelfile): - """ - input: a panel bedfile - output: list of chromosomes in the bedfile - """ - - lines = [line.rstrip("\n") for line in open(panelfile, "r")] - chrom = list(set([s.split("\t")[0] for s in lines])) - return chrom - - def get_vcf(config, var_caller, sample): """ input: BALSAMIC config file @@ -39,7 +32,7 @@ def get_vcf(config, var_caller, sample): for v in var_caller: for s in sample: vcf.append( - config["vcf"][v]["type"] + config["vcf"][v]["mutation_type"] + "." + config["vcf"][v]["mutation"] + "." 
@@ -76,24 +69,24 @@ def get_variant_callers( """ valid_variant_callers = list() - if mutation_type not in MUTATION_TYPE: + if mutation_type not in set(MutationType): raise WorkflowRunError(f"{mutation_type} is not a valid mutation type.") - if workflow_solution not in WORKFLOW_SOLUTION: + if workflow_solution not in set(WorkflowSolution): raise WorkflowRunError(f"{workflow_solution} is not a valid workflow solution.") - if analysis_type not in ANALYSIS_TYPES: + if analysis_type not in set(AnalysisType): raise WorkflowRunError(f"{analysis_type} is not a valid analysis type.") - if mutation_class not in MUTATION_CLASS: + if mutation_class not in set(MutationOrigin): raise WorkflowRunError(f"{mutation_class} is not a valid mutation class.") - if sequencing_type not in SEQUENCING_TYPE: + if sequencing_type not in set(SequencingType): raise WorkflowRunError(f"{sequencing_type} is not a valid sequencing type.") for variant_caller_name, variant_caller_params in config["vcf"].items(): if ( - mutation_type in variant_caller_params.get("type") + mutation_type in variant_caller_params.get("mutation_type") and mutation_class in variant_caller_params.get("mutation") and analysis_type in variant_caller_params.get("analysis_type") and workflow_solution in variant_caller_params.get("workflow_solution") @@ -133,31 +126,14 @@ def get_capture_kit(config): return None -def get_sample_type(sample, bio_type): +def get_sample_type_from_sample_name(config, sample_name): """ - input: sample dictionary from BALSAMIC's config file - output: list of sample type id - """ - - type_id = [] - for sample_id in sample: - if sample[sample_id]["type"] == bio_type: - type_id.append(sample_id) - return type_id - - -def get_sample_type_from_prefix(config, sample): - """ - input: case config file from BALSAMIC + input: case config file from BALSAMIC, and sample_name output: sample type """ - - try: - return config["samples"][sample]["type"] - except KeyError: - raise KeyError( - f"The provided sample prefix {sample} does not exist for {config['analysis']['case_id']}." - ) + for sample in config["samples"]: + if sample_name == sample["name"]: + return sample["type"] def get_result_dir(config): @@ -169,30 +145,9 @@ def get_result_dir(config): return config["analysis"]["result"] -def get_picard_mrkdup(config): - """ - input: sample config file output from BALSAMIC - output: mrkdup or rmdup strings - """ - - picard_str = "mrkdup" - - if "picard_rmdup" in config["QC"]: - if config["QC"]["picard_rmdup"] == True: - picard_str = "rmdup" - - return picard_str - - -def get_script_path(script_name: str): - """ - Retrieves script path where name is matching {{script_name}}. 
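# The membership checks above work because the workflow enums are str-valued, so
# a plain string compares equal to a member and "value in set(EnumClass)"
# validates user input directly. A self-contained sketch (the member values are
# assumed to mirror BALSAMIC.constants.analysis):
from enum import Enum


class MutationType(str, Enum):
    SNV = "SNV"
    SV = "SV"
    CNV = "CNV"


assert "SNV" in set(MutationType)
assert "INDEL" not in set(MutationType)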
- """ - - p = Path(__file__).parents[1] - script_path = str(Path(p, "assets/scripts", script_name)) - - return script_path +def get_script_path(script_name: str) -> str: + """Return the path to the script matching the file name.""" + return Path(SCRIPT_DIR, script_name).as_posix() def get_threads(cluster_config, rule_name="__default__"): @@ -216,7 +171,6 @@ def get_rule_output(rules, rule_name, output_file_wildcards): output_files = list() # Extract housekeeper tags from rule's params value housekeeper = getattr(rules, rule_name).params.housekeeper_id - # Get temp_output files temp_files = getattr(rules, rule_name).rule.temp_output @@ -226,7 +180,6 @@ def get_rule_output(rules, rule_name, output_file_wildcards): for output_name in output_file_names: output_file = getattr(rules, rule_name).output[output_name] - LOG.debug("Found following potential output files: {}".format(output_file)) for file_wildcard_list in snakemake.utils.listfiles(output_file): file_to_store = file_wildcard_list[0] # Do not store file if it is a temp() output @@ -240,7 +193,6 @@ def get_rule_output(rules, rule_name, output_file_wildcards): file_extension = get_file_extension(file_to_store) file_to_store_index = find_file_index(file_to_store) - base_tags = list(file_wildcard_list[1]) base_tags.append(output_name) @@ -265,12 +217,12 @@ def get_rule_output(rules, rule_name, output_file_wildcards): composit_tag = "-".join([housekeeper["tags"], output_name]) file_tags = base_tags + [composit_tag] - # replace all instsances of "_" with "-", since housekeeper doesn't like _ + # replace all instances of "_" with "-", since housekeeper doesn't like _ file_tags = [t.replace("_", "-") for t in file_tags] - LOG.debug("Found the following delivery id: {}".format(delivery_id)) - LOG.debug("Found the following file to store: {}".format(file_to_store)) - LOG.debug("Above file is in the following rule: {}".format(rule_name)) + LOG.info("Found the following delivery id: {}".format(delivery_id)) + LOG.info("Found the following file to store: {}".format(file_to_store)) + LOG.info("Above file is in the following rule: {}".format(rule_name)) output_files.append( ( file_to_store, @@ -330,36 +282,17 @@ def get_delivery_id( return delivery_id -def get_reference_output_files( - reference_files_dict: dict, file_type: str, gzip: bool = None -) -> list: - """Returns list of files matching a file_type from reference files +def get_pon_cnn(config: dict) -> str: + """Returns path of pon_cnn for TGA workflow Args: - reference_files_dict: A validated dict model from reference - file_type: a file type string, e.g. 
vcf, fasta - gzip: a list of boolean + config: a config dictionary Returns: - ref_vcf_list: list of file_type files that are found in reference_files_dict - """ - ref_vcf_list = [] - for reference_key, reference_item in reference_files_dict.items(): - if reference_item["file_type"] == file_type: - if gzip is not None and reference_item["gzip"] != gzip: - continue - ref_vcf_list.append(reference_item["output_file"]) - return ref_vcf_list - + Path of the pon_cnn file generated by CNVkit -def get_pon_samples(fastq_dir): - """Given dirpath containing list of PON fastq files - Returns list of sample names """ - samples = [ - (f.split("_1"))[0] for f in os.listdir(fastq_dir) if f.endswith("_R_1.fastq.gz") - ] - return samples + return config["panel"]["pon_cnn"] if "pon_cnn" in config["panel"] else " " def get_clinical_snv_observations(config: dict) -> str: @@ -375,6 +308,32 @@ def get_clinical_snv_observations(config: dict) -> str: return Path(config["reference"]["clinical_snv_observations"]).as_posix() +def get_cancer_germline_snv_observations(config: dict) -> str: + """Returns path for cancer germline snv observations + + Args: + config: a config dictionary + + Returns: + Path for cancer-germline-snv-observations vcf file + + """ + return Path(config["reference"]["cancer_germline_snv_observations"]).as_posix() + + +def get_cancer_somatic_snv_observations(config: dict) -> str: + """Returns path for cancer somatic snv observations + + Args: + config: a config dictionary + + Returns: + Path for cancer-somatic-snv-observations vcf file + + """ + return Path(config["reference"]["cancer_somatic_snv_observations"]).as_posix() + + def get_swegen_snv(config: dict) -> str: """Returns path for swegen snv frequencies @@ -401,29 +360,30 @@ def get_clinical_sv_observations(config: dict) -> str: return Path(config["reference"]["clinical_sv_observations"]).as_posix() -def get_swegen_sv(config: dict) -> str: - """Returns path for swegen sv frequencies +def get_somatic_sv_observations(config: dict) -> str: + """Returns path for somatic sv observations Args: config: a config dictionary Returns: - Path for swegen_sv vcf file + Path for cancer_somatic_sv_observations vcf file """ - return Path(config["reference"]["swegen_sv_frequency"]).as_posix() + return Path(config["reference"]["cancer_somatic_sv_observations"]).as_posix() -def get_toml(annotation: dict) -> str: - """Returns annotation in toml format +def get_swegen_sv(config: dict) -> str: + """Returns path for swegen sv frequencies Args: - annotation: a dict containing annotation resource + config: a config dictionary Returns: - toml_annotation: a string in toml format + Path for swegen_sv vcf file + """ - return toml.dumps(annotation) + return Path(config["reference"]["swegen_sv_frequency"]).as_posix() def dump_toml(annotations: list) -> str: @@ -437,5 +397,67 @@ def dump_toml(annotations: list) -> str: """ toml_annotations = "" for annotation in annotations: - toml_annotations += get_toml(annotation) + toml_annotations += toml.dumps(annotation) return toml_annotations + + +def get_fastp_parameters(config_model: ConfigModel) -> Dict: + """Returns a dictionary with parameters for the fastp rules. + + Args: + config_model: The case config json instantiated as the ConfigModel + + Returns: + fastp_parameters_dict: Dictionary with two or three parameter lists, depending on whether the sequencing type includes UMIs.
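+ + Example: + For a targeted (non-WGS) case with default QC settings (quality_trim on, adapter_trim off) the returned dictionary is expected to look like the sketch below; the exact length values follow the case config: + {"fastp_trim_umi": ["--umi", "--umi_loc per_read", "--umi_len", "5", "--umi_prefix", "UMI", "--dont_eval_duplication"], + "fastp_trim_qual": ["--trim_tail1", "1", "--n_base_limit", "50", "--length_required", "25", "--low_complexity_filter", "--trim_poly_g", "--dont_eval_duplication"], + "fastp_trim_adapter": ["--disable_adapter_trimming"]}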
+ + """ + fastp_parameters_dict = {} + + # Add UMI trimming for TGA + if config_model.analysis.sequencing_type != SequencingType.WGS: + fastp_parameters_dict["fastp_trim_umi"] = [ + "--umi", + "--umi_loc per_read", + "--umi_len", + config_model.QC.umi_trim_length, + "--umi_prefix", + "UMI", + "--dont_eval_duplication", + ] + + # Add quality and adapter trimming parameters + fastp_trim_qual = list() + fastp_trim_adapter = list() + if config_model.QC.quality_trim: + fastp_trim_qual.extend( + [ + "--trim_tail1", + "1", + "--n_base_limit", + "50", + "--length_required", + config_model.QC.min_seq_length, + "--low_complexity_filter", + "--trim_poly_g", + ] + ) + else: + fastp_trim_qual.extend( + [ + "--disable_quality_filtering", + "--disable_length_filtering", + "--disable_trim_poly_g", + ] + ) + + if not config_model.QC.adapter_trim: + fastp_trim_adapter.extend(["--disable_adapter_trimming"]) + else: + fastp_trim_adapter.extend(["--detect_adapter_for_pe"]) + + fastp_trim_qual.extend(["--dont_eval_duplication"]) + + fastp_parameters_dict["fastp_trim_qual"] = fastp_trim_qual + fastp_parameters_dict["fastp_trim_adapter"] = fastp_trim_adapter + + return fastp_parameters_dict diff --git a/BALSAMIC/utils/scheduler.py b/BALSAMIC/utils/scheduler.py index 4d02e28f7..b1c0e3b3f 100644 --- a/BALSAMIC/utils/scheduler.py +++ b/BALSAMIC/utils/scheduler.py @@ -87,7 +87,6 @@ def __init__(self): self.time = None def build_cmd(self): - resource_params = "" depend = "" qsub_options = list() diff --git a/BALSAMIC/utils/utils.py b/BALSAMIC/utils/utils.py new file mode 100644 index 000000000..2f7819160 --- /dev/null +++ b/BALSAMIC/utils/utils.py @@ -0,0 +1,18 @@ +"""Helper functions.""" +from pathlib import Path +from typing import Dict + + +def remove_unnecessary_spaces(string: str) -> str: + """Return a string removing unnecessary empty spaces.""" + return " ".join(string.split()) + + +def get_relative_paths_dict(base_path: Path, data: Dict[str, Path]) -> Dict[str, str]: + """Return a dictionary containing relative paths with respect to a given base path.""" + return {key: path.relative_to(base_path).as_posix() for key, path in data.items()} + + +def get_absolute_paths_dict(base_path: Path, data: Dict[str, Path]) -> Dict[str, Path]: + """Return a dictionary containing absolute resolved paths with respect to a given base path.""" + return {key: Path(base_path, path).resolve() for key, path in data.items()} diff --git a/BALSAMIC/utils/workflowscripts.py b/BALSAMIC/utils/workflowscripts.py index b6b402581..6fae1634f 100644 --- a/BALSAMIC/utils/workflowscripts.py +++ b/BALSAMIC/utils/workflowscripts.py @@ -1,5 +1,3 @@ -import os -import subprocess import json from pathlib import Path import pandas as pd @@ -8,9 +6,9 @@ import h5py import typing +from BALSAMIC.constants.cluster import ClusterConfigType from BALSAMIC.utils.rule import get_threads -from BALSAMIC.utils.cli import get_config -from BALSAMIC.utils.cli import generate_h5 +from BALSAMIC.utils.cli import get_config_path def plot_analysis( @@ -20,7 +18,7 @@ def plot_analysis( plots analysis job. 
""" - cluster_config = get_config("cluster") + cluster_config = get_config_path(ClusterConfigType.ANALYSIS) with open(cluster_config, "r") as f: cluster_config = json.load(f) diff --git a/BALSAMIC/workflows/PON.smk b/BALSAMIC/workflows/PON.smk index 517472b6c..0ed589067 100644 --- a/BALSAMIC/workflows/PON.smk +++ b/BALSAMIC/workflows/PON.smk @@ -1,57 +1,114 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -from pathlib import Path import glob -import tempfile +import logging import os +import tempfile +from pathlib import Path +from typing import Dict, List - -from BALSAMIC.utils.rule import (get_picard_mrkdup, get_threads, - get_result_dir, get_pon_samples) -from BALSAMIC.constants.common import RULE_DIRECTORY +from BALSAMIC.constants.analysis import FastqName, Gender, PONWorkflow, SampleType, SequencingType +from BALSAMIC.constants.paths import BALSAMIC_DIR from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS -from BALSAMIC.utils.models import BalsamicWorkflowConfig +from BALSAMIC.models.config import ConfigModel +from BALSAMIC.models.params import BalsamicWorkflowConfig +from BALSAMIC.utils.exc import BalsamicError +from BALSAMIC.utils.io import write_finish_file +from BALSAMIC.utils.rule import get_fastp_parameters, get_result_dir, get_threads + +# Initialize ConfigModel +config_model = ConfigModel.model_validate(config) shell.prefix("set -eo pipefail; ") localrules: all +LOG = logging.getLogger(__name__) + # parse parameters as constants to workflows -params = BalsamicWorkflowConfig.parse_obj(WORKFLOW_PARAMS) +params = BalsamicWorkflowConfig.model_validate(WORKFLOW_PARAMS) + +# Get case id/name +case_id: str = config_model.analysis.case_id +# Get analysis dir +analysis_dir_home: str = config_model.analysis.analysis_dir +analysis_dir: str = Path(analysis_dir_home, "analysis", case_id).as_posix() + "/" +# Get result dir +result_dir: str = Path(config_model.analysis.result).as_posix() + "/" + +# Create a temporary directory with trailing / +tmp_dir: str = Path(result_dir, "tmp").as_posix() + "/" +Path.mkdir(Path(tmp_dir), parents=True, exist_ok=True) + +# Directories +benchmark_dir: str = config_model.analysis.benchmark + "/" +fastq_dir: str = Path(result_dir, "fastq").as_posix() + "/" +bam_dir: str = Path(result_dir, "bam", "").as_posix() + "/" +cnv_dir: str = Path(result_dir, "cnv", "").as_posix() + "/" +qc_dir: str = Path(result_dir, "qc", "").as_posix() + "/" + +# PON setting +pon_workflow: PONWorkflow = config_model.analysis.pon_workflow + +# Run information +version: str = config_model.analysis.pon_version +singularity_image: str = config_model.singularity['image'] +sample_names: List[str] = config_model.get_all_sample_names() + +# Fastp parameters +fastp_parameters: Dict = get_fastp_parameters(config_model) + +# Find and set Sentieon binary and license server from env variables +try: + config["SENTIEON_LICENSE"] = os.environ["SENTIEON_LICENSE"] + config["SENTIEON_INSTALL_DIR"] = os.environ["SENTIEON_INSTALL_DIR"] -analysis_dir = get_result_dir(config) -fastq_dir = analysis_dir + "/fastq/" -qc_dir = analysis_dir + "/qc/" -bam_dir = analysis_dir + "/bam/" -cnv_dir = analysis_dir + "/cnv/" + if os.getenv("SENTIEON_EXEC") is not None: + config["SENTIEON_EXEC"] = os.environ["SENTIEON_EXEC"] + else: + config["SENTIEON_EXEC"] = Path(os.environ["SENTIEON_INSTALL_DIR"], "bin", "sentieon").as_posix() -reffasta = config["reference"]["reference_genome"] -refflat = config["reference"]["refflat"] -access_5kb_hg19 = config["reference"]["access_regions"] -target_bed = 
config["panel"]["capture_kit"] -singularity_image = config["singularity"]["image"] -benchmark_dir = config["analysis"]["benchmark"] -version = config["analysis"]["pon_version"] +except KeyError as error: + LOG.error("Set environment variables SENTIEON_LICENSE, SENTIEON_INSTALL_DIR, SENTIEON_EXEC " + "to run SENTIEON variant callers") + raise BalsamicError -tmp_dir = os.path.join(analysis_dir, "tmp", "" ) -Path.mkdir(Path(tmp_dir), exist_ok=True) +if not Path(config["SENTIEON_EXEC"]).exists(): + LOG.error("Sentieon executable not found {}".format(Path(config["SENTIEON_EXEC"]).as_posix())) + raise BalsamicError -picarddup = get_picard_mrkdup(config) -samples = get_pon_samples(fastq_dir) -panel_name = os.path.split(target_bed)[1].replace('.bed','') +sequence_type = config['analysis']["sequencing_type"] +rules_to_include = [] +if sequence_type == SequencingType.TARGETED: + rules_to_include.append("snakemake_rules/quality_control/fastp_tga.rule") +else: + rules_to_include.append("snakemake_rules/quality_control/fastp_wgs.rule") -coverage_references = expand(cnv_dir + "{sample}.{cov}coverage.cnn", sample=samples, cov=['target','antitarget']) -baited_beds = expand(cnv_dir + "{cov}.bed", cov=['target','antitarget']) -pon_reference = expand(cnv_dir + panel_name + "_CNVkit_PON_reference_" + version + ".cnn") -pon_finish = expand(analysis_dir + "/" + "analysis_PON_finish") +rules_to_include.append("snakemake_rules/align/sentieon_alignment.rule") -config["rules"] = ["snakemake_rules/quality_control/fastp.rule", - "snakemake_rules/align/bwa_mem.rule"] +if pon_workflow == PONWorkflow.CNVKIT: + reffasta: str = config_model.reference["reference_genome"] + refgene_flat: str = config_model.reference["refgene_flat"] + access_5kb_hg19: str = config_model.reference["access_regions"] + target_bed: str = config_model.panel.capture_kit + panel_name = os.path.split(target_bed)[1].replace('.bed','') -for r in config["rules"]: - include: Path(RULE_DIRECTORY, r).as_posix() + pon_reference = expand(cnv_dir + panel_name + "_CNVkit_PON_reference_" + version + ".cnn") + rules_to_include.append("snakemake_rules/pon/cnvkit_create_pon.rule") + +if pon_workflow in [PONWorkflow.GENS_MALE, PONWorkflow.GENS_FEMALE]: + gender = Gender.MALE if pon_workflow == PONWorkflow.GENS_MALE else Gender.FEMALE + + pon_reference = expand(cnv_dir + "gens_pon_100bp.{gender}.{version}.hdf5", gender=gender, version=version) + rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule") + rules_to_include.append("snakemake_rules/pon/gens_create_pon.rule") + +pon_finish = expand(analysis_dir + "analysis_PON_finish") + +for r in rules_to_include: + include: Path(BALSAMIC_DIR, r).as_posix() rule all: input: @@ -59,59 +116,4 @@ rule all: output: pon_finish_file = pon_finish run: - import datetime - - # PON finish timestamp file - with open(str(output.pon_finish_file), mode="w") as finish_file: - finish_file.write("%s\n" % datetime.datetime.now()) - -rule create_target: - input: - target_bait = target_bed, - refFlat = refflat, - access_bed = access_5kb_hg19 - output: - target_bed = cnv_dir + "target.bed", - offtarget_bed = cnv_dir + "antitarget.bed" - singularity: - Path(singularity_image, "varcall_cnvkit.sif").as_posix() - benchmark: - Path(benchmark_dir, "cnvkit.targets.tsv").as_posix() - shell: - """ -cnvkit.py target {input.target_bait} --annotate {input.refFlat} --split -o {output.target_bed}; -cnvkit.py antitarget {input.target_bait} -g {input.access_bed} -o {output.offtarget_bed}; - """ - -rule create_coverage: - input: - 
bam = bam_dir + "{sample}.sorted." + picarddup + ".bam", - target_bed = cnv_dir + "target.bed", - antitarget_bed = cnv_dir + "antitarget.bed" - output: - target_cnn = cnv_dir + "{sample}.targetcoverage.cnn", - antitarget_cnn = cnv_dir + "{sample}.antitargetcoverage.cnn" - singularity: - Path(singularity_image, "varcall_cnvkit.sif").as_posix() - benchmark: - Path(benchmark_dir, "cnvkit_{sample}.coverage.tsv").as_posix() - shell: - """ -cnvkit.py coverage {input.bam} {input.target_bed} -o {output.target_cnn}; -cnvkit.py coverage {input.bam} {input.antitarget_bed} -o {output.antitarget_cnn}; - """ - -rule create_reference: - input: - cnn = expand(cnv_dir + "{sample}.{prefix}coverage.cnn", sample=samples, prefix=["target","antitarget"]), - ref = reffasta - output: - ref_cnn = pon_reference - singularity: - Path(singularity_image, "varcall_cnvkit.sif").as_posix() - benchmark: - Path(benchmark_dir, "cnvkit.reference.tsv").as_posix() - shell: - """ -cnvkit.py reference {input.cnn} --fasta {input.ref} -o {output.ref_cnn} ; - """ + write_finish_file(file_path=output.pon_finish_file) diff --git a/BALSAMIC/workflows/QC.smk b/BALSAMIC/workflows/QC.smk index e2dbe436f..997d55782 100644 --- a/BALSAMIC/workflows/QC.smk +++ b/BALSAMIC/workflows/QC.smk @@ -1,73 +1,97 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -import os import logging +import os import tempfile - from pathlib import Path -from yapf.yapflib.yapf_api import FormatFile - -from snakemake.exceptions import RuleException, WorkflowError - -from BALSAMIC.utils.exc import BalsamicError - -from BALSAMIC.utils.cli import (check_executable, generate_h5) -from BALSAMIC.utils.io import write_json - -from BALSAMIC.utils.models import BalsamicWorkflowConfig - -from BALSAMIC.utils.rule import (get_rule_output, get_result_dir, - get_sample_type, get_picard_mrkdup, get_script_path, - get_threads, get_sequencing_type, get_capture_kit) +from typing import Dict, List -from BALSAMIC.constants.common import (RULE_DIRECTORY); +from BALSAMIC.constants.analysis import AnalysisType, FastqName, SampleType +from BALSAMIC.constants.paths import BALSAMIC_DIR +from BALSAMIC.constants.rules import SNAKEMAKE_RULES from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS +from BALSAMIC.models.config import ConfigModel +from BALSAMIC.models.params import BalsamicWorkflowConfig +from BALSAMIC.utils.cli import check_executable, generate_h5 +from BALSAMIC.utils.exc import BalsamicError +from BALSAMIC.utils.io import write_finish_file, write_json +from BALSAMIC.utils.rule import ( + get_capture_kit, + get_fastp_parameters, + get_result_dir, + get_rule_output, + get_script_path, + get_sequencing_type, + get_threads, +) +from snakemake.exceptions import RuleException, WorkflowError +from yapf.yapflib.yapf_api import FormatFile +# Initialize ConfigModel +config_model = ConfigModel.model_validate(config) shell.executable("/bin/bash") shell.prefix("set -eo pipefail; ") LOG = logging.getLogger(__name__) -logging.getLogger("filelock").setLevel("WARN") -# Create a temporary directory with trailing / -tmp_dir = os.path.join(get_result_dir(config), "tmp", "" ) -Path.mkdir(Path(tmp_dir), exist_ok=True) - -case_id = config["analysis"]["case_id"] -analysis_dir = config["analysis"]["analysis_dir"] + "/" + case_id + "/" -benchmark_dir = config["analysis"]["benchmark"] -fastq_dir = get_result_dir(config) + "/fastq/" -bam_dir = get_result_dir(config) + "/bam/" -fastqc_dir = get_result_dir(config) + "/fastqc/" -result_dir = get_result_dir(config) + "/" -qc_dir = 
get_result_dir(config) + "/qc/" -delivery_dir = get_result_dir(config) + "/delivery/" +# Get case id/name +case_id: str = config_model.analysis.case_id +# Get analysis dir +analysis_dir_home: str = config_model.analysis.analysis_dir +analysis_dir: str = Path(analysis_dir_home, "analysis", case_id).as_posix() + "/" +# Get result dir +result_dir: str = Path(config_model.analysis.result).as_posix() + "/" -singularity_image = config['singularity']['image'] - -# picarddup flag -picarddup = get_picard_mrkdup(config) +# Create a temporary directory with trailing / +tmp_dir: str = Path(result_dir, "tmp").as_posix() + "/" +Path.mkdir(Path(tmp_dir), parents=True, exist_ok=True) + +# Directories +input_fastq_dir: str = config_model.analysis.fastq_path + "/" +benchmark_dir: str = config_model.analysis.benchmark + "/" +fastq_dir: str = Path(result_dir, "fastq").as_posix() + "/" +bam_dir: str = Path(result_dir, "bam").as_posix() + "/" +fastqc_dir: str = Path(result_dir, "fastqc").as_posix() + "/" +vcf_dir: str = Path(result_dir, "vcf").as_posix() + "/" +qc_dir: str = Path(result_dir, "qc").as_posix() + "/" +delivery_dir: str = Path(result_dir, "delivery").as_posix() + "/" + +# Run information +singularity_image: str = config_model.singularity['image'] +sample_names: List[str] = config_model.get_all_sample_names() +tumor_sample: str = config_model.get_sample_name_by_type(SampleType.TUMOR) +if config_model.analysis.analysis_type == AnalysisType.PAIRED: + normal_sample: str = config_model.get_sample_name_by_type(SampleType.NORMAL) # parse parameters as constants to workflows -params = BalsamicWorkflowConfig.parse_obj(WORKFLOW_PARAMS) +params = BalsamicWorkflowConfig.model_validate(WORKFLOW_PARAMS) + +# Fastp parameters +fastp_parameters: Dict = get_fastp_parameters(config_model) # Capture kit name if config["analysis"]["sequencing_type"] != "wgs": capture_kit = os.path.split(config["panel"]["capture_kit"])[1] -# Sample names for tumor or normal -tumor_sample = get_sample_type(config["samples"], "tumor")[0] -if "paired" in config['analysis']['analysis_type']: - normal_sample = get_sample_type(config["samples"], "normal")[0] - -# Set case id/name -case_id = config["analysis"]["case_id"] - # explicitly check if cluster_config dict has zero keys. 
if len(cluster_config.keys()) == 0: cluster_config = config +# Find and set Sentieon binary and license server from env variables +try: + config["SENTIEON_LICENSE"] = os.environ["SENTIEON_LICENSE"] + config["SENTIEON_INSTALL_DIR"] = os.environ["SENTIEON_INSTALL_DIR"] + + if os.getenv("SENTIEON_EXEC") is not None: + config["SENTIEON_EXEC"] = os.environ["SENTIEON_EXEC"] + else: + config["SENTIEON_EXEC"] = Path(os.environ["SENTIEON_INSTALL_DIR"], "bin", "sentieon").as_posix() +except KeyError as error: + LOG.error("Set environment variables SENTIEON_LICENSE, SENTIEON_INSTALL_DIR, SENTIEON_EXEC " + "to run SENTIEON variant callers") + raise BalsamicError + if "hg38" in config["reference"]["reference_genome"]: config["reference"]["genome_version"] = "hg38" elif "canfam3" in config["reference"]["reference_genome"]: @@ -81,43 +105,37 @@ LOG.info('Genome version set to %s', config["reference"]["genome_version"]) # Set temporary dir environment variable os.environ['TMPDIR'] = get_result_dir(config) +# Include rules analysis_type = config['analysis']["analysis_type"] +sequence_type = config['analysis']["sequencing_type"] -rules_to_include = [ - "snakemake_rules/quality_control/fastp.rule", - "snakemake_rules/quality_control/fastqc.rule", - "snakemake_rules/quality_control/multiqc.rule", - "snakemake_rules/variant_calling/mergetype_tumor.rule", - "snakemake_rules/quality_control/picard.rule", - "snakemake_rules/quality_control/sambamba_depth.rule", - "snakemake_rules/quality_control/mosdepth.rule", - "snakemake_rules/align/bwa_mem.rule", - "snakemake_rules/quality_control/qc_metrics.rule" -] - -if "paired" in config['analysis']['analysis_type']: - rules_to_include.append("snakemake_rules/variant_calling/mergetype_normal.rule") - # Somalier only implemented for hg38 and hg19 - if "canfam3" not in config["reference"]["reference_genome"]: - rules_to_include.append("snakemake_rules/quality_control/somalier.rule") +rules_to_include = [] +for workflow_type, value in SNAKEMAKE_RULES.items(): + if workflow_type in ["common", analysis_type + "_" + sequence_type]: + rules_to_include.extend(value.get("qc", []) + value.get("align", [])) +rules_to_include = [rule for rule in rules_to_include if "umi" not in rule and "report" not in rule] +# Somalier only implemented for hg38 and hg19 +if "canfam3" in config["reference"]["reference_genome"]: + rules_to_include.remove("snakemake_rules/quality_control/somalier.rule") -# for r in rules_to_include: for r in rules_to_include: - include: Path(RULE_DIRECTORY, r).as_posix() + include: Path(BALSAMIC_DIR, r).as_posix() + LOG.info(f"The following rules will be included in the workflow: {rules_to_include}") # Define common and analysis specific outputs quality_control_results = [ - os.path.join(qc_dir, case_id + "_metrics_deliverables.yaml"), - os.path.join(qc_dir, "multiqc_report.html"), + Path(qc_dir, case_id + "_metrics_deliverables.yaml").as_posix(), + Path(qc_dir, "multiqc_report.html").as_posix(), ] if 'delivery' in config: - wildcard_dict = {"sample": list(config["samples"].keys())+["tumor", "normal"], - "case_name": config["analysis"]["case_id"], - "allow_missing": True - } + wildcard_dict = { + "sample": config_model.get_all_sample_names() + ["tumor", "normal"], + "case_name": case_id, + "allow_missing": True + } if 'rules_to_deliver' in config: rules_to_deliver = config['rules_to_deliver'].split(",") @@ -139,9 +157,7 @@ if 'delivery' in config: output_files_ready.extend(files_to_deliver) output_files_ready = [dict(zip(output_files_ready[0], value)) for value in 
output_files_ready[1:]] - delivery_ready = os.path.join(get_result_dir(config), - "delivery_report", - config["analysis"]["case_id"] + "_delivery_ready.hk") + delivery_ready = Path(get_result_dir(config), "delivery_report", case_id + "_delivery_ready.hk").as_posix() write_json(output_files_ready, delivery_ready) FormatFile(delivery_ready) @@ -149,19 +165,18 @@ rule all: input: quality_control_results output: - finish_file = os.path.join(get_result_dir(config), "analysis_finish") + finish_file = Path(get_result_dir(config), "analysis_finish").as_posix() params: tmp_dir = tmp_dir, run: import datetime import shutil - # Delete a temporal directory tree + # Remove temporary directory tree try: shutil.rmtree(params.tmp_dir) except OSError as e: print ("Error: %s - %s." % (e.filename, e.strerror)) # Finish timestamp file - with open(str(output.finish_file), mode="w") as finish_file: - finish_file.write("%s\n" % datetime.datetime.now()) + write_finish_file(file_path=output.finish_file) diff --git a/BALSAMIC/workflows/archive/VariantCalling_paired_sentieon_wes b/BALSAMIC/workflows/archive/VariantCalling_paired_sentieon_wes deleted file mode 100644 index c390221d9..000000000 --- a/BALSAMIC/workflows/archive/VariantCalling_paired_sentieon_wes +++ /dev/null @@ -1,40 +0,0 @@ -#!python -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - -import os -from BALSAMIC.utils.rule import get_vcf -from BALSAMIC.utils.rule import get_result_dir - -shell.prefix("set -eo pipefail; ") - -rule_dir = config["rule_directory"] -fastq_dir = config["analysis"]["fastq_path"] -bam_dir = get_result_dir(config) + "/bam/" -cnv_dir = get_result_dir(config) + "/cnv/" -cutadapt_dir = get_result_dir(config) + "/cutadapt/" -fastqc_dir = get_result_dir(config) + "/fastqc/" -result_dir = get_result_dir(config) + "/" -vcf_dir = get_result_dir(config) + "/vcf/" -vep_dir = get_result_dir(config) + "/vep/" - -SENTIEON_LICENSE = "10.10.10.1:8990" -SENTIEON_INSTALL_DIR="/home/proj/development/cancer/sentieon/sentieon-genomics-201808.03" - -include: - rule_dir + "snakemake_rules/sentieon/sentieon.rule" - -rule all: - input: - expand(bam_dir + "{sample}.bam", sample=config["samples"]), - expand(bam_dir + "{sample}.dedup.bam", sample=config["samples"]), - expand(bam_dir + "{sample}.dedup.realign.bam", sample=config["samples"]), - expand(bam_dir + "{sample}.dedup.realign.recal_data.table", sample=config["samples"]), - expand(bam_dir + "{sample}.dedup.realign.recal.csv", sample=config["samples"]), - expand(bam_dir + "{sample}.dedup.realign.recal.pdf", sample=config["samples"]), - bam_dir + config["analysis"]["case_id"] + ".corealign.bam", - expand(vcf_dir + config["analysis"]["case_id"] + ".{algo}.vcf.gz", algo = ["tnsnv", "tnhaplotyper"]) - output: - os.path.join(get_result_dir(config), "analysis_finish") - shell: - "date +'%Y-%M-%d T%T %:z' > {output}" diff --git a/BALSAMIC/workflows/archive/VariantCalling_paired_umi b/BALSAMIC/workflows/archive/VariantCalling_paired_umi deleted file mode 100644 index befdd955e..000000000 --- a/BALSAMIC/workflows/archive/VariantCalling_paired_umi +++ /dev/null @@ -1,49 +0,0 @@ -#!python -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - -import os -from BALSAMIC.utils.rule import get_result_dir, get_vcf - -shell.prefix("set -eo pipefail; ") - -rule_dir = config["rule_directory"] -bam_dir = get_result_dir(config) + "/bam/" -cnv_dir = get_result_dir(config) + "/cnv/" -cutadapt_dir = get_result_dir(config) + "/cutadapt/" -fastqc_dir = get_result_dir(config) + "/fastqc/" -result_dir = 
get_result_dir(config) + "/" -vcf_dir = get_result_dir(config) + "/vcf/" -vep_dir = get_result_dir(config) + "/vep/" - -include: - -include: - rule_dir + "snakemake_rules/align/bwa_mem_umi.rule" -include: - rule_dir + "snakemake_rules/variant_calling/split_bed.rule" -include: - rule_dir + "snakemake_rules/quality_control/picard.rule" -include: - rule_dir + "snakemake_rules/umi/fgbio.rule" -include: - rule_dir + "snakemake_rules/variant_calling/mergetype_paired_umi.rule" -include: - rule_dir + "snakemake_rules/variant_calling/vardict.rule" -include: - rule_dir + "snakemake_rules/variant_calling/strelka.rule" -include: - rule_dir + "snakemake_rules/variant_calling/manta.rule" - -var_type = ["SNV", "SV"] -var_class = ["somatic", "germline"] - -rule all: - input: - expand(vcf_dir + "{vcf}.vcf.gz", vcf=get_vcf(config, ["vardict", "strelka"], [config["analysis"]["case_id"]])), - expand(bam_dir + "{sample}.unalg.umi.mrkadp.bwa.map.umi.cnsunalg.bwa.map.fltr.clip.bam", sample=config["samples"]), - expand(bam_dir + "{sample}.unalg.umi.mrkadp.bwa.map.umi.metrics", sample=config["samples"]) - output: - os.path.join(get_result_dir(config), "analysis_finish") - shell: - "date +'%Y-%M-%d T%T %:z' > {output}" diff --git a/BALSAMIC/workflows/archive/VariantCalling_single_umi b/BALSAMIC/workflows/archive/VariantCalling_single_umi deleted file mode 100644 index 1d6563b8b..000000000 --- a/BALSAMIC/workflows/archive/VariantCalling_single_umi +++ /dev/null @@ -1,39 +0,0 @@ -#!python -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - -import os -from BALSAMIC.utils.rule import get_result_dir, get_vcf - -shell.prefix("set -eo pipefail; ") - -rule_dir = config["rule_directory"] -bam_dir = get_result_dir(config) + "/bam/" -cnv_dir = get_result_dir(config) + "/cnv/" -cutadapt_dir = get_result_dir(config) + "/cutadapt/" -fastqc_dir = get_result_dir(config) + "/fastqc/" -result_dir = get_result_dir(config) + "/" -vcf_dir = get_result_dir(config) + "/vcf/" -vep_dir = get_result_dir(config) + "/vep/" - -include: - rule_dir + "snakemake_rules/umi/fgbio_v2.rule" -include: - rule_dir + "snakemake_rules/variant_calling/split_bed.rule" -include: - rule_dir + "snakemake_rules/variant_calling/mergetype_single_umi.rule" -include: - rule_dir + "snakemake_rules/umi/vardict_single_umi.rule" - -var_type = ["SNV"] -var_class = ["somatic", "germline"] - -rule all: - input: - expand(vcf_dir + "{vcf}.vcf.gz", vcf=get_vcf(config, ["vardict"], [config["analysis"]["case_id"]])), - expand(bam_dir + "{sample}.unalg.umi.mrkadp.bwa.map.umi.cnsunalg.bwa.map.fltr.clip.bam", sample=config["samples"]), - expand(bam_dir + "{sample}.unalg.umi.mrkadp.bwa.map.umi.bam.duplex_qc.pdf", sample=config["samples"]) - output: - os.path.join(get_result_dir(config), "analysis_finish") - shell: - "date +'%Y-%M-%d T%T %:z' > {output}" diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 3eb5bc18d..8a3cd272a 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -1,74 +1,118 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -import os +import glob import logging +import os +import re import tempfile - from pathlib import Path -from yapf.yapflib.yapf_api import FormatFile - -from snakemake.exceptions import RuleException, WorkflowError - -from PyPDF2 import PdfFileMerger - +from typing import Dict, List + +from BALSAMIC.constants.constants import FileType +from BALSAMIC.constants.analysis import FastqName, MutationType, SampleType +from BALSAMIC.constants.paths import BALSAMIC_DIR, 
SENTIEON_DNASCOPE_DIR, SENTIEON_TNSCOPE_DIR +from BALSAMIC.constants.rules import SNAKEMAKE_RULES +from BALSAMIC.constants.variant_filters import ( + COMMON_SETTINGS, + SENTIEON_VARCALL_SETTINGS, + SVDB_FILTER_SETTINGS, + VARDICT_SETTINGS, +) +from BALSAMIC.constants.workflow_params import VARCALL_PARAMS, WORKFLOW_PARAMS +from BALSAMIC.models.config import ConfigModel +from BALSAMIC.models.params import BalsamicWorkflowConfig, VarCallerFilter +from BALSAMIC.utils.cli import check_executable, generate_h5 from BALSAMIC.utils.exc import BalsamicError - -from BALSAMIC.utils.cli import (check_executable, generate_h5) -from BALSAMIC.utils.io import write_json, read_yaml - -from BALSAMIC.utils.models import VarCallerFilter, BalsamicWorkflowConfig - +from BALSAMIC.utils.io import read_yaml, write_finish_file, write_json +from BALSAMIC.utils.rule import ( + dump_toml, + get_cancer_germline_snv_observations, + get_cancer_somatic_snv_observations, + get_capture_kit, + get_clinical_snv_observations, + get_clinical_sv_observations, + get_fastp_parameters, + get_pon_cnn, + get_result_dir, + get_rule_output, + get_script_path, + get_sequencing_type, + get_somatic_sv_observations, + get_swegen_snv, + get_swegen_sv, + get_threads, + get_variant_callers, + get_vcf, +) from BALSAMIC.utils.workflowscripts import plot_analysis +from pypdf import PdfWriter +from snakemake.exceptions import RuleException, WorkflowError +from yapf.yapflib.yapf_api import FormatFile -from BALSAMIC.utils.rule import (get_variant_callers, get_rule_output, get_result_dir, get_vcf, get_picard_mrkdup, - get_sample_type, get_threads, get_script_path, get_sequencing_type, get_capture_kit, - get_clinical_snv_observations, get_clinical_sv_observations,get_swegen_snv, - get_swegen_sv, dump_toml) - -from BALSAMIC.constants.common import (SENTIEON_DNASCOPE, SENTIEON_TNSCOPE, RULE_DIRECTORY, MUTATION_TYPE) -from BALSAMIC.constants.variant_filters import (COMMON_SETTINGS, VARDICT_SETTINGS, SENTIEON_VARCALL_SETTINGS, - SVDB_FILTER_SETTINGS) -from BALSAMIC.constants.workflow_params import (WORKFLOW_PARAMS, VARCALL_PARAMS) -from BALSAMIC.constants.workflow_rules import SNAKEMAKE_RULES - +# Initialize ConfigModel +config_model = ConfigModel.model_validate(config) shell.executable("/bin/bash") shell.prefix("set -eo pipefail; ") LOG = logging.getLogger(__name__) -logging.getLogger("filelock").setLevel("WARN") -# Create a temporary directory with trailing / -tmp_dir = os.path.join(get_result_dir(config), "tmp", "" ) -Path.mkdir(Path(tmp_dir), exist_ok=True) +# Get case id/name +case_id: str = config_model.analysis.case_id +# Get analysis dir +analysis_dir_home: str = config_model.analysis.analysis_dir +analysis_dir: str = Path(analysis_dir_home, "analysis", case_id).as_posix() + "/" +# Get result dir +result_dir: str = Path(config_model.analysis.result).as_posix() + "/" -# Set case id/name -case_id = config["analysis"]["case_id"] +# Create a temporary directory with trailing / +tmp_dir: str = Path(result_dir, "tmp").as_posix() + "/" +Path.mkdir(Path(tmp_dir), parents=True, exist_ok=True) # Directories -analysis_dir = config["analysis"]["analysis_dir"] + "/" +case_id + "/" -benchmark_dir = config["analysis"]["benchmark"] -fastq_dir = get_result_dir(config) + "/fastq/" -bam_dir = get_result_dir(config) + "/bam/" -cnv_dir = get_result_dir(config) + "/cnv/" -fastqc_dir = get_result_dir(config) + "/fastqc/" -result_dir = get_result_dir(config) + "/" -vcf_dir = get_result_dir(config) + "/vcf/" -vep_dir = get_result_dir(config) + "/vep/" -qc_dir = 
get_result_dir(config) + "/qc/" -delivery_dir = get_result_dir(config) + "/delivery/" -umi_dir = get_result_dir(config) + "/umi/" -umi_qc_dir = qc_dir + "umi_qc/" -singularity_image = config['singularity']['image'] - - +input_fastq_dir: str = config_model.analysis.fastq_path + "/" +benchmark_dir: str = config_model.analysis.benchmark + "/" +fastq_dir: str = Path(result_dir, "fastq").as_posix() + "/" +bam_dir: str = Path(result_dir, "bam").as_posix() + "/" +cnv_dir: str = Path(result_dir, "cnv").as_posix() + "/" +fastqc_dir: str = Path(result_dir, "fastqc").as_posix() + "/" +vcf_dir: str = Path(result_dir, "vcf").as_posix() + "/" +vep_dir: str = Path(result_dir, "vep").as_posix() + "/" +qc_dir: str = Path(result_dir, "qc").as_posix() + "/" +delivery_dir: str = Path(result_dir, "delivery").as_posix() + "/" +umi_dir: str = Path(result_dir, "umi").as_posix() + "/" +umi_qc_dir: str = Path(qc_dir, "umi_qc").as_posix() + "/" + + +# Annotations research_annotations = [] clinical_annotations = [] clinical_snv_obs = "" +cancer_germline_snv_obs = "" +cancer_somatic_snv_obs = "" swegen_snv = "" clinical_sv = "" +somatic_sv = "" swegen_sv = "" +if config["analysis"]["sequencing_type"] != "wgs": + pon_cnn: str = get_pon_cnn(config) + +# Run information +singularity_image: str = config_model.singularity['image'] +sample_names: List[str] = config_model.get_all_sample_names() +tumor_sample: str = config_model.get_sample_name_by_type(SampleType.TUMOR) +sequencing_type = config_model.analysis.sequencing_type +if config_model.analysis.analysis_type == "paired": + normal_sample: str = config_model.get_sample_name_by_type(SampleType.NORMAL) + +# Sample status to sampleID namemap +if config_model.analysis.analysis_type == "paired": + status_to_sample_id = "TUMOR" + "\\\\t" + tumor_sample + "\\\\n" + "NORMAL" + "\\\\t" + normal_sample +else: + status_to_sample_id = "TUMOR" + "\\\\t" + tumor_sample + + # vcfanno annotations research_annotations.append( { 'annotation': [{ @@ -90,6 +134,17 @@ research_annotations.append( { } ) +research_annotations.append( { + 'annotation': [{ + 'file': Path(config["reference"]["cadd_snv"]).as_posix(), + 'names': ["CADD"], + 'ops': ["mean"], + 'columns': [6] + }] +} +) + + if "swegen_snv_frequency" in config["reference"]: research_annotations.append( { 'annotation': [{ @@ -111,43 +166,59 @@ if "clinical_snv_observations" in config["reference"]: }] } ) - clinical_snv_obs = get_clinical_snv_observations(config) + clinical_snv_obs: str = get_clinical_snv_observations(config) + +if "cancer_germline_snv_observations" in config["reference"]: + clinical_annotations.append( { + 'annotation': [{ + 'file': get_cancer_germline_snv_observations(config), + 'fields': ["Frq", "Obs", "Hom"], + 'ops': ["self", "self", "self"], + 'names': ["Cancer_Germline_Frq", "Cancer_Germline_Obs", "Cancer_Germline_Hom"] + }] + } + ) + cancer_germline_snv_obs: str = get_cancer_germline_snv_observations(config) +if "cancer_somatic_snv_observations" in config["reference"]: + clinical_annotations.append( { + 'annotation': [{ + 'file': get_cancer_somatic_snv_observations(config), + 'fields': ["Frq", "Obs", "Hom"], + 'ops': ["self", "self", "self"], + 'names': ["Cancer_Somatic_Frq", "Cancer_Somatic_Obs", "Cancer_Somatic_Hom"] + }] + } + ) + cancer_somatic_snv_obs: str = get_cancer_somatic_snv_observations(config) if "clinical_sv_observations" in config["reference"]: - clinical_sv = get_clinical_sv_observations(config) + clinical_sv: str = get_clinical_sv_observations(config) +if "cancer_somatic_sv_observations" in 
config["reference"]: + somatic_sv: str = get_somatic_sv_observations(config) if "swegen_sv_frequency" in config["reference"]: - swegen_sv = get_swegen_sv(config) + swegen_sv: str = get_swegen_sv(config) -# picarddup flag -picarddup = get_picard_mrkdup(config) # Varcaller filter settings -COMMON_FILTERS = VarCallerFilter.parse_obj(COMMON_SETTINGS) -VARDICT = VarCallerFilter.parse_obj(VARDICT_SETTINGS) -SENTIEON_CALLER = VarCallerFilter.parse_obj(SENTIEON_VARCALL_SETTINGS) -SVDB_FILTERS = VarCallerFilter.parse_obj(SVDB_FILTER_SETTINGS) +COMMON_FILTERS = VarCallerFilter.model_validate(COMMON_SETTINGS) +VARDICT = VarCallerFilter.model_validate(VARDICT_SETTINGS) +SENTIEON_CALLER = VarCallerFilter.model_validate(SENTIEON_VARCALL_SETTINGS) +SVDB_FILTERS = VarCallerFilter.model_validate(SVDB_FILTER_SETTINGS) + +# Fastp parameters +fastp_parameters: Dict = get_fastp_parameters(config_model) # parse parameters as constants to workflows -params = BalsamicWorkflowConfig.parse_obj(WORKFLOW_PARAMS) +params = BalsamicWorkflowConfig.model_validate(WORKFLOW_PARAMS) # Capture kit name if config["analysis"]["sequencing_type"] != "wgs": capture_kit = os.path.split(config["panel"]["capture_kit"])[1] -# Sample names for tumor or normal -tumor_sample = get_sample_type(config["samples"], "tumor")[0] -if config['analysis']['analysis_type'] == "paired": - normal_sample = get_sample_type(config["samples"], "normal")[0] - -# Get sample unique names for tumor or normal -lims_id = {'normal': [], 'tumor': []} -for sample, sample_info in config["samples"].items(): - lims_id[sample_info["type"]].append(sample_info["sample_name"]) - # explicitly check if cluster_config dict has zero keys. if len(cluster_config.keys()) == 0: cluster_config = config @@ -162,8 +233,8 @@ try: else: config["SENTIEON_EXEC"] = Path(os.environ["SENTIEON_INSTALL_DIR"], "bin", "sentieon").as_posix() - config["SENTIEON_TNSCOPE"] = SENTIEON_TNSCOPE - config["SENTIEON_DNASCOPE"] = SENTIEON_DNASCOPE + config["SENTIEON_TNSCOPE"] = SENTIEON_TNSCOPE_DIR.as_posix() + config["SENTIEON_DNASCOPE"] = SENTIEON_DNASCOPE_DIR.as_posix() except KeyError as error: LOG.error("Set environment variables SENTIEON_LICENSE, SENTIEON_INSTALL_DIR, SENTIEON_EXEC " @@ -205,26 +276,29 @@ os.environ["SENTIEON_TMPDIR"] = result_dir os.environ['TMPDIR'] = get_result_dir(config) # CNV report input files -cnv_data_paths = [] -if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "paired": - cnv_data_paths.append(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.samplestatistics.txt") - cnv_data_paths.extend(expand( - vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat." + "{output_suffix}" + ".png", - output_suffix=["ascatprofile", "rawprofile", "ASPCF", "tumor", "germline", "sunrise"] - )) - -if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "single": - cnv_data_paths.extend(expand( - vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvpytor." 
+ "{output_suffix}" + ".png", - output_suffix=["circular", "scatter"] - )) +cnv_report_paths = [] +if config["analysis"]["sequencing_type"] == "wgs": + if config['analysis']['analysis_type'] == "paired": + cnv_report_paths.append(f"{vcf_dir}CNV.somatic.{config['analysis']['case_id']}.ascat.samplestatistics.txt.pdf") + cnv_report_paths.extend(expand( + f"{vcf_dir}CNV.somatic.{config['analysis']['case_id']}.ascat.{{output_suffix}}.png.pdf", + output_suffix=["ascatprofile", "rawprofile", "ASPCF", "tumor", "germline", "sunrise"] + )) + else: + cnv_report_paths.extend(expand( + f"{vcf_dir}CNV.somatic.{config['analysis']['case_id']}.cnvpytor.{{output_suffix}}.png.pdf", + output_suffix=["circular", "scatter"] + )) +else: + cnv_report_paths.extend(expand(f"{cnv_dir}tumor.merged-{{plot}}.pdf",plot=["diagram", "scatter"])) + cnv_report_paths.append(f"{cnv_dir}CNV.somatic.{config['analysis']['case_id']}.purecn.purity.csv.pdf") # Extract variant callers for the workflow germline_caller = [] somatic_caller = [] somatic_caller_cnv = [] somatic_caller_sv = [] -for m in MUTATION_TYPE: +for m in set(MutationType): germline_caller_balsamic = get_variant_callers(config=config, analysis_type=config['analysis']['analysis_type'], workflow_solution="BALSAMIC", @@ -319,6 +393,14 @@ if config["analysis"]["analysis_workflow"] == "balsamic": somatic_caller = [var_caller for var_caller in somatic_caller if "umi" not in var_caller] somatic_caller_tmb = [var_caller for var_caller in somatic_caller_tmb if "umi" not in var_caller] +# Add rule for DRAGEN +if "dragen" in config: + rules_to_include.append("snakemake_rules/concatenation/concatenation.rule") + +# Add rule for GENS +if "gens_coverage_pon" in config["reference"]: + rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule") + rules_to_include.append("snakemake_rules/variant_calling/gens_preprocessing.rule") LOG.info(f"The following rules will be included in the workflow: {rules_to_include}") LOG.info(f"The following Germline variant callers will be included in the workflow: {germline_caller}") @@ -326,13 +408,13 @@ LOG.info(f"The following somatic variant callers will be included in the workflo for r in rules_to_include: - include: Path(RULE_DIRECTORY, r).as_posix() + include: Path(BALSAMIC_DIR, r).as_posix() # Define common and analysis specific outputs quality_control_results = [ - os.path.join(qc_dir,case_id + "_metrics_deliverables.yaml"), - os.path.join(qc_dir, "multiqc_report.html"), - os.path.join(qc_dir, "multiqc_data/multiqc_data.json") + Path(qc_dir, case_id + "_metrics_deliverables.yaml").as_posix(), + Path(qc_dir, "multiqc_report.html").as_posix(), + Path(qc_dir, "multiqc_data/multiqc_data.json").as_posix() ] # Analysis results @@ -344,7 +426,7 @@ analysis_specific_results.extend( ) # Germline SNVs specifically for genotype -if config["analysis"]["analysis_type"]=="paired": +if config["analysis"]["analysis_type"] == "paired": analysis_specific_results.append(vep_dir + "SNV.genotype.normal.dnascope.vcf.gz") # Raw VCFs @@ -368,10 +450,8 @@ analysis_specific_results.extend( expand(vep_dir + "{vcf}.balsamic_stat", vcf=get_vcf(config, somatic_caller_tmb, [case_id])) ) -# WGS specific files -if config["analysis"]["sequencing_type"] == "wgs": - # CNV report - analysis_specific_results.append(vcf_dir + "CNV.somatic." + case_id + ".report.pdf"), +# CNV report +analysis_specific_results.append(cnv_dir + "CNV.somatic." 
+ case_id + ".report.pdf"), # TGA specific files if config["analysis"]["sequencing_type"] != "wgs": @@ -390,13 +470,15 @@ if config["analysis"]["sequencing_type"] != "wgs": expand(vep_dir + "{vcf}.research.filtered.pass.ranked.vcf.gz", vcf=get_vcf(config, ["vardict"], [case_id])) ) # UMI - if config["analysis"]["analysis_workflow"]=="balsamic-umi": - analysis_specific_results.extend(expand(umi_qc_dir + "{sample}.umi.mean_family_depth",sample=config["samples"])) + if config["analysis"]["analysis_workflow"] == "balsamic-umi": + analysis_specific_results.extend(expand(umi_qc_dir + "{sample}.umi.mean_family_depth", sample=config_model.get_all_sample_names())) if background_variant_file: analysis_specific_results.extend( expand(umi_qc_dir + "{case_name}.{var_caller}.AFtable.txt", case_name=case_id, var_caller=["tnscope_umi"]) ) + + if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "paired": analysis_specific_results.extend( expand(vcf_dir + "{vcf}.copynumber.txt.gz", vcf=get_vcf(config, ["ascat"], [case_id])) @@ -431,6 +513,12 @@ if config['analysis']['analysis_type'] == "single": expand(vcf_dir + "{vcf}.cov.gz",vcf=get_vcf(config,["dellycnv"],[case_id])) ) +# GENS Outputs +if config["analysis"]["sequencing_type"] == "wgs" and "gens_coverage_pon" in config["reference"]: + analysis_specific_results.extend( + expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"]) + ) + # Dragen if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "single": if "dragen" in config: @@ -459,7 +547,7 @@ if 'benchmark_plots' in config: # Merge plots into one based on rule name for my_rule in vars(rules).keys(): - my_rule_pdf = PdfFileMerger() + my_rule_pdf = PdfWriter() my_rule_plots = list() for plots in Path(benchmark_dir).glob(f"BALSAMIC*.{my_rule}.*.pdf"): my_rule_pdf.append(plots.as_posix()) @@ -473,7 +561,7 @@ if 'benchmark_plots' in config: if 'delivery' in config: wildcard_dict = { - "sample": list(config["samples"].keys())+["tumor", "normal"], + "sample": sample_names, "case_name": case_id, "allow_missing": True } @@ -502,19 +590,23 @@ if 'delivery' in config: LOG.info("Delivering step (rule) {} {}.".format(my_rule, housekeeper_id)) files_to_deliver = get_rule_output(rules=rules, rule_name=my_rule, output_file_wildcards=wildcard_dict) - LOG.debug("The following files added to delivery: {}".format(files_to_deliver)) + LOG.info("The following files added to delivery: {}".format(files_to_deliver)) output_files_ready.extend(files_to_deliver) output_files_ready = [dict(zip(output_files_ready[0], value)) for value in output_files_ready[1:]] - delivery_ready = os.path.join(get_result_dir(config), "delivery_report", case_id + "_delivery_ready.hk") + delivery_ready = Path(get_result_dir(config), "delivery_report", case_id + "_delivery_ready.hk").as_posix() write_json(output_files_ready, delivery_ready) FormatFile(delivery_ready) + +wildcard_constraints: + sample = "|".join(sample_names) + rule all: input: quality_control_results + analysis_specific_results output: - finish_file = os.path.join(get_result_dir(config), "analysis_finish") + finish_file = Path(get_result_dir(config), "analysis_finish").as_posix() params: tmp_dir = tmp_dir, case_name = config["analysis"]["case_id"], @@ -524,7 +616,7 @@ rule all: import datetime import shutil - from BALSAMIC.utils.qc_metrics import validate_qc_metrics + from BALSAMIC.utils.metrics import validate_qc_metrics # Perform validation of extracted QC metrics 
try: @@ -533,12 +625,11 @@ rule all: LOG.error(val_exc) raise BalsamicError - # Delete a temporal directory tree + # Remove temporary directory tree try: shutil.rmtree(params.tmp_dir) except OSError as e: print ("Error: %s - %s." % (e.filename, e.strerror)) # Finish timestamp file - with open(str(output.finish_file), mode="w") as finish_file: - finish_file.write("%s\n" % datetime.datetime.now()) + write_finish_file(file_path=output.finish_file) diff --git a/BALSAMIC/workflows/reference-canfam3.smk b/BALSAMIC/workflows/reference-canfam3.smk deleted file mode 100644 index 91699c777..000000000 --- a/BALSAMIC/workflows/reference-canfam3.smk +++ /dev/null @@ -1,217 +0,0 @@ -# syntax=python tabstop=4 expandtab -# coding: utf-8 - -import os -import logging -from pathlib import Path - -from copy import deepcopy - -from BALSAMIC.utils.rule import get_script_path -from BALSAMIC.utils.rule import get_reference_output_files -from BALSAMIC.utils.models import ReferenceMeta -from BALSAMIC.constants.reference import REFERENCE_FILES as REFERENCE_MODEL -from BALSAMIC.utils.cli import get_md5 -from BALSAMIC.utils.cli import create_md5 - -LOG = logging.getLogger(__name__) - -# explictly check if cluster_config dict has zero keys. -if len(cluster_config.keys()) == 0: - cluster_config = config - -genome_ver = config['genome_version'] - -# essential path reference files -basedir = os.path.join(config['output']) -genome_dir = os.path.join(basedir, "genome") - -# Set temporary dir environment variable -os.environ['TMPDIR'] = basedir - -REFERENCE_FILES = deepcopy(REFERENCE_MODEL) - -# intialize reference files -REFERENCE_FILES[genome_ver]['basedir'] = basedir -reference_file_model = ReferenceMeta.parse_obj(REFERENCE_FILES[genome_ver]) -reference_genome_url = reference_file_model.reference_genome -genome_chrom_size_url = reference_file_model.genome_chrom_size -refgene_txt_url = reference_file_model.refgene_txt -refgene_sql_url = reference_file_model.refgene_sql - -check_md5 = os.path.join(basedir, "reference.json.md5") - -shell.executable("/bin/bash") -shell.prefix("set -eo pipefail; ") - -singularity_image_path = config['singularity']['image_path'] -singularity_images = [Path(singularity_image_path, image_name + ".sif").as_posix() for image_name in config["singularity"]["containers"].keys()] - -########################################################## -# Generating Reference files for BALSAMIC pipeline -# Writing reference json file -########################################################## - -rule all: - input: - singularity_images, - reference_genome = reference_genome_url.get_output_file, - bwa_index = expand(reference_genome_url.get_output_file + "{ext}", ext=['.amb','.ann','.bwt','.pac','.sa']), - refgenome_fai = reference_genome_url.get_output_file + ".fai", - refgenome_dict = reference_genome_url.get_output_file.replace("fasta","dict"), - refseq_bed = refgene_txt_url.get_output_file.replace("txt", "flat") + ".bed", - refseq_flat = refgene_txt_url.get_output_file.replace("txt", "flat"), - refgene = refgene_txt_url.get_output_file, - genome_chrom_size = genome_chrom_size_url.get_output_file, - output: - finished = os.path.join(basedir,"reference.finished"), - reference_json = os.path.join(basedir, "reference.json"), - check_md5 = check_md5 - log: - os.path.join(basedir, "reference.json.log") - run: - import json - from datetime import datetime - - today = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - - ref_json = dict() - ref_json['reference'] = { - "reference_genome": input.reference_genome, - "exon_bed": 
input.refseq_bed, - "refflat": input.refseq_flat, - "refGene": input.refgene, - "genome_chrom_size": input.genome_chrom_size, - "reference_access_date": today, - } - - with open(str(output.reference_json), "w") as fh: - json.dump(ref_json, fh, indent=4) - - create_md5(ref_json['reference'], output.check_md5) - - with open(str(output.finished), mode='w') as finish_file: - finish_file.write('%s\n' % today ) - -########################################################### -# Download all singularity container images from dockerhub -########################################################### - -rule download_container: - output: singularity_images - run: - for image_name, docker_path in config["singularity"]["containers"].items(): - cmd = "singularity pull {}/{}.sif {}".format(config["singularity"]["image_path"], image_name, docker_path) - shell(cmd) - -########################################################## -# Download the reference genome, variant db -########################################################## -download_content = [reference_genome_url, genome_chrom_size_url, refgene_txt_url, refgene_sql_url] - -rule download_reference: - output: - expand("{output}", output=[ref.get_output_file for ref in download_content]) - run: - import requests - - for ref in download_content: - output_file = ref.get_output_file - log_file = output_file + ".log" - - cmd = "wget -a {} -O - {}".format(log_file, ref.url) - - if ref.gzip: - cmd += " | gunzip " - - cmd += " > {}".format(output_file) - shell(cmd) - ref.write_md5 - -########################################################## -# Preprocess refseq file by fetching relevant columns and -# standardize the chr column -########################################################## - -rule prepare_refgene: - input: - singularity_images, - refgene_txt = refgene_txt_url.get_output_file, - refgene_sql = refgene_sql_url.get_output_file, - params: - refgene_sql_awk = get_script_path('refseq_sql.awk'), - output: - refflat = refgene_txt_url.get_output_file.replace("txt", "flat"), - bed = refgene_txt_url.get_output_file.replace("txt", "flat") + ".bed", - log: - refgene_sql = os.path.join(basedir, "genome", "refgene_sql.log"), - refgene_txt = os.path.join(basedir, "genome", "refgene_txt.log") - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("bedtools") + ".sif").as_posix() - shell: - """ -header=$(awk -f {params.refgene_sql_awk} {input.refgene_sql}); -(echo \"$header\"; cat {input.refgene_txt};) \ -| csvcut -t -c chrom,exonStarts,exonEnds,name,score,strand,exonCount,txStart,txEnd,name2 \ -| csvformat -T \ -| bedtools expand -c 2,3 \ -| awk '$1~/chr[1-9]/ && $1!~/[_]/' | sort -k1,1 -k2,2n > {output.bed}; - -awk -v OFS=\"\\t\" '$3!~/_/ {{ gsub(\"chr\",\"chr\",$3); $1=$13; print }}' {input.refgene_txt} \ -| cut -f 1-11 > {output.refflat}; - """ - -########################################################## -# Create BWA Index for reference genome -########################################################## - -rule bwa_index: - input: - singularity_img = singularity_images, - reference_genome = reference_genome_url.get_output_file - output: - expand(reference_genome_url.get_output_file + "{ext}", ext=['.amb','.ann','.bwt','.pac','.sa']) - log: - reference_genome_url.get_output_file + ".bwa_index.log" - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("bwa") + ".sif").as_posix() - shell: - """ -bwa index -a bwtsw {input.reference_genome} 2> {log}; - """ - -########################################################## -# Create index 
for fasta file - .fai -########################################################## - -rule samtools_index_fasta: - input: - singularity_img = singularity_images, - reference_genome = reference_genome_url.get_output_file - output: - reference_genome_url.get_output_file + ".fai" - log: - reference_genome_url.get_output_file + ".faidx.log" - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("samtools") + ".sif").as_posix() - shell: - """ -samtools faidx {input.reference_genome} 2> {log}; - """ - - -########################################################## -# create reference dictionary using picard -########################################################## - -rule picard_ref_dict: - input: - singularity_img = singularity_images, - reference_genome = reference_genome_url.get_output_file - output: - reference_genome_url.get_output_file.replace("fasta","dict") - log: - reference_genome_url.get_output_file + ".ref_dict.log" - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("picard") + ".sif").as_posix() - shell: - """ -picard CreateSequenceDictionary REFERENCE={input.reference_genome} OUTPUT={output} 2> {log}; - """ - diff --git a/BALSAMIC/workflows/reference.smk b/BALSAMIC/workflows/reference.smk index 4cca11f1e..eb137b169 100644 --- a/BALSAMIC/workflows/reference.smk +++ b/BALSAMIC/workflows/reference.smk @@ -1,405 +1,55 @@ -# syntax=python tabstop=4 expandtab -# coding: utf-8 - -import os +"""Snakemake reference cache file.""" +import json import logging +import os from pathlib import Path +from typing import Dict -from copy import deepcopy - -from BALSAMIC.utils.rule import get_script_path -from BALSAMIC.utils.rule import get_reference_output_files -from BALSAMIC.utils.models import ReferenceMeta -from BALSAMIC.constants.reference import REFERENCE_FILES as REFERENCE_MODEL -from BALSAMIC.utils.cli import get_md5 -from BALSAMIC.utils.cli import create_md5 - +from BALSAMIC.constants.cache import Species, VEP_PLUGINS +from BALSAMIC.constants.constants import FileType +from BALSAMIC.constants.paths import BALSAMIC_DIR, REFSEQ_SCRIPT_PATH +from BALSAMIC.constants.rules import SNAKEMAKE_RULES +from BALSAMIC.models.cache import CacheConfig, AnalysisReferences +from BALSAMIC.utils.io import write_finish_file, write_json +from BALSAMIC.utils.rule import get_threads +from BALSAMIC.utils.utils import get_relative_paths_dict LOG = logging.getLogger(__name__) -# explictly check if cluster_config dict has zero keys. 
-if len(cluster_config.keys()) == 0: - cluster_config = config - - -# backward compatible genome version extraction from config -if 'genome_version' in config: - genome_ver = config['genome_version'] -else: - genome_ver = 'hg19' - -# essential path reference files -basedir = os.path.join(config['output']) -genome_dir = os.path.join(basedir, "genome") -vcf_dir = os.path.join(basedir, "variants") -vep_dir = os.path.join(basedir, "vep") -cosmicdb_key = config['cosmic_key'] - -# Set temporary dir environment variable -os.environ['TMPDIR'] = basedir +# Balsamic cache configuration model +cache_config: CacheConfig = CacheConfig.model_validate(config) -# indexable VCF files -# For future reference, if you delete this line pydantic fails in tests -# Don't know why, but don't delete it and keep deepcopy /A&H -REFERENCE_FILES = deepcopy(REFERENCE_MODEL) -indexable_vcf_files = get_reference_output_files(REFERENCE_FILES[genome_ver], - file_type='vcf', - gzip = True) - -# intialize reference files -REFERENCE_FILES[genome_ver]['basedir'] = basedir -reference_file_model = ReferenceMeta.parse_obj(REFERENCE_FILES[genome_ver]) - -reference_genome_url = reference_file_model.reference_genome -dbsnp_url = reference_file_model.dbsnp -hc_vcf_1kg_url = reference_file_model.hc_vcf_1kg -mills_1kg_url = reference_file_model.mills_1kg -known_indel_1kg_url = reference_file_model.known_indel_1kg -vcf_1kg_url = reference_file_model.vcf_1kg -gnomad_url = reference_file_model.gnomad_variant -gnomad_tbi_url = reference_file_model.gnomad_variant_index -cosmicdb_url = reference_file_model.cosmicdb -wgs_calling_url = reference_file_model.wgs_calling -genome_chrom_size_url = reference_file_model.genome_chrom_size -refgene_txt_url = reference_file_model.refgene_txt -refgene_sql_url = reference_file_model.refgene_sql -rankscore_url = reference_file_model.rankscore -access_regions_url = reference_file_model.access_regions -delly_exclusion_url = reference_file_model.delly_exclusion -delly_mappability_url = reference_file_model.delly_mappability -delly_mappability_gindex_url = reference_file_model.delly_mappability_gindex -delly_mappability_findex_url = reference_file_model.delly_mappability_findex -ascat_gccorrection_url = reference_file_model.ascat_gccorrection -ascat_chryloci_url = reference_file_model.ascat_chryloci -clinvar_url = reference_file_model.clinvar -somalier_sites_url = reference_file_model.somalier_sites +# Temporary directory and shell options +os.environ["TMPDIR"] = cache_config.references_dir.as_posix() +shell.executable("/bin/bash") +shell.prefix("set -euo pipefail; ") -# add secrets from config to items that need them -cosmicdb_url.secret=config['cosmic_key'] +# Rules to include +for rule in SNAKEMAKE_RULES["cache"][cache_config.genome_version]: -check_md5 = os.path.join(basedir, "reference.json.md5") + include: Path(BALSAMIC_DIR, rule).as_posix() -shell.executable("/bin/bash") -shell.prefix("set -eo pipefail; ") -singularity_image_path = config['singularity']['image_path'] -singularity_images = [Path(singularity_image_path, image_name + ".sif").as_posix() for image_name in config["singularity"]["containers"].keys()] +LOG.info( + f"The rules {SNAKEMAKE_RULES['cache'][cache_config.genome_version]} will be included in the reference workflow" +) -########################################################## -# Generating Reference files for BALSAMIC pipeline -# Writing reference json file -########################################################## rule all: + """Target rule for Balsamic cache generation.""" input: - 
singularity_images, - reference_genome = reference_genome_url.get_output_file, - bwa_index = expand(reference_genome_url.get_output_file + "{ext}", ext=['.amb','.ann','.bwt','.pac','.sa']), - refgenome_fai = reference_genome_url.get_output_file + ".fai", - refgenome_dict = reference_genome_url.get_output_file.replace("fasta","dict"), - refseq_bed = refgene_txt_url.get_output_file.replace("txt", "flat") + ".bed", - refseq_flat = refgene_txt_url.get_output_file.replace("txt", "flat"), - refgene = refgene_txt_url.get_output_file, - dbsnp_vcf = dbsnp_url.get_output_file + ".gz", - th_genome_vcf = vcf_1kg_url.get_output_file + ".gz", - tg_high_vcf = hc_vcf_1kg_url.get_output_file+ ".gz", - mills_1kg = mills_1kg_url.get_output_file + ".gz", - known_indel_1kg = known_indel_1kg_url.get_output_file + ".gz", - gnomad_variant_vcf = gnomad_url.get_output_file, - gnomad_variant_index = gnomad_tbi_url.get_output_file, - cosmic_vcf = cosmicdb_url.get_output_file + ".gz", - variants_idx = expand(os.path.join(vcf_dir,"{vcf}.gz.tbi"), vcf=indexable_vcf_files), - vep = directory(vep_dir), - wgs_calling = wgs_calling_url.get_output_file, - genome_chrom_size = genome_chrom_size_url.get_output_file, - rankscore = rankscore_url.get_output_file, - access_regions = access_regions_url.get_output_file, - delly_exclusion = delly_exclusion_url.get_output_file, - delly_exclusion_converted = delly_exclusion_url.get_output_file.replace(".tsv", "_converted.tsv"), - delly_mappability= delly_mappability_url.get_output_file, - delly_mappability_gindex= delly_mappability_gindex_url.get_output_file, - delly_mappability_findex= delly_mappability_findex_url.get_output_file, - ascat_gccorrection = ascat_gccorrection_url.get_output_file, - ascat_chryloci = ascat_chryloci_url.get_output_file, - clinvar = clinvar_url.get_output_file + ".gz", - somalier_sites = somalier_sites_url.get_output_file + ".gz", - output: - finished = os.path.join(basedir,"reference.finished"), - reference_json = os.path.join(basedir, "reference.json"), - check_md5 = check_md5 - log: - os.path.join(basedir, "reference.json.log") - run: - import json - from datetime import datetime - - today = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - - ref_json = dict() - ref_json['reference'] = { - "reference_genome": input.reference_genome, - "dbsnp": input.dbsnp_vcf, - "1kg_snps_all": input.th_genome_vcf, - "1kg_snps_high": input.tg_high_vcf, - "1kg_known_indel": input.known_indel_1kg, - "mills_1kg": input.mills_1kg, - "gnomad_variant": input.gnomad_variant_vcf, - "cosmic": input.cosmic_vcf, - "exon_bed": input.refseq_bed, - "refflat": input.refseq_flat, - "refGene": input.refgene, - "wgs_calling_interval": input.wgs_calling, - "genome_chrom_size": input.genome_chrom_size, - "vep": input.vep, - "rankscore": input.rankscore, - "access_regions": input.access_regions, - "delly_exclusion" : input.delly_exclusion, - "delly_exclusion_converted" : input.delly_exclusion_converted, - "delly_mappability": input.delly_mappability, - "ascat_gccorrection" : input.ascat_gccorrection, - "ascat_chryloci" : input.ascat_chryloci, - "clinvar": input.clinvar, - "somalier_sites": input.somalier_sites, - "reference_access_date": today, - } - - with open(str(output.reference_json), "w") as fh: - json.dump(ref_json, fh, indent=4) - - create_md5(ref_json['reference'], output.check_md5) - - with open(str(output.finished), mode='w') as finish_file: - finish_file.write('%s\n' % today ) - -########################################################### -# Download all singularity container images 
from dockerhub -########################################################### -wildcard_constraints: - container_image = "|".join(list(config["singularity"]["containers"])), - -def download_container_file(output_file: str): - image_name = Path(output_file).stem - docker_path = config["singularity"]["containers"][image_name] - cmd = "singularity pull {}/{}.sif {}".format(config["singularity"]["image_path"],image_name,docker_path) - shell(cmd) - -rule download_container: - output: Path(singularity_image_path, "{container_image}" + ".sif").as_posix(), - run: - download_container_file(output_file=output[0]) - -########################################################## -# Download the reference genome, variant db -########################################################## -download_content = [reference_genome_url, dbsnp_url, hc_vcf_1kg_url, - mills_1kg_url, known_indel_1kg_url, vcf_1kg_url, - wgs_calling_url, genome_chrom_size_url, - gnomad_url, gnomad_tbi_url, - cosmicdb_url, refgene_txt_url, refgene_sql_url, rankscore_url, access_regions_url, - delly_exclusion_url, delly_mappability_url, delly_mappability_gindex_url, - delly_mappability_findex_url, ascat_gccorrection_url, ascat_chryloci_url, clinvar_url, - somalier_sites_url] - -download_dict = dict([(ref.get_output_file, ref) for ref in download_content]) - -def download_reference_file(output_file: str): - import requests - - ref = download_dict[output_file] - log_file = output_file + ".log" - - if ref.url.scheme == "gs": - cmd = "export TMPDIR=/tmp; gsutil cp -L {} {} -".format(log_file,ref.url) - else: - cmd = "wget -a {} -O - {}".format(log_file,ref.url) - - if ref.secret: - try: - response = requests.get(ref.url,headers={'Authorization': 'Basic %s' % ref.secret}) - download_url = response.json()["url"] - except: - LOG.error("Unable to download {}".format(ref.url)) - raise - cmd = "curl -o - '{}'".format(download_url) - - if ref.gzip: - cmd += " | gunzip " - - cmd += " > {}".format(output_file) - shell(cmd) - ref.write_md5 - -ref_subdirs = set([ref.output_path for ref in download_content]) -ref_files = set([ref.output_file for ref in download_content]) - -wildcard_constraints: - ref_subdir="|".join(ref_subdirs), - ref_file = "|".join(ref_files), - - -rule download_reference: + cache_config.get_container_output_paths(), + cache_config.get_reference_output_paths(), output: - Path("{ref_subdir}","{ref_file}").as_posix(), + finish_file=f"{cache_config.references_dir.as_posix()}/reference.finish", + threads: get_threads(cluster_config=cluster_config, rule_name="all") run: - download_reference_file(output_file=output[0]) - - - - -########################################################## -# Preprocess refseq file by fetching relevant columns and -# standardize the chr column -########################################################## - -rule prepare_refgene: - input: - singularity_images, - refgene_txt = refgene_txt_url.get_output_file, - refgene_sql = refgene_sql_url.get_output_file, - accessible_regions = access_regions_url.get_output_file, - params: - refgene_sql_awk = get_script_path('refseq_sql.awk'), - output: - refflat = refgene_txt_url.get_output_file.replace("txt", "flat"), - bed = refgene_txt_url.get_output_file.replace("txt", "flat") + ".bed", - log: - refgene_sql = os.path.join(basedir, "genome", "refgene_sql.log"), - refgene_txt = os.path.join(basedir, "genome", "refgene_txt.log") - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("bedtools") + ".sif").as_posix() - shell: - """ -header=$(awk -f 
{params.refgene_sql_awk} {input.refgene_sql}); -(echo \"$header\"; cat {input.refgene_txt};) \ -| csvcut -t -c chrom,exonStarts,exonEnds,name,score,strand,exonCount,txStart,txEnd,name2 \ -| csvformat -T \ -| bedtools expand -c 2,3 \ -| awk '$1~/chr[1-9]/ && $1!~/[_]/' | cut -c 4- | sort -k1,1 -k2,2n > {output.bed}; - -awk -v OFS=\"\\t\" '$3!~/_/ {{ gsub(\"chr\",\"\",$3); $1=$13; print }}' {input.refgene_txt} \ -| cut -f 1-11 > {output.refflat}; -sed -i 's/chr//g' {input.refgene_txt}; -sed -i 's/chr//g' {input.accessible_regions}; - """ - -########################################################## -# bgzip and tabix the vcf files that are vcf -########################################################## - -rule bgzip_tabix: - input: - singularity_img = singularity_images, - vcf = os.path.join(vcf_dir, "{vcf}.vcf") - params: - type = 'vcf', - output: - os.path.join(vcf_dir, "{vcf}.vcf.gz"), - os.path.join(vcf_dir, "{vcf}.vcf.gz.tbi") - log: - os.path.join(vcf_dir, "{vcf}.vcf.gz_tbi.log") - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("tabix") + ".sif").as_posix() - shell: - """ -bgzip {input.vcf} && tabix -p {params.type} {input.vcf}.gz 2> {log}; - """ - - -########################################################## -# Create BWA Index for reference genome -########################################################## - -rule bwa_index: - input: - singularity_img = singularity_images, - reference_genome = reference_genome_url.get_output_file - output: - expand(reference_genome_url.get_output_file + "{ext}", ext=['.amb','.ann','.bwt','.pac','.sa']) - log: - reference_genome_url.get_output_file + ".bwa_index.log" - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("bwa") + ".sif").as_posix() - shell: - """ -bwa index -a bwtsw {input.reference_genome} 2> {log}; - """ - -########################################################## -# Create index for fasta file - .fai -########################################################## - -rule samtools_index_fasta: - input: - singularity_img = singularity_images, - reference_genome = reference_genome_url.get_output_file - output: - reference_genome_url.get_output_file + ".fai" - log: - reference_genome_url.get_output_file + ".faidx.log" - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("samtools") + ".sif").as_posix() - shell: - """ -samtools faidx {input.reference_genome} 2> {log}; - """ - - -########################################################## -# create reference dictionary using picard -########################################################## - -rule picard_ref_dict: - input: - singularity_img = singularity_images, - reference_genome = reference_genome_url.get_output_file - output: - reference_genome_url.get_output_file.replace("fasta","dict") - log: - reference_genome_url.get_output_file + ".ref_dict.log" - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("picard") + ".sif").as_posix() - shell: - """ -picard CreateSequenceDictionary REFERENCE={input.reference_genome} OUTPUT={output} 2> {log}; - """ - - -########################################################## -# ENSEMBL VEP - download and install vep package, -# cache conversion -########################################################## - -rule vep_install: - input: - singularity_img = singularity_images - params: - species = "homo_sapiens_merged", - assembly = "GRCh37" if genome_ver == 'hg19' else "GRCh38", - plugins = "all", - output: - directory(vep_dir) - log: - os.path.join(vep_dir, 
"vep_install_cache.log") - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() - shell: - """ -vep_install --SPECIES {params.species} \ ---AUTO cfp \ ---ASSEMBLY {params.assembly} \ ---CACHEDIR {output} \ ---PLUGINS {params.plugins} \ ---NO_HTSLIB --CONVERT --NO_UPDATE 2> {log}; - """ - -########################################################## -# Remove chr from delly exclusion -########################################################## - -rule prepare_delly_exclusion: - input: - singularity_img = singularity_images, - delly_exclusion = delly_exclusion_url.get_output_file, - output: - delly_exclusion_converted = delly_exclusion_url.get_output_file.replace(".tsv", "_converted.tsv"), - log: - os.path.join(basedir, "genome", "delly_exclusion.log"), - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("delly") + ".sif").as_posix() - shell: - """ -sed 's/chr//g' {input.delly_exclusion} > {output.delly_exclusion_converted} 2> {log} - """ + analysis_references: Dict[str, str] = get_relative_paths_dict( + base_path=cache_config.references_dir, + data=cache_config.get_analysis_references().model_dump(), + ) + write_json( + json_obj=analysis_references, + path=Path(cache_config.references_dir, "reference.json").as_posix(), + ) + write_finish_file(file_path=output.finish_file) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ab0cd9889..f5e05d067 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,110 @@ +[13.0.0] +------- + +Added: +^^^^^^ +* Fastq concatenation https://github.com/Clinical-Genomics/BALSAMIC/pull/1069 +* `CADD` SNV references https://github.com/Clinical-Genomics/BALSAMIC/pull/1126 +* `CADD` SNV annotation https://github.com/Clinical-Genomics/BALSAMIC/pull/1150 +* Samtools `stats`, `flagstat`, `idxstat` to WGS workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1176 +* Functionality for dynamically assigning fastq-info to sample dict in config from input fastq-dir https://github.com/Clinical-Genomics/BALSAMIC/pull/1176 +* Annotate SNVs with cancer germline SNV observations from Loqusdb https://github.com/Clinical-Genomics/BALSAMIC/pull/1178 +* Annotate SNVs with somatic SNV observations from Loqusdb https://github.com/Clinical-Genomics/BALSAMIC/pull/1187 +* Tests for Annotation with Cancer germline, somatic and clinical observations, and swegen frequencies https://github/Clinical-Genomics/BALSAMIC/pull/1190 +* Annotate SVs with somatic SV observations from Loqusdb https://github.com/Clinical-Genomics/BALSAMIC/pull/1194 +* Support singularity bind paths with different destination directories https://github/Clinical-Genomics/BALSAMIC/pull/1211 +* Added `--rerun-trigger mtime` option to Snakemake command https://github.com/Clinical-Genomics/BALSAMIC/pull/1217 +* `CADD` container https://github.com/Clinical-Genomics/BALSAMIC/pull/1222 +* Container ettiquette to ReadtheDocs https://github.com/Clinical-Genomics/BALSAMIC/pull/1232 +* `htslib` (samtools, bcftools tabix) container https://github.com/Clinical-Genomics/BALSAMIC/pull/1234 +* Release version support for cache generation https://github.com/Clinical-Genomics/BALSAMIC/pull/1231 +* `CADD` scores for INDELs https://github.com/Clinical-Genomics/BALSAMIC/pull/1238 +* `CADD` reference to tests https://githuc.com/Clinical-Genomics/BALSAMIC/pull/1241 +* Add cache version option to config case https://github.com/Clinical-Genomics/BALSAMIC/pull/1244 +* `cnvkit` container https://github.com/Clinical-Genomics/BALSAMIC/pull/1252 +* `PureCN` container 
https://github.com/Clinical-Genomics/BALSAMIC/pull/1255 +* `GATK` container https://github.com/Clinical-Genomics/BALSAMIC/pull/1266 +* Resolved FASTQ paths to sample dictionary (balsamic logging) https://github.com/Clinical-Genomics/BALSAMIC/pull/1275 +* Picard HsMetrics and CollectGcBiasMetrics for WGS https://github.com/Clinical-Genomics/BALSAMIC/pull/1288 +* `LOH` to TGA workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1278 +* CNVs from PureCN to TGA workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1278 +* Command-line arguments and rules for creation of GENS files https://github.com/Clinical-Genomics/BALSAMIC/pull/1279 +* Somatic and germline Loqusdb annotation to ReadtheDocs https://github.com/Clinical-Genomics/BALSAMIC/pull/1317 +* Postprocess step before VarDict in TGA https://github.com/Clinical-Genomics/BALSAMIC/pull/1332 +* CNV report for TGA workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1339 +* `wkhtmltopdf` to system requirements https://github.com/Clinical-Genomics/BALSAMIC/pull/1339 +* Store WGS CNV report plots https://github.com/Clinical-Genomics/BALSAMIC/pull/1347 + +Changed: +^^^^^^^^ +* Changed CN header field in cnvpytor in cnvpytor_tumor_only to be Float instead of Integer https://github.com/Clinical-Genomics/BALSAMIC/pull/1182 +* Changed samples in case_config.json from being a dict to a list of dicts https://github.com/Clinical-Genomics/BALSAMIC/pull/1176 +* Updated snakemake version to 7.25.0 https://github.com/Clinical-Genomics/BALSAMIC/pull/1099 +* Updated cryptography version to 41.0.1 https://github.com/Clinical-Genomics/BALSAMIC/pull/1173 +* Refactor bam and fastq inputs in snakemake to call pydantic model functions https://github.com/Clinical-Genomics/BALSAMIC/pull/1176 +* Standardised alignment workflows to WGS-workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1176 +* Implemented parallel trimming and alignment in all workflows per lane https://github.com/Clinical-Genomics/BALSAMIC/pull/1176 +* All bam-QC tools take the final dedup.realign bamfile as input https://github.com/Clinical-Genomics/BALSAMIC/pull/1176 +* Validation of pydantic models done both during config and run https://github.com/Clinical-Genomics/BALSAMIC/pull/1176 +* Refactored fastp rules, and changed order of UMI-trimming and quality trimming https://github.com/Clinical-Genomics/BALSAMIC/pull/1176 +* Fix pydantic version (<2.0) https://github.com/Clinical-Genomics/BALSAMIC/pull/1191 +* Refactor constants https://github.com/Clinical-Genomics/BALSAMIC/pull/1174 +* Move models to their own folder https://github.com/Clinical-Genomics/BALSAMIC/pull/1176 +* Balsamic init workflow refactoring https://github.com/Clinical-Genomics/BALSAMIC/pull/1188 +* Updated cryptography version to 41.0.2 https://github.com/Clinical-Genomics/BALSAMIC/pull/1205 +* Refactor snakemake executable command generation https://github.com/Clinical-Genomics/BALSAMIC/pull/1211 +* Updated Python version to 3.11 and its dependencies https://github.com/Clinical-Genomics/BALSAMIC/pull/1216 +* Tools versions in doc https://github.com/Clinical-Genomics/BALSAMIC/pull/1239 +* Reuse common Balsamic CLI options https://github.com/Clinical-Genomics/BALSAMIC/pull/1242 +* Update `reference.json` file to use relative paths https://github.com/Clinical-Genomics/BALSAMIC/pull/1251 +* Update pydantic to v2 while maintaining support for v1 models https://github.com/Clinical-Genomics/BALSAMIC/pull/1253 +* `PCT_PF_READS_IMPROPER_PAIRS` QC threshold lowered to 5%
https://github.com/Clinical-Genomics/BALSAMIC/issues/1265 +* Migrate Metrics models to pydantic v2 https://github.com/Clinical-Genomics/BALSAMIC/pull/1270 +* Migrate Snakemake models to pydantic v2 https://github.com/Clinical-Genomics/BALSAMIC/pull/1268 +* Migrate Cache models to pydantic v2 https://github.com/Clinical-Genomics/BALSAMIC/pull/1274 +* Made BALSAMIC compatible with multiple PON creation workflows https://github.com/Clinical-Genomics/BALSAMIC/pull/1279 +* Use StrEnum from python enum https://github.com/Clinical-Genomics/BALSAMIC/pull/1303 +* Renamed final cram bamfile to format `..cram` https://github.com/Clinical-Genomics/BALSAMIC/pull/1307 +* Updated snakemake version to 7.32.4 https://github.com/Clinical-Genomics/BALSAMIC/pull/1308 +* Migrate analysis models to pydantic v2 https://github.com/Clinical-Genomics/BALSAMIC/pull/1306 +* Split analysis model into config and params models https://github.com/Clinical-Genomics/BALSAMIC/pull/1306 +* Renamed name in sample column of final clinical VCFs https://github.com/Clinical-Genomics/BALSAMIC/pull/1310 +* Update Gens HK tags https://github.com/Clinical-Genomics/BALSAMIC/pull/1319 +* Increased memory and threads for VarDict https://github.com/Clinical-Genomics/BALSAMIC/pull/1332 +* Updated ReadtheDocs with GENS and structural pipeline changes https://github.com/Clinical-Genomics/BALSAMIC/pull/1327 +* Migrate WGS CNV report generation to pypdf & pdfkit https://github.com/Clinical-Genomics/BALSAMIC/pull/1346 + +Fixed: +^^^^^^ +* vcf2cytosure container https://github.com/Clinical-Genomics/BALSAMIC/pull/1159 +* Link external fastqs to case folder & create case directory https://github.com/Clinical-Genomics/BALSAMIC/pull/1195 +* vcf2cytosure container missing constants https://github.com/Clinical-Genomics/BALSAMIC/pull/1198 +* Bash commands in vep_somatic_clinical_snv https://github.com/Clinical-Genomics/BALSAMIC/pull/1200 +* Fix SVDB annotation intermediate rule https://github.com/Clinical-Genomics/BALSAMIC/pull/1218 +* Broken documentation links https://github.com/Clinical-Genomics/BALSAMIC/pull/1226 +* Updated contributors in main README https://github.com/Clinical-Genomics/BALSAMIC/pull/1237 +* CNVpytor container https://github.com/Clinical-Genomics/BALSAMIC/pull/1246 +* Restored balsamic container in UMI concatenation rule https://github.com/Clinical-Genomics/BALSAMIC/pull/1261 +* CNVpytor container, fixing numpy version https://github.com/Clinical-Genomics/BALSAMIC/pull/1273 +* QC workflow store https://github.com/Clinical-Genomics/BALSAMIC/pull/1295 +* MultiQC rule missing input files https://github.com/Clinical-Genomics/BALSAMIC/pull/1321 +* `gens_preprocessing` rule missing python directive https://github.com/Clinical-Genomics/BALSAMIC/pull/1322 +* CADD annotations container path and code smells https://github.com/Clinical-Genomics/BALSAMIC/pull/1323 +* Sonarcloud reported issues https://github.com/Clinical-Genomics/BALSAMIC/pull/1348 +* Loqusdb SV annotation somatic fields https://github.com/Clinical-Genomics/BALSAMIC/pull/1354 + +Removed: +^^^^^^^^ +* Config folder https://github.com/Clinical-Genomics/BALSAMIC/pull/1175 +* Quality trimming of fastqs for UMI workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1176 +* Balsamic container https://github.com/Clinical-Genomics/BALSAMIC/pull/1230 +* Plugin CLI https://github.com/Clinical-Genomics/BALSAMIC/pull/1245 +* Realignment step for TGA workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1272 +* Archived/outdated workflows and scripts
https://github.com/Clinical-Genomics/BALSAMIC/pull/1296 +* Sed command to convert CNVpytor integer to float, deprecated by updated CNVpytor version https://github.com/Clinical-Genomics/BALSAMIC/pull/1310 +* Removed max AF 1 filter from bcftools https://github.com/Clinical-Genomics/BALSAMIC/pull/1338 +* Extra samtools sort command from WGS cases https://github.com/Clinical-Genomics/BALSAMIC/pull/1334 + [12.0.2] -------- diff --git a/README.rst b/README.rst index 3799b8721..ee78c9bb7 100644 --- a/README.rst +++ b/README.rst @@ -2,7 +2,7 @@

- +

Bioinformatic Analysis pipeLine for SomAtic MutatIons in Cancer (v 12.0.2)

FastQ to Annotated VCF

@@ -11,14 +11,14 @@ - + BALSAMIC is basically a wrapper around its core workflow manager. The goal is to provide a package with a well-defined CLI that makes it reproducible for users to run somatic calling regardless of the workflow manager at its core. Right now, BALSAMIC uses Snakemake as its core, so one can run a sample using the workflows available within this package and the standard Snakemake CLI, given that a proper config file has been created. -.. list-table:: +.. list-table:: :widths: 20 50 :header-rows: 0 :stub-columns: 1 @@ -35,7 +35,7 @@ Snakemake cli given that there is a proper config file created. - |test_status_badge| * - Container latest release status - |docker_latest_release_status| - * - Container master and develop status + * - Container master and develop status - |docker_latest_build_status| * - Code coverage - |code_cov_badge|_ @@ -44,21 +44,21 @@ Snakemake cli given that there is a proper config file created. * - Dependencies - |snakemake_badge| |singularity_badge| * - Contributors - - @ashwini06, @ivadym, @khurrammaqbool, @keyvanelhami, @mropat, @imsarath + - @ashwini06, @ivadym, @khurrammaqbool, @keyvanelhami, @mropat, @imsarath, @rannick, @fevac, @mathiasbio -.. |code_cov_badge| image:: https://codecov.io/gh/Clinical-Genomics/BALSAMIC/branch/develop/graph/badge.svg?token=qP68U3PNwV +.. |code_cov_badge| image:: https://codecov.io/gh/Clinical-Genomics/BALSAMIC/branch/develop/graph/badge.svg?token=qP68U3PNwV .. _code_cov_badge: https://codecov.io/gh/Clinical-Genomics/BALSAMIC .. |latest_tag| image:: https://img.shields.io/github/v/tag/clinical-genomics/BALSAMIC .. |test_status_badge| image:: https://github.com/Clinical-Genomics/BALSAMIC/actions/workflows/pytest_and_coveralls.yml/badge.svg -.. |docker_latest_build_status| image:: https://github.com/Clinical-Genomics/BALSAMIC/actions/workflows/docker_build_push.yml/badge.svg +.. |docker_latest_build_status| image:: https://github.com/Clinical-Genomics/BALSAMIC/actions/workflows/docker_build_push.yml/badge.svg + +.. |docker_latest_release_status| image:: https://github.com/Clinical-Genomics/BALSAMIC/actions/workflows/docker_build_push_release.yml/badge.svg?tag=v12.0.2 -.. |docker_latest_release_status| image:: https://github.com/Clinical-Genomics/BALSAMIC/actions/workflows/docker_build_push_release.yml/badge.svg?tag=v12.0.2 - -.. |snakemake_badge| image:: https://img.shields.io/badge/snakemake-%E2%89%A55.12.3-brightgreen.svg +.. |snakemake_badge| image:: https://img.shields.io/badge/snakemake-%E2%89%A55.12.3-brightgreen.svg ..
|singularity_badge| image:: https://img.shields.io/badge/singularity-%E2%89%A53.1.1-brightgreen.svg diff --git a/container_tests/varcall_cnvkit/varcall_cnvkit.sh b/container_tests/cadd/cadd.sh similarity index 84% rename from container_tests/varcall_cnvkit/varcall_cnvkit.sh rename to container_tests/cadd/cadd.sh index bcfb7a61d..bb7e8470a 100644 --- a/container_tests/varcall_cnvkit/varcall_cnvkit.sh +++ b/container_tests/cadd/cadd.sh @@ -1,7 +1,7 @@ #!/bin/bash # Test if commands exist -valid_commands=( "cnvkit.py" "bcftools" "tabix" ) +valid_commands=( "CADD.sh" "snakemake" ) for valid_command in "${valid_commands[@]}" do @@ -13,3 +13,4 @@ do echo "${valid_command} command is found and valid" fi done + diff --git a/container_tests/cnvkit/cnvkit.sh b/container_tests/cnvkit/cnvkit.sh new file mode 100644 index 000000000..30c661fd3 --- /dev/null +++ b/container_tests/cnvkit/cnvkit.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Test if commands exist + +valid_commands=( "cnvkit.py" "R" ) + +for valid_command in "${valid_commands[@]}" +do + if ! command -v "${valid_command}" &> /dev/null + then + echo "${valid_command} could not be found" + exit 1 + else + echo "${valid_command} command is found and valid" + fi +done + + diff --git a/container_tests/gatk/gatk.sh b/container_tests/gatk/gatk.sh new file mode 100644 index 000000000..05c958bbc --- /dev/null +++ b/container_tests/gatk/gatk.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Test if commands exist + +valid_commands=( "gatk" ) + +for valid_command in "${valid_commands[@]}" +do + if ! command -v "${valid_command}" &> /dev/null + then + echo "${valid_command} could not be found" + exit 1 + else + echo "${valid_command} command is found and valid" + fi +done diff --git a/container_tests/htslib/htslib.sh b/container_tests/htslib/htslib.sh new file mode 100644 index 000000000..9ea3c1442 --- /dev/null +++ b/container_tests/htslib/htslib.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# Test if commands exist + +valid_commands=( "samtools" "bcftools" "tabix" "bgzip" ) + +for valid_command in "${valid_commands[@]}" +do + if ! command -v "${valid_command}" &> /dev/null + then + echo "${valid_command} could not be found" + exit 1 + else + echo "${valid_command} command is found and valid" + fi +done + diff --git a/container_tests/purecn/purecn.sh b/container_tests/purecn/purecn.sh new file mode 100644 index 000000000..c7c92a572 --- /dev/null +++ b/container_tests/purecn/purecn.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Test if commands exist + +valid_commands=( "R" ) + +for valid_command in "${valid_commands[@]}" +do + if ! command -v "${valid_command}" &> /dev/null + then + echo "${valid_command} could not be found" + exit 1 + else + echo "${valid_command} command is found and valid" + fi +done + + diff --git a/docs/README.rst b/docs/README.rst index 71cc679a6..8f30be7b4 100644 --- a/docs/README.rst +++ b/docs/README.rst @@ -1,6 +1,9 @@ -========= -Build doc -========= +======================= +Documentation Guideline +======================= + +BALSAMIC uses Sphinx to build the documentation; see the official Sphinx documentation: https://www.sphinx-doc.org/en/master/index.html + The following steps explain how to build the documentation locally. @@ -8,13 +11,14 @@ Create a conda environment: .. code-block:: - conda create -n balsamic_doc -c bioconda -c conda-forge python=3.7 pip pygraphviz + conda create -n balsamic_doc -c bioconda -c conda-forge python=3.11 pip pygraphviz wkhtmltopdf conda activate balsamic_doc Install Sphinx and extensions: .. code-block:: + cd /path/to/BALSAMIC python -m pip install --upgrade --upgrade-strategy eager --no-cache-dir . cd docs pip install -r requirements.txt -r ../requirements-dev.txt
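With the requirements installed, the HTML pages can then be built locally; a minimal sketch, run from the ``docs`` directory (the output directory name is arbitrary and not prescribed by the repository):

.. code-block::

    # build the HTML documentation into _build/html
    sphinx-build -b html . _build/html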
diff --git a/docs/balsamic_annotation.rst b/docs/balsamic_annotation.rst index 60823026d..0d8e7954d 100644 --- a/docs/balsamic_annotation.rst +++ b/docs/balsamic_annotation.rst @@ -1,12 +1,15 @@ -*********************************** +********************** Annotation resources -*********************************** +********************** BALSAMIC annotates somatic single nucleotide variants (SNVs) using ``ensembl-vep`` and ``vcfanno``. Somatic structural variants (SVs), somatic copy-number variants (CNVs) and germline single nucleotide variants are annotated using only ``ensembl-vep``. All SVs and CNVs are merged using ``SVDB`` before annotating for `Target Genome Analysis (TGA)` or `Whole Genome Sequencing (WGS)` analyses. +gnomAD +------ + `BALSAMIC` adds the following annotations from the `gnomAD` database using ``vcfanno``. -.. list-table:: gnomAD +.. list-table:: gnomAD annotations :widths: 50 50 :header-rows: 1 @@ -17,9 +20,12 @@ BALSAMIC annotates somatic single nucleotide variants (SNVs) using ``ensembl-vep * - GNOMADAF - fraction of the reads supporting the alternate allele, allelic frequency +ClinVar +------- + `BALSAMIC` adds the following annotations from the `ClinVar` database using ``vcfanno``. -.. list-table:: ClinVar +.. list-table:: ClinVar annotations :widths: 50 50 :header-rows: 1 @@ -40,7 +46,7 @@ BALSAMIC annotates somatic single nucleotide variants (SNVs) using ``ensembl-vep The values for `ORIGIN` are described below: -.. list-table:: ORIGIN +.. list-table:: ClinVar ORIGIN :widths: 25 25 :header-rows: 1 @@ -71,9 +77,12 @@ The values for `ORIGIN` are described below: * - 1073741824 - other +COSMIC +------ + `BALSAMIC` uses `ensembl-vep` to add the following annotations from the `COSMIC` database. -.. list-table:: COSMIC +.. list-table:: COSMIC annotations :widths: 50 50 :header-rows: 1 @@ -90,6 +99,116 @@ The values for `ORIGIN` are described below: * - COSMIC_AA - peptide annotation +CADD +---- + +`BALSAMIC` adds the following annotation for SNVs from the `CADD` database using ``vcfanno``. + +.. list-table:: CADD annotations + :widths: 50 50 + :header-rows: 1 + + * - VCF tag + - description + * - CADD + - Combined Annotation Dependent Depletion + +LoqusDB somatic frequencies (cancer cases) +------------------------------------------ + +.. list-table:: LoqusDB Somatic Annotations + :widths: 50 150 50 + :header-rows: 1 + + * - VCF tag + - description + - variant type + * - Cancer_Somatic_Frq + - Frequency of observation for somatic mutations + - SNV, SV + * - Cancer_Somatic_Obs + - allele counts of the somatic variant + - SNV, SV + * - Cancer_Somatic_Hom + - allele counts of the homozygous somatic variant + - SNV
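Once a VCF has been annotated, INFO tags like the ones above can be spot-checked with ``bcftools``; a minimal sketch, assuming the tags are defined in the VCF header (the file name is hypothetical, and a tag absent from a record prints as ``.``):

.. code-block::

    # print a few records with selected annotation tags
    bcftools query \
        -f '%CHROM\t%POS\t%REF\t%ALT\t%INFO/GNOMADAF\t%INFO/CADD\t%INFO/Cancer_Somatic_Frq\n' \
        case.snv.annotated.vcf.gz | head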
+LoqusDB germline frequencies (cancer cases) +------------------------------------------- + +.. list-table:: LoqusDB germline SNV annotations + :widths: 50 150 50 + :header-rows: 1 + + * - VCF tag + - description + - variant type + * - Cancer_Germline_Frq + - Frequency of observation for germline mutations + - SNV + * - Cancer_Germline_Obs + - allele counts of the germline variant + - SNV + * - Cancer_Germline_Hom + - allele counts of the homozygous germline variant + - SNV + +LoqusDB germline frequencies (non-cancer cases) +------------------------------------------------ + +`BALSAMIC` adds the following annotations from a database of `non-cancer clinical` samples using ``vcfanno`` for SNVs and SVDB for SVs. + +.. list-table:: LoqusDB germline (non-cancer) SNV annotations + :widths: 50 150 50 + :header-rows: 1 + + * - VCF tag + - description + - variant type + * - Frq + - Frequency of observation of the variants from normal `non-cancer clinical` samples + - SNV, SV + * - Obs + - allele counts of the variant in normal `non-cancer clinical` samples + - SNV + * - Hom + - allele counts of the homozygous variant in normal `non-cancer clinical` samples + - SNV + * - clin_obs + - allele counts + - SV + +SWEGEN +------ + +`BALSAMIC` adds the following annotations from the `SWEGEN` database using ``vcfanno`` for SNVs and SVDB for SVs. + +.. list-table:: SweGen SNV annotations + :widths: 50 150 50 + :header-rows: 1 + + * - VCF tag + - description + - variant type + * - SWEGENAF + - allele frequency from the 1000 Swedish genomes project + - SNV, SV + * - SWEGENAAC_Hom + - allele counts of homozygous variants + - SNV + * - SWEGENAAC_Het + - allele counts of heterozygous variants + - SNV + * - SWEGENAAC_Hemi + - allele counts of hemizygous variants + - SNV + * - swegen_obs + - allele count + - SV + + +ENSEMBL-VEP annotations +----------------------- Where relevant, `BALSAMIC` uses `ensembl-vep` to annotate somatic and germline SNVs and somatic SVs/CNVs from `1000genomes (phase3)`, `ClinVar`, `ESP, HGMD-PUBLIC`, `dbSNP`, `gencode`, `gnomAD`, `polyphen`, `refseq`, and `sift` databases. The following annotations are added by `ensembl-vep`. @@ -256,52 +375,3 @@ VEP has a setting for the maximum size of a structural variant that it will anno - indicating if the transcript is the MANE Select or MANE Plus Clinical transcript for the gene. * - miRNA - Reports where the variant lies in the miRNA secondary structure. - - -`BALSAMIC` adds the following annotation from `Swegen` database using ``vcfanno`` for SNVs and SVDB for SVs. - -..
list-table:: Clinical Normal samples SNV - :widths: 50 150 50 - :header-rows: 1 - - * - VCF tag - - description - - variant type - * - Frq - - Frequency of observation of the variants from normal `Clinical` samples - - SNV, SV - * - Obs - - allele counts of the variant in normal `Clinical` samples - - SNV - * - Hom - - allele counts of the homozygous variant in normal `Clinical` samples - - SNV - * - clin_obs - - allele counts - - SV - diff --git a/docs/balsamic_methods.rst b/docs/balsamic_methods.rst index 6adcf616c..ad18ffc4c 100644 --- a/docs/balsamic_methods.rst +++ b/docs/balsamic_methods.rst @@ -8,21 +8,20 @@ Target Genome Analysis BALSAMIC :superscript:`1` (**version** = 12.0.2) was used to analyze the data from raw FASTQ files. We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. Adapter sequences and low-quality bases were trimmed using fastp v0.23.2 :superscript:`3`. -Trimmed reads were mapped to the reference genome hg19 using BWA MEM v0.7.17 :superscript:`4`. -The resulted SAM files were converted to BAM files and sorted using samtools v1.15.1 :superscript:`5`. -Duplicated reads were marked using Picard tools MarkDuplicate v2.27.1 :superscript:`6`. The unmapped reads are removed -and promptly quality controlled using CollectHsMetrics, CollectInsertSizeMetrics and CollectAlignmentSummaryMetrics functionalities. +Trimmed reads were mapped to the reference genome hg19 using sentieon-tools 202010.02 :superscript:`15`. +Duplicated reads were marked using Dedup from sentieon-tools 202010.02 :superscript:`15`. +The final BAM is promptly quality controlled using CollectHsMetrics, CollectInsertSizeMetrics and CollectAlignmentSummaryMetrics functionalities from Picard tools v2.27.1 :superscript:`6`. Results of the quality control steps were summarized by MultiQC v1.12 :superscript:`7`. -Small somatic mutations (SNVs and INDELs) were called for each sample using VarDict v2019.06.04 :superscript:`8`. +Small somatic mutations (SNVs and INDELs) were called for each sample using VarDict v1.8.2 :superscript:`8`. Apart from the VarDict filters used to report the variants, the called variants were further filtered using the criteria (*MQ >= 40, DP >= 100, VD >= 5, Minimum AF >= 0.007, Maximum AF < 1, GNOMADAF_popmax <= 0.005, SweGen AF < 0.01*). Only those variants that fulfilled the filtering criteria and scored as `PASS` in the VCF file were reported. Structural variants (SV) were called using Manta v1.6.0 :superscript:`9` and Delly v1.0.3 :superscript:`10`. Copy number variations (CNV) were called using CNVkit v0.9.9 :superscript:`11`. The variant calls from CNVkit, Manta and Delly were merged using SVDB v2.8.1 :superscript:`12`. -The clinical set of SNV and SV is also annotated and filtered against loqusDB curated frequency of observed variants (frequency < 0.01) from normal samples. +The clinical set of SNV and SV is also annotated and filtered against loqusDB curated frequency of observed variants (frequency < 0.01) from non-cancer cases and only annotated using frequency of observed variants from cancer cases (somatic and germline). All variants were annotated using Ensembl VEP v104.3 :superscript:`13`. We used vcfanno v0.3.3 :superscript:`14` -to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`, SweGen :superscript:`22` and frequency of observed variants in normal samples.
+to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`, CADD v1.6 :superscript:`24`, SweGen :superscript:`22` and frequency of observed variants in normal samples. Whole Genome Analysis ~~~~~~~~~~~~~~~~~~~~~ @@ -30,27 +29,27 @@ Whole Genome Analysis BALSAMIC :superscript:`1` (**version** = 12.0.2) was used to analyze the data from raw FASTQ files. We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. Adapter sequences and low-quality bases were trimmed using fastp v0.23.2 :superscript:`3`. -Trimmed reads were mapped to the reference genome hg19 using sentieon-tools :superscript:`15`. -The resulted SAM files were converted to BAM files and sorted using samtools v1.15.1 :superscript:`5`. -Duplicated reads were marked using Picard tools MarkDuplicate v2.27.1 :superscript:`6`. The unmapped reads are removed -and promptly quality controlled using CollectMultipleMetrics and CollectWgsMetrics functionalities. +Trimmed reads were mapped to the reference genome hg19 using sentieon-tools 202010.02 :superscript:`15`. +Duplicated reads were marked using Dedup from sentieon-tools 202010.02 :superscript:`15`. +The BAM file was then realigned using Realign from sentieon-tools 202010.02 :superscript:`15` and common population InDels. +The final BAM is quality controlled using WgsMetricsAlgo and CoverageMetrics from sentieon-tools 202010.02 :superscript:`15` and CollectWgsMetrics, CollectMultipleMetrics, CollectGcBiasMetrics, and CollectHsMetrics functionalities from Picard tools v2.27.1 :superscript:`6`. Results of the quality control steps were summarized by MultiQC v1.12 :superscript:`7`. Small somatic mutations (SNVs and INDELs) were called for each sample using Sentieon TNscope :superscript:`16`. The called variants were further filtered using the criteria (DP(tumor,normal) >= 10; AD(tumor) >= 3; AF(tumor) >= 0.05; Maximum AF(tumor) < 1; GNOMADAF_popmax <= 0.001; normalized base quality scores >= 20; read counts of alt and ref alleles > 0). Structural variants were called using Manta v1.6.0 :superscript:`9`, Delly v1.0.3 :superscript:`10` and TIDDIT v3.3.2 :superscript:`12`. -Copy number variations (CNV) were called using ascatNgs v4.5.0 :superscript:`17` (tumor-normal), Delly v1.0.3 :superscript:`10` and CNVpytor v1.2.1 :superscript:`22` (tumor-only) and converted from CNV to deletions (DEL) and duplications (DUP). -The structural variant (SV) calls from Manta, Delly, TIDDIT ascatNgs (tumor-normal) and CNVpytor (tumor-only) were merged using SVDB v2.8.1 :superscript:`12` -The clinical set of SNV and SV is also annotated and filtered against loqusDB curated frequency of observed variants (frequency < 0.01) from normal samples. +Copy number variations (CNV) were called using ascatNgs v4.5.0 :superscript:`17` (tumor-normal), Delly v1.0.3 :superscript:`10` and CNVpytor v1.3.1 :superscript:`23` (tumor-only) and converted from CNV to deletions (DEL) and duplications (DUP). +The structural variant (SV) calls from Manta, Delly, TIDDIT, ascatNgs (tumor-normal) and CNVpytor (tumor-only) were merged using SVDB v2.8.1 :superscript:`12`. +The clinical set of SNV and SV is also annotated and filtered against loqusDB curated frequency of observed variants (frequency < 0.01) from non-cancer cases and only annotated using frequency of observed variants from cancer cases (somatic and germline). All variants were annotated using Ensembl VEP v104.3 :superscript:`13`.
We used vcfanno v0.3.3 :superscript:`14` -to annotate somatic single nucleotide variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`, SweGen :superscript:`22` and frequency of observed variants in normal samples. +to annotate somatic single nucleotide variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`, CADD v1.6 :superscript:`24`, SweGen :superscript:`22` and frequency of observed variants in normal samples. UMI Data Analysis ~~~~~~~~~~~~~~~~~~~~~ BALSAMIC :superscript:`1` (**version** = 12.0.2) was used to analyze the data from raw FASTQ files. We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. -Adapter sequences and low-quality bases were trimmed using fastp v0.23.2 :superscript:`3`. UMI tag extraction and consensus generation were performed using Sentieon tools v202010.02 :superscript:`15`. +Adapter sequences and low-quality bases were trimmed using fastp v0.23.2 :superscript:`3`. The alignment of UMI extracted and consensus called reads to the human reference genome (hg19) was done by bwa-mem and samtools using Sentieon utils. Consensus reads were filtered based on the number of minimum reads supporting each UMI tag group. We applied a filter criterion of minimum reads `3,1,1`. It means that at least three UMI tag groups should ideally be considered from both DNA strands, @@ -58,8 +57,8 @@ where a minimum of at least one UMI tag group should exist in each single-strand The filtered consensus reads were quality controlled using Picard CollectHsMetrics v2.27.1 :superscript:`6`. Results of the quality control steps were summarized by MultiQC v1.12 :superscript:`7`. For each sample, somatic mutations were called using Sentieon TNscope :superscript:`16`, with non-default parameters for passing the final list of variants (--min_tumor_allele_frac 0.0005, --filter_t_alt_frac 0.0005, --min_init_tumor_lod 0.5, min_tumor_lod 4, --max_error_per_read 5 --pcr_indel_model NONE, GNOMADAF_popmax <= 0.02). -The clinical set of SNV and SV is also annotated and filtered against loqusDB curated frequency of observed variants (frequency < 0.01) from normal samples. -All variants were annotated using Ensembl VEP v104.3 :superscript:`7`. We used vcfanno v0.3.3 :superscript:`8` to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`, SweGen :superscript:`22` and frequency of observed variants in normal samples. +The clinical set of SNV and SV is also annotated and filtered against loqusDB curated frequency of observed variants (frequency < 0.01) from non-cancer cases and only annotated using frequency of observed variants from cancer cases (somatic and germline). +All variants were annotated using Ensembl VEP v104.3 :superscript:`13`. We used vcfanno v0.3.3 :superscript:`14` to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`, CADD v1.6 :superscript:`24`, SweGen :superscript:`22` and frequency of observed variants in normal samples. For exact parameters used for each software, please refer to https://github.com/Clinical-Genomics/BALSAMIC. We used three commercially available products from SeraCare [Material numbers: 0710-067110 :superscript:`19`, 0710-067211 :superscript:`20`, 0710-067312 :superscript:`21`] for validating the efficiency of the UMI workflow in identifying 14 mutation sites at known allelic frequencies. @@ -69,24 +68,25 @@ We used three commercially available products from SeraCare [Material numbers: 0 1.
Foroughi-Asl, H., Jeggari, A., Maqbool, K., Ivanchuk, V., Elhami, K., & Wirta, V. BALSAMIC: Bioinformatic Analysis pipeLine for SomAtic MutatIons in Cancer (Version v8.2.10) [Computer software]. https://github.com/Clinical-Genomics/BALSAMIC 2. Babraham Bioinformatics - FastQC A Quality Control tool for High Throughput Sequence Data. Accessed June 22, 2020. https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ -3. Chen S, Zhou Y, Chen Y, Gu J. fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics. 2018;34(17):i884-i890. doi:10.1093/bioinformatics/bty560 -4. Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXiv:1303.3997v2 [q-bio.GN] -5. Li H, Handsaker B, Wysoker A, Fennell T, Ruan J., Homer N., Marth G., Abecasis G., Durbin R. and 1000 Genome Project Data Processing Subgroup (2009) The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics, 25, 2078-9. doi: 10.1093/bioinformatics/btp352 +3. Chen S, Zhou Y, Chen Y, Gu J. fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics. 2018;34(17):i884-i890. https://doi.org/10.1093/bioinformatics/bty560 +4. Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. https://doi.org/10.48550/arXiv.1303.3997 +5. Li H, Handsaker B, Wysoker A, Fennell T, Ruan J., Homer N., Marth G., Abecasis G., Durbin R. and 1000 Genome Project Data Processing Subgroup (2009) The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics, 25, 2078-9. https://doi.org/10.1093/bioinformatics/btp352 6. Picard Tools - By Broad Institute. Accessed June 22, 2020. https://broadinstitute.github.io/picard/ -7. Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016;32(19):3047-3048. doi:10.1093/bioinformatics/btw354 +7. Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016;32(19):3047-3048. https://doi.org/10.1093/bioinformatics/btw354 8. Lai Z, Markovets A, Ahdesmaki M, Chapman B, Hofmann O, McEwen R, Johnson J, Dougherty B, Barrett JC, and Dry JR. VarDict: a novel and versatile variant caller for next-generation sequencing in cancer research. Nucleic Acids Res. 2016. https://doi.org/10.1093/nar/gkw227 -9. Chen, X. et al. (2016) Manta: rapid detection of structural variants and indels for germline and cancer sequencing applications. Bioinformatics, 32, 1220-1222. doi:10.1093/bioinformatics/btv710 +9. Chen, X. et al. (2016) Manta: rapid detection of structural variants and indels for germline and cancer sequencing applications. Bioinformatics, 32, 1220-1222. https://doi.org/10.1093/bioinformatics/btv710 10. Tobias Rausch, Thomas Zichner, Andreas Schlattl, Adrian M. Stuetz, Vladimir Benes, Jan O. Korbel. DELLY: structural variant discovery by integrated paired-end and split-read analysis. Bioinformatics. 2012 Sep 15;28(18):i333-i339. https://doi.org/10.1093/bioinformatics/bts378 11. Talevich, E, Shain, A.H, Botton, T, & Bastian, B.C. CNVkit: Genome-wide copy number detection and visualization from targeted sequencing. PLOS Computational Biology. 2016, 12(4):e1004873. https://doi.org/10.1371/journal.pcbi.1004873 -12. Jesper Eisfeldt et.al. TIDDIT, an efficient and comprehensive structural variant caller for massive parallel sequencing data. F1000 research. 2017. doi: 10.12688/f1000research.11168.2 -13. McLaren W, Gil L, Hunt SE, et al. The Ensembl Variant Effect Predictor. 
Genome Biology. 2016;17(1):122. -14. Pedersen BS, Layer RM, Quinlan AR. Vcfanno: fast, flexible annotation of genetic variants. Genome Biology. 2016;17(1):118. doi:10.1186/s13059-016-0973-5 +12. Jesper Eisfeldt et al. TIDDIT, an efficient and comprehensive structural variant caller for massive parallel sequencing data. F1000 research. 2017. https://doi.org/10.12688/f1000research.11168.2 +13. McLaren W, Gil L, Hunt SE, et al. The Ensembl Variant Effect Predictor. Genome Biology. 2016;17(1):122. https://doi.org/10.1186/s13059-016-0974-4 +14. Pedersen BS, Layer RM, Quinlan AR. Vcfanno: fast, flexible annotation of genetic variants. Genome Biology. 2016;17(1):118. https://doi.org/10.1186/s13059-016-0973-5 15. Donald Freed, Rafael Aldana, Jessica A. Weber, Jeremy S. Edwards. The Sentieon Genomics Tools - A fast and accurate solution to variant calling from next-generation sequence data. Bioinformatics. 2016, Volume 32, Issue 8. https://doi.org/10.1093/bioinformatics/btv710 -16. Donald Freed, Renke Pan, Rafael Aldana. TNscope: Accurate Detection of Somatic Mutations with Haplotype-based Variant Candidate Detection and Machine Learning Filtering. bioRvix. doi: https://doi.org/10.1101/250647 -17. Keiran MR, Peter VL, David CW, David J, Andrew M, Adam PB , Jon WT, Patrick T, Serena Nik-Zainal, Peter J C. ascatNgs: Identifying Somatically Acquired Copy-Number Alterations from Whole-Genome Sequencing Data. Curr Protoc Bioinformatics. 2016. doi:https://doi.org/10.1002/cpbi.17 +16. Donald Freed, Renke Pan, Rafael Aldana. TNscope: Accurate Detection of Somatic Mutations with Haplotype-based Variant Candidate Detection and Machine Learning Filtering. bioRxiv. https://doi.org/10.1101/250647 +17. Keiran MR, Peter VL, David CW, David J, Andrew M, Adam PB , Jon WT, Patrick T, Serena Nik-Zainal, Peter J C. ascatNgs: Identifying Somatically Acquired Copy-Number Alterations from Whole-Genome Sequencing Data. Curr Protoc Bioinformatics. 2016. https://doi.org/10.1002/cpbi.17 18. Karczewski, K.J., Francioli, L.C., Tiao, G. et al. The mutational constraint spectrum quantified from variation in 141,456 humans. Nature 581, 434–443 (2020). https://doi.org/10.1038/s41586-020-2308-7 -19. https://www.seracare.com/Seraseq-ctDNA-Complete-Reference-Material-AF1-0710-0671/ -20. https://www.seracare.com/Seraseq-ctDNA-Complete-Reference-Material-AF05-0710-0672/ -21. https://www.seracare.com/Seraseq-ctDNA-Complete-Reference-Material-AF01-0710-0673/ +19. Seraseq ctDNA Complete Reference Material AF 1%. https://www.seracare.com/Seraseq-ctDNA-Complete-Reference-Material-AF1-0710-0671/ +20. Seraseq ctDNA Complete Reference Material AF 0.5%. https://www.seracare.com/Seraseq-ctDNA-Complete-Reference-Material-AF05-0710-0672/ +21. Seraseq ctDNA Complete Reference Material AF 0.1%. https://www.seracare.com/Seraseq-ctDNA-Complete-Reference-Material-AF01-0710-0673/ 22. Ameur, A., Dahlberg, J., Olason, P. et al. SweGen: a whole-genome data resource of genetic variability in a cross-section of the Swedish population. Eur J Hum Genet 25, 1253–1260 (2017). https://doi.org/10.1038/ejhg.2017.130 23. Milovan Suvakov, Arijit Panda, Colin Diesh, Ian Holmes, Alexej Abyzov, CNVpytor: a tool for copy number variation detection and analysis from read depth and allele imbalance in whole-genome sequencing, GigaScience, Volume 10, Issue 11, November 2021, giab074, https://doi.org/10.1093/gigascience/giab074 +24. Rentzsch P., Witten D., Cooper G.M., Shendure J., Kircher M. CADD: predicting the deleteriousness of variants throughout the human genome.
Nucleic Acids Res. 2018. https://doi.org/10.1093/nar/gky1016. PubMed PMID: 30371827. diff --git a/docs/balsamic_pon.rst b/docs/balsamic_pon.rst index a14eed6f2..cdc13a473 100644 --- a/docs/balsamic_pon.rst +++ b/docs/balsamic_pon.rst @@ -1,6 +1,16 @@ Panel of Normals (PON) ====================== +Currently two PON-methods are implemented in BALSAMIC to correct for biases and normalise coverage values: + +- For producing more accurate CNV variant-calls using ``CNVkit`` for TGA cases. + +- To produce normalised CN-profiles for WGS cases visualised in ``GENS``. + + +CNVkit PON +====================== + BALSAMIC provides a functionality to generate a Panel of Normals (PON) for more accurate copy-number filtering of false positives and that can be used as an input for the ``CNVkit`` variant caller. For a more detailed PON use case, please refer to the following documentation: @@ -28,7 +38,7 @@ When creating a new PON reference file, the next steps have to be followed: .. code-block:: - balsamic config pon --case-id --balsamic-cache --analysis-dir --fastq-path --panel-bed + balsamic config pon --pon-workflow CNVkit --case-id --balsamic-cache --analysis-dir --fastq-path --panel-bed 3. Run the BALSAMIC PON workflow: @@ -58,3 +68,72 @@ BALSAMIC can use a PON reference file if its provided while running CNVkit analy In the absence of a PON reference file, CNVkit is capable of generating a flat reference (tumor-only) or normal reference (tumor-normal) file on its own to correct for GC content and regional coverage +GENS PON +====================== + +In order to produce an accurate CN-profile to visualise in GENS, you need to create two PONs, one for each gender (see instructions below). + +The original instructions for how to create this PON, which have been implemented in this BALSAMIC workflow, can be found in the Clinical-Genomics-Lund GENS repository: + +- `Clinical-Genomics-Lund-GENS`_ + +.. _Clinical-Genomics-Lund-GENS: https://github.com/Clinical-Genomics-Lund/gens + +To create the PON using the GENS PON creation workflow, you can follow the guide below. + +PON Generation +-------------- + +To create a GENS PON using the BALSAMIC workflow, you need to follow these steps: + +1. Create a genome-interval file. + +**Note:** + + These are the genome bins within which the coverage will be calculated, and consequently this is the lowest resolution at which the CN-profile can be viewed. + +This is the setting we used: + +.. code-block:: + + gatk PreprocessIntervals --reference [ref] --bin-length 100 --interval-merging-rule OVERLAPPING_ONLY -O human_g1k_v37_gens_targets_preprocessed_100bp.interval_list + + +2. Identify the samples to be included in the PON and add or link their ``fastq`` files to the ``fastq`` directory. + +**Note:** + + It is recommended to include approximately 100 samples of the same gender, using the same library preparation and sequencing method as your intended analysis-samples. + +3. Generate the ``_PON.json`` configuration file: + +.. code-block:: + + balsamic config pon --pon-workflow <[GENS_female,GENS_male]> --genome-interval <[path-to-file-from-step1]> --case-id --balsamic-cache --analysis-dir --fastq-path --panel-bed + +4. Run the BALSAMIC PON workflow: + +**Note:** + If you are following these instructions using 100 WGS samples, you require access to compute nodes with a lot of memory (one of our jobs crashed at 117 GB). + +.. code-block:: + + balsamic run analysis -s /_PON.json -r + +This workflow runs trimming and alignment for all samples to be included in the PON, calculates per-bin coverages using ``GATK CollectReadCounts``, and then creates the PON from all read counts with ``GATK CreateReadCountPanelOfNormals``.
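Under the hood, these two GATK steps correspond roughly to the sketch below (a sketch only; sample names, BAM paths and the output file name are hypothetical, and the interval list is the file created in step 1):

.. code-block::

    # per-sample binned coverage, repeated for every sample in the PON
    gatk CollectReadCounts \
        -I sample1.dedup.bam \
        -L human_g1k_v37_gens_targets_preprocessed_100bp.interval_list \
        --interval-merging-rule OVERLAPPING_ONLY \
        -O sample1.counts.hdf5

    # combine all per-sample read counts into the panel of normals
    gatk CreateReadCountPanelOfNormals \
        -I sample1.counts.hdf5 \
        -I sample2.counts.hdf5 \
        -O gens_pon_100bp.hdf5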
+ +5. Check for the PON output files: + +.. code-block:: + + /path/analysis/analysis_PON_finish + /path/analysis/cnv/gens_pon_100bp...hdf5 + +Using the PON during analysis +----------------------------- + +This PON is a required input in order to produce the final output files to be loaded into the GENS platform. + +For how to run a case using this PON and activate GENS for your WGS analysis, see: + +`Using GENS for WGS `_. \ No newline at end of file diff --git a/docs/balsamic_sv_cnv.rst b/docs/balsamic_sv_cnv.rst index 1d39bde72..2a14b35db 100644 --- a/docs/balsamic_sv_cnv.rst +++ b/docs/balsamic_sv_cnv.rst @@ -147,6 +147,49 @@ The following command can be used to fetch the variants identified by a specific zgrep -E "#|" <*.svdb.vcf.gz> +**Using GENS for WGS** +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +GENS is a visualization tool similar to IGV, originally developed at Clinical Genomics Lund, primarily for visualizing genomic copy-number profiles from WGS samples. + +To visualise the GENS-formatted files from BALSAMIC you need to have GENS installed; to do this, you can follow the instructions in the Clinical-Genomics-Lund GENS repository: + +- `Clinical-Genomics-Lund-GENS`_ + +.. _Clinical-Genomics-Lund-GENS: https://github.com/Clinical-Genomics-Lund/gens + +Two files per sample are uploaded to GENS: one file with allele frequencies from SNV & InDel germline calls (BAF file), which can be used to aid the interpretation of the CN-profile, and one file with the log2 copy-number ratios normalized against a PON. Instructions for how to generate this PON using the BALSAMIC PON workflow can be found here: + +`Generate GENS PON `_. + +There are three required arguments for creating the input files for GENS: +1. Genome interval file produced by GATK ``PreprocessIntervals`` (see instructions in GENS PON creation) +2. A gender-specific PON (see instructions in GENS PON creation) +3. A population database VCF with variant positions to be reported in the BAF file. + +We created the file mentioned in **3** using the file ``gnomad.genomes.r2.1.1.sites`` filtered with bcftools to only keep variants with an AF above 0.05. + +.. code-block:: + + bcftools view -i 'AF>=0.05' -Oz + +To configure BALSAMIC to run with GENS activated you supply these files like this: + +:: + + balsamic config case \ + --case-id \ + --balsamic-cache \ + --analysis-dir \ + --fastq-path \ + --gender <[male/female]> \ + --analysis-workflow balsamic \ + --genome-version hg19 \ + --tumor-sample-name \ + --genome-interval \ + --gens-coverage-pon \ + --gnomad-min-af5 + **Genome Reference Files** ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -193,5 +236,3 @@ Second step is to use *SnpPositions.tsv* file and generate *SnpGcCorrections.tsv ascatSnpPanelGcCorrections.pl genome.fa SnpPositions.tsv > SnpGcCorrections.tsv -**Attention:** -**BALSAMIC >= v11.0.0 removes unmapped reads from the bam and cram files for all the workflows.** diff --git a/docs/bioinfo_softwares.rst b/docs/bioinfo_softwares.rst index 64261167f..f6dcf34cf 100644 --- a/docs/bioinfo_softwares.rst +++ b/docs/bioinfo_softwares.rst @@ -5,11 +5,10 @@ BALSAMIC ( **version** = 12.0.2 ) uses myriad of tools and softwares to analyze fastq files. This section covers why each one is included: usage and parameters, and relevant external links.
- ascatNgs ~~~~~~~~ :Source code: `GitHub` ``_ -:Article: `PNAS` ``_ +:Article: `PNAS` ``_ :Version: `4.5.0` bcftools @@ -30,14 +29,26 @@ bwa :Article: `Bioinformatics` ``_ :Version: `0.7.17` +cadd +~~~~ +:Source code: `GitHub` ``_ +:Article: `Nature Genetics` ``_ +:Version: `1.6` + cnvkit ~~~~~~ :Source code: `GitHub` ``_ -:Article: `PLOS Computational Biology` ``_ +:Article: `PLOS Computational Biology` ``_ :Version: `0.9.9` +cnvpytor +~~~~~~~~ +:Source code: `GitHub` `` +:Article: `GigaScience` `` +:Version: `1.3.1` + delly -~~~~~~~ +~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ :Version: `1.0.3` @@ -57,7 +68,7 @@ fastp fastqc ~~~~~~ :Source code: `GitHub` ``_ -:Article: `Babraham` ``_ +:Article: `Babraham` ``_ :Version: `0.11.9` gatk @@ -66,24 +77,29 @@ gatk :Article: `Current Protocols in Bioinformatics` ``_ :Version: `3.8` +genmod +~~~~~~ +:Source code: `Github` ``_ +:Version: `0.2.16` + manta ~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ :Version: `1.6.0` -multiqc -~~~~~~~ -:Source code: `GitHub` ``_ -:Article: `Bioinformatics` ``_ -:Version: `1.12` - mosdepth ~~~~~~~~ :Source code: `GitHub` ``_ -:Article: `Bioinformatics` ``_ +:Article: `Bioinformatics` ``_ :Version: `0.3.3` +multiqc +~~~~~~~ +:Source code: `GitHub` ``_ +:Article: `Bioinformatics` ``_ +:Version: `1.12` + picard ~~~~~~ :Source code: `GitHub` ``_ @@ -108,23 +124,29 @@ sentieon-tools :Article: `Bioinformatics` ``_ :Version: `202010.02` +somalier +~~~~~~~~ +:Source code: `Github` ``_ +:Article: `Genome Medicine` ``_ +:Version: `0.2.16` + svdb ~~~~ :Source code: `Github` ``_ :Article: `F1000Res` ``_ -:Version: `2.6.0` +:Version: `2.8.1` tabix ~~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `1.11` +:Version: `>=1.11` tiddit ~~~~~~ :Source code: `Github` ``_ :Article: `F1000Res` ``_ -:Version: `3.0.0` +:Version: `3.3.2` vardict ~~~~~~~ @@ -142,4 +164,4 @@ vcf2cytosure ~~~~~~~~~~~~~ :Source code: `GitHub` ``_ :Article: `-` -:Version: `0.7.1` \ No newline at end of file +:Version: `0.8` diff --git a/docs/conf.py b/docs/conf.py index 361774b1b..76f5affaa 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,8 +18,7 @@ # -- Project information ----------------------------------------------------- project = "BALSAMIC" -copyright = "2020, Hassan Foroughi Asl" -author = "Hassan Foroughi Asl" +copyright = "2023, Clinical Genomics" # -- General configuration --------------------------------------------------- @@ -30,7 +29,7 @@ "sphinx.ext.autodoc", "sphinx.ext.mathjax", "sphinx.ext.viewcode", - "sphinxcontrib.napoleon", + "sphinx.ext.napoleon", "sphinx_click", "sphinxarg.ext", "recommonmark", @@ -56,7 +55,7 @@ # a list of builtin themes. # # html_theme = 'sphinx_rtd_theme' -html_logo = "../BALSAMIC/assets/balsamic_logo.png" +html_logo = "../BALSAMIC/assets/images/balsamic_logo.png" html_theme = "furo" # Add any paths that contain custom static files (such as style sheets) here, diff --git a/docs/git_etiquette.rst b/docs/git_etiquette.rst deleted file mode 100644 index d72ed3b62..000000000 --- a/docs/git_etiquette.rst +++ /dev/null @@ -1,51 +0,0 @@ -============= -Git etiquette -============= - -It is recommended to follow a system to standardize the commit messages loosely. Following up from commit messages discussed on https://github.com/Clinical-Genomics/development/pull/97 , the format below is recommended for commit messages: - -**Code formatting** -^^^^^^^^^^^^^^^^^^^ - -BALSAMIC is using Black (https://github.com/psf/black) as code formatter.
- - -**Conventional commits and PRs** -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -PRs should follow the following keywords in the title: https://www.conventionalcommits.org/en/v1.0.0/ - -Commit messages are recommended to following the following similar to PRs: - - -#. **feat**\ : Introducing a new features. This includes but not limited to workflows, SnakeMake rule, cli, and plugins. In other words, anything that is new and fundamental change will also go here. Enhancements and optimizations will go into refactor. -#. **fix**\ : This is essentially a patch. Included but not limited to: bug fixes, hotfixes, and any patch to address a known issue. -#. **doc**\ : Any changes to the documentation are part of doc subject line, included but not limited to docstrings, cli-help, readme, tutorial, documentation, CHANGELOG, and addition of ipython/jupyter notebook in the form of tutorial. -#. **test**\ : Any changes to the tests are part of test subject line. This includes adding, removing or updating of the following: unittests, validation/verification dataset, and test related configs. -#. **refactor**\ : Refactoring refers to a rather broad term. Any style changes, code enhancement, and analysis optimization. -#. **version**\ : Any changes to .bumpversion config and or change of version will be specified with this. This includes comments within .bumpversion, structure of .bumpversion, etc. - -**Scope** -^^^^^^^^^ - -Scope is specified within parenthesis. It show the *scope* of the subject line. The following scope are valid: - - -* cli -* style -* rule (refers to SnakeMake rules) -* workflow (refer to SnakeMake workflows) -* config (refers to configs that are either used or generated by BALSAMIC) -* Relevant scopes that might fit into a scope description - -Note: If scope is broad or matching with multiple (it shouldn't, but if it does) one can leave out the scope. - -**Message** -^^^^^^^^^^^ - -It's better to start Git commit message with the following words: - - -* added -* removed -* updated diff --git a/docs/index.rst b/docs/index.rst index e6f8ce973..797a0383b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,7 +1,7 @@ .. include:: ../README.rst .. toctree:: - :caption: Getting started + :caption: Installation and Usage :name: getting_started :hidden: :maxdepth: 1 @@ -12,7 +12,7 @@ .. toctree:: - :caption: Detailed documentation + :caption: Analysis and Interpretation :name: detailed documentation :hidden: :maxdepth: 1 @@ -33,8 +33,7 @@ :hidden: :maxdepth: 1 - git_etiquette - snakemake_etiquette README + programming_guide semver FAQs diff --git a/docs/install.rst b/docs/install.rst index a344abf5d..f6d65fa52 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -10,8 +10,8 @@ Software Requirements ~~~~~~~~~~~~~~~~~~~~~ - Conda >=version 4.5.0: For detailed software and python requirements please see ``setup.py`` and ``BALSAMIC/conda/balsamic.yaml`` -- Singularity >=version 3.0.0: BALSAMIC uses singularity to run vairous parts of the workflow. -- Python 3.6 +- Singularity >=version 3.0.0: BALSAMIC uses singularity to run various parts of the workflow. +- Python 3.11 - BALSAMIC is dependent on third-party bioinformatics software ``Sentieon-tools``. Example: for running WGS variant calling using ``TNScope``, and to execute ``UMIworkflow``. ``Note: Set Sentieon environment variables in your ~/.bashrc file by adding the following two lines`` @@ -28,10 +28,10 @@ Step 1.
Installing BALSAMIC :: - conda create -c conda-forge -c defaults --name S_BALSAMIC python==3.7 pip pygraphviz + conda create -c conda-forge -c defaults --name S_balsamic python==3.11 pip pygraphviz wkhtmltopdf -2. Activate environment: +2. Activate environment: :: @@ -39,7 +39,7 @@ Step 1. Installing BALSAMIC -3. Install BALSAMIC using ``pip`` within the newly created environment: +3. Install BALSAMIC using ``pip`` within the newly created environment: :: @@ -67,7 +67,7 @@ NOTE: This process can take couple of hours # Note: # 1. COSMIC key is in variable $COSMIC_KEY # 2. For genome version hg38, set --genome-version to hg38 - # 3. For using develop container version, set --container-version to develop + # 3. For using develop container version, set --cache-version to develop # 4. For submitting jobs to slurm cluster, use option --account balsamic init --outdir ~/balsamic_cache \ diff --git a/docs/snakemake_etiquette.rst b/docs/programming_guide.rst similarity index 57% rename from docs/snakemake_etiquette.rst rename to docs/programming_guide.rst index 27b064770..011b55a65 100644 --- a/docs/snakemake_etiquette.rst +++ b/docs/programming_guide.rst @@ -1,4 +1,60 @@ -=================== +================= +Coding etiquette +================= + +* Structure the code properly +* Maintain good and consistent naming conventions +* Keep it simple +* Don’t repeat yourself + + +Git etiquette +============= + +**Code formatting** +^^^^^^^^^^^^^^^^^^^ + +BALSAMIC is using Black as code formatter: https://github.com/psf/black + +**Conventional commits and PRs** +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +PR titles should follow the keywords described at https://www.conventionalcommits.org/en/v1.0.0/ + +Commit messages are recommended to follow a format similar to PR titles: + +#. **feat**: Introduces a new feature. This includes, but is not limited to, workflows, SnakeMake rules, CLI, and plugins. In other words, anything that is a new and fundamental change also goes here. Enhancements and optimizations go into refactor. +#. **fix**: This is essentially a patch. Included but not limited to: bug fixes, hotfixes, and any patch to address a known issue. +#. **doc**: Any changes to the documentation are part of the doc subject line, included but not limited to docstrings, cli-help, readme, tutorial, documentation, CHANGELOG, and addition of ipython/jupyter notebooks in the form of tutorials. +#. **test**: Any changes to the tests are part of the test subject line. This includes adding, removing or updating of the following: unittests, validation/verification datasets, and test related configs. +#. **refactor**: Refactoring refers to a rather broad term. Any style changes, code enhancements, and analysis optimizations. +#. **version**: Any changes to the .bumpversion config and/or change of version will be specified with this. This includes comments within .bumpversion, structure of .bumpversion, etc. + +**Scope** +^^^^^^^^^ + +The scope is specified within parentheses. It shows the *scope* of the subject line. The following scopes are valid: + +* cli +* style +* rule (refers to SnakeMake rules) +* workflow (refers to SnakeMake workflows) +* config (refers to configs that are either used or generated by BALSAMIC) +* Relevant scopes that might fit into a scope description + +Note: If the scope is broad or matches multiple scopes (it shouldn't, but if it does), one can leave out the scope.
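As a purely hypothetical example combining a keyword and a scope (the message wording is illustrative only; the ``--cache-version`` option itself is from this release):

.. code-block::

    git commit -m "feat(cli): added --cache-version option to config case"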
+ +**Message** +^^^^^^^^^^^ + +It is better to start Git commit messages with one of the following words: + +* added +* removed +* updated + + + Snakemake etiquette =================== @@ -103,3 +159,47 @@ Similarly ``awk`` or ``R`` external scripts can be saved in ``assets/scripts/*aw + 1. https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html 2. https://snakemake.readthedocs.io/en/stable/snakefiles/writing_snakefiles.html + + +Container etiquette =================== + +BALSAMIC uses singularity containers to perform the bioinformatics analysis. These containers are built using Docker and pushed to Docker Hub. +For more details on building containers using docker, please refer to the official docker documentation: https://docs.docker.com/ + +**Structure of Docker recipe** +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:: + + FROM <base_image>:<version> + + LABEL base.image="<base_image>:<version>" + LABEL maintainer="Clinical Genomics" + LABEL about.contact="support@clinicalgenomics.se" + LABEL software="<software_name>" + LABEL software.version="<software_version>" + LABEL about.summary="<software_summary>" + LABEL about.home="<software_homepage>" + LABEL about.documentation="<documentation_url>" + LABEL about.license="MIT License (MIT)" + + RUN apt-get update && apt-get -y upgrade && \ + apt-get -y install --no-install-recommends <system_packages> && \ + apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + + RUN .... + + USER ubuntu + WORKDIR /home/ubuntu + CMD ["/bin/bash"] + + +It is preferable to: + * Use an official image as the base + * Use Ubuntu-LTS as the base image + * Avoid Conda unless necessary + * Add versions + * Avoid building containers with multiple software used in the rules \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index 83c6dc506..2f9a3bda4 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,11 +1,11 @@ appdirs==1.4.4 -configargparse==1.5.3 -docutils>=0.14,<0.18 +configargparse==1.7 +docutils==0.18.1 recommonmark==0.7.1 -sphinx==4.2.0 -sphinx-argparse==0.3.1 +sphinx==6.2.1 +sphinx-argparse==0.4.0 sphinx-click==3.0.2 -sphinx_rtd_theme==1.0.0 -sphinxcontrib-napoleon==0.7 -furo==2021.10.9 +sphinx_rtd_theme==1.2.2 +furo==2023.7.26 cyvcf2 + diff --git a/docs/resources.rst b/docs/resources.rst index b8b57b9ed..90ec6eb04 100644 --- a/docs/resources.rst +++ b/docs/resources.rst @@ -2,190 +2,160 @@ References and other resources =============================== - *Main resources including knowledge base and databases necessary for pipeline development* - -#. **MSK-Impact pipeline**\ : https://www.mskcc.org/msk-impact -#. **TCGA**\ : https://cancergenome.nih.gov/ -#. **COSMIC**\ : http://cancer.sanger.ac.uk/cosmic -#. **dbSNP**\ : Database of single nucleotide polymorphisms (SNPs) and multiple small-scale variations that include insertions/deletions, microsatellites, and non-polymorphic variants. https://www.ncbi.nlm.nih.gov/snp/ Download link: ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh38p7/VCF/All_20170710.vcf.gz -#. **ClinVar**\ : ClinVar aggregates information about genomic variation and its relationship to human health. https://www.ncbi.nlm.nih.gov/clinvar/ Download link: ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20171029.vcf.gz -#. **SweGen**\ : This dataset contains whole-genome variant frequencies for 1000 Swedish individuals generated within the SweGen project. Download link: https://swefreq.nbis.se/ -#.
**ExAC**\ : The Exome Aggregation Consortium (ExAC) is a coalition of investigators seeking to aggregate and harmonize exome sequencing data from a wide variety of large-scale sequencing projects, and to make summary data available for the wider scientific community. http://exac.broadinstitute.org/ Download link: ftp://ftp.broadinstitute.org/pub/ExAC_release/release1/ExAC.r1.sites.vep.vcf.gz -#. **GTEx**\ : The Genotype-Tissue Expression (GTEx) project aims to provide to the scientific community a resource with which to study human gene expression and regulation and its relationship to genetic variation. https://gtexportal.org/static/ Download URL by applying through: https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000424.v6.p1 -#. **OMIM**\ : OMIM®, Online Mendelian Inheritance in Man®, An Online Catalog of Human Genes and Genetic Disorders. https://www.omim.org/ Download link: https://omim.org/downloads/ (registration required) -#. **Drug resistance**\ : An effort by Cosmic to annotate mutations identified in the literature as resistance mutations, including those conferring acquired resistance (after treatment) and intrinsic resistance (before treatment). Available through Cosmic: http://cancer.sanger.ac.uk/cosmic/drug_resistance -#. **Mutational signatures**\ : Signatures of Mutational Processes in Human Cancer. Available through Cosmic: http://cancer.sanger.ac.uk/cosmic/signatures -#. **DGVa**\ : The Database of Genomic Variants archive (DGVa) is a repository that provides archiving, accessioning and distribution of publicly available genomic structural variants, in all species. https://www.ebi.ac.uk/dgva -#. **Cancer genomics workflow**\ : MGI's CWL Cancer Pipelines. https://github.com/genome/cancer-genomics-workflow/wiki -#. **GIAB**\ : The priority of GIAB is authoritative characterization of human genomes for use in analytical validation and technology development, optimization, and demonstration. http://jimb.stanford.edu/giab/ and https://github.com/genome-in-a-bottle Download links: http://jimb.stanford.edu/giab-resources -#. **dbNSFP**\ : dbNSFP is a database developed for functional prediction and annotation of all potential non-synonymous single-nucleotide variants (nsSNVs) in the human genome. https://sites.google.com/site/jpopgen/dbNSFP -#. **1000Genomes**\ : The goal of the 1000 Genomes Project was to find most genetic variants with frequencies of at least 1% in the populations studied. http://www.internationalgenome.org/ Download link: ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/ -#. **HapMap3**\ : The International HapMap Project was an organization that aimed to develop a haplotype map (HapMap) of the human genome, to describe the common patterns of human genetic variation. HapMap 3 is the third phase of the International HapMap project. http://www.sanger.ac.uk/resources/downloads/human/hapmap3.html Download link: ftp://ftp.ncbi.nlm.nih.gov/hapmap/ -#. **GRCh38.p11**\ : GRCh38.p11 is the eleventh patch release for the GRCh38 (human) reference assembly. https://www.ncbi.nlm.nih.gov/grc/human Download link: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/ -#. **dbVar**\ : dbVar is NCBI's database of genomic structural variation – insertions, deletions, duplications, inversions, mobile element insertions, translocations, and complex chromosomal rearrangements https://www.ncbi.nlm.nih.gov/dbvar Download link: https://www.ncbi.nlm.nih.gov/dbvar/content/ftp_manifest/ -#. 
**Drug sensitivity in cancer**\ : Identifying molecular features of cancers that predict response to anti-cancer drugs. http://www.cancerrxgene.org/ Download link: ftp://ftp.sanger.ac.uk/pub4/cancerrxgene/releases -#. **VarSome**\ : VarSome is a knowledge base and aggregator for human genomic variants. https://varsome.com/about/ -#. **Google Genomics Public Data**\ : Google Genomics helps the life science community organize the world’s genomic information and make it accessible and useful. and http://googlegenomics.readthedocs.io +#. **MSK-Impact pipeline**: https://www.mskcc.org/msk-impact +#. **TCGA**: https://cancergenome.nih.gov/ +#. **COSMIC**: http://cancer.sanger.ac.uk/cosmic +#. **dbSNP**: Database of single nucleotide polymorphisms (SNPs) and multiple small-scale variations that include insertions/deletions, microsatellites, and non-polymorphic variants. https://www.ncbi.nlm.nih.gov/snp/ Download link: ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh38p7/VCF/All_20170710.vcf.gz +#. **ClinVar**: ClinVar aggregates information about genomic variation and its relationship to human health. https://www.ncbi.nlm.nih.gov/clinvar/ Download link: ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20171029.vcf.gz +#. **SweGen**: This dataset contains whole-genome variant frequencies for 1000 Swedish individuals generated within the SweGen project. Download link: https://swefreq.nbis.se/ +#. **ExAC**: The Exome Aggregation Consortium (ExAC) is a coalition of investigators seeking to aggregate and harmonize exome sequencing data from a wide variety of large-scale sequencing projects, and to make summary data available for the wider scientific community. http://exac.broadinstitute.org/ Download link: ftp://ftp.broadinstitute.org/pub/ExAC_release/release1/ExAC.r1.sites.vep.vcf.gz +#. **GTEx**: The Genotype-Tissue Expression (GTEx) project aims to provide to the scientific community a resource with which to study human gene expression and regulation and its relationship to genetic variation. https://www.gtexportal.org/home/ Download URL by applying through: https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000424.v6.p1 +#. **OMIM**: OMIM®, Online Mendelian Inheritance in Man®, An Online Catalog of Human Genes and Genetic Disorders. https://www.omim.org/ Download link: https://omim.org/downloads/ (registration required) +#. **Drug resistance**: An effort by Cosmic to annotate mutations identified in the literature as resistance mutations, including those conferring acquired resistance (after treatment) and intrinsic resistance (before treatment). Available through Cosmic: http://cancer.sanger.ac.uk/cosmic/drug_resistance +#. **Mutational signatures**: Signatures of Mutational Processes in Human Cancer. Available through Cosmic: http://cancer.sanger.ac.uk/cosmic/signatures +#. **DGVa**: The Database of Genomic Variants archive (DGVa) is a repository that provides archiving, accessioning and distribution of publicly available genomic structural variants, in all species. https://www.ebi.ac.uk/dgva +#. **Cancer genomics workflow**: MGI's CWL Cancer Pipelines. https://github.com/genome/cancer-genomics-workflow/wiki +#. **GIAB**: The priority of GIAB is authoritative characterization of human genomes for use in analytical validation and technology development, optimization, and demonstration. https://github.com/genome-in-a-bottle +#. 
**dbNSFP**: dbNSFP is a database developed for functional prediction and annotation of all potential non-synonymous single-nucleotide variants (nsSNVs) in the human genome. https://sites.google.com/site/jpopgen/dbNSFP +#. **1000Genomes**: The goal of the 1000 Genomes Project was to find most genetic variants with frequencies of at least 1% in the populations studied. http://www.internationalgenome.org/ Download link: ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/ +#. **HapMap3**: The International HapMap Project was an organization that aimed to develop a haplotype map (HapMap) of the human genome, to describe the common patterns of human genetic variation. HapMap 3 is the third phase of the International HapMap project. http://www.sanger.ac.uk/resources/downloads/human/hapmap3.html Download link: ftp://ftp.ncbi.nlm.nih.gov/hapmap/ +#. **GRCh38.p11**: GRCh38.p11 is the eleventh patch release for the GRCh38 (human) reference assembly. https://www.ncbi.nlm.nih.gov/grc/human Download link: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/ +#. **dbVar**: dbVar is NCBI's database of genomic structural variation – insertions, deletions, duplications, inversions, mobile element insertions, translocations, and complex chromosomal rearrangements https://www.ncbi.nlm.nih.gov/dbvar Download link: https://www.ncbi.nlm.nih.gov/dbvar/content/ftp_manifest/ +#. **Drug sensitivity in cancer**: Identifying molecular features of cancers that predict response to anti-cancer drugs. http://www.cancerrxgene.org/ Download link: ftp://ftp.sanger.ac.uk/pub4/cancerrxgene/releases +#. **VarSome**: VarSome is a knowledge base and aggregator for human genomic variants. https://varsome.com/about/ +#. **CADD**: CADD is a tool for scoring the deleteriousness of single nucleotide variants as well as insertion/deletion variants in the human genome. CADD can quantitatively prioritize functional, deleterious, and disease-causal variants across a wide range of functional categories, effect sizes and genetic architectures, and can be used to prioritize causal variation in both research and clinical settings. https://cadd.gs.washington.edu/ Sample datasets --------------- - - -#. **TCRB**\ : he Texas Cancer Research Biobank (TCRB) was created to bridge the gap between doctors and scientific researchers to improve the prevention, diagnosis and treatment of cancer. This work occurred with funding from the Cancer Prevention & Research Institute of Texas (CPRIT) from 2010-2014. http://txcrb.org/data.html Article: https://www.nature.com/articles/sdata201610 +#. **TCRB**: The Texas Cancer Research Biobank (TCRB) was created to bridge the gap between doctors and scientific researchers to improve the prevention, diagnosis and treatment of cancer. This work occurred with funding from the Cancer Prevention & Research Institute of Texas (CPRIT) from 2010-2014. http://txcrb.org/data.html Article: https://www.nature.com/articles/sdata201610 Relevant publications --------------------- *Including methodological benchmarking* +#. **MSK-IMPACT**: -#. - **MSK-IMPACT:** - - - * - **Original pipeline**\ : Cheng, D. T., Mitchell, T. N., Zehir, A., Shah, R. H., Benayed, R., Syed, A., … Berger, M. F. (2015). Memorial sloan kettering-integrated mutation profiling of actionable cancer targets (MSK-IMPACT): A hybridization capture-based next-generation sequencing clinical assay for solid tumor molecular oncology. Journal of Molecular Diagnostics, 17(3), 251–264. https://doi.org/10.1016/j.jmoldx.2014.12.006 - - * - **Case study**\ : Cheng, D. 
T., Prasad, M., Chekaluk, Y., Benayed, R., Sadowska, J., Zehir, A., … Zhang, L. (2017). Comprehensive detection of germline variants by MSK-IMPACT, a clinical diagnostic platform for solid tumor molecular oncology and concurrent cancer predisposition testing. BMC Medical Genomics, 10(1), 33. https://doi.org/10.1186/s12920-017-0271-4 + * **Original pipeline**: Cheng, D. T., Mitchell, T. N., Zehir, A., Shah, R. H., Benayed, R., Syed, A., … Berger, M. F. (2015). Memorial sloan kettering-integrated mutation profiling of actionable cancer targets (MSK-IMPACT): A hybridization capture-based next-generation sequencing clinical assay for solid tumor molecular oncology. Journal of Molecular Diagnostics, 17(3), 251–264. https://doi.org/10.1016/j.jmoldx.2014.12.006 + * **Case study**: Cheng, D. T., Prasad, M., Chekaluk, Y., Benayed, R., Sadowska, J., Zehir, A., … Zhang, L. (2017). Comprehensive detection of germline variants by MSK-IMPACT, a clinical diagnostic platform for solid tumor molecular oncology and concurrent cancer predisposition testing. BMC Medical Genomics, 10(1), 33. https://doi.org/10.1186/s12920-017-0271-4 + * **Case study**: Zehir, A., Benayed, R., Shah, R. H., Syed, A., Middha, S., Kim, H. R., … Berger, M. F. (2017). Mutational landscape of metastatic cancer revealed from prospective clinical sequencing of 10,000 patients. Nature Medicine, 23(6), 703–713. https://doi.org/10.1038/nm.4333 - * **Case study**\ : Zehir, A., Benayed, R., Shah, R. H., Syed, A., Middha, S., Kim, H. R., … Berger, M. F. (2017). Mutational landscape of metastatic cancer revealed from prospective clinical sequencing of 10,000 patients. Nature Medicine, 23(6), 703–713. https://doi.org/10.1038/nm.4333 - -#. **Application of MSK-IMPACT:** Zehir, A., Benayed, R., Shah, R. H., Syed, A., Middha, S., Kim, H. R., … Berger, M. F. (2017). Mutational landscape of metastatic cancer revealed from prospective clinical sequencing of 10,000 patients. Nature Medicine, 23(6), 703–713. https://doi.org/10.1038/nm.4333 -#. **Review on bioinformatic pipelins:** Leipzig, J. (2017). A review of bioinformatic pipeline frameworks. Briefings in Bioinformatics, 18(3), 530–536. https://doi.org/10.1093/bib/bbw020 -#. **Mutational signature reviews:** +#. **Application of MSK-IMPACT**: Zehir, A., Benayed, R., Shah, R. H., Syed, A., Middha, S., Kim, H. R., … Berger, M. F. (2017). Mutational landscape of metastatic cancer revealed from prospective clinical sequencing of 10,000 patients. Nature Medicine, 23(6), 703–713. https://doi.org/10.1038/nm.4333 +#. **Review on bioinformatic pipelines**: Leipzig, J. (2017). A review of bioinformatic pipeline frameworks. Briefings in Bioinformatics, 18(3), 530–536. https://doi.org/10.1093/bib/bbw020 +#. **Mutational signature reviews**: * Helleday, T., Eshtad, S., & Nik-Zainal, S. (2014). Mechanisms underlying mutational signatures in human cancers. Nature Reviews Genetics, 15(9), 585–598. https://doi.org/10.1038/nrg3729 - * Alexandrov, L. B., & Stratton, M. R. (2014). Mutational signatures: The patterns of somatic mutations hidden in cancer genomes. Current Opinion in Genetics and Development, 24(1), 52–60. https://doi.org/10.1016/j.gde.2013.11.01 + * Alexandrov, L. B., & Stratton, M. R. (2014). Mutational signatures: The patterns of somatic mutations hidden in cancer genomes. Current Opinion in Genetics and Development, 24(1), 52–60. https://doi.org/10.1016/j.gde.2013.11.014 -#. **Review on structural variation detection tools**\ : +#. 
**Review on structural variation detection tools**: * Lin, K., Bonnema, G., Sanchez-Perez, G., & De Ridder, D. (2014). Making the difference: Integrating structural variation detection tools. Briefings in Bioinformatics, 16(5), 852–864. https://doi.org/10.1093/bib/bbu047 * Tattini, L., D’Aurizio, R., & Magi, A. (2015). Detection of Genomic Structural Variants from Next-Generation Sequencing Data. Frontiers in Bioengineering and Biotechnology, 3(June), 1–8. https://doi.org/10.3389/fbioe.2015.00092 -#. **Two case studies and a pipeline (unpublished)**\ : Noll, A. C., Miller, N. A., Smith, L. D., Yoo, B., Fiedler, S., Cooley, L. D., … Kingsmore, S. F. (2016). Clinical detection of deletion structural variants in whole-genome sequences. Npj Genomic Medicine, 1(1), 16026. https://doi.org/10.1038/npjgenmed.2016.26 -#. **Review on driver gene methods**\ : Tokheim, C. J., Papadopoulos, N., Kinzler, K. W., Vogelstein, B., & Karchin, R. (2016). Evaluating the evaluation of cancer driver genes. Proceedings of the National Academy of Sciences, 113(50), 14330–14335. https://doi.org/10.1073/pnas.1616440113 - +#. **Two case studies and a pipeline (unpublished)**: Noll, A. C., Miller, N. A., Smith, L. D., Yoo, B., Fiedler, S., Cooley, L. D., … Kingsmore, S. F. (2016). Clinical detection of deletion structural variants in whole-genome sequences. Npj Genomic Medicine, 1(1), 16026. https://doi.org/10.1038/npjgenmed.2016.26 +#. **Review on driver gene methods**: Tokheim, C. J., Papadopoulos, N., Kinzler, K. W., Vogelstein, B., & Karchin, R. (2016). Evaluating the evaluation of cancer driver genes. Proceedings of the National Academy of Sciences, 113(50), 14330–14335. https://doi.org/10.1073/pnas.1616440113 *Resource, or general notable papers including resource and KB papers related to cancer genomics* - -#. **GIAB**\ : Zook, J. M., Catoe, D., McDaniel, J., Vang, L., Spies, N., Sidow, A., … Salit, M. (2016). Extensive sequencing of seven human genomes to characterize benchmark reference materials. Scientific Data, 3, 160025. https://doi.org/10.1038/sdata.2016.25 +#. **GIAB**: Zook, J. M., Catoe, D., McDaniel, J., Vang, L., Spies, N., Sidow, A., … Salit, M. (2016). Extensive sequencing of seven human genomes to characterize benchmark reference materials. Scientific Data, 3, 160025. https://doi.org/10.1038/sdata.2016.25 Methods and tools ----------------- *Excluding multiple method comparison or benchmarking tools* - -* - **BreakDancer**\ : Chen, K., Wallis, J. W., Mclellan, M. D., Larson, D. E., Kalicki, J. M., Pohl, C. S., … Elaine, R. (2013). BreakDancer - An algorithm for high resolution mapping of genomic structure variation. Nature Methods, 6(9), 677–681. https://doi.org/10.1038/nmeth.1363.BreakDancer - -* - **Pindel**\ : Ye, K., Schulz, M. H., Long, Q., Apweiler, R., & Ning, Z. (2009). Pindel: A pattern growth approach to detect break points of large deletions and medium sized insertions from paired-end short reads. Bioinformatics, 25(21), 2865–2871. https://doi.org/10.1093/bioinformatics/btp394 - -* **SVDetect**\ : Zeitouni, B., Boeva, V., Janoueix-Lerosey, I., Loeillet, S., Legoix-né, P., Nicolas, A., … Barillot, E. (2010). SVDetect: A tool to identify genomic structural variations from paired-end and mate-pair sequencing data. Bioinformatics, 26(15), 1895–1896. https://doi.org/10.1093/bioinformatics/btq293 -* **Purityest**\ : Su, X., Zhang, L., Zhang, J., Meric-bernstam, F., & Weinstein, J. N. (2012). Purityest: Estimating purity of human tumor samples using next-generation sequencing data. 
Bioinformatics, 28(17), 2265–2266. https://doi.org/10.1093/bioinformatics/bts365 -* **PurBayes**\ : Larson, N. B., & Fridley, B. L. (2013). PurBayes: Estimating tumor cellularity and subclonality in next-generation sequencing data. Bioinformatics, 29(15), 1888–1889. https://doi.org/10.1093/bioinformatics/btt293 -* **ANNOVAR**\ : Wang, K., Li, M., & Hakonarson, H. (2010). ANNOVAR: Functional annotation of genetic variants from high-throughput sequencing data. Nucleic Acids Research, 38(16), 1–7. https://doi.org/10.1093/nar/gkq603 -* **ASCAT**\ : Van Loo, P., Nordgard, S. H., Lingjaerde, O. C., Russnes, H. G., Rye, I. H., Sun, W., … Kristensen, V. N. (2010). Allele-specific copy number analysis of tumors. Proceedings of the National Academy of Sciences, 107(39), 16910–16915. https://doi.org/10.1073/pnas.1009843107 -* **Treeomics**\ : Reiter, J. G., Makohon-Moore, A. P., Gerold, J. M., Bozic, I., Chatterjee, K., Iacobuzio-Donahue, C. A., … Nowak, M. A. (2017). Reconstructing metastatic seeding patterns of human cancers. Nature Communications, 8, 14114. https://doi.org/10.1038/ncomms14114 -* **deconstructSigs**\ : Rosenthal, R., McGranahan, N., Herrero, J., Taylor, B. S., & Swanton, C. (2016). deconstructSigs: delineating mutational processes in single tumors distinguishes DNA repair deficiencies and patterns of carcinoma evolution. Genome Biology, 17(1), 31. https://doi.org/10.1186/s13059-016-0893-4 -* **MutationalPatterns**\ : Blokzijl, F., Janssen, R., van Boxtel, R., & Cuppen, E. (2017). MutationalPatterns: comprehensive genome-wide analysis of mutational processes. bioRxiv, 1–20. https://doi.org/https://doi.org/10.1101/071761 -* **MaSuRCA**\ : Zimin, A. V., Marçais, G., Puiu, D., Roberts, M., Salzberg, S. L., & Yorke, J. A. (2013). The MaSuRCA genome assembler. Bioinformatics, 29(21), 2669–2677. https://doi.org/10.1093/bioinformatics/btt476 -* **VarDict**\ : Lai, Z., Markovets, A., Ahdesmaki, M., Chapman, B., Hofmann, O., Mcewen, R., … Dry, J. R. (2016). VarDict: A novel and versatile variant caller for next-generation sequencing in cancer research. Nucleic Acids Research, 44(11), 1–11. https://doi.org/10.1093/nar/gkw227 -* **vt**\ : Tan, A., Abecasis, G. R., & Kang, H. M. (2015). Unified representation of genetic variants. Bioinformatics, 31(13), 2202–2204. https://doi.org/10.1093/bioinformatics/btv112 -* **peddy**\ : Pedersen, B. S., & Quinlan, A. R. (2017). Who’s Who? Detecting and Resolving Sample Anomalies in Human DNA Sequencing Studies with Peddy. American Journal of Human Genetics, 100(3), 406–413. https://doi.org/10.1016/j.ajhg.2017.01.017 -* **GQT**\ : Layer, R. M., Kindlon, N., Karczewski, K. J., & Quinlan, A. R. (2015). Efficient genotype compression and analysis of large genetic-variation data sets. Nature Methods, 13(1). https://doi.org/10.1038/nmeth.3654 +* **BreakDancer**: Chen, K., Wallis, J. W., Mclellan, M. D., Larson, D. E., Kalicki, J. M., Pohl, C. S., … Elaine, R. (2013). BreakDancer - An algorithm for high resolution mapping of genomic structure variation. Nature Methods, 6(9), 677–681. https://doi.org/10.1038/nmeth.1363 +* **Pindel**: Ye, K., Schulz, M. H., Long, Q., Apweiler, R., & Ning, Z. (2009). Pindel: A pattern growth approach to detect break points of large deletions and medium sized insertions from paired-end short reads. Bioinformatics, 25(21), 2865–2871. https://doi.org/10.1093/bioinformatics/btp394 +* **SVDetect**: Zeitouni, B., Boeva, V., Janoueix-Lerosey, I., Loeillet, S., Legoix-né, P., Nicolas, A., … Barillot, E. (2010). 
SVDetect: A tool to identify genomic structural variations from paired-end and mate-pair sequencing data. Bioinformatics, 26(15), 1895–1896. https://doi.org/10.1093/bioinformatics/btq293 +* **Purityest**: Su, X., Zhang, L., Zhang, J., Meric-bernstam, F., & Weinstein, J. N. (2012). Purityest: Estimating purity of human tumor samples using next-generation sequencing data. Bioinformatics, 28(17), 2265–2266. https://doi.org/10.1093/bioinformatics/bts365 +* **PurBayes**: Larson, N. B., & Fridley, B. L. (2013). PurBayes: Estimating tumor cellularity and subclonality in next-generation sequencing data. Bioinformatics, 29(15), 1888–1889. https://doi.org/10.1093/bioinformatics/btt293 +* **ANNOVAR**: Wang, K., Li, M., & Hakonarson, H. (2010). ANNOVAR: Functional annotation of genetic variants from high-throughput sequencing data. Nucleic Acids Research, 38(16), 1–7. https://doi.org/10.1093/nar/gkq603 +* **ASCAT**: Van Loo, P., Nordgard, S. H., Lingjaerde, O. C., Russnes, H. G., Rye, I. H., Sun, W., … Kristensen, V. N. (2010). Allele-specific copy number analysis of tumors. Proceedings of the National Academy of Sciences, 107(39), 16910–16915. https://doi.org/10.1073/pnas.1009843107 +* **Treeomics**: Reiter, J. G., Makohon-Moore, A. P., Gerold, J. M., Bozic, I., Chatterjee, K., Iacobuzio-Donahue, C. A., … Nowak, M. A. (2017). Reconstructing metastatic seeding patterns of human cancers. Nature Communications, 8, 14114. https://doi.org/10.1038/ncomms14114 +* **deconstructSigs**: Rosenthal, R., McGranahan, N., Herrero, J., Taylor, B. S., & Swanton, C. (2016). deconstructSigs: delineating mutational processes in single tumors distinguishes DNA repair deficiencies and patterns of carcinoma evolution. Genome Biology, 17(1), 31. https://doi.org/10.1186/s13059-016-0893-4 +* **MutationalPatterns**: Blokzijl, F., Janssen, R., van Boxtel, R., & Cuppen, E. (2017). MutationalPatterns: comprehensive genome-wide analysis of mutational processes. bioRxiv, 1–20. https://doi.org/10.1101/071761 +* **MaSuRCA**: Zimin, A. V., Marçais, G., Puiu, D., Roberts, M., Salzberg, S. L., & Yorke, J. A. (2013). The MaSuRCA genome assembler. Bioinformatics, 29(21), 2669–2677. https://doi.org/10.1093/bioinformatics/btt476 +* **VarDict**: Lai, Z., Markovets, A., Ahdesmaki, M., Chapman, B., Hofmann, O., Mcewen, R., … Dry, J. R. (2016). VarDict: A novel and versatile variant caller for next-generation sequencing in cancer research. Nucleic Acids Research, 44(11), 1–11. https://doi.org/10.1093/nar/gkw227 +* **vt**: Tan, A., Abecasis, G. R., & Kang, H. M. (2015). Unified representation of genetic variants. Bioinformatics, 31(13), 2202–2204. https://doi.org/10.1093/bioinformatics/btv112 +* **peddy**: Pedersen, B. S., & Quinlan, A. R. (2017). Who’s Who? Detecting and Resolving Sample Anomalies in Human DNA Sequencing Studies with Peddy. American Journal of Human Genetics, 100(3), 406–413. https://doi.org/10.1016/j.ajhg.2017.01.017 +* **GQT**: Layer, R. M., Kindlon, N., Karczewski, K. J., & Quinlan, A. R. (2015). Efficient genotype compression and analysis of large genetic-variation data sets. Nature Methods, 13(1). https://doi.org/10.1038/nmeth.3654 *Tool sets and softwares required at various steps of pipeline development* - -#. - **Teaser**\ : NGS readmapping benchmarking. - - - * http://teaser.cibiv.univie.ac.at/ * https://github.com/Cibiv/Teaser -#. - **FastQC**\ : Quality control tool. https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ -#. **Cutadapt**\ : Adapter removal tool. 
https://cutadapt.readthedocs.io/en/stable/ -#. **Trim Galore!**\ : FastQC and Cutadapt wrapper. https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/ -#. **Picardtools**\ : BAM/SAM/VCF/CRAM manipulator. http://broadinstitute.github.io/picard/ - - * **MarkDuplicate**\ : Mark duplicate reads and potentially remove them - * **LiftoverVcf**\ : liftover VCF between builds - * **CollectHsMetric**\ : Collects hybrid-selection (HS) metrics for a SAM or BAM file - * **CollectAlignmentSummaryMetrics**\ : Produces a summary of alignment metrics from a SAM or BAM file - * **CollectGcBiasMetrics**\ : Collect metrics regarding GC bias - * **CollectWgsMetrics**\ : Collect metrics about coverage and performance of whole genome sequencing (WGS) experiments - -#. **GATK**\ : A variant discovery tool: https://software.broadinstitute.org/gatk/ - - * **BaseRecalibrator**\ : Detect systematic error in base quality score - * **Somatic Indel Realigner**\ : Local Realignment around Indels - * **ContEst**\ : Estimate cross sample contamination - * **DepthOfCoverage**\ : Assess sequence coverage by sample, read group, or libraries - * **DuplicateReadFilter**\ : remove duplicated from flag set by MarkDuplicates - -#. **Samtools**\ : Reading/writing/editing/indexing/viewing SAM/BAM/CRAM format http://www.htslib.org/ -#. **Sambamba**\ : Tools for working with SAM/BAM/CRAM data http://lomereiter.github.io/sambamba/ -#. **bcftools**\ : Reading/writing BCF2/VCF/gVCF files and calling/filtering/summarising SNP and short indel sequence variants http://www.htslib.org/doc/bcftools.html -#. **vcftools**\ : VCFtools is a program package designed for working with VCF files, such as those generated by the 1000 Genomes Project. https://vcftools.github.io/index.html -#. **Delly2**\ : An integrated structural variant prediction method that can discover, genotype and visualize deletions, tandem duplications, inversions and translocations https://github.com/dellytools/delly -#. **PLINK**\ : PLINK: Whole genome data analysis toolset https://www.cog-genomics.org/plink2 -#. **freebayes**\ : a haplotype-based variant detector. https://github.com/ekg/freebayes -#. **AscatNGS**\ : Allele-Specific Copy Number Analysis of Tumors, tumor purity and ploidy https://github.com/cancerit/ascatNgs -#. **MutationalPatterns**\ : R package for extracting and visualizing mutational patterns in base substitution catalogues https://github.com/UMCUGenetics/MutationalPatterns -#. **desconstructSigs**\ : identification of mutational signatures within a single tumor sample https://github.com/raerose01/deconstructSigs -#. **treeOmics**\ : Decrypting somatic mutation patterns to reveal the evolution of cancer - https://github.com/johannesreiter/treeomics -#. **controlFreeC**\ : Copy number and allelic content caller http://boevalab.com/FREEC/ -#. **MuTect2**\ : Call somatic SNPs and indels via local re-assembly of haplotypes https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_gatk_tools_walkers_cancer_m2_MuTect2.php -#. **Annovar**\ : annotation of detected genetic variation http://annovar.openbioinformatics.org/en/latest/ -#. **Strelka**\ : Small variant caller https://github.com/Illumina/strelka -#. **Manta**\ : Structural variant caller https://github.com/Illumina/manta -#. **PurBayes**\ : estimate tumor purity and clonality -#. **VarDict**\ : variant caller for both single and paired sample variant calling from BAM files https://github.com/AstraZeneca-NGS/VarDict -#. 
**SNPeff/SNPSift**\ : Genomic variant annotations and functional effect prediction toolbox. http://snpeff.sourceforge.net/ and http://snpeff.sourceforge.net/SnpSift.html -#. **IGV**\ : visualization tool for interactive exploration http://software.broadinstitute.org/software/igv/ -#. **SVDetect**\ : a tool to detect genomic structural variations http://svdetect.sourceforge.net/Site/Home.html -#. **GenomeSTRiP**\ : A suite of tools for discovering and genotyping structural variations using sequencing data http://software.broadinstitute.org/software/genomestrip/ -#. **BreakDancer**\ : SV detection from paired end reads mapping https://github.com/genome/breakdancer -#. **pIndel**\ : Detect breakpoints of large deletions, medium sized insertions, inversions, and tandem duplications https://github.com/genome/pindel -#. **VarScan**\ : Variant calling and somatic mutation/CNV detection https://github.com/dkoboldt/varscan -#. **VEP**\ : Variant Effect Predictor https://www.ensembl.org/info/docs/tools/vep/index.html -#. **Probablistic2020**\ : Simulates somatic mutations, and calls statistically significant oncogenes and tumor suppressor genes based on a randomization-based test https://github.com/KarchinLab/probabilistic2020 -#. **2020plus**\ : Classifies genes as an oncogene, tumor suppressor gene, or as a non-driver gene by using Random Forests https://github.com/KarchinLab/2020plus -#. **vtools**\ : variant tools is a software tool for the manipulation, annotation, selection, simulation, and analysis of variants in the context of next-gen sequencing analysis. http://varianttools.sourceforge.net/Main/HomePage -#. **vt**\ : A variant tool set that discovers short variants from Next Generation Sequencing data. https://genome.sph.umich.edu/wiki/Vt and https://github.com/atks/vt -#. **CNVnator**\ : a tool for CNV discovery and genotyping from depth-of-coverage by mapped reads. https://github.com/abyzovlab/CNVnator -#. **CNVpytor**\ : a tool for copy number variation detection and analysis from read depth and allele imbalance in whole-genome sequencing. https://github.com/abyzovlab/CNVpytor -#. **SvABA**\ : Structural variation and indel detection by local assembly. https://github.com/walaj/svaba -#. **indelope**\ : find indels and SVs too small for structural variant callers and too large for GATK. https://github.com/brentp/indelope -#. **peddy**\ : peddy compares familial-relationships and sexes as reported in a PED/FAM file with those inferred from a VCF. https://github.com/brentp/peddy -#. **cyvcf2**\ : cyvcf2 is a cython wrapper around htslib built for fast parsing of Variant Call Format (VCF) files. https://github.com/brentp/cyvcf2 -#. **GQT**\ : Genotype Query Tools (GQT) is command line software and a C API for indexing and querying large-scale genotype data sets. https://github.com/ryanlayer/gqt -#. **LOFTEE**\ : Loss-Of-Function Transcript Effect Estimator. A VEP plugin to identify LoF (loss-of-function) variation. Assesses variants that are: Stop-gained, Splice site disrupting, and Frameshift variants. https://github.com/konradjk/loftee -#. **PureCN**\ : copy number calling and SNV classification using targeted short read sequencing https://bioconductor.org/packages/release/bioc/html/PureCN.html -#. **SVCaller**\ : A structural variant caller. https://github.com/tomwhi/svcaller -#. **SnakeMake**\ : A workflow manager. http://snakemake.readthedocs.io/en/stable/index.html -#. 
**BWA**\ : BWA is a software package for mapping low-divergent sequences against a large reference genome, such as the human genome. It consists of three algorithms: BWA-backtrack, BWA-SW and BWA-MEM. http://bio-bwa.sourceforge.net/ -#. **wgsim**\ : Wgsim is a small tool for simulating sequence reads from a reference genome. It is able to simulate diploid genomes with SNPs and insertion/deletion (INDEL) polymorphisms, and simulate reads with uniform substitution sequencing errors. https://github.com/lh3/wgsim -#. **dwgsim**\ : Whole genome simulation can be performed with dwgsim. dwgsim is based off of wgsim found in SAMtools. https://github.com/nh13/DWGSIM -#. **ABSOLUTE**\ : ABSOLUTE can estimate purity/ploidy, and from that compute absolute copy-number and mutation multiplicities. http://archive.broadinstitute.org/cancer/cga/absolute -#. **THetA**\ : Tumor Heterogeneity Analysis. This algorithm estimates tumor purity and clonal/subclonal copy number aberrations directly from high-throughput DNA sequencing data. https://github.com/raphael-group/THetA -#. **Skewer**\ : Adapter trimming, similar to cutadapt. https://github.com/relipmoc/skewer -#. **Phylowgs**\ : Application for inferring subclonal composition and evolution from whole-genome sequencing data. https://github.com/morrislab/phylowgs -#. **superFreq**\ : SuperFreq is an R package that analyses cancer exomes to track subclones. https://github.com/ChristofferFlensburg/superFreq -#. **readVCF-r**\ : Read VCFs into R and annotatte them. https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html -#. **vcfr**\ : Read VCFs into R. https://github.com/knausb/vcfR -#. **msisensor**\ : microsatellite instability detection using paired tumor-normal https://github.com/ding-lab/msisensor -#. **MOSAIC**\ : MicrOSAtellite Instability Classifier https://github.com/ronaldhause/mosaic -#. **MANTIS**\ : Microsatellite Analysis for Normal-Tumor InStability https://github.com/OSU-SRLab/MANTIS -#. **SBDB**\ : A toolkit for constricting and querying structural variant databases https://github.com/J35P312/SVDB +#. **FastQC**: Quality control tool. https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ +#. **Cutadapt**: Adapter removal tool. https://cutadapt.readthedocs.io/en/stable/ +#. **Trim Galore!**: FastQC and Cutadapt wrapper. https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/ +#. **Picardtools**: BAM/SAM/VCF/CRAM manipulator. http://broadinstitute.github.io/picard/ + + * **MarkDuplicate**: Mark duplicate reads and potentially remove them + * **LiftoverVcf**: liftover VCF between builds + * **CollectHsMetric**: Collects hybrid-selection (HS) metrics for a SAM or BAM file + * **CollectAlignmentSummaryMetrics**: Produces a summary of alignment metrics from a SAM or BAM file + * **CollectGcBiasMetrics**: Collect metrics regarding GC bias + * **CollectWgsMetrics**: Collect metrics about coverage and performance of whole genome sequencing (WGS) experiments + +#. **GATK**: A variant discovery tool: https://gatk.broadinstitute.org/hc/en-us + + * **BaseRecalibrator**: Detect systematic error in base quality score + * **Somatic Indel Realigner**: Local Realignment around Indels + * **ContEst**: Estimate cross sample contamination + * **DepthOfCoverage**: Assess sequence coverage by sample, read group, or libraries + * **DuplicateReadFilter**: removes duplicates based on the flag set by MarkDuplicates + +#. **Samtools**: Reading/writing/editing/indexing/viewing SAM/BAM/CRAM format http://www.htslib.org/ +#. 
**Sambamba**: Tools for working with SAM/BAM/CRAM data http://lomereiter.github.io/sambamba/ +#. **bcftools**: Reading/writing BCF2/VCF/gVCF files and calling/filtering/summarising SNP and short indel sequence variants http://www.htslib.org/doc/bcftools.html +#. **vcftools**: VCFtools is a program package designed for working with VCF files, such as those generated by the 1000 Genomes Project. https://vcftools.github.io/index.html +#. **Delly2**: An integrated structural variant prediction method that can discover, genotype and visualize deletions, tandem duplications, inversions and translocations https://github.com/dellytools/delly +#. **PLINK**: PLINK: Whole genome data analysis toolset https://www.cog-genomics.org/plink2 +#. **freebayes**: a haplotype-based variant detector. https://github.com/ekg/freebayes +#. **AscatNGS**: Allele-Specific Copy Number Analysis of Tumors, tumor purity and ploidy https://github.com/cancerit/ascatNgs +#. **MutationalPatterns**: R package for extracting and visualizing mutational patterns in base substitution catalogues https://github.com/UMCUGenetics/MutationalPatterns +#. **deconstructSigs**: identification of mutational signatures within a single tumor sample https://github.com/raerose01/deconstructSigs +#. **treeOmics**: Decrypting somatic mutation patterns to reveal the evolution of cancer https://github.com/johannesreiter/treeomics +#. **controlFreeC**: Copy number and allelic content caller http://boevalab.com/FREEC/ +#. **MuTect2**: Call somatic SNPs and indels via local re-assembly of haplotypes https://gatk.broadinstitute.org/hc/en-us/articles/360037593851-Mutect2 +#. **Annovar**: annotation of detected genetic variation http://annovar.openbioinformatics.org/en/latest/ +#. **Strelka**: Small variant caller https://github.com/Illumina/strelka +#. **Manta**: Structural variant caller https://github.com/Illumina/manta +#. **PurBayes**: estimate tumor purity and clonality +#. **VarDict**: variant caller for both single and paired sample variant calling from BAM files https://github.com/AstraZeneca-NGS/VarDict +#. **SNPeff/SNPSift**: Genomic variant annotations and functional effect prediction toolbox. http://snpeff.sourceforge.net/ and http://snpeff.sourceforge.net/SnpSift.html +#. **IGV**: visualization tool for interactive exploration http://software.broadinstitute.org/software/igv/ +#. **SVDetect**: a tool to detect genomic structural variations http://svdetect.sourceforge.net/Site/Home.html +#. **GenomeSTRiP**: A suite of tools for discovering and genotyping structural variations using sequencing data http://software.broadinstitute.org/software/genomestrip/ +#. **BreakDancer**: SV detection from paired end reads mapping https://github.com/genome/breakdancer +#. **pIndel**: Detect breakpoints of large deletions, medium sized insertions, inversions, and tandem duplications https://github.com/genome/pindel +#. **VarScan**: Variant calling and somatic mutation/CNV detection https://github.com/dkoboldt/varscan +#. **VEP**: Variant Effect Predictor https://www.ensembl.org/info/docs/tools/vep/index.html +#. **Probabilistic2020**: Simulates somatic mutations, and calls statistically significant oncogenes and tumor suppressor genes based on a randomization-based test https://github.com/KarchinLab/probabilistic2020 +#. **2020plus**: Classifies genes as an oncogene, tumor suppressor gene, or as a non-driver gene by using Random Forests https://github.com/KarchinLab/2020plus +#. 
**vtools**: variant tools is a software tool for the manipulation, annotation, selection, simulation, and analysis of variants in the context of next-gen sequencing analysis. https://vatlab.github.io/vat-docs/ +#. **vt**: A variant tool set that discovers short variants from Next Generation Sequencing data. https://genome.sph.umich.edu/wiki/Vt and https://github.com/atks/vt +#. **CNVnator**: a tool for CNV discovery and genotyping from depth-of-coverage by mapped reads. https://github.com/abyzovlab/CNVnator +#. **CNVpytor**: a tool for copy number variation detection and analysis from read depth and allele imbalance in whole-genome sequencing. https://github.com/abyzovlab/CNVpytor +#. **SvABA**: Structural variation and indel detection by local assembly. https://github.com/walaj/svaba +#. **indelope**: find indels and SVs too small for structural variant callers and too large for GATK. https://github.com/brentp/indelope +#. **peddy**: peddy compares familial-relationships and sexes as reported in a PED/FAM file with those inferred from a VCF. https://github.com/brentp/peddy +#. **cyvcf2**: cyvcf2 is a cython wrapper around htslib built for fast parsing of Variant Call Format (VCF) files. https://github.com/brentp/cyvcf2 +#. **GQT**: Genotype Query Tools (GQT) is command line software and a C API for indexing and querying large-scale genotype data sets. https://github.com/ryanlayer/gqt +#. **LOFTEE**: Loss-Of-Function Transcript Effect Estimator. A VEP plugin to identify LoF (loss-of-function) variation. Assesses variants that are: Stop-gained, Splice site disrupting, and Frameshift variants. https://github.com/konradjk/loftee +#. **PureCN**: copy number calling and SNV classification using targeted short read sequencing https://bioconductor.org/packages/release/bioc/html/PureCN.html +#. **SVCaller**: A structural variant caller. https://github.com/tomwhi/svcaller +#. **SnakeMake**: A workflow manager. http://snakemake.readthedocs.io/en/stable/index.html +#. **BWA**: BWA is a software package for mapping low-divergent sequences against a large reference genome, such as the human genome. It consists of three algorithms: BWA-backtrack, BWA-SW and BWA-MEM. http://bio-bwa.sourceforge.net/ +#. **wgsim**: Wgsim is a small tool for simulating sequence reads from a reference genome. It is able to simulate diploid genomes with SNPs and insertion/deletion (INDEL) polymorphisms, and simulate reads with uniform substitution sequencing errors. https://github.com/lh3/wgsim +#. **dwgsim**: Whole genome simulation can be performed with dwgsim. dwgsim is based off of wgsim found in SAMtools. https://github.com/nh13/DWGSIM +#. **THetA**: Tumor Heterogeneity Analysis. This algorithm estimates tumor purity and clonal/subclonal copy number aberrations directly from high-throughput DNA sequencing data. https://github.com/raphael-group/THetA +#. **Skewer**: Adapter trimming, similar to cutadapt. https://github.com/relipmoc/skewer +#. **Phylowgs**: Application for inferring subclonal composition and evolution from whole-genome sequencing data. https://github.com/morrislab/phylowgs +#. **superFreq**: SuperFreq is an R package that analyses cancer exomes to track subclones. https://github.com/ChristofferFlensburg/superFreq +#. **readVCF-r**: Read VCFs into R and annotate them. https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html +#. **vcfr**: Read VCFs into R. https://github.com/knausb/vcfR +#. 
**msisensor**: microsatellite instability detection using paired tumor-normal https://github.com/ding-lab/msisensor +#. **MOSAIC**: MicrOSAtellite Instability Classifier https://github.com/ronaldhause/mosaic +#. **MANTIS**: Microsatellite Analysis for Normal-Tumor InStability https://github.com/OSU-SRLab/MANTIS +#. **SBDB**: A toolkit for constructing and querying structural variant databases https://github.com/J35P312/SVDB diff --git a/docs/semver.rst b/docs/semver.rst index 8d1ba863d..4a83676e6 100644 --- a/docs/semver.rst +++ b/docs/semver.rst @@ -13,7 +13,6 @@ Since October 24, 2018 the following changes were added in addition to SemVer al callers, aligners, quality trimmers, and/or anything other than QC reporting and rule `all`. - Addition of annotation softwares or sources, variant callers, aligners, quality trimmers, and/or anything other than QC reporting - - **minor**: diff --git a/docs/user_guide.rst b/docs/user_guide.rst index b6f13a205..1b20d8e8e 100644 --- a/docs/user_guide.rst +++ b/docs/user_guide.rst @@ -4,27 +4,36 @@ Short tutorial Here a short tutorial is provided for BALSAMIC (**version** = 12.0.2). +Regarding fastq-inputs +---------------------- + +Previous versions of BALSAMIC only accepted one fastq-pair per sample, which required concatenation of fastq-pairs if multiple existed. + +The current version of BALSAMIC instead takes ``--fastq-path``, which is a path to a directory containing ALL fastq-files you want to include in the analysis, for tumor and normal (if a normal exists for the analysis). + +**NOTE**: The fastq-files in ``--fastq-path`` need to contain the names given by ``--tumor-sample-name [sample_name]`` and ``--normal-sample-name [sample_name]`` as a sub-string in their file names to be correctly assigned to their respective sample. For example, with ``--tumor-sample-name S1``, the files ``S1_R_1.fastq.gz`` and ``S1_R_2.fastq.gz`` would be assigned to the tumor sample. + Running a test sample --------------------- -Given the +Example config for a demo case: :: balsamic config case \ - --tumor tests/test_data/fastq/S1_R_1.fastq.gz \ - --normal tests/test_data/fastq/S2_R_1.fastq.gz \ - --case-id demo_run_balsamic \ - --analysis-dir demo/ \ - --panel-bed tests/test_data/references/panel/panel.bed \ - --balsamic-cache ~/balsamic_cache \ - --quiet + --analysis-dir demo/ \ + --balsamic-cache ~/balsamic_cache \ + --fastq-path tests/test_data/fastq/ \ + --case-id demo_run_balsamic \ + --gender female \ + --analysis-workflow balsamic \ + --genome-version hg19 \ + --tumor-sample-name S1 \ + --panel-bed tests/test_data/references/panel/panel.bed + -Notes: -- If you want to test tumor_only mode, remove the ``--normal tests/test_data/fastq/S2_R_1.fastq.gz`` line. -- ``--output-config demo_run_balsamic.json`` is also optional Let's try a dry run and see everything is in place: diff --git a/requirements-dev.txt b/requirements-dev.txt index fea21c52a..4a4d19506 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,5 @@ -pytest==7.1.3 -pytest-cov==2.8.1 -coveralls -pylint==2.13.9 -black==22.3.0 -pillow>=8.4.0 -fpdf2==2.4.6 +black==23.7.0 +bump2version==1.0.1 +coveralls==3.3.1 +pytest-cov==4.1.0 +pytest==7.4.0 diff --git a/setup.py b/setup.py index 65543ad9b..43d355725 100644 --- a/setup.py +++ b/setup.py @@ -1,38 +1,119 @@ +"""Description of the Balsamic package.""" import os -from setuptools import setup, find_packages +from typing import List -here = os.path.abspath(os.path.dirname(__file__)) +from setuptools import find_packages, setup -# Load the package's __version__.py module as a dictionary. 
-about = {} -with open(os.path.join(here, "BALSAMIC", "__version__.py")) as f: - exec(f.read(), about) +NAME: str = "BALSAMIC" +AUTHOR: str = "Clinical Genomics" +URL: str = "https://github.com/Clinical-Genomics/BALSAMIC" +EMAIL: str = "support@clinicalgenomics.se" +REQUIRES_PYTHON: str = ">=3.11.0" # Requirements -requirements = [ - "click==8.1.3", - "colorclass>=2.2.0", - "coloredlogs>=14.0", - "graphviz==0.16", - "gsutil==5.23", - "jinja2>=2.11.2", - "matplotlib==3.5.2", - "networkx>=2.6.3", - "numpy>=1.21.6", - "pandas>=1.1.5", - "psutil>=5.7.0", - "pydantic>=1.9.0,<2.0", - "pygments>=2.6.1", - "pyyaml>=5.3.1", - "six>=1.12.0", - "snakemake==6.5.3", - "yapf>=0.30.0", - "h5py>=3.6.0", - "PyPDF2>=1.26.0", - "markdown==3.3.3", - "cryptography==40.0.2", - "tabulate==0.8.10", +requirements: List[str] = [ + "aiohttp==3.8.5", + "aiosignal==1.3.1", + "appdirs==1.4.4", + "argcomplete==3.1.1", + "async-timeout==4.0.2", + "attrs==23.1.0", + "boto==2.49.0", + "cachetools==5.3.1", + "certifi==2023.7.22", + "cffi==1.15.1", + "charset-normalizer==3.2.0", + "click==8.1.6", + "colorclass==2.2.2", + "coloredlogs==15.0.1", + "ConfigArgParse==1.7", + "connection-pool==0.0.3", + "contourpy==1.1.0", + "crcmod==1.7", + "cryptography==41.0.3", + "cycler==0.11.0", + "cyvcf2==0.30.22", + "datrie==0.8.2", + "defusedxml==0.7.1", + "dpath==2.1.6", + "fasteners==0.18", + "fastjsonschema==2.18.0", + "fonttools==4.42.0", + "fpdf2==2.7.4", + "frozenlist==1.4.0", + "gcs-oauth2-boto-plugin==3.0", + "gitdb==4.0.10", + "GitPython==3.1.32", + "google-apitools==0.5.32", + "google-auth==2.22.0", + "google-reauth==0.1.1", + "graphviz==0.20.1", + "gsutil==5.25", + "h5py==3.9.0", + "httplib2==0.20.4", + "humanfriendly==10.0", + "idna==3.4", + "importlib-metadata==6.8.0", + "iniconfig==2.0.0", + "Jinja2==3.1.2", + "jsonschema-specifications==2023.7.1", + "jsonschema==4.18.6", + "jupyter_core==5.3.1", + "kiwisolver==1.4.4", + "MarkupSafe==2.1.3", + "matplotlib==3.7.2", + "monotonic==1.6", + "multidict==6.0.4", + "nbformat==5.9.2", + "numpy==1.25.2", + "oauth2client==4.1.3", + "packaging==23.1", + "pandas==2.0.3", + "pdfkit==1.0.0", + "Pillow==10.0.0", + "plac==1.3.5", + "platformdirs==3.10.0", + "pluggy==1.2.0", + "psutil==5.9.5", + "PuLP==2.7.0", + "pyasn1-modules==0.3.0", + "pyasn1==0.5.0", + "pycparser==2.21", + "pydantic>=2.4", + "Pygments==2.15.1", + "pyOpenSSL==23.2.0", + "pyparsing==3.0.9", + "pypdf==3.17.1", + "pytest==7.4.0", + "python-dateutil==2.8.2", + "pytz==2023.3", + "pyu2f==0.1.5", + "PyYAML==6.0.1", + "referencing==0.30.0", + "requests==2.31.0", + "reretry==0.11.8", + "retry-decorator==1.1.1", + "rpds-py==0.9.2", + "rsa==4.7.2", + "six==1.16.0", + "smart-open==6.3.0", + "smmap==5.0.0", + "snakemake==7.32.4", + "stopit==1.1.2", + "tabulate==0.9.0", + "throttler==1.2.2", "toml==0.10.2", + "tomli==2.0.1", + "toposort==1.10", + "traitlets==5.9.0", + "typing_extensions==4.7.1", + "tzdata==2023.3", + "urllib3==1.26.16", + "wrapt==1.15.0", + "yapf==0.40.1", + "yarl==1.9.2", + "yte==1.5.1", + "zipp==3.16.2", ] # The C libraries required to build numpy are not available on RTD @@ -40,12 +121,13 @@ requirements.extend(["cyvcf2==0.30.22"]) setup( - name="BALSAMIC", - version=about["__version__"], - url="https://github.com/Clinical-Genomics/BALSAMIC", - author="Hassan Foroughi Asl", - author_email="hassan.foroughi@scilifelab.se", + name=NAME, + version="12.0.2", + url=URL, + author=AUTHOR, + author_email=EMAIL, install_requires=requirements, + python_requires=REQUIRES_PYTHON, packages=find_packages(), package_data={ "": [ 
diff --git a/tests/commands/config/test_config_pon.py b/tests/commands/config/test_config_pon.py index eede28625..e8a7a5b23 100644 --- a/tests/commands/config/test_config_pon.py +++ b/tests/commands/config/test_config_pon.py @@ -1,20 +1,22 @@ -import os import json import graphviz import logging from unittest import mock from pathlib import Path +from BALSAMIC.constants.analysis import PONWorkflow -from BALSAMIC.utils.cli import create_pon_fastq_symlink - -def test_pon_config( - invoke_cli, tmp_path, balsamic_cache, panel_bed_file, pon_fastq_path +def test_cnvkit_pon_config( + invoke_cli, + analysis_dir: str, + balsamic_cache: str, + panel_bed_file: str, + fastq_dir_pon: str, + case_id_pon: str, ): + """Test balsamic PON config case command for CNVkit.""" + # GIVEN a case ID, fastq files, and an analysis dir - case_id = "sample_pon" - test_analysis_dir = tmp_path / "test_analysis_dir" - test_analysis_dir.mkdir() # WHEN creating a case config result = invoke_cli( @@ -22,38 +24,114 @@ def test_pon_config( "config", "pon", "--case-id", - case_id, - "--version", - "v5", + case_id_pon, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_pon, "-p", panel_bed_file, + "--version", + "v5", + "--balsamic-cache", + balsamic_cache, + "--pon-workflow", + PONWorkflow.CNVKIT, + ] + ) + + # THEN a config should be created and exist + assert result.exit_code == 0 + assert Path(analysis_dir, case_id_pon, case_id_pon + "_PON.json").exists() + + +def test_gens_pon_config( + invoke_cli, + analysis_dir: str, + balsamic_cache: str, + fastq_dir_gens_pon: str, + case_id_gens_pon: str, + gens_hg19_interval_list: str, +): + """Test balsamic PON config case command for GENS.""" + + # GIVEN a case ID, fastq files, and an analysis dir + + # WHEN creating a config for GENS pon creation workflow + result = invoke_cli( + [ + "config", + "pon", + "--case-id", + case_id_gens_pon, "--analysis-dir", - test_analysis_dir, + analysis_dir, "--fastq-path", - pon_fastq_path, + fastq_dir_gens_pon, + "--version", + "v5", "--balsamic-cache", balsamic_cache, + "--pon-workflow", + PONWorkflow.GENS_MALE, + "--genome-interval", + gens_hg19_interval_list, ] ) # THEN a config should be created and exist assert result.exit_code == 0 - assert Path(test_analysis_dir, case_id, case_id + "_PON.json").exists() - # load json file and check if dag exists - pon_config = json.load( - open(Path(test_analysis_dir, case_id, case_id + "_PON.json")) + assert Path(analysis_dir, case_id_gens_pon, case_id_gens_pon + "_PON.json").exists() + + +def test_gens_pon_config_missing_genome_interval( + invoke_cli, + analysis_dir: str, + balsamic_cache: str, + fastq_dir_gens_pon: str, + case_id_gens_pon: str, +): + """Test detection of a missing genome_interval file, which is optional in general but required for GENS.""" + + # GIVEN a case ID, fastq files, and an analysis dir + + # WHEN creating a config for GENS pon creation workflow without required genome_interval file + result = invoke_cli( + [ + "config", + "pon", + "--case-id", + case_id_gens_pon, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_gens_pon, + "--version", + "v5", + "--balsamic-cache", + balsamic_cache, + "--pon-workflow", + PONWorkflow.GENS_MALE, + ] + ) + + # THEN the command should exit with an error + assert ( + "Argument: genome_interval is required for GENS PON creation." 
in result.output ) - # assert if config json dag file is created - assert Path(pon_config["analysis"]["dag"]).exists() -def test_pon_config_failed(invoke_cli, tmp_path, balsamic_cache, panel_bed_file): +def test_cnvkit_pon_config_failed( + invoke_cli, tmp_path: str, balsamic_cache: str, panel_bed_file: str +): + """Test detection of a missing option for a PON config without required arguments.""" # GIVEN a case ID, fastq files, and an analysis dir test_analysis_dir = tmp_path / "test_analysis_dir" test_analysis_dir.mkdir() case_id = "sample_pon" - # WHEN creating a case analysis + # WHEN creating config for cnvkit pon creation workflow result = invoke_cli( [ "config", @@ -74,26 +152,69 @@ def test_pon_config_failed(invoke_cli, tmp_path, balsamic_cache, panel_bed_file) assert result.exit_code == 2 -def test_create_pon_fastq_symlink(tmp_path_factory, caplog): - pon_symlink_from = tmp_path_factory.mktemp("pon_symlink_from") - pon_symlink_to = tmp_path_factory.mktemp("pon_symlink_to") - files = ["normal1_R_1.fastq.gz", "normal1_R_2.fastq.gz"] - ponfiles = [Path(pon_symlink_from, x) for x in files] - for ponfile in ponfiles: - ponfile.touch() - with caplog.at_level(logging.INFO): - create_pon_fastq_symlink(pon_symlink_from, pon_symlink_to) - # THEN destination should have files - assert len(list(Path(pon_symlink_to).rglob("*.fastq.gz"))) == 2 - # THEN exception triggers log message containing "skipping" - assert "skipping" not in caplog.text +def test_cnvkit_pon_config_missing_panel( + invoke_cli, tmp_path: str, balsamic_cache: str, fastq_dir_pon: str +): + """Test detection of a missing panel BED file, which is optional in general but required for CNVkit.""" + # GIVEN a case ID, fastq files, and an analysis dir + test_analysis_dir = tmp_path / "test_analysis_dir" -def test_config_pon_graph_failed( - invoke_cli, analysis_dir, balsamic_cache, pon_fastq_path, panel_bed_file + test_analysis_dir.mkdir() + case_id = "sample_pon" + # WHEN creating config for cnvkit pon creation workflow + result = invoke_cli( + [ + "config", + "pon", + "--case-id", + case_id, + "--analysis-dir", + test_analysis_dir, + "--balsamic-cache", + balsamic_cache, + "--fastq-path", + fastq_dir_pon, + "--pon-workflow", + PONWorkflow.CNVKIT, + "--version", + "v5", + ] + ) + + # THEN a config should not be created and the command should exit with an error + assert "Argument: panel_bed is required for CNVkit PON creation." 
in result.output + assert result.exit_code == 2 + + +def test_dag_graph_success_cnvkit_pon(cnvkit_pon_creation_config: str): + """Test DAG graph building success.""" + # WHEN creating config using standard CLI input and setting Sentieon env vars + + # THEN DAG graph should be created successfully + assert Path(json.load(open(cnvkit_pon_creation_config))["analysis"]["dag"]).exists() + + +def test_dag_graph_success_gens_pon(gens_pon_creation_config: str): + """Test DAG graph building success.""" + # WHEN creating config using standard CLI input and setting Sentieon env vars + + # THEN DAG graph should be created successfully + assert Path(json.load(open(gens_pon_creation_config))["analysis"]["dag"]).exists() + + +def test_cnvkit_pon_config_graph_failed( + invoke_cli, + analysis_dir: str, + balsamic_cache: str, + panel_bed_file: str, ): + """Test DAG graph building failure.""" + # GIVEN an analysis config - pon_case_id = "sample_pon" + case_id = "sample_pon" + fastq_dir: Path = Path(analysis_dir, case_id, "fastq") + fastq_dir.mkdir(parents=True, exist_ok=True) with mock.patch.object(graphviz, "Source") as mocked: mocked.return_value = None @@ -101,17 +222,22 @@ def test_config_pon_graph_failed( [ "config", "pon", - "-p", - panel_bed_file, "--case-id", - pon_case_id, + case_id, "--analysis-dir", analysis_dir, + "--fastq-path", + fastq_dir, + "-p", + panel_bed_file, + "--version", + "v5", "--balsamic-cache", balsamic_cache, - "--fastq-path", - pon_fastq_path, + "--pon-workflow", + PONWorkflow.CNVKIT, ] ) + # THEN the graph should not have been built assert pon_result.exit_code == 1 diff --git a/tests/commands/config/test_config_sample.py b/tests/commands/config/test_config_sample.py index 8347da9e9..728a5156e 100644 --- a/tests/commands/config/test_config_sample.py +++ b/tests/commands/config/test_config_sample.py @@ -1,32 +1,31 @@ -import os import json - +from pathlib import Path from unittest import mock import graphviz -from pathlib import Path +from BALSAMIC.constants.constants import FileType +from tests.conftest import MOCKED_OS_ENVIRON def test_tumor_normal_config( invoke_cli, - sample_fastq, - tmp_path, - balsamic_cache, - panel_bed_file, - sentieon_license, - sentieon_install_dir, + case_id_tumor_normal: str, + tumor_sample_name: str, + normal_sample_name: str, + analysis_dir: str, + fastq_dir_tumor_normal_parameterize: str, + balsamic_cache: str, + panel_bed_file: str, + sentieon_license: str, + sentieon_install_dir: str, ): - # GIVEN a case ID, fastq files, and an analysis dir - test_analysis_dir = tmp_path / "test_analysis_dir" - test_analysis_dir.mkdir() - case_id = "sample_tumor_normal" - tumor = sample_fastq["tumor"] - normal = sample_fastq["normal"] + """Test tumor normal balsamic config case command.""" + # GIVEN a case ID, fastq files, and an analysis dir # WHEN creating a case analysis with mock.patch.dict( - "os.environ", + MOCKED_OS_ENVIRON, { "SENTIEON_LICENSE": sentieon_license, "SENTIEON_INSTALL_DIR": sentieon_install_dir, @@ -36,50 +35,50 @@ def test_tumor_normal_config( [ "config", "case", - "-p", - panel_bed_file, - "-t", - tumor, - "-n", - normal, "--case-id", - case_id, + case_id_tumor_normal, "--gender", "male", "--analysis-dir", - test_analysis_dir, + analysis_dir, + "--fastq-path", + fastq_dir_tumor_normal_parameterize, + "-p", + panel_bed_file, "--balsamic-cache", balsamic_cache, "--tumor-sample-name", - "ACC1", + tumor_sample_name, "--normal-sample-name", - "ACC2", + normal_sample_name, ], ) # THEN a config should be created and exist assert result.exit_code 
== 0 - assert Path(test_analysis_dir, case_id, case_id + ".json").exists() + assert Path( + analysis_dir, case_id_tumor_normal, f"{case_id_tumor_normal}.{FileType.JSON}" + ).exists() def test_tumor_only_config( invoke_cli, - sample_fastq, - tmp_path, - balsamic_cache, - panel_bed_file, - sentieon_license, - sentieon_install_dir, + case_id_tumor_only: str, + tumor_sample_name: str, + analysis_dir: str, + fastq_dir_tumor_only: str, + balsamic_cache: str, + panel_bed_file: str, + sentieon_license: str, + sentieon_install_dir: str, ): + """Test tumor only balsamic config case command.""" + # GIVEN a case ID, fastq files, and an analysis dir - test_analysis_dir = tmp_path / "test_analysis_dir" - test_analysis_dir.mkdir() - case_id = "sample_tumor_only" - tumor = sample_fastq["tumor"] # WHEN creating a case analysis with mock.patch.dict( - "os.environ", + MOCKED_OS_ENVIRON, { "SENTIEON_LICENSE": sentieon_license, "SENTIEON_INSTALL_DIR": sentieon_install_dir, @@ -89,197 +88,378 @@ def test_tumor_only_config( [ "config", "case", - "-p", - panel_bed_file, - "-t", - tumor, "--case-id", - case_id, + case_id_tumor_only, "--analysis-dir", - test_analysis_dir, + analysis_dir, + "--fastq-path", + fastq_dir_tumor_only, + "-p", + panel_bed_file, "--balsamic-cache", balsamic_cache, "--tumor-sample-name", - "ACC1", + tumor_sample_name, ], ) # THEN a config should be created and exist assert result.exit_code == 0 - assert Path(test_analysis_dir, case_id, case_id + ".json").exists() + assert Path( + analysis_dir, case_id_tumor_only, f"{case_id_tumor_only}.{FileType.JSON}" + ).exists() -def test_dag_graph_success( - tumor_normal_wgs_config, - tumor_only_config, - tumor_normal_config, - tumor_only_wgs_config, -): - # WHEN creating config using standard CLI input and setting Sentieon env vars - # THEN DAG graph should be created successfully - assert Path(json.load(open(tumor_normal_config))["analysis"]["dag"]).exists() - assert Path(json.load(open(tumor_only_config))["analysis"]["dag"]).exists() - assert Path(json.load(open(tumor_only_wgs_config))["analysis"]["dag"]).exists() - assert Path(json.load(open(tumor_normal_wgs_config))["analysis"]["dag"]).exists() - - -def test_config_bad_filename( +def test_run_without_permissions( invoke_cli, - tmp_path_factory, - analysis_dir, - panel_bed_file, - balsamic_cache, + case_id_tumor_only: str, + tumor_sample_name: str, + fastq_dir_tumor_only: str, + no_write_perm_path: str, + panel_bed_file: str, + balsamic_cache: str, ): - # GIVEN existing fastq file with wrong naming convention - faulty_fastq_dir = tmp_path_factory.mktemp("error_fastq") - fastq_file_name_tumor = "tumor_error.fastq.gz" - Path(faulty_fastq_dir / fastq_file_name_tumor).touch() + """Test balsamic config case with no write permissions to the analysis directory.""" - case_id1 = "faulty_tumor" - tumor = Path(faulty_fastq_dir / fastq_file_name_tumor).as_posix() + # GIVEN CLI arguments including an analysis_dir without write permissions - # Invoke CLI command using file as argument - case_result = invoke_cli( + # WHEN invoking the config case command + result = invoke_cli( [ "config", "case", - "-t", - tumor, - "-p", - panel_bed_file, "--case-id", - case_id1, + case_id_tumor_only, "--analysis-dir", - analysis_dir, + no_write_perm_path, + "--fastq-path", + fastq_dir_tumor_only, + "-p", + panel_bed_file, "--balsamic-cache", balsamic_cache, + "--tumor-sample-name", + tumor_sample_name, ], ) - # THEN run should abort - assert case_result.exit_code == 1 + # THEN program exits before completion + assert 
result.exit_code == 1 -def test_run_without_permissions( +def test_tumor_only_umi_config_background_file( invoke_cli, - no_write_perm_path, - sample_fastq, - panel_bed_file, - balsamic_cache, + case_id_tumor_only: str, + tumor_sample_name: str, + analysis_dir: str, + fastq_dir_tumor_only: str, + balsamic_cache: str, + panel_bed_file: str, + background_variant_file: str, ): - # GIVEN CLI arguments including an analysis_dir without write permissions - case_id = "sample_tumor_only" - tumor = sample_fastq["tumor"] + """Test balsamic umi config case providing a background variants file.""" + + # GIVEN CLI arguments including a background variant file + # WHEN invoking the config case command result = invoke_cli( [ "config", "case", - "-p", - panel_bed_file, - "-t", - tumor, "--case-id", - case_id, + case_id_tumor_only, + "--analysis-workflow", + "balsamic-umi", "--analysis-dir", - no_write_perm_path, + analysis_dir, + "--fastq-path", + fastq_dir_tumor_only, + "-p", + panel_bed_file, + "--background-variants", + background_variant_file, "--balsamic-cache", balsamic_cache, + "--tumor-sample-name", + tumor_sample_name, ], ) - # THEN program exits before completion - assert result.exit_code == 1 + # THEN program exits and checks for filepath + assert result.exit_code == 0 + assert Path(background_variant_file).exists() -def test_tumor_only_umi_config_background_file( - invoke_cli, sample_fastq, analysis_dir, balsamic_cache, panel_bed_file + +def test_pon_cnn_file( + invoke_cli, + tumor_sample_name: str, + analysis_dir: str, + balsamic_cache: str, + panel_bed_file: str, + pon_cnn_path: str, + fastq_dir_tumor_only: str, + case_id_tumor_only: str, ): + """Test balsamic config case with a PON reference.""" - # GIVEN CLI arguments including a background variant file - case_id = "sample_umi_tumor_only" - tumor = sample_fastq["tumor"] - background_file = "tests/test_data/references/panel/background_variants.txt" - background_variant_file = background_file + # GIVEN CLI arguments including optional pon reference ".cnn" file + # WHEN invoking the config case command result = invoke_cli( [ "config", "case", - "-p", - panel_bed_file, - "-t", - tumor, "--case-id", - case_id, + case_id_tumor_only, "--analysis-dir", analysis_dir, - "--background-variants", - background_variant_file, + "--fastq-path", + fastq_dir_tumor_only, + "-p", + panel_bed_file, + "--pon-cnn", + pon_cnn_path, "--balsamic-cache", balsamic_cache, + "--tumor-sample-name", + tumor_sample_name, ], ) + # THEN program exits and checks for filepath assert result.exit_code == 0 - assert Path(background_variant_file).exists() + assert Path(pon_cnn_path).exists() + + +def test_dag_graph_success_tumor_only(tumor_only_config: str): + """Test DAG graph building success.""" + # WHEN creating config using standard CLI input and setting Sentieon env vars + + # THEN DAG graph should be created successfully + assert Path(json.load(open(tumor_only_config))["analysis"]["dag"]).exists() + + +def test_dag_graph_success_tumor_only_pon(tumor_only_pon_config: str): + """Test DAG graph building success.""" + + # WHEN creating config using standard CLI input and setting Sentieon env vars + + # THEN DAG graph should be created successfully + assert Path(json.load(open(tumor_only_pon_config))["analysis"]["dag"]).exists() + + +def test_dag_graph_success_tumor_normal(tumor_normal_config: str): + """Test DAG graph building success.""" + # WHEN creating config using standard CLI input and setting Sentieon env vars + + # THEN DAG graph should be created successfully + assert 
Path(json.load(open(tumor_normal_config))["analysis"]["dag"]).exists() + + +def test_dag_graph_success_tumor_only_umi(tumor_only_umi_config: str): + """Test DAG graph building success.""" + # WHEN creating config using standard CLI input and setting Sentieon env vars + + # THEN DAG graph should be created successfully + assert Path(json.load(open(tumor_only_umi_config))["analysis"]["dag"]).exists() + + +def test_dag_graph_success_tumor_normal_umi(tumor_normal_umi_config: str): + """Test DAG graph building success.""" + # WHEN creating config using standard CLI input and setting Sentieon env vars + + # THEN DAG graph should be created successfully + assert Path(json.load(open(tumor_normal_umi_config))["analysis"]["dag"]).exists() + + +def test_dag_graph_success_tumor_only_wgs( + tumor_only_wgs_config: str, +): + """Test DAG graph building success.""" + # WHEN creating config using standard CLI input and setting Sentieon env vars + + # THEN DAG graph should be created successfully + assert Path(json.load(open(tumor_only_wgs_config))["analysis"]["dag"]).exists() + + +def test_dag_graph_success_tumor_normal_wgs(tumor_normal_wgs_config: str): + """Test DAG graph building success.""" + # WHEN creating config using standard CLI input and setting Sentieon env vars + + # THEN DAG graph should be created successfully + assert Path(json.load(open(tumor_normal_wgs_config))["analysis"]["dag"]).exists() def test_config_graph_failed( - invoke_cli, sample_fastq, analysis_dir, balsamic_cache, panel_bed_file + invoke_cli, + case_id_tumor_only: str, + tumor_sample_name: str, + analysis_dir: str, + fastq_dir_tumor_only: str, + balsamic_cache: str, + panel_bed_file: str, ): + """Test DAG graph building failure.""" + # GIVEN an analysis config - case_id = "sample_tumor_only" - tumor = sample_fastq["tumor"] + # GIVEN an empty graphviz instance with mock.patch.object(graphviz, "Source") as mocked: mocked.return_value = None case_result = invoke_cli( [ "config", "case", - "-p", - panel_bed_file, - "-t", - tumor, "--case-id", - case_id, + case_id_tumor_only, "--analysis-dir", analysis_dir, + "--fastq-path", + fastq_dir_tumor_only, + "-p", + panel_bed_file, "--balsamic-cache", balsamic_cache, + "--tumor-sample-name", + tumor_sample_name, ], ) + # THEN the graph should not have been built assert case_result.exit_code == 1 -def test_pon_cnn_file( - invoke_cli, sample_fastq, analysis_dir, balsamic_cache, panel_bed_file +def test_missing_required_gens_arguments( + invoke_cli, + tumor_sample_name: str, + analysis_dir: str, + balsamic_cache: str, + fastq_dir_tumor_only: str, + case_id_tumor_only: str, + gens_cov_pon_file: str, + gens_min_5_af_gnomad_file: str, ): + """Test balsamic config case with 2 out of 3 required GENS arguments.""" - # GIVEN CLI arguments including optional pon reference '.cnn' file - case_id = "test_sample_cnv" - tumor = sample_fastq["tumor"] - pon_file = "tests/test_data/references/panel/test_panel_ponn.cnn" + # GIVEN CLI arguments including optional GENS input-files + # WHEN invoking the config case command result = invoke_cli( [ "config", "case", - "-p", - panel_bed_file, - "-t", - tumor, "--case-id", - case_id, + case_id_tumor_only, "--analysis-dir", analysis_dir, - "--pon-cnn", - pon_file, + "--fastq-path", + fastq_dir_tumor_only, "--balsamic-cache", balsamic_cache, + "--tumor-sample-name", + tumor_sample_name, + "--gens-coverage-pon", + gens_cov_pon_file, + "--gnomad-min-af5", + gens_min_5_af_gnomad_file, ], ) - # THEN program exits and checks for filepath + # THEN the CLI should exit code 2 
and display an informative error message
+    assert result.exit_code == 2
+    assert (
+        "All three arguments (genome_interval gens_coverage_pon, gnomad_min_af5) are required for GENS."
+        in result.output
+    )
+
+
+def test_config_with_gens_arguments(
+    invoke_cli,
+    tumor_sample_name: str,
+    analysis_dir: str,
+    balsamic_cache: str,
+    fastq_dir_tumor_only: str,
+    case_id_tumor_only: str,
+    gens_cov_pon_file: str,
+    gens_min_5_af_gnomad_file: str,
+    gens_hg19_interval_list: str,
+):
+    """Test balsamic config case with GENS arguments."""
+
+    # GIVEN CLI arguments including optional GENS input-files
+
+    # WHEN invoking the config case command
+    result = invoke_cli(
+        [
+            "config",
+            "case",
+            "--case-id",
+            case_id_tumor_only,
+            "--analysis-dir",
+            analysis_dir,
+            "--fastq-path",
+            fastq_dir_tumor_only,
+            "--balsamic-cache",
+            balsamic_cache,
+            "--tumor-sample-name",
+            tumor_sample_name,
+            "--gens-coverage-pon",
+            gens_cov_pon_file,
+            "--gnomad-min-af5",
+            gens_min_5_af_gnomad_file,
+            "--genome-interval",
+            gens_hg19_interval_list,
+        ],
+    )
+    # THEN a config should be created and exist
+    assert result.exit_code == 0
+    assert Path(
+        analysis_dir, case_id_tumor_only, f"{case_id_tumor_only}.{FileType.JSON}"
+    ).exists()
+
+
+def test_config_with_gens_arguments_for_tga(
+    invoke_cli,
+    tumor_sample_name: str,
+    analysis_dir: str,
+    balsamic_cache: str,
+    fastq_dir_tumor_only: str,
+    case_id_tumor_only: str,
+    gens_cov_pon_file: str,
+    gens_min_5_af_gnomad_file: str,
+    gens_hg19_interval_list: str,
+    panel_bed_file: str,
+):
+    """Test balsamic config case with GENS arguments for TGA."""
+
+    # GIVEN CLI arguments including optional GENS input-files
+
+    # WHEN invoking the config case command
+    result = invoke_cli(
+        [
+            "config",
+            "case",
+            "--case-id",
+            case_id_tumor_only,
+            "--analysis-dir",
+            analysis_dir,
+            "--fastq-path",
+            fastq_dir_tumor_only,
+            "--balsamic-cache",
+            balsamic_cache,
+            "--tumor-sample-name",
+            tumor_sample_name,
+            "--gens-coverage-pon",
+            gens_cov_pon_file,
+            "--gnomad-min-af5",
+            gens_min_5_af_gnomad_file,
+            "--genome-interval",
+            gens_hg19_interval_list,
+            "-p",
+            panel_bed_file,
+        ],
+    )
+    # THEN config creation should fail with an informative error message
+    assert result.exit_code == 2
+    assert (
+        "GENS is currently not compatible with TGA analysis, only WGS."
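+        # (GENS options are only supported for WGS runs; supplying them together with
+        # a panel BED file for TGA makes config creation fail with the message above)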
in result.output + ) diff --git a/tests/commands/init/test_init.py b/tests/commands/init/test_init.py index 74fcce209..2403a9f55 100644 --- a/tests/commands/init/test_init.py +++ b/tests/commands/init/test_init.py @@ -1,292 +1,199 @@ -import subprocess -import logging -import graphviz - +"""Test Balsamic init command.""" +from functools import partial from pathlib import Path from unittest import mock -from BALSAMIC import __version__ as balsamic_version +from click.testing import Result +from graphviz import Source -def test_init_reference_write_json( - invoke_cli, - tmp_path, +from BALSAMIC import __version__ as balsamic_version +from BALSAMIC.constants.analysis import RunMode +from BALSAMIC.constants.cache import GenomeVersion +from BALSAMIC.constants.cluster import ClusterAccount +from BALSAMIC.constants.constants import EXIT_SUCCESS, EXIT_FAIL +from BALSAMIC.utils.exc import BalsamicError + + +def test_init_hg( + invoke_cli: partial, + tmp_path: Path, + cosmic_key: str, + config_json: str, + reference_graph: str, ): - # Given test_reference.json - test_genome_version = "hg19" - test_container_version = "develop" - test_new_dir = tmp_path / "test_reference_dir" - test_new_dir.mkdir() - - # WHEN creating config.json in reference dir - test_output_reference_config = ( - test_new_dir / balsamic_version / test_genome_version / "config.json" - ) - test_output_reference_pdf = ( - test_new_dir - / balsamic_version - / test_genome_version - / "generate_ref_worflow_graph.pdf" - ) - - result = invoke_cli( - [ - "init", - "-o", - str(test_new_dir), - "--cosmic-key", - "secret_key", - "-v", - test_container_version, - ] - ) - - # THEN output config and pdf file generate and command exit code 0 - assert result.exit_code == 0 - assert Path(test_output_reference_pdf).exists() - assert Path(test_output_reference_config).exists() + """Test Balsamic init command.""" + # GIVEN a temporary output directory and a COSMIC key -def test_init_reference_no_write_perm(tmp_path, invoke_cli, no_write_perm_path): - # Given a path with no write permission - test_genome_version = "hg19" - test_container_version = "develop" - test_new_dir = str(no_write_perm_path) - - # WHEN invoking config sample - result = invoke_cli( + # WHEN invoking the init command + result: Result = invoke_cli( [ "init", - "-o", - str(test_new_dir), + "--out-dir", + tmp_path.as_posix(), + "--genome-version", + GenomeVersion.HG19, "--cosmic-key", - "secret_key", - "-v", - test_container_version, - "-g", - test_genome_version, + cosmic_key, ] ) - # THEN it should create test_reference.json and exist with no error - assert result.exit_code == 1 - + # THEN the human reference generation workflow should have successfully started + assert Path(tmp_path, balsamic_version, GenomeVersion.HG19, config_json).exists() + assert Path( + tmp_path, balsamic_version, GenomeVersion.HG19, reference_graph + ).exists() + assert result.exit_code == EXIT_SUCCESS -def test_init_reference_no_cosmic_abort(tmp_path, invoke_cli): - # Given a path with no write permission - test_genome_version = "hg19" - test_container_version = "develop" - test_new_dir = tmp_path / "test_reference_dir" - test_new_dir.mkdir() - - # WHEN invoking config sample - result = invoke_cli( - [ - "init", - "-o", - str(test_new_dir), - "-v", - test_container_version, - "-g", - test_genome_version, - ] - ) - - # THEN it should create test_reference.json and exist with no error - assert result.exit_code == 1 +def test_init_canfam( + invoke_cli: partial, + tmp_path: Path, + cosmic_key: str, + 
config_json: str,
+    reference_graph: str,
+):
+    """Test Balsamic canine workflow init command."""

-def test_init_reference_no_cosmic_run(tmp_path, invoke_cli):
-    # Given a path with no write permission
-    test_genome_version = "canfam3"
-    test_container_version = "develop"
-    test_new_dir = tmp_path / "test_reference_dir"
-    test_new_dir.mkdir()
+    # GIVEN a temporary output directory

-    # WHEN invoking config sample
-    result = invoke_cli(
+    # WHEN invoking the init command
+    result: Result = invoke_cli(
         [
             "init",
-            "-o",
-            str(test_new_dir),
-            "-v",
-            test_container_version,
-            "-g",
-            test_genome_version,
+            "--out-dir",
+            tmp_path.as_posix(),
+            "--genome-version",
+            GenomeVersion.CanFam3,
         ]
     )

-    # THEN it should create test_reference.json and exist with no error
-    assert result.exit_code == 0
+    # THEN the canine reference generation workflow should have successfully started
+    assert Path(tmp_path, balsamic_version, GenomeVersion.CanFam3, config_json).exists()
+    assert Path(
+        tmp_path, balsamic_version, GenomeVersion.CanFam3, reference_graph
+    ).exists()
+    assert result.exit_code == EXIT_SUCCESS

+
+def test_init_hg_no_cosmic_key(invoke_cli: partial, tmp_path: Path, cosmic_key: str):
+    """Test Balsamic init command when a COSMIC key is not provided."""

-def test_init_reference_click_abort(invoke_cli, tmp_path):
-    # Given test_reference output directory
-    test_container_version = "develop"
-    test_new_dir = tmp_path / "test_reference_dir"
-    test_new_dir.mkdir()
+    # GIVEN a temporary output directory and no COSMIC key

-    # WHEN running the command
-    result = invoke_cli(
+    # WHEN invoking the init command
+    result: Result = invoke_cli(
         [
             "init",
-            "-o",
-            str(test_new_dir),
-            "--cosmic-key",
-            "secret_key",
-            "-v",
-            test_container_version,
-            "--run-analysis",
+            "--out-dir",
+            tmp_path.as_posix(),
+            "--genome-version",
+            GenomeVersion.HG19,
         ]
     )

-    # THEN it should exit code for not providing the run-mode
-    assert result.exit_code == 1
+    # THEN an exception should have been raised
+    assert (
+        f"No COSMIC authentication key specified.
It is required when using {GenomeVersion.HG19} reference" + in result.output + ) + assert result.exit_code == EXIT_FAIL -def test_init_reference_mail_type(invoke_cli, tmp_path): - # Given test_reference output directory - test_container_version = "develop" - test_new_dir = tmp_path / "test_reference_dir" - test_new_dir.mkdir() +def test_init_hg_run_analysis( + invoke_cli: partial, + tmp_path: Path, + cosmic_key: str, + config_json: str, + reference_graph: str, +): + """Test Balsamic init command when actually running the analysis.""" - dummy_mail_type = "END" - dummy_mail_user = "dummy@gmail.com" + # GIVEN a temporary output directory, a cluster account, and a COSMIC key - # WHEN running the command - result = invoke_cli( + # WHEN invoking the init command + result: Result = invoke_cli( [ "init", - "-o", - str(test_new_dir), + "--out-dir", + tmp_path.as_posix(), + "--genome-version", + GenomeVersion.HG19, "--cosmic-key", - "secret_key", - "-v", - test_container_version, - "--run-analysis", + cosmic_key, "--run-mode", - "local", - "--mail-type", - dummy_mail_type, - "--mail-user", - dummy_mail_user, + RunMode.CLUSTER, + "--account", + ClusterAccount.DEVELOPMENT, + "--run-analysis", ] ) - # THEN it should exit code for not providing the run-mode - assert result.exit_code == 0 - + # THEN the human reference generation workflow should have successfully started + assert Path(tmp_path, balsamic_version, GenomeVersion.HG19, config_json).exists() + assert Path( + tmp_path, balsamic_version, GenomeVersion.HG19, reference_graph + ).exists() + assert result.exit_code == EXIT_SUCCESS -def test_init_reference_graph_exception(invoke_cli, tmp_path): - # Given test_reference.json - test_new_dir = tmp_path / "test_reference_nonfunctional_graph" - test_new_dir.mkdir() - - with mock.patch.object(graphviz, "Source") as mocked: - mocked.return_value = None - result = invoke_cli( - [ - "init", - "-o", - str(test_new_dir), - "--cosmic-key", - "secret_key", - ] - ) - - assert result.exit_code == 1 - - -def test_init_container_force_dry(invoke_cli, tmp_path): - # Given a dummy path - test_new_dir = tmp_path / "test_container_dry_force" - test_new_dir.mkdir() - test_container_version = "develop" - - # WHEN force pull dry-run container - result = invoke_cli( - [ - "init", - "--outdir", - str(test_new_dir), - "--cosmic-key", - "secret_key", - "--force", - "-v", - test_container_version, - ] - ) - - # THEN command exit code 0 - assert result.exit_code == 0 +def test_init_hg_run_analysis_no_account( + invoke_cli: partial, tmp_path: Path, cosmic_key: str +): + """Test Balsamic init command when actually running the analysis without specifying a cluster account.""" -def test_init_container_specific_tag(invoke_cli, tmp_path): - # Given a dummy path - test_new_dir = tmp_path / "test_container_dir" - test_new_dir.mkdir() - dummy_tag = "develop" + # GIVEN a temporary output directory and a COSMIC key - # WHEN pulling a specific tag other than standard version - result = invoke_cli( + # WHEN invoking the init command + result: Result = invoke_cli( [ "init", - "--outdir", - str(test_new_dir), + "--out-dir", + tmp_path.as_posix(), + "--genome-version", + GenomeVersion.HG19, "--cosmic-key", - "secret_key", - "--container-version", - dummy_tag, + cosmic_key, + "--run-mode", + RunMode.CLUSTER, + "--run-analysis", ] ) - # THEN command exit code 0 - assert result.exit_code == 0 + # THEN an exception should have been raised + assert "A cluster account is required for cluster run mode" in result.output + assert result.exit_code == 
EXIT_FAIL -def test_init_container_without_dry_run(invoke_cli, tmp_path): - # Given a dummy path - test_new_dir = tmp_path / "test_container_dir" - test_new_dir.mkdir() +def test_init_hg_graph_exception( + invoke_cli: partial, + tmp_path: Path, + cosmic_key: str, + config_json: str, + reference_graph: str, +): + """Test Balsamic init command with a graphviz exception.""" - with mock.patch.object(subprocess, "run") as mocked: - mocked.return_value = 0 + # GIVEN a temporary output directory and a COSMIC key - # WHEN pulling a container in a non dry-run mode - result = invoke_cli( + # WHEN invoking the init command + with mock.patch.object(Source, "render", side_effect=BalsamicError("Test error")): + result: Result = invoke_cli( [ "init", - "--outdir", - str(test_new_dir), + "--out-dir", + tmp_path.as_posix(), + "--genome-version", + GenomeVersion.HG19, "--cosmic-key", - "secret_key", - "--run-analysis", - "--account", - "development", + cosmic_key, ] ) - # THEN output config and pdf file generate and command exit code 0 - assert result.exit_code == 0 - - -def test_init_container_wrong_tag(invoke_cli, tmp_path): - # Given a dummy path - test_new_dir = tmp_path / "test_container_dir" - test_new_dir.mkdir() - dummy_tag = "some_tag_that_does_not_exist_ngrtf123jsds3wqe2" - - # WHEN pulling a wrong container tag - result = invoke_cli( - [ - "init", - "--outdir", - str(test_new_dir), - "--cosmic-key", - "secret_key", - "--container-version", - dummy_tag, - ] - ) - - # THEN capture error log and error code - assert result.exit_code > 0 + # THEN the human reference generation workflow should fail + assert "Workflow graph generation failed" in result.output + assert Path(tmp_path, balsamic_version, GenomeVersion.HG19, config_json).exists() + assert not Path( + tmp_path, balsamic_version, GenomeVersion.HG19, reference_graph + ).exists() + assert result.exit_code == EXIT_FAIL diff --git a/tests/commands/plugins/conftest.py b/tests/commands/plugins/conftest.py deleted file mode 100644 index cedf926ed..000000000 --- a/tests/commands/plugins/conftest.py +++ /dev/null @@ -1,18 +0,0 @@ -from pathlib import Path - -import pytest - - -@pytest.fixture -def input_file(): - return "tests/test_data/vcf_tables/test_input.txt" - - -@pytest.fixture -def output_file(tmp_path): - return Path(tmp_path, "test_createVCF_output.vcf.gz").as_posix() - - -@pytest.fixture -def reference_file(): - return "tests/test_data/vcf_tables/test_reference.vcf.gz" diff --git a/tests/commands/plugins/test_scout.py b/tests/commands/plugins/test_scout.py deleted file mode 100644 index 066315617..000000000 --- a/tests/commands/plugins/test_scout.py +++ /dev/null @@ -1,36 +0,0 @@ -def test_scout_tumor_normal(invoke_cli, tumor_normal_config): - # GIVEN a tumor-normal config file - # WHEN running analysis - result = invoke_cli( - [ - "plugins", - "scout", - "--sample-config", - tumor_normal_config, - "--customer-id", - "cust000", - ] - ) - - # THEN it should run without any error - print(result) - assert result.exit_code == 0 - - -def test_scout_tumor_only(invoke_cli, tumor_only_config): - # GIVEN a tumor-only config file - # WHEN running analysis - result = invoke_cli( - [ - "plugins", - "scout", - "--sample-config", - tumor_only_config, - "--customer-id", - "cust000", - ] - ) - - # THEN it should run without any error - print(result) - assert result.exit_code == 0 diff --git a/tests/commands/plugins/test_vcfutils_createvcf.py b/tests/commands/plugins/test_vcfutils_createvcf.py deleted file mode 100644 index a9d764981..000000000 --- 
a/tests/commands/plugins/test_vcfutils_createvcf.py +++ /dev/null @@ -1,110 +0,0 @@ -from datetime import date - -from BALSAMIC.commands.plugins.vcfutils import vcfheader -from BALSAMIC.commands.plugins.vcfutils import collect_vcf_info -from BALSAMIC.commands.plugins.vcfutils import collect_ref_info -from BALSAMIC.commands.plugins.vcfutils import readinput -from BALSAMIC.commands.plugins.vcfutils import createvcf -from click.testing import CliRunner -from pathlib import Path - -import re - - -def test_readinput_return_dict(input_file): - """test input file for properly returning the required fields""" - - # GIVEN input file with required fields - valid_input_info = { - "COSV62571334:AKT1:p.E17K": "0.118;SNP;p.Glu17Lys", - "COSV51765161:EGFR:p.L858R": "0.106;SNP;p.Leu858Arg", - "COSV51765492:EGFR:p.T790M": "0.116;SNP;p.Thr790Met", - "COSV56056643:BRAF:p.V600E": "0.104;SNP;p.Val600Glu", - } - - # WHEN calling readinput - build_read_input = readinput(input_file) - - # THEN it should return a input info with dict value - assert valid_input_info == build_read_input - - -def test_vcfheader_return_string(): - """test vcfheader for properly returning a VCF header""" - - # GIVEN current datetime - current_time = date.today().strftime("%Y%m%d") - valid_date_in_vcf = "##fileDate=" + current_time - - # WHEN calling vcfheader - built_vcf_header = vcfheader() - - # THEN it should return a VCF header with a valid current date - assert valid_date_in_vcf in built_vcf_header - - -def test_ensids_return_string(): - """test ensembl ids in a reference vcf file""" - - # GIVEN the ensembl ID - info = "GENE=AKT1_ENST00000555528" - valid_ens_id = re.sub(r"(.*)(_ENST\d+)", r"\1", info) - - # WHEN substitute the ensembl_ids - ens_id = collect_vcf_info(valid_ens_id) - - # THEN it should return a valid matching - assert valid_ens_id in ens_id - - -def test_collect_ref_info_return_list(): - """test fields in a reference file""" - - # GIVEN the variant info fields in input file - valid_variant = "0.00119999998;SNP;p.Glu17Lys" - allele_freq, variant_type, aa_hgvs = valid_variant.split(";") - valid_info_variant = "VARIANT_TYPE=SNP;AA_HGVS=p.Glu17Lys;AF=1e-05" - - # WHEN calling the reference file - info = collect_ref_info(valid_variant) - - # THEN it should return a list of info fields - assert valid_info_variant in info - - -def test_cli_inputs_return_filepaths(input_file, reference_file, output_file): - """test commandline cli inputs for file exists""" - - # GIVEN the required files - - # WHEN calling the createvcf cli commands() - runner = CliRunner() - result = runner.invoke( - createvcf, ["-i", input_file, "-r", reference_file, "-o", output_file] - ) - - # THEN checks for filepaths and command exit code 0 - assert result.exit_code == 0 - assert Path(input_file).exists() - assert Path(reference_file).exists() - - -def test_createvcf_return_str(): - """test createvcf function return values""" - - # GIVEN the required variant info - info = "7\t140453136\tCOSV56056643\tA\tT\t.\t.\tGENE=BRAF;STRAND=-;LEGACY_ID=COSM476;CDS=c.1799T>A;AA=p.V600E;VARIANT_TYPE=SNP;AA_HGVS=p.Val600Glu;AF=0.00104" - variant = "7\t140453136\tCOSV56056643\tA\tT\t.\t.\tGENE=BRAF;STRAND=-;LEGACY_ID=COSM476;CDS=c.1799T>A;AA=p.V600E;CNT=28296\n" - vcf_id_value = "0.104;SNP;p.Val600Glu" - - # WHEN call related functions defined in createvcf() - variant_info = collect_vcf_info(str(variant)) - reference_info = collect_ref_info(vcf_id_value) - build_info = "\t".join(variant_info) + ";" + reference_info - filtered_variants = [ - 
"14\t105246551\tCOSV62571334\tC\tT\t.\t.\tGENE=AKT1;STRAND=-;LEGACY_ID=COSM33765;CDS=c.49G>A;AA=p.E17K;VARIANT_TYPE=SNP;AA_HGVS=p.Glu17Lys;AF=0.00118" - ] - - # THEN it should return valid matches - assert info in build_info - assert info not in filtered_variants diff --git a/tests/commands/run/test_run_analysis.py b/tests/commands/run/test_run_analysis.py index b3d1d8e2e..e3a594102 100644 --- a/tests/commands/run/test_run_analysis.py +++ b/tests/commands/run/test_run_analysis.py @@ -44,9 +44,7 @@ def test_run_analysis_tumor_normal_dry_run(invoke_cli, tumor_normal_config): assert result.exit_code == 0 -def test_run_analysis_tumor_only_dry_run( - invoke_cli, tumor_only_config, tumor_normal_config -): +def test_run_analysis_tumor_only_dry_run(invoke_cli, tumor_only_config): # GIVEN a tumor-only config file # WHEN running analysis result = invoke_cli( @@ -90,8 +88,8 @@ def test_run_analysis_create_dir(invoke_cli, tumor_only_config): "development", ] ) - # THEN it should abort with error - assert Path(re.sub("/$", ".1/", log_dir)).exists() + # THEN it should create a log_dir + assert Path(re.sub("/$", ".1/", log_dir)).exists() def test_run_analysis_ponpath(invoke_cli, tumor_only_pon_config): diff --git a/tests/commands/run/test_scheduler.py b/tests/commands/run/test_scheduler.py index 3e0e68709..d82e5d794 100644 --- a/tests/commands/run/test_scheduler.py +++ b/tests/commands/run/test_scheduler.py @@ -189,33 +189,32 @@ def test_qsub_scheduler(): ) -def test_read_sample_config_err(config_files): - with pytest.raises(Exception): - # GIVEN a bed file instead of json file - bed_file = config_files["panel_bed_file"] +def test_read_sample_config_err(panel_bed_file: str): + # GIVEN a bed file instead of json file - # WHEN calling read_sample_config + # WHEN calling read_sample_config + with pytest.raises(Exception): # THEN It should raise the exception error - assert read_sample_config(bed_file) + read_sample_config(panel_bed_file) def test_write_sacct_file_err(): - with pytest.raises(FileNotFoundError): - # GIVEN a non-existing file path and jobid - dummy_file_path = "dummy/dummy_fname" - dummy_jobid = "12345" + # GIVEN a non-existing file path and jobid + dummy_file_path = "dummy/dummy_fname" + dummy_jobid = "12345" - # WHEN calling write_sacct_file + # WHEN calling write_sacct_file + with pytest.raises(FileNotFoundError): # THEN It should raise the exception - assert write_sacct_file(dummy_file_path, dummy_jobid) + write_sacct_file(dummy_file_path, dummy_jobid) def test_submit_job_err(): - with pytest.raises(subprocess.CalledProcessError): - # GIVEN a wrong command - sbatch_cmd = "SBATCH jobscript.sh" - profile = "slurm" + # GIVEN a wrong command + sbatch_cmd = "SBATCH jobscript.sh" + profile = "slurm" - # WHEN calling submit_job function + # WHEN calling submit_job function + with pytest.raises(subprocess.CalledProcessError): # THEN it should return the exit code 1 and raise the subprocess error - assert submit_job(sbatch_cmd, profile) + submit_job(sbatch_cmd, profile) diff --git a/tests/commands/test_cli.py b/tests/commands/test_cli.py index ad55b65b4..7925dd47e 100644 --- a/tests/commands/test_cli.py +++ b/tests/commands/test_cli.py @@ -24,8 +24,8 @@ def test_config_case(invoke_cli): # WHEN asking to show params for config-sample result = invoke_cli(["config", "case", "--help"]) - # THEN It should show all params reuired for config-sample - assert "sample-id" in result.output + # THEN It should show all params required for config-sample + assert "--case-id" in result.output assert 
result.exit_code == 0 @@ -54,23 +54,6 @@ def test_report_status(invoke_cli): assert result.exit_code == 0 -def test_plugins(invoke_cli): - # GIVEN want to see config-sample params with help option - # WHEN asking to show params for config-sample - result = invoke_cli(["plugins", "--help"]) - - # THEN It should show all params reuired for config-sample - assert result.exit_code == 0 - - -def test_plugins_scout(invoke_cli): - # WHEN invoking command with missing options - result = invoke_cli(["plugins", "scout", "--help"]) - - # THEN It should throw missing option error - assert result.exit_code == 0 - - def test_run(invoke_cli): # WHEN asking to options for run command result = invoke_cli(["run", "--help"]) @@ -85,7 +68,7 @@ def test_run_analysis(invoke_cli): result = invoke_cli(["run", "analysis", "--help"]) # THEN it should show all params without error - assert "--snake-file" in result.output + assert "--snakefile" in result.output assert "--sample-config" in result.output assert "--run-mode" in result.output assert "--cluster-config" in result.output diff --git a/tests/conftest.py b/tests/conftest.py index 6a6cf445e..8e3f9f19b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,137 +1,579 @@ -import pytest -import json +import copy import os - -from unittest import mock -from distutils.dir_util import copy_tree -from pathlib import Path +import shutil +from datetime import datetime from functools import partial +from pathlib import Path +from typing import Any, Dict, List +from unittest import mock -from BALSAMIC.constants.workflow_params import VCF_DICT +import pytest +from _pytest.tmpdir import TempPathFactory from click.testing import CliRunner +from pydantic_core import Url -from BALSAMIC.utils.io import read_json, read_yaml -from .helpers import ConfigHelper, Map -from BALSAMIC.commands.base import cli from BALSAMIC import __version__ as balsamic_version +from BALSAMIC.assets.scripts.preprocess_gens import cli as gens_preprocessing_cli +from BALSAMIC.commands.base import cli +from BALSAMIC.constants.analysis import ( + BIOINFO_TOOL_ENV, + AnalysisWorkflow, + PONWorkflow, + RunMode, +) +from BALSAMIC.constants.cache import REFERENCE_FILES, DockerContainers, GenomeVersion +from BALSAMIC.constants.cluster import ( + QOS, + ClusterAccount, + ClusterConfigType, + ClusterProfile, +) +from BALSAMIC.constants.constants import FileType +from BALSAMIC.constants.paths import CONSTANTS_DIR, FASTQ_TEST_INFO, TEST_DATA_DIR +from BALSAMIC.constants.workflow_params import VCF_DICT +from BALSAMIC.models.cache import ( + AnalysisReferencesHg, + CacheAnalysis, + CacheConfig, + References, + ReferencesHg, +) +from BALSAMIC.models.config import ConfigModel +from BALSAMIC.models.snakemake import SingularityBindPath, SnakemakeExecutable +from BALSAMIC.utils.io import read_json, read_yaml, write_json +from .helpers import ConfigHelper, Map MOCKED_OS_ENVIRON = "os.environ" +def fastq_patterns() -> list: + """ + Returns a list of dicts containing different formatted fastq-file names to be used in parameterized tests. + """ + fastq_test_info_path = Path(FASTQ_TEST_INFO).as_posix() + fastq_test_info_dict = read_json(fastq_test_info_path) + return fastq_test_info_dict["fastq_pattern_types"] + + +def fastq_pattern_ids() -> list: + """ + Returns a list of IDs for the parameterized testing of different fastq file name formats. 
+ """ + fastq_test_info_path = Path(FASTQ_TEST_INFO).as_posix() + fastq_test_info_dict = read_json(fastq_test_info_path) + fastq_pattern_types = fastq_test_info_dict["fastq_pattern_types"] + fastq_pattern_ids = ["FastqPattern{}".format(p["id"]) for p in fastq_pattern_types] + return fastq_pattern_ids + + +@pytest.fixture(scope="session") +def test_data_dir() -> Path: + """ + Creates path for test data directory. + """ + return TEST_DATA_DIR + + +@pytest.fixture(scope="session") +def load_test_fastq_data(test_data_dir) -> Dict: + """Returns dict from loaded json containing strings of fastq-names.""" + fastq_test_info_path = Path(FASTQ_TEST_INFO).as_posix() + return read_json(fastq_test_info_path) + + +@pytest.fixture(scope="session") +def pon_fastq_list(load_test_fastq_data) -> list: + """Returns list of fastq names to be used in PON creation testing.""" + return load_test_fastq_data["pon_fastq_list"] + + +@pytest.fixture(scope="session") +def standard_samples_list( + load_test_fastq_data, +) -> Dict[str, List]: + """Returns a list of standard tumor normal sample dicts.""" + return load_test_fastq_data["samples_standard_fastq_names"] + + +@pytest.fixture(scope="session") +def standard_samples_list_pon( + load_test_fastq_data, +) -> Dict[str, List]: + """Returns a list of standard tumor normal sample dicts for PON.""" + return load_test_fastq_data["pon_samples_standard_fastq_names"] + + +@pytest.fixture(scope="session") +def tumor_fastq_names( + load_test_fastq_data, +) -> Dict[str, List]: + """Returns a list of standard tumor fastq-names.""" + return load_test_fastq_data["standard_fastq_names"]["tumor"] + + +@pytest.fixture(scope="session") +def normal_fastq_names( + load_test_fastq_data, +) -> Dict[str, List]: + """Returns a list of standard normal fastq-names.""" + return load_test_fastq_data["standard_fastq_names"]["normal"] + + +@pytest.fixture(scope="session") +def fastq_names_duplicate_assigned_fastq_patterns( + load_test_fastq_data, +) -> Dict[str, List]: + """Returns dict with list of fastq file names for testing of duplicate assigned fastq patterns.""" + return load_test_fastq_data["fastq_fails"]["duplicate_fastq_patterns"] + + +@pytest.fixture(scope="session") +def sample_list_duplicate_assigned_fastq_patterns_model( + load_test_fastq_data, +) -> Dict[str, List]: + """Returns List of sample-dicts with fastq-dicts with duplicate assigned fastq patterns.""" + return load_test_fastq_data["fastq_fails"]["duplicate_fastq_patterns_model"] + + +@pytest.fixture(scope="session") +def tumor_normal_fastq_info_correct(load_test_fastq_data) -> Dict[str, Dict]: + """Mock tumor normal fastq info in sample_dict.""" + return load_test_fastq_data["test_fastq_info"] + + +@pytest.fixture(scope="session") +def valid_dnascope_variant() -> str: + """Mock valid DNAscope variant.""" + return ( + "1\t100\trs1\tT\tC\t389.77\t.\tINFO\tGT:AD:DP:GQ:PL\t0/1:9,14:23:99:418,0,257" + ) + + +@pytest.fixture(scope="session") +def invalid_dnascope_variant_no_ad() -> str: + """Mock invalid DNAscope variant without any read support.""" + return "1\t200\t.\tCAAA\tCAAAA,C\t0.00\tLowQual\tINFO\tGT:AD:DP:GQ:PL\t0/0:0,0,0:0:0:0,0,0,3,3,19" + + +@pytest.fixture(scope="session") +def invalid_dnascope_variant_illegal_chrom() -> str: + """Mock invalid DNAscope variant with non-standard chromosome.""" + return ( + "25\t100\trs1\tT\tC\t389.77\t.\tINFO\tGT:AD:DP:GQ:PL\t0/1:9,14:23:99:418,0,257" + ) + + +@pytest.fixture(scope="session", name="session_tmp_path") +def fixture_session_tmp_path(tmp_path_factory: TempPathFactory) -> 
Path:
+    """Return a session-scoped temporary directory path."""
+    return tmp_path_factory.mktemp("session_tests")
+
+
+@pytest.fixture(scope="session")
+def tumor_sample_name() -> str:
+    """Create mock name for tumor sample."""
+    return "ACC1"
+
+
+@pytest.fixture(scope="session")
+def normal_sample_name() -> str:
+    """Create mock name for normal sample."""
+    return "ACC2"
+
+
+@pytest.fixture(scope="session")
+def case_id_tumor_only() -> str:
+    """Create mock case-id for TGA tumor-only."""
+    return "sample_tumor_only"
+
+
+@pytest.fixture(scope="session")
+def case_id_tumor_only_dummy_vep() -> str:
+    """Mock TGA tumor-only case ID for dummy vep file testing."""
+    return "sample_tumor_only_dummy_vep"
+
+
+@pytest.fixture(scope="session")
+def case_id_tumor_only_qc() -> str:
+    """Mock TGA tumor-only case ID for QC workflow."""
+    return "sample_tumor_only_qc"
+
+
+@pytest.fixture(scope="session")
+def case_id_tumor_only_pon_cnn() -> str:
+    """Mock TGA tumor-only case ID for testing with PON CNN file."""
+    return "sample_tumor_only_pon_cnn"
+
+
+@pytest.fixture(scope="session")
+def case_id_pon() -> str:
+    """
+    Creates mock case-id for PON creation workflow.
+    """
+    return "sample_pon_creation"
+
+
+@pytest.fixture(scope="session")
+def case_id_gens_pon() -> str:
+    """
+    Creates mock case-id for GENS PON creation workflow.
+    """
+    return "genscreation"
+
+
+@pytest.fixture(scope="session")
+def case_id_tumor_only_pon() -> str:
+    """Create mock case-id for TGA PON tumor-only."""
+    return "sample_tumor_only_pon"
+
+
+@pytest.fixture(scope="session")
+def case_id_tumor_only_umi() -> str:
+    """Creates mock case-id for TGA tumor-only UMI workflow."""
+    return "sample_tumor_only_umi"
+
+
+@pytest.fixture(scope="session")
+def case_id_tumor_normal_umi() -> str:
+    """Creates mock case-id for TGA tumor-normal UMI workflow."""
+    return "sample_tumor_normal_umi"
+
+
+@pytest.fixture(scope="session")
+def case_id_tumor_normal_fastqdir() -> str:
+    """Mock case ID for dummy fastqdir."""
+    return "sample_tumor_normal_fastqdir"
+
+
+@pytest.fixture(scope="session")
+def case_id_tumor_normal() -> str:
+    """Create mock case-id for TGA tumor-normal."""
+    return "sample_tumor_normal"
+
+
+@pytest.fixture(scope="session")
+def case_id_tumor_normal_qc() -> str:
+    """Mock TGA tumor-normal case ID for QC TGA test."""
+    return "sample_tumor_normal_qc"
+
+
+@pytest.fixture(scope="session")
+def case_id_tumor_normal_qc_wgs() -> str:
+    """Mock TGA tumor-normal case ID for QC WGS test."""
+    return "sample_tumor_normal_qc_wgs"
+
+
+@pytest.fixture(scope="session")
+def case_id_tumor_only_wgs() -> str:
+    """Create mock case-id for WGS tumor-only."""
+    return "sample_tumor_only_wgs"
+
+
+@pytest.fixture(scope="session")
+def case_id_tumor_normal_wgs() -> str:
+    """Create mock case-id for WGS tumor-normal."""
+    return "sample_tumor_normal_wgs"
+
+
 @pytest.fixture
 def cli_runner():
-    """click - cli testing"""
+    """Run click for command line interface testing."""
     runner = CliRunner()
     return runner


 @pytest.fixture
 def invoke_cli(cli_runner):
-    """invoking cli commands with options"""
+    """Invoke cli commands with options."""
     return partial(cli_runner.invoke, cli)


+@pytest.fixture
+def invoke_gens_cli(cli_runner):
+    """Invoke GENS preprocessing cli commands with options."""
+    return partial(cli_runner.invoke, gens_preprocessing_cli)
+
+
 @pytest.fixture(scope="session")
 def environ():
-    """environment process"""
+    """Create operating system's environment object."""
     return "os.environ"


 @pytest.fixture(scope="session")
-def config_files():
-    """dict: path of the config files"""
-    return {
"sample": "BALSAMIC/config/sample.json", - "analysis_paired": "BALSAMIC/config/analysis_paired.json", - "cluster_json": "BALSAMIC/config/cluster.json", - "analysis_paired_umi": "BALSAMIC/config/analysis_paired_umi.json", - "analysis_single": "BALSAMIC/config/analysis_single.json", - "analysis_single_umi": "BALSAMIC/config/analysis_single_umi.json", - "panel_bed_file": "tests/test_data/references/panel/panel.bed", - "background_variant_file": "tests/test_data/references/panel/background_variants.txt", - "pon_cnn": "tests/test_data/references/panel/test_panel_ponn.cnn", - "pon_fastq_path": "tests/test_data/fastq/", - } +def cluster_analysis_config_path() -> str: + """Return cluster analysis configuration file.""" + return Path( + CONSTANTS_DIR, f"{ClusterConfigType.ANALYSIS}.{FileType.JSON}" + ).as_posix() @pytest.fixture(scope="session") def reference(): - """reference json model""" + """Return a dictionary for reference json model.""" return { - "reference": { - "reference_genome": "tests/test_data/references/genome/human_g1k_v37_decoy.fasta", - "dbsnp": "tests/test_data/references/variants/dbsnp_grch37_b138.vcf.gz", - "1kg_snps_all": "tests/test_data/references/variants/1k_genome_wgs_p1_v3_all_sites.vcf.gz", - "1kg_snps_high": "tests/test_data/references/variants/1kg_phase1_snps_high_confidence_b37.vcf.gz", - "1kg_known_indel": "tests/test_data/references/variants/1kg_known_indels_b37.vcf.gz", - "mills_1kg": "tests/test_data/references/variants/mills_1kg_index.vcf.gz", - "gnomad_variant": "tests/test_data/reference/variants/gnomad.genomes.r2.1.1.sites.vcf.bgz", - "cosmic": "tests/test_data/references/variants/cosmic_coding_muts_v89.vcf.gz", - "vep": "tests/test_data/references/vep/", - "refflat": "tests/test_data/references/genome/refseq.flat", - "refGene": "tests/test_data/references/genome/refGene.txt", - "wgs_calling_interval": "tests/test_data/references/genome/wgs_calling_regions.v1", - "genome_chrom_size": "tests/test_data/references/genome/hg19.chrom.sizes", - "exon_bed": "tests/test_data/references/genome/refseq.flat.bed", - "rankscore": "tests/test_data/references/genome/cancer_rank_model_-v0.1-.ini", - "access_regions": "tests/test_data/references/genome/access-5k-mappable.hg19.bed", - "delly_exclusion": "tests/test_data/references/genome/delly_exclusion.tsv", - "delly_exclusion_converted": "tests/test_data/references/genome/delly_exclusion_converted.tsv", - "delly_mappability": "tests/test_data/references/genome/delly_mappability.gz", - "delly_mappability_gindex": "tests/test_data/references/genome/delly_mappability.gz.gzi", - "delly_mappability_findex": "tests/test_data/references/genome/delly_mappability.fai", - "ascat_gccorrection": "tests/test_data/references/genome/GRCh37_SnpGcCorrections.tsv", - "ascat_chryloci": "tests/test_data/references/genome/GRCh37_Y.loci", - "clinvar": "tests/test_data/references/genome/clinvar.vcf.gz", - "clinical_snv_observations": "tests/test_data/references/variants/clinical_snv_variants.vcf.gz", - "clinical_sv_observations": "tests/test_data/references/variants/clinical_sv_variants.vcf.gz", - "swegen_snv_frequency": "tests/test_data/references/variants/swegen_snv.vcf.gz", - "swegen_sv_frequency": "tests/test_data/references/variants/swegen_sv.vcf.gz", - "somalier_sites": "tests/test_data/references/variants/GRCh37.somalier.sites.vcf.gz", - } + "reference_genome": "genome/human_g1k_v37_decoy.fasta", + "dbsnp": "variants/dbsnp_grch37_b138.vcf.gz", + "vcf_1kg": "variants/1k_genome_wgs_p1_v3_all_sites.vcf.gz", + "hc_vcf_1kg": 
"variants/1kg_phase1_snps_high_confidence_b37.vcf.gz", + "known_indel_1kg": "variants/1kg_known_indels_b37.vcf.gz", + "mills_1kg": "variants/mills_1kg_index.vcf.gz", + "gnomad_variant": "variants/gnomad.genomes.r2.1.1.sites.vcf.bgz", + "cosmic": "variants/cosmic_coding_muts_v89.vcf.gz", + "vep_dir": "vep/", + "refgene_flat": "genome/refseq.flat", + "refgene_txt": "genome/refGene.txt", + "wgs_calling_regions": "genome/wgs_calling_regions.v1", + "genome_chrom_size": "genome/hg19.chrom.sizes", + "refgene_bed": "genome/refseq.flat.bed", + "rank_score": "genome/cancer_rank_model_-v0.1-.ini", + "access_regions": "genome/access-5k-mappable.hg19.bed", + "delly_exclusion": "genome/delly_exclusion.tsv", + "delly_exclusion_converted": "genome/delly_exclusion_converted.tsv", + "delly_mappability": "genome/delly_mappability.gz", + "delly_mappability_gindex": "genome/delly_mappability.gz.gzi", + "delly_mappability_findex": "genome/delly_mappability.fai", + "ascat_gc_correction": "genome/GRCh37_SnpGcCorrections.tsv", + "ascat_chr_y_loci": "genome/GRCh37_Y.loci", + "clinvar": "genome/clinvar.vcf.gz", + "somalier_sites": "variants/GRCh37.somalier.sites.vcf.gz", + "cadd_snv": "variants/hg19.cadd_snv.tsv.gz", + "cadd_annotations": "cadd/", + "simple_repeat": "genome/simpleRepeat.txt.gz", } @pytest.fixture(scope="session") -def test_data_dir(): - return "tests/test_data" +def reference_panel_dir_path(test_data_dir: str) -> str: + """Return path for reference panel directory.""" + return Path(test_data_dir, "references", "panel").as_posix() @pytest.fixture(scope="session") -def config_path(): - return "tests/test_data/config.json" +def reference_variants_dir_path(test_data_dir: str) -> str: + """Return path for reference variants directory.""" + return Path(test_data_dir, "references", "variants").as_posix() @pytest.fixture(scope="session") -def config_dict(config_path): +def config_path(test_data_dir: str) -> str: + """Created path for config json file.""" + return Path(test_data_dir, f"config.{FileType.JSON}").as_posix() + + +@pytest.fixture(scope="session") +def config_dict(config_path: str) -> str: + """Read and return config from json.""" return read_json(config_path) @pytest.fixture(scope="session") -def pon_fastq_path(): - return "tests/test_data/fastq/" +def config_dict_w_singularity(config_dict: str, balsamic_cache: str) -> str: + """Read and return config from json with singularity image path.""" + modify_dict = copy.deepcopy(config_dict) + modify_dict["singularity"] = { + "image": f"{balsamic_cache}/{balsamic_version}/containers" + } + return modify_dict + + +@pytest.fixture(scope="session") +def pon_config_path(test_data_dir: str) -> str: + """Created path for PON config json file.""" + return Path(test_data_dir, f"config_pon.{FileType.JSON}").as_posix() + + +@pytest.fixture(scope="session") +def pon_config_dict(pon_config_path: str) -> str: + """Read and return PON config from json.""" + return read_json(pon_config_path) + + +@pytest.fixture(scope="session") +def pon_config_dict_w_singularity(pon_config_dict: str, balsamic_cache: str) -> str: + """Read and return PON config from json with singularity image path.""" + modify_pon_config_dict = copy.deepcopy(pon_config_dict) + modify_pon_config_dict["singularity"] = { + "image": f"{balsamic_cache}/{balsamic_version}/containers" + } + return modify_pon_config_dict + + +@pytest.fixture(scope="session") +def cadd_annotations(test_data_dir: str) -> str: + """Return path for CADD annotations.""" + return Path(test_data_dir, "references", 
"cadd").as_posix() + + +@pytest.fixture(scope="session") +def vcf_file_path(test_data_dir: str) -> str: + """Return path for minimal VCF.""" + return Path(test_data_dir, "vcfs", "SNV.germline.sample.dnascope.vcf").as_posix() + + +@pytest.fixture(scope="session") +def vcf_file_gz_path(test_data_dir: str) -> str: + """Return path for minimal gzipped VCF.""" + return Path(test_data_dir, "vcfs", "SNV.germline.sample.dnascope.vcf.gz").as_posix() + + +@pytest.fixture(scope="session") +def gens_cov_pon_file(test_data_dir: str) -> str: + """Return path for dummy GENS male PON file.""" + return Path( + test_data_dir, "references", "gens", "grch37_gens_male_pon_100bp.hdf5" + ).as_posix() + + +@pytest.fixture(scope="session") +def gens_min_5_af_gnomad_file(test_data_dir: str) -> str: + """Return path for dummy GENS minimum af 5 gnomad file.""" + return Path( + test_data_dir, "references", "gens", "gnomad.genomes.r2.1.1.sites_0.05AF.vcf.gz" + ).as_posix() + + +@pytest.fixture(scope="session") +def gens_hg19_interval_list(test_data_dir: str) -> str: + """Return path for dummy hg19 genome 100bp interval list used in GENS.""" + return Path( + test_data_dir, + "references", + "gens", + "grch37_gens_targets_preprocessed_100bp.interval_list", + ).as_posix() + + +@pytest.fixture(scope="session") +def gens_dummy_gnomad_baf_bed(test_data_dir: str) -> str: + """Return path expected dummy result-file created from GENS pre-processing test.""" + return Path( + test_data_dir, + "gens_files", + "dummy.baf.bed", + ).as_posix() + + +@pytest.fixture(scope="session") +def gens_dummy_gnomad_vcf(test_data_dir: str) -> str: + """Return path dummy vcf called in given gnomad for GENS pre-processing test.""" + return Path( + test_data_dir, + "gens_files", + "SNV.germline.dummy.dnascope_gnomad_af5.vcf", + ).as_posix() + + +@pytest.fixture(scope="session") +def gens_dummy_cov_bed(test_data_dir: str) -> str: + """Return path expected dummy result-file created from GENS pre-processing test.""" + return Path( + test_data_dir, + "gens_files", + "dummy.cov.bed", + ).as_posix() + + +@pytest.fixture(scope="session") +def gens_dummy_denoised_cov(test_data_dir: str) -> str: + """Return path dummy coverage file for GENS pre-processing test.""" + return Path( + test_data_dir, + "gens_files", + "dummy.denoisedCR.tsv", + ).as_posix() + + +@pytest.fixture(scope="session") +def panel_bed_file(reference_panel_dir_path: str) -> str: + """Return path for panel bed file.""" + return Path(reference_panel_dir_path, "panel.bed").as_posix() + + +@pytest.fixture(scope="session") +def background_variant_file(reference_panel_dir_path: str) -> str: + """Return path for background variants for TGA.""" + return Path(reference_panel_dir_path, "background_variants.txt").as_posix() + + +@pytest.fixture(scope="session") +def pon_cnn_path(reference_panel_dir_path: str) -> str: + """Creates path for Panel Of Normal (PON), cnn file for cnvkit.""" + return Path(reference_panel_dir_path, "test_panel_ponn.cnn").as_posix() + + +@pytest.fixture(scope="session") +def clinical_snv_observations_path(reference_variants_dir_path: str) -> str: + """Return path for clinical SNVs from loqusDB.""" + return Path(reference_variants_dir_path, "clinical_snv_variants.vcf.gz").as_posix() + + +@pytest.fixture(scope="session") +def cancer_germline_snv_observations_path(reference_variants_dir_path: str) -> str: + """Return path of cancer germline SNVs from loqusDB.""" + return Path( + reference_variants_dir_path, "cancer_germline_snv_variants.vcf.gz" + ).as_posix() + + 
+
+
+@pytest.fixture(scope="session")
+def panel_bed_file(reference_panel_dir_path: str) -> str:
+    """Return path for panel bed file."""
+    return Path(reference_panel_dir_path, "panel.bed").as_posix()
+
+
+@pytest.fixture(scope="session")
+def background_variant_file(reference_panel_dir_path: str) -> str:
+    """Return path for background variants for TGA."""
+    return Path(reference_panel_dir_path, "background_variants.txt").as_posix()
+
+
+@pytest.fixture(scope="session")
+def pon_cnn_path(reference_panel_dir_path: str) -> str:
+    """Create path for the Panel of Normals (PON) .cnn file for CNVkit."""
+    return Path(reference_panel_dir_path, "test_panel_ponn.cnn").as_posix()
+
+
+@pytest.fixture(scope="session")
+def clinical_snv_observations_path(reference_variants_dir_path: str) -> str:
+    """Return path for clinical SNVs from loqusDB."""
+    return Path(reference_variants_dir_path, "clinical_snv_variants.vcf.gz").as_posix()
+
+
+@pytest.fixture(scope="session")
+def cancer_germline_snv_observations_path(reference_variants_dir_path: str) -> str:
+    """Return path of cancer germline SNVs from loqusDB."""
+    return Path(
+        reference_variants_dir_path, "cancer_germline_snv_variants.vcf.gz"
+    ).as_posix()
+
+
+@pytest.fixture(scope="session")
+def cancer_somatic_snv_observations_path(reference_variants_dir_path: str) -> str:
+    """Return path for somatic SNVs from loqusDB."""
+    return Path(
+        reference_variants_dir_path, "cancer_somatic_snv_variants.vcf.gz"
+    ).as_posix()
+
+
+@pytest.fixture(scope="session")
+def clinical_sv_observations_path(reference_variants_dir_path: str) -> str:
+    """Return path for clinical SV observations from loqusDB."""
+    return Path(reference_variants_dir_path, "clinical_sv_variants.vcf.gz").as_posix()


 @pytest.fixture(scope="session")
-def panel_bed_file():
-    return "tests/test_data/references/panel/panel.bed"
+def somatic_sv_observations_path(reference_variants_dir_path: str) -> str:
+    """Return path for somatic SV observations from loqusDB."""
+    return Path(reference_variants_dir_path, "somatic_sv_variants.vcf.gz").as_posix()


 @pytest.fixture(scope="session")
-def background_variant_file():
-    return "tests/test_data/references/panel/background_variants.txt"
+def swegen_snv_frequency_path(reference_variants_dir_path: str) -> str:
+    """Return path for Swegen SNVs."""
+    return Path(reference_variants_dir_path, "swegen_snv.vcf.gz").as_posix()


 @pytest.fixture(scope="session")
-def pon_cnn():
-    return "tests/test_data/references/panel/test_panel_ponn.cnn"
+def swegen_sv_frequency_path(reference_variants_dir_path: str) -> str:
+    """Return path for Swegen SVs."""
+    return Path(reference_variants_dir_path, "swegen_sv.vcf.gz").as_posix()
+
+
+@pytest.fixture(scope="session", name="invalid_json_file")
+def fixture_invalid_json_file(session_tmp_path: Path) -> Path:
+    """Return a non-existent json file path."""
+    return Path(session_tmp_path, f"invalid_file.{FileType.JSON}")
+
+
+@pytest.fixture(scope="session", name="json_file")
+def fixture_json_file(session_tmp_path: Path) -> Path:
+    """Return a mocked json file path."""
+    return Path(session_tmp_path, f"write_json.{FileType.JSON}")
+
+
+@pytest.fixture(scope="session", name="config_json")
+def fixture_config_json() -> str:
+    """Return Balsamic analysis config json file name."""
+    return f"config.{FileType.JSON}"
+
+
+@pytest.fixture(scope="session", name="reference_graph")
+def fixture_reference_graph() -> str:
+    """Return Balsamic reference graph pdf file name."""
+    return "reference_graph.pdf"


 @pytest.fixture(scope="session")
 def sentieon_license(tmp_path_factory):
-    """
-    Sentieon's license path fixture
-    """
+    """Create Sentieon's license path."""
     sentieon_license_dir = tmp_path_factory.mktemp("sentieon_licence")
     sentieon_license_path = sentieon_license_dir / "license_file.lic"
     sentieon_license_path.touch()
@@ -141,9 +583,7 @@ def sentieon_license(tmp_path_factory):

 @pytest.fixture(scope="session")
 def sentieon_install_dir(tmp_path_factory):
-    """
-    Sentieon's license path fixture
-    """
+    """Create install directory for Sentieon tools."""
     sentieon_install_dir = tmp_path_factory.mktemp("sentieon_install_dir")
     Path(sentieon_install_dir / "bin").mkdir(exist_ok=True)
     sentieon_executable = sentieon_install_dir / "bin" / "sentieon"
@@ -152,125 +592,918 @@ def sentieon_install_dir(tmp_path_factory):
     return sentieon_install_dir.as_posix()


+@pytest.fixture()
+def no_write_perm_path(tmp_path_factory) -> str:
+    """Return path with no write permissions."""
+    bad_perm_path: Path = tmp_path_factory.mktemp("bad_perm_path")
+    bad_perm_path.chmod(0o444)
+    return bad_perm_path.as_posix()
+
+
+@pytest.fixture(scope="session")
+def references_dir(test_data_dir) -> Path:
+    """Return a references directory path."""
+    return Path(test_data_dir,
"references") + + +@pytest.fixture(scope="session") +def purity_csv_path(test_data_dir) -> Path: + """Return pureCN purity CSV path.""" + return Path(test_data_dir, "cnv_report", "CNV.somatic.case_id.purecn.purity.csv") + + +@pytest.fixture(scope="session") +def cnv_plot_path(test_data_dir) -> Path: + """Return AscatNgs CNV plot path.""" + return Path( + test_data_dir, + "cnv_report", + "CNV.somatic.sample_tumor_normal_wgs.ascat.ASPCF.png", + ) + + +@pytest.fixture(scope="session") +def cnv_statistics_path(test_data_dir) -> Path: + """Return CNV sample statistics path.""" + return Path( + test_data_dir, + "cnv_report", + "CNV.somatic.sample_tumor_normal_wgs.ascat.samplestatistics.txt", + ) + + +@pytest.fixture(scope="session") +def balsamic_cache( + tmp_path_factory: TempPathFactory, reference: Dict[str, Path], references_dir: Path +) -> str: + """Create and return the path for balsamic-cache.""" + balsamic_cache: Path = tmp_path_factory.mktemp("balsamic_cache") + # Mocked containers directory + container: Path = Path(balsamic_cache, balsamic_version, "containers") + container.mkdir(parents=True, exist_ok=True) + # Mocked cache directory + hg19_reference: Path = Path(balsamic_cache, balsamic_version, "hg19") + shutil.copytree(references_dir, hg19_reference) + reference_json: Path = Path(hg19_reference, f"reference.{FileType.JSON}") + reference_json.touch() + write_json(json_obj=reference, path=reference_json.as_posix()) + return balsamic_cache.as_posix() + + +@pytest.fixture(scope="session") +def analysis_dir(tmp_path_factory: TempPathFactory) -> str: + """Create and return the directory where the case analysis will be saved.""" + analysis_dir = tmp_path_factory.mktemp("analysis", numbered=False) + return analysis_dir.as_posix() + + +@pytest.fixture(scope="session", params=fastq_patterns(), ids=fastq_pattern_ids()) +def fastq_dir(case_id_tumor_normal_fastqdir: str, analysis_dir: str, request): + """Mock directory with tumor and normal FASTQs.""" + fastq_dir: Path = Path(analysis_dir, case_id_tumor_normal_fastqdir, "fastq") + fastq_dir.mkdir(parents=True, exist_ok=True) + + # Fill the fastq path folder with the test fastq-files + fastq_test_dict = request.param + + for fastq in fastq_test_dict["tumor"]: + Path(fastq_dir, fastq).touch() + + for fastq in fastq_test_dict["normal"]: + Path(fastq_dir, fastq).touch() + + yield fastq_dir.as_posix() + + for fastq in fastq_test_dict["tumor"]: + Path.unlink(fastq_dir / fastq) + + for fastq in fastq_test_dict["normal"]: + Path.unlink(fastq_dir / fastq) + + @pytest.fixture(scope="session") -def no_write_perm_path(tmp_path_factory): +def fastq_dir_tumor_only( + analysis_dir: str, case_id_tumor_only: str, tumor_fastq_names: List[str] +) -> str: """ - A path with no write permission + Creates and returns the directory containing the FASTQs for tumor-only. 
""" - # create a conda_env directory - bad_perm_path = tmp_path_factory.mktemp("bad_perm_path") + fastq_dir: Path = Path(analysis_dir, case_id_tumor_only, "fastq") + fastq_dir.mkdir(parents=True, exist_ok=True) - Path(bad_perm_path).chmod(0o444) + # Fill the fastq path folder with the test fastq-files + for fastq in tumor_fastq_names: + Path(fastq_dir, fastq).touch() + + yield fastq_dir.as_posix() - return bad_perm_path.as_posix() + +@pytest.fixture(scope="session") +def fastq_dir_symlinked( + tumor_fastq_names: List[str], session_tmp_path: Path, reference_file: Path +) -> Path: + """Return directory containing symlinked FASTQs for tumor-only.""" + fastq_dir: Path = Path(session_tmp_path, "fastq") + fastq_dir.mkdir(parents=True, exist_ok=True) + for fastq in tumor_fastq_names: + Path(session_tmp_path, fastq).touch() + for fastq in tumor_fastq_names: + Path(fastq_dir, fastq).symlink_to(reference_file) + return fastq_dir @pytest.fixture(scope="session") -def sample_fastq(tmp_path_factory): +def fastq_dir_tumor_only_pon_cnn( + analysis_dir: str, case_id_tumor_only_pon_cnn: str, tumor_fastq_names: List[str] +) -> str: """ - create sample fastq files + Creates and returns the directory containing the FASTQs for tumor-only w pon cnn. """ - fastq_dir = tmp_path_factory.mktemp("fastq") - fastq_valid = fastq_dir / "S1_R_1.fastq.gz" - fastq_invalid = fastq_dir / "sample.fastq.gz" - - # dummy tumor fastq file - tumorfastqr1 = fastq_dir / "concatenated_tumor_XXXXXX_R_1.fastq.gz" - tumorfastqr2 = fastq_dir / "concatenated_tumor_XXXXXX_R_2.fastq.gz" - - # dummy normal fastq file - normalfastqr1 = fastq_dir / "concatenated_normal_XXXXXX_R_1.fastq.gz" - normalfastqr2 = fastq_dir / "concatenated_normal_XXXXXX_R_2.fastq.gz" - - for fastq_file in ( - fastq_valid, - fastq_invalid, - tumorfastqr1, - tumorfastqr2, - normalfastqr1, - normalfastqr2, - ): - fastq_file.touch() + fastq_dir: Path = Path(analysis_dir, case_id_tumor_only_pon_cnn, "fastq") + fastq_dir.mkdir(parents=True, exist_ok=True) - return { - "fastq_valid": fastq_valid.absolute().as_posix(), - "fastq_invalid": fastq_invalid.absolute().as_posix(), - "tumor": tumorfastqr1.absolute().as_posix(), - "normal": normalfastqr1.absolute().as_posix(), - } + # Fill the fastq path folder with the test fastq-files + for fastq in tumor_fastq_names: + Path(fastq_dir, fastq).touch() + + yield fastq_dir.as_posix() + + +@pytest.fixture(scope="session") +def fastq_dir_tumor_only_wgs( + analysis_dir: str, case_id_tumor_only_wgs: str, tumor_fastq_names: List[str] +) -> str: + """ + Creates and returns the directory containing the FASTQs for tumor-only WGS. + """ + fastq_dir: Path = Path(analysis_dir, case_id_tumor_only_wgs, "fastq") + fastq_dir.mkdir(parents=True, exist_ok=True) + + # Fill the fastq path folder with the test fastq-files + for fastq in tumor_fastq_names: + Path(fastq_dir, fastq).touch() + + yield fastq_dir.as_posix() + + +@pytest.fixture(scope="session") +def fastq_dir_tumor_normal( + analysis_dir: str, + case_id_tumor_normal: str, + tumor_fastq_names: List[str], + normal_fastq_names: List[str], +) -> str: + """ + Creates and returns the directory containing the FASTQs for tumor-normal. 
+ """ + fastq_dir: Path = Path(analysis_dir, case_id_tumor_normal, "fastq") + fastq_dir.mkdir(parents=True, exist_ok=True) + + # Fill the fastq path folder with the test fastq-files + for fastq in tumor_fastq_names: + Path(fastq_dir, fastq).touch() + + for fastq in normal_fastq_names: + Path(fastq_dir, fastq).touch() + + yield fastq_dir.as_posix() + + +@pytest.fixture(scope="session") +def fastq_dir_tumor_normal_wgs( + analysis_dir: str, + case_id_tumor_normal_wgs: str, + tumor_fastq_names: List[str], + normal_fastq_names: List[str], +) -> str: + """ + Creates and returns the directory containing the FASTQs for tumor-normal WGS. + """ + fastq_dir: Path = Path(analysis_dir, case_id_tumor_normal_wgs, "fastq") + fastq_dir.mkdir(parents=True, exist_ok=True) + + # Fill the fastq path folder with the test fastq-files + for fastq in tumor_fastq_names: + Path(fastq_dir, fastq).touch() + + for fastq in normal_fastq_names: + Path(fastq_dir, fastq).touch() + + yield fastq_dir.as_posix() + + +@pytest.fixture(scope="session") +def fastq_dir_tumor_normal_qc( + analysis_dir: str, + case_id_tumor_normal_qc: str, + tumor_fastq_names: List[str], + normal_fastq_names: List[str], +) -> str: + """ + Creates and returns the directory containing the FASTQs for tumor-normal QC workflow. + """ + fastq_dir: Path = Path(analysis_dir, case_id_tumor_normal_qc, "fastq") + fastq_dir.mkdir(parents=True, exist_ok=True) + + # Fill the fastq path folder with the test fastq-files + for fastq in tumor_fastq_names: + Path(fastq_dir, fastq).touch() + + for fastq in normal_fastq_names: + Path(fastq_dir, fastq).touch() + + yield fastq_dir.as_posix() + + +@pytest.fixture(scope="session") +def fastq_dir_tumor_only_qc( + analysis_dir: str, case_id_tumor_only_qc: str, tumor_fastq_names: List[str] +) -> str: + """Creates and returns the directory containing the FASTQs.""" + fastq_dir: Path = Path(analysis_dir, case_id_tumor_only_qc, "fastq") + fastq_dir.mkdir(parents=True, exist_ok=True) + + # Fill the fastq path folder with the test fastq-files + for fastq in tumor_fastq_names: + Path(fastq_dir, fastq).touch() + + yield fastq_dir.as_posix() + + +@pytest.fixture(scope="session") +def fastq_dir_tumor_only_dummy_vep( + analysis_dir: str, case_id_tumor_only_dummy_vep: str, tumor_fastq_names: List[str] +) -> str: + """Creates and returns the directory containing the FASTQs.""" + fastq_dir: Path = Path(analysis_dir, case_id_tumor_only_dummy_vep, "fastq") + fastq_dir.mkdir(parents=True, exist_ok=True) + + # Fill the fastq path folder with the test fastq-files + for fastq in tumor_fastq_names: + Path(fastq_dir, fastq).touch() + + vep_dir: Path = Path(analysis_dir, case_id_tumor_only_dummy_vep, "analysis", "vep") + vep_dir.mkdir(parents=True, exist_ok=True) + vep_test_file = ( + "SNV.somatic.sample_tumor_only.vardict.research.filtered.pass.vcf.gz" + ) + Path(vep_dir, vep_test_file).touch() + + yield fastq_dir.as_posix() @pytest.fixture(scope="session") -def balsamic_cache(tmp_path_factory, reference): +def fastq_dir_pon(analysis_dir: str, case_id_pon: str, pon_fastq_list: list) -> str: """ - Create singularity container + Creates and returns the directory containing the FASTQs for PON creation workflow. 
""" - cache_dir = tmp_path_factory.mktemp("balsmic_coche") + fastq_dir: Path = Path(analysis_dir, case_id_pon, "fastq") + fastq_dir.mkdir(parents=True, exist_ok=True) - cache_container = cache_dir / balsamic_version / "containers" / "align_qc" - cache_container.mkdir(parents=True, exist_ok=True) - cache_container_example = cache_container / "example.sif" - cache_container_example.touch() + for fastq in pon_fastq_list: + Path(fastq_dir, fastq).touch() - cache_reference = cache_dir / balsamic_version / "hg19" - cache_reference.mkdir(parents=True, exist_ok=True) + yield fastq_dir.as_posix() - cache_reference_json = cache_reference / "reference.json" - cache_reference_json.touch() - with open(cache_reference_json, "w") as fp: - json.dump(reference, fp) - return cache_dir.as_posix() +@pytest.fixture(scope="session") +def fastq_dir_gens_pon( + analysis_dir: str, case_id_gens_pon: str, pon_fastq_list: list +) -> str: + """ + Creates and returns the directory containing the FASTQs for PON creation workflow. + """ + + fastq_dir: Path = Path(analysis_dir, case_id_gens_pon, "fastq") + fastq_dir.mkdir(parents=True, exist_ok=True) + + for fastq in pon_fastq_list: + Path(fastq_dir, fastq).touch() + + yield fastq_dir.as_posix() + + +@pytest.fixture(scope="session") +def empty_fastq_dir(analysis_dir: str, case_id_tumor_normal: str) -> str: + """ + Creates and returns an empty FASTQ directory. + """ + fastq_dir: Path = Path(analysis_dir, case_id_tumor_normal, "fastq_empty") + fastq_dir.mkdir(parents=True, exist_ok=True) + + yield fastq_dir.as_posix() + + +@pytest.fixture(scope="session", params=fastq_patterns(), ids=fastq_pattern_ids()) +def fastq_dir_tumor_normal_parameterize( + analysis_dir: str, case_id_tumor_normal: str, request +) -> str: + """ + Creates and returns the directory containing the FASTQs for tumor-normal once for each fastq-name structure. 
+ """ + fastq_dir: Path = Path(analysis_dir, case_id_tumor_normal, "fastq_parameterize") + fastq_dir.mkdir(parents=True, exist_ok=True) + + # Fill the fastq path folder with the test fastq-files + fastq_test_dict = request.param + for fastq in fastq_test_dict["tumor"]: + Path(fastq_dir, fastq).touch() + + for fastq in fastq_test_dict["normal"]: + Path(fastq_dir, fastq).touch() + + yield fastq_dir.as_posix() + + for fastq in fastq_test_dict["tumor"]: + Path.unlink(fastq_dir / fastq) + + for fastq in fastq_test_dict["normal"]: + Path.unlink(fastq_dir / fastq) + + +@pytest.fixture(scope="session") +def fastq_dir_tumor_duplicate_fastq_patterns( + analysis_dir: str, + case_id_tumor_normal: str, + fastq_names_duplicate_assigned_fastq_patterns: Dict, +) -> str: + """Creates and returns the directory containing the FASTQs to test duplicate fastq-patterns.""" + fastq_dir: Path = Path( + analysis_dir, case_id_tumor_normal, "fastq_duplicate_assigned_fastq_patterns" + ) + fastq_dir.mkdir(parents=True, exist_ok=True) + + # Fill the fastq path folder with the test fastq-files + for fastq in fastq_names_duplicate_assigned_fastq_patterns["tumor"]: + Path(fastq_dir, fastq).touch() + + yield fastq_dir.as_posix() + + +@pytest.fixture(scope="session") +def config_dict_w_fastqs( + analysis_dir: str, + case_id_tumor_normal: str, + config_dict_w_singularity: str, + standard_samples_list: List[Dict], +) -> str: + """Change samples-list in config and create test fastq-files.""" + + fastq_dir: Path = Path( + analysis_dir, + case_id_tumor_normal, + "fastq_standard_names", + ) + fastq_dir.mkdir(parents=True, exist_ok=True) + + # Change fastq_path to be the newly created test fastq dir + modified_config = copy.deepcopy(config_dict_w_singularity) + modified_config["analysis"]["fastq_path"] = fastq_dir.as_posix() + + # Create analysis_dirs + analysis_sub_dirs = ["script", "log", "result", "benchmark"] + for analysis_sub_dir in analysis_sub_dirs: + analysis_sub_dir_path: Path = Path( + analysis_dir, case_id_tumor_normal, analysis_sub_dir + ) + analysis_sub_dir_path.mkdir(parents=True, exist_ok=True) + modified_config["analysis"][analysis_sub_dir] = analysis_sub_dir_path.as_posix() + + # Fill the fastq path folder with the test fastq-files + samples = standard_samples_list + for sample_dict in samples: + for fastq_pattern, values in sample_dict["fastq_info"].items(): + values["fwd"] = fastq_dir.joinpath(values["fwd"]).as_posix() + values["rev"] = fastq_dir.joinpath(values["rev"]).as_posix() + # Create dummy fastq files + Path(values["fwd"]).touch() + Path(values["rev"]).touch() + + # Modify input config sample list to correspond to current test sample list + modified_config["samples"] = samples + + return modified_config + + +@pytest.fixture(scope="session") +def pon_config_dict_w_fastq( + analysis_dir: str, + case_id_pon, + pon_config_dict_w_singularity: str, + balsamic_cache: str, + standard_samples_list_pon: List[Dict], +) -> str: + """Create fastqs and modify pon config to contain created fastq paths.""" + fastq_dir: Path = Path( + analysis_dir, + case_id_pon, + "fastq_standard_names_pon", + ) + fastq_dir.mkdir(parents=True, exist_ok=True) + + pon_config_w_fastq = copy.deepcopy(pon_config_dict_w_singularity) + pon_config_w_fastq["analysis"]["fastq_path"] = fastq_dir.as_posix() + + # Create analysis_dirs and modify config + analysis_sub_dirs = ["script", "log", "result", "benchmark"] + for analysis_sub_dir in analysis_sub_dirs: + analysis_sub_dir_path: Path = Path(analysis_dir, case_id_pon, analysis_sub_dir) + 
analysis_sub_dir_path.mkdir(parents=True, exist_ok=True) + pon_config_w_fastq["analysis"][ + analysis_sub_dir + ] = analysis_sub_dir_path.as_posix() + + # Fill the fastq path folder with the test fastq-files + samples_list = standard_samples_list_pon + for sample_dict in samples_list: + for fastq_pattern, values in sample_dict["fastq_info"].items(): + fwd_fastq_path = f"{fastq_dir}/{os.path.basename(values['fwd'])}" + rev_fastq_path = f"{fastq_dir}/{os.path.basename(values['rev'])}" + values["fwd"] = fwd_fastq_path + values["rev"] = rev_fastq_path + Path(fwd_fastq_path).touch() + Path(rev_fastq_path).touch() + + # Modify input config sample list to correspond to current test sample list + pon_config_w_fastq["samples"] = samples_list + + return pon_config_w_fastq + + +@pytest.fixture(scope="session") +def config_w_fastq_dir_for_duplicate_fastq_patterns_model( + analysis_dir: str, + case_id_tumor_normal: str, + sample_list_duplicate_assigned_fastq_patterns_model: List[Dict], + config_dict_w_singularity: Dict, +) -> str: + """Creates a FASTQ directory to test duplicate fastq-patterns in the model and returns the modified config.""" + + fastq_dir: Path = Path( + analysis_dir, + case_id_tumor_normal, + "fastq_duplicate_assigned_fastq_patterns_model", + ) + fastq_dir.mkdir(parents=True, exist_ok=True) + + # Change fastq_path to be the newly created test fastq dir + modified_config = copy.deepcopy(config_dict_w_singularity) + modified_config["analysis"]["fastq_path"] = fastq_dir.as_posix() + + # Create analysis_dirs and modify config + analysis_sub_dirs = ["script", "log", "result", "benchmark"] + for analysis_sub_dir in analysis_sub_dirs: + analysis_sub_dir_path: Path = Path( + analysis_dir, case_id_tumor_normal, analysis_sub_dir + ) + analysis_sub_dir_path.mkdir(parents=True, exist_ok=True) + modified_config["analysis"][analysis_sub_dir] = analysis_sub_dir_path.as_posix() + + # Fill the fastq path folder with the test fastq-files + samples = sample_list_duplicate_assigned_fastq_patterns_model + for sample_dict in samples: + for fastq_pattern, values in sample_dict["fastq_info"].items(): + values["fwd"] = fastq_dir.joinpath(values["fwd"]).as_posix() + values["rev"] = fastq_dir.joinpath(values["rev"]).as_posix() + # Create dummy fastq files + Path(values["fwd"]).touch() + Path(values["rev"]).touch() + + # Modify input config sample list to correspond to current test sample list + modified_config["samples"] = samples + + return modified_config + + +@pytest.fixture(scope="session") +def config_tumor_normal_extrafile( + analysis_dir: str, + case_id_tumor_normal: str, + config_dict_w_singularity: Dict, +) -> str: + """Creates a FASTQ directory to test detection of unassigned fastq-files and returns the modified config.""" + fastq_dir: Path = Path(analysis_dir, case_id_tumor_normal, "fastq_extrafile") + fastq_dir.mkdir(parents=True, exist_ok=True) + + # Fill the fastq path folder with the test fastq-files + modified_config = copy.deepcopy(config_dict_w_singularity) + + # Change fastq_path to be the newly created test fastq dir + modified_config["analysis"]["fastq_path"] = fastq_dir.as_posix() + + # Create analysis_dirs and modify config + analysis_sub_dirs = ["script", "log", "result", "benchmark"] + for analysis_sub_dir in analysis_sub_dirs: + analysis_sub_dir_path: Path = Path( + analysis_dir, case_id_tumor_normal, analysis_sub_dir + ) + analysis_sub_dir_path.mkdir(parents=True, exist_ok=True) + modified_config["analysis"][analysis_sub_dir] = analysis_sub_dir_path.as_posix() + + samples_list = 
modified_config["samples"] + for sample_dict in samples_list: + for fastq_pattern, values in sample_dict["fastq_info"].items(): + fwd_fastq_path = f"{fastq_dir}/{os.path.basename(values['fwd'])}" + rev_fastq_path = f"{fastq_dir}/{os.path.basename(values['rev'])}" + values["fwd"] = fwd_fastq_path + values["rev"] = rev_fastq_path + Path(fwd_fastq_path).touch() + Path(rev_fastq_path).touch() + modified_config["samples"] = samples_list + + # Add extra files not assigned to dict + extra_file1 = "ACC3fail_S1_L001_R1_001.fastq.gz" + extra_file2 = "ACC3fail_S1_L001_R2_001.fastq.gz" + Path(fastq_dir, extra_file1).touch() + Path(fastq_dir, extra_file2).touch() + + # Returned modified dict + return modified_config + + +@pytest.fixture(scope="session") +def snakemake_job_script(tmp_path_factory, tumor_normal_config): + """Create a dummy snakemake jobscript""" + script_dir = tmp_path_factory.mktemp("snakemake_script") + snakemake_script_file = script_dir / "example_script.sh" + snakemake_script = """#!/bin/sh +# properties = {"type": "single", "rule": "all", "local": false, "input": ["dummy_path"], "output": ["dummy_path"], "wildcards": {}, "params": {}, "log": [], "threads": 1, "resources": {}, "jobid": 0, "cluster": {"name": "BALSAMIC.all.", "time": "00:15:00", "n": 1, "mail_type": "END", "partition": "core"}} +ls -l # dummy command +""" + snakemake_script_file.touch() + with open(snakemake_script_file, "w") as fn: + fn.write(snakemake_script) + + return {"snakescript": str(snakemake_script_file)} + + +@pytest.fixture(name="helpers") +def fixture_config_helpers(): + """Return helper for case config files""" + return ConfigHelper() + + +@pytest.fixture(scope="session") +def balsamic_model( + config_dict_w_fastqs: Dict, +) -> ConfigModel: + """Return ConfigModel parsed from static tumor normal config dict.""" + # Initialize balsamic model + balsamic_config = ConfigModel.model_validate(config_dict_w_fastqs) + return balsamic_config + + +@pytest.fixture(scope="session") +def balsamic_pon_model( + pon_config_dict_w_fastq: Dict, +) -> ConfigModel: + """Return ConfigModel parsed from static PON config dict.""" + # Initialize ConfigModel + balsamic_pon_config = ConfigModel.model_validate(pon_config_dict_w_fastq) + return balsamic_pon_config + + +@pytest.fixture(scope="session") +def config_case_cli( + balsamic_cache: str, + background_variant_file: str, + cadd_annotations: str, + swegen_snv_frequency_path: str, + swegen_sv_frequency_path: str, + clinical_snv_observations_path: str, + clinical_sv_observations_path: str, + somatic_sv_observations_path: str, + cancer_germline_snv_observations_path: str, + cancer_somatic_snv_observations_path: str, +) -> List[str]: + """Return common config case CLI.""" + return [ + "--balsamic-cache", + balsamic_cache, + "--background-variants", + background_variant_file, + "--cadd-annotations", + cadd_annotations, + "--swegen-snv", + swegen_snv_frequency_path, + "--swegen-sv", + swegen_sv_frequency_path, + "--clinical-snv-observations", + clinical_snv_observations_path, + "--clinical-sv-observations", + clinical_sv_observations_path, + "--cancer-somatic-sv-observations", + somatic_sv_observations_path, + "--cancer-germline-snv-observations", + cancer_germline_snv_observations_path, + "--cancer-somatic-snv-observations", + cancer_somatic_snv_observations_path, + ] + + +@pytest.fixture(scope="session") +def tumor_only_config_qc( + case_id_tumor_only_qc: str, + analysis_dir: str, + fastq_dir_tumor_only_qc: str, + tumor_sample_name: str, + panel_bed_file: str, + sentieon_license: 
str, + sentieon_install_dir: str, + config_case_cli: list[str], +) -> str: + """Invoke balsamic config sample to create sample configuration file for tumor-only TGA.""" + + with mock.patch.dict( + MOCKED_OS_ENVIRON, + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, + ): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "case", + "--case-id", + case_id_tumor_only_qc, + "--analysis-workflow", + AnalysisWorkflow.BALSAMIC_QC, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_tumor_only_qc, + "--tumor-sample-name", + tumor_sample_name, + "-p", + panel_bed_file, + ] + + config_case_cli, + ) + + return Path( + analysis_dir, case_id_tumor_only_qc, f"{case_id_tumor_only_qc}.{FileType.JSON}" + ).as_posix() + + +@pytest.fixture(scope="session") +def tumor_normal_config_qc( + case_id_tumor_normal_qc: str, + tumor_sample_name: str, + normal_sample_name: str, + analysis_dir: str, + fastq_dir_tumor_normal_qc: str, + sentieon_license: str, + sentieon_install_dir: str, + config_case_cli: list[str], +) -> str: + """Invoke balsamic config sample to create sample configuration file for tumor-normal TGA QC workflow.""" + + with mock.patch.dict( + MOCKED_OS_ENVIRON, + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, + ): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "case", + "--case-id", + case_id_tumor_normal_qc, + "--analysis-workflow", + AnalysisWorkflow.BALSAMIC_QC, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_tumor_normal_qc, + "--tumor-sample-name", + tumor_sample_name, + "--normal-sample-name", + normal_sample_name, + ] + + config_case_cli, + ) + + return Path( + analysis_dir, + case_id_tumor_normal_qc, + f"{case_id_tumor_normal_qc}.{FileType.JSON}", + ).as_posix() + + +@pytest.fixture(scope="session") +def tumor_normal_config_qc_wgs( + case_id_tumor_normal_qc_wgs: str, + analysis_dir: str, + fastq_dir_tumor_normal_qc_wgs: str, + tumor_sample_name: str, + normal_sample_name: str, + sentieon_license: str, + sentieon_install_dir: str, + config_case_cli: List[str], +) -> str: + """Invoke balsamic config sample to create sample configuration file for tumor-normal WGS QC workflow.""" + + with mock.patch.dict( + MOCKED_OS_ENVIRON, + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, + ): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "case", + "--case-id", + case_id_tumor_normal_qc_wgs, + "--analysis-workflow", + AnalysisWorkflow.BALSAMIC_QC, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_tumor_normal_qc_wgs, + "--tumor-sample-name", + tumor_sample_name, + "--normal-sample-name", + normal_sample_name, + ] + + config_case_cli, + ) + + return Path( + analysis_dir, + case_id_tumor_normal_qc_wgs, + f"{case_id_tumor_normal_qc_wgs}.{FileType.JSON}", + ).as_posix() + + +@pytest.fixture(scope="session") +def tumor_only_config( + case_id_tumor_only: str, + tumor_sample_name: str, + analysis_dir: str, + fastq_dir_tumor_only: str, + panel_bed_file: str, + sentieon_license: str, + sentieon_install_dir: str, + config_case_cli: list[str], +) -> str: + """Invoke balsamic config sample to create sample configuration file for tumor-only TGA.""" + with mock.patch.dict( + MOCKED_OS_ENVIRON, + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, + ): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "case", + "--case-id", + 
case_id_tumor_only, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_tumor_only, + "--tumor-sample-name", + tumor_sample_name, + "-p", + panel_bed_file, + ] + + config_case_cli, + ) + return Path( + analysis_dir, + case_id_tumor_only, + f"{case_id_tumor_only}.{FileType.JSON}", + ).as_posix() @pytest.fixture(scope="session") -def analysis_dir(tmp_path_factory): - """ - Creates and returns analysis directory - """ - analysis_dir = tmp_path_factory.mktemp("analysis", numbered=False) +def tumor_normal_config( + case_id_tumor_normal: str, + tumor_sample_name: str, + normal_sample_name: str, + analysis_dir: str, + fastq_dir_tumor_normal: str, + panel_bed_file: str, + sentieon_license: str, + sentieon_install_dir: str, + config_case_cli: list[str], +) -> str: + """Invoke balsamic config sample to create sample configuration file for tumor-normal TGA.""" - return analysis_dir.as_posix() + with mock.patch.dict( + MOCKED_OS_ENVIRON, + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, + ): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "case", + "--case-id", + case_id_tumor_normal, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_tumor_normal, + "--tumor-sample-name", + tumor_sample_name, + "--normal-sample-name", + normal_sample_name, + "-p", + panel_bed_file, + ] + + config_case_cli, + ) + + return Path( + analysis_dir, + case_id_tumor_normal, + f"{case_id_tumor_normal}.{FileType.JSON}", + ).as_posix() @pytest.fixture(scope="session") -def snakemake_job_script(tmp_path_factory, tumor_normal_config): - """ - Creates a dummy snakemake jobscript - """ +def tumor_only_umi_config( + case_id_tumor_only_umi: str, + tumor_sample_name: str, + analysis_dir: str, + fastq_dir_tumor_only: str, + panel_bed_file: str, + sentieon_license: str, + sentieon_install_dir: str, + config_case_cli: list[str], +) -> str: + """Invoke balsamic config sample to create sample configuration file for tumor-only TGA.""" - script_dir = tmp_path_factory.mktemp("snakemake_script") - snakemake_script_file = script_dir / "example_script.sh" - snakemake_script = """#!/bin/sh -# properties = {"type": "single", "rule": "all", "local": false, "input": ["dummy_path"], "output": ["dummy_path"], "wildcards": {}, "params": {}, "log": [], "threads": 1, "resources": {}, "jobid": 0, "cluster": {"name": "BALSAMIC.all.", "time": "00:15:00", "n": 1, "mail_type": "END", "partition": "core"}} -ls -l # dummy command -""" - snakemake_script_file.touch() - with open(snakemake_script_file, "w") as fn: - fn.write(snakemake_script) + with mock.patch.dict( + MOCKED_OS_ENVIRON, + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, + ): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "case", + "--case-id", + case_id_tumor_only_umi, + "--analysis-workflow", + AnalysisWorkflow.BALSAMIC_UMI, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_tumor_only, + "--tumor-sample-name", + tumor_sample_name, + "-p", + panel_bed_file, + ] + + config_case_cli, + ) - return {"snakescript": str(snakemake_script_file)} + return Path( + analysis_dir, + case_id_tumor_only_umi, + f"{case_id_tumor_only_umi}.{FileType.JSON}", + ).as_posix() @pytest.fixture(scope="session") -def tumor_normal_config( - tmp_path_factory, - sample_fastq, - analysis_dir, - balsamic_cache, - background_variant_file, - panel_bed_file, - sentieon_license, - sentieon_install_dir, -): - """ - invokes balsamic config sample -t xxx -n xxx to 
create sample config - for tumor-normal - """ - case_id = "sample_tumor_normal" - tumor = sample_fastq["tumor"] - normal = sample_fastq["normal"] +def tumor_normal_umi_config( + case_id_tumor_normal_umi: str, + tumor_sample_name: str, + normal_sample_name: str, + analysis_dir: str, + fastq_dir_tumor_normal: str, + panel_bed_file: str, + sentieon_license: str, + sentieon_install_dir: str, + config_case_cli: list[str], +) -> str: + """Invoke balsamic config sample to create sample configuration file for tumor-normal TGA.""" with mock.patch.dict( MOCKED_OS_ENVIRON, @@ -287,55 +1520,87 @@ def tumor_normal_config( "case", "-p", panel_bed_file, - "-t", - tumor, - "-n", - normal, "--case-id", - case_id, + case_id_tumor_normal_umi, + "--analysis-workflow", + AnalysisWorkflow.BALSAMIC_UMI, "--analysis-dir", analysis_dir, - "--balsamic-cache", - balsamic_cache, + "--fastq-path", + fastq_dir_tumor_normal, "--tumor-sample-name", - "ACC1", + tumor_sample_name, "--normal-sample-name", - "ACC2", - "--background-variants", - background_variant_file, - ], + normal_sample_name, + "-p", + panel_bed_file, + ] + + config_case_cli, ) - qc_dir = Path(analysis_dir, case_id, "analysis", "qc") - qc_dir.mkdir(parents=True, exist_ok=False) - copy_tree("tests/test_data/qc_files/analysis/qc/", qc_dir.as_posix()) + return Path( + analysis_dir, + case_id_tumor_normal_umi, + f"{case_id_tumor_normal_umi}.{FileType.JSON}", + ).as_posix() - return Path(analysis_dir, case_id, case_id + ".json").as_posix() +@pytest.fixture(scope="session") +def tumor_only_wgs_config( + case_id_tumor_only_wgs: str, + analysis_dir: str, + fastq_dir_tumor_only_wgs: str, + tumor_sample_name: str, + sentieon_license: str, + sentieon_install_dir: str, + config_case_cli: List[str], +) -> str: + """Invoke balsamic config sample to create sample configuration file for tumor-only WGS.""" -@pytest.fixture(name="helpers") -def fixture_config_helpers(): - """Helper fixture for case config files""" - return ConfigHelper() + with mock.patch.dict( + MOCKED_OS_ENVIRON, + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, + ): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "case", + "--case-id", + case_id_tumor_only_wgs, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_tumor_only_wgs, + "--tumor-sample-name", + tumor_sample_name, + ] + + config_case_cli, + ) + + return Path( + analysis_dir, + case_id_tumor_only_wgs, + f"{case_id_tumor_only_wgs}.{FileType.JSON}", + ).as_posix() @pytest.fixture(scope="session") def tumor_normal_wgs_config( - tmp_path_factory, - sample_fastq, - analysis_dir, - balsamic_cache, - sentieon_license, - sentieon_install_dir, -): - """ - invokes balsamic config sample -t xxx -n xxx to create sample config - for tumor-normal - """ - case_id = "sample_tumor_normal_wgs" - tumor = sample_fastq["tumor"] - normal = sample_fastq["normal"] - + case_id_tumor_normal_wgs: str, + analysis_dir: str, + fastq_dir_tumor_normal_wgs: str, + tumor_sample_name: str, + normal_sample_name: str, + sentieon_license: str, + sentieon_install_dir: str, + config_case_cli: str, +) -> str: + """Invoke balsamic config sample to create sample configuration file for tumor-normal WGS.""" with mock.patch.dict( MOCKED_OS_ENVIRON, { @@ -349,39 +1614,40 @@ def tumor_normal_wgs_config( [ "config", "case", - "-t", - tumor, - "-n", - normal, "--case-id", - case_id, - "--balsamic-cache", - balsamic_cache, + case_id_tumor_normal_wgs, "--analysis-dir", analysis_dir, - ], + "--fastq-path", + 
fastq_dir_tumor_normal_wgs, + "--tumor-sample-name", + tumor_sample_name, + "--normal-sample-name", + normal_sample_name, + ] + + config_case_cli, ) - return Path(analysis_dir, case_id, case_id + ".json").as_posix() + return Path( + analysis_dir, + case_id_tumor_normal_wgs, + f"{case_id_tumor_normal_wgs}.{FileType.JSON}", + ).as_posix() @pytest.fixture(scope="session") -def tumor_only_config( - tmp_path_factory, - sample_fastq, - balsamic_cache, - background_variant_file, - analysis_dir, - panel_bed_file, - sentieon_license, - sentieon_install_dir, -): - """ - invokes balsamic config sample -t xxx to create sample config - for tumor only - """ - case_id = "sample_tumor_only" - tumor = sample_fastq["tumor"] +def tumor_only_config_dummy_vep( + case_id_tumor_only_dummy_vep: str, + tumor_sample_name: str, + balsamic_cache: str, + analysis_dir: str, + fastq_dir_tumor_only_dummy_vep: str, + panel_bed_file: str, + background_variant_file: str, + sentieon_license: str, + sentieon_install_dir: str, +) -> str: + """Invoke balsamic config sample to create sample configuration file for tumor-only TGA with dummy VEP file.""" with mock.patch.dict( MOCKED_OS_ENVIRON, @@ -396,43 +1662,42 @@ def tumor_only_config( [ "config", "case", - "-p", - panel_bed_file, - "-t", - tumor, "--case-id", - case_id, + case_id_tumor_only_dummy_vep, "--analysis-dir", analysis_dir, + "--fastq-path", + fastq_dir_tumor_only_dummy_vep, + "-p", + panel_bed_file, "--balsamic-cache", balsamic_cache, "--background-variants", background_variant_file, + "--tumor-sample-name", + tumor_sample_name, ], ) - - qc_dir = Path(analysis_dir, case_id, "analysis", "qc") - qc_dir.mkdir(parents=True, exist_ok=False) - copy_tree("tests/test_data/qc_files/analysis/qc/", qc_dir.as_posix()) - - return Path(analysis_dir, case_id, case_id + ".json").as_posix() + return Path( + analysis_dir, + case_id_tumor_only_dummy_vep, + f"{case_id_tumor_only_dummy_vep}.{FileType.JSON}", + ).as_posix() @pytest.fixture(scope="session") -def tumor_only_wgs_config( - tmp_path_factory, - sample_fastq, - analysis_dir, - balsamic_cache, - sentieon_license, - sentieon_install_dir, -): - """ - invokes balsamic config sample -t xxx to create sample config - for tumor only - """ - case_id = "sample_tumor_only_wgs" - tumor = sample_fastq["tumor"] +def tumor_only_pon_config( + case_id_tumor_only_pon_cnn: str, + tumor_sample_name: str, + analysis_dir: str, + fastq_dir_tumor_only_pon_cnn: str, + panel_bed_file: str, + pon_cnn_path: str, + balsamic_cache: str, + sentieon_license: str, + sentieon_install_dir: str, +) -> str: + """Invoke balsamic PON config sample to create sample configuration file for tumor-only TGA.""" with mock.patch.dict( MOCKED_OS_ENVIRON, @@ -447,37 +1712,41 @@ def tumor_only_wgs_config( [ "config", "case", - "-t", - tumor, "--case-id", - case_id, + case_id_tumor_only_pon_cnn, "--analysis-dir", analysis_dir, + "--fastq-path", + fastq_dir_tumor_only_pon_cnn, + "-p", + panel_bed_file, + "--pon-cnn", + pon_cnn_path, "--balsamic-cache", balsamic_cache, + "--tumor-sample-name", + tumor_sample_name, ], ) - return Path(analysis_dir, case_id, case_id + ".json").as_posix() + return Path( + analysis_dir, + case_id_tumor_only_pon_cnn, + f"{case_id_tumor_only_pon_cnn}.{FileType.JSON}", + ).as_posix() @pytest.fixture(scope="session") -def tumor_only_pon_config( - tmp_path_factory, - sample_fastq, - balsamic_cache, - analysis_dir, - panel_bed_file, - sentieon_license, - sentieon_install_dir, - pon_cnn, -): - """ - invokes balsamic config sample -t xxx to create sample 
config - for tumor only - """ - case_id = "sample_tumor_only_pon" - tumor = sample_fastq["tumor"] +def cnvkit_pon_creation_config( + case_id_pon: str, + analysis_dir: str, + fastq_dir_pon: str, + panel_bed_file: str, + balsamic_cache: str, + sentieon_license: str, + sentieon_install_dir: str, +) -> str: + """Invoke balsamic config pon to create a configuration file for the CNVkit PON workflow.""" with mock.patch.dict( MOCKED_OS_ENVIRON, @@ -491,30 +1760,83 @@ cli, [ "config", - "case", + "pon", + "--case-id", + case_id_pon, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_pon, "-p", panel_bed_file, - "-t", - tumor, + "--version", + "v5", + "--balsamic-cache", + balsamic_cache, + "--pon-workflow", + PONWorkflow.CNVKIT, + ], + ) + + return Path( + analysis_dir, case_id_pon, f"{case_id_pon}_PON.{FileType.JSON}" + ).as_posix() + + +@pytest.fixture(scope="session") +def gens_pon_creation_config( + case_id_gens_pon: str, + analysis_dir: str, + fastq_dir_gens_pon: str, + balsamic_cache: str, + sentieon_license: str, + sentieon_install_dir: str, + gens_hg19_interval_list: str, +) -> str: + """Invoke balsamic config pon to create a configuration file for the GENS PON workflow.""" + + with mock.patch.dict( + MOCKED_OS_ENVIRON, + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, + ): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "pon", "--case-id", - case_id, + case_id_gens_pon, "--analysis-dir", analysis_dir, + "--fastq-path", + fastq_dir_gens_pon, + "--version", + "v5", "--balsamic-cache", balsamic_cache, - "--pon-cnn", - pon_cnn, + "--pon-workflow", + PONWorkflow.GENS_MALE, + "--genome-interval", + gens_hg19_interval_list, ], ) - return Path(analysis_dir, case_id, case_id + ".json").as_posix() + return Path( + analysis_dir, case_id_gens_pon, f"{case_id_gens_pon}_PON.{FileType.JSON}" + ).as_posix() @pytest.fixture(scope="session") -def sample_config(): - """ - sample config dict to test workflow utils - """ +def sample_config( + tumor_sample_name: str, + normal_sample_name: str, + tumor_normal_fastq_info_correct: dict, +): + """Create and return a sample config dict to test workflow utils.""" sample_config = { "QC": { "picard_rmdup": "False", @@ -537,18 +1859,15 @@ "BALSAMIC_version": "2.9.8", "dag": "tests/test_data/id1/id1_analysis.json_BALSAMIC_2.9.8_graph.pdf", }, + "reference": { + "reference_genome": "tests/test_data/references/genome/human_g1k_v37_decoy.fasta", + "genome_chrom_size": "tests/test_data/references/genome/hg19.chrom.sizes", + }, "vcf": VCF_DICT, - "samples": { - "S1_R": { - "file_prefix": "S1_R", - "type": "tumor", - "readpair_suffix": ["1", "2"], - }, - "S2_R": { - "file_prefix": "S2_R", - "type": "normal", - "readpair_suffix": ["1", "2"], - }, + "samples": tumor_normal_fastq_info_correct, + "panel": { + "capture_kit": "tests/test_data/references/panel/panel.bed", + "pon_cnn": "tests/test_data/references/panel/test_panel_ponn.cnn", + }, "umiworkflow": "true", } @@ -558,25 +1877,25 @@ def sample_config(): @pytest.fixture(scope="session") def analysis_path(): - """Analysis test path""" + """Return path for the test analysis directory.""" return "tests/test_data/qc_files/analysis" @pytest.fixture(scope="session") def multiqc_data_path(analysis_path): - """multiqc_data.json test path""" + """Return path of the MultiQC test data JSON.""" return os.path.join(analysis_path, "qc", "multiqc_data", "multiqc_data.json") @pytest.fixture(scope="session") def multiqc_data_dict(multiqc_data_path): - 
"""multiqc_data.json test path""" + """Read and Return test data from JASON of MultiQC test data""" return read_json(multiqc_data_path) @pytest.fixture(scope="session") def metrics_yaml_path(analysis_path): - """sample_tumor_only_metrics_deliverables.yaml test path""" + """Return path for Tumor-Only deliverable metrics from YAML""" return os.path.join( analysis_path, "qc", "sample_tumor_only_metrics_deliverables.yaml" ) @@ -584,15 +1903,15 @@ def metrics_yaml_path(analysis_path): @pytest.fixture(scope="session") def bcftools_counts_path(analysis_path): - """svdb.all.filtered.pass.stats test path""" + """Return path for svdb.clinical.filtered.pass.stats""" return os.path.join( - analysis_path, "vep", "SNV.somatic.case.svdb.all.filtered.pass.stats" + analysis_path, "vep", "SNV.somatic.case.svdb.clinical.filtered.pass.stats" ) @pytest.fixture(scope="session") def qc_requested_metrics(): - """Raw requested metrics""" + """Return raw requested metrics.""" return { "targeted": { "default": { @@ -615,47 +1934,48 @@ def qc_requested_metrics(): @pytest.fixture(scope="session") -def qc_extracted_metrics(metrics_yaml_path): - """Extracted and formatted QC metrics""" +def qc_extracted_metrics(metrics_yaml_path: str) -> dict: + """Return extracted and formatted QC metrics.""" return read_yaml(metrics_yaml_path) @pytest.fixture(scope="function") -def snakemake_fastqc_rule(tumor_only_config, helpers): - """FastQC snakemake mock rule""" +def snakemake_bcftools_filter_vardict_research_tumor_only( + tumor_only_config_dummy_vep, helpers +): + """bcftools_filter_vardict_research_tumor_only snakemake mock rule""" - helpers.read_config(tumor_only_config) - fastq_path = os.path.join( + helpers.read_config(tumor_only_config_dummy_vep) + vep_path = os.path.join( helpers.analysis_dir, helpers.case_id, "analysis", - "fastq", - "concatenated_tumor_XXXXXX_R_{read}.fastq.gz", + "vep", + "{var_type}.somatic.{case_name}.vardict.research.filtered.pass.vcf.gz", ) - return Map( { - "fastqc": Map( + "bcftools_filter_vardict_research_tumor_only": Map( { "params": Map( { "housekeeper_id": { - "id": "sample_tumor_only", - "tags": "quality-trimmed-seq", + "id": "sample_tumor_only_single", + "tags": "research", } } ), "output": Map( { - "_names": Map({"fastqc": fastq_path}), - "fastqc": fastq_path, + "_names": Map({"vcf_pass_vardict": vep_path}), + "vcf_pass_vardict": vep_path, } ), "rule": Map( { - "name": "fastq", + "name": "bcftools_filter_vardict_research_tumor_only", "output": [ - fastq_path, + vep_path, ], "temp_output": set(), } @@ -664,3 +1984,431 @@ def snakemake_fastqc_rule(tumor_only_config, helpers): ) } ) + + +@pytest.fixture(scope="session", name="timestamp_now") +def fixture_timestamp_now() -> datetime: + """Return a time stamp of current date in date time format.""" + return datetime.now() + + +@pytest.fixture(scope="session", name="cosmic_key") +def fixture_cosmic_key() -> str: + """Return a mocked COSMIC key.""" + return "ZW1haWxAZXhhbXBsZS5jb206bXljb3NtaWNwYXNzd29yZAo=" + + +@pytest.fixture(scope="session", name="develop_containers") +def fixture_develop_containers() -> Dict[str, str]: + """Return a dictionary of docker hub containers for develop branch.""" + return { + DockerContainers.ASCAT: "docker://clinicalgenomics/balsamic:develop-ascatNgs", + DockerContainers.VCF2CYTOSURE: "docker://clinicalgenomics/balsamic:develop-vcf2cytosure", + DockerContainers.PYTHON_3: "docker://clinicalgenomics/balsamic:develop-varcall_py3", + DockerContainers.SOMALIER: "docker://clinicalgenomics/balsamic:develop-somalier", + 
DockerContainers.CNVPYTOR: "docker://clinicalgenomics/balsamic:develop-cnvpytor", + DockerContainers.ALIGN_QC: "docker://clinicalgenomics/balsamic:develop-align_qc", + DockerContainers.ANNOTATE: "docker://clinicalgenomics/balsamic:develop-annotate", + DockerContainers.PYTHON_27: "docker://clinicalgenomics/balsamic:develop-varcall_py27", + DockerContainers.CNVKIT: "docker://clinicalgenomics/balsamic:develop-cnvkit", + DockerContainers.COVERAGE_QC: "docker://clinicalgenomics/balsamic:develop-coverage_qc", + DockerContainers.DELLY: "docker://clinicalgenomics/balsamic:develop-delly", + DockerContainers.CADD: "docker://clinicalgenomics/balsamic:develop-cadd", + DockerContainers.HTSLIB: "docker://clinicalgenomics/balsamic:develop-htslib", + DockerContainers.PURECN: "docker://clinicalgenomics/balsamic:develop-purecn", + DockerContainers.GATK: "docker://clinicalgenomics/balsamic:develop-gatk", + } + + +@pytest.fixture(scope="session", name="cache_config_data") +def fixture_cache_config_data( + cache_analysis: CacheAnalysis, + develop_containers: Dict[str, str], + cosmic_key: str, + timestamp_now: datetime, + session_tmp_path: Path, +) -> Dict[str, Any]: + """Return mocked cache config data.""" + + return { + "analysis": cache_analysis, + "references_dir": session_tmp_path, + "genome_dir": session_tmp_path, + "variants_dir": session_tmp_path, + "vep_dir": session_tmp_path, + "containers_dir": session_tmp_path, + "genome_version": GenomeVersion.HG19, + "cosmic_key": cosmic_key, + "bioinfo_tools": BIOINFO_TOOL_ENV, + "containers": develop_containers, + "references": REFERENCE_FILES[GenomeVersion.HG19], + "references_date": timestamp_now.strftime("%Y-%m-%d %H:%M"), + } + + +@pytest.fixture(scope="session", name="cache_config") +def fixture_cache_config(cache_config_data: Dict[str, dict]) -> CacheConfig: + """Return mocked cache config model.""" + cache_config: CacheConfig = CacheConfig(**cache_config_data) + for reference in cache_config.references: + reference_file: Path = Path(reference[1].file_path) + reference_file.parent.mkdir(parents=True, exist_ok=True) + reference_file.touch() + return cache_config + + +@pytest.fixture(scope="session", name="cache_analysis_data") +def fixture_cache_analysis_data(case_id_tumor_only: str) -> Dict[str, str]: + """Return mocked cache analysis data.""" + return {"case_id": case_id_tumor_only} + + +@pytest.fixture(scope="session", name="cache_analysis") +def fixture_cache_analysis(cache_analysis_data: Dict[str, str]) -> CacheAnalysis: + """Return mocked cache analysis model.""" + return CacheAnalysis(**cache_analysis_data) + + +@pytest.fixture(scope="session", name="refgene_bed_file") +def fixture_refgene_bed_file(session_tmp_path: Path) -> Path: + """Return dummy refseq's gene bed file.""" + refgene_bed_file: Path = Path(session_tmp_path, "genome", "refGene.flat.bed") + refgene_bed_file.touch() + return refgene_bed_file + + +@pytest.fixture(scope="session", name="refgene_flat_file") +def fixture_refgene_flat_file(session_tmp_path: Path) -> Path: + """Return dummy refseq's gene flat file.""" + refgene_flat_file: Path = Path(session_tmp_path, "genome", "refGene.flat") + refgene_flat_file.touch() + return refgene_flat_file + + +@pytest.fixture(scope="session", name="analysis_references_data") +def fixture_analysis_references_data( + cache_config: CacheConfig, + refgene_bed_file: Path, + refgene_flat_file: Path, +) -> Dict[str, Path]: + """Return analysis references model data.""" + return { + "genome_chrom_size": 
Path(cache_config.references.genome_chrom_size.file_path), + "reference_genome": Path(cache_config.references.reference_genome.file_path), + "refgene_bed": refgene_bed_file, + "refgene_flat": refgene_flat_file, + "refgene_txt": Path(cache_config.references.refgene_txt.file_path), + } + + +@pytest.fixture(scope="session", name="cadd_snv_indexed_file") +def fixture_cadd_snv_indexed_file(session_tmp_path: Path) -> Path: + """Return dummy cadd snv indexed file.""" + reference_file: Path = Path( + session_tmp_path, "variants", "hg19.cadd_snv.tsv.gz.tbi" + ) + reference_file.touch() + return reference_file + + +@pytest.fixture(scope="session", name="delly_exclusion_converted_file") +def fixture_delly_exclusion_converted_file(session_tmp_path: Path) -> Path: + """Return dummy delly exclusion converted file.""" + reference_file: Path = Path( + session_tmp_path, "genome", "delly_exclusion_converted.tsv" + ) + reference_file.touch() + return reference_file + + +@pytest.fixture(scope="session", name="clinvar_file") +def fixture_clinvar_file(session_tmp_path: Path) -> Path: + """Return dummy clinvar file.""" + clinvar_file: Path = Path(session_tmp_path, "variants", "clinvar.vcf.gz") + clinvar_file.touch() + return clinvar_file + + +@pytest.fixture(scope="session", name="cosmic_file") +def fixture_cosmic_file(session_tmp_path: Path) -> Path: + """Return dummy cosmic file.""" + cosmic_file: Path = Path( + session_tmp_path, "variants", "cosmic_coding_muts_v97.vcf.gz" + ) + cosmic_file.touch() + return cosmic_file + + +@pytest.fixture(scope="session", name="dbsnp_file") +def fixture_dbsnp_file(session_tmp_path: Path) -> Path: + """Return dummy dbsnp file.""" + dbsnp_file: Path = Path(session_tmp_path, "variants", "dbsnp_grch37_b138.vcf.gz") + dbsnp_file.touch() + return dbsnp_file + + +@pytest.fixture(scope="session", name="hc_vcf_1kg_file") +def fixture_hc_vcf_1kg(session_tmp_path: Path) -> Path: + """Return dummy high confidence 1000 genome vcf file.""" + hc_vcf_1kg_file: Path = Path( + session_tmp_path, "variants", "1kg_phase1_snps_high_confidence_b37.vcf.gz" + ) + hc_vcf_1kg_file.touch() + return hc_vcf_1kg_file + + +@pytest.fixture(scope="session", name="known_indel_1kg_file") +def fixture_known_indel_1kg_file(session_tmp_path: Path) -> Path: + """Return dummy 1000 genome known indels vcf file.""" + known_indel_1kg_file: Path = Path( + session_tmp_path, "variants", "1kg_known_indels_b37.vcf.gz" + ) + known_indel_1kg_file.touch() + return known_indel_1kg_file + + +@pytest.fixture(scope="session", name="mills_1kg_file") +def fixture_mills_1kg_file(session_tmp_path: Path) -> Path: + """Return dummy Mills' high confidence indels vcf file.""" + mills_1kg_file: Path = Path(session_tmp_path, "variants", "mills_1kg_index.vcf.gz") + mills_1kg_file.touch() + return mills_1kg_file + + +@pytest.fixture(scope="session", name="somalier_sites_file") +def fixture_somalier_sites_file(session_tmp_path: Path) -> Path: + """Return dummy somalier sites vcf file.""" + somalier_sites_file: Path = Path( + session_tmp_path, "variants", "GRCh37.somalier.sites.vcf.gz" + ) + somalier_sites_file.touch() + return somalier_sites_file + + +@pytest.fixture(scope="session", name="vcf_1kg_file") +def fixture_vcf_1kg_file(session_tmp_path: Path) -> Path: + """Return dummy 1000 genome all snps file.""" + vcf_1kg_file: Path = Path( + session_tmp_path, "variants", "1k_genome_wgs_p1_v3_all_sites.vcf.gz" + ) + vcf_1kg_file.touch() + return vcf_1kg_file + + +@pytest.fixture(scope="session", name="analysis_references_hg_data") +def 
fixture_analysis_references_hg_data( + cache_config: CacheConfig, + analysis_references_data: Dict[str, Path], + delly_exclusion_converted_file: Path, + clinvar_file: Path, + cosmic_file: Path, + dbsnp_file: Path, + hc_vcf_1kg_file: Path, + known_indel_1kg_file: Path, + mills_1kg_file: Path, + somalier_sites_file: Path, + vcf_1kg_file: Path, +) -> Dict[str, Path]: + """Return human genome analysis references model data.""" + analysis_references_hg_data: Dict[str, Path] = { + "access_regions": Path(cache_config.references.access_regions.file_path), + "ascat_chr_y_loci": Path(cache_config.references.ascat_chr_y_loci.file_path), + "ascat_gc_correction": Path( + cache_config.references.ascat_gc_correction.file_path + ), + "cadd_snv": Path(cache_config.references.cadd_snv.file_path), + "simple_repeat": Path(cache_config.references.simple_repeat.file_path), + "clinvar": clinvar_file, + "cosmic": cosmic_file, + "dbsnp": dbsnp_file, + "delly_exclusion": Path(cache_config.references.delly_exclusion.file_path), + "delly_exclusion_converted": delly_exclusion_converted_file, + "delly_mappability": Path(cache_config.references.delly_mappability.file_path), + "gnomad_variant": Path(cache_config.references.gnomad_variant.file_path), + "hc_vcf_1kg": hc_vcf_1kg_file, + "known_indel_1kg": known_indel_1kg_file, + "mills_1kg": mills_1kg_file, + "rank_score": Path(cache_config.references.rank_score.file_path), + "somalier_sites": somalier_sites_file, + "vcf_1kg": vcf_1kg_file, + "vep_dir": cache_config.references_dir, + "wgs_calling_regions": Path( + cache_config.references.wgs_calling_regions.file_path + ), + } + analysis_references_hg_data.update(analysis_references_data) + return analysis_references_hg_data + + +@pytest.fixture(scope="session", name="analysis_references_hg") +def fixture_analysis_references_hg( + analysis_references_hg_data: Dict[str, Path] +) -> AnalysisReferencesHg: + """Return mocked human genome analysis references model.""" + return AnalysisReferencesHg(**analysis_references_hg_data) + + +@pytest.fixture(scope="session", name="reference_url") +def fixture_reference_url() -> Url: + """Return dummy reference URL.""" + return Url("gs://gatk-legacy-bundles/b37/reference.vcf.gz") + + +@pytest.fixture(scope="session", name="reference_file") +def fixture_reference_file(session_tmp_path: Path) -> Path: + """Return dummy reference file.""" + reference_file: Path = Path(session_tmp_path, "reference.vcf") + reference_file.touch() + return reference_file + + +@pytest.fixture(scope="session", name="reference_url_data") +def fixture_reference_url_data( + reference_url: Url, reference_file: Path, cosmic_key: str +) -> Dict[str, Any]: + """Return reference URL model data.""" + return { + "url": reference_url, + "file_type": FileType.VCF, + "gzip": False, + "file_name": "reference.vcf", + "dir_name": "variants", + "file_path": reference_file.as_posix(), + "secret": cosmic_key, + } + + +@pytest.fixture(scope="session", name="references_data") +def fixture_references_data( + cache_config: CacheConfig, +) -> Dict[str, dict]: + """Return references model data.""" + return { + "genome_chrom_size": cache_config.references.genome_chrom_size, + "reference_genome": cache_config.references.reference_genome, + "refgene_sql": cache_config.references.refgene_sql, + "refgene_txt": cache_config.references.refgene_txt, + } + + +@pytest.fixture(scope="session", name="references") +def fixture_references(references_data: Dict[str, dict]) -> References: + """Return mocked references model.""" + return 
References(**references_data) + + +@pytest.fixture(scope="session", name="references_hg_data") +def fixture_references_hg_data( + cache_config: CacheConfig, +) -> Dict[str, dict]: + """Return human genome references model data.""" + return dict(cache_config.references) + + +@pytest.fixture(scope="session", name="references_hg") +def fixture_references_hg(references_hg_data: Dict[str, dict]) -> ReferencesHg: + """Return mocked human genome references model.""" + return ReferencesHg(**references_hg_data) + + +@pytest.fixture(scope="session", name="singularity_bind_path_data") +def fixture_singularity_bind_path_data(session_tmp_path: Path) -> Dict[str, Path]: + """Return singularity bind path data.""" + return {"source": session_tmp_path, "destination": Path("/")} + + +@pytest.fixture(scope="session", name="singularity_bind_path") +def fixture_singularity_bind_path( + singularity_bind_path_data: Dict[str, Path] +) -> SingularityBindPath: + """Return mocked singularity bind path model.""" + return SingularityBindPath(**singularity_bind_path_data) + + +@pytest.fixture(scope="session", name="snakemake_options_command") +def fixture_snakemake_options_command() -> List[str]: + """Return mocked snakemake options command.""" + return ["--cores", "36"] + + +@pytest.fixture(scope="session", name="mail_user_option") +def fixture_mail_user_option() -> str: + """Return mail user option.""" + return "balsamic@scilifelab.se" + + +@pytest.fixture(scope="session", name="snakemake_executable_data") +def fixture_snakemake_executable_data( + case_id_tumor_only: str, + reference_file: Path, + session_tmp_path: Path, + mail_user_option: str, + singularity_bind_path: SingularityBindPath, + snakemake_options_command: List[str], +) -> Dict[str, Any]: + """Return snakemake executable model data.""" + return { + "account": ClusterAccount.DEVELOPMENT, + "case_id": case_id_tumor_only, + "cluster_config_path": reference_file, + "config_path": reference_file, + "disable_variant_caller": "tnscope,vardict", + "log_dir": session_tmp_path, + "mail_user": mail_user_option, + "profile": ClusterProfile.SLURM, + "qos": QOS.HIGH, + "quiet": True, + "result_dir": session_tmp_path, + "run_analysis": True, + "run_mode": RunMode.CLUSTER, + "script_dir": session_tmp_path, + "singularity_bind_paths": [singularity_bind_path], + "snakefile": reference_file, + "snakemake_options": snakemake_options_command, + "working_dir": session_tmp_path, + } + + +@pytest.fixture(scope="session", name="snakemake_executable") +def fixture_snakemake_executable( + snakemake_executable_data: Dict[str, Any] +) -> SnakemakeExecutable: + """Return mocked snakemake executable model.""" + return SnakemakeExecutable(**snakemake_executable_data) + + +@pytest.fixture(scope="session", name="snakemake_executable_validated_data") +def fixture_snakemake_executable_validated_data( + case_id_tumor_only: str, + reference_file: Path, + session_tmp_path: Path, + mail_user_option: str, + singularity_bind_path: SingularityBindPath, + snakemake_options_command: List[str], +) -> Dict[str, Any]: + """Return snakemake model expected data.""" + return { + "account": ClusterAccount.DEVELOPMENT, + "benchmark": False, + "case_id": case_id_tumor_only, + "cluster_config_path": reference_file, + "config_path": reference_file, + "disable_variant_caller": "disable_variant_caller=tnscope,vardict", + "dragen": False, + "force": False, + "log_dir": session_tmp_path, + "mail_type": None, + "mail_user": f"--mail-user {mail_user_option}", + "profile": ClusterProfile.SLURM, + "qos": QOS.HIGH, 
+ "quiet": True, + "report_path": None, + "result_dir": session_tmp_path, + "run_analysis": True, + "run_mode": RunMode.CLUSTER, + "script_dir": session_tmp_path, + "singularity_bind_paths": [singularity_bind_path], + "snakefile": reference_file, + "snakemake_options": snakemake_options_command, + "working_dir": session_tmp_path, + } diff --git a/tests/models/__init__.py b/tests/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/models/test_cache_models.py b/tests/models/test_cache_models.py new file mode 100644 index 000000000..fd373c97b --- /dev/null +++ b/tests/models/test_cache_models.py @@ -0,0 +1,626 @@ +"""Test reference cache models.""" +from pathlib import Path +from typing import Dict, Any, List + +import pytest +from _pytest.logging import LogCaptureFixture +from pydantic import ValidationError + +from BALSAMIC.constants.cache import ( + GRCHVersion, + DockerContainers, + GenomeVersion, +) +from BALSAMIC.constants.constants import FileType, BwaIndexFileType +from BALSAMIC.models.cache import ( + AnalysisReferences, + AnalysisReferencesCanFam, + AnalysisReferencesHg, + ReferenceUrl, + References, + ReferencesCanFam, + ReferencesHg, + CacheAnalysis, + CacheConfig, +) +from BALSAMIC.utils.exc import BalsamicError + + +def test_analysis_references(analysis_references_data: Dict[str, Path]): + """Test common analysis references model.""" + + # GIVEN an input for the analysis reference model + + # WHEN initialising the model + model: AnalysisReferences = AnalysisReferences(**analysis_references_data) + + # THEN the model should have been correctly built + assert model.model_dump() == analysis_references_data + + +def test_analysis_references_empty(): + """Test common analysis references model for an empty input.""" + + # GIVEN no input for the analysis reference model + + # WHEN initialising the model + with pytest.raises(ValidationError): + # THEN an empty model should raise a ValidationError + AnalysisReferences() + + +def test_analysis_references_canfam(analysis_references_data: Dict[str, Path]): + """Test canine analysis references model.""" + + # GIVEN an input for the canine analysis reference model + + # WHEN initialising the model + model: AnalysisReferencesCanFam = AnalysisReferencesCanFam( + **analysis_references_data + ) + + # THEN the model should have been correctly built + assert model.model_dump() == analysis_references_data + + +def test_analysis_references_canfam_empty(): + """Test canine analysis references model for an empty input.""" + + # GIVEN no input for the canine analysis reference model + + # WHEN initialising the model + with pytest.raises(ValidationError): + # THEN an empty model should raise a ValidationError + AnalysisReferencesCanFam() + + +def test_analysis_references_hg(analysis_references_hg_data: Dict[str, Path]): + """Test human genome analysis references model.""" + + # GIVEN an input for the human genome analysis reference model + + # WHEN initialising the model + model: AnalysisReferencesHg = AnalysisReferencesHg(**analysis_references_hg_data) + + # THEN the model should have been correctly built + assert model.model_dump() == analysis_references_hg_data + + +def test_analysis_references_hg_empty(): + """Test human genome analysis references model for an empty input.""" + + # GIVEN no input for the human genome analysis reference model + + # WHEN initialising the model + with pytest.raises(ValidationError): + # THEN an empty model should raise a ValidationError + AnalysisReferencesHg() + + +def 
test_reference_url(reference_url_data: Dict[str, Any]): + """Test references URL model.""" + + # GIVEN an input for the reference URL model + + # WHEN initialising the model + model: ReferenceUrl = ReferenceUrl(**reference_url_data) + + # THEN the model should have been correctly built + assert model.model_dump() == reference_url_data + + +def test_reference_url_empty(): + """Test references URL model for an empty input.""" + + # GIVEN no input for the references URL model + + # WHEN initialising the model + with pytest.raises(ValidationError): + # THEN an empty model should raise a ValidationError + ReferenceUrl() + + +def test_references(references_data: Dict[str, dict], references: References): + """Test references model.""" + + # GIVEN an input for the reference model + + # WHEN initialising the model + model: References = References(**references_data) + + # THEN the model should have been correctly built + assert model == references + + +def test_references_empty(): + """Test references model for an empty input.""" + + # GIVEN no input for the references model + + # WHEN initialising the model + with pytest.raises(ValidationError): + # THEN an empty model should raise a ValidationError + References() + + +def test_get_reference_genome_file_paths(references: References): + """Test reference genome files retrieval.""" + + # GIVEN a references model + + # GIVEN the expected files to be retrieved + expected_file_types: set = {FileType.FASTA, FileType.FAI, FileType.DICT} + expected_file_types.update(BwaIndexFileType) + + # WHEN getting the reference genome files + reference_genome_files: List[str] = references.get_reference_genome_file_paths() + + # THEN the expected reference genome files should be returned + assert len(reference_genome_files) == len(expected_file_types) + for file_type in expected_file_types: + assert file_type in [file.split(".")[-1] for file in reference_genome_files] + + +def test_get_reference_genome_bwa_index_file_paths(references: References): + """Test extraction of reference genome BWA index files.""" + + # GIVEN a references model + + # GIVEN the expected files to be retrieved + expected_file_types: set = set(BwaIndexFileType) + + # WHEN getting the reference genome BWA index files + bwa_index_files: List[str] = references.get_reference_genome_bwa_index_file_paths() + + # THEN the expected reference genome BWA index files should be returned + assert len(bwa_index_files) == len(expected_file_types) + for file_type in expected_file_types: + assert file_type in [file.split(".")[-1] for file in bwa_index_files] + + +def test_get_refgene_file_paths( + references: References, refgene_bed_file: Path, refgene_flat_file: Path +): + """Test extraction of RefSeq's gene files.""" + + # GIVEN a references model and some mocked RefSeq's gene file + + # WHEN getting the RefSeq's gene files + refgene_files: List[str] = references.get_refgene_file_paths() + + # THEN the expected RefSeq's gene files should be returned + assert len(refgene_files) == 3 + assert references.refgene_txt.file_path in refgene_files + assert refgene_bed_file.as_posix() in refgene_files + assert refgene_flat_file.as_posix() in refgene_files + + +def test_get_refgene_flat_file_path(references: References, refgene_flat_file: Path): + """Test extraction of RefSeq's gene FLAT file.""" + + # GIVEN a references model and a mocked RefSeq's gene FLAT file + + # WHEN getting the RefSeq's gene FLAT file + refgene_output_file: str = references.get_refgene_flat_file_path() + + # THEN the correctly formatted flat file 
should be returned + assert refgene_output_file == refgene_flat_file.as_posix() + + +def test_get_refgene_bed_file_path(references: References, refgene_bed_file: Path): + """Test extraction of RefSeq's gene BED file.""" + + # GIVEN a references model and a mocked RefSeq's gene BED file + + # WHEN getting the RefSeq's gene BED file + refgene_output_file: str = references.get_refgene_bed_file_path() + + # THEN the correctly formatted BED file should be returned + assert refgene_output_file == refgene_bed_file.as_posix() + + +def test_references_canfam(references_data: Dict[str, dict], references: References): + """Test canine references model.""" + + # GIVEN an input for the canine reference model + + # WHEN initialising the model + model: ReferencesCanFam = ReferencesCanFam(**references_data) + + # THEN the model should have been correctly built + assert model.model_dump() == references.model_dump() + + +def test_references_canfam_empty(): + """Test canine references model for an empty input.""" + + # GIVEN no input for the canine references model + + # WHEN initialising the model + with pytest.raises(ValidationError): + # THEN an empty model should raise a ValidationError + ReferencesCanFam() + + +def test_references_hg( + references_hg_data: Dict[str, dict], references_hg: ReferencesHg +): + """Test human genome references model.""" + + # GIVEN an input for the human genome reference model + + # WHEN initialising the model + model: ReferencesHg = ReferencesHg(**references_hg_data) + + # THEN the model should have been correctly built + assert model == references_hg + + +def test_references_hg_empty(): + """Test human genome references model for an empty input.""" + + # GIVEN no input for the human genome references model + + # WHEN initialising the model + with pytest.raises(ValidationError): + # THEN an empty model should raise a ValidationError + ReferencesHg() + + +def test_get_cadd_snv_file_paths( + references_hg: ReferencesHg, cadd_snv_indexed_file: Path +): + """Test get CADD SNV reference output files.""" + + # GIVEN a human genome references model and a mocked CADD SNV indexed file + + # WHEN getting the CADD specific reference files + cadd_snv_files: List[str] = references_hg.get_cadd_snv_file_paths() + + # THEN all the CADD SNV reference files should be returned + assert len(cadd_snv_files) == 2 + assert references_hg.cadd_snv.file_path in cadd_snv_files + assert cadd_snv_indexed_file.as_posix() in cadd_snv_files + + +def test_get_delly_file_paths( + references_hg: ReferencesHg, delly_exclusion_converted_file: Path +): + """Test Delly specific files retrieval.""" + + # GIVEN a human genome references model and a mocked Delly exclusion converted file + + # WHEN getting the Delly specific reference files + delly_files: List[str] = references_hg.get_delly_file_paths() + + # THEN all the delly reference files should be returned + assert len(delly_files) == 5 + assert references_hg.delly_exclusion.file_path in delly_files + assert delly_exclusion_converted_file.as_posix() in delly_files + assert references_hg.delly_mappability.file_path in delly_files + assert references_hg.delly_mappability_findex.file_path in delly_files + assert references_hg.delly_mappability_gindex.file_path in delly_files + + +def test_get_delly_exclusion_converted_file_path( + references_hg: ReferencesHg, delly_exclusion_converted_file: Path +): + """Test get Delly exclusion converted file.""" + + # GIVEN a human genome references model and a delly exclusion converted file + + # WHEN getting the Delly
exclusion converted file + converted_file: str = references_hg.get_delly_exclusion_converted_file_path() + + # THEN the returned file should match the expected one + assert converted_file == delly_exclusion_converted_file.as_posix() + + +def test_get_gnomad_file_paths(references_hg: ReferencesHg): + """Test get gnomad reference files.""" + + # GIVEN a human genome references model + + # WHEN getting the gnomad reference files + gnomad_files: List[str] = references_hg.get_gnomad_file_paths() + + # THEN the gnomad files should be returned + assert len(gnomad_files) == 2 + assert references_hg.gnomad_variant.file_path in gnomad_files + assert references_hg.gnomad_variant_index.file_path in gnomad_files + + +def test_get_1k_genome_file_paths(references_hg: ReferencesHg): + """Test get 1000 Genome related files.""" + + # GIVEN a human genome references model + + # WHEN getting the 1k genome files + genome_files: List[str] = references_hg.get_1k_genome_file_paths() + + # THEN the 1k genome files should be returned + assert len(genome_files) == 4 + assert f"{references_hg.known_indel_1kg.file_path}.{FileType.GZ}" in genome_files + assert f"{references_hg.mills_1kg.file_path}.{FileType.GZ}" in genome_files + assert f"{references_hg.hc_vcf_1kg.file_path}.{FileType.GZ}" in genome_files + assert f"{references_hg.vcf_1kg.file_path}.{FileType.GZ}" in genome_files + + +def test_cache_analysis(cache_analysis_data: Dict[str, str]): + """Test cache analysis model initialisation.""" + + # GIVEN an input for the cache analysis model + + # WHEN initialising the model + model: CacheAnalysis = CacheAnalysis(**cache_analysis_data) + + # THEN the model should have been correctly built + assert model.model_dump() == cache_analysis_data + + +def test_cache_analysis_empty(): + """Test cache analysis model for an empty input.""" + + # GIVEN no input for the cache analysis model + + # WHEN initialising the model + with pytest.raises(ValidationError): + # THEN an empty model should raise a ValidationError + CacheAnalysis() + + +def test_cache_config(cache_config_data: Dict[str, Any], cache_config: CacheConfig): + """Test cache config model initialisation.""" + + # GIVEN an input for the cache config model and a mocked reference model + + # WHEN initialising the model + model: CacheConfig = CacheConfig(**cache_config_data) + + # THEN the model should have been correctly built + assert model == cache_config + + +def test_cache_config_empty(): + """Test cache config model for an empty input.""" + + # GIVEN no input for the cache config model + + # WHEN initialising the model + with pytest.raises(ValidationError): + # THEN an empty model should raise a ValidationError + CacheConfig() + + +def test_cache_config_empty_file_path(cache_config_data: Dict[str, dict]): + """Test cache config model reference validation method and file path assignment.""" + + # GIVEN a cache config model data with empty file paths + + # WHEN initialising the model + model: CacheConfig = CacheConfig(**cache_config_data) + + # THEN the file paths should have been assigned + for reference in model.references: + assert reference[1].file_path + + +def test_cache_config_empty_cosmic_key( + cache_config_data: Dict[str, dict], cosmic_key: str +): + """Test cache config model reference validation method and cosmic key assignment.""" + + # GIVEN a cache config model data with empty cosmic keys + + # WHEN initialising the model + model: CacheConfig = CacheConfig(**cache_config_data) + + # THEN a cosmic key should only have been assigned to a cosmic reference
file + for reference in model.references: + if reference[0] == "cosmic": + assert reference[1].secret == cosmic_key + continue + assert reference[1].secret is None + + +def test_get_grch_version(cache_config: CacheConfig): + """Test extraction of the GRCH format version given a specific genome version.""" + + # GIVEN a cache config model + + # WHEN getting the GRCH version + grch_version: GRCHVersion = cache_config.get_grch_version() + + # THEN a correct GRCH format version should be returned + assert grch_version == GRCHVersion.GRCH37 + + +def test_get_reference_file_paths(cache_config: CacheConfig): + """Test reference path extraction.""" + + # GIVEN a cache config model + + # WHEN extracting the list of reference paths + reference_paths: List[str] = cache_config.get_reference_file_paths() + + # THEN a complete list of reference paths should be returned + assert reference_paths == [ + reference[1].file_path for reference in cache_config.references + ] + + +def test_get_reference_by_path(cache_config: CacheConfig): + """Test reference extraction given its path.""" + + # GIVEN a cache config model + + # WHEN getting the reference genome by path + reference_genome: ReferenceUrl = cache_config.get_reference_by_path( + reference_path=cache_config.references.reference_genome.file_path + ) + + # THEN the correct reference should be returned + assert reference_genome == cache_config.references.reference_genome + + +def test_get_reference_by_path_error( + cache_config: CacheConfig, invalid_json_file: Path, caplog: LogCaptureFixture +): + """Test reference extraction given an invalid path.""" + + # GIVEN a cache config model + + # WHEN getting the reference genome by path + with pytest.raises(BalsamicError): + cache_config.get_reference_by_path(reference_path=invalid_json_file.as_posix()) + + # THEN a Balsamic error should be raised + assert ( + f"No reference with the provided reference path {invalid_json_file.as_posix()}" + in caplog.text + ) + + +def test_get_reference_file_paths_by_file_type_and_compression( + cache_config: CacheConfig, +): + """Test reference path extraction by file type and compression.""" + + # GIVEN a cache config model + + # WHEN extracting the reference paths by file type and compression status + reference_paths: List[ + str + ] = cache_config.get_reference_file_paths_by_file_type_and_compression( + file_type=FileType.FASTA, compression=True + ) + + # THEN the expected reference path should be returned + assert reference_paths == [cache_config.references.reference_genome.file_path] + + +def test_get_reference_file_paths_by_file_type(cache_config: CacheConfig): + """Test reference path extraction by file type.""" + + # GIVEN a cache config model + + # WHEN extracting the reference paths by file type + reference_paths: List[str] = cache_config.get_reference_file_paths_by_file_type( + file_type=FileType.FASTA + ) + + # THEN the FASTA file should be returned + assert reference_paths == [cache_config.references.reference_genome.file_path] + + +def test_get_reference_file_paths_by_compression(cache_config: CacheConfig): + """Test reference path extraction by compression.""" + + # GIVEN a cache config model + + # WHEN extracting the reference paths by compression status + reference_paths: List[str] = cache_config.get_reference_file_paths_by_compression( + compression=True + ) + + # THEN the expected reference path should be returned + assert len(reference_paths) == 11 + for reference in [ + cache_config.references.ascat_gc_correction.file_path, + 
cache_config.references.clinvar.file_path, + cache_config.references.cosmic.file_path, + cache_config.references.dbsnp.file_path, + cache_config.references.hc_vcf_1kg.file_path, + cache_config.references.known_indel_1kg.file_path, + cache_config.references.mills_1kg.file_path, + cache_config.references.reference_genome.file_path, + cache_config.references.refgene_txt.file_path, + cache_config.references.somalier_sites.file_path, + cache_config.references.vcf_1kg.file_path, + ]: + assert reference in reference_paths + + +def test_get_compressed_indexed_vcf_paths(cache_config: CacheConfig): + """Test get compressed indexed VCFs.""" + + # GIVEN a cache config model + + # WHEN retrieving the compressed and indexed VCFs + compressed_indexed_vcfs: List[str] = cache_config.get_compressed_indexed_vcf_paths() + + # THEN the indexed VCFs should be returned + assert len(compressed_indexed_vcfs) == 8 + for reference in [ + cache_config.references.dbsnp.file_path, + cache_config.references.vcf_1kg.file_path, + cache_config.references.known_indel_1kg.file_path, + cache_config.references.mills_1kg.file_path, + cache_config.references.clinvar.file_path, + cache_config.references.somalier_sites.file_path, + cache_config.references.hc_vcf_1kg.file_path, + cache_config.references.cosmic.file_path, + ]: + assert f"{reference}.{FileType.GZ}.{FileType.TBI}" in compressed_indexed_vcfs + + +def test_get_container_output_paths(cache_config: CacheConfig): + """Test retrieval of the containers output paths.""" + + # GIVEN a cache config model + + # WHEN getting the list of container paths + container_paths: List[str] = cache_config.get_container_output_paths() + + # THEN all the container paths should be returned + assert len(container_paths) == len(set(DockerContainers)) + for container in set(DockerContainers): + assert f"{container}.{FileType.SIF}" in [Path(path).name for path in container_paths] + + +def test_get_reference_output_paths(cache_config: CacheConfig): + """Test get reference list to be downloaded.""" + + # GIVEN a cache config model + + # WHEN retrieving the reference output paths + reference_output_paths: List[str] = cache_config.get_reference_output_paths() + + # THEN all the reference paths should be returned + assert len(reference_output_paths) == 45 + + +def test_get_analysis_references_hg( + cache_config: CacheConfig, + analysis_references_hg_data: Dict[str, Path], + analysis_references_hg: AnalysisReferences, +): + """Test analysis references retrieval to be used for Balsamic human genome analyses.""" + + # GIVEN a human genome cache config model + cache_config.genome_version = GenomeVersion.HG19 + + # WHEN getting the analysis references + analysis_references: AnalysisReferencesHg = cache_config.get_analysis_references() + + # THEN the retrieved analysis references should match the mocked one + assert type(analysis_references) is AnalysisReferencesHg + assert analysis_references == analysis_references_hg + + +def test_get_analysis_references_canfam( + cache_config: CacheConfig, analysis_references_data: Dict[str, Path] +): + """Test analysis references retrieval to be used for Balsamic canine analyses.""" + + # GIVEN a canine cache config model + cache_config.genome_version = GenomeVersion.CanFam3 + + # WHEN getting the analysis references + analysis_references: AnalysisReferencesCanFam = ( + cache_config.get_analysis_references() + ) + + # THEN the retrieved analysis references should match the mocked one + assert type(analysis_references) is AnalysisReferencesCanFam + assert analysis_references.model_dump() ==
analysis_references_data diff --git a/tests/models/test_config_models.py b/tests/models/test_config_models.py new file mode 100644 index 000000000..34d36dc7c --- /dev/null +++ b/tests/models/test_config_models.py @@ -0,0 +1,412 @@ +import copy +from datetime import datetime +from pathlib import Path +from typing import Dict, List + +import pytest +from pydantic import ValidationError + +from BALSAMIC.constants.analysis import FastqName, SampleType, SequencingType +from BALSAMIC.models.config import AnalysisModel, ConfigModel, SampleInstanceModel + + +def test_analysis_model(test_data_dir: Path, timestamp_now: datetime): + """Test analysis model instantiation.""" + + # GIVEN valid input arguments + valid_args = { + "case_id": "case_id", + "gender": "female", + "analysis_type": "paired", + "sequencing_type": "targeted", + "analysis_dir": test_data_dir.as_posix(), + "fastq_path": test_data_dir.as_posix(), + "log": test_data_dir.as_posix(), + "result": test_data_dir.as_posix(), + "script": test_data_dir.as_posix(), + "benchmark": test_data_dir.as_posix(), + "dag": test_data_dir.as_posix(), + "config_creation_date": str(timestamp_now), + "analysis_workflow": "balsamic-umi", + } + + # THEN we can successfully create a config dict + assert AnalysisModel.model_validate(valid_args) + + # GIVEN invalid input arguments + invalid_args = { + "case_id": "case_id", + "gender": "unknown", + "analysis_type": "odd", + "sequencing_type": "wrong", + "analysis_dir": "tests/test_data", + "analysis_workflow": "umi", + } + + # THEN should trigger ValueError + with pytest.raises(ValueError): + AnalysisModel.model_validate(invalid_args) + + +def test_sample_instance_model(config_dict_w_fastqs: Dict): + """Test sample instance model initialisation.""" + + # GIVEN a sample list + sample_list = config_dict_w_fastqs["samples"] + + # WHEN parsing the sample dictionary + for idx, sample in enumerate(sample_list): + sample: SampleInstanceModel = SampleInstanceModel.model_validate(sample) + + sample_dict_copy = sample_list[idx].copy() + for fastq_pattern, values in sample_dict_copy["fastq_info"].items(): + values["fwd"] = Path(values["fwd"]).resolve().as_posix() + values["rev"] = Path(values["rev"]).resolve().as_posix() + + # THEN the sample model should be correctly initialised + assert sample.model_dump(exclude_none=True) == sample_dict_copy + + +def test_sample_instance_model_sample_type_error(tumor_normal_fastq_info_correct: Dict): + """Test sample instance model error raise.""" + + # GIVEN a sample dictionary with an invalid sample type + samples: List[Dict] = copy.deepcopy(tumor_normal_fastq_info_correct) + illegal_sample_type: str = "affected" + tumor_dict = samples[0] + tumor_dict["type"] = illegal_sample_type + + # WHEN parsing the sample dictionary + with pytest.raises(ValueError) as exc: + SampleInstanceModel.model_validate(tumor_dict) + + # THEN a ValueError should be triggered + assert "Input should be 'normal' or 'tumor'" in str(exc.value) + + +def test_analysis_model_for_pon(test_data_dir: Path, timestamp_now: datetime): + """Tests PON model parsing.""" + + # GIVEN valid input arguments + valid_args = { + "case_id": "case_id", + "analysis_type": "pon", + "sequencing_type": "targeted", + "analysis_dir": test_data_dir.as_posix(), + "fastq_path": test_data_dir.as_posix(), + "log": test_data_dir.as_posix(), + "result": test_data_dir.as_posix(), + "script": test_data_dir.as_posix(), + "benchmark": test_data_dir.as_posix(), + "dag": test_data_dir.as_posix(), + "analysis_workflow": "balsamic", + 
"config_creation_date": str(timestamp_now), + "pon_version": "v1", + } + + # THEN we can successfully create a config dict + assert AnalysisModel.model_validate(valid_args) + + # GIVEN an invalid version argument + invalid_args = { + "case_id": "case_id", + "analysis_type": "pon", + "sequencing_type": "targeted", + "analysis_dir": test_data_dir, + "fastq_path": test_data_dir, + "analysis_workflow": "balsamic", + "pon_version": "v01", + } + + # THEN should trigger ValueError + with pytest.raises(ValidationError) as excinfo: + AnalysisModel.model_validate(invalid_args) + + assert ( + f"The provided PON version ({invalid_args['pon_version']}) does not follow the defined syntax (v)" + in str(excinfo.value) + ) + + +def test_detect_duplicate_fastq_pattern( + config_w_fastq_dir_for_duplicate_fastq_patterns_model: Dict, +): + """Test balsamic models ability to detect duplicate assigned fastq patterns.""" + config_dict = config_w_fastq_dir_for_duplicate_fastq_patterns_model + # Initialize balsamic model + with pytest.raises(ValueError) as exc: + ConfigModel.model_validate(config_dict) + + assert ( + "Duplicate FastqPattern(s) found: ACC1_S1_L001_R across multiple samples" + in str(exc.value) + ) + + +def test_detection_unassigned_fastq_file(config_tumor_normal_extrafile: Dict): + """Test instantiating balsamic model with fastq dir containing unassigned fastq-files.""" + # Initialize balsamic model + with pytest.raises(ValueError) as exc: + ConfigModel.model_validate(config_tumor_normal_extrafile) + + assert "Fastqs in fastq-dir not assigned to sample config:" in str(exc.value) + + +def test_get_all_sample_names(balsamic_model: ConfigModel): + """Validate retrieval of all sample names in analysis from ConfigModel.""" + sample_names = balsamic_model.get_all_sample_names() + assert ["ACC1", "ACC2"] == sample_names + + +def test_get_fastq_patterns_by_sample( + balsamic_model: ConfigModel, tumor_sample_name: str, normal_sample_name: str +): + """Validate retrieval of fastq-pattern by sample from ConfigModel.""" + + def compare_fastq_pattern_lists(expected: List[str], found: List[str]): + assert all( + fastq_pattern in found for fastq_pattern in expected + ), "Not all expected fastq patterns found." 
+ assert len(expected) == len(found), "Not same number of fastq patterns" + + tumor_fastq_patterns_expected = [ + "1_171015_HJ7TLDSX5_ACC1_XXXXXX", + "2_171015_HJ7TLDSX5_ACC1_XXXXXX", + ] + normal_fastq_patterns_expected = [ + "1_171015_HJ7TLDSX5_ACC2_XXXXXX", + "2_171015_HJ7TLDSX5_ACC2_XXXXXX", + ] + + fastq_patterns_all_expected = ( + tumor_fastq_patterns_expected + normal_fastq_patterns_expected + ) + + tumor_fastq_patterns = balsamic_model.get_fastq_patterns_by_sample( + [tumor_sample_name] + ) + normal_fastq_patterns = balsamic_model.get_fastq_patterns_by_sample( + [normal_sample_name] + ) + fastq_patterns_all = balsamic_model.get_fastq_patterns_by_sample( + [tumor_sample_name, normal_sample_name] + ) + + compare_fastq_pattern_lists(tumor_fastq_patterns_expected, tumor_fastq_patterns) + compare_fastq_pattern_lists(normal_fastq_patterns_expected, normal_fastq_patterns) + compare_fastq_pattern_lists(fastq_patterns_all_expected, fastq_patterns_all) + + +def test_get_all_fastqs_for_sample(balsamic_model: ConfigModel, tumor_sample_name: str): + """Validate retrieval of fastq-files by sample and fastq-type from ConfigModel.""" + + def compare_fastq_file_lists(expected: List[str], found: List[str]): + found_file_names = [] + for found_file in found: + found_file_names.append(Path(found_file).name) + assert all( + fastq_file in found_file_names for fastq_file in expected + ), f"Not all expected fastq files found. {expected}: {found_file_names}" + assert len(expected) == len(found), "Not same number of fastq files" + + fwd_fastq_files_expected = [ + "1_171015_HJ7TLDSX5_ACC1_XXXXXX_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACC1_XXXXXX_1.fastq.gz", + ] + rev_fastq_files_expected = [ + "1_171015_HJ7TLDSX5_ACC1_XXXXXX_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACC1_XXXXXX_2.fastq.gz", + ] + fastq_files_expected = fwd_fastq_files_expected + rev_fastq_files_expected + + normal_fastq = "1_171015_HJ7TLDSX5_ACC2_XXXXXX_1.fastq.gz" + + fwd_fastq_files = balsamic_model.get_all_fastqs_for_sample( + sample_name=tumor_sample_name, fastq_types=[FastqName.FWD] + ) + rev_fastq_files = balsamic_model.get_all_fastqs_for_sample( + sample_name=tumor_sample_name, fastq_types=[FastqName.REV] + ) + fastq_files = balsamic_model.get_all_fastqs_for_sample( + sample_name=tumor_sample_name + ) + + compare_fastq_file_lists(fwd_fastq_files_expected, fwd_fastq_files) + compare_fastq_file_lists(rev_fastq_files_expected, rev_fastq_files) + compare_fastq_file_lists(fastq_files_expected, fastq_files) + assert normal_fastq not in fastq_files + + +def test_get_all_fastq_names(balsamic_model: ConfigModel): + """Validate retrieval of all fastq-files from ConfigModel and optional removal of suffix.""" + + all_fastqs_expected = [ + "1_171015_HJ7TLDSX5_ACC1_XXXXXX_1.fastq.gz", + "1_171015_HJ7TLDSX5_ACC1_XXXXXX_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACC1_XXXXXX_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACC1_XXXXXX_2.fastq.gz", + "1_171015_HJ7TLDSX5_ACC2_XXXXXX_1.fastq.gz", + "1_171015_HJ7TLDSX5_ACC2_XXXXXX_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACC2_XXXXXX_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACC2_XXXXXX_2.fastq.gz", + ] + all_fastqs_expected_wo_suffix = [ + "1_171015_HJ7TLDSX5_ACC1_XXXXXX_1", + "1_171015_HJ7TLDSX5_ACC1_XXXXXX_2", + "2_171015_HJ7TLDSX5_ACC1_XXXXXX_1", + "2_171015_HJ7TLDSX5_ACC1_XXXXXX_2", + "1_171015_HJ7TLDSX5_ACC2_XXXXXX_1", + "1_171015_HJ7TLDSX5_ACC2_XXXXXX_2", + "2_171015_HJ7TLDSX5_ACC2_XXXXXX_1", + "2_171015_HJ7TLDSX5_ACC2_XXXXXX_2", + ] + + all_fastqs = balsamic_model.get_all_fastq_names() + all_fastqs_wo_suffix = 
balsamic_model.get_all_fastq_names(remove_suffix=True) + + assert all_fastqs == all_fastqs_expected + assert all_fastqs_wo_suffix == all_fastqs_expected_wo_suffix + + +def test_fastq_by_fastq_pattern(balsamic_model: ConfigModel): + """Validate retrieval of fastq-file by fastq-pattern and fastq-type from ConfigModel.""" + + fastq_pattern = "2_171015_HJ7TLDSX5_ACC2_XXXXXX" + expected_fwd = "2_171015_HJ7TLDSX5_ACC2_XXXXXX_1.fastq.gz" + expected_rev = "2_171015_HJ7TLDSX5_ACC2_XXXXXX_2.fastq.gz" + + fwd_fastq = balsamic_model.get_fastq_by_fastq_pattern(fastq_pattern, FastqName.FWD) + rev_fastq = balsamic_model.get_fastq_by_fastq_pattern(fastq_pattern, FastqName.REV) + + assert Path(fwd_fastq).name == expected_fwd + assert Path(rev_fastq).name == expected_rev + + +def test_sample_name_by_type(balsamic_model: ConfigModel): + """Validate retrieval of sample name by sample type from ConfigModel.""" + + tumor_name_expected = "ACC1" + normal_name_expected = "ACC2" + + # Given sample type + tumor_name_retrieved = balsamic_model.get_sample_name_by_type(SampleType.TUMOR) + normal_name_retrieved = balsamic_model.get_sample_name_by_type(SampleType.NORMAL) + + # Then the retrieved sample name should match the expected + assert tumor_name_retrieved == tumor_name_expected + assert normal_name_retrieved == normal_name_expected + + +def test_sample_type_by_name(balsamic_model: ConfigModel): + """Validate retrieval of sample type by sample name from ConfigModel.""" + + tumor_name = "ACC1" + normal_name = "ACC2" + + # Given sample name + tumor_type_retrieved = balsamic_model.get_sample_type_by_name(tumor_name) + normal_type_retrieved = balsamic_model.get_sample_type_by_name(normal_name) + + # Then the retrieved sample type should match the expected + assert tumor_type_retrieved == SampleType.TUMOR + assert normal_type_retrieved == SampleType.NORMAL + + +def test_get_bam_name_per_lane(balsamic_model: ConfigModel): + """Validate retrieval of per lane bam names by sample name.""" + + def compare_bam_file_lists(expected: List[str], found: List[str]): + assert all( + bam_file in found for bam_file in expected + ), "Not all expected bam files found." 
+ assert len(expected) == len(found), "Not same number of bam files" + + # Fastq patterns for ACC2 in config.json + normal_lane1_fastq_pattern = "1_171015_HJ7TLDSX5_ACC2_XXXXXX" + normal_lane2_fastq_pattern = "2_171015_HJ7TLDSX5_ACC2_XXXXXX" + + # Given bam_dir path and sample name + normal_name = "ACC2" + result_dir = balsamic_model.analysis.result + bam_dir = Path(result_dir, "bam", "").as_posix() + + # When retrieving all per lane bam names for sample + bam_names = balsamic_model.get_bam_name_per_lane(bam_dir, normal_name) + + # Then the bam names for all fastq patterns should be retrieved and match the expected format + expected_bam_name_lane1 = ( + f"{bam_dir}{normal_name}_align_sort_{normal_lane1_fastq_pattern}.bam" + ) + expected_bam_name_lane2 = ( + f"{bam_dir}{normal_name}_align_sort_{normal_lane2_fastq_pattern}.bam" + ) + expected_bam_names = [expected_bam_name_lane1, expected_bam_name_lane2] + compare_bam_file_lists(expected_bam_names, bam_names) + + +def test_get_final_bam_name(balsamic_model: ConfigModel): + """Validate retrieval of final bam name by either sample type or sample name.""" + + # Given bam_dir path and sample name or sample type + sample_name = "ACC1" + sample_type = SampleType.TUMOR + result_dir = balsamic_model.analysis.result + bam_dir = Path(result_dir, "bam", "").as_posix() + + # When retrieving final bam file name by sample name or sample type + bam_name_sample_name = balsamic_model.get_final_bam_name( + bam_dir, sample_name=sample_name + ) + bam_name_sample_type = balsamic_model.get_final_bam_name( + bam_dir, sample_type=sample_type + ) + + # Then retrieved final bam names should match the expected format and be identical regardless of request parameter + expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup_sorted.bam" + assert expected_final_bam_name == bam_name_sample_name + assert bam_name_sample_name == bam_name_sample_type + + # WHEN changing sequencing_type to WGS + balsamic_model.analysis.sequencing_type = SequencingType.WGS + # Then retrieved final bam names should have realignment suffix + expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.realign.bam" + bam_name_sample_type = balsamic_model.get_final_bam_name( + bam_dir, sample_type=sample_type + ) + assert expected_final_bam_name == bam_name_sample_type + + +def test_no_info_error_get_final_bam_name(balsamic_model: ConfigModel): + """Validate that a ValueError is raised when neither sample name nor sample type is provided.""" + + # Given bam_dir path + result_dir = balsamic_model.analysis.result + bam_dir = Path(result_dir, "bam").as_posix() + + # When retrieving final bam file name without supplying sample name or sample type + # ValueError should be raised + with pytest.raises(ValueError) as excinfo: + balsamic_model.get_final_bam_name(bam_dir) + assert ( + "Either sample_name or sample_type must be provided to get the final bam name."
+ in str(excinfo.value) + ) + + +def test_get_final_bam_name_pon(balsamic_pon_model: ConfigModel): + """Validate retrieval of final bam name for PON by either sample type or sample name.""" + + # Given bam_dir path and sample name or sample type + sample_name = "ACCN6" + sample_type = SampleType.NORMAL + result_dir = balsamic_pon_model.analysis.result + bam_dir = Path(result_dir, "bam").as_posix() + + # When retrieving final bam file name by sample name or sample type + bam_name_sample_name = balsamic_pon_model.get_final_bam_name( + bam_dir, sample_name=sample_name + ) + + # Then the retrieved final bam name should match the expected format + expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.bam" + assert expected_final_bam_name == bam_name_sample_name diff --git a/tests/models/test_metric_models.py b/tests/models/test_metric_models.py new file mode 100644 index 000000000..a93b6f105 --- /dev/null +++ b/tests/models/test_metric_models.py @@ -0,0 +1,110 @@ +"""Tests for the QC metrics related methods.""" +import copy +from typing import Any, Dict + +import pytest + +from BALSAMIC.models.metrics import MetricValidation, Metric, MetricCondition + + +def test_metric_condition(): + """Test MetricCondition attributes parsing.""" + + # GIVEN input attributes + metric_condition: Dict[str, Any] = {"norm": "gt", "threshold": 1} + + # WHEN building the metric condition model + metric_model: MetricCondition = MetricCondition(**metric_condition) + + # THEN assert retrieved values from the created model + assert metric_model.model_dump() == metric_condition + + +def test_metric_pass_validation(): + """Test Metric attributes parsing.""" + + # GIVEN input attributes + metrics: Dict[str, Any] = { + "header": None, + "id": "ACC1", + "input": "ACC1.sorted.mrkdup.hsmetric", + "name": "MEDIAN_TARGET_COVERAGE", + "step": "multiqc_picard_HsMetrics", + "value": 2393.0, + "condition": {"norm": "gt", "threshold": 1000.0}, + } + + # WHEN building the metric model + metric_model: Metric = Metric(**metrics) + + # THEN assert retrieved values from the created model + assert metric_model.model_dump() == metrics + + +def test_metric_fail_validation(): + """Test Metric behaviour for an incorrect input.""" + + # GIVEN an invalid input + invalid_input: Dict[str, Any] = {"header": None, "id": "ACC1"} + + # THEN the model raises an error due to an incomplete input + with pytest.raises(ValueError) as input_exc: + Metric(**invalid_input) + assert "Field required" in str(input_exc.value) + + +def test_metric_validation_pass(qc_extracted_metrics: dict): + """Test MetricValidation attribute parsing and positive validation.""" + + # WHEN building the MetricValidation model + model: MetricValidation = MetricValidation(metrics=qc_extracted_metrics) + + # THEN assert retrieved values from the created model + assert model.model_dump()["metrics"] == qc_extracted_metrics + + +def test_metric_validation_fail(qc_extracted_metrics: dict): + """Test MetricValidation for an overly restrictive metric condition.""" + + # GIVEN input attributes with a value that does not meet the filtering condition + metrics: dict = copy.deepcopy(qc_extracted_metrics) + metrics[4]["value"] = 2.0 # GC_DROPOUT set to 2.0 (failing condition) + + # THEN check that the model filters the metric according to its norm + with pytest.raises(ValueError) as val_exc: + MetricValidation(metrics=metrics) + assert ( + f"QC metric {metrics[4]['name']}: {metrics[4]['value']} validation has failed. 
" + f"(Condition: {metrics[4]['condition']['norm']} {metrics[4]['condition']['threshold']}, ID: {metrics[4]['id']})" + in str(val_exc.value) + ) + + +def test_multiple_metric_validation_fail(qc_extracted_metrics: dict): + """Test MetricValidation for multiple metrics with failing conditions.""" + + # GIVEN input attributes that does not meet the specified conditions + metrics: dict = copy.deepcopy(qc_extracted_metrics) + metrics[4]["value"] = 2.0 # GC_DROPOUT set to 2.0 (failing condition) + metrics[8]["value"] = 0.5 # PCT_TARGET_BASES_500X set to 50% (failing condition) + + # THEN check that the model filters the metrics according to its norm + with pytest.raises(ValueError) as val_exc: + MetricValidation(metrics=metrics) + assert "2 validation errors for MetricValidation" in str(val_exc.value) + assert metrics[4]["name"] in str(val_exc.value) + assert metrics[8]["name"] in str(val_exc.value) + + +def test_metric_validation_norm_fail(qc_extracted_metrics: dict): + """Test MetricValidation ValueError raising for an operator that it is not accepted.""" + + # GIVEN a metric with an incorrect norm attribute + metrics: dict = copy.deepcopy(qc_extracted_metrics) + metrics[4]["condition"]["norm"] = "lower" + + # THEN model raises an error due to a non accepted norm + try: + MetricValidation(metrics=metrics) + except KeyError as key_exc: + assert metrics[4]["condition"]["norm"] in str(key_exc) diff --git a/tests/models/test_params_models.py b/tests/models/test_params_models.py new file mode 100644 index 000000000..50dd10b52 --- /dev/null +++ b/tests/models/test_params_models.py @@ -0,0 +1,198 @@ +"""Tests for Balsamic analysis params models.""" +from math import isclose + +import pytest +from pydantic import ValidationError + +from BALSAMIC.models.config import VarcallerAttribute +from BALSAMIC.models.params import ( + ParamsVardict, + ParamsVEP, + QCModel, + UMIParamsCommon, + UMIParamsConsensuscall, + UMIParamsTNscope, + UMIParamsUMIextract, + VarCallerFilter, + VCFAttributes, +) + + +def test_params_vardict(): + """test UMIParamsVardict model for correct validation""" + + # GIVEN vardict params + test_vardict_params = { + "allelic_frequency": 0.01, + "max_pval": 0.5, + "max_mm": 2, + "column_info": "-a 1 -b 2 -c 3", + } + + # WHEN building the model + test_vardict_built = ParamsVardict(**test_vardict_params) + + # THEN assert values + assert isclose(test_vardict_built.allelic_frequency, 0.01) + assert isclose(test_vardict_built.max_pval, 0.5) + assert test_vardict_built.max_mm == 2 + assert test_vardict_built.column_info == "-a 1 -b 2 -c 3" + + +def test_params_vep(): + """test UMIParamsVEP model for correct validation""" + + # GIVEN vardict params + test_vep = {"vep_filters": "all defaults params"} + + # WHEN building the model + test_vep_built = ParamsVEP(**test_vep) + + # THEN assert values + assert test_vep_built.vep_filters == "all defaults params" + + +def test_qc_model(): + # GIVEN valid input arguments + # THEN we can successully create a config dict + valid_args = { + "umi_trim": True, + "min_seq_length": 25, + "umi_trim_length": 5, + "n_base_limit": 50, + } + assert QCModel.model_validate(valid_args) + + +def test_vcfattributes(): + """test VCFAttributes model for correct validation""" + + # GIVEN a VCF attribute + dummy_attribute = { + "tag_value": 5.0, + "filter_name": "dummy_filter_name", + "field": "INFO", + } + + # WHEN building the model + dummy_attribute_built = VCFAttributes(**dummy_attribute) + + # THEN assert values can be reterived currently + assert 
isclose(dummy_attribute_built.tag_value, 5.0) + assert dummy_attribute_built.field == "INFO" + assert dummy_attribute_built.filter_name == "dummy_filter_name" + + +def test_varcallerfilter(): + """test that required VarCallerFilter fields are set correctly""" + + # GIVEN a VarCallerFilter + dummy_varcaller = { + "AD": {"tag_value": 5.0, "filter_name": "dummy_alt_depth", "field": "INFO"}, + "DP": {"tag_value": 100.0, "filter_name": "dummy_depth", "field": "INFO"}, + "pop_freq": { + "tag_value": 0.005, + "filter_name": "dummy_pop_freq", + "field": "INFO", + }, + "varcaller_name": "dummy_varcaller", + "filter_type": "dummy_ffpe_filter", + "analysis_type": "dummy_tumor_only", + "description": "dummy description of this filter", + } + + # WHEN building the model + dummy_varcaller_filter = VarCallerFilter(**dummy_varcaller) + + # THEN assert required values are set + assert isclose(dummy_varcaller_filter.AD.tag_value, 5.0) + assert isclose(dummy_varcaller_filter.DP.tag_value, 100.0) + assert dummy_varcaller_filter.analysis_type == "dummy_tumor_only" + + +def test_varcaller_attribute(): + # GIVEN valid input arguments + valid_args = {"mutation": "somatic", "mutation_type": "SNV"} + # THEN we can successfully create a config dict + assert VarcallerAttribute.model_validate(valid_args) + # GIVEN invalid input arguments + invalid_args = {"mutation": "strange", "mutation_type": "unacceptable"} + # THEN should trigger ValueError + with pytest.raises(ValidationError) as excinfo: + VarcallerAttribute.model_validate(invalid_args) + assert "2 validation errors" in str(excinfo.value) + + +def test_umiparams_common(): + """test UMIParamsCommon model for correct validation""" + + # GIVEN a UMI workflow common params + test_commonparams = { + "align_header": "test_header_name", + "align_intbases": 100, + "filter_tumor_af": 0.01, + } + # WHEN building the model + test_commonparams_built = UMIParamsCommon(**test_commonparams) + # THEN assert values + assert test_commonparams_built.align_header == "test_header_name" + assert isclose(test_commonparams_built.filter_tumor_af, 0.01) + assert test_commonparams_built.align_intbases == 100 + + +def test_umiparams_umiextract(): + """test UMIParamsUMIextract model for correct validation""" + # GIVEN umiextract params + test_umiextractparams = {"read_structure": "['mode', 'r1,r2']"} + + # WHEN building the model + test_umiextractparams_built = UMIParamsUMIextract(**test_umiextractparams) + + # THEN assert values + assert test_umiextractparams_built.read_structure == "['mode', 'r1,r2']" + + +def test_umiparams_consensuscall(): + """test UMIParamsConsensuscall model for correct validation""" + + # GIVEN consensuscall params + test_consensuscall = { + "align_format": "BAM", + "filter_minreads": "6,3,3", + "tag": "XZ", + } + + # WHEN building the model + test_consensuscall_built = UMIParamsConsensuscall(**test_consensuscall) + + # THEN assert values + assert test_consensuscall_built.align_format == "BAM" + assert test_consensuscall_built.filter_minreads == "6,3,3" + assert test_consensuscall_built.tag == "XZ" + + +def test_umiparams_tnscope(): + """test UMIParamsTNscope model for correct validation""" + + # GIVEN tnscope params + test_tnscope_params = { + "algo": "algoname", + "init_tumorLOD": 0.5, + "min_tumorLOD": 6, + "error_rate": 5, + "prunefactor": 3, + "padding": 30, + "disable_detect": "abc", + } + + # WHEN building the model + test_tnscope_params_built = UMIParamsTNscope(**test_tnscope_params) + + # THEN assert values + assert test_tnscope_params_built.algo == "algoname" + 
assert isclose(test_tnscope_params_built.init_tumorLOD, 0.5) + assert test_tnscope_params_built.min_tumorLOD == 6 + assert test_tnscope_params_built.error_rate == 5 + assert test_tnscope_params_built.prunefactor == 3 + assert test_tnscope_params_built.disable_detect == "abc" + assert test_tnscope_params_built.padding == 30 diff --git a/tests/models/test_snakemake_models.py b/tests/models/test_snakemake_models.py new file mode 100644 index 000000000..edd803737 --- /dev/null +++ b/tests/models/test_snakemake_models.py @@ -0,0 +1,310 @@ +"""Tests for the Snakemake model related methods.""" +import copy +import sys +from pathlib import Path +from typing import Dict, Any + +import pytest +from pydantic import ValidationError + +from BALSAMIC.constants.cluster import ( + ClusterMailType, + MAX_JOBS, + ClusterProfile, + ClusterAccount, + QOS, +) +from BALSAMIC.constants.paths import SCHEDULER_PATH +from BALSAMIC.models.snakemake import SingularityBindPath, SnakemakeExecutable + + +def test_singularity_bind_path_model(singularity_bind_path_data: Dict[str, Path]): + """Test singularity bind path model initialisation.""" + + # GIVEN singularity bind path model data + + # WHEN initialising the model + singularity_bind_path_model: SingularityBindPath = SingularityBindPath( + **singularity_bind_path_data + ) + + # THEN the model should have been correctly built + assert dict(singularity_bind_path_model) == singularity_bind_path_data + + +def test_snakemake_model( + snakemake_executable_data: Dict[str, Any], + snakemake_executable_validated_data: Dict[str, Any], +): + """Test snakemake model initialisation.""" + + # GIVEN a cluster ready snakemake data + + # WHEN initialising the model + snakemake_model: SnakemakeExecutable = SnakemakeExecutable( + **snakemake_executable_data + ) + + # THEN the model should have been correctly built + assert dict(snakemake_model) == snakemake_executable_validated_data + + +def test_snakemake_model_empty(): + """Test Snakemake empty model initialisation.""" + + # GIVEN no input for the snakemake model + + # WHEN initialising the model + with pytest.raises(ValidationError): + # THEN an empty model should raise a ValidationError + SnakemakeExecutable() + + +def test_get_config_files_option( + snakemake_executable: SnakemakeExecutable, reference_file: Path +): + """Test formatting of the configuration files.""" + + # GIVEN a snakemake executable model with a mocked config file + + # WHEN calling the method + config_files_option: str = snakemake_executable.get_config_files_option() + + # THEN the expected format should be returned + assert ( + config_files_option + == f"--configfiles {reference_file.as_posix()} {reference_file.as_posix()}" + ) + + +def test_get_config_options(snakemake_executable: SnakemakeExecutable): + """Test formatting of the snakemake config options.""" + + # GIVEN a snakemake executable model disabling some variant callers + + # WHEN calling the method + snakemake_config_options: str = snakemake_executable.get_config_options() + + # THEN the expected format should be returned + assert snakemake_config_options == "--config disable_variant_caller=tnscope,vardict" + + +def test_get_dragen_flag(snakemake_executable: SnakemakeExecutable): + """Test formatting of the dragen flag.""" + + # GIVEN a snakemake executable model with a dragen flag + snakemake_model: SnakemakeExecutable = copy.deepcopy(snakemake_executable) + snakemake_model.dragen = True + + # WHEN calling the method + dragen_flag: str = snakemake_model.get_dragen_flag() + + # THEN the expected 
format should be returned + assert dragen_flag == "dragen=True" + + +def test_get_force_flag(snakemake_executable: SnakemakeExecutable): + """Test formatting of the force flag.""" + + # GIVEN a snakemake executable model with a force flag + snakemake_model: SnakemakeExecutable = copy.deepcopy(snakemake_executable) + snakemake_model.force = True + + # WHEN calling the method + force_flag: str = snakemake_model.get_force_flag() + + # THEN the expected format should be returned + assert force_flag == "--forceall" + + +def test_get_mail_type_option(snakemake_executable: SnakemakeExecutable): + """Test formatting of the mail type option.""" + + # GIVEN a snakemake model with a mail type option + snakemake_model: SnakemakeExecutable = copy.deepcopy(snakemake_executable) + snakemake_model.mail_type = ClusterMailType.FAIL + + # WHEN calling the method + mail_type_option: str = snakemake_model.get_mail_type_option() + + # THEN the expected format should be returned + assert mail_type_option == "--mail-type FAIL" + + +def test_quiet_flag(snakemake_executable: SnakemakeExecutable): + """Test formatting of the quiet flag.""" + + # GIVEN a snakemake executable model with a quiet option + + # WHEN calling the method + quiet_flag: str = snakemake_executable.get_quiet_flag() + + # THEN the expected format should be returned + assert quiet_flag == "--quiet" + + +def test_get_report_path_option( + session_tmp_path: Path, snakemake_executable: SnakemakeExecutable +): + """Test formatting of the report path option.""" + + # GIVEN a snakemake executable model with a report path option + snakemake_model: SnakemakeExecutable = copy.deepcopy(snakemake_executable) + snakemake_model.report_path = session_tmp_path + + # WHEN calling the method + report_path_option: str = snakemake_model.get_report_path_option() + + # THEN the expected format should be returned + assert report_path_option == f"--report {session_tmp_path.as_posix()}" + + +def test_get_run_analysis_flag(snakemake_executable: SnakemakeExecutable): + """Test formatting of the run analysis flag.""" + + # GIVEN a snakemake executable model with a dry run option + snakemake_model: SnakemakeExecutable = copy.deepcopy(snakemake_executable) + snakemake_model.run_analysis = False + + # WHEN calling the method + run_analysis_flag: str = snakemake_model.get_run_analysis_flag() + + # THEN the expected format should be returned + assert run_analysis_flag == "--dryrun" + + +def test_get_singularity_bind_paths_option( + reference_file: Path, + session_tmp_path: Path, + snakemake_executable: SnakemakeExecutable, +): + """Test formatting of the singularity bind paths.""" + + # GIVEN a snakemake executable model with multiple binding paths + snakemake_model: SnakemakeExecutable = copy.deepcopy(snakemake_executable) + snakemake_model.singularity_bind_paths.append( + SingularityBindPath(source=reference_file, destination=reference_file) + ) + + # WHEN calling the method + singularity_bind_paths_option: str = ( + snakemake_model.get_singularity_bind_paths_option() + ) + + # THEN the expected format should be returned + assert ( + singularity_bind_paths_option + == f"--use-singularity --singularity-args '--cleanenv --bind {session_tmp_path.as_posix()}:/ " + f"--bind {reference_file.as_posix()}:{reference_file.as_posix()}'" + ) + + +def test_get_slurm_profiler_option(snakemake_executable: SnakemakeExecutable): + """Test formatting of the snakemake slurm profiler option.""" + + # GIVEN a snakemake executable model + snakemake_model: SnakemakeExecutable = 
copy.deepcopy(snakemake_executable) + snakemake_model.benchmark = True + + # WHEN calling the method + slurm_profiler: str = snakemake_model.get_slurm_profiler_option() + + # THEN the expected format should be returned + assert slurm_profiler == "--slurm-profiler task" + + +def test_get_snakemake_options_command(snakemake_executable: SnakemakeExecutable): + """Test formatting of the snakemake options command.""" + + # GIVEN a snakemake executable model with additional snakemake options command + + # WHEN calling the method + snakemake_options_command: str = ( + snakemake_executable.get_snakemake_options_command() + ) + + # THEN the expected format should be returned + assert snakemake_options_command == "--cores 36" + + +def test_get_snakemake_command( + case_id_tumor_only: str, + mail_user_option: str, + reference_file: Path, + session_tmp_path: Path, + snakemake_executable: SnakemakeExecutable, +): + """Test retrieval of the snakemake command to be submitted to Slurm.""" + + # GIVEN a snakemake executable model with working environment paths + + # WHEN calling the method + snakemake_command: str = snakemake_executable.get_command() + + # THEN the expected format should be returned + assert ( + snakemake_command + == f"snakemake --notemp -p --rerun-trigger mtime --directory {session_tmp_path.as_posix()} " + f"--snakefile {reference_file.as_posix()} " + f"--configfiles {reference_file.as_posix()} {reference_file.as_posix()} " + f"--use-singularity --singularity-args '--cleanenv --bind {session_tmp_path.as_posix()}:/' --quiet " + f"--immediate-submit -j {MAX_JOBS} --jobname BALSAMIC.{case_id_tumor_only}.{{rulename}}.{{jobid}}.sh " + f"--cluster-config {reference_file.as_posix()} --cluster '{sys.executable} {SCHEDULER_PATH} " + f"--sample-config {reference_file.as_posix()} --profile {ClusterProfile.SLURM} " + f"--account {ClusterAccount.DEVELOPMENT} --qos {QOS.HIGH} --log-dir {session_tmp_path} " + f"--script-dir {session_tmp_path} --result-dir {session_tmp_path} --mail-user {mail_user_option} " + f"{{dependencies}} ' --config disable_variant_caller=tnscope,vardict --cores 36" + ) + + +def test_get_snakemake_cluster_options( + case_id_tumor_only: str, + mail_user_option: str, + reference_file: Path, + session_tmp_path: Path, + snakemake_executable: SnakemakeExecutable, +): + """Test formatting of the snakemake cluster options.""" + + # GIVEN a snakemake executable model with working environment paths + + # WHEN calling the method + snakemake_cluster_options: str = ( + snakemake_executable.get_snakemake_cluster_options() + ) + + # THEN the expected format should be returned + assert ( + snakemake_cluster_options + == f"--immediate-submit -j {MAX_JOBS} --jobname BALSAMIC.{case_id_tumor_only}.{{rulename}}.{{jobid}}.sh " + f"--cluster-config {reference_file.as_posix()} --cluster '{sys.executable} {SCHEDULER_PATH.as_posix()} " + f"--sample-config {reference_file.as_posix()} --profile {ClusterProfile.SLURM} " + f"--account {ClusterAccount.DEVELOPMENT} --qos {QOS.HIGH} --log-dir {session_tmp_path} " + f"--script-dir {session_tmp_path} --result-dir {session_tmp_path} --mail-user {mail_user_option} " + "{dependencies} '" + ) + + +def test_get_cluster_submit_command( + mail_user_option: str, + reference_file: Path, + session_tmp_path: Path, + snakemake_executable: SnakemakeExecutable, +): + """Test formatting of the cluster submit command.""" + + # GIVEN a snakemake executable model with working environment paths + + # WHEN calling the method + snakemake_cluster_submit_command: str = ( + 
snakemake_executable.get_cluster_submit_command() + ) + + # THEN the expected format should be returned + assert snakemake_cluster_submit_command == ( + f"'{sys.executable} {SCHEDULER_PATH.as_posix()} " + f"--sample-config {reference_file.as_posix()} --profile {ClusterProfile.SLURM} " + f"--account {ClusterAccount.DEVELOPMENT} --qos {QOS.HIGH} --log-dir {session_tmp_path} " + f"--script-dir {session_tmp_path} --result-dir {session_tmp_path} --mail-user {mail_user_option} " + "{dependencies} '" + ) diff --git a/tests/scripts/test_collect_qc_metrics.py b/tests/scripts/test_collect_qc_metrics.py index 52b33bc9b..6ce9b5e79 100644 --- a/tests/scripts/test_collect_qc_metrics.py +++ b/tests/scripts/test_collect_qc_metrics.py @@ -87,7 +87,7 @@ def test_get_metric_condition(config_dict, qc_requested_metrics): config["panel"] = None # GIVEN a specific sample & metric name - sample = "concatenated_tumor_XXXXXX_R" + sample = "ACC1" metric = "METRIC_1" # GIVEN an expected output @@ -112,7 +112,7 @@ def test_get_metric_condition_pct_wgs(config_dict): config["panel"] = None # GIVEN a specific sample & metric name - sample = "concatenated_tumor_XXXXXX_R" + sample = "ACC1" metric = "PCT_60X" # GIVEN the requested metric @@ -134,7 +134,7 @@ def test_get_multiqc_data_source(multiqc_data_path): """test multiqc source extraction from multiqc_data.json analysis file""" # GIVEN input parameters and the multiqc data - sample = "concatenated_tumor_XXXXXX_R" + sample = "tumor.ACC1" source_name_hs_metrics = "multiqc_picard_HsMetrics" source_name_dup = "multiqc_picard_dups" @@ -142,8 +142,8 @@ def test_get_multiqc_data_source(multiqc_data_path): multiqc_data = json.load(f) # GIVEN an expected output - source_hs_metrics = "concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric" - source_dup = "concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt" + source_hs_metrics = "ACC1.dedup.realign.hsmetric" + source_dup = "tumor.ACC1.dedup.metrics" # WHEN extracting the source of a specific sample and collection of metrics out_source_hs_metrics = get_multiqc_data_source( diff --git a/tests/scripts/test_create_pdf.py b/tests/scripts/test_create_pdf.py deleted file mode 100644 index bd70d481b..000000000 --- a/tests/scripts/test_create_pdf.py +++ /dev/null @@ -1,132 +0,0 @@ -from pathlib import Path - -from BALSAMIC.assets.scripts.generate_cnv_report import ( - get_pdf_instance, - add_data_to_pdf, - add_plots_to_pdf, - generate_cnv_report, - PDF, - get_pdf_data, -) - - -def test_get_pdf_instance(): - """Test FPDF instance generation.""" - - # WHEN creating a dummy FPDF file - pdf: PDF = get_pdf_instance() - - # THEN check if the PDF has been correctly created - assert isinstance(pdf, PDF) - - -def test_get_pdf_data(): - """Test pdf data extraction from a list of files.""" - - # GIVEN a list of input files - data_paths = ["statistics.txt", "plot_0.png", "plot_1.png"] - - # WHEN retrieving the statistics and plots tuple - statistics, plots = get_pdf_data(data_paths) - - # THEN the expected files should be returned - assert data_paths[0] in statistics - assert data_paths[1] in plots - assert data_paths[2] in plots - - -def test_add_data_to_pdf(): - """Test add statistics to a PDF instance.""" - - # GIVEN a PDF instance and an output sample statistics .txt file - pdf: PDF = get_pdf_instance() - statistics_paths = [ - "tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.samplestatistics.txt" - ] - - # WHEN generating the PDF with the statistics - pdf: PDF = add_data_to_pdf(pdf=pdf, data_paths=statistics_paths) - - # THEN check if 
the statistics are appended to the created PDF - assert isinstance(pdf, PDF) - assert pdf.page_no() == 1 - - -def test_add_plots_to_pdf(): - """Test plots appending to a PDF file.""" - - # GIVEN a PDF instance and some dummy PNG plots - pdf: PDF = get_pdf_instance() - plot_paths = [ - "tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.sunrise.png", - "tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.germline.png", - ] - - # WHEN adding the plots to a PDF instance - pdf: PDF = add_plots_to_pdf(pdf, plot_paths) - - # THEN check if the images are correctly appended to the PDF - assert isinstance(pdf, PDF) - assert pdf.page_no() == len(plot_paths) - - -def test_generate_cnv_report_tumor_normal(tmp_path, cli_runner): - """Test generation of a PDF report for a WGS TN case.""" - - # GIVEN dummy input data and plots - data_paths = [ - "tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.samplestatistics.txt", - "tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.germline.png", - "tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.sunrise.png", - "tests/test_data/cnv_report/CNV.somatic.sample_tumor_only_wgs.cnvpytor.circular.png", - "tests/test_data/cnv_report/CNV.somatic.sample_tumor_only_wgs.cnvpytor.scatter.png", - ] - - # GIVEN the output path - output_path: Path = Path(tmp_path, "report.pdf") - - # WHEN invoking the python script - result = cli_runner.invoke( - generate_cnv_report, - [ - data_paths[0], - data_paths[1], - data_paths[2], - data_paths[3], - data_paths[4], - "--output", - output_path, - ], - ) - - # THEN check if the PDF is correctly created and there is no errors - assert result.exit_code == 0 - assert Path(output_path).exists() - - -def test_generate_cnv_report_tumor_only(tmp_path, cli_runner): - """Test generation of a PDF report for a WGS TO case.""" - - # GIVEN dummy input data and plots - plot_paths = [ - "tests/test_data/cnv_report/CNV.somatic.sample_tumor_only_wgs.cnvpytor.circular.png", - "tests/test_data/cnv_report/CNV.somatic.sample_tumor_only_wgs.cnvpytor.scatter.png", - ] - - # GIVEN the output path - output_path: Path = Path(tmp_path, "report.pdf") - - # WHEN invoking the python script - result = cli_runner.invoke( - generate_cnv_report, - [ - plot_paths[0], - plot_paths[1], - "--output", - output_path, - ], - ) - - # THEN check if the PDF is correctly created and there is no errors - assert result.exit_code == 0 - assert Path(output_path).exists() diff --git a/tests/scripts/test_csv_to_pdf.py b/tests/scripts/test_csv_to_pdf.py new file mode 100644 index 000000000..1e279ded8 --- /dev/null +++ b/tests/scripts/test_csv_to_pdf.py @@ -0,0 +1,58 @@ +"""Test converting CSV to PDF.""" +from pathlib import Path + +from click.testing import CliRunner, Result +from pypdf import PdfReader + +from BALSAMIC.assets.scripts.csv_to_pdf import csv_to_pdf + + +def test_csv_to_pdf( + purity_csv_path: Path, tmp_path: Path, cli_runner: CliRunner +) -> None: + """Test converting a CSV file to PDF.""" + + # GIVEN an input CSV file + + # GIVEN an output PDF file + pdf_path: Path = Path(tmp_path, "csv_to_pdf.pdf") + + # WHEN converting the CSV file to PDF + result: Result = cli_runner.invoke( + csv_to_pdf, [purity_csv_path.as_posix(), pdf_path.as_posix(), "--header"] + ) + + # THEN the output PDF file should exist + assert result.exit_code == 0 + assert pdf_path.is_file() + + # THEN the output PDF file should contain the CSV table + reader: PdfReader = PdfReader(stream=pdf_path) + pdf_page: str =
reader.pages[0].extract_text() + assert purity_csv_path.stem in pdf_page + + +def test_txt_to_pdf( + cnv_statistics_path: Path, tmp_path: Path, cli_runner: CliRunner +) -> None: + """Test converting a TXT file to PDF.""" + + # GIVEN an input TXT file + + # GIVEN an output PDF file + pdf_path: Path = Path(tmp_path, "txt_to_pdf.pdf") + + # WHEN converting the TXT file to PDF + result: Result = cli_runner.invoke( + csv_to_pdf, + [cnv_statistics_path.as_posix(), pdf_path.as_posix(), "--delimiter", " "], + ) + + # THEN the output PDF file should exist + assert result.exit_code == 0 + assert pdf_path.is_file() + + # THEN the output PDF file should contain the TXT table + reader: PdfReader = PdfReader(stream=pdf_path) + pdf_page: str = reader.pages[0].extract_text() + assert cnv_statistics_path.stem in pdf_page diff --git a/tests/scripts/test_image_to_pdf.py b/tests/scripts/test_image_to_pdf.py new file mode 100644 index 000000000..a6388a6db --- /dev/null +++ b/tests/scripts/test_image_to_pdf.py @@ -0,0 +1,32 @@ +"""Test converting images to PDF.""" +from pathlib import Path + +from click.testing import CliRunner, Result +from pypdf import PdfReader + +from BALSAMIC.assets.scripts.image_to_pdf import image_to_pdf + + +def test_image_to_pdf( + cnv_plot_path: Path, tmp_path: Path, cli_runner: CliRunner +) -> None: + """Test converting an image file to PDF.""" + + # GIVEN an input CNV plot file + + # GIVEN an output PDF file + pdf_path: Path = Path(tmp_path, "image_to_pdf.pdf") + + # WHEN converting the plot to PDF + result: Result = cli_runner.invoke( + image_to_pdf, [cnv_plot_path.as_posix(), pdf_path.as_posix()] + ) + + # THEN the output PDF file should exist + assert result.exit_code == 0 + assert pdf_path.is_file() + + # THEN the output PDF file should contain the image + reader: PdfReader = PdfReader(stream=pdf_path) + pdf_page: str = reader.pages[0].extract_text() + assert cnv_plot_path.stem in pdf_page diff --git a/tests/scripts/test_merge_pdf.py b/tests/scripts/test_merge_pdf.py new file mode 100644 index 000000000..57ebfb199 --- /dev/null +++ b/tests/scripts/test_merge_pdf.py @@ -0,0 +1,32 @@ +"""Test PDF file merging.""" +from pathlib import Path +from typing import List + +from click.testing import CliRunner, Result +from pypdf import PdfReader, PdfWriter + +from BALSAMIC.assets.scripts.merge_pdfs import merge_pdfs + + +def test_merge_pdfs(tmp_path: Path, cli_runner: CliRunner) -> None: + """Test merging of multiple PDF files.""" + + # GIVEN a list of blank one-page PDF files + input_pdfs: List[str] = [ + Path(tmp_path, f"file_{pdf}.pdf").as_posix() for pdf in [1, 2, 3] + ] + for pdf in input_pdfs: + pdf_writer = PdfWriter() + pdf_writer.add_blank_page(111, 111)  # one blank 111x111-point page per file + pdf_writer.write(pdf) + + # GIVEN an output PDF file + output_pdf: str = Path(tmp_path, "output.pdf").as_posix() + + # WHEN merging multiple PDFs + result: Result = cli_runner.invoke(merge_pdfs, input_pdfs + [output_pdf]) + + # THEN the command should succeed and the output should contain all input pages + assert result.exit_code == 0 + pdf_reader = PdfReader(output_pdf) + assert len(pdf_reader.pages) == len(input_pdfs)
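The test above drives merge_pdfs purely through its command line, passing a variadic list of input PDF paths followed by the output path. A minimal pypdf-based sketch of a click command with that interface (an illustration of the tested contract, not necessarily the actual BALSAMIC.assets.scripts.merge_pdfs implementation):

    import click
    from pypdf import PdfWriter

    @click.command()
    @click.argument("pdf_paths", nargs=-1, required=True)
    def merge_pdfs(pdf_paths: tuple) -> None:
        """Merge all input PDFs, in order, into the last path given."""
        *input_paths, output_path = pdf_paths
        writer = PdfWriter()
        for path in input_paths:
            writer.append(path)  # copies every page of this input PDF
        writer.write(output_path)

Merging three one-page inputs this way yields a three-page output, which is exactly what the page-count assertion above verifies.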
diff --git a/tests/scripts/test_preprocess_gens.py b/tests/scripts/test_preprocess_gens.py new file mode 100644 index 000000000..33acb53ae --- /dev/null +++ b/tests/scripts/test_preprocess_gens.py @@ -0,0 +1,149 @@ +from BALSAMIC.assets.scripts.preprocess_gens import ( + extract_variant_info, + get_valid_variants, + extract_coverage_line_values, +) +import filecmp +from pathlib import Path +from typing import Dict +from BALSAMIC.constants.analysis import SequencingType + + +def test_calculate_bafs( + tmp_path, gens_dummy_gnomad_vcf, gens_dummy_gnomad_baf_bed, invoke_gens_cli +): + """Test creation of the BAF file for GENS pre-processing.""" + + # GIVEN the dummy gnomad vcf + + # WHEN invoking the python script with the calculate-bafs command + output_path = str(tmp_path / "dummy.baf.bed") + result = invoke_gens_cli( + [ + "-o", + output_path, + "-s", + SequencingType.WGS, + "calculate-bafs", + "-v", + gens_dummy_gnomad_vcf, + ], + ) + + # THEN there should be no errors + assert result.exit_code == 0 + # THEN the output file should be created + assert Path(output_path).exists() + # THEN the output file should be identical to the expected + assert filecmp.cmp( + gens_dummy_gnomad_baf_bed, output_path, shallow=False + ), f"{gens_dummy_gnomad_baf_bed} and {output_path} are not identical" + + +def test_create_coverage_regions( + tmp_path, gens_dummy_denoised_cov, gens_dummy_cov_bed, invoke_gens_cli +): + """Test creation of the coverage file for GENS pre-processing.""" + + # GIVEN the dummy denoised coverage file + + # WHEN invoking the python script with the create-coverage-regions command + output_path = str(tmp_path / "dummy.cov.bed") + result = invoke_gens_cli( + [ + "-o", + output_path, + "-s", + SequencingType.WGS, + "create-coverage-regions", + "-c", + gens_dummy_denoised_cov, + ], + ) + + # THEN there should be no errors + assert result.exit_code == 0 + # THEN the output file should be created + assert Path(output_path).exists() + # THEN the output file should be identical to the expected + assert filecmp.cmp( + gens_dummy_cov_bed, output_path, shallow=False + ), f"{gens_dummy_cov_bed} and {output_path} are not identical" + + +def test_extract_variant_info(valid_dnascope_variant, invalid_dnascope_variant_no_ad): + """Test extraction of variant information from a DNAscope VCF line.""" + + # GIVEN VALID VARIANT + + # WHEN extracting the variant information and calculating allele frequencies + variant_info = extract_variant_info(valid_dnascope_variant) + + # THEN AF should be correctly calculated + assert 0.6 < variant_info["af"] < 0.61 + + # GIVEN INVALID VARIANT + + # WHEN extracting the variant information and calculating allele frequencies + variant_info = extract_variant_info(invalid_dnascope_variant_no_ad) + + # THEN no information should be returned + assert variant_info is None + + +def test_get_valid_variants( + caplog, + valid_dnascope_variant, + invalid_dnascope_variant_no_ad, + invalid_dnascope_variant_illegal_chrom, +): + """Test filtering of valid variants from a list of DNAscope VCF lines.""" + + # GIVEN LIST OF 2 INVALID AND 1 VALID VARIANT STRINGS + variant_list = [ + valid_dnascope_variant, + invalid_dnascope_variant_no_ad, + invalid_dnascope_variant_illegal_chrom, + ] + + # WHEN extracting information for valid variants + variant_dict: Dict = get_valid_variants(variant_list) + + # THEN the first variant should be correctly extracted into this structure + assert variant_dict[0] == { + "chr": "1", + "start": "100", + "ref": "T", + "alt": "C", + "sample": "0/1:9,14:23:99:418,0,257", + "af": 0.608696, + } + # THEN the remaining variants should throw WARNINGS + assert any(record.levelname == "WARNING" for record in caplog.records) + # THEN one variant should fail calculation of allele frequency and be ignored + assert any( + "Can't calc AF for a number of variants: 1." + in record.message + for record in caplog.records + ) + # THEN one variant should have an illegal chromosome name and be ignored + assert any( + "A number of variants have illegal chromosomes and will be skipped: {'25': 1}." + in record.message + for record in caplog.records + )
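The af value asserted above can be reproduced from the AD (allelic depth) entry of the sample column, assuming extract_variant_info computes the alternate allele fraction from AD; a worked check (illustrative only, not part of the test file):

    # FORMAT is GT:AD:DP:GQ:PL, so "0/1:9,14:23:99:418,0,257" carries AD = 9,14
    ref_depth, alt_depth = 9, 14
    af = alt_depth / (ref_depth + alt_depth)  # 14 / 23 = 0.60869565...
    assert round(af, 6) == 0.608696  # consistent with the 0.6 < af < 0.61 bound above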
+ +def test_extract_coverage_line_values(): + """Test extraction of coverage region values from a coverage line.""" + + # GIVEN A COVERAGE LINE + coverage_line = "1\t13401\t13500\t-0.956941" + + # WHEN extracting information from the string + chrom, start, end, log2_ratio = extract_coverage_line_values(coverage_line) + + # THEN the coverage region variables should have the expected values + assert chrom == "1" + assert start == 13401 + assert end == 13500 + assert log2_ratio == -0.956941 diff --git a/tests/test_data/cnv_report/CNV.somatic.case_id.purecn.purity.csv b/tests/test_data/cnv_report/CNV.somatic.case_id.purecn.purity.csv new file mode 100644 index 000000000..1c3d510a8 --- /dev/null +++ b/tests/test_data/cnv_report/CNV.somatic.case_id.purecn.purity.csv @@ -0,0 +1,2 @@ +"Sampleid","Purity","Ploidy","Sex","Contamination","Flagged","Failed","Curated","Comment" +"tumor.initial",0.64,4.52136051294267,"Coverage: M VCF: F",0,TRUE,FALSE,FALSE,"RARE KARYOTYPE" diff --git a/tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.ASPCF.png b/tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.ASPCF.png new file mode 100644 index 000000000..e6e3b8441 Binary files /dev/null and b/tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.ASPCF.png differ diff --git a/tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.germline.png b/tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.germline.png deleted file mode 100644 index 8d956976d..000000000 Binary files a/tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.germline.png and /dev/null differ diff --git a/tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.samplestatistics.txt b/tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.samplestatistics.txt index d681e2968..8adf6c1b1 100644 --- a/tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.samplestatistics.txt +++ b/tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.samplestatistics.txt @@ -1,7 +1,7 @@ -NormalContamination 0.378222936036507 -Ploidy 2.69008904657384 +NormalContamination 0.378226168637498 +Ploidy 2.69005206950459 rho 0.55 psi 2.75 -goodnessOfFit 93.9311185291303 +goodnessOfFit 93.9316740620231 GenderChr Y GenderChrFound N diff --git a/tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.sunrise.png b/tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.sunrise.png deleted file mode 100644 index e232162e9..000000000 Binary files a/tests/test_data/cnv_report/CNV.somatic.sample_tumor_normal_wgs.ascat.sunrise.png and /dev/null differ diff --git a/tests/test_data/cnv_report/CNV.somatic.sample_tumor_only_wgs.cnvpytor.circular.png b/tests/test_data/cnv_report/CNV.somatic.sample_tumor_only_wgs.cnvpytor.circular.png deleted file mode 100644 index 6a6d3e07b..000000000 Binary files a/tests/test_data/cnv_report/CNV.somatic.sample_tumor_only_wgs.cnvpytor.circular.png and /dev/null differ diff --git a/tests/test_data/cnv_report/CNV.somatic.sample_tumor_only_wgs.cnvpytor.scatter.png b/tests/test_data/cnv_report/CNV.somatic.sample_tumor_only_wgs.cnvpytor.scatter.png deleted file mode 100644 index cb97f7ffb..000000000 Binary files
a/tests/test_data/cnv_report/CNV.somatic.sample_tumor_only_wgs.cnvpytor.scatter.png and /dev/null differ diff --git a/tests/test_data/config.json b/tests/test_data/config.json index 0fa2a75f8..a63bff715 100644 --- a/tests/test_data/config.json +++ b/tests/test_data/config.json @@ -1,85 +1,377 @@ { "QC": { - "picard_rmdup": "TRUE", - "adapter": - "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT", - "min_seq_length": "25" + "picard_rmdup": false, + "adapter": "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT", + "quality_trim": true, + "adapter_trim": true, + "umi_trim": true, + "min_seq_length": "25", + "umi_trim_length": "5", + "n_base_limit": "50" }, "vcf": { - "manta": { - "default": [ - "diploidSV.vcf.gz", "somaticSV.vcf.gz", - "candidateSV.vcf.gz", "candidateSmallIndels.vcf.gz" + "vardict": { + "mutation": "somatic", + "mutation_type": "SNV", + "analysis_type": [ + "paired", + "single" + ], + "sequencing_type": [ + "targeted" ], - "merged": "manta.vcf.gz", + "workflow_solution": [ + "BALSAMIC" + ] + }, + "tnscope": { "mutation": "somatic", - "type": "SV" + "mutation_type": "SNV", + "analysis_type": [ + "paired", + "single" + ], + "sequencing_type": [ + "wgs" + ], + "workflow_solution": [ + "Sentieon" + ] }, - "manta_germline": { - "default": [ - "diploidSV.vcf.gz", "candidateSV.vcf.gz", - "candidateSmallIndels.vcf.gz" + "dnascope": { + "mutation": "germline", + "mutation_type": "SNV", + "analysis_type": [ + "paired", + "single" + ], + "sequencing_type": [ + "targeted", + "wgs" + ], + "workflow_solution": [ + "Sentieon" + ] + }, + "tnscope_umi": { + "mutation": "somatic", + "mutation_type": "SNV", + "analysis_type": [ + "single", + "paired" ], + "sequencing_type": [ + "targeted" + ], + "workflow_solution": [ + "Sentieon_umi" + ] + }, + "manta_germline": { "mutation": "germline", - "merged": "manta_germline.vcf.gz", - "type": "SV" + "mutation_type": "SV", + "analysis_type": [ + "paired", + "single" + ], + "sequencing_type": [ + "targeted", + "wgs" + ], + "workflow_solution": [ + "BALSAMIC" + ] }, - "mutect": { - "default": "mutect.vcf.gz", + "manta": { "mutation": "somatic", - "merged": "mutect.vcf.gz", - "type": "SNV" + "mutation_type": "SV", + "analysis_type": [ + "paired", + "single" + ], + "sequencing_type": [ + "targeted", + "wgs" + ], + "workflow_solution": [ + "BALSAMIC" + ] }, - "vardict": { - "default": "vardict.vcf.gz", + "dellysv": { + "mutation": "somatic", + "mutation_type": "SV", + "analysis_type": [ + "paired", + "single" + ], + "sequencing_type": [ + "targeted", + "wgs" + ], + "workflow_solution": [ + "BALSAMIC" + ] + }, + "cnvkit": { "mutation": "somatic", - "merged": "vardict.vcf.gz", - "type": "SNV" + "mutation_type": "CNV", + "analysis_type": [ + "paired", + "single" + ], + "sequencing_type": [ + "targeted" + ], + "workflow_solution": [ + "BALSAMIC" + ] + }, + "ascat": { + "mutation": "somatic", + "mutation_type": "CNV", + "analysis_type": [ + "paired" + ], + "sequencing_type": [ + "wgs" + ], + "workflow_solution": [ + "BALSAMIC" + ] + }, + "dellycnv": { + "mutation": "somatic", + "mutation_type": "CNV", + "analysis_type": [ + "single", + "paired" + ], + "sequencing_type": [ + "targeted", + "wgs" + ], + "workflow_solution": [ + "BALSAMIC" + ] + }, + "tiddit": { + "mutation": "somatic", + "mutation_type": "SV", + "analysis_type": [ + "single", + "paired" + ], + "sequencing_type": [ + "wgs" + ], + "workflow_solution": [ + "BALSAMIC" + ] + }, + "cnvpytor": { + "mutation": "somatic", + "mutation_type": "CNV", + "analysis_type": [ + "single" + ], + 
"sequencing_type": [ + "wgs" + ], + "workflow_solution": [ + "BALSAMIC" + ] + }, + "svdb": { + "mutation": "somatic", + "mutation_type": "SV", + "analysis_type": [ + "paired", + "single" + ], + "sequencing_type": [ + "targeted", + "wgs" + ], + "workflow_solution": [ + "BALSAMIC" + ] } }, "analysis": { "case_id": "id1", "analysis_type": "paired", + "gender": "male", "sequencing_type": "targeted", + "analysis_workflow": "balsamic", "analysis_dir": "tests/test_data/", - "fastq_path": "tests/test_data/fastq/", - "script": "tests/test_data/id1/scripts/", - "log": "tests/test_data/id1/logs/", - "result": "tests/test_data/id1/analysis/", + "fastq_path": "tests/test_data/fastq_lanes/", + "script": "placeholder/scripts/", + "log": "placeholder/logs/", + "result": "placeholder/analysis/", + "benchmark": "placeholder/benchmarks/", "config_creation_date": "yyyy-mm-dd xx", - "BALSAMIC_version": "2.9.8", - "dag": "tests/test_data/id1_analysis.json_BALSAMIC_2.9.8_graph.pdf" + "BALSAMIC_version": "12.0.2", + "dag": "tests/test_data/id1_analysis.json_BALSAMIC_12.0.2_graph.pdf" }, - "samples": { - "concatenated_tumor_XXXXXX_R": { - "file_prefix": "concatenated_tumor_XXXXXX_R", - "sample_name" : "ACC1", + "samples": [ + { + "name": "ACC1", "type": "tumor", - "readpair_suffix": ["1", "2"] + "fastq_info": { + "HXXXXXXX_ACC1_S01_L001": { + "fwd": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC1_S01_L001_R1_001.fastq.gz", + "rev": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC1_S01_L001_R2_001.fastq.gz" + }, + "HXXXXXXX_ACC1_S01_L002": { + "fwd": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC1_S01_L002_R1_001.fastq.gz", + "rev": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC1_S01_L002_R2_001.fastq.gz" + } + } }, - "concatenated_normal_XXXXXX_R": { - "file_prefix": "concatenated_normal_XXXXXX_R", - "sample_name" : "ACC2", + { + "name": "ACC2", "type": "normal", - "readpair_suffix": ["1", "2"] + "fastq_info": { + "HXXXXXXX_ACC2_S01_L002": { + "fwd": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC2_S01_L002_R1_001.fastq.gz", + "rev": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC2_S01_L002_R2_001.fastq.gz" + }, + "HXXXXXXX_ACC2_S01_L001": { + "fwd": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC2_S01_L001_R1_001.fastq.gz", + "rev": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC2_S01_L001_R2_001.fastq.gz" + } + } } - }, + ], "reference": { "reference_genome": "tests/test_data/references/genome/human_g1k_v37_decoy.fasta", "dbsnp": "tests/test_data/references/variants/dbsnp_grch37_b138.vcf.gz", - "1kg_snps_all": "tests/test_data/references/variants/1k_genome_wgs_p1_v3_all_sites.vcf.gz", - "1kg_snps_high": "tests/test_data/references/variants/1kg_phase1_snps_high_confidence_b37.vcf.gz", + "vcf_1kg": "tests/test_data/references/variants/1k_genome_wgs_p1_v3_all_sites.vcf.gz", + "hc_vcf_1kg": "tests/test_data/references/variants/1kg_phase1_snps_high_confidence_b37.vcf.gz", "mills_1kg": "tests/test_data/references/variants/mills_1kg_index.vcf.gz", "cosmic": "tests/test_data/references/variants/cosmic_coding_muts_v89.vcf.gz", - "vep": "tests/test_data/references/vep/", - "exon_bed": "tests/test_data/references/genome/refseq.flat.bed", + "vep_dir": "tests/test_data/references/vep/", + "refgene_bed": "tests/test_data/references/genome/refseq.flat.bed", "somalier_sites": "tests/test_data/references/variants/GRCh37.somalier.sites.vcf.gz" }, + "bioinfo_tools": { + "bedtools": "align_qc", + "bwa": "align_qc", + "compress": "align_qc", + "fastqc": "align_qc", + "samtools": "align_qc", + "picard": "align_qc", + "multiqc": 
"align_qc", + "fastp": "align_qc", + "csvkit": "align_qc", + "ensembl-vep": "annotate", + "genmod": "annotate", + "vcfanno": "annotate", + "sambamba": "coverage_qc", + "mosdepth": "coverage_qc", + "bcftools": "varcall_py3", + "tabix": "varcall_py3", + "bgzip": "varcall_py3", + "gatk": "varcall_py3", + "vardict": "varcall_py3", + "svdb": "varcall_py3", + "tiddit": "varcall_py3", + "cnvpytor": "cnvpytor", + "manta": "varcall_py27", + "cnvkit": "varcall_cnvkit", + "delly": "delly", + "ascatNgs": "ascatNgs", + "vcf2cytosure": "vcf2cytosure", + "somalier": "somalier" + }, + "bioinfo_tools_version": { + "bcftools": [ + "1.15.1", + "1.10.2", + "1.9" + ], + "tabix": [ + "1.11", + "0.2.6" + ], + "cnvkit": [ + "0.9.9" + ], + "bwa": [ + "0.7.17" + ], + "gatk": [ + "3.8" + ], + "samtools": [ + "1.15.1", + "1.9" + ], + "svdb": [ + "2.8.1" + ], + "tiddit": [ + "3.3.2" + ], + "vardict": [ + "2019.06.04" + ], + "somalier": [ + "0.2.16" + ], + "delly": [ + "1.0.3" + ], + "manta": [ + "1.6.0" + ], + "bedtools": [ + "2.30.0" + ], + "csvkit": [ + "1.0.7" + ], + "fastp": [ + "0.23.2" + ], + "fastqc": [ + "0.11.9" + ], + "multiqc": [ + "1.12" + ], + "picard": [ + "2.27.1" + ], + "mosdepth": [ + "0.3.3" + ], + "sambamba": [ + "0.8.2" + ], + "ensembl-vep": [ + "104.3" + ], + "vcfanno": [ + "0.3.3" + ], + "genmod": [ + "3.7.4" + ], + "ascatNgs": [ + "4.5.0" + ], + "vcf2cytosure": [ + "0.8.1" + ], + "cnvpytor": [ + "1.2.1" + ] + }, "panel": { "capture_kit": "tests/test_data/references/panel/panel.bed", "chrom": ["18", "14", "11", "4", "10", "12", "3", "5", "13", "20", "7", "2", "8", "21", "15", "9", "17", "16", "22", "19", "1", "6"] }, - "conda_env_yaml": "BALSAMIC/config/balsamic_env.yaml", "rule_directory": "BALSAMIC/" } diff --git a/tests/test_data/config_pon.json b/tests/test_data/config_pon.json new file mode 100644 index 000000000..662613e8e --- /dev/null +++ b/tests/test_data/config_pon.json @@ -0,0 +1,159 @@ +{ + "QC": { + "picard_rmdup": false, + "adapter": "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT", + "quality_trim": true, + "adapter_trim": true, + "umi_trim": true, + "min_seq_length": "25", + "umi_trim_length": "5", + "n_base_limit": "50" + }, + "samples": "Placeholder", + "reference": { + "reference_genome": "tests/test_data/references/genome/human_g1k_v37_decoy.fasta", + "dbsnp": "tests/test_data/references/variants/dbsnp_grch37_b138.vcf.gz", + "vcf_1kg": "tests/test_data/references/variants/1k_genome_wgs_p1_v3_all_sites.vcf.gz", + "hc_vcf_1kg": "tests/test_data/references/variants/1kg_phase1_snps_high_confidence_b37.vcf.gz", + "mills_1kg": "tests/test_data/references/variants/mills_1kg_index.vcf.gz", + "cosmic": "tests/test_data/references/variants/cosmic_coding_muts_v89.vcf.gz", + "vep_dir": "tests/test_data/references/vep/", + "refgene_bed": "tests/test_data/references/genome/refseq.flat.bed", + "somalier_sites": "tests/test_data/references/variants/GRCh37.somalier.sites.vcf.gz" + }, + "bioinfo_tools": { + "bedtools": "align_qc", + "bwa": "align_qc", + "compress": "align_qc", + "fastqc": "align_qc", + "samtools": "align_qc", + "picard": "align_qc", + "multiqc": "align_qc", + "fastp": "align_qc", + "csvkit": "align_qc", + "ensembl-vep": "annotate", + "genmod": "annotate", + "vcfanno": "annotate", + "sambamba": "coverage_qc", + "mosdepth": "coverage_qc", + "bcftools": "varcall_py3", + "tabix": "varcall_py3", + "bgzip": "varcall_py3", + "gatk": "varcall_py3", + "vardict": "varcall_py3", + "svdb": "varcall_py3", + "tiddit": "varcall_py3", + "cnvpytor": "cnvpytor", + "manta": 
"varcall_py27", + "cnvkit": "varcall_cnvkit", + "delly": "delly", + "ascatNgs": "ascatNgs", + "vcf2cytosure": "vcf2cytosure", + "somalier": "somalier" + }, + "bioinfo_tools_version": { + "mosdepth": [ + "0.3.3" + ], + "sambamba": [ + "0.8.2" + ], + "bcftools": [ + "1.9", + "1.10.2", + "1.15.1" + ], + "tabix": [ + "0.2.6", + "1.11" + ], + "cnvkit": [ + "0.9.9" + ], + "bedtools": [ + "2.30.0" + ], + "ensembl-vep": [ + "104.3" + ], + "vcfanno": [ + "0.3.3" + ], + "genmod": [ + "3.7.4" + ], + "manta": [ + "1.6.0" + ], + "samtools": [ + "1.9", + "1.15.1" + ], + "delly": [ + "1.0.3" + ], + "ascatNgs": [ + "4.5.0" + ], + "somalier": [ + "0.2.16" + ], + "vcf2cytosure": [ + "0.8.1" + ], + "bwa": [ + "0.7.17" + ], + "gatk": [ + "3.8" + ], + "svdb": [ + "2.8.1" + ], + "tiddit": [ + "3.3.2" + ], + "vardict": [ + "2019.06.04" + ], + "csvkit": [ + "1.0.7" + ], + "fastp": [ + "0.23.2" + ], + "fastqc": [ + "0.11.9" + ], + "multiqc": [ + "1.12" + ], + "picard": [ + "2.27.1" + ], + "cnvpytor": [ + "1.2.1" + ] + }, + "panel": { + "capture_kit": "tests/test_data/references/panel/panel.bed", + "chrom": ["18", "14", "11", "4", "10", "12", "3", "5", "13", "20", "7", "2", + "8", "21", "15", "9", "17", "16", "22", "19", "1", "6"] + }, + "analysis": { + "case_id": "id1", + "analysis_type": "pon", + "sequencing_type": "targeted", + "analysis_workflow": "balsamic", + "analysis_dir": "tests/test_data/", + "fastq_path": "tests/test_data/fastq_pon", + "script": "tests/test_data/id1/scripts/", + "log": "tests/test_data/id1/logs/", + "result": "tests/test_data/id1/analysis", + "benchmark": "tests/test_data/id1/benchmarks/", + "dag": "tests/test_data/id1/id1_BALSAMIC_12.0.2_graph.pdf", + "BALSAMIC_version": "12.0.2", + "config_creation_date": "2023-08-10 10:07", + "pon_version": "v5" + } +} diff --git a/tests/test_data/fastq_test_info.json b/tests/test_data/fastq_test_info.json new file mode 100644 index 000000000..bf78d5cf5 --- /dev/null +++ b/tests/test_data/fastq_test_info.json @@ -0,0 +1,309 @@ +{ + "standard_fastq_names": { + "tumor": [ + "1_171015_HJ7TLDSX5_ACC1_XXXXXX_1.fastq.gz", + "1_171015_HJ7TLDSX5_ACC1_XXXXXX_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACC1_XXXXXX_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACC1_XXXXXX_2.fastq.gz" + ], + "normal": [ + "1_171015_HJ7TLDSX5_ACC2_XXXXXX_1.fastq.gz", + "1_171015_HJ7TLDSX5_ACC2_XXXXXX_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACC2_XXXXXX_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACC2_XXXXXX_2.fastq.gz" + ] + }, + "samples_standard_fastq_names": [ + { + "name": "ACC1", + "type": "tumor", + "fastq_info": { + "1_171015_HJ7TLDSX5_ACC1_XXXXXX": { + "fwd": "1_171015_HJ7TLDSX5_ACC1_XXXXXX_1.fastq.gz", + "rev": "1_171015_HJ7TLDSX5_ACC1_XXXXXX_2.fastq.gz" + }, + "2_171015_HJ7TLDSX5_ACC1_XXXXXX": { + "fwd": "2_171015_HJ7TLDSX5_ACC1_XXXXXX_1.fastq.gz", + "rev": "2_171015_HJ7TLDSX5_ACC1_XXXXXX_2.fastq.gz" + } + } + }, + { + "name": "ACC2", + "type": "normal", + "fastq_info": { + "1_171015_HJ7TLDSX5_ACC2_XXXXXX": { + "fwd": "1_171015_HJ7TLDSX5_ACC2_XXXXXX_1.fastq.gz", + "rev": "1_171015_HJ7TLDSX5_ACC2_XXXXXX_2.fastq.gz" + }, + "2_171015_HJ7TLDSX5_ACC2_XXXXXX": { + "fwd": "2_171015_HJ7TLDSX5_ACC2_XXXXXX_1.fastq.gz", + "rev": "2_171015_HJ7TLDSX5_ACC2_XXXXXX_2.fastq.gz" + } + } + } + ], + "fastq_pattern_types": [ + { + "id": "1", + "tumor": [ + "ACC1_S1_L001_R1_001.fastq.gz", + "ACC1_S1_L001_R2_001.fastq.gz", + "ACC1_S1_L002_R1_001.fastq.gz", + "ACC1_S1_L002_R2_001.fastq.gz" + ], + "normal": [ + "ACC2_S1_L001_R1_001.fastq.gz", + "ACC2_S1_L001_R2_001.fastq.gz", + "ACC2_S1_L002_R1_001.fastq.gz", + 
"ACC2_S1_L002_R2_001.fastq.gz" + ] + }, + { + "id": "2", + "tumor": [ + "HXXXXXXX_123456_ACC1_L001_R1_001.fastq.gz", + "HXXXXXXX_123456_ACC1_L001_R2_001.fastq.gz", + "HXXXXXXX_123456_ACC1_L002_R1_001.fastq.gz", + "HXXXXXXX_123456_ACC1_L002_R2_001.fastq.gz" + ], + "normal": [ + "HXXXXXXX_123456_ACC2_L001_R1_001.fastq.gz", + "HXXXXXXX_123456_ACC2_L001_R2_001.fastq.gz", + "HXXXXXXX_123456_ACC2_L002_R1_001.fastq.gz", + "HXXXXXXX_123456_ACC2_L002_R2_001.fastq.gz" + ] + }, + { + "id": "3", + "tumor": ["ACC1_XXXXX_R_1.fastq.gz", "ACC1_XXXXX_R_2.fastq.gz"], + "normal": ["ACC2_XXXXX_R_1.fastq.gz", "ACC2_XXXXX_R_2.fastq.gz"] + }, + { + "id": "4", + "tumor": [ + "HXXXXXXX_ACC1_S01_L001_R1_001.fastq.gz", + "HXXXXXXX_ACC1_S01_L001_R2_001.fastq.gz", + "HXXXXXXX_ACC1_S01_L002_R1_001.fastq.gz", + "HXXXXXXX_ACC1_S01_L002_R2_001.fastq.gz" + ], + "normal": [ + "HXXXXXXX_ACC2_S01_L001_R1_001.fastq.gz", + "HXXXXXXX_ACC2_S01_L001_R2_001.fastq.gz", + "HXXXXXXX_ACC2_S01_L002_R1_001.fastq.gz", + "HXXXXXXX_ACC2_S01_L002_R2_001.fastq.gz" + ] + }, + { + "id": "5", + "tumor": [ + "1_171015_HJ7TLDSX5_ACC1_XXXXXX_1.fastq.gz", + "1_171015_HJ7TLDSX5_ACC1_XXXXXX_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACC1_XXXXXX_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACC1_XXXXXX_2.fastq.gz" + ], + "normal": [ + "1_171015_HJ7TLDSX5_ACC2_XXXXXX_1.fastq.gz", + "1_171015_HJ7TLDSX5_ACC2_XXXXXX_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACC2_XXXXXX_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACC2_XXXXXX_2.fastq.gz" + ] + } + ], + "pon_fastq_list": [ + "1_171015_HJ7TLDSX5_ACC1_XXXXXX_R_1.fastq.gz", + "1_171015_HJ7TLDSX5_ACC1_XXXXXX_R_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACC1_XXXXXX_R_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACC1_XXXXXX_R_2.fastq.gz", + "1_171015_HJ7TLDSX5_ACCN1_XXXXXX_R_1.fastq.gz", + "1_171015_HJ7TLDSX5_ACCN1_XXXXXX_R_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACCN1_XXXXXX_R_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACCN1_XXXXXX_R_2.fastq.gz", + "1_171015_HJ7TLDSX5_ACCN2_XXXXXX_R_1.fastq.gz", + "1_171015_HJ7TLDSX5_ACCN2_XXXXXX_R_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACCN2_XXXXXX_R_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACCN2_XXXXXX_R_2.fastq.gz", + "1_171015_HJ7TLDSX5_ACCN3_XXXXXX_R_1.fastq.gz", + "1_171015_HJ7TLDSX5_ACCN3_XXXXXX_R_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACCN3_XXXXXX_R_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACCN3_XXXXXX_R_2.fastq.gz", + "1_171015_HJ7TLDSX5_ACCN4_XXXXXX_R_1.fastq.gz", + "1_171015_HJ7TLDSX5_ACCN4_XXXXXX_R_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACCN4_XXXXXX_R_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACCN4_XXXXXX_R_2.fastq.gz", + "1_171015_HJ7TLDSX5_ACCN5_XXXXXX_R_1.fastq.gz", + "1_171015_HJ7TLDSX5_ACCN5_XXXXXX_R_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACCN5_XXXXXX_R_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACCN5_XXXXXX_R_2.fastq.gz", + "1_171015_HJ7TLDSX5_ACCN6_XXXXXX_R_1.fastq.gz", + "1_171015_HJ7TLDSX5_ACCN6_XXXXXX_R_2.fastq.gz", + "2_171015_HJ7TLDSX5_ACCN6_XXXXXX_R_1.fastq.gz", + "2_171015_HJ7TLDSX5_ACCN6_XXXXXX_R_2.fastq.gz" + ], + "pon_samples_standard_fastq_names": [ + { + "type": "normal", + "name": "ACC1", + "fastq_info": { + "2_171015_HJ7TLDSX5_ACC1_XXXXXX_R": { + "fwd": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACC1_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACC1_XXXXXX_R_2.fastq.gz" + }, + "1_171015_HJ7TLDSX5_ACC1_XXXXXX_R": { + "fwd": "tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACC1_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACC1_XXXXXX_R_2.fastq.gz" + } + } + }, + { + "type": "normal", + "name": "ACCN5", + "fastq_info": { + "1_171015_HJ7TLDSX5_ACCN5_XXXXXX_R": { + "fwd": 
"tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACCN5_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACCN5_XXXXXX_R_2.fastq.gz" + }, + "2_171015_HJ7TLDSX5_ACCN5_XXXXXX_R": { + "fwd": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACCN5_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACCN5_XXXXXX_R_2.fastq.gz" + } + } + }, + { + "type": "normal", + "name": "ACCN2", + "fastq_info": { + "1_171015_HJ7TLDSX5_ACCN2_XXXXXX_R": { + "fwd": "tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACCN2_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACCN2_XXXXXX_R_2.fastq.gz" + }, + "2_171015_HJ7TLDSX5_ACCN2_XXXXXX_R": { + "fwd": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACCN2_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACCN2_XXXXXX_R_2.fastq.gz" + } + } + }, + { + "type": "normal", + "name": "ACCN6", + "fastq_info": { + "1_171015_HJ7TLDSX5_ACCN6_XXXXXX_R": { + "fwd": "tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACCN6_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACCN6_XXXXXX_R_2.fastq.gz" + }, + "2_171015_HJ7TLDSX5_ACCN6_XXXXXX_R": { + "fwd": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACCN6_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACCN6_XXXXXX_R_2.fastq.gz" + } + } + }, + { + "type": "normal", + "name": "ACCN1", + "fastq_info": { + "1_171015_HJ7TLDSX5_ACCN1_XXXXXX_R": { + "fwd": "tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACCN1_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACCN1_XXXXXX_R_2.fastq.gz" + }, + "2_171015_HJ7TLDSX5_ACCN1_XXXXXX_R": { + "fwd": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACCN1_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACCN1_XXXXXX_R_2.fastq.gz" + } + } + }, + { + "type": "normal", + "name": "ACCN4", + "fastq_info": { + "1_171015_HJ7TLDSX5_ACCN4_XXXXXX_R": { + "fwd": "tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACCN4_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACCN4_XXXXXX_R_2.fastq.gz" + }, + "2_171015_HJ7TLDSX5_ACCN4_XXXXXX_R": { + "fwd": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACCN4_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACCN4_XXXXXX_R_2.fastq.gz" + } + } + }, + { + "type": "normal", + "name": "ACCN3", + "fastq_info": { + "1_171015_HJ7TLDSX5_ACCN3_XXXXXX_R": { + "fwd": "tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACCN3_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/1_171015_HJ7TLDSX5_ACCN3_XXXXXX_R_2.fastq.gz" + }, + "2_171015_HJ7TLDSX5_ACCN3_XXXXXX_R": { + "fwd": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACCN3_XXXXXX_R_1.fastq.gz", + "rev": "tests/test_data/fastq_pon/2_171015_HJ7TLDSX5_ACCN3_XXXXXX_R_2.fastq.gz" + } + } + } + ], + "test_fastq_info": [ + { + "name": "ACC1", + "type": "tumor", + "fastq_info": { + "HXXXXXXX_ACC1_S01_L001": { + "fwd": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC1_S01_L001_R1_001.fastq.gz", + "rev": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC1_S01_L001_R2_001.fastq.gz" + }, + "HXXXXXXX_ACC1_S01_L002": { + "fwd": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC1_S01_L002_R1_001.fastq.gz", + "rev": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC1_S01_L002_R2_001.fastq.gz" + } + } + }, + { + "name": "ACC2", + "type": "normal", + "fastq_info": { + "HXXXXXXX_ACC2_S01_L001": { + "fwd": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC2_S01_L001_R1_001.fastq.gz", + "rev": 
"tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC2_S01_L001_R2_001.fastq.gz" + }, + "HXXXXXXX_ACC1_S01_L002": { + "fwd": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC2_S01_L002_R1_001.fastq.gz", + "rev": "tests/test_data/fastq_lanes/fastq/HXXXXXXX_ACC2_S01_L002_R2_001.fastq.gz" + } + } + } + ], + "fastq_fails": { + "duplicate_fastq_patterns": { + "tumor": [ + "ACC1_S1_L001_R_R1_001.fastq.gz", + "ACC1_S1_L001_R_R2_001.fastq.gz", + "ACC1_S1_L001_R_1.fastq.gz", + "ACC1_S1_L001_R_2.fastq.gz" + ] + }, + "duplicate_fastq_patterns_model": [ + { + "name": "ACC1", + "type": "tumor", + "fastq_info": { + "ACC1_S1_L001_R": { + "fwd": "ACC1_S1_L001_R1_001.fastq.gz", + "rev": "ACC1_S1_L001_R2_001.fastq.gz" + } + } + }, + { + "name": "S1", + "type": "normal", + "fastq_info": { + "ACC1_S1_L001_R": { + "fwd": "ACC1_S1_L001_R1_001.fastq.gz", + "rev": "ACC1_S1_L001_R2_001.fastq.gz" + }, + "NICEPREFIX_S1_L001_R": { + "fwd": "NICEPREFIX_S1_L001_R1_001.fastq.gz", + "rev": "NICEPREFIX_S1_L001_R2_001.fastq.gz" + } + } + } + ] + } +} diff --git a/tests/test_data/gens_files/SNV.germline.dummy.dnascope_gnomad_af5.vcf b/tests/test_data/gens_files/SNV.germline.dummy.dnascope_gnomad_af5.vcf new file mode 100644 index 000000000..4ef7749e7 --- /dev/null +++ b/tests/test_data/gens_files/SNV.germline.dummy.dnascope_gnomad_af5.vcf @@ -0,0 +1,150 @@ +##fileformat=VCFv4.2 +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT TUMOR +1 10327 . T C 0.02 LowQual AC=1;AF=0.5;AN=2;BaseQRankSum=-0.000;ClippingRankSum=-0.000;DP=10;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.5;MQ=31.16;MQRankSum=-1.150;QD=0.00;ReadPosRankSum=-0.319;SOR=1.022 GT:AD:DP:GQ:PL 0/1:3,1:4:4:4,0,111 +1 10329 . AC A 64.80 . AC=1;AF=0.5;AN=2;DP=8;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.5;MQ=32.79;QD=21.60;SOR=1.179 GT:AD:DP:GQ:PL 0/1:0,3:3:18:102,0,18 +1 13417 . C CGAGA -0.00 LowQual AC=0;AF=0;AN=2;DP=46;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=30.92;SOR=0.307 GT:AD:DP:GQ:PL 0/0:46,0:46:99:0,139,2084 +1 14464 . A T 409.77 . AC=1;AF=0.5;AN=2;BaseQRankSum=1.071;ClippingRankSum=0.000;DP=28;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.5;MQ=31.72;MQRankSum=2.606;QD=14.63;ReadPosRankSum=2.197;SOR=0.467 GT:AD:DP:GQ:PL 0/1:14,14:28:99:438,0,388 +1 15274 . A T 3138.77 . AC=2;AF=1;AN=2;DP=88;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1;MQ=33.56;QD=37.82;SOR=0.717 GT:AD:DP:GQ:PL 1/1:0,83:83:99:3167,247,0 +1 15820 . 
G T 5.13 LowQual AC=2;AF=1;AN=2;DP=1;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1;MQ=25.00;QD=5.13;SOR=1.609 GT:AD:DP:GQ:PL 1/1:0,1:1:3:30,3,0 +1 17961 . TG T 0.00 LowQual AC=0;AF=0;AN=2;DP=3;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=40.00;SOR=0.061 GT:AD:DP:GQ:PL 0/0:3,0:3:9:0,9,127 +1 21801 . A G 0.00 LowQual AC=0;AF=0;AN=2;DP=1;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=24.00;SOR=0.223 GT:AD:DP:GQ:PL 0/0:1,0:1:3:0,3,45 +1 23343 . C A -0.00 LowQual DP=0;FS=0.000;MQ=0.00;SOR=0.693 GT ./. +1 24912 . G A 0.03 LowQual AC=1;AF=0.5;AN=2;BaseQRankSum=-0.000;ClippingRankSum=-0.000;DP=7;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.5;MQ=24.01;MQRankSum=-1.068;QD=0.00;ReadPosRankSum=-0.000;SOR=0.078 GT:AD:DP:GQ:PL 0/1:6,1:7:7:7,0,152 +1 28494 . T C 0.00 LowQual AC=0;AF=0;AN=2;DP=2;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=24.63;SOR=0.693 GT:AD:DP:GQ:PL 0/0:2,0:2:6:0,6,90 +1 28511 . G A 0.00 LowQual AC=0;AF=0;AN=2;DP=3;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=24.10;SOR=0.368 GT:AD:DP:GQ:PL 0/0:3,0:3:9:0,9,135 +1 28558 . C T 0.00 LowQual AC=0;AF=0;AN=2;DP=4;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=23.59;SOR=0.223 GT:AD:DP:GQ:PL 0/0:4,0:4:12:0,12,180 +1 28563 . A G 103.03 . AC=2;AF=1;AN=2;DP=4;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1;MQ=23.59;QD=25.76;SOR=1.609 GT:AD:DP:GQ:PL 1/1:0,4:4:12:131,12,0 +1 28588 . G T 0.00 LowQual AC=0;AF=0;AN=2;DP=4;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=23.59;SOR=0.223 GT:AD:DP:GQ:PL 0/0:4,0:4:12:0,12,180 +1 28590 . T TTGG 130.00 . AC=2;AF=1;AN=2;DP=4;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1;MQ=23.59;QD=32.50;SOR=1.609 GT:AD:DP:GQ:PL 1/1:0,4:4:12:167,12,0 +1 28591 . T C,TGG 0.00 LowQual AC=0,0;AF=0,0;AN=2;DP=4;ExcessHet=3.0103;FS=0.000;MLEAC=0,0;MLEAF=0,0;MQ=23.59;SOR=0.223 GT:AD:DP:GQ:PL 0/0:4,0,0:4:12:0,12,180,12,170,167 +1 28663 . T A 18.59 LowQual AC=2;AF=1;AN=2;DP=1;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1;MQ=22.00;QD=18.59;SOR=1.609 GT:AD:DP:GQ:PL 1/1:0,1:1:3:45,3,0 +1 28751 . G A 0.00 LowQual AC=0;AF=0;AN=2;DP=1;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=22.00;SOR=0.223 GT:AD:DP:GQ:PL 0/0:1,0:1:3:0,3,45 +1 30548 . T G 15.65 LowQual AC=2;AF=1;AN=2;DP=1;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1;MQ=40.00;QD=15.65;SOR=1.609 GT:AD:DP:GQ:PL 1/1:0,1:1:3:42,3,0 +1 30779 . T C 0.00 LowQual AC=0;AF=0;AN=2;DP=2;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=32.90;SOR=0.693 GT:AD:DP:GQ:PL 0/0:2,0:2:6:0,6,69 +1 30923 . G T 10.90 LowQual AC=2;AF=1;AN=2;DP=1;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1;MQ=35.00;QD=10.90;SOR=1.609 GT:AD:DP:GQ:PL 1/1:0,1:1:3:37,3,0 +1 31058 . T G 0.00 LowQual AC=0;AF=0;AN=2;DP=2;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=29.61;SOR=0.693 GT:AD:DP:GQ:PL 0/0:2,0:2:6:0,6,37 +1 31488 . C G 0.00 LowQual AC=0;AF=0;AN=2;DP=4;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=22.00;SOR=0.039 GT:AD:DP:GQ:PL 0/0:4,0:4:12:0,12,107 +1 39430 . A C 0.00 LowQual AC=0;AF=0;AN=2;DP=11;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=23.55;SOR=0.007 GT:AD:DP:GQ:PL 0/0:11,0:11:33:0,33,396 +1 39578 . C T 0.00 LowQual AC=0;AF=0;AN=2;DP=1;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=24.00;SOR=0.223 GT:AD:DP:GQ:PGT:PID:PL 0/0:1,0:1:3:0|1:39578_C_T:0,3,29 +1 39662 . T C 62.74 . AC=2;AF=1;AN=2;DP=2;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1;MQ=24.02;QD=31.37;SOR=2.303 GT:AD:DP:GQ:PGT:PID:PL 1/1:0,2:2:6:1|0:39578_C_T:90,6,0 +1 39966 . C T 0.00 LowQual AC=0;AF=0;AN=2;DP=3;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=24.04;SOR=0.368 GT:AD:DP:GQ:PL 0/0:3,0:3:9:0,9,102 +1 46633 . 
T A 0.00 LowQual AC=0;AF=0;AN=2;DP=4;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=25.00;SOR=0.223 GT:AD:DP:GQ:PL 0/0:4,0:4:12:0,12,119 +1 46873 . A T 0.00 LowQual AC=0;AF=0;AN=2;DP=5;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=25.00;SOR=0.027 GT:AD:DP:GQ:PL 0/0:5,0:5:15:0,15,149 +1 47159 . T C 0.00 LowQual AC=0;AF=0;AN=2;DP=5;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=30.38;SOR=0.446 GT:AD:DP:GQ:PGT:PID:PL 0/0:5,0:5:15:0|1:47159_T_C:0,15,161 +1 47190 . G GA 0.00 LowQual AC=0;AF=0;AN=2;DP=3;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=25.00;SOR=0.368 GT:AD:DP:GQ:PGT:PID:PL 0/0:3,0:3:9:1|0:47159_T_C:0,9,118 +1 48518 . A G 0.05 LowQual AC=1;AF=0.5;AN=2;BaseQRankSum=-0.000;ClippingRankSum=-0.000;DP=7;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.5;MQ=25.00;MQRankSum=-0.000;QD=0.01;ReadPosRankSum=0.180;SOR=0.078 GT:AD:DP:GQ:PL 0/1:6,1:7:9:9,0,203 +1 49298 . T C 736.77 . AC=1;AF=0.5;AN=2;BaseQRankSum=-1.055;ClippingRankSum=0.000;DP=29;ExcessHet=3.0103;FS=8.994;MLEAC=1;MLEAF=0.5;MQ=37.43;MQRankSum=2.810;QD=25.41;ReadPosRankSum=1.588;SOR=0.037 GT:AD:DP:GQ:PL 0/1:5,24:29:99:765,0,111 +1 49554 . A G 0.00 LowQual AC=0;AF=0;AN=2;DP=11;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=33.37;SOR=0.551 GT:AD:DP:GQ:PL 0/0:11,0:11:33:0,33,368 +1 51479 . T A 231.77 . AC=1;AF=0.5;AN=2;BaseQRankSum=-0.493;ClippingRankSum=-0.000;DP=10;ExcessHet=3.0103;FS=3.522;MLEAC=1;MLEAF=0.5;MQ=48.71;MQRankSum=2.287;QD=23.18;ReadPosRankSum=2.287;SOR=0.859 GT:AD:DP:GQ:PL 0/1:2,8:10:30:260,0,30 +1 51803 . T C 468.77 . AC=1;AF=0.5;AN=2;BaseQRankSum=0.000;ClippingRankSum=0.000;DP=20;ExcessHet=3.0103;FS=2.128;MLEAC=1;MLEAF=0.5;MQ=44.17;MQRankSum=3.330;QD=23.44;ReadPosRankSum=-1.128;SOR=0.269 GT:AD:DP:GQ:PL 0/1:6,14:20:99:497,0,130 +1 52238 . T G 519.77 . AC=2;AF=1;AN=2;DP=17;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1;MQ=31.00;QD=30.57;SOR=3.383 GT:AD:DP:GQ:PL 1/1:0,17:17:51:548,51,0 +1 53138 . TAA T 0.00 LowQual AC=0;AF=0;AN=2;DP=10;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=27.82;SOR=0.693 GT:AD:DP:GQ:PL 0/0:10,0:10:30:0,30,524 +1 54380 . T C 175.77 . 
AC=1;AF=0.5;AN=2;BaseQRankSum=0.738;ClippingRankSum=0.000;DP=21;ExcessHet=3.0103;FS=12.155;MLEAC=1;MLEAF=0.5;MQ=45.67;MQRankSum=-2.893;QD=8.37;ReadPosRankSum=0.446;SOR=3.682 GT:AD:DP:GQ:PL 0/1:15,6:21:99:204,0,547 diff --git a/tests/test_data/gens_files/dummy.baf.bed b/tests/test_data/gens_files/dummy.baf.bed new file mode 100644 index 000000000..5db268b3f --- /dev/null +++ b/tests/test_data/gens_files/dummy.baf.bed @@ -0,0 +1,38 @@ +o_1 10327 10328 0.25 +a_1 10327 10328 0.25 +a_1 48518 48519 0.142857 +b_1 10327 10328 0.25 +b_1 28494 28495 0.0 +b_1 30548 30549 1.0 +b_1 46633 46634 0.0 +b_1 52238 52239 1.0 +c_1 10327 10328 0.25 +c_1 15274 15275 1.0 +c_1 24912 24913 0.142857 +c_1 28563 28564 1.0 +c_1 28663 28664 1.0 +c_1 30923 30924 1.0 +c_1 39578 39579 0.0 +c_1 46873 46874 0.0 +c_1 49298 49299 0.827586 +c_1 52238 52239 1.0 +d_1 10327 10328 0.25 +d_1 13417 13418 0.0 +d_1 15274 15275 1.0 +d_1 17961 17962 0.0 +d_1 24912 24913 0.142857 +d_1 28511 28512 0.0 +d_1 28563 28564 1.0 +d_1 28590 28591 1.0 +d_1 28663 28664 1.0 +d_1 30548 30549 1.0 +d_1 30923 30924 1.0 +d_1 31488 31489 0.0 +d_1 39578 39579 0.0 +d_1 39966 39967 0.0 +d_1 46873 46874 0.0 +d_1 47190 47191 0.0 +d_1 49298 49299 0.827586 +d_1 51479 51480 0.8 +d_1 52238 52239 1.0 +d_1 54380 54381 0.285714 diff --git a/tests/test_data/gens_files/dummy.cov.bed b/tests/test_data/gens_files/dummy.cov.bed new file mode 100644 index 000000000..8f537a05a --- /dev/null +++ b/tests/test_data/gens_files/dummy.cov.bed @@ -0,0 +1,173 @@ +o_1 59599 59600 -0.4560901707317073 +o_1 130249 130250 -0.65999825 +o_1 242799 242800 0.037237511627906984 +o_1 549799 549800 0.4260756 +a_1 15199 15200 -0.45106565 +a_1 59599 59600 -0.4852171 +a_1 109999 110000 -0.4008778333333333 +a_1 132899 132900 -0.7430383846153846 +a_1 241949 241950 0.019766414634146346 +a_1 255849 255850 0.484869 +a_1 537399 537400 0.31554075000000004 +a_1 564949 564950 0.4401146 +b_1 12449 12450 -0.480960375 +b_1 17749 17750 -0.4330809 +b_1 53349 53350 -1.415716 +b_1 63199 63200 -0.40323575 +b_1 101049 101050 -0.5332131666666666 +b_1 109099 109100 -0.08012 +b_1 122449 122450 -0.30015766666666666 +b_1 130899 130900 -0.8161176666666667 +b_1 137349 137350 -0.77222825 +b_1 232149 232150 0.0597588 +b_1 236449 236450 0.030688000000000003 +b_1 242599 242600 -0.12113657142857141 +b_1 248999 249000 -0.061346000000000005 +b_1 254999 255000 0.22436066666666665 +b_1 535299 535300 0.23075933333333334 +b_1 564899 564900 0.4997655 +c_1 10249 10250 -0.2783994 +c_1 15149 15150 -0.39595480000000005 +c_1 16799 16800 -0.15504799999999996 +c_1 17499 17500 0.09923933333333333 +c_1 20249 20250 -0.5897275 +c_1 54649 54650 -0.972184 +c_1 62249 62250 -0.4799346666666667 +c_1 64549 64550 0.316141 +c_1 98999 99000 0.0222755 +c_1 102999 103000 -0.931036 +c_1 109099 109100 -0.08012 +c_1 120999 121000 -0.542393 +c_1 126249 126250 -0.690381 +c_1 129549 129550 0.313011 +c_1 135099 135100 -0.326132 +c_1 139149 139150 -0.6530511666666666 +c_1 229849 229850 0.28455 +c_1 233149 233150 -0.16071200000000002 +c_1 234749 234750 0.3843418 +c_1 235949 235950 -0.10750725 +c_1 237699 237700 0.13531 +c_1 241099 241100 -0.2655225 +c_1 243949 243950 -0.12527899999999997 +c_1 247849 247850 0.086489 +c_1 250349 250350 -0.345723 +c_1 254499 254500 0.09410650000000001 +c_1 532149 532150 0.510034 +c_1 534549 534550 0.652804 +c_1 540649 540650 0.569885 +c_1 564899 564900 0.3533105 +c_1 565449 565450 0.787331 +d_1 10049 10050 0.053246 +d_1 10149 10150 -0.745543 +d_1 10249 10250 -1.005236 +d_1 10349 10350 0.19035 +d_1 10449 10450 0.115186 +d_1 13449 
13450 -0.956941 +d_1 14749 14750 -0.173385 +d_1 14849 14850 -1.32536 +d_1 15149 15150 -0.163071 +d_1 15349 15350 -0.012349 +d_1 15549 15550 -0.305609 +d_1 16149 16150 -2.292024 +d_1 16349 16350 -0.607045 +d_1 17249 17250 0.296949 +d_1 17349 17350 0.260361 +d_1 17449 17450 -0.006471 +d_1 17649 17650 0.043828 +d_1 20049 20050 -1.208744 +d_1 20149 20150 -0.499705 +d_1 20349 20350 -0.67975 +d_1 52049 52050 -1.859248 +d_1 54649 54650 -0.972184 +d_1 58349 58350 -0.654101 +d_1 61849 61850 -0.349152 +d_1 61949 61950 -0.302002 +d_1 62049 62050 -0.243138 +d_1 62149 62150 -0.53905 +d_1 62349 62350 -0.652585 +d_1 62649 62650 -0.793681 +d_1 62849 62850 -0.662419 +d_1 64549 64550 0.316141 +d_1 98849 98850 -0.089622 +d_1 98949 98950 -0.032713 +d_1 99049 99050 0.077264 +d_1 101349 101350 -0.450722 +d_1 102849 102850 -1.168658 +d_1 102949 102950 -0.587683 +d_1 103149 103150 -1.036767 +d_1 108749 108750 0.482176 +d_1 109049 109050 -0.14116 +d_1 109149 109150 -0.01908 +d_1 115649 115650 -0.848405 +d_1 120949 120950 -0.318915 +d_1 121049 121050 -0.765871 +d_1 123949 123950 0.184313 +d_1 126249 126250 -0.690381 +d_1 128649 128650 -0.857755 +d_1 129549 129550 0.313011 +d_1 133149 133150 -1.903609 +d_1 135049 135050 -0.342939 +d_1 135149 135150 -0.309325 +d_1 138449 138450 -1.950194 +d_1 138749 138750 -1.230287 +d_1 138949 138950 -0.363633 +d_1 139049 139050 -0.662952 +d_1 139249 139250 -1.255257 +d_1 139349 139350 -0.316365 +d_1 139549 139550 -0.089813 +d_1 229549 229550 0.08185 +d_1 229749 229750 0.042753 +d_1 229949 229950 0.526347 +d_1 232849 232850 -0.176506 +d_1 232949 232950 0.466669 +d_1 233149 233150 -0.867546 +d_1 233349 233350 -0.081259 +d_1 234249 234250 -0.056044 +d_1 234349 234350 0.35987 +d_1 234449 234450 0.54939 +d_1 234549 234550 -0.166086 +d_1 235049 235050 0.45575 +d_1 235149 235150 0.722785 +d_1 235349 235350 -0.174559 +d_1 235549 235550 -0.428015 +d_1 235649 235650 -0.093883 +d_1 235749 235750 -0.277248 +d_1 235849 235850 -0.258116 +d_1 235949 235950 0.206789 +d_1 236049 236050 0.105382 +d_1 236149 236150 0.087873 +d_1 236349 236350 -0.20284 +d_1 237549 237550 0.440156 +d_1 237649 237650 -0.123766 +d_1 237749 237750 0.394386 +d_1 240949 240950 0.001368 +d_1 241049 241050 -0.493421 +d_1 241149 241150 -0.037624 +d_1 243649 243650 0.184205 +d_1 243749 243750 0.370103 +d_1 243849 243850 -0.106248 +d_1 244049 244050 -0.218121 +d_1 244149 244150 -0.54685 +d_1 247349 247350 0.597543 +d_1 247649 247650 0.340331 +d_1 247749 247750 -0.169704 +d_1 247949 247950 -0.160579 +d_1 248049 248050 0.335908 +d_1 250149 250150 -0.368309 +d_1 250349 250350 -0.345723 +d_1 252749 252750 0.09297 +d_1 254149 254150 -0.117708 +d_1 254849 254850 0.305921 +d_1 255849 255850 0.484869 +d_1 532149 532150 0.510034 +d_1 534149 534150 -0.478464 +d_1 534549 534550 0.652804 +d_1 536449 536450 0.517938 +d_1 540649 540650 0.569885 +d_1 564349 564350 0.79802 +d_1 564449 564450 0.035875 +d_1 564549 564550 0.601391 +d_1 565249 565250 0.338223 +d_1 565349 565350 0.437753 +d_1 565449 565450 0.787331 +d_1 565449 565450 0.787331 diff --git a/tests/test_data/gens_files/dummy.denoisedCR.tsv b/tests/test_data/gens_files/dummy.denoisedCR.tsv new file mode 100644 index 000000000..2b00b3212 --- /dev/null +++ b/tests/test_data/gens_files/dummy.denoisedCR.tsv @@ -0,0 +1,200 @@ +@HD VN:1.6 +@SQ SN:1 LN:249250621 +@SQ SN:2 LN:243199373 +@SQ SN:3 LN:198022430 +@SQ SN:4 LN:191154276 +@SQ SN:5 LN:180915260 +@SQ SN:6 LN:171115067 +@SQ SN:7 LN:159138663 +@SQ SN:8 LN:146364022 +@SQ SN:9 LN:141213431 +@SQ SN:10 LN:135534747 +@SQ SN:11 LN:135006516 +@SQ 
SN:12 LN:133851895 +@SQ SN:13 LN:115169878 +@SQ SN:14 LN:107349540 +@SQ SN:15 LN:102531392 +@SQ SN:16 LN:90354753 +@SQ SN:17 LN:81195210 +@SQ SN:18 LN:78077248 +@SQ SN:19 LN:59128983 +@SQ SN:20 LN:63025520 +@SQ SN:21 LN:48129895 +@SQ SN:22 LN:51304566 +@SQ SN:X LN:155270560 +@SQ SN:Y LN:59373566 +@SQ SN:MT LN:16569 +@SQ SN:GL000207.1 LN:4262 +@SQ SN:GL000226.1 LN:15008 +@SQ SN:GL000229.1 LN:19913 +@SQ SN:GL000231.1 LN:27386 +@SQ SN:GL000210.1 LN:27682 +@SQ SN:GL000239.1 LN:33824 +@SQ SN:GL000235.1 LN:34474 +@SQ SN:GL000201.1 LN:36148 +@SQ SN:GL000247.1 LN:36422 +@SQ SN:GL000245.1 LN:36651 +@SQ SN:GL000197.1 LN:37175 +@SQ SN:GL000203.1 LN:37498 +@SQ SN:GL000246.1 LN:38154 +@SQ SN:GL000249.1 LN:38502 +@SQ SN:GL000196.1 LN:38914 +@SQ SN:GL000248.1 LN:39786 +@SQ SN:GL000244.1 LN:39929 +@SQ SN:GL000238.1 LN:39939 +@SQ SN:GL000202.1 LN:40103 +@SQ SN:GL000234.1 LN:40531 +@SQ SN:GL000232.1 LN:40652 +@SQ SN:GL000206.1 LN:41001 +@SQ SN:GL000240.1 LN:41933 +@SQ SN:GL000236.1 LN:41934 +@SQ SN:GL000241.1 LN:42152 +@SQ SN:GL000243.1 LN:43341 +@SQ SN:GL000242.1 LN:43523 +@SQ SN:GL000230.1 LN:43691 +@SQ SN:GL000237.1 LN:45867 +@SQ SN:GL000233.1 LN:45941 +@SQ SN:GL000204.1 LN:81310 +@SQ SN:GL000198.1 LN:90085 +@SQ SN:GL000208.1 LN:92689 +@SQ SN:GL000191.1 LN:106433 +@SQ SN:GL000227.1 LN:128374 +@SQ SN:GL000228.1 LN:129120 +@SQ SN:GL000214.1 LN:137718 +@SQ SN:GL000221.1 LN:155397 +@SQ SN:GL000209.1 LN:159169 +@SQ SN:GL000218.1 LN:161147 +@SQ SN:GL000220.1 LN:161802 +@SQ SN:GL000213.1 LN:164239 +@SQ SN:GL000211.1 LN:166566 +@SQ SN:GL000199.1 LN:169874 +@SQ SN:GL000217.1 LN:172149 +@SQ SN:GL000216.1 LN:172294 +@SQ SN:GL000215.1 LN:172545 +@SQ SN:GL000205.1 LN:174588 +@SQ SN:GL000219.1 LN:179198 +@SQ SN:GL000224.1 LN:179693 +@SQ SN:GL000223.1 LN:180455 +@SQ SN:GL000195.1 LN:182896 +@SQ SN:GL000212.1 LN:186858 +@SQ SN:GL000222.1 LN:186861 +@SQ SN:GL000200.1 LN:187035 +@SQ SN:GL000193.1 LN:189789 +@SQ SN:GL000194.1 LN:191469 +@SQ SN:GL000225.1 LN:211173 +@SQ SN:GL000192.1 LN:547496 +@RG ID:GATKCopyNumber SM:TUMOR +CONTIG START END LOG2_COPY_RATIO +1 10001 10100 0.053246 +1 10101 10200 -0.745543 +1 10201 10300 -1.005236 +1 10301 10400 0.190350 +1 10401 10500 0.115186 +1 13401 13500 -0.956941 +1 14701 14800 -0.173385 +1 14801 14900 -1.325360 +1 15101 15200 -0.163071 +1 15301 15400 -0.012349 +1 15501 15600 -0.305609 +1 16101 16200 -2.292024 +1 16301 16400 -0.607045 +1 17201 17300 0.296949 +1 17301 17400 0.260361 +1 17401 17500 -0.006471 +1 17601 17700 0.043828 +1 20001 20100 -1.208744 +1 20101 20200 -0.499705 +1 20301 20400 -0.679750 +1 52001 52100 -1.859248 +1 54601 54700 -0.972184 +1 58301 58400 -0.654101 +1 61801 61900 -0.349152 +1 61901 62000 -0.302002 +1 62001 62100 -0.243138 +1 62101 62200 -0.539050 +1 62301 62400 -0.652585 +1 62601 62700 -0.793681 +1 62801 62900 -0.662419 +1 64501 64600 0.316141 +1 98801 98900 -0.089622 +1 98901 99000 -0.032713 +1 99001 99100 0.077264 +1 101301 101400 -0.450722 +1 102801 102900 -1.168658 +1 102901 103000 -0.587683 +1 103101 103200 -1.036767 +1 108701 108800 0.482176 +1 109001 109100 -0.141160 +1 109101 109200 -0.019080 +1 115601 115700 -0.848405 +1 120901 121000 -0.318915 +1 121001 121100 -0.765871 +1 123901 124000 0.184313 +1 126201 126300 -0.690381 +1 128601 128700 -0.857755 +1 129501 129600 0.313011 +1 133101 133200 -1.903609 +1 135001 135100 -0.342939 +1 135101 135200 -0.309325 +1 138401 138500 -1.950194 +1 138701 138800 -1.230287 +1 138901 139000 -0.363633 +1 139001 139100 -0.662952 +1 139201 139300 -1.255257 +1 139301 139400 -0.316365 +1 139501 139600 -0.089813 +1 
229501 229600 0.081850 +1 229701 229800 0.042753 +1 229901 230000 0.526347 +1 232801 232900 -0.176506 +1 232901 233000 0.466669 +1 233101 233200 -0.867546 +1 233301 233400 -0.081259 +1 234201 234300 -0.056044 +1 234301 234400 0.359870 +1 234401 234500 0.549390 +1 234501 234600 -0.166086 +1 235001 235100 0.455750 +1 235101 235200 0.722785 +1 235301 235400 -0.174559 +1 235501 235600 -0.428015 +1 235601 235700 -0.093883 +1 235701 235800 -0.277248 +1 235801 235900 -0.258116 +1 235901 236000 0.206789 +1 236001 236100 0.105382 +1 236101 236200 0.087873 +1 236301 236400 -0.202840 +1 237501 237600 0.440156 +1 237601 237700 -0.123766 +1 237701 237800 0.394386 +1 240901 241000 0.001368 +1 241001 241100 -0.493421 +1 241101 241200 -0.037624 +1 243601 243700 0.184205 +1 243701 243800 0.370103 +1 243801 243900 -0.106248 +1 244001 244100 -0.218121 +1 244101 244200 -0.546850 +1 247301 247400 0.597543 +1 247601 247700 0.340331 +1 247701 247800 -0.169704 +1 247901 248000 -0.160579 +1 248001 248100 0.335908 +1 250101 250200 -0.368309 +1 250301 250400 -0.345723 +1 252701 252800 0.092970 +1 254101 254200 -0.117708 +1 254801 254900 0.305921 +1 255801 255900 0.484869 +1 532101 532200 0.510034 +1 534101 534200 -0.478464 +1 534501 534600 0.652804 +1 536401 536500 0.517938 +1 540601 540700 0.569885 +1 564301 564400 0.798020 +1 564401 564500 0.035875 +1 564501 564600 0.601391 +1 565201 565300 0.338223 +1 565301 565400 0.437753 +1 565401 565500 0.787331 diff --git a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json index 31077d114..2998633be 100755 --- a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json +++ b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json @@ -2,33 +2,37 @@ "report_data_sources": { "Picard": { "HsMetrics": { - "concatenated_tumor_XXXXXX_R": "tests/test_data/qc_files/analysis/bam/concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric", - "concatenated_tumor_XXXXXX_R.consensusfiltered.umi": "tests/test_data/qc_files/analysis/qc/umi_qc/concatenated_tumor_XXXXXX_R.umi.collect_hsmetric" + "tumor.ACC1": "tests/test_data/qc_files/analysis/bam/ACC1.dedup.realign.hsmetric", + "ACC1_consensusfiltered_umi": "tests/test_data/qc_files/analysis/qc/umi_qc/ACC1.umi.collect_hsmetric" }, "InsertSizeMetrics": { - "concatenated_tumor_XXXXXX_R": "tests/test_data/qc_files/analysis/bam/concatenated_tumor_XXXXXX_R.sorted.insertsizemetric" + "tumor.ACC1": "tests/test_data/qc_files/analysis/bam/ACC1.dedup.realign.insertsizemetric" }, "DuplicationMetrics": { - "concatenated_tumor_XXXXXX_R": "tests/test_data/qc_files/analysis/bam/concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt" + "tumor.ACC1": "tests/test_data/qc_files/analysis/bam/tumor.ACC1.dedup.metrics" } }, "FastQC": { "all_sections": { - "concatenated_tumor_XXXXXX_R_2": "tests/test_data/qc_files/analysis/fastqc/concatenated_tumor_XXXXXX_R_2_fastqc.zip", - "concatenated_normal_XXXXXX_R_1": "tests/test_data/qc_files/analysis/fastqc/concatenated_normal_XXXXXX_R_1_fastqc.zip", - "concatenated_normal_XXXXXX_R_2": "tests/test_data/qc_files/analysis/fastqc/concatenated_normal_XXXXXX_R_2_fastqc.zip", - "concatenated_tumor_XXXXXX_R_1": "tests/test_data/qc_files/analysis/fastqc/concatenated_tumor_XXXXXX_R_1_fastqc.zip" + "HXXXXXXX_ACC1_S01_L001_R1_001": "tests/test_data/qc_files/analysis/fastqc/HXXXXXXX_ACC1_S01_L001_R1_001.fastqc.zip", + "HXXXXXXX_ACC1_S01_L001_R2_001": "tests/test_data/qc_files/analysis/fastqc/HXXXXXXX_ACC1_S01_L001_R2_001.fastqc.zip", + 
"HXXXXXXX_ACC1_S01_L002_R1_001": "tests/test_data/qc_files/analysis/fastqc/HXXXXXXX_ACC1_S01_L002_R1_001.fastqc.zip", + "HXXXXXXX_ACC1_S01_L002_R2_001": "tests/test_data/qc_files/analysis/fastqc/HXXXXXXX_ACC1_S01_L002_R2_001.fastqc.zip", + "HXXXXXXX_ACC2_S01_L001_R1_001": "tests/test_data/qc_files/analysis/fastqc/HXXXXXXX_ACC2_S01_L001_R1_001.fastqc.zip", + "HXXXXXXX_ACC2_S01_L001_R2_001": "tests/test_data/qc_files/analysis/fastqc/HXXXXXXX_ACC2_S01_L001_R2_001.fastqc.zip", + "HXXXXXXX_ACC2_S01_L002_R1_001": "tests/test_data/qc_files/analysis/fastqc/HXXXXXXX_ACC2_S01_L002_R1_001.fastqc.zip", + "HXXXXXXX_ACC2_S01_L002_R2_001": "tests/test_data/qc_files/analysis/fastqc/HXXXXXXX_ACC2_S01_L002_R2_001.fastqc.zip" } }, "Somalier": { "all_sections": { - "id1_NORMAL*id1_TUMOR": "tests/test_data/somalier/somalier.pairs.tsv" + "id1_NORMAL*id1_TUMOR": "tests/test_data/qc_files/analysis/qc/somalier/somalier.pairs.tsv" } } }, "report_saved_raw_data": { "multiqc_picard_HsMetrics": { - "concatenated_tumor_XXXXXX_R": { + "tumor.ACC1": { "BAIT_SET": "gmsmyeloid_5.2_hg19_design.bed", "PCT_OFF_BAIT": 0.364546, "GENOME_SIZE": 3101804739.0, @@ -47,8 +51,8 @@ "PCT_TARGET_BASES_500X": 0.996675, "PCT_TARGET_BASES_1000X": 0.992466 }, - "concatenated_tumor_XXXXXX_R.consensusfiltered.umi": { - "BAIT_SET": "concatenated_tumor_XXXXXX_R", + "ACC1_consensusfiltered_umi": { + "BAIT_SET": "ACC1", "PCT_OFF_BAIT": 0.13703, "GENOME_SIZE": 3101804739.0, "TOTAL_READS": 1914717.0, @@ -68,8 +72,8 @@ } }, "multiqc_picard_insertSize": { - "concatenated_tumor_XXXXXX_R_FR": { - "SAMPLE_NAME": "concatenated_tumor_XXXXXX_R", + "tumor.ACC1_FR": { + "SAMPLE_NAME": "ACC1", "MEDIAN_INSERT_SIZE": 186.0, "MIN_INSERT_SIZE": 2.0, "MAX_INSERT_SIZE": 242835911.0, @@ -80,7 +84,7 @@ } }, "multiqc_picard_dups": { - "concatenated_tumor_XXXXXX_R": { + "tumor.ACC1": { "LIBRARY": "Unknown Library", "UNPAIRED_READS_EXAMINED": 7389.0, "READ_PAIRS_EXAMINED": 47884432.0, @@ -91,19 +95,35 @@ } }, "multiqc_general_stats": { - "concatenated_tumor_XXXXXX_R_2": { + "HXXXXXXX_ACC1_S01_L001_R1_001": { "FastQC_mqc-generalstats-fastqc-percent_duplicates": 15.03521942842923, "FastQC_mqc-generalstats-fastqc-total_sequences": 600529762.0 }, - "concatenated_normal_XXXXXX_R_1": { + "HXXXXXXX_ACC1_S01_L001_R2_001": { "FastQC_mqc-generalstats-fastqc-percent_duplicates": 14.426654287440797, "FastQC_mqc-generalstats-fastqc-total_sequences": 464581551.0 }, - "concatenated_normal_XXXXXX_R_2": { + "HXXXXXXX_ACC1_S01_L002_R1_001": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 15.03521942842923, + "FastQC_mqc-generalstats-fastqc-total_sequences": 600529762.0 + }, + "HXXXXXXX_ACC1_S01_L002_R2_001": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 14.426654287440797, + "FastQC_mqc-generalstats-fastqc-total_sequences": 464581551.0 + }, + "HXXXXXXX_ACC2_S01_L001_R1_001": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 14.214689357571501, + "FastQC_mqc-generalstats-fastqc-total_sequences": 464581551.0 + }, + "HXXXXXXX_ACC2_S01_L001_R2_001": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 15.213739762327492, + "FastQC_mqc-generalstats-fastqc-total_sequences": 600529762.0 + }, + "HXXXXXXX_ACC2_S01_L002_R1_001": { "FastQC_mqc-generalstats-fastqc-percent_duplicates": 14.214689357571501, "FastQC_mqc-generalstats-fastqc-total_sequences": 464581551.0 }, - "concatenated_tumor_XXXXXX_R_1": { + "HXXXXXXX_ACC2_S01_L002_R2_001": { "FastQC_mqc-generalstats-fastqc-percent_duplicates": 15.213739762327492, "FastQC_mqc-generalstats-fastqc-total_sequences": 
600529762.0 } diff --git a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_HsMetrics.json b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_HsMetrics.json index 9d9cd5755..86e048a47 100755 --- a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_HsMetrics.json +++ b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_HsMetrics.json @@ -1,5 +1,5 @@ { - "concatenated_tumor_XXXXXX_R": { + "ACC1": { "BAIT_SET": "lymphoma_6.1_hg19_design.bed", "BAIT_TERRITORY": 159268.0, "BAIT_DESIGN_EFFICIENCY": 1.0, diff --git a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_dups.json b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_dups.json index 32d9bc605..b9f559470 100755 --- a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_dups.json +++ b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_dups.json @@ -1,5 +1,5 @@ { - "concatenated_tumor_XXXXXX_R": { + "ACC1": { "LIBRARY": "Unknown Library", "UNPAIRED_READS_EXAMINED": 11860.0, "READ_PAIRS_EXAMINED": 20440841.0, diff --git a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_insertSize.json b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_insertSize.json index 2278ad17c..c9023ea85 100755 --- a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_insertSize.json +++ b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_insertSize.json @@ -1,6 +1,6 @@ { - "concatenated_tumor_XXXXXX_R_FR": { - "SAMPLE_NAME": "concatenated_tumor_XXXXXX_R", + "ACC1_FR": { + "SAMPLE_NAME": "ACC1", "MEDIAN_INSERT_SIZE": 69.0, "MODE_INSERT_SIZE": 64.0, "MEDIAN_ABSOLUTE_DEVIATION": 16.0, diff --git a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_wgsmetrics.json b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_wgsmetrics.json index 490983654..b0bdb7f4d 100755 --- a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_wgsmetrics.json +++ b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_picard_wgsmetrics.json @@ -1,5 +1,5 @@ { - "concatenated_tumor_XXXXXX_R": { + "ACC1": { "GENOME_TERRITORY": 2864785223.0, "MEAN_COVERAGE": 38.396727, "SD_COVERAGE": 12.664872, diff --git a/tests/test_data/qc_files/analysis/qc/sample_tumor_normal_metrics_deliverables.yaml b/tests/test_data/qc_files/analysis/qc/sample_tumor_normal_metrics_deliverables.yaml index d8b46036a..628277e01 100644 --- a/tests/test_data/qc_files/analysis/qc/sample_tumor_normal_metrics_deliverables.yaml +++ b/tests/test_data/qc_files/analysis/qc/sample_tumor_normal_metrics_deliverables.yaml @@ -1,20 +1,20 @@ - header: null - id: normal - input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC2 + input: ACC2.sorted.mrkdup.hsmetric name: PCT_OFF_BAIT step: multiqc_picard_HsMetrics value: 0.161185 condition: null - header: null - id: normal - input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC2 + input: ACC2.sorted.mrkdup.hsmetric name: MEAN_TARGET_COVERAGE step: multiqc_picard_HsMetrics value: 636.23177 condition: null - header: null - id: normal - input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC2 + input: ACC2.sorted.mrkdup.hsmetric name: MEDIAN_TARGET_COVERAGE step: multiqc_picard_HsMetrics value: 597.0 @@ -22,8 +22,8 @@ norm: gt threshold: 500.0 - header: null - id: normal - input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC2 + input: ACC2.sorted.mrkdup.hsmetric name: FOLD_80_BASE_PENALTY step: multiqc_picard_HsMetrics 
value: 1.469357 @@ -31,57 +31,57 @@ norm: lt threshold: 1.8 - header: null - id: normal - input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC2 + input: ACC2.sorted.mrkdup.hsmetric name: PCT_TARGET_BASES_50X step: multiqc_picard_HsMetrics value: 0.998388 condition: null - header: null - id: normal - input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC2 + input: ACC2.sorted.mrkdup.hsmetric name: PCT_TARGET_BASES_100X step: multiqc_picard_HsMetrics value: 0.99497 condition: null - header: null - id: normal - input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC2 + input: ACC2.sorted.mrkdup.hsmetric name: PCT_TARGET_BASES_250X step: multiqc_picard_HsMetrics value: 0.965738 condition: null - header: null - id: normal - input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC2 + input: ACC2.sorted.mrkdup.hsmetric name: PCT_TARGET_BASES_500X step: multiqc_picard_HsMetrics value: 0.679445 condition: null - header: null - id: normal - input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC2 + input: ACC2.sorted.mrkdup.hsmetric name: PCT_TARGET_BASES_1000X step: multiqc_picard_HsMetrics value: 0.085208 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.sorted.mrkdup.hsmetric name: PCT_OFF_BAIT step: multiqc_picard_HsMetrics value: 0.158226 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.sorted.mrkdup.hsmetric name: MEAN_TARGET_COVERAGE step: multiqc_picard_HsMetrics value: 888.343586 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.sorted.mrkdup.hsmetric name: MEDIAN_TARGET_COVERAGE step: multiqc_picard_HsMetrics value: 805.0 @@ -89,8 +89,8 @@ norm: gt threshold: 500.0 - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.sorted.mrkdup.hsmetric name: FOLD_80_BASE_PENALTY step: multiqc_picard_HsMetrics value: 1.566744 @@ -98,64 +98,64 @@ norm: lt threshold: 1.8 - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.sorted.mrkdup.hsmetric name: PCT_TARGET_BASES_50X step: multiqc_picard_HsMetrics value: 0.998554 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.sorted.mrkdup.hsmetric name: PCT_TARGET_BASES_100X step: multiqc_picard_HsMetrics value: 0.997177 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.sorted.mrkdup.hsmetric name: PCT_TARGET_BASES_250X step: multiqc_picard_HsMetrics value: 0.979764 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.sorted.mrkdup.hsmetric name: PCT_TARGET_BASES_500X step: multiqc_picard_HsMetrics value: 0.874594 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.sorted.mrkdup.hsmetric name: PCT_TARGET_BASES_1000X step: multiqc_picard_HsMetrics value: 0.304354 condition: null - header: null - id: normal - input: concatenated_normal_XXXXXX_R.sorted.insertsizemetric + id: ACC2 + input: ACC2.sorted.insertsizemetric name: MEAN_INSERT_SIZE step: multiqc_picard_insertSize value: 125.819455 condition: null - header: null - id: tumor - input: 
concatenated_tumor_XXXXXX_R.sorted.insertsizemetric + id: ACC1 + input: ACC1.sorted.insertsizemetric name: MEAN_INSERT_SIZE step: multiqc_picard_insertSize value: 131.280203 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt + id: ACC1 + input: ACC1.sorted.mrkdup.txt name: PERCENT_DUPLICATION step: multiqc_picard_dups value: 0.356228 condition: null - header: null - id: normal - input: concatenated_normal_XXXXXX_R.sorted.mrkdup.txt + id: ACC2 + input: ACC2.sorted.mrkdup.txt name: PERCENT_DUPLICATION step: multiqc_picard_dups value: 0.255692 diff --git a/tests/test_data/qc_files/analysis/qc/sample_tumor_only_metrics_deliverables.yaml b/tests/test_data/qc_files/analysis/qc/sample_tumor_only_metrics_deliverables.yaml index f07b74c42..7cf8e04a9 100644 --- a/tests/test_data/qc_files/analysis/qc/sample_tumor_only_metrics_deliverables.yaml +++ b/tests/test_data/qc_files/analysis/qc/sample_tumor_only_metrics_deliverables.yaml @@ -1,34 +1,34 @@ - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.dedup.realign.hsmetric name: PCT_OFF_BAIT step: multiqc_picard_HsMetrics value: 0.364546 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.dedup.realign.hsmetric name: MEAN_TARGET_COVERAGE step: multiqc_picard_HsMetrics value: 2314.698853 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.dedup.realign.hsmetric name: MEDIAN_TARGET_COVERAGE step: multiqc_picard_HsMetrics value: 2393.0 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.dedup.realign.hsmetric name: FOLD_80_BASE_PENALTY step: multiqc_picard_HsMetrics value: 1.359189 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.dedup.realign.hsmetric name: GC_DROPOUT step: multiqc_picard_HsMetrics value: 0.027402 @@ -36,29 +36,29 @@ norm: lt threshold: 1.0 - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.dedup.realign.hsmetric name: PCT_TARGET_BASES_50X step: multiqc_picard_HsMetrics value: 1.0 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.dedup.realign.hsmetric name: PCT_TARGET_BASES_100X step: multiqc_picard_HsMetrics value: 0.999987 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.dedup.realign.hsmetric name: PCT_TARGET_BASES_250X step: multiqc_picard_HsMetrics value: 0.998445 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.dedup.realign.hsmetric name: PCT_TARGET_BASES_500X step: multiqc_picard_HsMetrics value: 0.996675 @@ -66,22 +66,22 @@ norm: gt threshold: 0.9 - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + id: ACC1 + input: ACC1.dedup.realign.hsmetric name: PCT_TARGET_BASES_1000X step: multiqc_picard_HsMetrics value: 0.992466 condition: null - header: null - id: tumor - input: concatenated_tumor_XXXXXX_R.sorted.insertsizemetric + id: ACC1 + input: ACC1.dedup.realign.insertsizemetric name: MEAN_INSERT_SIZE step: multiqc_picard_insertSize value: 201.813054 condition: null - header: null - id: tumor - 
input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt + id: ACC1 + input: tumor.ACC1.dedup.metrics name: PERCENT_DUPLICATION step: multiqc_picard_dups value: 0.391429 diff --git a/tests/test_data/qc_files/analysis/vep/SNV.somatic.case.svdb.all.filtered.pass.stats b/tests/test_data/qc_files/analysis/vep/SNV.somatic.case.svdb.clinical.filtered.pass.stats similarity index 100% rename from tests/test_data/qc_files/analysis/vep/SNV.somatic.case.svdb.all.filtered.pass.stats rename to tests/test_data/qc_files/analysis/vep/SNV.somatic.case.svdb.clinical.filtered.pass.stats diff --git a/tests/test_data/qc_files/multiqc_picard_HsMetrics.json b/tests/test_data/qc_files/multiqc_picard_HsMetrics.json index a7db13630..88944604b 100755 --- a/tests/test_data/qc_files/multiqc_picard_HsMetrics.json +++ b/tests/test_data/qc_files/multiqc_picard_HsMetrics.json @@ -1,5 +1,5 @@ { - "concatenated_neatlyfastraven_XXXXXX_R": { + "ACC00000A01": { "BAIT_SET": "gmslymphoid_7.1_hg19_design.bed", "GENOME_SIZE": 3101804739.0, "BAIT_TERRITORY": 1199545.0, @@ -89,7 +89,7 @@ "LIBRARY": "", "READ_GROUP": "" }, - "concatenated_easilyusefulorca_XXXXXX_R": { + "ACC00000A02": { "BAIT_SET": "gmslymphoid_7.1_hg19_design.bed", "GENOME_SIZE": 3101804739.0, "BAIT_TERRITORY": 1199545.0, diff --git a/tests/test_data/references/cadd/annotations/GRCh37_v1.6/reference/reference.fa b/tests/test_data/references/cadd/annotations/GRCh37_v1.6/reference/reference.fa new file mode 100644 index 000000000..9edff7ae6 --- /dev/null +++ b/tests/test_data/references/cadd/annotations/GRCh37_v1.6/reference/reference.fa @@ -0,0 +1,2 @@ +>1 dna:chromosome chromosome:GRCh37:1:1:249250621:1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/tests/test_data/references/cadd/annotations/GRCh38_v1.6/reference/reference.fa b/tests/test_data/references/cadd/annotations/GRCh38_v1.6/reference/reference.fa new file mode 100644 index 000000000..136e33710 --- /dev/null +++ b/tests/test_data/references/cadd/annotations/GRCh38_v1.6/reference/reference.fa @@ -0,0 +1,2 @@ +>1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/tests/test_data/references/genome/simpleRepeat.txt.gz b/tests/test_data/references/genome/simpleRepeat.txt.gz new file mode 100644 index 000000000..f2afffa11 Binary files /dev/null and b/tests/test_data/references/genome/simpleRepeat.txt.gz differ diff --git a/tests/test_data/references/gens/gnomad.genomes.r2.1.1.sites_0.05AF.vcf.gz b/tests/test_data/references/gens/gnomad.genomes.r2.1.1.sites_0.05AF.vcf.gz new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_data/references/gens/grch37_gens_male_pon_100bp.hdf5 b/tests/test_data/references/gens/grch37_gens_male_pon_100bp.hdf5 new file mode 100644 index 000000000..ad2823b48 --- /dev/null +++ b/tests/test_data/references/gens/grch37_gens_male_pon_100bp.hdf5 @@ -0,0 +1 @@ +' \ No newline at end of file diff --git a/tests/test_data/references/gens/grch37_gens_targets_preprocessed_100bp.interval_list b/tests/test_data/references/gens/grch37_gens_targets_preprocessed_100bp.interval_list new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_data/references/reference.json b/tests/test_data/references/reference.json index f851bee0a..14c201118 100644 --- a/tests/test_data/references/reference.json +++ b/tests/test_data/references/reference.json @@ -2,23 +2,25 @@ "reference": { "reference_genome": "tests/test_data/references/genome/human_g1k_v37_decoy.fasta", "dbsnp": "tests/test_data/references/variants/dbsnp_grch37_b138.vcf.gz", - 
"1kg_snps_all": "tests/test_data/references/variants/1k_genome_wgs_p1_v3_all_sites.vcf.gz", - "1kg_snps_high": "tests/test_data/references/variants/1kg_phase1_snps_high_confidence_b37.vcf.gz", - "1kg_known_indel": "tests/test_data/references/variants/1kg_known_indels_b37.vcf.gz", + "vcf_1kg": "tests/test_data/references/variants/1k_genome_wgs_p1_v3_all_sites.vcf.gz", + "hc_vcf_1kg": "tests/test_data/references/variants/1kg_phase1_snps_high_confidence_b37.vcf.gz", + "known_indel_1kg": "tests/test_data/references/variants/1kg_known_indels_b37.vcf.gz", "mills_1kg": "tests/test_data/references/variants/mills_1kg_index.vcf.gz", "gnomad_variant": "tests/test_data/reference/variants/gnomad.genomes.r2.1.1.sites.vcf.bgz", "cosmic": "tests/test_data/references/variants/cosmic_coding_muts_v89.vcf.gz", - "vep": "tests/test_data/references/vep/", - "refflat": "tests/test_data/references/genome/refseq.flat", - "refGene": "tests/test_data/references/genome/refGene.txt", - "wgs_calling_interval": "tests/test_data/references/genome/wgs_calling_regions.v1", + "vep_dir": "tests/test_data/references/vep/", + "refgene_flat": "tests/test_data/references/genome/refseq.flat", + "refgene_txt": "tests/test_data/references/genome/refGene.txt", + "wgs_calling_regions": "tests/test_data/references/genome/wgs_calling_regions.v1", "genome_chrom_size": "tests/test_data/references/genome/hg19.chrom.sizes", - "exon_bed": "tests/test_data/references/genome/refseq.flat.bed", + "refgene_bed": "tests/test_data/references/genome/refseq.flat.bed", "delly_exclusion": "tests/test_data/references/genome/delly_exclusion.tsv", "delly_exclusion_converted": "tests/test_data/references/genome/delly_exclusion_converted.tsv", "delly_mappability": "tests/test_data/references/genome/delly_mappability.gz", - "ascat_gccorrection": "tests/test_data/references/genome/GRCh37_SnpGcCorrections.tsv", - "ascat_chryloci": "tests/test_data/references/genome/GRCh37_Y.loci", - "clinvar": "tests/test_data/references/genome/clinvar.vcf.gz" + "ascat_gc_correction": "tests/test_data/references/genome/GRCh37_SnpGcCorrections.tsv", + "ascat_chr_y_loci": "tests/test_data/references/genome/GRCh37_Y.loci", + "clinvar": "tests/test_data/references/genome/clinvar.vcf.gz", + "cadd_snv": "tests/test_data/references/variants/hg19.cadd_snv.tsv.gz", + "simple_repeat": "tests/test_data/references/genome/simpleRepeat.txt.gz" } } diff --git a/tests/test_data/references/variants/cancer_germline_snv_variants.vcf.gz b/tests/test_data/references/variants/cancer_germline_snv_variants.vcf.gz new file mode 100644 index 000000000..bc9c176d9 Binary files /dev/null and b/tests/test_data/references/variants/cancer_germline_snv_variants.vcf.gz differ diff --git a/tests/test_data/references/variants/cancer_germline_snv_variants.vcf.gz.tbi b/tests/test_data/references/variants/cancer_germline_snv_variants.vcf.gz.tbi new file mode 100644 index 000000000..b9e0990fa Binary files /dev/null and b/tests/test_data/references/variants/cancer_germline_snv_variants.vcf.gz.tbi differ diff --git a/tests/test_data/references/variants/cancer_somatic_snv_variants.vcf.gz b/tests/test_data/references/variants/cancer_somatic_snv_variants.vcf.gz new file mode 100644 index 000000000..bc9c176d9 Binary files /dev/null and b/tests/test_data/references/variants/cancer_somatic_snv_variants.vcf.gz differ diff --git a/tests/test_data/references/variants/cancer_somatic_snv_variants.vcf.gz.tbi b/tests/test_data/references/variants/cancer_somatic_snv_variants.vcf.gz.tbi new file mode 100644 index 
000000000..b9e0990fa Binary files /dev/null and b/tests/test_data/references/variants/cancer_somatic_snv_variants.vcf.gz.tbi differ
diff --git a/tests/test_data/references/variants/hg19.cadd_snv.tsv.gz b/tests/test_data/references/variants/hg19.cadd_snv.tsv.gz new file mode 100644 index 000000000..c86aa5429 Binary files /dev/null and b/tests/test_data/references/variants/hg19.cadd_snv.tsv.gz differ
diff --git a/tests/test_data/references/variants/hg19.cadd_snv.tsv.gz.tbi b/tests/test_data/references/variants/hg19.cadd_snv.tsv.gz.tbi new file mode 100644 index 000000000..99bc29274 Binary files /dev/null and b/tests/test_data/references/variants/hg19.cadd_snv.tsv.gz.tbi differ
diff --git a/tests/test_data/references/variants/somatic_sv_variants.vcf.gz b/tests/test_data/references/variants/somatic_sv_variants.vcf.gz new file mode 100644 index 000000000..d8b87b484 Binary files /dev/null and b/tests/test_data/references/variants/somatic_sv_variants.vcf.gz differ
diff --git a/tests/test_data/references/variants/somatic_sv_variants.vcf.gz.tbi b/tests/test_data/references/variants/somatic_sv_variants.vcf.gz.tbi new file mode 100644 index 000000000..c409386c0 Binary files /dev/null and b/tests/test_data/references/variants/somatic_sv_variants.vcf.gz.tbi differ
diff --git a/tests/test_data/vcf_tables/test_input.txt b/tests/test_data/vcf_tables/test_input.txt deleted file mode 100644 index d9ef40971..000000000 --- a/tests/test_data/vcf_tables/test_input.txt +++ /dev/null @@ -1,5 +0,0 @@ -Gene_ID COSMIC_ID Mutation_ID Variant_type AA_Change AA_HGVS Average_AF% -AKT1 COSM33765 COSV62571334 SNP p.E17K p.Glu17Lys 0.118 -BRAF COSM476 COSV56056643 SNP p.V600E p.Val600Glu 0.104 -EGFR COSM6224 COSV51765161 SNP p.L858R p.Leu858Arg 0.106 -EGFR COSM6240 COSV51765492 SNP p.T790M p.Thr790Met 0.116
diff --git a/tests/test_data/vcf_tables/test_reference.vcf.gz b/tests/test_data/vcf_tables/test_reference.vcf.gz deleted file mode 100644 index 70800d9b9..000000000 Binary files a/tests/test_data/vcf_tables/test_reference.vcf.gz and /dev/null differ
diff --git a/tests/test_data/vcf_tables/test_reference.vcf.gz.tbi b/tests/test_data/vcf_tables/test_reference.vcf.gz.tbi deleted file mode 100644 index f1683a0bf..000000000 Binary files a/tests/test_data/vcf_tables/test_reference.vcf.gz.tbi and /dev/null differ
diff --git a/tests/test_data/vcfs/SNV.germline.sample.dnascope.vcf b/tests/test_data/vcfs/SNV.germline.sample.dnascope.vcf new file mode 100644 index 000000000..e417c696b --- /dev/null +++ b/tests/test_data/vcfs/SNV.germline.sample.dnascope.vcf @@ -0,0 +1,113 @@
+##fileformat=VCFv4.2
+##[... 108 FILTER/FORMAT/INFO/contig header definitions whose angle-bracketed content is not legible in this excerpt ...]
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT TUMOR
+1 10327 . T C 0.02 LowQual AC=1;AF=0.5;AN=2;BaseQRankSum=-0.000;ClippingRankSum=-0.000;DP=10;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.5;MQ=31.16;MQRankSum=-1.150;QD=0.00;ReadPosRankSum=-0.319;SOR=1.022 GT:AD:DP:GQ:PL 0/1:3,1:4:4:4,0,111
+2 10329 . AC A 64.80 . AC=1;AF=0.5;AN=2;DP=8;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.5;MQ=32.79;QD=21.60;SOR=1.179 GT:AD:DP:GQ:PL 0/1:0,3:3:18:102,0,18
+20 13417 . C CGAGA -0.00 LowQual AC=0;AF=0;AN=2;DP=46;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=30.92;SOR=0.307 GT:AD:DP:GQ:PL 0/0:46,0:46:99:0,139,2084
diff --git a/tests/test_data/vcfs/SNV.germline.sample.dnascope.vcf.gz b/tests/test_data/vcfs/SNV.germline.sample.dnascope.vcf.gz new file mode 100644 index 000000000..420d2d715 Binary files /dev/null and b/tests/test_data/vcfs/SNV.germline.sample.dnascope.vcf.gz differ
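The `read_vcf_file` helper that the reworked `tests/utils/test_utils.py` imports from `BALSAMIC.utils.io` further down in this patch is presumably exercised against the `SNV.germline.sample.dnascope.vcf` fixture added above. The helper's implementation is not part of the diff, so the sketch below is an assumption: a minimal reader (signature and return type hypothetical) that would handle both the plain and the bgzipped variant of the fixture.

```python
# Sketch only: BALSAMIC.utils.io.read_vcf_file is imported by the new tests,
# but its implementation is not shown in this patch; signature assumed.
import gzip
from pathlib import Path
from typing import List


def read_vcf_file(vcf_file_path: str) -> List[str]:
    """Return the non-empty lines of a plain or bgzipped VCF file."""
    path = Path(vcf_file_path)
    opener = gzip.open if path.suffix == ".gz" else open
    with opener(path, "rt") as vcf:
        return [line.strip() for line in vcf if line.strip()]


# Against the fixture above, the first returned line would be the
# "##fileformat=VCFv4.2" header and the last one the chromosome 20 record.
read_vcf_file("tests/test_data/vcfs/SNV.germline.sample.dnascope.vcf")
```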
diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 44dccc0ff..c9b84f7c4 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -1,23 +1,23 @@ from unittest import mock import snakemake +from BALSAMIC.constants.analysis import AnalysisWorkflow from BALSAMIC.utils.cli import get_snakefile MOCKED_OS_ENVIRON = "os.environ" -def test_workflow_tumor_normal( - tumor_normal_config, sentieon_install_dir, sentieon_license +def test_workflow_tumor_only_tga_hg19( + tumor_only_config, sentieon_install_dir, sentieon_license ): - # GIVEN a sample config dict and snakefile - workflow = "paired" - reference_genome = "hg19" + # GIVEN a sample config dict and a snakefile + analysis_type = "single" analysis_workflow = "balsamic" - snakefile = get_snakefile(workflow, analysis_workflow, reference_genome) - config_json = tumor_normal_config + snakefile = get_snakefile(analysis_type, analysis_workflow) + config_json = tumor_only_config - # WHEN invoking snakemake module with dryrun option - # THEN it should return true + # WHEN invoking snakemake module with dry run option + # THEN the snakemake workflow for TGA, hg19-tumor-only should run successfully.
with mock.patch.dict( MOCKED_OS_ENVIRON, { @@ -48,83 +49,78 @@ def test_workflow_tumor_only(tumor_only_config, sentieon_install_dir, sentieon_l assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) -def test_workflow_qc_tumor_only(tumor_only_config): - - # GIVEN a sample config dict and snakefile - workflow = "single" - reference_genome = "hg19" - analysis_workflow = "balsamic-qc" - snakefile = get_snakefile(workflow, analysis_workflow, reference_genome) - config_json = tumor_only_config - - # WHEN invoking snakemake module with dryrun option - # THEN it should return true - with mock.patch.dict( - MOCKED_OS_ENVIRON, - ): - assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) - - -def test_workflow_qc_tumor_only_canfam(tumor_only_config): - - # GIVEN a sample config dict and snakefile - workflow = "single" - reference_genome = "canfam3" - analysis_workflow = "balsamic-qc" - snakefile = get_snakefile(workflow, analysis_workflow, reference_genome) - config_json = tumor_only_config +def test_workflow_tumor_only_wgs_hg19( + tumor_only_wgs_config, sentieon_install_dir, sentieon_license +): + # GIVEN a sample config dict and a snakefile + analysis_type = "single" + analysis_workflow = "balsamic" + snakefile = get_snakefile(analysis_type, analysis_workflow) + config_json = tumor_only_wgs_config - # WHEN invoking snakemake module with dryrun option - # THEN it should return true + # WHEN invoking snakemake module with dry run option + # THEN the snakemake workflow for WGS, hg19-tumor-only should run successfully. with mock.patch.dict( MOCKED_OS_ENVIRON, + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, ): assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) -def test_workflow_qc_normal(tumor_normal_config): - # GIVEN a sample config dict and snakefile - workflow = "paired" - reference_genome = "hg19" - analysis_workflow = "balsamic-qc" - snakefile = get_snakefile(workflow, analysis_workflow, reference_genome) - config_json = tumor_normal_config +def test_workflow_tumor_normal_wgs_hg19( + tumor_normal_wgs_config, sentieon_install_dir, sentieon_license +): + # GIVEN a sample config dict and a snakefile + analysis_type = "paired" + analysis_workflow = "balsamic" + snakefile = get_snakefile(analysis_type, analysis_workflow) + config_json = tumor_normal_wgs_config - # WHEN invoking snakemake module with dryrun option - # THEN it should return true + # WHEN invoking snakemake module with dry run option + # THEN the snakemake workflow for WGS, hg19-tumor-normal should run successfully. 
with mock.patch.dict( MOCKED_OS_ENVIRON, + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, ): assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) -def test_workflow_qc_normal_canfam3(tumor_normal_config): - # GIVEN a sample config dict and snakefile - workflow = "paired" - reference_genome = "canfam3" - analysis_workflow = "balsamic-qc" - snakefile = get_snakefile(workflow, analysis_workflow, reference_genome) - config_json = tumor_normal_config +def test_workflow_qc_tumor_only_hg19( + tumor_only_config_qc, sentieon_install_dir, sentieon_license +): + # GIVEN a sample config dict and a snakefile + workflow = "single" + snakefile = get_snakefile(workflow, AnalysisWorkflow.BALSAMIC_QC) + config_json = tumor_only_config_qc - # WHEN invoking snakemake module with dryrun option - # THEN it should return true + # WHEN invoking snakemake module with dry run option + # THEN the snakemake workflow for QC, hg19-tumor-only should run successfully. with mock.patch.dict( MOCKED_OS_ENVIRON, + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, ): assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) -def test_workflow_sentieon( - tumor_normal_wgs_config, - tumor_only_wgs_config, - sentieon_install_dir, - sentieon_license, +def test_workflow_qc_tumor_normal_hg19( + tumor_normal_config_qc, sentieon_install_dir, sentieon_license ): - # GIVEN a sample config dict and snakefile - workflows = [("single", tumor_only_wgs_config), ("paired", tumor_normal_wgs_config)] + # GIVEN a sample config dict and a snakefile + workflow = "paired" + snakefile = get_snakefile(workflow, AnalysisWorkflow.BALSAMIC_QC) + config_json = tumor_normal_config_qc - # WHEN invoking snakemake module with dryrun option - # THEN it should return true + # WHEN invoking snakemake module with dry run option + # THEN the snakemake workflow for QC, hg19-tumor-normal should run successfully. with mock.patch.dict( MOCKED_OS_ENVIRON, { @@ -132,12 +128,4 @@ def test_workflow_sentieon( "SENTIEON_INSTALL_DIR": sentieon_install_dir, }, ): - for workflow in workflows: - analysis_type = workflow[0] - config = workflow[1] - reference_genome = "hg19" - analysis_workflow = "balsamic" - snakefile = get_snakefile( - analysis_type, analysis_workflow, reference_genome - ) - assert snakemake.snakemake(snakefile, configfiles=[config], dryrun=True) + assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True)
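All of the rewritten workflow tests above build the snakefile with the two-argument form of `get_snakefile`, dropping the old `reference_genome` parameter (the canfam3 QC variants disappear accordingly). A minimal sketch of the new call pattern, using only names visible in the diff:

```python
# Values mirror the rewritten tests; only the analysis type and the workflow
# are needed now that get_snakefile no longer takes a reference_genome argument.
from BALSAMIC.constants.analysis import AnalysisWorkflow
from BALSAMIC.utils.cli import get_snakefile

snakefile_tga: str = get_snakefile("single", "balsamic")
snakefile_qc: str = get_snakefile("paired", AnalysisWorkflow.BALSAMIC_QC)
```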
diff --git a/tests/utils/test_analysis.py b/tests/utils/test_analysis.py new file mode 100644 index 000000000..adce66bc0 --- /dev/null +++ b/tests/utils/test_analysis.py @@ -0,0 +1,37 @@ +"""Test Balsamic analysis utility methods.""" +from typing import Dict, Any, List + +from BALSAMIC.models.cache import CacheConfig +from BALSAMIC.models.snakemake import SingularityBindPath +from BALSAMIC.utils.analysis import ( + get_singularity_bind_paths, + get_cache_singularity_bind_paths, +) + + +def test_get_singularity_bind_paths(sample_config: Dict[str, Any]): + """Test singularity bind paths retrieval.""" + + # GIVEN a sample config dictionary + + # WHEN extracting the singularity bind paths + bind_paths: List[SingularityBindPath] = get_singularity_bind_paths(sample_config) + + # THEN a list of singularity bind paths should be returned + assert bind_paths + assert isinstance(bind_paths[0], SingularityBindPath) + + +def test_get_cache_singularity_bind_paths(cache_config: CacheConfig): + """Test singularity bind paths retrieval for Balsamic init workflow.""" + + # GIVEN a cache config object + + # WHEN extracting the singularity bind paths for balsamic init workflow + bind_paths: List[SingularityBindPath] = get_cache_singularity_bind_paths( + cache_config + ) + + # THEN a list of cache singularity bind paths should be returned + assert bind_paths + assert isinstance(bind_paths[0], SingularityBindPath)
diff --git a/tests/utils/test_cache.py b/tests/utils/test_cache.py new file mode 100644 index 000000000..b839c01af --- /dev/null +++ b/tests/utils/test_cache.py @@ -0,0 +1,17 @@ +"""Test init utility methods.""" +from typing import Dict + +from BALSAMIC.constants.cache import CacheVersion +from BALSAMIC.utils.cache import get_containers + + +def test_get_containers_develop(develop_containers: Dict[str, str]): + """Test containers retrieval given a develop cache version.""" + + # GIVEN a cache version + + # WHEN getting the containers dictionary + containers: Dict[str, str] = get_containers(CacheVersion.DEVELOP) + + # THEN the version associated containers should be returned + assert containers == develop_containers
diff --git a/tests/utils/test_qc_metrics.py b/tests/utils/test_metrics.py similarity index 85% rename from tests/utils/test_qc_metrics.py rename to tests/utils/test_metrics.py index 46657301e..e947fe0aa 100644 --- a/tests/utils/test_qc_metrics.py +++ b/tests/utils/test_metrics.py @@ -1,4 +1,4 @@ -from BALSAMIC.utils.qc_metrics import validate_qc_metrics +from BALSAMIC.utils.metrics import validate_qc_metrics def test_validate_qc_metrics(qc_extracted_metrics):
diff --git a/tests/utils/test_models.py b/tests/utils/test_models.py deleted file mode 100644 index 9d85d0a92..000000000 --- a/tests/utils/test_models.py +++ /dev/null @@ -1,568 +0,0 @@ -import copy -import os -import pytest - -from pathlib import Path -from pydantic import ValidationError - -from BALSAMIC.utils.models import ( - VCFAttributes, - VarCallerFilter, - QCModel, - VarcallerAttribute, - AnalysisModel,
- SampleInstanceModel, - ReferenceUrlsModel, - ReferenceMeta, - BalsamicWorkflowConfig, - UMIParamsCommon, - UMIParamsUMIextract, - UMIParamsConsensuscall, - UMIParamsTNscope, - ParamsCommon, - ParamsVardict, - ParamsVEP, - MetricModel, - MetricConditionModel, - MetricValidationModel, - AnalysisPonModel, -) - - -def test_referencemeta(): - """test ReferenceMeta for correctly building model""" - # GIVEN a reference model - reference_files = { - "basedir": "basedir", - "reference_genome": { - "url": "gs://some_path/b37/human_g1k_v37.fasta.gz", - "file_type": "fasta", - "gzip": True, - "genome_version": "hg19", - "output_file": "genome.fa", - "output_path": "genome", - }, - "dbsnp": { - "url": "gs://some_path/b37/dbsnp_138.b37.vcf.gz", - "file_type": "fasta", - "gzip": True, - "genome_version": "hg19", - "output_file": "dbsnp.vcf", - }, - } - - # WHEN build the model - build_model = ReferenceMeta.parse_obj(reference_files) - - # THEN model should have correct attributes - assert build_model.reference_genome.genome_version == "hg19" - assert build_model.dbsnp.genome_version == "hg19" - assert build_model.reference_genome.get_output_file == "basedir/genome/genome.fa" - - -def test_referenceurlsmodel_build_model(): - """test ReferenceUrlsModel for correctly building the model""" - # GIVEN a reference model - dummy_output_file = "some_random_file" - dummy_output_path = "some_path" - actual_path = Path(dummy_output_path, dummy_output_file).as_posix() - - dummy_reference = { - "url": "gs://domain/file_name", - "file_type": "fasta", - "gzip": True, - "genome_version": "hg19", - "output_file": dummy_output_file, - "output_path": dummy_output_path, - } - - # WHEN building the model - built_model = ReferenceUrlsModel.parse_obj(dummy_reference) - - # THEN model should have correct attributes - assert built_model.url.scheme == "gs" - assert built_model.get_output_file == actual_path - - -def test_referenceurlsmodel_validate_file_type(): - """test ReferenceUrlsModel for validating file type""" - # GIVEN a reference model - dummy_output_file = "some_random_file" - dummy_output_path = "some_path" - actual_path = Path(dummy_output_path, dummy_output_file).as_posix() - - dummy_reference = { - "url": "gs://domain/file_name", - "file_type": "wrong_type", - "gzip": True, - "genome_version": "hg19", - "output_file": dummy_output_file, - "output_path": dummy_output_path, - } - - # WHEN building the model - - # THEN model raise error on validation - with pytest.raises(ValidationError) as excinfo: - built_model = ReferenceUrlsModel.parse_obj(dummy_reference) - assert "not a valid reference file format" in excinfo.value - - -def test_referenceurlsmodel_write_md5(tmp_path_factory): - """test ReferenceUrlsModel for writing md5 of the output file""" - # GIVEN a reference model - dummy_output_file = "some_random_file" - dummy_output_path = tmp_path_factory.mktemp("some_path") - Path(dummy_output_path, dummy_output_file).write_bytes(os.urandom(8196)) - - actual_md5_file = Path(dummy_output_path, dummy_output_file + ".md5") - - dummy_reference = { - "url": "gs://domain/file_name", - "file_type": "fasta", - "gzip": True, - "genome_version": "hg19", - "output_file": dummy_output_file, - "output_path": dummy_output_path.as_posix(), - } - - # WHEN building the model - built_model = ReferenceUrlsModel.parse_obj(dummy_reference) - - # THEN when md5 of the file should exist - built_model.write_md5 - assert actual_md5_file.is_file() - - -def test_referenceurlsmodel_write_md5_no_output_file(tmp_path_factory): - """test 
ReferenceUrlsModel for failing to write md5 if outputfile doesn't exist""" - # GIVEN a reference model - dummy_output_file = "some_random_file" - dummy_output_path = tmp_path_factory.mktemp("some_path") - - actual_md5_file = Path(dummy_output_path, dummy_output_file + ".md5") - - dummy_reference = { - "url": "gs://domain/file_name", - "file_type": "fasta", - "gzip": True, - "genome_version": "hg19", - "output_file": dummy_output_file, - "output_path": dummy_output_path.as_posix(), - } - - # WHEN building the model - built_model = ReferenceUrlsModel.parse_obj(dummy_reference) - - # THEN when md5 of the file should exist - with pytest.raises(FileNotFoundError) as excinfo: - built_model.write_md5 - assert "file does not exist" in excinfo.value - - -def test_referenceurlsmodel_validate_genome_version(): - """test ReferenceUrlsModel for validating genome version""" - # GIVEN a reference model - dummy_output_file = "some_random_file" - dummy_output_path = "some_path" - actual_path = Path(dummy_output_path, dummy_output_file).as_posix() - - dummy_reference = { - "url": "gs://domain/file_name", - "file_type": "fasta", - "gzip": True, - "genome_version": "wrong_genome", - "output_file": dummy_output_file, - "output_path": dummy_output_path, - } - - with pytest.raises(ValidationError) as excinfo: - # WHEN building the model - built_model = ReferenceUrlsModel.parse_obj(dummy_reference) - - # THEN model raise error on validation - assert "not a valid genome version" in excinfo.value - - -def test_vcfattributes(): - """test VCFAttributes model for correct validation""" - - # GIVEN a VCF attribute - dummy_attribute = { - "tag_value": 5.0, - "filter_name": "dummy_filter_name", - "field": "INFO", - } - - # WHEN building the model - dummy_attribute_built = VCFAttributes(**dummy_attribute) - - # THEN assert values can be reterived currently - assert dummy_attribute_built.tag_value == 5.0 - assert dummy_attribute_built.field == "INFO" - assert dummy_attribute_built.filter_name == "dummy_filter_name" - - -def test_varcallerfilter(): - """test required VarCallerFilters for being set correctly""" - - # GIVEN a VarCallerFilter - dummy_varcaller = { - "AD": {"tag_value": 5.0, "filter_name": "dummy_alt_depth", "field": "INFO"}, - "DP": {"tag_value": 100.0, "filter_name": "dummy_depth", "field": "INFO"}, - "pop_freq": { - "tag_value": 0.005, - "filter_name": "dummy_pop_freq", - "field": "INFO", - }, - "varcaller_name": "dummy_varcaller", - "filter_type": "dummy_ffpe_filter", - "analysis_type": "dummy_tumor_only", - "description": "dummy description of this filter", - } - - # WHEN building the model - dummy_varcaller_filter = VarCallerFilter(**dummy_varcaller) - - # THEN assert required values are set - assert dummy_varcaller_filter.AD.tag_value == 5.0 - assert dummy_varcaller_filter.DP.tag_value == 100.0 - assert dummy_varcaller_filter.analysis_type == "dummy_tumor_only" - - -def test_qc_model(): - # GIVEN valid input arguments - # THEN we can successully create a config dict - valid_args = { - "umi_trim": True, - "min_seq_length": 25, - "umi_trim_length": 5, - "n_base_limit": 50, - } - assert QCModel.parse_obj(valid_args) - - -def test_varcaller_attribute(): - # GIVEN valid input arguments - valid_args = {"mutation": "somatic", "type": "SNV"} - # THEN we can successully create a config dict - assert VarcallerAttribute.parse_obj(valid_args) - # GIVEN invalid input arguments - invalid_args = {"mutation": "strange", "type": "unacceptable"} - # THEN should trigger ValueError - with pytest.raises(ValueError) as 
excinfo: - VarcallerAttribute.parse_obj(invalid_args) - assert "not a valid argument" in excinfo.value - - -def test_analysis_model(): - # GIVEN valid input arguments - valid_args = { - "case_id": "case_id", - "gender": "female", - "analysis_type": "paired", - "sequencing_type": "targeted", - "analysis_dir": "tests/test_data", - "analysis_workflow": "balsamic-umi", - } - # THEN we can successully create a config dict - assert AnalysisModel.parse_obj(valid_args) - - # GIVEN invalid input arguments - invalid_args = { - "case_id": "case_id", - "gender": "unknown", - "analysis_type": "odd", - "sequencing_type": "wrong", - "analysis_dir": "tests/test_data", - "analysis_workflow": "umi", - } - # THEN should trigger ValueError - with pytest.raises(ValueError) as excinfo: - AnalysisModel.parse_obj(invalid_args) - assert "not supported" in excinfo.value - - -def test_sample_instance_model(): - # GIVEN valid input arguments - valid_args = {"file_prefix": "S2_R", "type": "normal", "sample_name": "S2"} - # THEN we can successully create a config dict - assert SampleInstanceModel.parse_obj(valid_args) - - # GIVEN invalid input arguments - invalid_args = { - "file_prefix": "S2_R", - "type": "fungal", - } - # THEN should trigger ValueError - with pytest.raises(ValueError) as excinfo: - SampleInstanceModel.parse_obj(invalid_args) - assert "not supported" in excinfo.value - - -def test_umiparams_common(): - """test UMIParamsCommon model for correct validation""" - - # GIVEN a UMI workflow common params - test_commonparams = { - "align_header": "test_header_name", - "align_intbases": 100, - "filter_tumor_af": 0.01, - } - # WHEN building the model - test_commonparams_built = UMIParamsCommon(**test_commonparams) - # THEN assert values - assert test_commonparams_built.align_header == "test_header_name" - assert test_commonparams_built.filter_tumor_af == 0.01 - assert test_commonparams_built.align_intbases == 100 - - -def test_umiparams_umiextract(): - """test UMIParamsUMIextract model for correct validation""" - # GIVEN umiextract params - test_umiextractparams = {"read_structure": "['mode', 'r1,r2']"} - - # WHEN building the model - test_umiextractparams_built = UMIParamsUMIextract(**test_umiextractparams) - - # THEN assert values - assert test_umiextractparams_built.read_structure == "['mode', 'r1,r2']" - - -def test_umiparams_consensuscall(): - """test UMIParamsConsensuscall model for correct validation""" - - # GIVEN consensuscall params - test_consensuscall = { - "align_format": "BAM", - "filter_minreads": "6,3,3", - "tag": "XZ", - } - - # WHEN building the model - test_consensuscall_built = UMIParamsConsensuscall(**test_consensuscall) - - # THEN assert values - assert test_consensuscall_built.align_format == "BAM" - assert test_consensuscall_built.filter_minreads == "6,3,3" - assert test_consensuscall_built.tag == "XZ" - - -def test_umiparams_tnscope(): - """test UMIParamsTNscope model for correct validation""" - - # GIVEN tnscope params - test_tnscope_params = { - "algo": "algoname", - "init_tumorLOD": 0.5, - "min_tumorLOD": 6, - "error_rate": 5, - "prunefactor": 3, - "padding": 30, - "disable_detect": "abc", - } - - # WHEN building the model - test_tnscope_params_built = UMIParamsTNscope(**test_tnscope_params) - - # THEN assert values - assert test_tnscope_params_built.algo == "algoname" - assert test_tnscope_params_built.init_tumorLOD == 0.5 - assert test_tnscope_params_built.min_tumorLOD == 6 - assert test_tnscope_params_built.error_rate == 5 - assert test_tnscope_params_built.prunefactor == 3 - 
assert test_tnscope_params_built.disable_detect == "abc" - assert test_tnscope_params_built.padding == 30 - - -def test_params_vardict(): - """test UMIParamsVardict model for correct validation""" - - # GIVEN vardict params - test_vardict_params = { - "allelic_frequency": 0.01, - "max_pval": 0.5, - "max_mm": 2, - "column_info": "-a 1 -b 2 -c 3", - } - - # WHEN building the model - test_vardict_built = ParamsVardict(**test_vardict_params) - - # THEN assert values - assert test_vardict_built.allelic_frequency == 0.01 - assert test_vardict_built.max_pval == 0.5 - assert test_vardict_built.max_mm == 2 - assert test_vardict_built.column_info == "-a 1 -b 2 -c 3" - - -def test_params_vep(): - """test UMIParamsVEP model for correct validation""" - - # GIVEN vardict params - test_vep = {"vep_filters": "all defaults params"} - - # WHEN building the model - test_vep_built = ParamsVEP(**test_vep) - - # THEN assert values - assert test_vep_built.vep_filters == "all defaults params" - - -def test_metric_condition_model(): - """test MetricConditionModel attributes parsing""" - - # GIVEN input attributes - metric_condition = {"norm": "gt", "threshold": 1} - - # WHEN building the metric condition model - metrics_model = MetricConditionModel(**metric_condition) - - # THEN assert retrieved values from the created model - assert metrics_model.dict().items() == metric_condition.items() - - -def test_metric_model_pass_validation(): - """test MetricModel attributes parsing""" - - # GIVEN input attributes - metrics = { - "header": None, - "id": "tumor", - "input": "concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric", - "name": "MEDIAN_TARGET_COVERAGE", - "step": "multiqc_picard_HsMetrics", - "value": 2393.0, - "condition": {"norm": "gt", "threshold": 1000.0}, - } - - # WHEN building the metric model - metric_model = MetricModel(**metrics) - - # THEN assert retrieved values from the created model - assert metric_model.dict().items() == metrics.items() - - -def test_metric_model_duplication_refactoring(): - """test MetricModel duplications param refactoring""" - - # GIVEN input attributes - metrics = { - "header": None, - "id": "tumor", - "input": "concatenated_tumor_XXXXXX_R_1_fastqc.zip", - "name": "FastQC_mqc-generalstats-fastqc-percent_duplicates", - "step": "multiqc_general_stats", - "value": 21.517800000611373, - "condition": None, - } - - # WHEN building the metric model - metric_model = MetricModel(**metrics) - - # THEN assert retrieved values from the created model - assert metric_model.name == "PERCENT_DUPLICATION_R1" - - -def test_metric_model_fail_validation(): - """test MetricModel behaviour for an incorrect input""" - - # GIVEN a non accepted input - invalid_input = {"header": None, "id": "tumor"} - - # THEN the model raises an error due to an incomplete input - with pytest.raises(ValueError) as input_exc: - MetricModel(**invalid_input) - assert f"field required" in str(input_exc.value) - - -def test_metric_validation_model_pass(qc_extracted_metrics): - """test MetricValidationModel attribute parsing and positive validation""" - - # WHEN building the MetricValidationModel model - model = MetricValidationModel(metrics=qc_extracted_metrics) - - # THEN assert retrieved values from the created model - assert model.dict()["metrics"] == qc_extracted_metrics - - -def test_metric_validation_model_fail(qc_extracted_metrics): - """test MetricValidationModel for an overly restrictive metric condition""" - - # GIVEN input attributes with a value that does not meet the filtering condition - metrics = 
copy.deepcopy(qc_extracted_metrics) - metrics[4]["value"] = 2.0 # GC_DROPOUT set to 2.0 (failing condition) - - # THEN check that the model filters the metric according to its norm - with pytest.raises(ValueError) as val_exc: - MetricValidationModel(metrics=metrics) - assert ( - f"QC metric {metrics[4]['name']}: {metrics[4]['value']} validation has failed. " - f"(Condition: {metrics[4]['condition']['norm']} {metrics[4]['condition']['threshold']}, ID: {metrics[4]['id']})" - in str(val_exc.value) - ) - - -def test_multiple_metric_validation_model_fail(qc_extracted_metrics): - """test MetricValidationModel for multiple metrics with failing conditions""" - - # GIVEN input attributes that does not meet the specified conditions - metrics = copy.deepcopy(qc_extracted_metrics) - metrics[4]["value"] = 2.0 # GC_DROPOUT set to 2.0 (failing condition) - metrics[8]["value"] = 0.5 # PCT_TARGET_BASES_500X set to 50% (failing condition) - - # THEN check that the model filters the metrics according to its norm - with pytest.raises(ValueError) as val_exc: - MetricValidationModel(metrics=metrics) - assert "2 validation errors for MetricValidationModel" in str(val_exc.value) - assert metrics[4]["name"] in str(val_exc.value) - assert metrics[8]["name"] in str(val_exc.value) - - -def test_metric_validation_model_norm_fail(qc_extracted_metrics): - """test MetricValidationModel ValueError raising for an operator that it is not accepted""" - - # GIVEN a metric with an incorrect norm attribute - metrics = copy.deepcopy(qc_extracted_metrics) - metrics[4]["condition"]["norm"] = "lower" - - # THEN model raises an error due to a non accepted norm - try: - MetricValidationModel(metrics=metrics) - except KeyError as key_exc: - assert metrics[4]["condition"]["norm"] in str(key_exc) - - -def test_analysis_pon_model(test_data_dir): - """Tests PON model parsing""" - - # GIVEN valid input arguments - valid_args = { - "case_id": "case_id", - "analysis_type": "pon", - "sequencing_type": "targeted", - "analysis_dir": test_data_dir, - "analysis_workflow": "balsamic", - "pon_version": "v1", - } - - # THEN we can successfully create a config dict - assert AnalysisPonModel.parse_obj(valid_args) - - # GIVEN an invalid version argument - invalid_args = { - "case_id": "case_id", - "analysis_type": "pon", - "sequencing_type": "targeted", - "analysis_dir": test_data_dir, - "analysis_workflow": "balsamic", - "pon_version": "v01", - } - - # THEN should trigger ValueError - with pytest.raises(ValueError) as excinfo: - AnalysisPonModel.parse_obj(invalid_args) - assert ( - f"The provided version {invalid_args['pon_version']} does not follow the defined syntax (v)" - in excinfo.value - )
diff --git a/tests/utils/test_pdf_report.py b/tests/utils/test_pdf_report.py new file mode 100644 index 000000000..9147549a6 --- /dev/null +++ b/tests/utils/test_pdf_report.py @@ -0,0 +1,60 @@
+"""Test utility function for PDF generation."""
+from pathlib import Path
+
+from pypdf import PdfReader
+
+from BALSAMIC.utils.pdf_report import get_table_html, html_to_pdf
+
+
+def test_get_table_html():
+    """Test table insertion in an HTML page"""
+
+    # GIVEN an HTML table and a test table name
+    html_table: str = """
+    <table>
+        <tr>
+            <th>Header 1</th>
+            <th>Header 2</th>
+        </tr>
+        <tr>
+            <td>Data 1</td>
+            <td>Data 2</td>
+        </tr>
+    </table>
+    """
+    table_name: str = "Test Table"
+
+    # WHEN adding the table to an HTML page
+    html_page: str = get_table_html(html_table=html_table, table_name=table_name)
+
+    # THEN the table HTML page should be successfully created
+    assert "<html>" in html_page
+    assert f"<h2>{table_name}</h2>" in html_page
+    assert html_table in html_page
+
+
+def test_html_to_pdf(tmp_path: Path):
+    """Test PDF file generation from HTML string."""
+
+    # GIVEN an HTML string
+    html_string: str = """
+    <html>
+        <body>
+            <h1>Hello!</h1>
+        </body>
+    </html>
+    """
+
+    # GIVEN an output PDF file
+    pdf_path: Path = Path(tmp_path, "test_pdf.pdf")
+
+    # WHEN generating the pdf file
+    html_to_pdf(html_string=html_string, pdf_path=pdf_path.as_posix())
+
+    # THEN the output PDF file should exist
+    assert pdf_path.is_file()
+
+    # THEN the output PDF file should contain the mock HTML string
+    reader: PdfReader = PdfReader(stream=pdf_path)
+    pdf_page: str = reader.pages[0].extract_text()
+    assert "Hello!" in pdf_page
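`BALSAMIC.utils.pdf_report` itself is not included in this patch, so the following is a sketch of helpers that would satisfy the two tests above, assuming a wkhtmltopdf-backed renderer such as pdfkit (the backend and the exact page markup are assumptions, not taken from the source):

```python
# Sketch only: hypothetical implementations matching the test expectations.
import pdfkit


def get_table_html(html_table: str, table_name: str) -> str:
    """Wrap an HTML table in a minimal page with a named heading."""
    return f"<html><body><h2>{table_name}</h2>{html_table}</body></html>"


def html_to_pdf(html_string: str, pdf_path: str) -> None:
    """Render an HTML string to a PDF file on disk."""
    pdfkit.from_string(html_string, pdf_path)
```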
+ + + """ + + # GIVEN an output PDF file + pdf_path: Path = Path(tmp_path, "test_pdf.pdf") + + # WHEN generating the pdf file + html_to_pdf(html_string=html_string, pdf_path=pdf_path.as_posix()) + + # THEN the output PDF file should exist + assert pdf_path.is_file() + + # THEN the output PDF file should contain the mock HTML string + reader: PdfReader = PdfReader(stream=pdf_path) + pdf_page: str = reader.pages[0].extract_text() + assert "Hello!" in pdf_page diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 0e7c50843..ff249c123 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -1,70 +1,128 @@ +"""Test helper functions.""" import json -import os +import logging import subprocess -import pytest import sys -import copy -import collections - -import shutil -from unittest import mock -import logging - from pathlib import Path +from typing import Dict, List +from unittest import mock -from BALSAMIC import __version__ as balsamic_version -from BALSAMIC.utils.exc import BalsamicError, WorkflowRunError - -from BALSAMIC.constants.common import CONTAINERS_CONDA_ENV_PATH, BIOINFO_TOOL_ENV -from BALSAMIC.constants.reference import REFERENCE_FILES - +import click +import pytest +from _pytest.logging import LogCaptureFixture +from _pytest.tmpdir import TempPathFactory + +from BALSAMIC.constants.analysis import BIOINFO_TOOL_ENV, SampleType, SequencingType +from BALSAMIC.constants.cache import CacheVersion +from BALSAMIC.constants.cluster import ClusterConfigType +from BALSAMIC.constants.constants import FileType +from BALSAMIC.constants.paths import CONTAINERS_DIR +from BALSAMIC.models.config import ConfigModel, FastqInfoModel, SampleInstanceModel from BALSAMIC.utils.cli import ( - SnakeMake, CaptureStdout, - iterdict, - get_snakefile, + check_executable, + convert_deliverables_tags, createDir, - get_config, - recursive_default_dict, - create_pon_fastq_symlink, - convert_defaultdict_to_regular_dict, - get_file_status_string, - get_from_two_key, find_file_index, - validate_fastq_pattern, - get_panel_chrom, - create_fastq_symlink, - get_fastq_bind_path, - singularity, - get_file_extension, + generate_h5, + get_analysis_fastq_files_directory, get_bioinfo_tools_version, - convert_deliverables_tags, - check_executable, + get_config_path, + get_fastq_info, + get_file_extension, + get_file_status_string, + get_panel_chrom, + get_pon_sample_list, + get_resolved_fastq_files_directory, + get_sample_list, + get_snakefile, job_id_dump_to_yaml, - generate_h5, - get_md5, - create_md5, + validate_cache_version, +) +from BALSAMIC.utils.exc import BalsamicError, WorkflowRunError +from BALSAMIC.utils.io import ( + read_json, + read_vcf_file, + read_yaml, + write_finish_file, + write_json, ) -from BALSAMIC.utils.io import read_json, write_json, read_yaml - from BALSAMIC.utils.rule import ( - get_chrom, - get_vcf, - get_sample_type, - get_picard_mrkdup, - get_variant_callers, - get_script_path, - get_result_dir, - get_threads, get_delivery_id, - get_reference_output_files, + get_fastp_parameters, + get_result_dir, get_rule_output, - get_sample_type_from_prefix, + get_sample_type_from_sample_name, + get_script_path, + get_threads, + get_variant_callers, + get_vcf, ) -from tests.helpers import Map +from BALSAMIC.utils.utils import ( + get_absolute_paths_dict, + get_relative_paths_dict, + remove_unnecessary_spaces, +) + + +def test_remove_unnecessary_spaces(): + """Tests removal of unnecessary spaces from a string.""" + + # GIVEN a string with unnecessary spaces + string: str = " 
Developing Balsamic brings me joy " + + # WHEN calling the function + formatted_string: str = remove_unnecessary_spaces(string) + + # THEN the extra spaces are removed + assert formatted_string == "Developing Balsamic brings me joy" + + +def test_relative_paths_dict(session_tmp_path: Path, reference_file: Path): + """Test return of a dictionary with relative paths to a provided base path.""" + + # GIVEN a base path and a dictionary with absolute paths + absolute_paths: Dict[str, Path] = {"reference": reference_file} + + # WHEN generating the relative paths dictionary + relative_paths: Dict[str, str] = get_relative_paths_dict( + base_path=session_tmp_path, data=absolute_paths + ) + + # THEN a dictionary with relative paths should be returned + assert relative_paths == {"reference": reference_file.name} + + +def test_absolute_paths_dict(session_tmp_path: Path, reference_file: Path): + """Test return of a dictionary with absolute paths resolved against a provided base path.""" + + # GIVEN a base path and a dictionary with relative paths + relative_paths: Dict[str, Path] = {"reference": Path(reference_file.name)} + + # WHEN generating the resolved paths dictionary + absolute_paths: Dict[str, Path] = get_absolute_paths_dict( + base_path=session_tmp_path, data=relative_paths + ) + + # THEN a dictionary with absolute paths should be returned + assert absolute_paths == {"reference": reference_file} + + +def test_get_pon_sample_dict(pon_config_dict_w_fastq: Dict): + """Tests sample PON dictionary retrieval.""" + + # GIVEN a FASTQ directory + fastq_dir_pon = pon_config_dict_w_fastq["analysis"]["fastq_path"] + # WHEN retrieving PON samples + samples: List[SampleInstanceModel] = get_pon_sample_list(fastq_dir_pon) + # THEN the samples should be retrieved from the FASTQ directory + # And match the expected structure of pre-designed PON-config + for sample_dict in samples: + assert sample_dict in pon_config_dict_w_fastq["samples"] -def test_get_variant_callers_wrong_analysis_type(tumor_normal_config): + +def test_get_variant_callers_wrong_analysis_type(tumor_normal_config: Dict): # GIVEN a wrong analysis_type wrong_analysis_type = "cohort" workflow = "BALSAMIC" @@ -73,9 +131,9 @@ def test_get_variant_callers_wrong_analysis_type(tumor_normal_config): mutation_class = "germline" # WHEN getting list of variant callers - # THEN capture error with pytest.raises(WorkflowRunError): - assert get_variant_callers( + # THEN capture error + get_variant_callers( config=tumor_normal_config, analysis_type=wrong_analysis_type, workflow_solution=workflow, @@ -85,7 +143,7 @@ def test_get_variant_callers_wrong_analysis_type(tumor_normal_config): ) -def test_get_variant_callers_wrong_workflow(tumor_normal_config): +def test_get_variant_callers_wrong_workflow(tumor_normal_config: Dict): # GIVEN a wrong workflow name wrong_workflow = "MIP" mutation_type = "SNV" @@ -93,8 +151,10 @@ def test_get_variant_callers_wrong_workflow(tumor_normal_config): sequencing_type = "wgs" analysis_type = "paired" + # WHEN getting list of variant callers with pytest.raises(WorkflowRunError): - assert get_variant_callers( + # THEN capture error + get_variant_callers( config=tumor_normal_config, analysis_type=analysis_type, workflow_solution=wrong_workflow, @@ -104,7 +164,7 @@ def test_get_variant_callers_wrong_workflow(tumor_normal_config): ) -def test_get_variant_callers_wrong_mutation_type(tumor_normal_config): +def test_get_variant_callers_wrong_mutation_type(tumor_normal_config: Dict): # GIVEN a wrong workflow name workflow = "BALSAMIC" wrong_mutation_type 
= "INDEL" @@ -113,9 +173,9 @@ def test_get_variant_callers_wrong_mutation_type(tumor_normal_config): analysis_type = "paired" # WHEN getting list of variant callers - # THEN capture error with pytest.raises(WorkflowRunError): - assert get_variant_callers( + # THEN capture error + get_variant_callers( config=tumor_normal_config, analysis_type=analysis_type, workflow_solution=workflow, @@ -125,7 +185,7 @@ def test_get_variant_callers_wrong_mutation_type(tumor_normal_config): ) -def test_get_variant_callers_wrong_mutation_class(tumor_normal_config): +def test_get_variant_callers_wrong_mutation_class(tumor_normal_config: Dict): # GIVEN a wrong workflow name workflow = "BALSAMIC" mutation_type = "SNV" @@ -134,9 +194,9 @@ def test_get_variant_callers_wrong_mutation_class(tumor_normal_config): analysis_type = "paired" # WHEN getting list of variant callers - # THEN capture error with pytest.raises(WorkflowRunError): - assert get_variant_callers( + # THEN capture error + get_variant_callers( config=tumor_normal_config, analysis_type=analysis_type, workflow_solution=workflow, @@ -146,7 +206,7 @@ def test_get_variant_callers_wrong_mutation_class(tumor_normal_config): ) -def test_get_variant_callers_wrong_sequencing_type(tumor_normal_config): +def test_get_variant_callers_wrong_sequencing_type(tumor_normal_config: Dict): # GIVEN a wrong workflow name workflow = "BALSAMIC" mutation_type = "SNV" @@ -155,9 +215,9 @@ def test_get_variant_callers_wrong_sequencing_type(tumor_normal_config): analysis_type = "paired" # WHEN getting list of variant callers - # THEN capture error with pytest.raises(WorkflowRunError): - assert get_variant_callers( + # THEN capture error + get_variant_callers( config=tumor_normal_config, analysis_type=analysis_type, workflow_solution=workflow, @@ -167,25 +227,11 @@ def test_get_variant_callers_wrong_sequencing_type(tumor_normal_config): ) -def test_get_reference_output_files(): - # GIVEN a reference genome version - genome_ver = "hg38" - file_type = "fasta" - - # WHEN getting list of valid types - fasta_files = get_reference_output_files(REFERENCE_FILES[genome_ver], file_type) - - # THEN it should return list of file - assert "Homo_sapiens_assembly38.fasta" in fasta_files - - def test_get_bioinfo_tools_version(): """Test bioinformatics tools and version extraction.""" # GIVEN a tools dictionary - bioinfo_tools: dict = get_bioinfo_tools_version( - BIOINFO_TOOL_ENV, CONTAINERS_CONDA_ENV_PATH - ) + bioinfo_tools: dict = get_bioinfo_tools_version(BIOINFO_TOOL_ENV, CONTAINERS_DIR) # THEN assert that the versions are correctly retrieved assert set(bioinfo_tools["picard"]).issubset({"2.27.1"}) @@ -196,12 +242,10 @@ def test_get_bioinfo_pip_tools_version(): """Test bioinformatics tools and version extraction for a PIP specific tool.""" # GIVEN a tools dictionary - bioinfo_tools: dict = get_bioinfo_tools_version( - BIOINFO_TOOL_ENV, CONTAINERS_CONDA_ENV_PATH - ) + bioinfo_tools: dict = get_bioinfo_tools_version(BIOINFO_TOOL_ENV, CONTAINERS_DIR) # THEN assert that the PIP specific packages are correctly retrieved - assert set(bioinfo_tools["cnvkit"]).issubset({"0.9.9"}) + assert set(bioinfo_tools["cnvkit"]).issubset({"0.9.10"}) def test_get_delivery_id(): @@ -238,8 +282,8 @@ def test_get_file_extension_get_any_ext(): def test_get_file_extension_known_ext(): # GIVEN a dummy file string with a known string - dummy_file = "hassan.fastq.gz" - actual_extension = "fastq.gz" + dummy_file = f"dummy.{FileType.FASTQ}.{FileType.GZ}" + actual_extension = f"{FileType.FASTQ}.{FileType.GZ}" # WHEN 
extracting the extension file_extension = get_file_extension(dummy_file) @@ -248,104 +292,6 @@ def test_get_file_extension_known_ext(): assert file_extension == actual_extension -def test_recursive_default_dict(): - # GIVEN a dictionary - test_dict = recursive_default_dict() - test_dict["key_1"]["key_2"] = "value_1" - - # WHEN it is recursively creates a default dictionary - # THEN the output should be a dicitionary - assert isinstance(test_dict, collections.defaultdict) - assert "key_2" in test_dict["key_1"] - - -def test_convert_defaultdict_to_regular_dict(): - # GIVEN a recursively created default dict - test_dict = recursive_default_dict() - test_dict["key_1"]["key_2"] = "value_1" - - # WHEN converting it back to normal dict - test_dict = convert_defaultdict_to_regular_dict(test_dict) - - # THEN the output type should be dict and not defaultdict - assert not isinstance(test_dict, collections.defaultdict) - assert isinstance(test_dict, dict) - assert "key_2" in test_dict["key_1"] - - -def test_iterdict(reference): - """GIVEN a dict for iteration""" - # WHEN passing dict to this function - dict_gen = iterdict(reference) - - # THEN it will create dict generator, we can iterate it, get the key, values as string - for key, value in dict_gen: - assert isinstance(key, str) - assert isinstance(value, str) - - -def test_snakemake_local(): - # GIVEN required params - snakemake_local = SnakeMake() - snakemake_local.working_dir = "this_path/snakemake" - snakemake_local.snakefile = "workflow/variantCalling_paired" - snakemake_local.configfile = "sample_config.json" - snakemake_local.run_mode = "local" - snakemake_local.use_singularity = True - snakemake_local.singularity_bind = ["path_1", "path_2"] - snakemake_local.forceall = True - - # WHEN calling the build command - shell_command = snakemake_local.build_cmd() - - # THEN it will contruct the snakemake command to run - assert isinstance(shell_command, str) - assert "workflow/variantCalling_paired" in shell_command - assert "sample_config.json" in shell_command - assert "this_path/snakemake" in shell_command - assert "--dryrun" in shell_command - assert "--forceall" in shell_command - - -def test_snakemake_slurm(): - # GIVEN required params - snakemake_slurm = SnakeMake() - snakemake_slurm.case_name = "test_case" - snakemake_slurm.working_dir = "this_path/snakemake" - snakemake_slurm.snakefile = "worflow/variantCalling_paired" - snakemake_slurm.configfile = "sample_config.json" - snakemake_slurm.run_mode = "cluster" - snakemake_slurm.cluster_config = "cluster_config.json" - snakemake_slurm.scheduler = "sbatch.py" - snakemake_slurm.log_path = "logs/" - snakemake_slurm.script_path = "scripts/" - snakemake_slurm.result_path = "results/" - snakemake_slurm.qos = "normal" - snakemake_slurm.account = "development" - snakemake_slurm.profile = "slurm" - snakemake_slurm.mail_type = "FAIL" - snakemake_slurm.mail_user = "john.doe@example.com" - snakemake_slurm.sm_opt = ("containers",) - snakemake_slurm.quiet = True - snakemake_slurm.use_singularity = True - snakemake_slurm.singularity_bind = ["path_1", "path_2"] - snakemake_slurm.run_analysis = True - - # WHEN calling the build command - shell_command = snakemake_slurm.build_cmd() - - # THEN constructing snakecommand for slurm runner - assert isinstance(shell_command, str) - assert "worflow/variantCalling_paired" in shell_command - assert "sample_config.json" in shell_command - assert "this_path/snakemake" in shell_command - assert "--dryrun" not in shell_command - assert "sbatch.py" in shell_command - assert 
"test_case" in shell_command - assert "containers" in shell_command - assert "--quiet" in shell_command - - def test_get_script_path(): # GIVEN list of scripts custom_scripts = ["refseq_sql.awk"] @@ -373,11 +319,9 @@ def test_get_snakefile(): ] # WHEN asking to see snakefile for paired - for reference_genome in ["hg19", "hg38", "canfam3"]: + for _reference_genome in ["hg19", "hg38", "canfam3"]: for analysis_type, analysis_workflow in workflow: - snakefile = get_snakefile( - analysis_type, analysis_workflow, reference_genome - ) + snakefile = get_snakefile(analysis_type, analysis_workflow) pipeline = "" if ( @@ -386,10 +330,8 @@ def test_get_snakefile(): and analysis_workflow != "balsamic-umi" ): pipeline = "BALSAMIC/workflows/balsamic.smk" - elif analysis_type == "generate_ref" and reference_genome != "canfam3": + elif analysis_type == "generate_ref": pipeline = "BALSAMIC/workflows/reference.smk" - elif analysis_type == "generate_ref" and reference_genome == "canfam3": - pipeline = "BALSAMIC/workflows/reference-canfam3.smk" elif analysis_type == "pon": pipeline = "BALSAMIC/workflows/PON.smk" elif analysis_workflow == "balsamic-qc": @@ -402,33 +344,7 @@ def test_get_snakefile(): assert Path(snakefile).is_file() -def test_get_chrom(config_files): - # Given a panel bed file - bed_file = config_files["panel_bed_file"] - actual_chrom = [ - "10", - "11", - "16", - "17", - "18", - "19", - "2", - "3", - "4", - "6", - "7", - "9", - "X", - ] - - # WHEN passing this bed file - test_chrom = get_chrom(bed_file) - - # THEN It should return list of chrom presents in that bed file - assert set(actual_chrom) == set(test_chrom) - - -def test_get_vcf(sample_config): +def test_get_vcf(sample_config: Dict): # GIVEN a sample_config dict and a variant callers list variant_callers = ["tnscope", "vardict", "manta"] @@ -453,36 +369,6 @@ def test_get_vcf_invalid_variant_caller(sample_config): get_vcf(sample_config, variant_callers, [sample_config["analysis"]["case_id"]]) -def test_get_sample_type(sample_config): - # GIVEN a sample_config dict, bio_type as tumor - bio_type = "tumor" - - # WHEN calling get_sample_type with bio_type - sample_id = get_sample_type(sample_config["samples"], bio_type) - - # THEN It should return the tumor samples id - assert sample_id == ["S1_R"] - - -def test_get_picard_mrkdup(sample_config): - # WHEN passing sample_config - picard_str = get_picard_mrkdup(sample_config) - - # THEN It will return the picard str as rmdup - assert "mrkdup" == picard_str - - -def test_get_picard_mrkdup_rmdup(sample_config): - # WHEN passing sample_config - sample_config_rmdup = copy.deepcopy(sample_config) - sample_config_rmdup["QC"]["picard_rmdup"] = True - - picard_str = get_picard_mrkdup(sample_config_rmdup) - - # THEN It will return the picard str as rmdup - assert "rmdup" == picard_str - - def test_createDir(tmp_path): # GIVEN a directory path # WHEN directory path is not yet created @@ -516,7 +402,7 @@ def test_createDir(tmp_path): assert Path(test_log_dir_created).is_dir() -def test_get_result_dir(sample_config): +def test_get_result_dir(sample_config: Dict): # WHEN a sample_config dict # GIVEN a sample_config dict # THEN get_result_dir should return result directory @@ -532,23 +418,17 @@ def test_capturestdout(): assert "".join(captured_stdout_message) == test_stdout_message -def test_get_config(): - # GIVEN the config files name - config_files = ["sample", "analysis"] - # WHEN passing file names - for config_file in config_files: - # THEN return the config files path - assert get_config(config_file) 
+def test_get_config_path(cluster_analysis_config_path: str): + """Test return of a config path given its type.""" + # GIVEN an analysis config path -def test_get_config_wrong_config(): - # GIVEN the config files name - config_file = "non_existing_config" + # WHEN retrieving the cluster analysis configuration + cluster_analysis: Path = get_config_path(ClusterConfigType.ANALYSIS) - # WHEN passing file names - # THEN return the config files path - with pytest.raises(FileNotFoundError): - assert get_config(config_file) + # THEN an analysis cluster json should be returned + assert cluster_analysis.exists() + assert cluster_analysis.as_posix() == cluster_analysis_config_path def test_write_json(tmp_path, reference): @@ -562,27 +442,27 @@ def test_write_json(tmp_path, reference): output = output_json.read_text() # THEN It will create a json file with given dict - for key, value in iterdict(reference): + for key, value in reference.items(): assert key in output assert value in output assert len(list(tmp.iterdir())) == 1 -def test_write_json_error(tmp_path): - with pytest.raises(Exception, match=r"Is a directory"): - # GIVEN a invalid dict - ref_json = {"path": "this_path", "reference": ""} - tmp = tmp_path / "tmp" - tmp.mkdir() +def test_write_json_error(tmp_path: Path): + """Test JSON write error.""" - # WHEN passing a invalid dict - # THEN It will raise the error - assert write_json(ref_json, tmp) + # GIVEN a dictionary to be saved in a JSON file + ref_json = {"case": "/path/to/case", "reference": "/path/to/reference"} + + # GIVEN a directory as the output file + with pytest.raises(Exception, match=r"Is a directory"): + # THEN an exception should be raised + write_json(ref_json, tmp_path) -def test_read_json(config_path): - """test data extraction from a BALSAMIC config JSON file""" +def test_read_json(config_path: str): + """Test data extraction from a BALSAMIC config JSON file.""" # GIVEN a config path @@ -594,7 +474,7 @@ def test_read_json(config_path): def test_read_json_error(): - """test data extraction from a BALSAMIC config JSON file for an ivalid path""" + """Test data extraction from a BALSAMIC config JSON file for an invalid path.""" # GIVEN an incorrect config path config_path = "/not/a/path" @@ -610,15 +490,15 @@ def test_read_json_error(): def test_read_yaml(metrics_yaml_path): - """test data extraction from a saved YAML file""" + """Test data extraction from a saved YAML file.""" # GIVEN an expected output n_metrics = 12 # Number of expected metric dropout_metric = { "header": None, - "id": "tumor", - "input": "concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric", + "id": "ACC1", + "input": "ACC1.dedup.realign.hsmetric", "name": "GC_DROPOUT", "step": "multiqc_picard_HsMetrics", "value": 0.027402, @@ -627,8 +507,8 @@ def test_read_yaml(metrics_yaml_path): ins_size_metric = { "header": None, - "id": "tumor", - "input": "concatenated_tumor_XXXXXX_R.sorted.insertsizemetric", + "id": "ACC1", + "input": "ACC1.dedup.realign.insertsizemetric", "name": "MEAN_INSERT_SIZE", "step": "multiqc_picard_insertSize", "value": 201.813054, @@ -637,8 +517,8 @@ def test_read_yaml(metrics_yaml_path): dups_metric = { "header": None, - "id": "tumor", - "input": "concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt", + "id": "ACC1", + "input": "tumor.ACC1.dedup.metrics", "name": "PERCENT_DUPLICATION", "step": "multiqc_picard_dups", "value": 0.391429, @@ -656,7 +536,7 @@ def test_read_yaml(metrics_yaml_path): def test_read_yaml_error(): - """test data extraction from an incorrect YAML path""" + """Test data 
extraction from an incorrect YAML path.""" # GIVEN an invalid path yaml_path = "NOT_A_PATH" @@ -665,12 +545,12 @@ def test_read_yaml_error(): try: read_yaml(yaml_path) except FileNotFoundError as file_exc: - assert f"The YAML file {yaml_path} was not found." in str(file_exc) + assert f"The YAML file {yaml_path} was not found" in str(file_exc) -def test_get_threads(config_files): +def test_get_threads(cluster_analysis_config_path: str): # GIVEN cluster config file and rule name - cluster_config = json.load(open(config_files["cluster_json"], "r")) + cluster_config = json.load(open(cluster_analysis_config_path, "r")) rule_name = "sentieon_align_sort" # WHEN passing cluster_config and rule_name @@ -690,37 +570,6 @@ def test_get_file_status_string_file_exists(tmpdir): assert "Found" in result[0].value_no_colors -def test_get_file_status_string_file_not_exist(): - # GIVEN an existing file and condition_str False - file_not_exist = "some_random_path/dummy_non_existing_file" - - # WHEN checking for file string - result = get_file_status_string(str(file_not_exist)) - - # THEN it should not return empty str - assert "missing" in result[0].value_no_colors - - -def test_get_from_two_key(): - # GIVEN a dictionary with two keys that each have list of values - input_dict = { - "key_1": ["key_1_value_1", "key_1_value_2"], - "key_2": ["key_2_value_1", "key_2_value_2"], - } - - # WHEN knowing the key_1_value_2 from key_1, return key_2_value_2 from key_2 - result = get_from_two_key( - input_dict, - from_key="key_1", - by_key="key_2", - by_value="key_1_value_2", - default=None, - ) - - # THEN retrun value should be key_2_value_2 and not None - assert result == "key_2_value_2" - - def test_find_file_index(tmpdir): # GIVEN an existing bam file and its bai index file bam_dir = tmpdir.mkdir("temporary_path") @@ -743,112 +592,6 @@ def test_find_file_index(tmpdir): assert str(bai_file_2) in result -def test_singularity_shellcmd(balsamic_cache): - """test singularity shell cmd""" - - # GIVEN a dummy command - dummy_command = "ls this_path" - dummy_path_1 = "this_path/path1" - dummy_path_2 = "this_path/path2" - correct_shellcmd = "exec --bind {} --bind {} ls this_path".format( - dummy_path_1, dummy_path_2 - ) - singularity_container_sif = Path( - balsamic_cache, balsamic_version, "containers", "align_qc", "example.sif" - ).as_posix() - - with mock.patch.object(shutil, "which") as mocked: - mocked.return_value = "/my_home/binary_path/singularity" - - # WHEN building singularity command - shellcmd = singularity( - sif_path=singularity_container_sif, - cmd=dummy_command, - bind_paths=[dummy_path_1, dummy_path_2], - ) - - # THEN successfully return a correct singularity cmd - assert correct_shellcmd in shellcmd - - -def test_singularity_shellcmd_sif_not_exist(): - """test singularity shell cmd with non-existing file""" - - # GIVEN a dummy command - dummy_command = "ls this_path" - dummy_sif_path = "/some_path/my_sif_path_3.1415/container.sif" - dummy_path_1 = "this_path/path1" - dummy_path_2 = "this_path/path2" - error_msg = "container file does not exist" - - # WHEN building singularity command - # THEN successfully get error that container doesn't exist - with mock.patch.object(shutil, "which") as mocked, pytest.raises( - BalsamicError, match=error_msg - ): - mocked.return_value = "/my_home/binary_path/singularity" - - singularity( - sif_path=dummy_sif_path, - cmd=dummy_command, - bind_paths=[dummy_path_1, dummy_path_2], - ) - - -def test_singularity_shellcmd_cmd_not_exist(): - """test singularity shell cmd with 
nonexisting singularity command""" - - # GIVEN a dummy command - dummy_command = "ls this_path" - error_msg = "singularity command does not exist" - dummy_path_1 = "this_path/path1" - dummy_path_2 = "this_path/path2" - singularity_container_sif = "some_path/container.sif" - - # WHEN building singularity command - # THEN successfully get error if singualrity command doesn't exist - with mock.patch.object(shutil, "which") as mocked, pytest.raises( - BalsamicError, match=error_msg - ): - mocked.return_value = None - - singularity( - sif_path=singularity_container_sif, - cmd=dummy_command, - bind_paths=[dummy_path_1, dummy_path_2], - ) - - -def test_validate_fastq_pattern(): - # GIVEN a path to a file with correct fastq file prefix - fastq_path_r1 = "/home/analysis/dummy_tumor_R_1.fastq.gz" - fastq_path_r2 = "/home/analysis/dummy_normal_R_2.fastq.gz" - # THEN it should return the correct prefix - assert validate_fastq_pattern(fastq_path_r1) == "dummy_tumor_R" - assert validate_fastq_pattern(fastq_path_r2) == "dummy_normal_R" - - with pytest.raises(AttributeError) as excinfo: - # GIVEN a path to a file with incorrect fastq file prefix - bad_fastq_path_1 = "/home/analysis/dummy_tumor.fastq.gz" - validate_fastq_pattern(bad_fastq_path_1) - # THEN AttributeError is raised - assert excinfo.value - - with pytest.raises(AttributeError) as excinfo: - # GIVEN a path to a file with incorrect fastq file prefix - bad_fastq_path_2 = "/home/analysis/dummy_tumor_R3.fastq.gz" - validate_fastq_pattern(bad_fastq_path_2) - # THEN AttributeError is raised - assert excinfo.value - - with pytest.raises(AttributeError) as excinfo: - # GIVEN a path to a file with incorrect fastq file prefix - bad_fastq_path_3 = "/home/analysis/dummy_tumor_R_2.bam" - validate_fastq_pattern(bad_fastq_path_3) - # THEN AttributeError is raised - assert excinfo.value - - def test_get_panel_chrom(): # GIVEN a valid PANEL BED file panel_bed_file = "tests/test_data/references/panel/panel.bed" @@ -856,123 +599,54 @@ def test_get_panel_chrom(): assert len(get_panel_chrom(panel_bed_file)) > 0 -def test_create_fastq_symlink(tmpdir_factory, caplog): - # GIVEN a list of valid input fastq files from test directory containing 4 files - symlink_from_path = tmpdir_factory.mktemp("symlink_from") - symlink_to_path = tmpdir_factory.mktemp("symlink_to") - filenames = [ - "tumor_R_1.fastq.gz", - "normal_R_1.fastq.gz", - "tumor_R_2.fastq.gz", - "normal_R_2.fastq.gz", - ] - successful_log = "skipping" - casefiles = [Path(symlink_from_path, x) for x in filenames] - for casefile in casefiles: - casefile.touch() - with caplog.at_level(logging.INFO): - create_fastq_symlink(casefiles=casefiles, symlink_dir=symlink_to_path) - # THEN destination should have 4 files - assert len(list(Path(symlink_to_path).rglob("*.fastq.gz"))) == 4 - # THEN exception triggers log message containing "skipping" - assert successful_log in caplog.text - - -def test_get_fastq_bind_path(tmpdir_factory): - # GIVEN a list of valid input fastq filenames and test directories - filenames = [ - "tumor_R_1.fastq.gz", - "normal_R_1.fastq.gz", - "tumor_R_2.fastq.gz", - "normal_R_2.fastq.gz", - ] - # WHEN files are created, and symlinks are made in symlink directory - symlink_from_path = tmpdir_factory.mktemp("symlink_from") - symlink_to_path = tmpdir_factory.mktemp("symlink_to") - casefiles = [Path(symlink_from_path, x) for x in filenames] - for casefile in casefiles: - casefile.touch() - create_fastq_symlink(casefiles=casefiles, symlink_dir=symlink_to_path) - # THEN function returns list 
containing the original parent path! - assert get_fastq_bind_path(symlink_to_path) == [symlink_from_path] - - -def test_create_pon_fastq_symlink_file_exist_error(tmpdir_factory, caplog): - # GIVEN a list of valid fastq file names for cnv pon - fastq_files = [ - "case1_R_1.fastq.gz", - ] - - # WHEN files are created, and symlinks are made in symlink directory - symlink_from_path = tmpdir_factory.mktemp("symlink_from") - symlink_to_path = tmpdir_factory.mktemp("symlink_to") - - for fastq_file in fastq_files: - Path(symlink_from_path, fastq_file).touch() - Path(symlink_to_path, fastq_file).touch() - - with caplog.at_level(logging.INFO): - create_pon_fastq_symlink(symlink_from_path, symlink_to_path) - assert "exists, skipping" in caplog.text - - -def test_convert_deliverables_tags(): +def test_convert_deliverables_tags(tumor_normal_fastq_info_correct: List[Dict]): + """Test generation of delivery tags.""" # GIVEN a deliverables dict and a sample config dict delivery_json = { "files": [ { - "path": "dummy_balsamic_run/run_tests/TN_WGS/analysis/fastq/S1_R_2.fp.fastq.gz", + "path": "dummy_balsamic_run/run_tests/TN_WGS/analysis/fastq/ACC1_R_1.fp.fastq.gz", "path_index": [], "step": "fastp", - "tag": "read2,quality-trimmed-fastq-read2,tumor", - "id": "S1_R", + "tag": "ACC1,read1,quality-trimmed-fastq-read1", + "id": "ACC1", "format": "fastq.gz", }, { - "path": "dummy_balsamic_run/run_tests/TN_WGS/analysis/qc/fastp/S1_R_fastp.json", + "path": "dummy_balsamic_run/run_tests/TN_WGS/analysis/fastq/ACC1_R_2.fp.fastq.gz", "path_index": [], "step": "fastp", - "tag": "S1_R,json,quality-trimmed-fastq-json", - "id": "S1_R", - "format": "json", + "tag": "read2,quality-trimmed-fastq-read1", + "id": "ACC1", + "format": "fastq.gz", }, { - "path": "dummy_balsamic_run/run_tests/TN_WGS/analysis/qc/fastp/S2_R_fastp.json", + "path": "dummy_balsamic_run/run_tests/TN_WGS/analysis/qc/fastp/ACC1.fastp.json", "path_index": [], "step": "fastp", - "tag": "ACC1,json,quality-trimmed-fastq-json", + "tag": "ACC1,json,quality-trimmed-fastq-json,tumor", "id": "tumor", "format": "json", }, ] } - sample_config_dict = { - "samples": { - "S1_R": { - "file_prefix": "S1_R", - "sample_name": "ACC1", - "type": "tumor", - "readpair_suffix": ["1", "2"], - }, - }, - } + sample_config_dict = {"samples": tumor_normal_fastq_info_correct} - # WHEN running convert function - delivery_json = convert_deliverables_tags( + # WHEN running the convert function + delivery_json: dict = convert_deliverables_tags( delivery_json=delivery_json, sample_config_dict=sample_config_dict ) - # Prefix strings should be replaced with sample name + # THEN prefix strings should be replaced with sample name for delivery_file in delivery_json["files"]: - assert delivery_file["id"] == "ACC1" assert "ACC1" in delivery_file["tag"] assert "tumor" not in delivery_file["tag"] + assert delivery_file["id"] == "ACC1" def test_check_executable_exists(): - # GIVEN an existing executable command test_command = "ls" @@ -982,7 +656,6 @@ def test_check_executable_exists(): def test_check_executable_not_existing(): - # GIVEN an existing executable command test_command = "twenty_twenty_was_bad" @@ -992,7 +665,6 @@ def test_check_executable_not_existing(): def test_job_id_dump_to_yaml(tmp_path): - # GIVEN a file with one job id per line, a key (case name), and an output file name dummy_dir = tmp_path / "job_id_dump_dir" dummy_dir.mkdir() @@ -1011,7 +683,6 @@ def test_job_id_dump_to_yaml(tmp_path): def test_generate_h5(tmp_path): - # GIVEN a job name, a path, and a job id dummy_path = 
tmp_path / "h5dir" dummy_path.mkdir() @@ -1027,7 +698,6 @@ def test_generate_h5(tmp_path): def test_generate_h5_capture_no_output(tmp_path): - # GIVEN a job name, a path, and a job id dummy_path = tmp_path / "h5dir" dummy_path.mkdir() @@ -1044,80 +714,338 @@ def test_generate_h5_capture_no_output(tmp_path): assert actual_output == None -def test_get_md5(tmp_path): - - # GIVEN a dummy file - dummy_dir = tmp_path / "md5" - dummy_dir.mkdir() - dummy_file = dummy_dir / "dummy_file.dump" - dummy_file.write_text("Awesome Text") - - # THEN md5 returned should be - assert get_md5(dummy_file) == "3945B39E" +def test_get_sample_type_from_sample_name(config_dict: Dict): + """Test sample type extraction from a extracted config file.""" + # GIVEN a config dictionary -def test_create_md5(tmp_path): - - # GIVEN a path to a md5 file and reference dummy files - ref_dir = tmp_path / "references" - ref_dir.mkdir() - dummy_ref_file1 = ref_dir / "reference_file1.dump" - dummy_ref_file1.write_text("Test reference1") - dummy_ref_file2 = ref_dir / "reference_file2.dump" - dummy_ref_file2.write_text("Test reference2") - dummy_reference_dict = { - "reference_dummy1": str(dummy_ref_file1), - "reference_dummy2": str(dummy_ref_file2), - } - dummy_dir = tmp_path / "md5" - dummy_dir.mkdir() - dummy_file = dummy_dir / "dummy_file.dump" + # GIVEN a sample name + sample_name = "ACC1" - create_md5(dummy_reference_dict, dummy_file) + # WHEN calling the function + sample_type = get_sample_type_from_sample_name(config_dict, sample_name) - # THEN md5 file exists - assert dummy_file.exists() + # THEN the retrieved sample type should match the expected one + assert sample_type == SampleType.TUMOR -def test_get_rule_output(snakemake_fastqc_rule): - """Tests retrieval of existing output files from a specific workflow""" +def test_get_rule_output(snakemake_bcftools_filter_vardict_research_tumor_only): + """Tests retrieval of existing output files from a specific workflow.""" # GIVEN a snakemake fastqc rule object, a rule name and a list of associated wildcards - rules = snakemake_fastqc_rule - rule_name = "fastqc" + rules = snakemake_bcftools_filter_vardict_research_tumor_only + rule_name = "bcftools_filter_vardict_research_tumor_only" output_file_wildcards = { - "sample": ["concatenated_tumor_XXXXXX_R", "tumor", "normal"], + "sample": ["ACC1", "tumor", "normal"], "case_name": "sample_tumor_only", + "var_type": ["CNV", "SNV", "SV"], } - # THEN retrieve the output files output_files = get_rule_output(rules, rule_name, output_file_wildcards) # THEN check that the fastq files has been picked up by the function and that the tags has been correctly created - assert len(output_files) == 2 + assert len(output_files) == 1 for file in output_files: # Expected file names assert ( - os.path.basename(file[0]) == "concatenated_tumor_XXXXXX_R_1.fastq.gz" - or os.path.basename(file[0]) == "concatenated_tumor_XXXXXX_R_2.fastq.gz" + Path(file[0]).name + == "SNV.somatic.sample_tumor_only.vardict.research.filtered.pass.vcf.gz" ) # Expected tags assert ( - file[3] == "1,fastqc,quality-trimmed-seq-fastqc" - or file[3] == "2,fastqc,quality-trimmed-seq-fastqc" + file[3] + == "SNV,sample-tumor-only,vcf-pass-vardict,research-vcf-pass-vardict" ) -def test_get_sample_type_from_prefix(config_dict): - """Test sample type extraction from a extracted config file""" +def test_get_resolved_fastq_files_directory(fastq_dir: str): + """Test get fastq directory for unlinked fastqs.""" - # GIVEN a config dictionary + # GIVEN an input fastq path - # GIVEN a sample name - 
sample = "concatenated_tumor_XXXXXX_R" + # WHEN extracting the input files common path + input_dir: str = get_resolved_fastq_files_directory(fastq_dir) - # WHEN calling the function - sample_type = get_sample_type_from_prefix(config_dict, sample) + # THEN the fastq directory should be returned + assert input_dir == fastq_dir - # THEN the retrieved sample type should match the expected one - assert sample_type == "tumor" + +def test_get_resolved_fastq_files_directory_symlinked_files( + fastq_dir: str, tmp_path: Path +): + """Test get fastq directory for symlinked files.""" + + # GIVEN a temporary fastq path containing symlinked files + for file in Path(fastq_dir).iterdir(): + Path(tmp_path, file.name).symlink_to(file) + + # WHEN extracting the input files common path + input_dir: str = get_resolved_fastq_files_directory(str(tmp_path)) + + # THEN the real fastq directory should be returned + assert input_dir == fastq_dir + + +def test_write_finish_file(json_file: Path): + """Test finish analysis completion file generation.""" + + # GIVEN a file path to write to + + # WHEN writing a json file after an analysis has been completed + write_finish_file(file_path=json_file.as_posix()) + + # THEN assert that a file was successfully created + assert Path.exists(json_file) + + +def test_get_analysis_fastq_files_directory(fastq_dir: str): + """Test get analysis fastq directory when it already exists in case folder.""" + + # GIVEN an input fastq path + + # WHEN getting the analysis fastq directory + input_dir: str = get_analysis_fastq_files_directory( + case_dir=Path(fastq_dir).parents[1].as_posix(), fastq_path=fastq_dir + ) + + # THEN the original fastq directory should be returned + assert input_dir == fastq_dir + + +def test_get_analysis_fastq_files_directory_exception( + fastq_dir: str, + case_id_tumor_only: str, + tmp_path_factory: TempPathFactory, + caplog: LogCaptureFixture, +): + """Test get analysis fastq directory when it already exists in case folder but another path is provided.""" + caplog.set_level(logging.INFO) + + # GIVEN an input fastq path and an external case directory + case_dir: str = tmp_path_factory.mktemp(case_id_tumor_only).as_posix() + + # WHEN getting the analysis fastq directory twice + _input_dir: str = get_analysis_fastq_files_directory( + case_dir=case_dir, fastq_path=fastq_dir + ) + input_dir: str = get_analysis_fastq_files_directory( + case_dir=case_dir, fastq_path=fastq_dir + ) + + # THEN the fastq directory should be located inside the case directory and the linking should have been skipped + assert input_dir == Path(case_dir, "fastq").as_posix() + assert "Skipping linking" in caplog.text + + +def test_get_analysis_fastq_files_directory_no_fastqs( + fastq_dir: str, tmp_path_factory: TempPathFactory, case_id_tumor_only: str +): + """Test get analysis fastq directory when the provided fastq directory is outside the case folder.""" + + # GIVEN an external input fastq path and a case directory + case_dir: str = tmp_path_factory.mktemp(case_id_tumor_only).as_posix() + + # WHEN getting the analysis fastq directory + input_dir: str = get_analysis_fastq_files_directory( + case_dir=case_dir, fastq_path=fastq_dir + ) + + # THEN the fastq directory should be located inside the case directory + assert input_dir == Path(case_dir, "fastq").as_posix() + + # THEN the case fast files should have been linked to the provided fastq directory + for fastq in Path(input_dir).iterdir(): + assert fastq.is_symlink() + assert fastq.resolve().is_file() + assert fastq_dir == 
fastq.resolve().parent.as_posix() + + +def test_get_sample_list( + tumor_sample_name: str, normal_sample_name: str, fastq_dir_tumor_normal: str +): + """Tests sample dictionary retrieval.""" + + samples: List = get_sample_list( + tumor_sample_name=tumor_sample_name, + normal_sample_name=normal_sample_name, + fastq_path=fastq_dir_tumor_normal, + ) + assert samples[0]["name"] == tumor_sample_name + assert samples[1]["name"] == normal_sample_name + assert samples[0]["type"] == "tumor" + assert samples[1]["type"] == "normal" + assert samples[0]["fastq_info"] + assert samples[1]["fastq_info"] + + +def test_get_fastq_info(tumor_sample_name: str, fastq_dir_tumor_only: str): + """Validates that get_fastq_info assigns fastq info as expected.""" + # GIVEN a fastq_dir and sample name ACC1 + + # WHEN calling the get_fastq_info function + fastq_dict = get_fastq_info(tumor_sample_name, fastq_dir_tumor_only) + + fwd1_expected = f"{fastq_dir_tumor_only}/1_171015_HJ7TLDSX5_ACC1_XXXXXX_1.fastq.gz" + rev1_expected = f"{fastq_dir_tumor_only}/1_171015_HJ7TLDSX5_ACC1_XXXXXX_2.fastq.gz" + fwd2_expected = f"{fastq_dir_tumor_only}/2_171015_HJ7TLDSX5_ACC1_XXXXXX_1.fastq.gz" + rev2_expected = f"{fastq_dir_tumor_only}/2_171015_HJ7TLDSX5_ACC1_XXXXXX_2.fastq.gz" + fastq_info1_expected = FastqInfoModel(fwd=fwd1_expected, rev=rev1_expected) + fastq_info2_expected = FastqInfoModel(fwd=fwd2_expected, rev=rev2_expected) + + # THEN check that the fastq_dict matches the expected fastq_dict + expected_fastq_dict = { + "1_171015_HJ7TLDSX5_ACC1_XXXXXX": fastq_info1_expected.model_dump( + exclude_none=True + ), + "2_171015_HJ7TLDSX5_ACC1_XXXXXX": fastq_info2_expected.model_dump( + exclude_none=True + ), + } + assert fastq_dict == expected_fastq_dict + + +def test_get_fastq_info_symlink(tumor_sample_name: str, fastq_dir_symlinked: str): + """Test symlinked fastq info included in samples dictionary.""" + + # GIVEN a fastq_dir and sample name + + # WHEN calling the get_fastq_info function + fastq_dict: Dict[str, FastqInfoModel] = get_fastq_info( + tumor_sample_name, fastq_dir_symlinked + ) + + # THEN verify that the resolved file links have been included in the samples dictionary + assert "fwd_resolved" in next(iter(fastq_dict.values())) + assert "rev_resolved" in next(iter(fastq_dict.values())) + + +def test_get_fastq_info_empty_fastq_dir(tumor_sample_name: str, empty_fastq_dir: str): + """Tests if get_fastq_info correctly reports the error of not finding any fastq files.""" + # GIVEN an empty fastq_dir and a sample name + + # WHEN calling get_fastq_info + # THEN the following error should be found + with pytest.raises( + BalsamicError, match=f"No fastqs found for: {tumor_sample_name}" + ): + get_fastq_info(tumor_sample_name, empty_fastq_dir) + + +def test_get_fastq_info_double_assigned_fastq_pattern( + tumor_sample_name: str, fastq_dir_tumor_duplicate_fastq_patterns: str +): + """Tests if get_fastq_info correctly reports the error of finding double-assigned fastq patterns.""" + # GIVEN a fastq dir with duplicate fastq patterns and a sample name + + # WHEN calling get_fastq_info + # THEN the following error should be found + with pytest.raises(BalsamicError, match="Fastq name conflict. 
Fastq pair pattern"): + get_fastq_info(tumor_sample_name, fastq_dir_tumor_duplicate_fastq_patterns) + + +def test_get_fastp_parameters(balsamic_model: ConfigModel): + """Validate correct retrieval of WGS and TGA specific fastp parameters.""" + + # GIVEN WGS config with quality trim + balsamic_model.analysis.sequencing_type = SequencingType.WGS + balsamic_model.QC.quality_trim = True + fastp_params_wgs = get_fastp_parameters(balsamic_model) + # THEN no UMI trimming should be active + assert "fastp_trim_umi" not in fastp_params_wgs + # THEN quality trimming should be active + assert "--n_base_limit" in fastp_params_wgs["fastp_trim_qual"] + + # GIVEN WGS config without quality trim + balsamic_model.QC.quality_trim = False + fastp_params_wgs = get_fastp_parameters(balsamic_model) + # THEN no quality trimming should be done + assert "--n_base_limit" not in fastp_params_wgs["fastp_trim_qual"] + assert "--disable_quality_filtering" in fastp_params_wgs["fastp_trim_qual"] + + # GIVEN TGA with adapter trimming active + balsamic_model.analysis.sequencing_type = SequencingType.TARGETED + balsamic_model.QC.adapter_trim = True + fastp_params_tga = get_fastp_parameters(balsamic_model) + # THEN UMI trimming should be active + assert "fastp_trim_umi" in fastp_params_tga + # THEN adapter trimming should be done + assert "--detect_adapter_for_pe" in fastp_params_tga["fastp_trim_adapter"] + + # GIVEN TGA without adapter trimming active + balsamic_model.QC.adapter_trim = False + fastp_params_tga = get_fastp_parameters(balsamic_model) + # THEN no adapter trimming should be done + assert "--disable_adapter_trimming" in fastp_params_tga["fastp_trim_adapter"] + + +def test_validate_cache_version_develop(): + """Test develop cache version validation.""" + + # GIVEN a develop cache version + cli_version: str = CacheVersion.DEVELOP + + # WHEN validating the provided version + version: str = validate_cache_version(click.Context, click.Parameter, cli_version) + + # THEN the correct version should be returned + assert version == CacheVersion.DEVELOP + + +def test_validate_cache_version_release(): + """Test release cache version validation.""" + + # GIVEN a release cache version + cli_version: str = "1.2.3" + + # WHEN validating the provided version + version: str = validate_cache_version(click.Context, click.Parameter, cli_version) + + # THEN the correct version should be returned + assert version == cli_version + + +def test_validate_cache_version_non_digit(): + """Test non digit release cache version validation.""" + + # GIVEN an incorrect release cache version + cli_version: str = "a.b.c" + + # WHEN validating the provided version + + # THEN a bad parameter error should be raised + with pytest.raises(click.BadParameter): + validate_cache_version(click.Context, click.Parameter, cli_version) + + +def test_validate_cache_version_wrong_format(): + """Test wrong format release cache version validation.""" + + # GIVEN an incorrect release cache version + cli_version: str = "1.2" + + # WHEN validating the provided version + + # THEN a bad parameter error should be raised + with pytest.raises(click.BadParameter): + validate_cache_version(click.Context, click.Parameter, cli_version) + + +def test_read_vcf(vcf_file_path, vcf_file_gz_path): + """Test correct reading of VCF files.""" + # GIVEN a path to two identical VCF files, one gzipped and one not + + # WHEN reading VCF file into list of strings + vcf_contents: List[str] = read_vcf_file(vcf_file_path) + vcf_contents_from_gz: List[str] = read_vcf_file(vcf_file_gz_path) + + # 
THEN first and last line of the list should match the expected values + first_line = vcf_contents[0] + last_line = vcf_contents[-1] + first_line_expect = "##fileformat=VCFv4.2" + last_line_expect = "20\t13417\t.\tC\tCGAGA\t-0.00\tLowQual\tAC=0;AF=0;AN=2;DP=46;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0;MQ=30.92;SOR=0.307\tGT:AD:DP:GQ:PL\t0/0:46,0:46:99:0,139,2084" + + assert first_line == first_line_expect + assert last_line == last_line_expect + + # THEN the contents from the gzipped VCF should match the non-gzipped VCF + assert vcf_contents == vcf_contents_from_gz diff --git a/tests/utils/test_workflowscripts.py b/tests/utils/test_workflowscripts.py index 1a9e3f2f9..03ff138d9 100644 --- a/tests/utils/test_workflowscripts.py +++ b/tests/utils/test_workflowscripts.py @@ -2,7 +2,6 @@ from pathlib import Path import pytest -from BALSAMIC.utils.cli import generate_h5 from BALSAMIC.utils.workflowscripts import plot_analysis
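# test_read_vcf above exercises read_vcf_file with both a plain and a gzipped
# VCF and expects identical contents. A minimal sketch of a reader with that
# behavior, assuming gzip handling is keyed on the ".gz" suffix (an
# illustration only, not BALSAMIC's actual implementation):
import gzip
from pathlib import Path
from typing import List


def read_vcf_lines(vcf_path: str) -> List[str]:
    """Return the lines of a VCF file, transparently handling gzipped input."""
    opener = gzip.open if Path(vcf_path).suffix == ".gz" else open
    with opener(vcf_path, "rt") as vcf_file:
        # splitlines drops the trailing newline characters, so the plain and
        # gzipped variants of the same VCF compare equal line by line
        return vcf_file.read().splitlines()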