From 53cbf695eab1a8727c6cff6afd439b396ac33709 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 13 Dec 2021 13:16:43 +0100 Subject: [PATCH 01/58] add interval bed option to umi TNscope --- BALSAMIC/constants/workflow_params.py | 1 + BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule | 3 +++ BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule | 3 +++ 3 files changed, 7 insertions(+) diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index 44d45e30b..ed686cef5 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -137,6 +137,7 @@ "init_tumorLOD": 0.5, "error_rate": 5, "prunefactor": 3, + "padding": 20, "disable_detect": "sv", }, } diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule index 399e9be17..0552421a6 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule @@ -27,6 +27,7 @@ rule sentieon_tnscope_umi: init_tumor_lod = params.tnscope_umi.init_tumorLOD, error_rate = params.tnscope_umi.error_rate, prune_factor = params.tnscope_umi.prunefactor, + padding = params.tnscope_umi.padding, tumor = "TUMOR", pcr_model = params.common.pcr_model threads: @@ -44,6 +45,8 @@ export SENTIEON_LICENSE={params.sentieon_lic}; -t {threads} \ -r {input.ref_fa} \ -i {input.bam} \ +--interval {input.bed} \ +--interval_padding {params.padding} \ --algo {params.algo} \ --tumor_sample {params.tumor} \ --dbsnp {input.dbsnp} \ diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule index 0aa0b95fe..4f9b3bfa1 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule @@ -28,6 +28,7 @@ rule sentieon_tnscope_umi_tn: error_rate = params.tnscope_umi.error_rate, 
prune_factor = params.tnscope_umi.prunefactor, pcr_model = params.common.pcr_model, + padding = params.tnscope_umi.padding, tumor = "TUMOR", normal = "NORMAL" threads: @@ -47,6 +48,8 @@ export SENTIEON_LICENSE={params.sentieon_lic}; -r {input.ref_fa} \ -i {input.bamT} \ -i {input.bamN} \ +--interval {input.bed} \ +--interval_padding {params.padding} \ --algo {params.algo} \ --tumor_sample {params.tumor} \ --normal_sample {params.normal} \ From e4eddbad8a76b7575f5a76b74893f291ddbafc93 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 13 Dec 2021 13:17:02 +0100 Subject: [PATCH 02/58] add tests --- BALSAMIC/utils/models.py | 2 ++ tests/utils/test_models.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index b694a0757..f6e78508a 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -617,6 +617,7 @@ class UMIParamsTNscope(BaseModel): init_tumorLOD: float (required); minimum tumor log odds in the initial pass calling variants error_rate: int (required); allow error-rate to consider in calling prunefactor: int (required); pruning factor in the kmer graph + padding: int(required); amount to pad bed interval regions """ algo: str @@ -624,6 +625,7 @@ class UMIParamsTNscope(BaseModel): min_tumorLOD: int error_rate: int prunefactor: int + padding: int disable_detect: str diff --git a/tests/utils/test_models.py b/tests/utils/test_models.py index 53d44d5ce..f48f09492 100644 --- a/tests/utils/test_models.py +++ b/tests/utils/test_models.py @@ -349,6 +349,7 @@ def test_umiparams_tnscope(): "min_tumorLOD": 6, "error_rate": 5, "prunefactor": 3, + "padding": 30, "disable_detect": "abc", } @@ -362,7 +363,7 @@ def test_umiparams_tnscope(): assert test_tnscope_params_built.error_rate == 5 assert test_tnscope_params_built.prunefactor == 3 assert test_tnscope_params_built.disable_detect == "abc" - + assert test_tnscope_params_built.padding == 30 def test_params_vardict(): """test 
UMIParamsVardict model for correct validation""" From ac1168959838e19ac6dca6854bbd9b2956318cff Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 13 Dec 2021 13:19:28 +0100 Subject: [PATCH 03/58] update changelog --- CHANGELOG.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6a5a315f8..9f162b59c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,12 @@ +[X.X.X] +------- + +Added: +^^^^^^ + +* Call umi variants using TNscope in bed defined regions #821 + + [8.2.4] ------- From 2f8a50a0090793d2bfbc6d4819aff81efccb27e6 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 13 Dec 2021 13:25:16 +0100 Subject: [PATCH 04/58] fix black --- tests/utils/test_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/utils/test_models.py b/tests/utils/test_models.py index f48f09492..9b61315d1 100644 --- a/tests/utils/test_models.py +++ b/tests/utils/test_models.py @@ -365,6 +365,7 @@ def test_umiparams_tnscope(): assert test_tnscope_params_built.disable_detect == "abc" assert test_tnscope_params_built.padding == 30 + def test_params_vardict(): """test UMIParamsVardict model for correct validation""" From 1b81a73360f5b96a8be9bd3da9d9d2dd058fc74d Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 13 Dec 2021 13:36:28 +0100 Subject: [PATCH 05/58] update padding to 100 --- BALSAMIC/constants/workflow_params.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index ed686cef5..a2616153f 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -137,7 +137,7 @@ "init_tumorLOD": 0.5, "error_rate": 5, "prunefactor": 3, - "padding": 20, + "padding": 100, "disable_detect": "sv", }, } From 7b4148e74cb733f3c3860c40f99ab769c4f14735 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Wed, 22 Dec 2021 12:11:35 +0100 Subject: [PATCH 06/58] feat: generate 
canfam3 references (#843) * feat: add canfam3 reference files * feat: add canfam3 reference files, fix test * feat: add canfam3 reference files, fix test * modify CHANGELOG.rst with updates * snakefile is assigned in initialize * Snakefile is not defined in as a -s cli option but as following the call of init and --genome-version * black linting * Update BALSAMIC/constants/reference.py Co-authored-by: ashwini06 * Update BALSAMIC/constants/reference.py Co-authored-by: ashwini06 * Update BALSAMIC/utils/cli.py Co-authored-by: ashwini06 * Update BALSAMIC/workflows/reference-canfam3.smk Co-authored-by: Hassan Foroughi * implemented comments * implemented comments, linted * functions get_md5 and create_md5 in utils/cli.py and tests for cosmic key provided for non canfam3 genome-version * linting * add tests for get_md5 and create_md5 * change md5 to sha512 and cosmic key log error * modify md5 return to sha512 * linting * slight change in formatting * sha512 to crc32/md5 * Update BALSAMIC/workflows/reference-canfam3.smk Co-authored-by: ashwini06 * Update BALSAMIC/workflows/reference-canfam3.smk Co-authored-by: ashwini06 * hash->hashed and remove sequencing_type as input for get_snakefile * remove unused sequencing_type Co-authored-by: ashwini06 Co-authored-by: Hassan Foroughi --- BALSAMIC/commands/config/case.py | 2 +- BALSAMIC/commands/init/base.py | 14 +- BALSAMIC/commands/report/deliver.py | 3 +- BALSAMIC/commands/report/status.py | 4 +- BALSAMIC/commands/run/analysis.py | 5 +- BALSAMIC/constants/reference.py | 36 +++- BALSAMIC/utils/cli.py | 31 +++- BALSAMIC/workflows/reference-canfam3.smk | 218 +++++++++++++++++++++++ BALSAMIC/workflows/reference.smk | 19 +- CHANGELOG.rst | 1 + tests/commands/init/test_init.py | 48 +++++ tests/test_workflow.py | 13 +- tests/utils/test_utils.py | 69 +++++-- 13 files changed, 414 insertions(+), 49 deletions(-) create mode 100644 BALSAMIC/workflows/reference-canfam3.smk diff --git a/BALSAMIC/commands/config/case.py 
b/BALSAMIC/commands/config/case.py index a4b0eab9d..bb172b29a 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -124,7 +124,7 @@ "-g", "--genome-version", default="hg19", - type=click.Choice(["hg19", "hg38"]), + type=click.Choice(["hg19", "hg38", "canfam3"]), help=( "Genome version to prepare reference. Path to genome" "will be /genome_version" diff --git a/BALSAMIC/commands/init/base.py b/BALSAMIC/commands/init/base.py index 8c33252ca..a14e0aa7e 100644 --- a/BALSAMIC/commands/init/base.py +++ b/BALSAMIC/commands/init/base.py @@ -57,11 +57,11 @@ is_flag=True, help="Force re-downloading all containers", ) -@click.option("-c", "--cosmic-key", required=True, help="cosmic db authentication key") +@click.option("-c", "--cosmic-key", required=False, help="cosmic db authentication key") @click.option( "-s", "--snakefile", - default=get_snakefile("generate_ref"), + default=None, type=click.Path(), show_default=True, help="snakefile for reference generation", @@ -77,7 +77,7 @@ "-g", "--genome-version", default="hg19", - type=click.Choice(["hg19", "hg38"]), + type=click.Choice(["hg19", "hg38", "canfam3"]), help=( "Genome version to prepare reference. 
Path to genome" "will be /genome_version" @@ -212,6 +212,10 @@ def initialize( ) raise click.Abort() + if genome_version in ["hg38", "hg19"] and not cosmic_key: + LOG.error("cosmic db authentication key required with hg38 and hg19") + raise click.Abort() + # resolve outdir to absolute path outdir = Path(outdir).resolve() container_outdir = Path(outdir, balsamic_version, "containers") @@ -261,6 +265,10 @@ def initialize( write_json(config_dict, config_json) LOG.info("Reference generation workflow configured successfully - %s" % config_json) + snakefile = ( + snakefile if snakefile else get_snakefile("generate_ref", genome_version) + ) + with CaptureStdout() as graph_dot: snakemake.snakemake( snakefile=snakefile, diff --git a/BALSAMIC/commands/report/deliver.py b/BALSAMIC/commands/report/deliver.py index e258f9127..72347c7ec 100644 --- a/BALSAMIC/commands/report/deliver.py +++ b/BALSAMIC/commands/report/deliver.py @@ -139,7 +139,8 @@ def deliver( else sample_config_dict["analysis"]["analysis_type"] ) sequencing_type = sample_config_dict["analysis"]["sequencing_type"] - snakefile = get_snakefile(analysis_type, sequencing_type) + reference_genome = sample_config_dict["reference"]["reference_genome"] + snakefile = get_snakefile(analysis_type, reference_genome) balsamic_qc_report = None if sequencing_type != "wgs" and sample_id_map and case_id_map: diff --git a/BALSAMIC/commands/report/status.py b/BALSAMIC/commands/report/status.py index d8c427053..5658246f1 100644 --- a/BALSAMIC/commands/report/status.py +++ b/BALSAMIC/commands/report/status.py @@ -53,8 +53,8 @@ def status(context, sample_config, show_only_missing, print_files): result_dir = get_result_dir(sample_config_dict) analysis_type = sample_config_dict["analysis"]["analysis_type"] - sequencing_type = sample_config_dict["analysis"]["sequencing_type"] - snakefile = get_snakefile(analysis_type, sequencing_type) + reference_genome = sample_config_dict["reference"]["reference_genome"] + snakefile = 
get_snakefile(analysis_type, reference_genome) if os.path.isfile(os.path.join(result_dir, "analysis_finish")): snakemake.snakemake( diff --git a/BALSAMIC/commands/run/analysis.py b/BALSAMIC/commands/run/analysis.py index 87dfb156d..ec0c41de6 100644 --- a/BALSAMIC/commands/run/analysis.py +++ b/BALSAMIC/commands/run/analysis.py @@ -210,7 +210,6 @@ def analysis( resultpath = sample_config["analysis"]["result"] benchmarkpath = sample_config["analysis"]["benchmark"] case_name = sample_config["analysis"]["case_id"] - sequencing_type = sample_config["analysis"]["sequencing_type"] if run_analysis: # if not dry run, then create (new) log/script directory @@ -252,9 +251,7 @@ def analysis( ).as_posix() + "/" ) - balsamic_run.snakefile = ( - snake_file if snake_file else get_snakefile(analysis_type, sequencing_type) - ) + balsamic_run.snakefile = snake_file if snake_file else get_snakefile(analysis_type) balsamic_run.configfile = sample_config_path balsamic_run.run_mode = run_mode balsamic_run.cluster_config = cluster_config diff --git a/BALSAMIC/constants/reference.py b/BALSAMIC/constants/reference.py index 7b6e08f74..911005878 100644 --- a/BALSAMIC/constants/reference.py +++ b/BALSAMIC/constants/reference.py @@ -1,6 +1,6 @@ # reference related constants VALID_REF_FORMAT = ["fasta", "vcf", "text", "gtf", "gff"] -VALID_GENOME_VER = ["hg19", "hg38"] +VALID_GENOME_VER = ["hg19", "hg38", "canfam3"] # reference files REFERENCE_FILES = { @@ -312,4 +312,38 @@ "output_path": "variants", }, }, + "canfam3": { + "reference_genome": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/bigZips/canFam3.fa.gz", + "file_type": "fasta", + "gzip": True, + "genome_version": "canfam3", + "output_file": "canFam3.fasta", + "output_path": "genome", + }, + "refgene_txt": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/database/refGene.txt.gz", + "file_type": "text", + "gzip": True, + "genome_version": "canfam3", + "output_file": "canfam3_refGene.txt", + "output_path": 
"genome", + }, + "refgene_sql": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/database/refGene.sql", + "file_type": "text", + "gzip": False, + "genome_version": "canfam3", + "output_file": "canfam3_refGene.sql", + "output_path": "genome", + }, + "genome_chrom_size": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/bigZips/canFam3.chrom.sizes", + "file_type": "text", + "gzip": False, + "genome_version": "canfam3", + "output_file": "canfam3.chrom.sizes", + "output_path": "genome", + }, + }, } diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index 6dab1503c..78fe66949 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -9,6 +9,7 @@ from pathlib import Path from io import StringIO from distutils.spawn import find_executable +import zlib import yaml import snakemake @@ -251,15 +252,20 @@ def get_schedulerpy(): return scheduler -def get_snakefile(analysis_type, sequencing_type="targeted"): +def get_snakefile(analysis_type, reference_genome="hg19"): """ Return a string path for variant calling snakefile. 
""" p = Path(__file__).parents[1] snakefile = Path(p, "workflows", "balsamic.smk") + if analysis_type == "generate_ref": snakefile = Path(p, "workflows", "reference.smk") + if "canfam3" in reference_genome: + snakefile = Path(p, "workflows", "reference-canfam3.smk") + return str(snakefile) + if analysis_type == "pon": snakefile = Path(p, "workflows", "PON.smk") @@ -583,7 +589,9 @@ def generate_graph(config_collection_dict, config_path): snakemake.snakemake( snakefile=get_snakefile( analysis_type=config_collection_dict["analysis"]["analysis_type"], - sequencing_type=config_collection_dict["analysis"]["sequencing_type"], + reference_genome=config_collection_dict["reference"][ + "reference_genome" + ], ), dryrun=True, configfiles=[config_path], @@ -683,3 +691,22 @@ def create_pon_fastq_symlink(pon_fastqs, symlink_dir): os.symlink(pon_fastq, pon_sym_file) except FileExistsError: LOG.info(f"File {pon_sym_file} exists, skipping") + + +def get_md5(filename): + with open(filename, "rb") as fh: + hashed = 0 + while True: + s = fh.read(65536) + if not s: + break + hashed = zlib.crc32(s, hashed) + return "%08X" % (hashed & 0xFFFFFFFF) + + +def create_md5(reference, check_md5): + """create a md5 file for all reference data""" + with open(check_md5, "w") as fh: + for key, value in reference.items(): + if os.path.isfile(value): + fh.write(get_md5(value) + " " + value + "\n") diff --git a/BALSAMIC/workflows/reference-canfam3.smk b/BALSAMIC/workflows/reference-canfam3.smk new file mode 100644 index 000000000..def91c8c2 --- /dev/null +++ b/BALSAMIC/workflows/reference-canfam3.smk @@ -0,0 +1,218 @@ +# syntax=python tabstop=4 expandtab +# coding: utf-8 + +import os +import logging +from pathlib import Path + +from copy import deepcopy + +from BALSAMIC.utils.rule import get_script_path +from BALSAMIC.utils.rule import get_reference_output_files +from BALSAMIC.utils.models import ReferenceMeta +from BALSAMIC.constants.reference import REFERENCE_FILES as REFERENCE_MODEL +from 
BALSAMIC.utils.cli import get_md5 +from BALSAMIC.utils.cli import create_md5 + +LOG = logging.getLogger(__name__) + +# explictly check if cluster_config dict has zero keys. +if len(cluster_config.keys()) == 0: + cluster_config = config + +genome_ver = config['genome_version'] + +# essential path reference files +basedir = os.path.join(config['output']) +genome_dir = os.path.join(basedir, "genome") + +# Set temporary dir environment variable +os.environ['TMPDIR'] = basedir + +REFERENCE_FILES = deepcopy(REFERENCE_MODEL) + +# intialize reference files +REFERENCE_FILES[genome_ver]['basedir'] = basedir +reference_file_model = ReferenceMeta.parse_obj(REFERENCE_FILES[genome_ver]) +reference_genome_url = reference_file_model.reference_genome +genome_chrom_size_url = reference_file_model.genome_chrom_size +refgene_txt_url = reference_file_model.refgene_txt +refgene_sql_url = reference_file_model.refgene_sql + +check_md5 = os.path.join(basedir, "reference.json.md5") + +shell.executable("/bin/bash") +shell.prefix("set -eo pipefail; ") + +singularity_image_path = config['singularity']['image_path'] +singularity_images = [Path(singularity_image_path, image_name + ".sif").as_posix() for image_name in config["singularity"]["containers"].keys()] + +########################################################## +# Generating Reference files for BALSAMIC pipeline +# Writing reference json file +########################################################## + +rule all: + input: + singularity_images, + reference_genome = reference_genome_url.get_output_file, + bwa_index = expand(reference_genome_url.get_output_file + "{ext}", ext=['.amb','.ann','.bwt','.pac','.sa']), + refgenome_fai = reference_genome_url.get_output_file + ".fai", + refgenome_dict = reference_genome_url.get_output_file.replace("fasta","dict"), + refseq_bed = refgene_txt_url.get_output_file.replace("txt", "flat") + ".bed", + refseq_flat = refgene_txt_url.get_output_file.replace("txt", "flat"), + refgene = 
refgene_txt_url.get_output_file, + genome_chrom_size = genome_chrom_size_url.get_output_file, + output: + finished = os.path.join(basedir,"reference.finished"), + reference_json = os.path.join(basedir, "reference.json"), + check_md5 = check_md5 + log: + os.path.join(basedir, "reference.json.log") + run: + import json + from datetime import datetime + + today = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + ref_json = dict() + ref_json['reference'] = { + "reference_genome": input.reference_genome, + "exon_bed": input.refseq_bed, + "refflat": input.refseq_flat, + "refGene": input.refgene, + "genome_chrom_size": input.genome_chrom_size, + "reference_access_date": today, + } + + with open(str(output.reference_json), "w") as fh: + json.dump(ref_json, fh, indent=4) + + create_md5(ref_json['reference'], output.check_md5) + + with open(str(output.finished), mode='w') as finish_file: + finish_file.write('%s\n' % today ) + +########################################################### +# Download all singularity container images from dockerhub +########################################################### + +rule download_container: + output: singularity_images + run: + for image_name, docker_path in config["singularity"]["containers"].items(): + cmd = "singularity pull {}/{}.sif {}".format(config["singularity"]["image_path"], image_name, docker_path) + shell(cmd) + +########################################################## +# Download the reference genome, variant db +########################################################## +download_content = [reference_genome_url, genome_chrom_size_url, refgene_txt_url, refgene_sql_url] + +rule download_reference: + output: + expand("{output}", output=[ref.get_output_file for ref in download_content]) + run: + import requests + + for ref in download_content: + output_file = ref.get_output_file + log_file = output_file + ".log" + + cmd = "wget -a {} -O - {}".format(log_file, ref.url) + + if ref.gzip: + cmd += " | gunzip " + + cmd += " > 
{}".format(output_file) + shell(cmd) + ref.write_md5 + +########################################################## +# Preprocess refseq file by fetching relevant columns and +# standardize the chr column +########################################################## + +rule prepare_refgene: + input: + singularity_images, + refgene_txt = refgene_txt_url.get_output_file, + refgene_sql = refgene_sql_url.get_output_file, + params: + refgene_sql_awk = get_script_path('refseq_sql.awk'), + output: + refflat = refgene_txt_url.get_output_file.replace("txt", "flat"), + bed = refgene_txt_url.get_output_file.replace("txt", "flat") + ".bed", + log: + refgene_sql = os.path.join(basedir, "genome", "refgene_sql.log"), + refgene_txt = os.path.join(basedir, "genome", "refgene_txt.log") + singularity: Path(singularity_image_path, config["bioinfo_tools"].get("bedtools") + ".sif").as_posix() + shell: + """ +header=$(awk -f {params.refgene_sql_awk} {input.refgene_sql}); +(echo \"$header\"; cat {input.refgene_txt};) \ +| csvcut -t -c chrom,exonStarts,exonEnds,name,score,strand,exonCount,txStart,txEnd,name2 \ +| csvformat -T \ +| bedtools expand -c 2,3 \ +| awk '$1~/chr[1-9]/ && $1!~/[_]/' | cut -c 4- | sort -k1,1 -k2,2n > {output.bed}; + +awk -v OFS=\"\\t\" '$3!~/_/ {{ gsub(\"chr\",\"\",$3); $1=$13; print }}' {input.refgene_txt} \ +| cut -f 1-11 > {output.refflat}; +sed -i 's/chr//g' {input.refgene_txt}; + """ + +########################################################## +# Create BWA Index for reference genome +########################################################## + +rule bwa_index: + input: + singularity_img = singularity_images, + reference_genome = reference_genome_url.get_output_file + output: + expand(reference_genome_url.get_output_file + "{ext}", ext=['.amb','.ann','.bwt','.pac','.sa']) + log: + reference_genome_url.get_output_file + ".bwa_index.log" + singularity: Path(singularity_image_path, config["bioinfo_tools"].get("bwa") + ".sif").as_posix() + shell: + """ +bwa index -a 
bwtsw {input.reference_genome} 2> {log}; + """ + +########################################################## +# Create index for fasta file - .fai +########################################################## + +rule samtools_index_fasta: + input: + singularity_img = singularity_images, + reference_genome = reference_genome_url.get_output_file + output: + reference_genome_url.get_output_file + ".fai" + log: + reference_genome_url.get_output_file + ".faidx.log" + singularity: Path(singularity_image_path, config["bioinfo_tools"].get("samtools") + ".sif").as_posix() + shell: + """ +samtools faidx {input.reference_genome} 2> {log}; + """ + + +########################################################## +# create reference dictionary using picard +########################################################## + +rule picard_ref_dict: + input: + singularity_img = singularity_images, + reference_genome = reference_genome_url.get_output_file + output: + reference_genome_url.get_output_file.replace("fasta","dict") + log: + reference_genome_url.get_output_file + ".ref_dict.log" + singularity: Path(singularity_image_path, config["bioinfo_tools"].get("picard") + ".sif").as_posix() + shell: + """ +picard CreateSequenceDictionary REFERENCE={input.reference_genome} OUTPUT={output} 2> {log}; + """ + diff --git a/BALSAMIC/workflows/reference.smk b/BALSAMIC/workflows/reference.smk index 81ba9d0e4..c73d06e59 100644 --- a/BALSAMIC/workflows/reference.smk +++ b/BALSAMIC/workflows/reference.smk @@ -2,7 +2,6 @@ # coding: utf-8 import os -import hashlib import logging from pathlib import Path @@ -12,6 +11,9 @@ from BALSAMIC.utils.rule import get_script_path from BALSAMIC.utils.rule import get_reference_output_files from BALSAMIC.utils.models import ReferenceMeta from BALSAMIC.constants.reference import REFERENCE_FILES as REFERENCE_MODEL +from BALSAMIC.utils.cli import get_md5 +from BALSAMIC.utils.cli import create_md5 + LOG = logging.getLogger(__name__) @@ -76,21 +78,6 @@ check_md5 = 
os.path.join(basedir, "reference.json.md5") shell.executable("/bin/bash") shell.prefix("set -eo pipefail; ") -def get_md5(filename): - hash_md5 = hashlib.md5() - with open(str(filename), 'rb') as fh: - for chunk in iter(lambda: fh.read(4096), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() - - -def create_md5(reference, check_md5): - """ create a md5 file for all reference data""" - with open(check_md5, 'w') as fh: - for key, value in reference.items(): - if os.path.isfile(value): - fh.write( get_md5(value) + ' ' + value + '\n') - singularity_image_path = config['singularity']['image_path'] singularity_images = [Path(singularity_image_path, image_name + ".sif").as_posix() for image_name in config["singularity"]["containers"].keys()] diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1791e96b5..d1d60e758 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,7 @@ Added: ^^^^^^ +* Snakemake workflow to create canfam3 reference #843 * Call umi variants using TNscope in bed defined regions #821 [8.2.5] diff --git a/tests/commands/init/test_init.py b/tests/commands/init/test_init.py index 2318f4753..74fcce209 100644 --- a/tests/commands/init/test_init.py +++ b/tests/commands/init/test_init.py @@ -71,6 +71,54 @@ def test_init_reference_no_write_perm(tmp_path, invoke_cli, no_write_perm_path): assert result.exit_code == 1 +def test_init_reference_no_cosmic_abort(tmp_path, invoke_cli): + # Given a path with no write permission + test_genome_version = "hg19" + test_container_version = "develop" + test_new_dir = tmp_path / "test_reference_dir" + test_new_dir.mkdir() + + # WHEN invoking config sample + result = invoke_cli( + [ + "init", + "-o", + str(test_new_dir), + "-v", + test_container_version, + "-g", + test_genome_version, + ] + ) + + # THEN it should create test_reference.json and exist with no error + assert result.exit_code == 1 + + +def test_init_reference_no_cosmic_run(tmp_path, invoke_cli): + # Given a path with no write permission + 
test_genome_version = "canfam3" + test_container_version = "develop" + test_new_dir = tmp_path / "test_reference_dir" + test_new_dir.mkdir() + + # WHEN invoking config sample + result = invoke_cli( + [ + "init", + "-o", + str(test_new_dir), + "-v", + test_container_version, + "-g", + test_genome_version, + ] + ) + + # THEN it should create test_reference.json and exist with no error + assert result.exit_code == 0 + + def test_init_reference_click_abort(invoke_cli, tmp_path): # Given test_reference output directory test_container_version = "develop" diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 3ed867920..1c5020d9c 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -11,7 +11,8 @@ def test_workflow_tumor_normal( ): # GIVEN a sample config dict and snakefile workflow = "paired" - snakefile = get_snakefile(workflow) + reference_genome = "hg19" + snakefile = get_snakefile(workflow, reference_genome) config_json = tumor_normal_config # WHEN invoking snakemake module with dryrun option @@ -29,7 +30,8 @@ def test_workflow_tumor_normal( def test_workflow_tumor_only(tumor_only_config, sentieon_install_dir, sentieon_license): # GIVEN a sample config dict and snakefile workflow = "single" - snakefile = get_snakefile(workflow) + reference_genome = "hg19" + snakefile = get_snakefile(workflow, reference_genome) config_json = tumor_only_config # WHEN invoking snakemake module with dryrun option @@ -49,7 +51,8 @@ def test_workflow_qc( ): # GIVEN a sample config dict and snakefile workflow = "qc" - snakefile = get_snakefile(workflow) + reference_genome = "hg19" + snakefile = get_snakefile(workflow, reference_genome) # WHEN invoking snakemake module with dryrun option # THEN it should return true @@ -74,7 +77,6 @@ def test_workflow_sentieon( ): # GIVEN a sample config dict and snakefile workflows = [("single", tumor_only_wgs_config), ("paired", tumor_normal_wgs_config)] - sequencing_type = "wgs" # WHEN invoking snakemake module with dryrun option 
# THEN it should return true @@ -88,5 +90,6 @@ def test_workflow_sentieon( for workflow in workflows: analysis_type = workflow[0] config = workflow[1] - snakefile = get_snakefile(analysis_type, sequencing_type) + reference_genome = "hg19" + snakefile = get_snakefile(analysis_type, reference_genome) assert snakemake.snakemake(snakefile, configfiles=[config], dryrun=True) diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 5bb12da65..0d5285ef5 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -43,6 +43,8 @@ check_executable, job_id_dump_to_yaml, generate_h5, + get_md5, + create_md5, ) from BALSAMIC.utils.rule import ( @@ -355,22 +357,26 @@ def test_get_snakefile(): ] # WHEN asking to see snakefile for paired - for analysis_type, sequencing_type in workflow: - snakefile = get_snakefile(analysis_type, sequencing_type) - pipeline = "" + for reference_genome in ["hg19", "hg38", "canfam3"]: + for analysis_type, sequencing_type in workflow: + snakefile = get_snakefile(analysis_type, reference_genome) - if sequencing_type in ["targeted", "wgs", "qc"]: - pipeline = "BALSAMIC/workflows/balsamic.smk" - elif analysis_type == "generate_ref": - pipeline = "BALSAMIC/workflows/reference.smk" - elif analysis_type == "pon": - pipeline = "BALSAMIC/workflows/PON.smk" + pipeline = "" - # THEN it should return the snakefile path - # THEN assert file exists - assert snakefile.startswith("/") - assert pipeline in snakefile - assert Path(snakefile).is_file() + if sequencing_type in ["targeted", "wgs", "qc"]: + pipeline = "BALSAMIC/workflows/balsamic.smk" + elif analysis_type == "generate_ref" and reference_genome != "canfam3": + pipeline = "BALSAMIC/workflows/reference.smk" + elif analysis_type == "generate_ref" and reference_genome == "canfam3": + pipeline = "BALSAMIC/workflows/reference-canfam3.smk" + elif analysis_type == "pon": + pipeline = "BALSAMIC/workflows/PON.smk" + + # THEN it should return the snakefile path + # THEN assert file exists 
+ assert snakefile.startswith("/") + assert pipeline in snakefile + assert Path(snakefile).is_file() def test_get_chrom(config_files): @@ -940,3 +946,38 @@ def test_generate_h5_capture_no_output(tmp_path): actual_output = generate_h5(dummy_job_name, dummy_job_id, dummy_path) assert actual_output == None + + +def test_get_md5(tmp_path): + + # GIVEN a dummy file + dummy_dir = tmp_path / "md5" + dummy_dir.mkdir() + dummy_file = dummy_dir / "dummy_file.dump" + dummy_file.write_text("Awesome Text") + + # THEN md5 returned should be + assert get_md5(dummy_file) == "3945B39E" + + +def test_create_md5(tmp_path): + + # GIVEN a path to a md5 file and reference dummy files + ref_dir = tmp_path / "references" + ref_dir.mkdir() + dummy_ref_file1 = ref_dir / "reference_file1.dump" + dummy_ref_file1.write_text("Test reference1") + dummy_ref_file2 = ref_dir / "reference_file2.dump" + dummy_ref_file2.write_text("Test reference2") + dummy_reference_dict = { + "reference_dummy1": str(dummy_ref_file1), + "reference_dummy2": str(dummy_ref_file2), + } + dummy_dir = tmp_path / "md5" + dummy_dir.mkdir() + dummy_file = dummy_dir / "dummy_file.dump" + + create_md5(dummy_reference_dict, dummy_file) + + # THEN md5 file exists + assert dummy_file.exists() From 233d7bc5a12cb3871dbacf57d53bacf3dbc565e7 Mon Sep 17 00:00:00 2001 From: ivadym Date: Thu, 23 Dec 2021 22:15:50 +0100 Subject: [PATCH 07/58] refactor: merge QC metric extraction workflows (#833) --- BALSAMIC/assets/scripts/collect_qc_metrics.py | 149 ++++++++++ BALSAMIC/commands/report/deliver.py | 50 +--- BALSAMIC/config/cluster.json | 4 + BALSAMIC/constants/quality_check_reporting.py | 145 +++------- BALSAMIC/constants/workflow_rules.py | 1 + BALSAMIC/containers/balsamic/balsamic.yaml | 2 - .../quality_control/multiqc.rule | 24 +- BALSAMIC/utils/cli.py | 9 + BALSAMIC/utils/models.py | 89 +++--- BALSAMIC/utils/qc_metrics.py | 167 +---------- BALSAMIC/utils/qc_report.py | 15 +- BALSAMIC/workflows/balsamic.smk | 22 +- CHANGELOG.rst | 10 + 
tests/commands/report/test_deliver.py | 42 +-- tests/conftest.py | 100 +++---- tests/scripts/test_collect_qc_metrics.py | 151 ++++++++++ tests/scripts/test_create_pdf.py | 2 - .../qc/multiqc_data/multiqc_data.json | 14 +- ...ple_tumor_normal_metrics_deliverables.yaml | 162 +++++++++++ ...ample_tumor_only_metrics_deliverables.yaml | 81 ++++++ tests/utils/test_models.py | 156 +++++------ tests/utils/test_qc_metrics.py | 259 ++---------------- tests/utils/test_utils.py | 60 ++++ 23 files changed, 913 insertions(+), 801 deletions(-) create mode 100644 BALSAMIC/assets/scripts/collect_qc_metrics.py create mode 100644 tests/scripts/test_collect_qc_metrics.py create mode 100644 tests/test_data/qc_files/analysis/qc/sample_tumor_normal_metrics_deliverables.yaml create mode 100644 tests/test_data/qc_files/analysis/qc/sample_tumor_only_metrics_deliverables.yaml diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py new file mode 100644 index 000000000..493cb6587 --- /dev/null +++ b/BALSAMIC/assets/scripts/collect_qc_metrics.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python +import json +import os +from pathlib import Path +from typing import List, Union + +import click +import yaml + +from BALSAMIC.constants.quality_check_reporting import METRICS + +from BALSAMIC.utils.models import MetricModel + + +@click.command( + short_help="Extract the manually specified QC metrics", +) +@click.argument("output_path", type=click.Path(exists=False), required=True) +@click.argument("multiqc_data_path", type=click.Path(exists=True), required=True) +@click.argument("sequencing_type", required=True) +@click.argument("capture_kit", required=True) +def collect_qc_metrics( + output_path: Path, + multiqc_data_path: Path, + sequencing_type: str, + capture_kit: Union[str, None], +): + """Extracts the requested metrics from a JSON multiqc file and saves them to a YAML file + + Args: + output_path: Path; destination path for the extracted YAML formatted 
metrics + multiqc_data_path: Path; multiqc JSON path from which the metrics will be extracted + sequencing_type: str; analysis sequencing type + capture_kit: str; capture kit used for targeted analysis (None for WGS) + """ + + with open(output_path, "w") as fn: + yaml.dump( + get_multiqc_metrics(multiqc_data_path, sequencing_type, capture_kit), + fn, + sort_keys=False, + default_flow_style=False, + ) + + +def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str: + """Extracts the metrics data source associated with a specific sample and tool + + Args: + multiqc_data: dict; raw data from the multiqc_data.json file + sample: str; sample ID + tool: str; QC analysis tools applied during the workflow (e.g. "multiqc_picard_dups") + + Returns: + A source file that was used to produce a specific metric + """ + + # Use case: splits multiqc_picard_dups into ['multiqc', 'picard', 'dup'] in order to retrieve the + # ["report_data_sources"]["Picard"]["DuplicationMetrics"] values from multiqc_data.json + subtool_name = tool[:-1].split("_") + + # Nested json fetching + for source_tool in multiqc_data["report_data_sources"]: + # source_tool: Picard, fastp, FastQC, etc. + for source_subtool in multiqc_data["report_data_sources"][source_tool]: + # source_subtool (for Picard): AlignmentSummaryMetrics, HsMetrics, DuplicationMetric, etc. 
+ if ( + subtool_name[1].lower() in source_tool.lower() + and subtool_name[2].lower() in source_subtool.lower() + ): + try: + return os.path.basename( + multiqc_data["report_data_sources"][source_tool][ + source_subtool + ][sample] + ) + except KeyError: + # Deletes par orientation information from the sample name (insertSize metrics) + sample = sample.rsplit("_", 1)[0] + return os.path.basename( + multiqc_data["report_data_sources"][source_tool][ + source_subtool + ][sample] + ) + + +def get_qc_available_panel_beds(metrics: List[str]) -> List[str]: + """Returns available panel bed file names from a list of requested metrics""" + available_beds = [] + + for k in metrics: + if k != "default": + available_beds.append(k) + + return available_beds + + +def get_requested_metrics( + metrics: dict, sequencing_type: str, capture_kit: Union[str, None] +) -> dict: + """Parses the defined and requested metrics and returns them as a dictionary""" + + requested_metrics = metrics[sequencing_type] + if capture_kit: + requested_metrics = metrics[sequencing_type]["default"] + if capture_kit in get_qc_available_panel_beds(metrics[sequencing_type]): + requested_metrics.update(metrics[sequencing_type][capture_kit]) + + return requested_metrics + + +def get_multiqc_metrics( + multiqc_data_path: Path, sequencing_type: str, capture_kit: Union[str, None] +) -> dict: + """Extracts the requested metrics from a multiqc JSON file and returns them as a dictionary""" + + with open(multiqc_data_path, "r") as f: + multiqc_data = json.load(f) + + requested_metrics = get_requested_metrics(METRICS, sequencing_type, capture_kit) + + def extract(data, output_metrics, sample=None, source=None): + """Recursively fetch metrics data from a nested multiqc JSON""" + + if isinstance(data, dict): + for k in data: + if "umi" not in k: + if k in requested_metrics: + output_metrics.append( + MetricModel( + id=sample.split("_")[1], + input=get_multiqc_data_source( + multiqc_data, sample, source + ), + name=k, + 
step=source, + value=data[k], + condition=requested_metrics[k]["condition"], + ).dict() + ) + extract(data[k], output_metrics, k, sample) + + return output_metrics + + return extract(multiqc_data["report_saved_raw_data"], []) + + +if __name__ == "__main__": + collect_qc_metrics() diff --git a/BALSAMIC/commands/report/deliver.py b/BALSAMIC/commands/report/deliver.py index 72347c7ec..7fdc9dea5 100644 --- a/BALSAMIC/commands/report/deliver.py +++ b/BALSAMIC/commands/report/deliver.py @@ -9,15 +9,13 @@ import subprocess from pathlib import Path -from BALSAMIC.constants.quality_check_reporting import METRICS_TO_DELIVER -from BALSAMIC.utils.cli import get_file_extension +from BALSAMIC.utils.cli import get_file_extension, read_yaml from BALSAMIC.utils.cli import write_json from BALSAMIC.utils.cli import get_snakefile from BALSAMIC.utils.cli import SnakeMake from BALSAMIC.utils.cli import convert_deliverables_tags -from BALSAMIC.utils.rule import get_result_dir, get_capture_kit +from BALSAMIC.utils.rule import get_result_dir from BALSAMIC.utils.exc import BalsamicError -from BALSAMIC.utils.qc_metrics import get_qc_metrics_json, extract_metrics_for_delivery from BALSAMIC.utils.qc_report import render_html, report_data_population from BALSAMIC.constants.workflow_params import VCF_DICT from BALSAMIC.constants.workflow_rules import DELIVERY_RULES @@ -86,14 +84,6 @@ help=f"Run workflow with selected variant caller(s) disable. Use comma to remove multiple variant callers. Valid " f"values are: {list(VCF_DICT.keys())}", ) -@click.option( - "--qc-metrics/--no-qc-metrics", - default=True, - show_default=True, - is_flag=True, - help=f"Generates a YAML file of quality control metrics. " - f"Currently retrieved metrics: {', '.join(list(set(METRICS_TO_DELIVER['targeted'] + METRICS_TO_DELIVER['wgs'])))}", -) @click.pass_context def deliver( context, @@ -104,7 +94,6 @@ def deliver( disable_variant_caller, sample_id_map, case_id_map, - qc_metrics, ): """ cli for deliver sub-command. 
@@ -163,10 +152,13 @@ def deliver( meta["case_name"] = case_id_map[1] meta["apptag"] = case_id_map[2] - collected_qc = get_qc_metrics_json( - sample_config_dict["analysis"]["result"], - sequencing_type, - get_capture_kit(sample_config_dict), + collected_qc = read_yaml( + os.path.join( + sample_config_dict["analysis"]["result"], + "qc", + sample_config_dict["analysis"]["case_id"] + + "_metrics_deliverables.yaml", + ) ) meta = report_data_population(collected_qc=collected_qc, meta=meta) balsamic_qc_report = os.path.join( @@ -266,30 +258,6 @@ def deliver( } ) - # Add output metrics delivery to report - if qc_metrics: - metric_delivery_report = os.path.join( - yaml_write_directory, case_name + "_metrics_deliverables.yaml" - ) - metrics = extract_metrics_for_delivery( - sample_config_dict["analysis"]["result"], sequencing_type - ) - - with open(metric_delivery_report, "w") as fn: - yaml.dump(metrics, fn, default_flow_style=False) - - LOG.info(f"Created metrics delivery file: {metric_delivery_report}") - - delivery_json["files"].append( - { - "path": metric_delivery_report, - "step": "balsamic_delivery", - "format": get_file_extension(metric_delivery_report), - "tag": ["qc-metrics-yaml"], - "id": case_name, - } - ) - write_json(delivery_json, delivery_file_name) with open(delivery_file_name + ".yaml", "w") as fn: yaml.dump(delivery_json, fn, default_flow_style=False) diff --git a/BALSAMIC/config/cluster.json b/BALSAMIC/config/cluster.json index e468df258..e44380d68 100644 --- a/BALSAMIC/config/cluster.json +++ b/BALSAMIC/config/cluster.json @@ -255,5 +255,9 @@ "ascat_tumor_normal_merge_output": { "time": "00:15:00", "n": 1 + }, + "collect_custom_qc_metrics": { + "time": "00:15:00", + "n": 1 } } diff --git a/BALSAMIC/constants/quality_check_reporting.py b/BALSAMIC/constants/quality_check_reporting.py index 3516cccad..272cb01e5 100644 --- a/BALSAMIC/constants/quality_check_reporting.py +++ b/BALSAMIC/constants/quality_check_reporting.py @@ -50,114 +50,45 @@ }, } 
-METRIC_FILES = { - "picard_insertSize": "multiqc_picard_insertSize.json", - "picard_dups": "multiqc_picard_dups.json", - "picard_HsMetrics": "multiqc_picard_HsMetrics.json", - "picard_wgsmetrics": "multiqc_picard_wgsmetrics.json", -} - METRICS = { - "qc": { - "targeted": { - "default": { - METRIC_FILES["picard_insertSize"]: { - "MEAN_INSERT_SIZE": {"condition": None}, - }, - METRIC_FILES["picard_dups"]: { - "PERCENT_DUPLICATION": {"condition": None} - }, - METRIC_FILES["picard_HsMetrics"]: { - "MEDIAN_TARGET_COVERAGE": {"condition": None}, - "PCT_TARGET_BASES_50X": {"condition": None}, - "PCT_TARGET_BASES_100X": {"condition": None}, - "PCT_TARGET_BASES_250X": {"condition": None}, - "PCT_TARGET_BASES_500X": {"condition": None}, - "PCT_TARGET_BASES_1000X": {"condition": None}, - "MEAN_TARGET_COVERAGE": {"condition": None}, - "FOLD_80_BASE_PENALTY": {"condition": None}, - "PCT_OFF_BAIT": {"condition": None}, - }, - }, - "gicfdna_3.1_hg19_design.bed": { - METRIC_FILES["picard_HsMetrics"]: { - "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 1000} - }, - "FOLD_80_BASE_PENALTY": { - "condition": {"norm": "lt", "threshold": 1.6} - }, - } - }, - "gmcksolid_4.1_hg19_design.bed": { - METRIC_FILES["picard_HsMetrics"]: { - "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 500} - }, - "FOLD_80_BASE_PENALTY": { - "condition": {"norm": "lt", "threshold": 1.8} - }, - } - }, - "gmsmyeloid_5.2_hg19_design.bed": { - METRIC_FILES["picard_HsMetrics"]: { - "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 1000} - }, - "FOLD_80_BASE_PENALTY": { - "condition": {"norm": "lt", "threshold": 1.6} - }, - } - }, - "lymphoma_6.1_hg19_design.bed": { - METRIC_FILES["picard_HsMetrics"]: { - "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 1000} - }, - "FOLD_80_BASE_PENALTY": { - "condition": {"norm": "lt", "threshold": 1.6} - }, - } - }, - "gmslymphoid_7.1_hg19_design.bed": { - METRIC_FILES["picard_HsMetrics"]: { 
- "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 1000} - }, - "FOLD_80_BASE_PENALTY": { - "condition": {"norm": "lt", "threshold": 1.6} - }, - } - }, - "twistexomerefseq_9.1_hg19_design.bed": { - METRIC_FILES["picard_HsMetrics"]: { - "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 100} - }, - "FOLD_80_BASE_PENALTY": { - "condition": {"norm": "lt", "threshold": 1.8} - }, - } - }, + "targeted": { + "default": { + "MEAN_INSERT_SIZE": {"condition": None}, + "PERCENT_DUPLICATION": {"condition": None}, + "MEDIAN_TARGET_COVERAGE": {"condition": None}, + "PCT_TARGET_BASES_50X": {"condition": None}, + "PCT_TARGET_BASES_100X": {"condition": None}, + "PCT_TARGET_BASES_250X": {"condition": None}, + "PCT_TARGET_BASES_500X": {"condition": None}, + "PCT_TARGET_BASES_1000X": {"condition": None}, + "MEAN_TARGET_COVERAGE": {"condition": None}, + "FOLD_80_BASE_PENALTY": {"condition": None}, + "PCT_OFF_BAIT": {"condition": None}, }, - "wgs": { - METRIC_FILES["picard_wgsmetrics"]: { - "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}} - }, + "gicfdna_3.1_hg19_design.bed": { + "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 1000}}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.6}}, }, - } -} - -METRICS_TO_DELIVER = { - "targeted": [ - "MEAN_INSERT_SIZE", - "PERCENT_DUPLICATION", - "MEAN_TARGET_COVERAGE", - "MEDIAN_TARGET_COVERAGE", - "FOLD_80_BASE_PENALTY", - "PCT_OFF_BAIT", - ], - "wgs": [ - "FOLD_80_BASE_PENALTY", - ], + "gmcksolid_4.1_hg19_design.bed": { + "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 500}}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, + }, + "gmsmyeloid_5.2_hg19_design.bed": { + "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 1000}}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.6}}, + }, + "lymphoma_6.1_hg19_design.bed": { + "MEDIAN_TARGET_COVERAGE": 
{"condition": {"norm": "gt", "threshold": 1000}}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.6}}, + }, + "gmslymphoid_7.1_hg19_design.bed": { + "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 1000}}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.6}}, + }, + "twistexomerefseq_9.1_hg19_design.bed": { + "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 100}}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, + }, + }, + "wgs": {"FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}}, } diff --git a/BALSAMIC/constants/workflow_rules.py b/BALSAMIC/constants/workflow_rules.py index 064d87501..fde13bad6 100644 --- a/BALSAMIC/constants/workflow_rules.py +++ b/BALSAMIC/constants/workflow_rules.py @@ -114,6 +114,7 @@ DELIVERY_RULES = [ "fastp", "multiqc", + "collect_custom_qc_metrics", "vep_somatic", "vep_germline", "tmb_calculation", diff --git a/BALSAMIC/containers/balsamic/balsamic.yaml b/BALSAMIC/containers/balsamic/balsamic.yaml index 20a61f3a7..f4a5c916f 100644 --- a/BALSAMIC/containers/balsamic/balsamic.yaml +++ b/BALSAMIC/containers/balsamic/balsamic.yaml @@ -7,6 +7,4 @@ dependencies: - conda-forge::python=3.7 - conda-forge::pip - conda-forge::pygraphviz - - conda-forge::numpy - - conda-forge::click - conda-forge::pillow diff --git a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule index 4922e2ee2..092cdef3a 100644 --- a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule +++ b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule @@ -60,8 +60,6 @@ else: multiqc_input.extend(expand(umi_qc_dir + "{sample}.umi.collect_hsmetric", sample=config["samples"])) - - rule multiqc: input: multiqc_input @@ -86,3 +84,25 @@ echo -e \"{params.dir_list}\" > {params.qc_dir}/dir_list; multiqc --force --outdir {params.qc_dir} --exclude {params.exclude_module} --data-format json -l 
{params.qc_dir}/dir_list; chmod -R 777 {params.qc_dir}; """ + + +rule collect_custom_qc_metrics: + input: + json = qc_dir + "multiqc_data/multiqc_data.json" + output: + yaml = qc_dir + config["analysis"]["case_id"] + "_metrics_deliverables.yaml" + params: + collect_qc_metrics_script = get_script_path("collect_qc_metrics.py"), + sequencing_type = get_sequencing_type(config), + capture_kit = get_capture_kit(config), + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "qc-metrics"} + singularity: + Path(singularity_image, "balsamic.sif").as_posix() + threads: + get_threads(cluster_config, "collect_custom_qc_metrics") + message: + "Extract the manually specified QC metric for validation and delivery" + shell: + """ +python {params.collect_qc_metrics_script} {output.yaml} {input.json} {params.sequencing_type} {params.capture_kit} + """ diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index 78fe66949..213f41ebf 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -232,6 +232,15 @@ def write_json(json_out, output_config): raise error +def read_yaml(yaml_path): + """Retrieves data from a yaml file""" + if Path(yaml_path).exists(): + with open(yaml_path, "r") as fn: + return yaml.load(fn, Loader=yaml.SafeLoader) + else: + raise FileNotFoundError(f"The YAML file {yaml_path} was not found.") + + def iterdict(dic): """dictionary iteration - returns generator""" for key, value in dic.items(): diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index f6e78508a..286c74ba4 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -3,7 +3,7 @@ import os from datetime import datetime from pathlib import Path -from typing import Optional, List, Dict +from typing import Optional, List, Dict, Any from pydantic import BaseModel, validator, Field, AnyUrl, root_validator from pydantic.types import DirectoryPath, FilePath @@ -697,64 +697,20 @@ class BalsamicWorkflowConfig(BaseModel): tnscope_umi: UMIParamsTNscope -class 
QCMetricModel(BaseModel): - """Defines the quality control metric model +class MetricConditionModel(BaseModel): + """Defines the metric condition model Attributes: - name: str (required); quality control metric name norm: string (optional); validation condition threshold: float (optional); validation cut off - value: float (required); metrics value - - Raises: - ValueError: when a metric does not meet its validation requirements """ - name: str norm: Optional[str] = None threshold: Optional[float] = None - value: float - - @root_validator() - def check_metric(cls, values): - """Checks if a metric meets its filtering condition""" - if ( - values["norm"] - and values["threshold"] - and not VALID_OPS[values["norm"]](values["value"], values["threshold"]) - ): - raise ValueError( - f"QC metric {values['name']}: {values['value']} validation has failed. " - f"(Condition: {values['norm']} {values['threshold']})." - ) - - LOG.info(f"QC metric {values['name']}: {values['value']} meets its condition.") - return values -class QCValidationModel(BaseModel): - """Defines the quality control validation model - - Attributes: - metrics: Dict(sample_name, list(QCMetricModel)) (required); quality control metric attributes - """ - - metrics: Dict[str, List[QCMetricModel]] - - @property - def get_json(self): - """Restructures the metrics dictionary and returns a metric-value json object""" - metrics_json = {k: {} for k in self.metrics} - - for sample_name, metrics in self.metrics.items(): - for metric in metrics: - metrics_json[sample_name].update({metric.name: metric.value}) - - return metrics_json - - -class DeliveryMetricModel(BaseModel): - """Defines the metric attributes model for delivery +class MetricModel(BaseModel): + """Defines the metric attributes model Attributes: header: str (optional); data @@ -762,7 +718,8 @@ class DeliveryMetricModel(BaseModel): input: str (required); input file name: str (required); metric name step: str (required); step that generated the metric - 
value: float (required); metric value + value: Any (required and can take None as a value); metric value + condition: MetricConditionModel (required and can take None as a value); metric validation condition """ header: Optional[str] @@ -770,4 +727,34 @@ class DeliveryMetricModel(BaseModel): input: str name: str step: str - value: float + value: Any = ... + condition: Optional[MetricConditionModel] = ... + + +class MetricValidationModel(BaseModel): + """Defines the metric validation model + + Attributes: + metrics: List[MetricModel] (required); metric model to validate + + Raises: + ValueError: when a metric does not meet its validation requirements + """ + + metrics: List[MetricModel] + + @validator("metrics", each_item=True) + def check_squares(cls, metric): + """Checks if a metric meets its filtering condition""" + + if metric.condition and not VALID_OPS[metric.condition.norm]( + metric.value, metric.condition.threshold + ): + raise ValueError( + f"QC metric {metric.name}: {metric.value} validation has failed. " + f"(Condition: {metric.condition.norm} {metric.condition.threshold}, ID: {metric.id})." 
+ ) + + LOG.info(f"QC metric {metric.name}: {metric.value} meets its condition.") + + return metric diff --git a/BALSAMIC/utils/qc_metrics.py b/BALSAMIC/utils/qc_metrics.py index 704641245..3d69bafad 100644 --- a/BALSAMIC/utils/qc_metrics.py +++ b/BALSAMIC/utils/qc_metrics.py @@ -1,162 +1,21 @@ -import json -import os +from typing import Union -from BALSAMIC.constants.quality_check_reporting import ( - METRICS, - METRICS_TO_DELIVER, -) -from BALSAMIC.utils.models import QCValidationModel, DeliveryMetricModel +from BALSAMIC.utils.models import MetricValidationModel -def get_qc_available_panel_beds(metrics): - """Returns available panel beds file names for QC validation""" - available_beds = [] +def get_qc_metric_value( + metrics: dict, sample_id: str, metric_name: str +) -> Union[float, None]: + """Extracts the metrics value associated to a specific sample_id and metric_name""" - for k in metrics: - if k != "default": - available_beds.append(k) + for metric in metrics: + if metric["id"] == sample_id and metric["name"] == metric_name: + return metric["value"] - return available_beds + return None -def merge_dicts(*dicts): - """Merges multiple dictionaries integrating by common keys""" - merged_dict = {} +def validate_qc_metrics(metrics: dict) -> dict: + """Returns a set of validated QC metrics""" - for d in dicts: - for key in d: - try: - # Overwrites the default values with panel specific ones - merged_dict[key].update(d[key]) - except KeyError: - merged_dict[key] = d[key] - - return merged_dict - - -def read_metrics(analysis_path, file_name): - """Extracts all the metrics from a specific QC file""" - with open(os.path.join(analysis_path, "qc", "multiqc_data", file_name), "r") as f: - raw_metrics = json.load(f) - - # Ignore the metrics associated with UMIs - filtered_raw_metrics = { - sample_name: metrics - for sample_name, metrics in raw_metrics.items() - if "umi" not in sample_name - } - - return filtered_raw_metrics - - -def update_metrics_dict(sample_id, metric, 
value, metrics_dict): - """Appends a {metric, value, condition} object to a dictionary""" - sample_name = "_".join([sample_id.split("_")[0], sample_id.split("_")[1]]) - - if sample_name not in metrics_dict: - metrics_dict[sample_name] = [] - - try: - norm = metric[1]["condition"]["norm"] - threshold = metric[1]["condition"]["threshold"] - except TypeError: - norm = None - threshold = None - - metrics_dict[sample_name].append( - {"name": metric[0], "norm": norm, "threshold": threshold, "value": value} - ) - - return metrics_dict - - -def get_qc_metrics_dict(analysis_path, requested_metrics): - """Returns a dictionary of the requested QC metrics along with their values and filtering conditions""" - metrics_dict = {} - - # Loop through MultiQC json files - for file_name, metrics in requested_metrics.items(): - raw_metrics = read_metrics(analysis_path, file_name) - for j in raw_metrics: - for k in metrics.items(): - metrics_dict = update_metrics_dict( - j, k, raw_metrics[j][k[0]], metrics_dict - ) - return metrics_dict - - -def get_qc_metrics_json(analysis_path, sequencing_type, panel_bed): - """Extracts the metrics of interest and returns them as a json object""" - if sequencing_type != "wgs" and panel_bed in get_qc_available_panel_beds( - METRICS["qc"][sequencing_type] - ): - metrics = merge_dicts( - METRICS["qc"][sequencing_type]["default"], - METRICS["qc"][sequencing_type][panel_bed], - ) - elif sequencing_type != "wgs": - metrics = METRICS["qc"][sequencing_type]["default"] - else: - metrics = METRICS["qc"][sequencing_type] - - qc_model = QCValidationModel.parse_obj( - {"metrics": get_qc_metrics_dict(analysis_path, metrics)} - ) - - return qc_model.get_json - - -def get_multiqc_data_source(data, sample, source_name): - """Extracts the metrics data source associated with sample and source names""" - - # Splits multiqc_picard_dups into ['multiqc', 'picard', 'dup'] in order to retrieve the - # ["report_data_sources"]["Picard"]["DuplicationMetrics"] values from 
multiqc_data.json - source = source_name[:-1].split("_") - - # Nested json fetching - for source_tool in data["report_data_sources"]: - for source_step in data["report_data_sources"][source_tool]: - if ( - source[1].lower() in source_tool.lower() - and source[2].lower() in source_step.lower() - ): - try: - return os.path.basename( - data["report_data_sources"][source_tool][source_step][sample] - ) - except KeyError: - # Deletes par orientation information from the sample name (insertSize metrics) - sample = sample.rsplit("_", 1)[0] - - return os.path.basename( - data["report_data_sources"][source_tool][source_step][sample] - ) - - -def extract_metrics_for_delivery(analysis_path, sequencing_type): - """Extracts the output metrics to be delivered""" - with open( - os.path.join(analysis_path, "qc", "multiqc_data", "multiqc_data.json"), "r" - ) as f: - raw_data = json.load(f) - - def extract(data, output_metrics, sample=None, source=None): - """Recursively fetch metrics information from nested multiQC JSON""" - if isinstance(data, dict): - for k in data: - if "umi" not in k: - if k in METRICS_TO_DELIVER[sequencing_type]: - output_metrics.append( - DeliveryMetricModel( - id=sample.split("_")[1], - input=get_multiqc_data_source(raw_data, sample, source), - name=k, - step=source, - value=data[k], - ).dict() - ) - extract(data[k], output_metrics, k, sample) - - return output_metrics - - return extract(raw_data["report_saved_raw_data"], []) + return MetricValidationModel(metrics=metrics).dict()["metrics"] diff --git a/BALSAMIC/utils/qc_report.py b/BALSAMIC/utils/qc_report.py index 405659821..9f2907404 100644 --- a/BALSAMIC/utils/qc_report.py +++ b/BALSAMIC/utils/qc_report.py @@ -5,9 +5,10 @@ from BALSAMIC import __version__ as balsamic_version from BALSAMIC.constants.quality_check_reporting import REPORT_MODEL +from BALSAMIC.utils.qc_metrics import get_qc_metric_value -def report_data_population(collected_qc: dict, meta: dict, lang: str = "sv") -> dict: +def 
report_data_population(collected_qc: list, meta: dict, lang: str = "sv") -> dict: """populates a metadata dictionary that contains qc and case/sample information""" meta = { **meta, @@ -26,16 +27,16 @@ def report_data_population(collected_qc: dict, meta: dict, lang: str = "sv") -> v[lang] for x, v in REPORT_MODEL["coverage"].items() ] - for sample_id, analysis_results in collected_qc.items(): - lims_id = sample_id.split("_")[1] + for sample in collected_qc: + lims_id = sample["id"] sample_qc = [meta["sample_map"][lims_id], meta["sample_type"][lims_id]] sample_cov = [meta["sample_map"][lims_id], meta["sample_type"][lims_id]] sample_qc = sample_qc + parse_collected_qc( - collected_qc=collected_qc, model_param="qc", sample_id=sample_id + collected_qc=collected_qc, model_param="qc", sample_id=lims_id ) sample_cov = sample_cov + parse_collected_qc( - collected_qc=collected_qc, model_param="coverage", sample_id=sample_id + collected_qc=collected_qc, model_param="coverage", sample_id=lims_id ) meta["qc_table_content"][lims_id] = sample_qc @@ -44,13 +45,13 @@ def report_data_population(collected_qc: dict, meta: dict, lang: str = "sv") -> return meta -def parse_collected_qc(collected_qc: dict, model_param: str, sample_id: str) -> list: +def parse_collected_qc(collected_qc: list, model_param: str, sample_id: str) -> list: """parses collect qc and returns model_param""" parsed_qc = list() for qc_item, qc_value in REPORT_MODEL[model_param].items(): decimal_point = qc_value["decimal"] - qc_to_report = collected_qc[sample_id][qc_item] + qc_to_report = get_qc_metric_value(collected_qc, sample_id, qc_item) if "as_percent" in qc_value: qc_to_report = qc_to_report * 100 qc_to_report = str(round(qc_to_report, decimal_point)) diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index f15fe175f..0ddc68010 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -13,7 +13,7 @@ from PyPDF2 import PdfFileMerger from BALSAMIC.utils.exc 
import BalsamicError -from BALSAMIC.utils.cli import (write_json, check_executable, generate_h5) +from BALSAMIC.utils.cli import (write_json, check_executable, generate_h5, read_yaml) from BALSAMIC.utils.models import VarCallerFilter, BalsamicWorkflowConfig @@ -208,7 +208,10 @@ for r in rules_to_include: include: Path(RULE_DIRECTORY, r).as_posix() # Define common and analysis specific outputs -quality_control_results = [result_dir + "qc/" + "multiqc_report.html"] +quality_control_results = [ + os.path.join(qc_dir, "multiqc_report.html"), + os.path.join(qc_dir, case_id + "_metrics_deliverables.yaml"), +] analysis_specific_results = [expand(vep_dir + "{vcf}.vcf.gz", vcf=get_vcf(config, germline_caller, germline_call_samples)), @@ -317,25 +320,20 @@ if 'delivery' in config: rule all: input: - quality_control_results + analysis_specific_results + quality_control_results + analysis_specific_results, output: - qc_json_file = os.path.join(get_result_dir(config), "qc", "qc_metrics_summary.json"), finish_file = os.path.join(get_result_dir(config), "analysis_finish") params: - tmp_dir = tmp_dir, - result_dir = result_dir, - sequencing_type = get_sequencing_type(config), - panel_bed = get_capture_kit(config) + tmp_dir = tmp_dir run: import datetime import shutil - from BALSAMIC.utils.qc_metrics import get_qc_metrics_json + from BALSAMIC.utils.qc_metrics import validate_qc_metrics - # Save QC metrics to a JSON file + # Perform validation of extracted QC metrics try: - qc_metrics_summary = get_qc_metrics_json(params.result_dir, params.sequencing_type, params.panel_bed) - write_json(qc_metrics_summary, str(output.qc_json_file)) + validate_qc_metrics(read_yaml(input[1])) except ValueError as val_exc: LOG.error(val_exc) raise BalsamicError diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d1d60e758..49b362284 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,16 @@ Added: * Snakemake workflow to create canfam3 reference #843 * Call umi variants using TNscope in bed defined 
regions #821 +Changed: +^^^^^^^^ + +* Merge QC metric extraction workflows #833 + +Removed +^^^^^^^ + +* ``--qc-metrics/--no-qc-metrics`` flag from the ``balsamic report deliver`` command #833 + [8.2.5] ------- diff --git a/tests/commands/report/test_deliver.py b/tests/commands/report/test_deliver.py index 37090b83c..97e910c73 100644 --- a/tests/commands/report/test_deliver.py +++ b/tests/commands/report/test_deliver.py @@ -35,6 +35,8 @@ def test_deliver_tumor_only_panel( "tumor:tumor:KS454", "--case-id-map", "gmck-solid:KSK899:apptag", + "--disable-variant-caller", + "cnvkit", ] ) @@ -107,43 +109,3 @@ def test_deliver_tumor_normal_panel( assert result.exit_code == 0 assert actual_delivery_report.is_file() assert "following" in caplog.text - - -def test_deliver_metrics( - invoke_cli, - environ, - tumor_normal_config, - helpers, - sentieon_install_dir, - sentieon_license, - caplog, -): - - # GIVEN a tumor-normal config file - helpers.read_config(tumor_normal_config) - actual_metric_delivery_yaml = Path( - helpers.delivery_dir, helpers.case_id + "_metrics_deliverables.yaml" - ) - - with mock.patch.dict( - environ, - { - "SENTIEON_LICENSE": sentieon_license, - "SENTIEON_INSTALL_DIR": sentieon_install_dir, - }, - ), caplog.at_level(logging.DEBUG): - # WHEN running analysis - result = invoke_cli( - [ - "report", - "deliver", - "--sample-config", - tumor_normal_config, - "--qc-metrics", - ] - ) - - # THEN it should run without any error - assert result.exit_code == 0 - assert actual_metric_delivery_yaml.is_file() - assert "following" in caplog.text diff --git a/tests/conftest.py b/tests/conftest.py index 983a3c235..0657f3291 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,6 +7,8 @@ from pathlib import Path from functools import partial from click.testing import CliRunner + +from BALSAMIC.utils.cli import read_yaml from .helpers import ConfigHelper from BALSAMIC.commands.base import cli from BALSAMIC import __version__ as balsamic_version @@ -273,9 +275,9 @@ 
def tumor_normal_config( ], ) - qc_dir = Path(analysis_dir, case_id, "analysis", "qc", "multiqc_data") + qc_dir = Path(analysis_dir, case_id, "analysis", "qc") qc_dir.mkdir(parents=True, exist_ok=False) - copy_tree("tests/test_data/qc_files/analysis/qc/multiqc_data/", qc_dir.as_posix()) + copy_tree("tests/test_data/qc_files/analysis/qc/", qc_dir.as_posix()) return Path(analysis_dir, case_id, case_id + ".json").as_posix() @@ -378,9 +380,9 @@ def tumor_only_config( ], ) - qc_dir = Path(analysis_dir, case_id, "analysis", "qc", "multiqc_data") + qc_dir = Path(analysis_dir, case_id, "analysis", "qc") qc_dir.mkdir(parents=True, exist_ok=False) - copy_tree("tests/test_data/qc_files/analysis/qc/multiqc_data/", qc_dir.as_posix()) + copy_tree("tests/test_data/qc_files/analysis/qc/", qc_dir.as_posix()) return Path(analysis_dir, case_id, case_id + ".json").as_posix() @@ -494,71 +496,43 @@ def analysis_path(): @pytest.fixture(scope="session") -def qc_metrics(): - """Sample data for QC model testing""" - return { - "qc": { - "targeted": { - "multiqc_picard_insertSize.json": { - "MEAN_INSERT_SIZE": {"condition": None} - }, - "multiqc_picard_HsMetrics.json": { - "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 500.0} - } - }, - }, - "wgs": { - "multiqc_picard_insertSize.json": { - "MEAN_INSERT_SIZE": {"condition": None} - }, - "multiqc_picard_dups.json": { - "PERCENT_DUPLICATION": {"condition": None} - }, - }, - } - } +def multiqc_data_path(analysis_path): + """multiqc_data.json test path""" + return os.path.join(analysis_path, "qc", "multiqc_data", "multiqc_data.json") @pytest.fixture(scope="session") -def qc_extracted_metrics(): - """Extracted metrics for QC model testing""" - return { - "metrics": { - "sample_1": [ - { - "name": "MEAN_INSERT_SIZE_1", - "norm": "lt", - "threshold": 1.0, - "value": 0.5, - }, - { - "name": "MEAN_INSERT_SIZE_2", - "norm": "lt", - "threshold": 1.0, - "value": 0.5, - }, - ], - "sample_2": [ - { - "name": "MEAN_INSERT_SIZE_1", - 
"norm": "lt", - "threshold": 1.0, - "value": 0.5, - }, - ], - } - } +def metrics_yaml_path(analysis_path): + """sample_tumor_only_metrics_deliverables.yaml test path""" + return os.path.join( + analysis_path, "qc", "sample_tumor_only_metrics_deliverables.yaml" + ) @pytest.fixture(scope="session") -def qc_raw_targeted_metrics(): - """Raw metrics""" +def qc_requested_metrics(): + """Raw requested metrics""" return { - "default": { - "metrics_1.json": {"METRIC_1": 0.1, "METRIC_2": 0.2}, - "metrics_2.json": {"METRIC_3": 0.3}, + "targeted": { + "default": { + "METRIC_1": {"condition": None}, + "METRIC_2": {"condition": {"norm": "gt", "threshold": 2}}, + }, + "panel_1.bed": { + "METRIC_3": {"condition": {"norm": "gt", "threshold": 3}}, + }, + "panel_2.bed": { + "METRIC_1": {"condition": {"norm": "gt", "threshold": 1}}, + "METRIC_4": {"condition": {"norm": "gt", "threshold": 4}}, + }, + }, + "wgs": { + "METRIC_1": {"condition": {"norm": "gt", "threshold": 1}}, }, - "panel_1.bed": {"metrics_2.json": {"METRIC_4": 0.4}}, - "panel_2.bed": {"metrics_1.json": {"METRIC_1": 0.5, "METRIC_4": 0.4}}, } + + +@pytest.fixture(scope="session") +def qc_extracted_metrics(metrics_yaml_path): + """Extracted and formatted QC metrics""" + return read_yaml(metrics_yaml_path) diff --git a/tests/scripts/test_collect_qc_metrics.py b/tests/scripts/test_collect_qc_metrics.py new file mode 100644 index 000000000..8e262a8ee --- /dev/null +++ b/tests/scripts/test_collect_qc_metrics.py @@ -0,0 +1,151 @@ +import json +from pathlib import Path + +from BALSAMIC.assets.scripts.collect_qc_metrics import ( + get_multiqc_data_source, + get_multiqc_metrics, + collect_qc_metrics, + get_qc_available_panel_beds, + get_requested_metrics, +) + + +def test_get_qc_available_panel_beds(qc_requested_metrics): + """test extraction of capture kits available for analysis""" + + # GIVEN an expected output + expected_output = ["panel_1.bed", "panel_2.bed"] + + # WHEN calling the function + available_panel_beds = 
get_qc_available_panel_beds(qc_requested_metrics["targeted"]) + + # THEN check if the extracted bed file names correspond to the expected ones + assert available_panel_beds == expected_output + + +def test_get_requested_metrics_targeted(qc_requested_metrics): + """test retrieval of the requested targeted metrics""" + + # GIVEN a sequencing type and a capture kit + seq_type = "targeted" + capture_kit = "panel_1.bed" + + # GIVEN the expected output + expected_output = { + "METRIC_1": {"condition": None}, + "METRIC_2": {"condition": {"norm": "gt", "threshold": 2}}, + "METRIC_3": {"condition": {"norm": "gt", "threshold": 3}}, + } + + # WHEN calling the function + requested_metrics = get_requested_metrics( + qc_requested_metrics, seq_type, capture_kit + ) + + # THEN check if the requested targeted metrics are correctly retrieved + assert requested_metrics.items() == expected_output.items() + + +def test_get_requested_metrics_wgs(qc_requested_metrics): + """test extraction of the requested WGS metrics""" + + # GIVEN a sequencing type and a capture kit + seq_type = "wgs" + capture_kit = None + + # GIVEN the expected output + expected_output = { + "METRIC_1": {"condition": {"norm": "gt", "threshold": 1}}, + } + + # WHEN calling the function + requested_metrics = get_requested_metrics( + qc_requested_metrics, seq_type, capture_kit + ) + + # THEN check if the requested metrics are WGS specific + assert requested_metrics.items() == expected_output.items() + + +def test_get_multiqc_data_source(multiqc_data_path): + """test multiqc source extraction from multiqc_data.json analysis file""" + + # GIVEN input parameters and the multiqc data + sample = "concatenated_tumor_XXXXXX_R" + source_name_hs_metrics = "multiqc_picard_HsMetrics" + source_name_dup = "multiqc_picard_dups" + + with open(multiqc_data_path, "r") as f: + multiqc_data = json.load(f) + + # GIVEN an expected output + source_hs_metrics = "concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric" + source_dup = 
"concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt" + + # WHEN extracting the source of a specific sample and collection of metrics + out_source_hs_metrics = get_multiqc_data_source( + multiqc_data, sample, source_name_hs_metrics + ) + out_source_dup = get_multiqc_data_source(multiqc_data, sample, source_name_dup) + + # THEN check if the extracted source names correspond to the expected ones + assert source_hs_metrics == out_source_hs_metrics + assert source_dup == out_source_dup + + +def test_get_multiqc_metrics(multiqc_data_path, qc_extracted_metrics): + """test metrics retrieval from the multiqc_data.json file""" + + # GIVEN a sequencing type and a capture kit + seq_type = "targeted" + capture_kit = "lymphoma_6.1_hg19_design.bed" + + # WHEN calling the function + metrics = get_multiqc_metrics( + multiqc_data_path, + seq_type, + capture_kit, + ) + + # THEN check if the metrics are correctly retrieved + assert qc_extracted_metrics == metrics + + +def test_get_multiqc_metrics_filtering_umi(multiqc_data_path): + """tests that UMI data is filtered out when extracting metrics""" + + # GIVEN a sequencing type and a capture kit + seq_type = "targeted" + capture_kit = None + + # WHEN calling the function + metrics = get_multiqc_metrics( + multiqc_data_path, + seq_type, + capture_kit, + ) + + # THEN check if the UMI samples are filtered out + for metric in metrics: + assert "umi" not in metric["input"] + + +def test_collect_qc_metrics(tmp_path, multiqc_data_path, cli_runner): + """tests qc metrics yaml file generation""" + + # GIVEN the output and multiqc metrics paths + output_path = tmp_path / "sample_tumor_only_metrics_deliverables.yaml" + + # GIVEN a sequencing type and a capture kit + seq_type = "targeted" + capture_kit = "lymphoma_6.1_hg19_design.bed" + + # WHEN invoking the python script + result = cli_runner.invoke( + collect_qc_metrics, + [str(output_path), multiqc_data_path, seq_type, capture_kit], + ) + + # THEN check if the YAML is correctly created and there are 
no errors + assert result.exit_code == 0 + assert Path(output_path).exists() diff --git a/tests/scripts/test_create_pdf.py b/tests/scripts/test_create_pdf.py index 849232bc9..d5cb85cf3 100644 --- a/tests/scripts/test_create_pdf.py +++ b/tests/scripts/test_create_pdf.py @@ -62,8 +62,6 @@ def test_create_pdf(tmp_path, cli_runner): # GIVEN the output path output_path = tmp_path / "ascat.output.pdf" - print(output_path) - # WHEN invoking the python script result = cli_runner.invoke( create_pdf, [str(output_path), statistics_path, plots_path[0], plots_path[1]] diff --git a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json index c1b9e8f05..e8b15f614 100755 --- a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json +++ b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json @@ -27,7 +27,12 @@ "MIN_TARGET_COVERAGE": 0.0, "FOLD_80_BASE_PENALTY": 1.359189, "AT_DROPOUT": 6.093115, - "GC_DROPOUT": 0.027402 + "GC_DROPOUT": 0.027402, + "PCT_TARGET_BASES_50X": 1.0, + "PCT_TARGET_BASES_100X": 0.999987, + "PCT_TARGET_BASES_250X": 0.998445, + "PCT_TARGET_BASES_500X": 0.996675, + "PCT_TARGET_BASES_1000X": 0.992466 }, "concatenated_tumor_XXXXXX_R.consensusfiltered.umi": { "BAIT_SET": "concatenated_tumor_XXXXXX_R", @@ -41,7 +46,12 @@ "MIN_TARGET_COVERAGE": 0.0, "FOLD_80_BASE_PENALTY": 1.742114, "AT_DROPOUT": 12.048384, - "GC_DROPOUT": 0.150425 + "GC_DROPOUT": 0.150425, + "PCT_TARGET_BASES_50X": 0.999866, + "PCT_TARGET_BASES_100X": 0.99819, + "PCT_TARGET_BASES_250X": 0.996568, + "PCT_TARGET_BASES_500X": 0.994423, + "PCT_TARGET_BASES_1000X": 0.984181 } }, "multiqc_picard_insertSize": { diff --git a/tests/test_data/qc_files/analysis/qc/sample_tumor_normal_metrics_deliverables.yaml b/tests/test_data/qc_files/analysis/qc/sample_tumor_normal_metrics_deliverables.yaml new file mode 100644 index 000000000..91407b3be --- /dev/null +++ 
b/tests/test_data/qc_files/analysis/qc/sample_tumor_normal_metrics_deliverables.yaml @@ -0,0 +1,162 @@ +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_OFF_BAIT + step: multiqc_picard_HsMetrics + value: 0.161185 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: MEAN_TARGET_COVERAGE + step: multiqc_picard_HsMetrics + value: 636.23177 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: MEDIAN_TARGET_COVERAGE + step: multiqc_picard_HsMetrics + value: 597.0 + condition: + norm: gt + threshold: 500.0 +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: FOLD_80_BASE_PENALTY + step: multiqc_picard_HsMetrics + value: 1.469357 + condition: + norm: lt + threshold: 1.8 +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_50X + step: multiqc_picard_HsMetrics + value: 0.998388 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_100X + step: multiqc_picard_HsMetrics + value: 0.99497 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_250X + step: multiqc_picard_HsMetrics + value: 0.965738 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_500X + step: multiqc_picard_HsMetrics + value: 0.679445 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_1000X + step: multiqc_picard_HsMetrics + value: 0.085208 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_OFF_BAIT + step: multiqc_picard_HsMetrics + value: 0.158226 
+ condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: MEAN_TARGET_COVERAGE + step: multiqc_picard_HsMetrics + value: 888.343586 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: MEDIAN_TARGET_COVERAGE + step: multiqc_picard_HsMetrics + value: 805.0 + condition: + norm: gt + threshold: 500.0 +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: FOLD_80_BASE_PENALTY + step: multiqc_picard_HsMetrics + value: 1.566744 + condition: + norm: lt + threshold: 1.8 +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_50X + step: multiqc_picard_HsMetrics + value: 0.998554 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_100X + step: multiqc_picard_HsMetrics + value: 0.997177 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_250X + step: multiqc_picard_HsMetrics + value: 0.979764 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_500X + step: multiqc_picard_HsMetrics + value: 0.874594 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_1000X + step: multiqc_picard_HsMetrics + value: 0.304354 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.insertsizemetric + name: MEAN_INSERT_SIZE + step: multiqc_picard_insertSize + value: 125.819455 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.insertsizemetric + name: MEAN_INSERT_SIZE + step: multiqc_picard_insertSize + value: 131.280203 + condition: null +- header: null + id: tumor + input: 
concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt + name: PERCENT_DUPLICATION + step: multiqc_picard_dups + value: 0.356228 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.txt + name: PERCENT_DUPLICATION + step: multiqc_picard_dups + value: 0.255692 + condition: null diff --git a/tests/test_data/qc_files/analysis/qc/sample_tumor_only_metrics_deliverables.yaml b/tests/test_data/qc_files/analysis/qc/sample_tumor_only_metrics_deliverables.yaml new file mode 100644 index 000000000..a9a2def98 --- /dev/null +++ b/tests/test_data/qc_files/analysis/qc/sample_tumor_only_metrics_deliverables.yaml @@ -0,0 +1,81 @@ +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_OFF_BAIT + step: multiqc_picard_HsMetrics + value: 0.364546 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: MEAN_TARGET_COVERAGE + step: multiqc_picard_HsMetrics + value: 2314.698853 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: MEDIAN_TARGET_COVERAGE + step: multiqc_picard_HsMetrics + value: 2393.0 + condition: + norm: gt + threshold: 1000.0 +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: FOLD_80_BASE_PENALTY + step: multiqc_picard_HsMetrics + value: 1.359189 + condition: + norm: lt + threshold: 1.6 +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_50X + step: multiqc_picard_HsMetrics + value: 1.0 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_100X + step: multiqc_picard_HsMetrics + value: 0.999987 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_250X + step: multiqc_picard_HsMetrics + value: 0.998445 + condition: 
null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_500X + step: multiqc_picard_HsMetrics + value: 0.996675 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_1000X + step: multiqc_picard_HsMetrics + value: 0.992466 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.insertsizemetric + name: MEAN_INSERT_SIZE + step: multiqc_picard_insertSize + value: 201.813054 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt + name: PERCENT_DUPLICATION + step: multiqc_picard_dups + value: 0.391429 + condition: null diff --git a/tests/utils/test_models.py b/tests/utils/test_models.py index 9b61315d1..c94cf6697 100644 --- a/tests/utils/test_models.py +++ b/tests/utils/test_models.py @@ -22,9 +22,9 @@ ParamsCommon, ParamsVardict, ParamsVEP, - QCMetricModel, - QCValidationModel, - DeliveryMetricModel, + MetricModel, + MetricConditionModel, + MetricValidationModel, ) @@ -400,116 +400,104 @@ def test_params_vep(): assert test_vep_built.vep_filters == "all defaults params" -def test_qc_metric_model_pass(qc_extracted_metrics): - """test QCMetricModel attribute parsing and positive validation""" +def test_metric_condition_model(): + """test MetricConditionModel attributes parsing""" # GIVEN input attributes - metric = qc_extracted_metrics["metrics"]["sample_1"][0] + metric_condition = {"norm": "gt", "threshold": 1} - # WHEN building the QC metric model - model = QCMetricModel(**metric) + # WHEN building the metric condition model + metrics_model = MetricConditionModel(**metric_condition) # THEN assert retrieved values from the created model - assert model.dict().items() == metric.items() + assert metrics_model.dict().items() == metric_condition.items() -def test_qc_metric_model_norm_fail(qc_extracted_metrics): - """test QCMetricModel ValueError raising for 
an operator that it is not accepted""" +def test_metric_model_pass_validation(): + """test MetricModel attributes parsing""" - # GIVEN incorrect input attributes - metric = copy.deepcopy(qc_extracted_metrics["metrics"]["sample_1"][0]) - metric["norm"] = "higher" + # GIVEN input attributes + metrics = { + "header": None, + "id": "tumor", + "input": "concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric", + "name": "MEDIAN_TARGET_COVERAGE", + "step": "multiqc_picard_HsMetrics", + "value": 2393.0, + "condition": {"norm": "gt", "threshold": 1000.0}, + } - # THEN model raises an error due to a non accepted norm - try: - QCMetricModel(**metric) - except KeyError as key_exc: - assert metric["norm"] in str(key_exc) + # WHEN building the metric model + metric_model = MetricModel(**metrics) + # THEN assert retrieved values from the created model + assert metric_model.dict().items() == metrics.items() -def test_qc_metric_model_condition_fail(qc_extracted_metrics): - """test QCMetricModel for an overly restrictive metric condition""" - # GIVEN input attributes with a value that does not meet the filtering condition - metric = copy.deepcopy(qc_extracted_metrics["metrics"]["sample_1"][0]) - metric["value"] = 10.0 +def test_metric_model_fail_validation(): + """test MetricModel behaviour for an incorrect input""" - # THEN check that the model filters the metric according to its norm - with pytest.raises(ValueError) as val_exc: - QCMetricModel(**metric) - assert ( - f"QC metric {metric['name']}: {metric['value']} validation has failed. 
" - f"(Condition: {metric['norm']} {metric['threshold']})" in str(val_exc.value) - ) + # GIVEN a non accepted input + invalid_input = {"header": None, "id": "tumor"} + + # THEN the model raises an error due to an incomplete input + with pytest.raises(ValueError) as input_exc: + MetricModel(**invalid_input) + assert f"field required" in str(input_exc.value) -def test_qc_validation_model_pass(qc_extracted_metrics): - """test QCValidationModel attribute parsing and validation""" +def test_metric_validation_model_pass(qc_extracted_metrics): + """test MetricValidationModel attribute parsing and positive validation""" - # WHEN building the QC validation model - model = QCValidationModel(**qc_extracted_metrics) + # WHEN building the MetricValidationModel model + model = MetricValidationModel(metrics=qc_extracted_metrics) # THEN assert retrieved values from the created model - assert model.dict().items() == qc_extracted_metrics.items() + assert model.dict()["metrics"] == qc_extracted_metrics -def test_qc_validation_model_condition_fail(qc_extracted_metrics): - """test QCValidationModel for multiple metrics with failing conditions""" +def test_metric_validation_model_fail(qc_extracted_metrics): + """test MetricValidationModel for an overly restrictive metric condition""" - # GIVEN input attributes that does not meet the specified conditions + # GIVEN input attributes with a value that does not meet the filtering condition metrics = copy.deepcopy(qc_extracted_metrics) - metrics["metrics"]["sample_1"][0]["value"] = 10.0 - metrics["metrics"]["sample_2"][0]["value"] = 10.0 + metrics[3]["value"] = 2.0 - # THEN check that the model filters the metrics according to its norm + # THEN check that the model filters the metric according to its norm with pytest.raises(ValueError) as val_exc: - QCValidationModel(**metrics) - assert "2 validation errors for QCValidationModel" in str(val_exc.value) - - -def test_qc_validation_model_get_json(qc_extracted_metrics): - """test metric-value 
json extraction and metric filtering for passing conditions""" - - # GIVEN expected output - output_metrics = { - "sample_1": {"MEAN_INSERT_SIZE_1": 0.5, "MEAN_INSERT_SIZE_2": 0.5}, - "sample_2": {"MEAN_INSERT_SIZE_1": 0.5}, - } - - # WHEN building the QC validation model - validation_model = QCValidationModel(**qc_extracted_metrics) - - # THEN check if the extracted metrics and its structure meets the expected one - assert validation_model.get_json.items() == output_metrics.items() - + MetricValidationModel(metrics=metrics) + assert ( + f"QC metric {metrics[3]['name']}: {metrics[3]['value']} validation has failed. " + f"(Condition: {metrics[3]['condition']['norm']} {metrics[3]['condition']['threshold']}, ID: {metrics[3]['id']})" + in str(val_exc.value) + ) -def test_delivery_metric_model_pass_validation(): - """test DeliveryMetricModel attributes parsing""" - # GIVEN input attributes - metrics = { - "header": None, - "id": "005", - "input": "S1_005.sorted.mrkdup.txt", - "name": "MEAN_INSERT_SIZE", - "step": "multiqc_rule", - "value": 0.5, - } +def test_multiple_metric_validation_model_fail(qc_extracted_metrics): + """test MetricValidationModel for multiple metrics with failing conditions""" - # WHEN building the delivery metric model - metrics_model = DeliveryMetricModel(**metrics) + # GIVEN input attributes that does not meet the specified conditions + metrics = copy.deepcopy(qc_extracted_metrics) + metrics[2]["value"] = 999.0 + metrics[3]["value"] = 2 - # THEN assert retrieved values from the created model - assert metrics_model.dict().items() == metrics.items() + # THEN check that the model filters the metrics according to its norm + with pytest.raises(ValueError) as val_exc: + MetricValidationModel(metrics=metrics) + assert "2 validation errors for MetricValidationModel" in str(val_exc.value) + assert metrics[2]["name"] in str(val_exc.value) + assert metrics[3]["name"] in str(val_exc.value) -def test_delivery_metric_model_fail_validation(): - """test 
DeliveryMetricModel behaviour for an incorrect input""" +def test_metric_validation_model_norm_fail(qc_extracted_metrics): + """test MetricValidationModel ValueError raising for an operator that it is not accepted""" - # GIVEN a non accepted input - invalid_input = {"name": "MEAN_INSERT_SIZE"} + # GIVEN a metric with an incorrect norm attribute + metrics = copy.deepcopy(qc_extracted_metrics) + metrics[3]["condition"]["norm"] = "lower" - # THEN the model raises an error due to an incomplete input - with pytest.raises(ValueError) as input_exc: - DeliveryMetricModel(**invalid_input) - assert f"field required" in str(input_exc.value) + # THEN model raises an error due to a non accepted norm + try: + MetricValidationModel(metrics=metrics) + except KeyError as key_exc: + assert metrics[3]["condition"]["norm"] in str(key_exc) diff --git a/tests/utils/test_qc_metrics.py b/tests/utils/test_qc_metrics.py index bdfdc23a3..760080f4f 100644 --- a/tests/utils/test_qc_metrics.py +++ b/tests/utils/test_qc_metrics.py @@ -1,254 +1,45 @@ -import json -import os - -from pydantic import ValidationError - from BALSAMIC.utils.qc_metrics import ( - get_qc_metrics_json, - read_metrics, - update_metrics_dict, - get_qc_metrics_dict, - get_qc_available_panel_beds, - merge_dicts, - get_multiqc_data_source, - extract_metrics_for_delivery, + validate_qc_metrics, + get_qc_metric_value, ) -def test_get_qc_available_panel_beds(qc_raw_targeted_metrics): - """test extraction of the panel beds available for QC validation""" - - # GIVEN an expected output - expected_output = ["panel_1.bed", "panel_2.bed"] - - # WHEN calling the function - available_panel_beds = get_qc_available_panel_beds(qc_raw_targeted_metrics) - - # THEN check if the extracted bed file names correspond to the expected ones - assert available_panel_beds == expected_output - - -def test_merge_dicts(qc_raw_targeted_metrics): - """test dictionary merging and requirements overwriting by panel BED specific conditions""" - - # GIVEN an 
expected output - expected_output = { - "metrics_1.json": {"METRIC_1": 0.5, "METRIC_2": 0.2, "METRIC_4": 0.4}, - "metrics_2.json": {"METRIC_3": 0.3}, - } - - # WHEN calling the function - merged_dict = merge_dicts( - qc_raw_targeted_metrics["default"], - qc_raw_targeted_metrics["panel_2.bed"], - ) - - # THEN check if the extracted output meets the merged dictionary - assert merged_dict.items() == expected_output.items() - +def test_get_qc_metric_value(qc_extracted_metrics): + """test QC metric value extraction""" -def test_read_metrics(analysis_path): - """test metric extraction from a specific QC file""" + # GIVEN the input parameters + sample_id = "tumor" + metric_name = "MEDIAN_TARGET_COVERAGE" - # GIVEN a QC file name - file_name = "multiqc_picard_dups.json" - - # GIVEN an expected output - expected_output = { - "concatenated_tumor_XXXXXX_R": { - "LIBRARY": "Unknown Library", - "UNPAIRED_READS_EXAMINED": 11860.0, - "READ_PAIRS_EXAMINED": 20440841.0, - "SECONDARY_OR_SUPPLEMENTARY_RDS": 4333388.0, - "UNMAPPED_READS": 19824.0, - "UNPAIRED_READ_DUPLICATES": 10178.0, - "READ_PAIR_DUPLICATES": 14680829.0, - "READ_PAIR_OPTICAL_DUPLICATES": 0.0, - "PERCENT_DUPLICATION": 0.718251, - "ESTIMATED_LIBRARY_SIZE": 5951948.0, - } - } + # GIVEN an expected value + expected_value = 2393.0 # WHEN calling the function - raw_metrics = read_metrics(analysis_path, file_name) - - # THEN check if the extracted metrics correspond to the expected ones - assert raw_metrics.items() == expected_output.items() - - -def test_update_metrics_dict(qc_extracted_metrics): - """test adding metrics to a nested dictionary""" - - # GIVEN input parameters - sample_id = "sample_" - metric = ["MEAN_INSERT_SIZE", {"condition": {"norm": "lt", "threshold": 1.0}}] - value = 0.5 - - # WHEN adding a metric to an empty dictionary - metric[0] = "MEAN_INSERT_SIZE_1" - m_dict = update_metrics_dict(sample_id + "1", metric, value, {}) - - # WHEN appending a metric to an already created dictionary - metric[0] = 
"MEAN_INSERT_SIZE_2" - m_dict = update_metrics_dict(sample_id + "1", metric, value, m_dict) + metric_value = get_qc_metric_value(qc_extracted_metrics, sample_id, metric_name) - # WHEN appending a metric from another sample to a dictionary - metric[0] = "MEAN_INSERT_SIZE_1" - m_dict = update_metrics_dict(sample_id + "2", metric, value, m_dict) + # THEN check if the retrieved value corresponds to the expected one + assert metric_value == expected_value - # THEN check if the dictionary is updated correctly - assert m_dict.items() == qc_extracted_metrics["metrics"].items() +def test_get_qc_metric_value_invalid_metric(qc_extracted_metrics): + """test QC metric value extraction for an invalid metric name""" -def test_get_qc_metrics_dict(analysis_path, qc_metrics): - """test QC metric extraction and its structure""" - - # GIVEN a sequencing type - seq_type = "targeted" - - # GIVEN an expected output - expected_output = { - "concatenated_tumor": [ - { - "name": "MEAN_INSERT_SIZE", - "norm": None, - "threshold": None, - "value": 74.182602, - }, - { - "name": "MEDIAN_TARGET_COVERAGE", - "norm": "gt", - "threshold": 500.0, - "value": 461.0, - }, - ] - } + # GIVEN the input parameters + sample_id = "tumor" + metric_name = "NOT_A_METRIC" # WHEN calling the function - metrics_dict = get_qc_metrics_dict(analysis_path, qc_metrics["qc"][seq_type]) - - # THEN check if the extracted metrics and its structure meets the expected one - assert metrics_dict.items() == expected_output.items() - - -def test_get_qc_metrics_json_wgs(analysis_path): - """test JSON object generation for a WGS run""" - - # GIVEN a sequencing type - seq_type = "wgs" - capture_kit = None - - # GIVEN retrieved WGS metrics - output_metrics = {"concatenated_tumor": {"FOLD_80_BASE_PENALTY": 1.238604}} - - # WHEN calling the function - qc_metrics = get_qc_metrics_json(analysis_path, seq_type, capture_kit) - - # THEN check if the obtained metrics are WGS specific - assert qc_metrics.items() == output_metrics.items() - - 
-def test_get_qc_metrics_json_targeted(analysis_path): - """test JSON object generation for a custom bed file""" - - # GIVEN a sequencing type - seq_type = "targeted" - capture_kit = "lymphoma_6.1_hg19_design.bed" - - # THEN check if the obtained metrics are following the panel bed specific requirements - try: - get_qc_metrics_json(analysis_path, seq_type, capture_kit) - except ValidationError as val_err: - assert ( - "2 validation errors for QCValidationModel" in str(val_err) - and "MEDIAN_TARGET_COVERAGE" in str(val_err) - and "FOLD_80_BASE_PENALTY" in str(val_err) - ) - - -def test_get_multiqc_data_source(analysis_path): - """test multiQC source extraction from multiqc_data.json analysis file""" - - # GIVEN input parameters - sample = "concatenated_tumor_XXXXXX_R" - source_name_hs_metrics = "multiqc_picard_HsMetrics" - source_name_dup = "multiqc_picard_dups" - - with open( - os.path.join(analysis_path, "qc", "multiqc_data", "multiqc_data.json"), "r" - ) as f: - raw_data = json.load(f) - - # GIVEN an expected output - source_hs_metrics = "concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric" - source_dup = "concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt" - - # WHEN extracting the source of a specific sample and collection of metrics - out_source_hs_metrics = get_multiqc_data_source( - raw_data, sample, source_name_hs_metrics - ) - out_source_dup = get_multiqc_data_source(raw_data, sample, source_name_dup) - - # THEN check if the extracted source names correspond to the expected ones - assert source_hs_metrics == out_source_hs_metrics - assert source_dup == out_source_dup - - -def test_extract_metrics_for_delivery(analysis_path): - """test output metrics retrieving""" - - # GIVEN a sequencing type - seq_type = "targeted" - - # GIVEN an expected output - n_metrics = 6 # Number of expected metric - - hs_metric = { - "header": None, - "id": "tumor", - "input": "concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric", - "name": "PCT_OFF_BAIT", - "step": 
"multiqc_picard_HsMetrics", - "value": 0.364546, - } - - ins_size_metric = { - "header": None, - "id": "tumor", - "input": "concatenated_tumor_XXXXXX_R.sorted.insertsizemetric", - "name": "MEAN_INSERT_SIZE", - "step": "multiqc_picard_insertSize", - "value": 201.813054, - } - - dups_metric = { - "header": None, - "id": "tumor", - "input": "concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt", - "name": "PERCENT_DUPLICATION", - "step": "multiqc_picard_dups", - "value": 0.391429, - } - - # WHEN calling the function - metrics = extract_metrics_for_delivery(analysis_path, seq_type) - - # THEN check if the metrics are correctly retrieved - assert len(metrics) == n_metrics - assert ( - hs_metric in metrics and ins_size_metric in metrics and dups_metric in metrics - ) + metric_value = get_qc_metric_value(qc_extracted_metrics, sample_id, metric_name) + # THEN check if the retrieved value is None + assert metric_value is None -def test_extract_metrics_for_delivery_filtering_umi(analysis_path): - """test umi discarding when extracting metrics""" - # GIVEN a sequencing type - seq_type = "targeted" +def test_validate_qc_metrics(qc_extracted_metrics): + """test QC metric validation""" # WHEN calling the function - metrics = extract_metrics_for_delivery(analysis_path, seq_type) + validated_metrics_pass = validate_qc_metrics(qc_extracted_metrics) - # THEN check if the umi samples are filtered out - for metric in metrics: - assert "umi" not in metric["input"] + # THEN check if the obtained metrics are correctly parsed and validated + assert validated_metrics_pass == qc_extracted_metrics diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 0d5285ef5..3b029c597 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -45,6 +45,7 @@ generate_h5, get_md5, create_md5, + read_yaml, ) from BALSAMIC.utils.rule import ( @@ -549,6 +550,65 @@ def test_write_json_error(tmp_path): assert write_json(ref_json, output_json) +def test_read_yaml(metrics_yaml_path): + 
"""test data extraction from a saved YAML file""" + + # GIVEN an expected output + n_metrics = 11 # Number of expected metric + + hs_metric = { + "header": None, + "id": "tumor", + "input": "concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric", + "name": "MEDIAN_TARGET_COVERAGE", + "step": "multiqc_picard_HsMetrics", + "value": 2393.0, + "condition": {"norm": "gt", "threshold": 1000.0}, + } + + ins_size_metric = { + "header": None, + "id": "tumor", + "input": "concatenated_tumor_XXXXXX_R.sorted.insertsizemetric", + "name": "MEAN_INSERT_SIZE", + "step": "multiqc_picard_insertSize", + "value": 201.813054, + "condition": None, + } + + dups_metric = { + "header": None, + "id": "tumor", + "input": "concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt", + "name": "PERCENT_DUPLICATION", + "step": "multiqc_picard_dups", + "value": 0.391429, + "condition": None, + } + + # WHEN calling the function + requested_metrics = read_yaml(metrics_yaml_path) + + # THEN check if the data are correctly retrieved from the YAML + assert len(requested_metrics) == n_metrics + assert hs_metric in requested_metrics + assert ins_size_metric in requested_metrics + assert dups_metric in requested_metrics + + +def test_read_yaml_error(): + """test data extraction from an incorrect YAML path""" + + # GIVEN an invalid path + yaml_path = "NOT_A_PATH" + + # THEN assert that the FileNotFoundError is raised + try: + read_yaml(yaml_path) + except FileNotFoundError as file_exc: + assert f"The YAML file {yaml_path} was not found." 
in str(file_exc) + + def test_get_threads(config_files): # GIVEN cluster config file and rule name cluster_config = json.load(open(config_files["cluster_json"], "r")) From a581951b2497ed7bab7a80430eab7dd259f36c66 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Tue, 4 Jan 2022 12:37:35 +0100 Subject: [PATCH 08/58] add umi dups metrics to multiqc --- .../snakemake_rules/quality_control/multiqc.rule | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule index 092cdef3a..e58dc0a5a 100644 --- a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule +++ b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule @@ -58,6 +58,7 @@ else: if config["umiworkflow"]: multiqc_input.extend(expand(umi_qc_dir + "{sample}.umi.collect_hsmetric", sample=config["samples"])) + multiqc_input.extend(expand(umi_qc_dir + "{sample}.umi.metrics", sample=config["samples"])) rule multiqc: @@ -75,13 +76,20 @@ rule multiqc: dir_list = result_dir, qc_dir = qc_dir, case_name = config["analysis"]["case_id"], - exclude_module = "vep" + exclude_module1 = "vep", + exclude_module2 = "fastp" message: "Aggregrate quality metrics results using multiqc for sample {params.case_name}" shell: """ echo -e \"{params.dir_list}\" > {params.qc_dir}/dir_list; -multiqc --force --outdir {params.qc_dir} --exclude {params.exclude_module} --data-format json -l {params.qc_dir}/dir_list; + +multiqc --force --outdir {params.qc_dir} \ +--exclude {params.exclude_module1} \ +--exclude {params.exclude_module2} \ +--data-format json \ +-l {params.qc_dir}/dir_list; + chmod -R 777 {params.qc_dir}; """ From 8d0c896762b71e3b48d0b8ce21568098a8268f07 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Tue, 4 Jan 2022 12:38:19 +0100 Subject: [PATCH 09/58] remove wrong spacing for comments --- BALSAMIC/snakemake_rules/umi/qc_umi.rule | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git 
a/BALSAMIC/snakemake_rules/umi/qc_umi.rule b/BALSAMIC/snakemake_rules/umi/qc_umi.rule index 2eb891e9f..e71eec2a4 100644 --- a/BALSAMIC/snakemake_rules/umi/qc_umi.rule +++ b/BALSAMIC/snakemake_rules/umi/qc_umi.rule @@ -2,8 +2,6 @@ # coding: utf-8 ## UmiAwareMarkDuplicatesWithMateCigar - umimetrics - - rule picard_umiaware: input: bam = umi_dir + "{sample}_consensusfiltered_umi.bam" @@ -29,9 +27,8 @@ O={output.bam} \ M={output.duplicates} \ UMI_METRICS={output.umimetrics}; """ -## CollectHSmetrics - median target coverage-required - +## CollectHSmetrics - median target coverage-required rule picard_collecthsmetrics_umi: input: bam = umi_dir + "{sample}_consensusfiltered_umi.bam", @@ -68,9 +65,8 @@ COVERAGE_CAP=50000 \ BAIT_SET_NAME={params.baitsetname} \ METRIC_ACCUMULATION_LEVEL=ALL_READS; """ -## SUM(Reads in each family)/ the number of families after correction, collapsing on supporting reads. - +## SUM(Reads in each family)/ the number of families after correction, collapsing on supporting reads. 
rule samtools_view_calculatemeanfamilydepth_umi: input: bam = umi_dir + "{sample}_consensusfiltered_umi.bam" From c5c9726bf0dbd8a8424c0a547a552f6ec3e9ee88 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Tue, 4 Jan 2022 13:52:51 +0100 Subject: [PATCH 10/58] add changelog --- CHANGELOG.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 49b362284..d00aa0b82 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,7 @@ Added: * Snakemake workflow to create canfam3 reference #843 * Call umi variants using TNscope in bed defined regions #821 +* UMI duplication metrics to report in multiqc_picard_dups.json #844 Changed: ^^^^^^^^ @@ -16,6 +17,7 @@ Removed ^^^^^^^ * ``--qc-metrics/--no-qc-metrics`` flag from the ``balsamic report deliver`` command #833 +* `fastp` module is broken in multiqc, excluded running it in multiqc [8.2.5] ------- From 016f579dc4e4a3076e2b22e2adce5ec6b627b040 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Tue, 11 Jan 2022 09:43:11 +0100 Subject: [PATCH 11/58] remove fastp exclusion --- BALSAMIC/snakemake_rules/quality_control/multiqc.rule | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule index e58dc0a5a..d31976f64 100644 --- a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule +++ b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule @@ -76,8 +76,7 @@ rule multiqc: dir_list = result_dir, qc_dir = qc_dir, case_name = config["analysis"]["case_id"], - exclude_module1 = "vep", - exclude_module2 = "fastp" + exclude_module = "vep" message: "Aggregrate quality metrics results using multiqc for sample {params.case_name}" shell: @@ -85,8 +84,7 @@ rule multiqc: echo -e \"{params.dir_list}\" > {params.qc_dir}/dir_list; multiqc --force --outdir {params.qc_dir} \ ---exclude {params.exclude_module1} \ ---exclude {params.exclude_module2} \ +--exclude {params.exclude_module} \ --data-format 
json \ -l {params.qc_dir}/dir_list; From 12128b7b2519568ecace2a307e3c4c601e8dd68a Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Tue, 11 Jan 2022 09:44:01 +0100 Subject: [PATCH 12/58] update changelog --- CHANGELOG.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d00aa0b82..5fc3ed7ae 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -17,7 +17,6 @@ Removed ^^^^^^^ * ``--qc-metrics/--no-qc-metrics`` flag from the ``balsamic report deliver`` command #833 -* `fastp` module is broken in multiqc, excluded running it in multiqc [8.2.5] ------- From 4e29a32232231328b7340f50d690398d9ca6345f Mon Sep 17 00:00:00 2001 From: ivadym Date: Mon, 24 Jan 2022 15:12:58 +0100 Subject: [PATCH 13/58] fix: capture kit empty argument for WGS runs (#850) --- BALSAMIC/assets/scripts/collect_qc_metrics.py | 18 +++++++-- CHANGELOG.rst | 4 ++ tests/scripts/test_collect_qc_metrics.py | 37 ++++++++++++++++++- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py index 493cb6587..917f36cbf 100644 --- a/BALSAMIC/assets/scripts/collect_qc_metrics.py +++ b/BALSAMIC/assets/scripts/collect_qc_metrics.py @@ -23,7 +23,7 @@ def collect_qc_metrics( output_path: Path, multiqc_data_path: Path, sequencing_type: str, - capture_kit: Union[str, None], + capture_kit: str, ): """Extracts the requested metrics from a JSON multiqc file and saves them to a YAML file @@ -31,18 +31,30 @@ def collect_qc_metrics( output_path: Path; destination path for the extracted YAML formatted metrics multiqc_data_path: Path; multiqc JSON path from which the metrics will be extracted sequencing_type: str; analysis sequencing type - capture_kit: str; capture kit used for targeted analysis (None for WGS) + capture_kit: str; capture kit used for targeted analysis ("None" for WGS) """ with open(output_path, "w") as fn: yaml.dump( - get_multiqc_metrics(multiqc_data_path, sequencing_type, 
capture_kit), + get_multiqc_metrics( + multiqc_data_path, + sequencing_type, + capture_kit_resolve_type(capture_kit), + ), fn, sort_keys=False, default_flow_style=False, ) +def capture_kit_resolve_type(capture_kit: str): + """Resolves the capture_kit type (NoneType or String)""" + if capture_kit == "None": + return None + else: + return capture_kit + + def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str: """Extracts the metrics data source associated with a specific sample and tool diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5fc3ed7ae..a83defdbc 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,6 +13,10 @@ Changed: * Merge QC metric extraction workflows #833 +Fixed: +^^^^^^ +* ``collect_qc_metrics.py`` failing for WGS cases with empty ``capture_kit`` argument #850 + Removed ^^^^^^^ diff --git a/tests/scripts/test_collect_qc_metrics.py b/tests/scripts/test_collect_qc_metrics.py index 8e262a8ee..5808b08bb 100644 --- a/tests/scripts/test_collect_qc_metrics.py +++ b/tests/scripts/test_collect_qc_metrics.py @@ -7,9 +7,21 @@ collect_qc_metrics, get_qc_available_panel_beds, get_requested_metrics, + capture_kit_resolve_type, ) +def test_capture_kit_resolve_type(): + """test capture_kit type""" + + # GIVEN an expected output + capture_kit = "panel.bed" + + # THEN check if the extracted capture kit is correctly formatted + assert capture_kit_resolve_type("None") is None + assert capture_kit_resolve_type(capture_kit) == capture_kit + + def test_get_qc_available_panel_beds(qc_requested_metrics): """test extraction of capture kits available for analysis""" @@ -130,8 +142,8 @@ def test_get_multiqc_metrics_filtering_umi(multiqc_data_path): assert "umi" not in metric["input"] -def test_collect_qc_metrics(tmp_path, multiqc_data_path, cli_runner): - """tests qc metrics yaml file generation""" +def test_collect_qc_metrics_targeted(tmp_path, multiqc_data_path, cli_runner): + """tests qc metrics yaml file generation for targeted analysis""" # GIVEN 
the output and multiqc metrics paths output_path = tmp_path / "sample_tumor_only_metrics_deliverables.yaml" @@ -149,3 +161,24 @@ def test_collect_qc_metrics(tmp_path, multiqc_data_path, cli_runner): # THEN check if the YAML is correctly created and there are no errors assert result.exit_code == 0 assert Path(output_path).exists() + + +def test_collect_qc_metrics_wgs(tmp_path, multiqc_data_path, cli_runner): + """tests qc metrics yaml file generation for wgs analysis""" + + # GIVEN the output and multiqc metrics paths + output_path = tmp_path / "sample_tumor_only_wgs_metrics_deliverables.yaml" + + # GIVEN a sequencing type and a capture kit + seq_type = "wgs" + capture_kit = "None" + + # WHEN invoking the python script + result = cli_runner.invoke( + collect_qc_metrics, + [str(output_path), multiqc_data_path, seq_type, capture_kit], + ) + + # THEN check if the YAML is correctly created and there are no errors + assert result.exit_code == 0 + assert Path(output_path).exists() From b4cda375057770513ebca67e6858c4df99cc222c Mon Sep 17 00:00:00 2001 From: ivadym Date: Mon, 7 Feb 2022 15:34:14 +0100 Subject: [PATCH 14/58] fix: QC panel bed version generalisation (#855) * version independent panel bed names * default conditions for panel qc validation --- BALSAMIC/assets/scripts/collect_qc_metrics.py | 17 +++++++----- BALSAMIC/constants/quality_check_reporting.py | 16 +++++------ CHANGELOG.rst | 3 +++ tests/conftest.py | 5 ++-- tests/scripts/test_collect_qc_metrics.py | 27 +++++++++++-------- 5 files changed, 40 insertions(+), 28 deletions(-) diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py index 917f36cbf..46377bd0d 100644 --- a/BALSAMIC/assets/scripts/collect_qc_metrics.py +++ b/BALSAMIC/assets/scripts/collect_qc_metrics.py @@ -96,15 +96,15 @@ def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str: ) -def get_qc_available_panel_beds(metrics: List[str]) -> List[str]: - """Returns available 
panel bed file names from a list of requested metrics""" - available_beds = [] +def get_qc_supported_capture_kit(capture_kit, metrics: List[str]) -> str: + """Returns a BALSAMIC supported panel bed name associated to a specific capture_kit parameter""" + available_panel_beds = [] for k in metrics: if k != "default": - available_beds.append(k) + available_panel_beds.append(k) - return available_beds + return next((i for i in available_panel_beds if i in capture_kit), None) def get_requested_metrics( @@ -115,8 +115,11 @@ def get_requested_metrics( requested_metrics = metrics[sequencing_type] if capture_kit: requested_metrics = metrics[sequencing_type]["default"] - if capture_kit in get_qc_available_panel_beds(metrics[sequencing_type]): - requested_metrics.update(metrics[sequencing_type][capture_kit]) + supported_capture_kit = get_qc_supported_capture_kit( + capture_kit, metrics[sequencing_type] + ) + if supported_capture_kit: + requested_metrics.update(metrics[sequencing_type][supported_capture_kit]) return requested_metrics diff --git a/BALSAMIC/constants/quality_check_reporting.py b/BALSAMIC/constants/quality_check_reporting.py index 272cb01e5..dba06df8e 100644 --- a/BALSAMIC/constants/quality_check_reporting.py +++ b/BALSAMIC/constants/quality_check_reporting.py @@ -55,37 +55,37 @@ "default": { "MEAN_INSERT_SIZE": {"condition": None}, "PERCENT_DUPLICATION": {"condition": None}, - "MEDIAN_TARGET_COVERAGE": {"condition": None}, + "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 500}}, "PCT_TARGET_BASES_50X": {"condition": None}, "PCT_TARGET_BASES_100X": {"condition": None}, "PCT_TARGET_BASES_250X": {"condition": None}, "PCT_TARGET_BASES_500X": {"condition": None}, "PCT_TARGET_BASES_1000X": {"condition": None}, "MEAN_TARGET_COVERAGE": {"condition": None}, - "FOLD_80_BASE_PENALTY": {"condition": None}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, "PCT_OFF_BAIT": {"condition": None}, }, - "gicfdna_3.1_hg19_design.bed": { 
+ "gicfdna": { "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 1000}}, "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.6}}, }, - "gmcksolid_4.1_hg19_design.bed": { + "gmcksolid": { "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 500}}, "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, }, - "gmsmyeloid_5.2_hg19_design.bed": { + "gmsmyeloid": { "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 1000}}, "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.6}}, }, - "lymphoma_6.1_hg19_design.bed": { + "lymphoma": { "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 1000}}, "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.6}}, }, - "gmslymphoid_7.1_hg19_design.bed": { + "gmslymphoid": { "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 1000}}, "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.6}}, }, - "twistexomerefseq_9.1_hg19_design.bed": { + "twistexome": { "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 100}}, "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, }, diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a83defdbc..5fa79e25f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,7 @@ Added: * Snakemake workflow to create canfam3 reference #843 * Call umi variants using TNscope in bed defined regions #821 * UMI duplication metrics to report in multiqc_picard_dups.json #844 +* QC default validation conditions (for not defined capture kits) #855 Changed: ^^^^^^^^ @@ -15,7 +16,9 @@ Changed: Fixed: ^^^^^^ + * ``collect_qc_metrics.py`` failing for WGS cases with empty ``capture_kit`` argument #850 +* QC metric validation for different panel bed version #855 Removed ^^^^^^^ diff --git a/tests/conftest.py b/tests/conftest.py index 0657f3291..cff2d544a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -518,11 +518,12 @@ def 
qc_requested_metrics(): "METRIC_1": {"condition": None}, "METRIC_2": {"condition": {"norm": "gt", "threshold": 2}}, }, - "panel_1.bed": { + "panel_1": { "METRIC_3": {"condition": {"norm": "gt", "threshold": 3}}, }, - "panel_2.bed": { + "panel_2": { "METRIC_1": {"condition": {"norm": "gt", "threshold": 1}}, + "METRIC_2": {"condition": {"norm": "gt", "threshold": 22}}, "METRIC_4": {"condition": {"norm": "gt", "threshold": 4}}, }, }, diff --git a/tests/scripts/test_collect_qc_metrics.py b/tests/scripts/test_collect_qc_metrics.py index 5808b08bb..9b2cb7e16 100644 --- a/tests/scripts/test_collect_qc_metrics.py +++ b/tests/scripts/test_collect_qc_metrics.py @@ -5,7 +5,7 @@ get_multiqc_data_source, get_multiqc_metrics, collect_qc_metrics, - get_qc_available_panel_beds, + get_qc_supported_capture_kit, get_requested_metrics, capture_kit_resolve_type, ) @@ -22,17 +22,22 @@ def test_capture_kit_resolve_type(): assert capture_kit_resolve_type(capture_kit) == capture_kit -def test_get_qc_available_panel_beds(qc_requested_metrics): - """test extraction of capture kits available for analysis""" +def test_get_qc_supported_capture_kit(qc_requested_metrics): + """test extraction of the capture kit name available for analysis""" + + # GIVEN a capture kit + capture_kit = "panel_1_v1.0_hg19_design.bed" # GIVEN an expected output - expected_output = ["panel_1.bed", "panel_2.bed"] + expected_output = "panel_1" # WHEN calling the function - available_panel_beds = get_qc_available_panel_beds(qc_requested_metrics["targeted"]) + supported_capture_kit = get_qc_supported_capture_kit( + capture_kit, qc_requested_metrics["targeted"] + ) - # THEN check if the extracted bed file names correspond to the expected ones - assert available_panel_beds == expected_output + # THEN check if the extracted bed file name corresponds to the expected one + assert supported_capture_kit == expected_output def test_get_requested_metrics_targeted(qc_requested_metrics): @@ -40,13 +45,13 @@ def 
test_get_requested_metrics_targeted(qc_requested_metrics): # GIVEN a sequencing type and a capture kit seq_type = "targeted" - capture_kit = "panel_1.bed" + capture_kit = "panel_2_v1.0_hg19_design.bed" # GIVEN the expected output expected_output = { - "METRIC_1": {"condition": None}, - "METRIC_2": {"condition": {"norm": "gt", "threshold": 2}}, - "METRIC_3": {"condition": {"norm": "gt", "threshold": 3}}, + "METRIC_1": {"condition": {"norm": "gt", "threshold": 1}}, + "METRIC_2": {"condition": {"norm": "gt", "threshold": 22}}, + "METRIC_4": {"condition": {"norm": "gt", "threshold": 4}}, } # WHEN calling the function From 15964aba8b6d56a3aef005cbd753eda7cacd1ce2 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Thu, 24 Feb 2022 15:51:30 +0100 Subject: [PATCH 15/58] add vcf2cytosure container --- BALSAMIC/containers/vcf2cytosure/Dockerfile | 17 +++++++++++++++++ BALSAMIC/containers/vcf2cytosure/__init__.py | 0 .../containers/vcf2cytosure/vcf2cytosure.yaml | 1 + 3 files changed, 18 insertions(+) create mode 100644 BALSAMIC/containers/vcf2cytosure/Dockerfile create mode 100644 BALSAMIC/containers/vcf2cytosure/__init__.py create mode 100644 BALSAMIC/containers/vcf2cytosure/vcf2cytosure.yaml diff --git a/BALSAMIC/containers/vcf2cytosure/Dockerfile b/BALSAMIC/containers/vcf2cytosure/Dockerfile new file mode 100644 index 000000000..0f7059707 --- /dev/null +++ b/BALSAMIC/containers/vcf2cytosure/Dockerfile @@ -0,0 +1,17 @@ +FROM continuumio/miniconda3:4.9.2-alpine + +LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" +LABEL about.documentation="https://balsamic.readthedocs.io/" +LABEL about.license="MIT License (MIT)" +LABEL about.maintainer="Ashwini Jeggari ashwini dot jeggari at scilifelab dot se" +LABEL about.description="Bioinformatic analysis pipeline for somatic mutations in cancer" + +ENV PATH="/opt/vcf2cytosure/bin:${PATH}" + +RUN apk add --no-cache bash gcc git python3 + +RUN cd /opt \ + && git clone 
https://github.com/NBISweden/vcf2cytosure.git \ + && cd vcf2cytosure/ \ + && pip install -e . \ No newline at end of file diff --git a/BALSAMIC/containers/vcf2cytosure/__init__.py b/BALSAMIC/containers/vcf2cytosure/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/BALSAMIC/containers/vcf2cytosure/vcf2cytosure.yaml b/BALSAMIC/containers/vcf2cytosure/vcf2cytosure.yaml new file mode 100644 index 000000000..892ae60cb --- /dev/null +++ b/BALSAMIC/containers/vcf2cytosure/vcf2cytosure.yaml @@ -0,0 +1 @@ +- vcf2cytosure=0.7.1 From 75b6defd6b1125a591f90e553a0435c9bc62ee34 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Thu, 24 Feb 2022 15:51:54 +0100 Subject: [PATCH 16/58] add vcf2cytosure container tests --- container_tests/vcf2cytosure/vcf2cytosure.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 container_tests/vcf2cytosure/vcf2cytosure.sh diff --git a/container_tests/vcf2cytosure/vcf2cytosure.sh b/container_tests/vcf2cytosure/vcf2cytosure.sh new file mode 100644 index 000000000..40157294c --- /dev/null +++ b/container_tests/vcf2cytosure/vcf2cytosure.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Test if commands exist + +valid_commands=( "pip" "vcf2cytosure" ) + +for valid_command in "${valid_commands[@]}" +do + if ! 
command -v "${valid_command}" &> /dev/null + then + echo "${valid_command} could not be found" + exit 1 + else + echo "${valid_command} command is found and valid" + fi +done \ No newline at end of file From 7b9dfbb819e5a2cfdf41e854d56eec8880693479 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Thu, 24 Feb 2022 15:53:09 +0100 Subject: [PATCH 17/58] update changelog --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c4702f7f2..435c8bd08 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,6 +8,7 @@ Added: * Call umi variants using TNscope in bed defined regions #821 * UMI duplication metrics to report in multiqc_picard_dups.json #844 * QC default validation conditions (for not defined capture kits) #855 +* Docker container for vcf2cytosure #858 Changed: ^^^^^^^^ From a3daf44da167d8ea15adc26367085146f457e4ea Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Thu, 24 Feb 2022 15:55:57 +0100 Subject: [PATCH 18/58] add newlines --- BALSAMIC/containers/vcf2cytosure/Dockerfile | 2 +- container_tests/vcf2cytosure/vcf2cytosure.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/containers/vcf2cytosure/Dockerfile b/BALSAMIC/containers/vcf2cytosure/Dockerfile index 0f7059707..dcf369781 100644 --- a/BALSAMIC/containers/vcf2cytosure/Dockerfile +++ b/BALSAMIC/containers/vcf2cytosure/Dockerfile @@ -14,4 +14,4 @@ RUN apk add --no-cache bash gcc git python3 RUN cd /opt \ && git clone https://github.com/NBISweden/vcf2cytosure.git \ && cd vcf2cytosure/ \ - && pip install -e . \ No newline at end of file + && pip install -e . 
diff --git a/container_tests/vcf2cytosure/vcf2cytosure.sh b/container_tests/vcf2cytosure/vcf2cytosure.sh index 40157294c..e953274c4 100644 --- a/container_tests/vcf2cytosure/vcf2cytosure.sh +++ b/container_tests/vcf2cytosure/vcf2cytosure.sh @@ -12,4 +12,4 @@ do else echo "${valid_command} command is found and valid" fi -done \ No newline at end of file +done From 03ba91debd3352a676c3d220bf7421ff359666ea Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 28 Feb 2022 17:01:35 +0100 Subject: [PATCH 19/58] add vcf2cytosure to git actions --- .github/workflows/docker_build_push.yml | 2 +- .github/workflows/docker_build_push_release.yml | 2 +- .github/workflows/docker_build_test_pull_request.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docker_build_push.yml b/.github/workflows/docker_build_push.yml index 860760f08..379b72e2c 100644 --- a/.github/workflows/docker_build_push.yml +++ b/.github/workflows/docker_build_push.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: true matrix: - container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py36, balsamic, delly] + container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py36, balsamic, delly, vcf2cytosure] steps: - name: Set up QEMU uses: docker/setup-qemu-action@v1 diff --git a/.github/workflows/docker_build_push_release.yml b/.github/workflows/docker_build_push_release.yml index f4e2c566c..d44c84e06 100644 --- a/.github/workflows/docker_build_push_release.yml +++ b/.github/workflows/docker_build_push_release.yml @@ -11,7 +11,7 @@ jobs: strategy: fail-fast: true matrix: - container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py36, balsamic, delly] + container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py36, balsamic, delly, vcf2cytosure] steps: - name: Set up QEMU uses: docker/setup-qemu-action@v1 diff --git 
a/.github/workflows/docker_build_test_pull_request.yml b/.github/workflows/docker_build_test_pull_request.yml index 1b7c86b80..f4b5f4379 100644 --- a/.github/workflows/docker_build_test_pull_request.yml +++ b/.github/workflows/docker_build_test_pull_request.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py36, balsamic, delly] + container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py36, balsamic, delly, vcf2cytosure] steps: - name: Git checkout id: git_checkout From 42dd1aba7f4dda2d7beaad3cc7aa375b5579ba45 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 28 Feb 2022 17:01:50 +0100 Subject: [PATCH 20/58] change alpine version --- BALSAMIC/containers/vcf2cytosure/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/containers/vcf2cytosure/Dockerfile b/BALSAMIC/containers/vcf2cytosure/Dockerfile index dcf369781..3e3e04cbd 100644 --- a/BALSAMIC/containers/vcf2cytosure/Dockerfile +++ b/BALSAMIC/containers/vcf2cytosure/Dockerfile @@ -1,4 +1,4 @@ -FROM continuumio/miniconda3:4.9.2-alpine +FROM continuumio/miniconda3:4.10.3-alpine LABEL base_image="continuumio/miniconda3:4.9.2-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" From ded392a0f1963c3720d653670b241882daa2215f Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 28 Feb 2022 17:05:27 +0100 Subject: [PATCH 21/58] remove previous pip install vcf2cytosure --- BALSAMIC/containers/annotate/annotate.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/BALSAMIC/containers/annotate/annotate.yaml b/BALSAMIC/containers/annotate/annotate.yaml index 3ed64c994..5ddff7e04 100644 --- a/BALSAMIC/containers/annotate/annotate.yaml +++ b/BALSAMIC/containers/annotate/annotate.yaml @@ -9,6 +9,4 @@ dependencies: - bioconda::bcftools>=1.10 - bioconda::vcfanno=0.3.3 - anaconda::gxx_linux-64=7.3.0 - - anaconda::pip=20.2.4 
- - pip: - - "--editable git+https://github.com/NBISweden/vcf2cytosure@0.5.1#egg=vcf2cytosure" + - anaconda::pip=20.2.4i From 1d033033ccf74cb8e2d019124198f37fc15183de Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 28 Feb 2022 17:09:16 +0100 Subject: [PATCH 22/58] fix label key in dockerfile --- BALSAMIC/containers/vcf2cytosure/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/containers/vcf2cytosure/Dockerfile b/BALSAMIC/containers/vcf2cytosure/Dockerfile index 3e3e04cbd..2ccd00d2a 100644 --- a/BALSAMIC/containers/vcf2cytosure/Dockerfile +++ b/BALSAMIC/containers/vcf2cytosure/Dockerfile @@ -1,6 +1,6 @@ FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL base_image="continuumio/miniconda3:4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" @@ -14,4 +14,4 @@ RUN apk add --no-cache bash gcc git python3 RUN cd /opt \ && git clone https://github.com/NBISweden/vcf2cytosure.git \ && cd vcf2cytosure/ \ - && pip install -e . + && pip install --no-cache-dir . 
From f7decab4ec5b0b9d974644a65e5a811489223c4e Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 28 Feb 2022 18:44:06 +0100 Subject: [PATCH 23/58] test balsamic container image to updated version --- BALSAMIC/containers/annotate/annotate.yaml | 2 +- BALSAMIC/containers/balsamic/Dockerfile | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/BALSAMIC/containers/annotate/annotate.yaml b/BALSAMIC/containers/annotate/annotate.yaml index 5ddff7e04..c83633ae5 100644 --- a/BALSAMIC/containers/annotate/annotate.yaml +++ b/BALSAMIC/containers/annotate/annotate.yaml @@ -9,4 +9,4 @@ dependencies: - bioconda::bcftools>=1.10 - bioconda::vcfanno=0.3.3 - anaconda::gxx_linux-64=7.3.0 - - anaconda::pip=20.2.4i + - anaconda::pip=20.2.4 diff --git a/BALSAMIC/containers/balsamic/Dockerfile b/BALSAMIC/containers/balsamic/Dockerfile index da15429fe..05c55c077 100644 --- a/BALSAMIC/containers/balsamic/Dockerfile +++ b/BALSAMIC/containers/balsamic/Dockerfile @@ -1,6 +1,6 @@ -FROM continuumio/miniconda3:4.9.2-alpine +FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL base_image="continuumio/miniconda3:4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" From e4e242e7ce75a77155e121ea9f54af164bfa4a0f Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Tue, 1 Mar 2022 12:55:01 +0100 Subject: [PATCH 24/58] remove vcf2cytosure from annotate tests --- container_tests/annotate/annotate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/container_tests/annotate/annotate.sh b/container_tests/annotate/annotate.sh index f5c44d4dc..4183b6241 100644 --- a/container_tests/annotate/annotate.sh +++ b/container_tests/annotate/annotate.sh @@ -1,7 +1,7 @@ #!/bin/bash # Test if commands exist -valid_commands=( "bcftools" "vcfanno" "vcf2cytosure" "genmod" "vep" "vep_install" ) +valid_commands=( 
"bcftools" "vcfanno" "genmod" "vep" "vep_install" ) for valid_command in "${valid_commands[@]}" do From 1eb5f60b9d0119f6f06be1f5004f292240d5a802 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Tue, 1 Mar 2022 12:56:41 +0100 Subject: [PATCH 25/58] test invalide label key --- BALSAMIC/containers/vcf2cytosure/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/containers/vcf2cytosure/Dockerfile b/BALSAMIC/containers/vcf2cytosure/Dockerfile index 2ccd00d2a..82d3bb0eb 100644 --- a/BALSAMIC/containers/vcf2cytosure/Dockerfile +++ b/BALSAMIC/containers/vcf2cytosure/Dockerfile @@ -1,6 +1,6 @@ FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.10.3-alpine" +LABEL base_image="4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" From 2fed80c24a5e7b21e29b730a511a4ac7157e0bea Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Tue, 1 Mar 2022 12:57:03 +0100 Subject: [PATCH 26/58] test invalide label key --- BALSAMIC/containers/vcf2cytosure/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/containers/vcf2cytosure/Dockerfile b/BALSAMIC/containers/vcf2cytosure/Dockerfile index 82d3bb0eb..29f088887 100644 --- a/BALSAMIC/containers/vcf2cytosure/Dockerfile +++ b/BALSAMIC/containers/vcf2cytosure/Dockerfile @@ -1,6 +1,6 @@ FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="4.10.3-alpine" +LABEL base.image="continuumio/miniconda3:4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" From abea0148fc3146201651d9d88aa3ede93aab314d Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Tue, 1 Mar 2022 14:45:22 +0100 Subject: [PATCH 27/58] fix invalid label key --- BALSAMIC/containers/balsamic/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/BALSAMIC/containers/balsamic/Dockerfile b/BALSAMIC/containers/balsamic/Dockerfile index 05c55c077..efbf0e891 100644 --- a/BALSAMIC/containers/balsamic/Dockerfile +++ b/BALSAMIC/containers/balsamic/Dockerfile @@ -1,6 +1,6 @@ FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.10.3-alpine" +LABEL base.image="continuumio/miniconda3:4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" From 18b61ea7d664551140980c924ba512d759f4b63d Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Tue, 1 Mar 2022 18:34:28 +0100 Subject: [PATCH 28/58] remove --no-cache-dir from pip dockerfile --- BALSAMIC/config/balsamic_env.yaml | 4 +++- BALSAMIC/containers/vcf2cytosure/Dockerfile | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/BALSAMIC/config/balsamic_env.yaml b/BALSAMIC/config/balsamic_env.yaml index 3c49b6ffb..59f7d6966 100644 --- a/BALSAMIC/config/balsamic_env.yaml +++ b/BALSAMIC/config/balsamic_env.yaml @@ -30,4 +30,6 @@ vcf_merge: delly: - delly ascatNgs: - -ascat + - ascat +vcf2cytosure: + - vcf2cytosure diff --git a/BALSAMIC/containers/vcf2cytosure/Dockerfile b/BALSAMIC/containers/vcf2cytosure/Dockerfile index 29f088887..25c1cd866 100644 --- a/BALSAMIC/containers/vcf2cytosure/Dockerfile +++ b/BALSAMIC/containers/vcf2cytosure/Dockerfile @@ -13,5 +13,5 @@ RUN apk add --no-cache bash gcc git python3 RUN cd /opt \ && git clone https://github.com/NBISweden/vcf2cytosure.git \ - && cd vcf2cytosure/ \ - && pip install --no-cache-dir . + && cd /opt/vcf2cytosure \ + && pip install -e . 
From 70fbdbe718edcd42e165442a521444fcb41d5c55 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Wed, 2 Mar 2022 15:34:57 +0100 Subject: [PATCH 29/58] update changelog --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 435c8bd08..430f90692 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,7 @@ Changed: ^^^^^^^^ * Merge QC metric extraction workflows #833 +* Changed the base-image for balsamic container to 4.10.3-alpine #869 Fixed: ^^^^^^ From 5c3d5b514b9fb90249cd6a86ce6b508256539aa4 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Wed, 2 Mar 2022 15:54:00 +0100 Subject: [PATCH 30/58] add vcf2cytosure to commons --- BALSAMIC/constants/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/BALSAMIC/constants/common.py b/BALSAMIC/constants/common.py index bd0494ff0..e0c85f052 100644 --- a/BALSAMIC/constants/common.py +++ b/BALSAMIC/constants/common.py @@ -78,6 +78,7 @@ "delly": "delly", "ascatNgs": "ascatNgs", "sentieon": "sentieon", + "vcf2cytosure": "vcf2cytosure" } VALID_OPS = { From 7933a9154ade6b0904c35832350f6f6d46063fce Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Wed, 2 Mar 2022 15:57:23 +0100 Subject: [PATCH 31/58] fix black linter --- BALSAMIC/constants/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/constants/common.py b/BALSAMIC/constants/common.py index e0c85f052..ee0d6f71a 100644 --- a/BALSAMIC/constants/common.py +++ b/BALSAMIC/constants/common.py @@ -78,7 +78,7 @@ "delly": "delly", "ascatNgs": "ascatNgs", "sentieon": "sentieon", - "vcf2cytosure": "vcf2cytosure" + "vcf2cytosure": "vcf2cytosure", } VALID_OPS = { From 8c2b2af43a9ea9955774a4a3cf0fa69a11669c9e Mon Sep 17 00:00:00 2001 From: Khurram Maqbool Date: Thu, 3 Mar 2022 13:45:05 +0100 Subject: [PATCH 32/58] feat: add SVDB (#872) * add SVDB * add SVDB params * add SVDB to container * fix lint error * fix lint error * update changelog --- BALSAMIC/config/balsamic_env.yaml | 1 + 
BALSAMIC/constants/common.py | 1 + BALSAMIC/constants/workflow_params.py | 7 +++++++ BALSAMIC/containers/varcall_py36/varcall_py36.yaml | 3 ++- CHANGELOG.rst | 2 +- container_tests/varcall_py36/varcall_py36.sh | 2 +- docs/resources.rst | 3 ++- tests/test_data/BALSAMIC_env.yaml | 1 + 8 files changed, 16 insertions(+), 4 deletions(-) diff --git a/BALSAMIC/config/balsamic_env.yaml b/BALSAMIC/config/balsamic_env.yaml index 3c49b6ffb..389eb86b4 100644 --- a/BALSAMIC/config/balsamic_env.yaml +++ b/BALSAMIC/config/balsamic_env.yaml @@ -19,6 +19,7 @@ varcall_py36: - gatk - vardict - libiconv + - svdb varcall_py27: - strelka - manta diff --git a/BALSAMIC/constants/common.py b/BALSAMIC/constants/common.py index bd0494ff0..278f8b6ee 100644 --- a/BALSAMIC/constants/common.py +++ b/BALSAMIC/constants/common.py @@ -72,6 +72,7 @@ "tabix": "varcall_py36", "gatk": "varcall_py36", "vardict": "varcall_py36", + "svdb": "varcall_py36", "strelka": "varcall_py27", "manta": "varcall_py27", "cnvkit": "varcall_cnvkit", diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index 69b760ecf..b02987b1d 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -85,6 +85,13 @@ "sequencing_type": ["wgs"], "workflow_solution": ["BALSAMIC"], }, + "svdb": { + "mutation": "somatic", + "type": "SV", + "analysis_type": ["paired", "single"], + "sequencing_type": ["wgs"], + "workflow_solution": ["BALSAMIC"], + }, } WORKFLOW_PARAMS = { diff --git a/BALSAMIC/containers/varcall_py36/varcall_py36.yaml b/BALSAMIC/containers/varcall_py36/varcall_py36.yaml index 55528ce44..9cc507742 100644 --- a/BALSAMIC/containers/varcall_py36/varcall_py36.yaml +++ b/BALSAMIC/containers/varcall_py36/varcall_py36.yaml @@ -2,12 +2,13 @@ channels: - defaults dependencies: - - anaconda::python=3.6 + - anaconda::python=3.8 - bioconda::bcftools=1.11 - bioconda::tabix=0.2.6 - bioconda::samtools=1.11 - bioconda::gatk=3.8 - bioconda::vardict=2019.06.04=pl526_0 
- bioconda::vardict-java=1.7 + - bioconda::svdb=2.5.0 - conda-forge::libiconv - conda-forge::r-base=3.6.3 diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c4702f7f2..a092443e5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,7 +8,7 @@ Added: * Call umi variants using TNscope in bed defined regions #821 * UMI duplication metrics to report in multiqc_picard_dups.json #844 * QC default validation conditions (for not defined capture kits) #855 - +* SVDB to the varcall_py36 container Changed: ^^^^^^^^ diff --git a/container_tests/varcall_py36/varcall_py36.sh b/container_tests/varcall_py36/varcall_py36.sh index 50d551eac..3011d1fa2 100644 --- a/container_tests/varcall_py36/varcall_py36.sh +++ b/container_tests/varcall_py36/varcall_py36.sh @@ -1,7 +1,7 @@ #!/bin/bash # Test if commands exist -valid_commands=( "bcftools" "samtools" "tabix" "vardict" "vardict-java" ) +valid_commands=( "bcftools" "samtools" "tabix" "vardict" "vardict-java" "svdb") for valid_command in "${valid_commands[@]}" do diff --git a/docs/resources.rst b/docs/resources.rst index b9dbe305e..b8839a9f9 100644 --- a/docs/resources.rst +++ b/docs/resources.rst @@ -143,7 +143,7 @@ Methods and tools #. **Delly2**\ : An integrated structural variant prediction method that can discover, genotype and visualize deletions, tandem duplications, inversions and translocations https://github.com/dellytools/delly #. **PLINK**\ : PLINK: Whole genome data analysis toolset https://www.cog-genomics.org/plink2 #. **freebayes**\ : a haplotype-based variant detector. https://github.com/ekg/freebayes -#. **ASCAT**\ : Allele-Specific Copy Number Analysis of Tumors, tumor purity and ploidy https://github.com/Crick-CancerGenomics/ascat +#. **AscatNGS**\ : Allele-Specific Copy Number Analysis of Tumors, tumor purity and ploidy https://github.com/cancerit/ascatNgs #. 
**MutationalPatterns**\ : R package for extracting and visualizing mutational patterns in base substitution catalogues https://github.com/UMCUGenetics/MutationalPatterns #. **desconstructSigs**\ : identification of mutational signatures within a single tumor sample https://github.com/raerose01/deconstructSigs #. **treeOmics**\ : Decrypting somatic mutation patterns to reveal the evolution of cancer @@ -190,3 +190,4 @@ Methods and tools #. **msisensor**\ : microsatellite instability detection using paired tumor-normal https://github.com/ding-lab/msisensor #. **MOSAIC**\ : MicrOSAtellite Instability Classifier https://github.com/ronaldhause/mosaic #. **MANTIS**\ : Microsatellite Analysis for Normal-Tumor InStability https://github.com/OSU-SRLab/MANTIS +#. **SVDB**\ : A toolkit for constructing and querying structural variant databases https://github.com/J35P312/SVDB diff --git a/tests/test_data/BALSAMIC_env.yaml b/tests/test_data/BALSAMIC_env.yaml index b6172c820..c108004f4 100644 --- a/tests/test_data/BALSAMIC_env.yaml +++ b/tests/test_data/BALSAMIC_env.yaml @@ -24,3 +24,4 @@ D_BALSAMIC-py36_test: - pindel - multiqc - bedtools +- svdb From c0404be278042d0ad1b5ae81a3ebaf9ab7018f66 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Fri, 4 Mar 2022 14:07:30 +0100 Subject: [PATCH 33/58] fix pythonpath --- BALSAMIC/containers/vcf2cytosure/Dockerfile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/BALSAMIC/containers/vcf2cytosure/Dockerfile b/BALSAMIC/containers/vcf2cytosure/Dockerfile index 25c1cd866..0c5c7370d 100644 --- a/BALSAMIC/containers/vcf2cytosure/Dockerfile +++ b/BALSAMIC/containers/vcf2cytosure/Dockerfile @@ -7,11 +7,14 @@ LABEL about.license="MIT License (MIT)" LABEL about.maintainer="Ashwini Jeggari ashwini dot jeggari at scilifelab dot se" LABEL about.description="Bioinformatic analysis pipeline for somatic mutations in cancer" -ENV PATH="/opt/vcf2cytosure/bin:${PATH}" +ARG CONTAINER_NAME +ENV PATH="/opt/${CONTAINER_NAME}/bin:${PATH}"
+ENV PYTHONPATH="/opt/${CONTAINER_NAME}" + RUN apk add --no-cache bash gcc git python3 RUN cd /opt \ && git clone https://github.com/NBISweden/vcf2cytosure.git \ - && cd /opt/vcf2cytosure \ - && pip install -e . + && cd /opt/${CONTAINER_NAME}/ \ + && pip install --no-cache-dir . From 1451d111a10006cba9ce963c56100f13385c9b6c Mon Sep 17 00:00:00 2001 From: Khurram Maqbool Date: Tue, 8 Mar 2022 09:11:52 +0100 Subject: [PATCH 34/58] feat: add svdb workflow (#873) * update changelog * add svdb for wgs somatic SV * add svdb rule for wgs somatic SV * add svdb rule for wgs somatic SV * add svdb varcaller attribute * remove unused parameter and format command line * add cluster configuration for svdb * modify output variable name * remove unused parameter * remove unused parameter * remove unused parameter --- BALSAMIC/config/analysis.json | 4 +++ BALSAMIC/config/cluster.json | 12 ++++++++ .../varcaller_sv_wgs_filter_tumor_normal.rule | 23 +++++++++++++++ .../varcaller_sv_wgs_filter_tumor_only.rule | 23 +++++++++++++++ .../somatic_sv_tumor_normal.rule | 29 +++++++++++++++++++ .../somatic_sv_tumor_only.rule | 26 +++++++++++++++++ BALSAMIC/utils/models.py | 1 + CHANGELOG.rst | 3 +- 8 files changed, 120 insertions(+), 1 deletion(-) diff --git a/BALSAMIC/config/analysis.json b/BALSAMIC/config/analysis.json index da437c5c7..d09911f63 100644 --- a/BALSAMIC/config/analysis.json +++ b/BALSAMIC/config/analysis.json @@ -73,6 +73,10 @@ "ascat": { "mutation": "somatic", "type": "SV" + }, + "svdb": { + "mutation": "somatic", + "type": "SV" } } } diff --git a/BALSAMIC/config/cluster.json b/BALSAMIC/config/cluster.json index 93ccc8ca0..7bc561ed2 100644 --- a/BALSAMIC/config/cluster.json +++ b/BALSAMIC/config/cluster.json @@ -263,5 +263,17 @@ "collect_custom_qc_metrics": { "time": "00:15:00", "n": 1 + }, + "svdb_merge_tumor_normal": { + "time": "01:00:00", + "n": 8 + }, + "svdb_merge_tumor_only": { + "time": "01:00:00", + "n": 8 + }, + "bcftools_filter_svdb": { + "time": "01:00:00", + "n": 
8 } } diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_normal.rule index 5f2fbb596..8afd84e7e 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_normal.rule @@ -75,3 +75,26 @@ rule bcftools_filter_ascat: bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; tabix -p vcf -f {output.vcf_sv_pass}; """ + +rule bcftools_filter_svdb: + input: + vcf = vep_dir + "SV.somatic.{case_name}.svdb.all.vcf.gz", + output: + vcf_sv_pass_svdb = vep_dir + "SV.somatic.{case_name}.svdb.all.filtered.pass.vcf.gz", + benchmark: + benchmark_dir + "bcftools_filter_svdb_SV.somatic.{case_name}.svdb.vep.tsv" + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + case_name = "{case_name}", + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, + threads: + get_threads(cluster_config, "bcftools_filter_svdb") + message: + "Filtering svdb merged Manta and Delly results for PASS variants using bcftools for sample '{params.case_name}' " + shell: + """ +bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass_svdb} -O z {input.vcf}; + +tabix -p vcf -f {output.vcf_sv_pass_svdb}; + """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_only.rule index 081ca681c..3b57d0c31 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_only.rule @@ -51,3 +51,26 @@ rule bcftools_filter_delly: bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; tabix -p vcf -f {output.vcf_sv_pass}; """ + +rule bcftools_filter_svdb: + input: + vcf = 
vep_dir + "SV.somatic.{case_name}.svdb.all.vcf.gz", + output: + vcf_sv_pass_svdb = vep_dir + "SV.somatic.{case_name}.svdb.all.filtered.pass.vcf.gz", + benchmark: + benchmark_dir + "bcftools_filter_svdb_SV.somatic.{case_name}.svdb.vep.tsv" + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + case_name = "{case_name}", + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, + threads: + get_threads(cluster_config, "bcftools_filter_svdb") + message: + "Filtering svdb merged Manta and Delly results for PASS variants using bcftools for sample '{params.case_name}' " + shell: + """ +bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass_svdb} -O z {input.vcf}; + +tabix -p vcf -f {output.vcf_sv_pass_svdb}; + """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule index c9bb6b186..dd2b2c941 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule @@ -206,3 +206,32 @@ rule ascat_tumor_normal_merge_output: """ python {params.merge_ascat_output_script} {output.ascat_output_pdf} {input.sample_statistics} {input.ascat_plots} """ + +rule svdb_merge_tumor_normal: + input: + manta_vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".manta.vcf.gz", + delly_vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.vcf.gz", + output: + svdb_vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.vcf.gz", + namemap = vcf_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".svdb.sample_name_map", + benchmark: + Path(benchmark_dir, 'svdb_merge_tumor_normal_' + config["analysis"]["case_id"] + ".tsv") + singularity: + Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix() + params: + tumor = get_sample_type(config["samples"], "tumor"), + normal = get_sample_type(config["samples"], "normal"), + case_name = config["analysis"]["case_id"], + threads: + get_threads(cluster_config, "svdb_merge_tumor_normal") + message: + "Merging Manta and Delly results for PASS variants using svdb for sample '{params.case_name}' " + shell: + """ +svdb --merge --no_intra --bnd_distance 5000 --overlap 0.80 \ +--vcf {input.manta_vcf}:manta {input.delly_vcf}:delly \ +--priority manta,delly | \ +bgzip -l 9 -c > {output.svdb_vcf} + +echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap}; + """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule index 2bf3655f4..dead0ba8a 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule @@ -104,4 +104,30 @@ bcftools view --threads {threads} -f PASS -O z -o {output.vcf} {input.bcf}; tabix -p vcf -f {output.vcf}; """ +rule svdb_merge_tumor_only: + input: + manta_vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".manta.vcf.gz", + delly_vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.vcf.gz", + output: + svdb_vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.vcf.gz", + namemap = vcf_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".svdb.sample_name_map", + benchmark: + Path(benchmark_dir, 'svdb_merge_tumor_only_' + config["analysis"]["case_id"] + ".tsv") + singularity: + Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix() + params: + tumor = get_sample_type(config["samples"], "tumor"), + case_name = config["analysis"]["case_id"], + threads: + get_threads(cluster_config, "svdb_merge_tumor_only") + message: + "Merging Manta and Delly results for PASS variants using svdb for sample '{params.case_name}' " + shell: + """ +svdb --merge --no_intra --bnd_distance 5000 --overlap 0.80 \ +--vcf {input.manta_vcf}:manta {input.delly_vcf}:delly \ +--priority manta,delly | \ +bgzip -l 9 -c > {output.svdb_vcf} +echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; + """ diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index 286c74ba4..13ae60ec0 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -187,6 +187,7 @@ class VCFModel(BaseModel): TNscope_umi: VarcallerAttribute delly: VarcallerAttribute ascat: VarcallerAttribute + svdb: VarcallerAttribute class AnalysisModel(BaseModel): diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b943c3d44..cb51d4ba9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,8 +8,9 @@ Added: * Call umi variants using TNscope in bed defined regions #821 * UMI duplication metrics to report in multiqc_picard_dups.json #844 * QC default validation conditions (for not defined capture kits) #855 +* SVDB to the varcall_py36 container #871 +* SVDB to WGS workflow #871 * Docker container for vcf2cytosure #858 -* SVDB to the varcall_py36 container Changed: ^^^^^^^^ From e1eeb00641d27329f0fcd342dc73a089d54c4580 Mon Sep 17 00:00:00 2001 From: Khurram Maqbool Date: Wed, 9 Mar 2022 14:25:02 +0100 Subject: [PATCH 35/58] feat: add svdb workflow tga (#879) * svdb merge manta delly tga * update changelog * svdb merge manta delly tga --- BALSAMIC/constants/workflow_params.py | 2 +- 
.../annotation/varcaller_sv_filter.rule | 23 +++++++++++++++++++ CHANGELOG.rst | 6 +++-- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index b02987b1d..48d6ed4cb 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -89,7 +89,7 @@ "mutation": "somatic", "type": "SV", "analysis_type": ["paired", "single"], - "sequencing_type": ["wgs"], + "sequencing_type": ["wgs", "targeted"], "workflow_solution": ["BALSAMIC"], }, } diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule index cb52cf100..00cef12e3 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule @@ -73,3 +73,26 @@ rule bcftools_filter_delly: bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; tabix -p vcf -f {output.vcf_sv_pass}; """ + +rule bcftools_filter_svdb: + input: + vcf = vep_dir + "SV.somatic.{case_name}.svdb.all.vcf.gz", + output: + vcf_sv_pass_svdb = vep_dir + "SV.somatic.{case_name}.svdb.all.filtered.pass.vcf.gz", + benchmark: + benchmark_dir + "bcftools_filter_svdb_SV.somatic.{case_name}.svdb.vep.tsv" + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + case_name = "{case_name}", + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, + threads: + get_threads(cluster_config, "bcftools_filter_svdb") + message: + "Filtering svdb merged Manta and Delly results for PASS variants using bcftools for sample '{params.case_name}' " + shell: + """ +bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass_svdb} -O z {input.vcf}; + +tabix -p vcf -f {output.vcf_sv_pass_svdb}; + """ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index cb51d4ba9..f445e76a4 100644 --- a/CHANGELOG.rst +++ 
b/CHANGELOG.rst @@ -8,9 +8,11 @@ Added: * Call umi variants using TNscope in bed defined regions #821 * UMI duplication metrics to report in multiqc_picard_dups.json #844 * QC default validation conditions (for not defined capture kits) #855 -* SVDB to the varcall_py36 container #871 -* SVDB to WGS workflow #871 +* SVdb to the varcall_py36 container #871 +* SVdb to WGS workflow #871 * Docker container for vcf2cytosure #858 +* SVdb to TGA workflow #871 + Changed: ^^^^^^^^ From e0177739525120537109a6b3ed6cd8108842fde4 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 14 Mar 2022 18:02:25 +0100 Subject: [PATCH 36/58] feat: create cgh files from cnvkit outputs (#880) * add snakemake rule for cytosure * add vcf2cytosure to the valid container name --- BALSAMIC/constants/common.py | 1 + BALSAMIC/constants/workflow_rules.py | 3 +++ .../annotation/vcf2cytosure_convert.rule | 24 +++++++++++++++++++ BALSAMIC/workflows/balsamic.smk | 6 ++++- CHANGELOG.rst | 1 + 5 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule diff --git a/BALSAMIC/constants/common.py b/BALSAMIC/constants/common.py index ca94f859d..a2455581a 100644 --- a/BALSAMIC/constants/common.py +++ b/BALSAMIC/constants/common.py @@ -52,6 +52,7 @@ "delly", "ascatNgs", "balsamic", + "vcf2cytosure", } BIOINFO_TOOL_ENV = { diff --git a/BALSAMIC/constants/workflow_rules.py b/BALSAMIC/constants/workflow_rules.py index 91a206b70..7fb80513a 100644 --- a/BALSAMIC/constants/workflow_rules.py +++ b/BALSAMIC/constants/workflow_rules.py @@ -38,6 +38,7 @@ "snakemake_rules/annotation/rankscore.rule", "snakemake_rules/annotation/varcaller_sv_filter.rule", "snakemake_rules/annotation/varcaller_filter_tumor_only.rule", + "snakemake_rules/annotation/vcf2cytosure_convert.rule", ], }, "paired_targeted": { @@ -70,6 +71,7 @@ "snakemake_rules/annotation/rankscore.rule", "snakemake_rules/annotation/varcaller_sv_filter.rule", 
"snakemake_rules/annotation/varcaller_filter_tumor_normal.rule", + "snakemake_rules/annotation/vcf2cytosure_convert.rule", ], }, "single_wgs": { @@ -139,5 +141,6 @@ "mergeBam_normal_umiconsensus", "cnvkit_paired", "cnvkit_single", + "vcf2cytosure_convert", "ascat_tumor_normal_merge_output", ] diff --git a/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule b/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule new file mode 100644 index 000000000..54cdaa2eb --- /dev/null +++ b/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule @@ -0,0 +1,24 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + + +rule vcf2cytosure_convert: + input: + cnv_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf.gz", + cnv_cnr = cnv_dir + "tumor.merged" + ".cnr" + output: + cgh_file = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf2cytosure.cgh" + benchmark: + Path(benchmark_dir, 'vcf2cytosure_convert.' + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("vcf2cytosure") + ".sif").as_posix() + threads: + get_threads(cluster_config, "vcf2cytosure_convert") + params: + case_name = config["analysis"]["case_id"], + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv-somatic"}, + message: "Convert VCF file with CNVs to the .CGH format using vcf2cytosure for sample {params.case_name}" + shell: + """ +vcf2cytosure --vcf {input.cnv_vcf} --cn {input.cnv_cnr} --out {output.cgh_file} --bins 1 + """ diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 0ddc68010..9e6db5900 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -222,6 +222,10 @@ if config["analysis"]["sequencing_type"] != "wgs": analysis_specific_results.append(expand(vep_dir + "{vcf}.all.filtered.pass.ranked.vcf.gz", vcf=get_vcf(config, ["vardict"], [config["analysis"]["case_id"]]))) + 
analysis_specific_results.append(expand(vcf_dir + "CNV.somatic.{case_name}.{var_caller}.vcf2cytosure.cgh", + case_name=config["analysis"]["case_id"], + var_caller=["cnvkit"])) + analysis_specific_results.append(expand(umi_qc_dir + "{sample}.umi.mean_family_depth", sample=config["samples"])) if background_variant_file: @@ -320,7 +324,7 @@ if 'delivery' in config: rule all: input: - quality_control_results + analysis_specific_results, + quality_control_results + analysis_specific_results output: finish_file = os.path.join(get_result_dir(config), "analysis_finish") params: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index db359a050..e8ea3c4cc 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -11,6 +11,7 @@ Added: * SVdb to the varcall_py36 container #871 * SVdb to WGS workflow #871 * Docker container for vcf2cytosure #858 +* Snakemake rule for creating `.cgh` files from `CNVkit` outputs #858 * SVdb to TGA workflow #871 From c749a3c6363c9b00ec32d430ca80c34ce143cbea Mon Sep 17 00:00:00 2001 From: Khurram Maqbool Date: Tue, 15 Mar 2022 09:30:19 +0100 Subject: [PATCH 37/58] feat: svdb merge sv cnv (#886) * update changelog * refactor delly, svdb sequencing_type * refactor varcallers * change type for valid_variant_callers * add sv cnv callers * svdb merge sv cnv * update svdb to 2.5.1 * rename sv_callers to svdb_sv_callers_to_merge_prio * add get_svdb_input function * lint * fix CodeFactor suggestion * put input function back in rule * change var name --- BALSAMIC/constants/workflow_params.py | 4 ++-- .../containers/varcall_py36/varcall_py36.yaml | 2 +- .../somatic_sv_tumor_normal.rule | 16 +++++++++++----- .../variant_calling/somatic_sv_tumor_only.rule | 17 +++++++++++------ BALSAMIC/utils/models.py | 6 +++--- BALSAMIC/utils/rule.py | 4 ++-- BALSAMIC/workflows/balsamic.smk | 18 ++++++++++++++++++ CHANGELOG.rst | 2 +- 8 files changed, 49 insertions(+), 20 deletions(-) diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index 
48d6ed4cb..575457923 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -75,7 +75,7 @@ "mutation": "somatic", "type": "SV", "analysis_type": ["paired", "single"], - "sequencing_type": ["wgs", "targeted"], + "sequencing_type": ["targeted", "wgs"], "workflow_solution": ["BALSAMIC"], }, "ascat": { @@ -89,7 +89,7 @@ "mutation": "somatic", "type": "SV", "analysis_type": ["paired", "single"], - "sequencing_type": ["wgs", "targeted"], + "sequencing_type": ["targeted", "wgs"], "workflow_solution": ["BALSAMIC"], }, } diff --git a/BALSAMIC/containers/varcall_py36/varcall_py36.yaml b/BALSAMIC/containers/varcall_py36/varcall_py36.yaml index 9cc507742..6fab44964 100644 --- a/BALSAMIC/containers/varcall_py36/varcall_py36.yaml +++ b/BALSAMIC/containers/varcall_py36/varcall_py36.yaml @@ -9,6 +9,6 @@ dependencies: - bioconda::gatk=3.8 - bioconda::vardict=2019.06.04=pl526_0 - bioconda::vardict-java=1.7 - - bioconda::svdb=2.5.0 + - bioconda::svdb=2.5.1 - conda-forge::libiconv - conda-forge::r-base=3.6.3 diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule index dd2b2c941..ee632501b 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule @@ -209,8 +209,12 @@ python {params.merge_ascat_output_script} {output.ascat_output_pdf} {input.sampl rule svdb_merge_tumor_normal: input: - manta_vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".manta.vcf.gz", - delly_vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.vcf.gz", + vcf = expand( + vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".{caller}.vcf.gz", + caller=somatic_caller_sv) + + expand( + vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".{caller}.vcf.gz", + caller=somatic_caller_cnv) output: svdb_vcf = vcf_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".svdb.vcf.gz", namemap = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.sample_name_map", @@ -222,6 +226,8 @@ rule svdb_merge_tumor_normal: tumor = get_sample_type(config["samples"], "tumor"), normal = get_sample_type(config["samples"], "normal"), case_name = config["analysis"]["case_id"], + vcf= lambda wildcards, input:[input[index] + ":" + svdb_callers_prio[index] for index in range(0,len(input))], + svdb_priority= ",".join(svdb_callers_prio) threads: get_threads(cluster_config, "svdb_merge_tumor_normal") message: @@ -229,9 +235,9 @@ rule svdb_merge_tumor_normal: shell: """ svdb --merge --no_intra --bnd_distance 5000 --overlap 0.80 \ ---vcf {input.manta_vcf}:manta {input.delly_vcf}:delly \ ---priority manta,delly | \ -bgzip -l 9 -c > {output.svdb_vcf} +--vcf {params.vcf} \ +--priority {params.svdb_priority} | \ +bgzip -l 9 -c > {output.svdb_vcf}; echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap}; """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule index dead0ba8a..3787002a4 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule @@ -106,8 +106,12 @@ tabix -p vcf -f {output.vcf}; rule svdb_merge_tumor_only: input: - manta_vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".manta.vcf.gz", - delly_vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.vcf.gz", + vcf = expand( + vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".{caller}.vcf.gz", + caller=somatic_caller_sv) + + expand( + vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".{caller}.vcf.gz", + caller=somatic_caller_cnv) output: svdb_vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.vcf.gz", namemap = vcf_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".svdb.sample_name_map", @@ -118,6 +122,8 @@ rule svdb_merge_tumor_only: params: tumor = get_sample_type(config["samples"], "tumor"), case_name = config["analysis"]["case_id"], + vcf= lambda wildcards, input:[input[index] + ":" + svdb_callers_prio[index] for index in range(0,len(input))], + svdb_priority= ",".join(svdb_callers_prio) threads: get_threads(cluster_config, "svdb_merge_tumor_only") message: @@ -125,9 +131,8 @@ rule svdb_merge_tumor_only: shell: """ svdb --merge --no_intra --bnd_distance 5000 --overlap 0.80 \ ---vcf {input.manta_vcf}:manta {input.delly_vcf}:delly \ ---priority manta,delly | \ -bgzip -l 9 -c > {output.svdb_vcf} - +--vcf {params.vcf} \ +--priority {params.svdb_priority} | \ +bgzip -l 9 -c > {output.svdb_vcf}; echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; """ diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index 13ae60ec0..6d3965923 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -176,16 +176,16 @@ def sequencing_type_literal(cls, value) -> str: class VCFModel(BaseModel): """Contains VCF config""" - manta: VarcallerAttribute - cnvkit: VarcallerAttribute vardict: VarcallerAttribute tnscope: VarcallerAttribute dnascope: VarcallerAttribute tnhaplotyper: VarcallerAttribute - manta_germline: VarcallerAttribute haplotypecaller: VarcallerAttribute TNscope_umi: VarcallerAttribute + manta_germline: VarcallerAttribute + manta: VarcallerAttribute delly: VarcallerAttribute + cnvkit: VarcallerAttribute ascat: VarcallerAttribute svdb: VarcallerAttribute diff --git a/BALSAMIC/utils/rule.py b/BALSAMIC/utils/rule.py index 5a6144885..d7cb07d62 100644 --- a/BALSAMIC/utils/rule.py +++ b/BALSAMIC/utils/rule.py @@ -75,7 +75,7 @@ def get_variant_callers( WorkflowRunError if values are not valid """ - valid_variant_callers = set() + valid_variant_callers = list() if mutation_type not in MUTATION_TYPE: raise WorkflowRunError(f"{mutation_type} is not a valid mutation type.") @@ -99,7 
+99,7 @@ def get_variant_callers( and workflow_solution in variant_caller_params.get("workflow_solution") and sequencing_type in variant_caller_params.get("sequencing_type") ): - valid_variant_callers.add(variant_caller_name) + valid_variant_callers.append(variant_caller_name) return list(valid_variant_callers) diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 9e6db5900..375c6ab78 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -130,6 +130,8 @@ os.environ['TMPDIR'] = get_result_dir(config) # Extract variant callers for the workflow germline_caller = [] somatic_caller = [] +somatic_caller_cnv = [] +somatic_caller_sv = [] for m in MUTATION_TYPE: germline_caller_balsamic = get_variant_callers(config=config, analysis_type=config['analysis']['analysis_type'], @@ -170,6 +172,22 @@ for m in MUTATION_TYPE: mutation_class="somatic") somatic_caller = somatic_caller + somatic_caller_sentieon_umi + somatic_caller_balsamic + somatic_caller_sentieon +somatic_caller_sv = get_variant_callers(config=config, + analysis_type=config['analysis']['analysis_type'], + workflow_solution="BALSAMIC", + mutation_type="SV", + sequencing_type=config["analysis"]["sequencing_type"], + mutation_class="somatic") + +somatic_caller_cnv = get_variant_callers(config=config, + analysis_type=config['analysis']['analysis_type'], + workflow_solution="BALSAMIC", + mutation_type="CNV", + sequencing_type=config["analysis"]["sequencing_type"], + mutation_class="somatic") +somatic_caller_sv.remove("svdb") +svdb_callers_prio = somatic_caller_sv + somatic_caller_cnv + # Collect only snv callers for calculating tmb somatic_caller_tmb = [] for ws in ["BALSAMIC","Sentieon","Sentieon_umi"]: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e8ea3c4cc..20b1b99bb 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,7 +13,7 @@ Added: * Docker container for vcf2cytosure #858 * Snakemake rule for creating `.cgh` files from `CNVkit` outputs #858 * SVdb 
to TGA workflow #871 - +* SVdb merge SV and CNV #871 Changed: ^^^^^^^^ From 59ed2fd0ae87b2e9b30bfd2abe356868a0a85e4e Mon Sep 17 00:00:00 2001 From: Khurram Maqbool Date: Wed, 16 Mar 2022 13:47:21 +0100 Subject: [PATCH 38/58] feat: remove sv and cnv callers from annotation and filter rules (#889) * update changelog * move sv filter to common * remove sv cnv callers * remove sv cnv callers * delete sv wgs filter files * update changelog * reformat using black * revert changes * fix DELIVERY_RULES for SV callers * modify changelog --- BALSAMIC/constants/workflow_rules.py | 14 +-- .../annotation/varcaller_sv_filter.rule | 74 +------------ .../varcaller_sv_wgs_filter_tumor_normal.rule | 100 ------------------ .../varcaller_sv_wgs_filter_tumor_only.rule | 76 ------------- BALSAMIC/workflows/balsamic.smk | 4 + CHANGELOG.rst | 1 + 6 files changed, 11 insertions(+), 258 deletions(-) delete mode 100644 BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_normal.rule delete mode 100644 BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_only.rule diff --git a/BALSAMIC/constants/workflow_rules.py b/BALSAMIC/constants/workflow_rules.py index 7fb80513a..5900605e5 100644 --- a/BALSAMIC/constants/workflow_rules.py +++ b/BALSAMIC/constants/workflow_rules.py @@ -9,7 +9,10 @@ ], "align": [], "varcall": ["snakemake_rules/variant_calling/germline_sv.rule"], - "annotate": ["snakemake_rules/annotation/vep.rule"], + "annotate": [ + "snakemake_rules/annotation/vep.rule", + "snakemake_rules/annotation/varcaller_sv_filter.rule", + ], }, "single_targeted": { "qc": [ @@ -36,7 +39,6 @@ ], "annotate": [ "snakemake_rules/annotation/rankscore.rule", - "snakemake_rules/annotation/varcaller_sv_filter.rule", "snakemake_rules/annotation/varcaller_filter_tumor_only.rule", "snakemake_rules/annotation/vcf2cytosure_convert.rule", ], @@ -69,7 +71,6 @@ ], "annotate": [ "snakemake_rules/annotation/rankscore.rule", - "snakemake_rules/annotation/varcaller_sv_filter.rule", 
"snakemake_rules/annotation/varcaller_filter_tumor_normal.rule", "snakemake_rules/annotation/vcf2cytosure_convert.rule", ], @@ -89,7 +90,6 @@ ], "annotate": [ "snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule", - "snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_only.rule", ], }, "paired_wgs": { @@ -107,7 +107,6 @@ ], "annotate": [ "snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule", - "snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_normal.rule", ], }, } @@ -128,10 +127,7 @@ "bcftools_filter_tnscope_tumor_normal", "bcftools_filter_tnhaplotyper_tumor_only", "bcftools_filter_tnhaplotyper_tumor_normal", - "bcftools_filter_manta", - "bcftools_filter_delly", - "bcftools_filter_ascat", - "bcftools_filter_cnvkit", + "bcftools_filter_svdb", "bcftools_intersect_tumor_only", "bcftools_filter_TNscope_umi_tumor_only", "genmod_score_vardict", diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule index 00cef12e3..efa975278 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule @@ -1,78 +1,6 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -# NGS filters for various scenarios - - - - -rule bcftools_filter_manta: - input: - vcf = vep_dir + "{var_type}.somatic.{case_name}.manta.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "{var_type}.somatic.{case_name}.manta.all.filtered.pass.vcf.gz", - benchmark: - Path(benchmark_dir, 'bcftools_filter_manta_' + "{var_type}.somatic.{case_name}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = '{case_name}', - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"} - threads: - get_threads(cluster_config, 'bcftools_filter_manta') - message: - "Filtering Manta results for PASS variants using bcftools for sample 
'{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; - -tabix -p vcf -f {output.vcf_sv_pass}; - """ - - -rule bcftools_filter_cnvkit: - input: - vcf = vep_dir + "{var_type}.somatic.{case_name}.cnvkit.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "{var_type}.somatic.{case_name}.cnvkit.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + 'bcftools_filter_' + "{var_type}.somatic.{case_name}.cnvkit.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - threads: - get_threads(cluster_config, 'bcftools_filter_cnvkit') - message: - "Filtering CNVkit results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass}; - """ - - -rule bcftools_filter_delly: - input: - vcf = vep_dir + "{var_type}.somatic.{case_name}.delly.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "{var_type}.somatic.{case_name}.delly.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + 'bcftools_filter_' + "{var_type}.somatic.{case_name}.delly.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - threads: - get_threads(cluster_config, 'bcftools_filter_delly') - message: - "Filtering Delly results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass}; - """ +# NGS filters for merged SVs and CNVs rule bcftools_filter_svdb: input: diff --git 
a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_normal.rule deleted file mode 100644 index 8afd84e7e..000000000 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_normal.rule +++ /dev/null @@ -1,100 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 -# NGS filters for various scenarios - - - - -rule bcftools_filter_manta: - input: - vcf = vep_dir + "SV.somatic.{case_name}.manta.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "SV.somatic.{case_name}.manta.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + "bcftools_filter_manta_SV.somatic.{case_name}.manta.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, - var_caller = "manta" - threads: - get_threads(cluster_config, "bcftools_filter_manta") - message: - "Filtering Manta results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass}; - """ - - -rule bcftools_filter_delly: - input: - vcf = vep_dir + "SV.somatic.{case_name}.delly.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "SV.somatic.{case_name}.delly.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + "bcftools_filter_delly_SV.somatic.{case_name}.delly.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - var_caller = "delly" - threads: - get_threads(cluster_config, "bcftools_filter_delly") - message: - "Filtering Delly results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ 
-bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass}; - """ - - -rule bcftools_filter_ascat: - input: - vcf = vep_dir + "CNV.somatic.{case_name}.ascat.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "CNV.somatic.{case_name}.ascat.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + "bcftools_filter_ascat_CNV.somatic.{case_name}.ascat.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - var_caller = "ascat" - threads: - get_threads(cluster_config, "bcftools_filter_ascat") - message: - "Filtering Ascat results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass}; - """ - -rule bcftools_filter_svdb: - input: - vcf = vep_dir + "SV.somatic.{case_name}.svdb.all.vcf.gz", - output: - vcf_sv_pass_svdb = vep_dir + "SV.somatic.{case_name}.svdb.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + "bcftools_filter_svdb_SV.somatic.{case_name}.svdb.vep.tsv" - singularity: - Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, - threads: - get_threads(cluster_config, "bcftools_filter_svdb") - message: - "Filtering svdb merged Manta and Delly results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass_svdb} -O z {input.vcf}; - -tabix -p vcf -f {output.vcf_sv_pass_svdb}; - """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_only.rule 
b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_only.rule deleted file mode 100644 index 3b57d0c31..000000000 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_only.rule +++ /dev/null @@ -1,76 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 -# NGS filters for various scenarios - - - - -rule bcftools_filter_manta: - input: - vcf = vep_dir + "SV.somatic.{case_name}.manta.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "SV.somatic.{case_name}.manta.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + "bcftools_filter_manta_SV.somatic.{case_name}.manta.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, - var_caller = "manta" - threads: - get_threads(cluster_config, "bcftools_filter_manta") - message: - "Filtering Manta results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass}; - """ - - -rule bcftools_filter_delly: - input: - vcf = vep_dir + "SV.somatic.{case_name}.delly.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "SV.somatic.{case_name}.delly.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + "bcftools_filter_delly_SV.somatic.{case_name}.delly.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - var_caller = "delly" - threads: - get_threads(cluster_config, "bcftools_filter_delly") - message: - "Filtering Delly results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; 
-tabix -p vcf -f {output.vcf_sv_pass}; - """ - -rule bcftools_filter_svdb: - input: - vcf = vep_dir + "SV.somatic.{case_name}.svdb.all.vcf.gz", - output: - vcf_sv_pass_svdb = vep_dir + "SV.somatic.{case_name}.svdb.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + "bcftools_filter_svdb_SV.somatic.{case_name}.svdb.vep.tsv" - singularity: - Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, - threads: - get_threads(cluster_config, "bcftools_filter_svdb") - message: - "Filtering svdb merged Manta and Delly results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass_svdb} -O z {input.vcf}; - -tabix -p vcf -f {output.vcf_sv_pass_svdb}; - """ diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 375c6ab78..78eb1c845 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -188,6 +188,10 @@ somatic_caller_cnv = get_variant_callers(config=config, somatic_caller_sv.remove("svdb") svdb_callers_prio = somatic_caller_sv + somatic_caller_cnv +for var_caller in svdb_callers_prio: + if var_caller in somatic_caller: + somatic_caller.remove(var_caller) + # Collect only snv callers for calculating tmb somatic_caller_tmb = [] for ws in ["BALSAMIC","Sentieon","Sentieon_umi"]: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 20b1b99bb..3783d0713 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -31,6 +31,7 @@ Removed ^^^^^^^ * ``--qc-metrics/--no-qc-metrics`` flag from the ``balsamic report deliver`` command #833 +* SV and CNV callers from annotation and filtering #871 [8.2.8] -------- From 8914b315ba8cf4486871a46707801dccc648bc71 Mon Sep 17 00:00:00 2001 From: Khurram Maqbool Date: Mon, 28 Mar 2022 13:11:48 +0200 Subject: [PATCH 39/58] feat: remove vcfanno and cosmic for sv 
(#891) * update changelog * remove vcfanno for SV * add cluster config for somatic sv * fix delivery for somatic sv callers * remove housekeeper_id * fix delivery for somatic sv callers * add temp to vep somatic output files * remove cosmic from sv vep annotation * fix coverage drop * remove tmp file from params * remove tmp file --- BALSAMIC/config/cluster.json | 6 +- BALSAMIC/constants/workflow_rules.py | 3 +- BALSAMIC/snakemake_rules/annotation/vep.rule | 64 ++++++++++++++++---- CHANGELOG.rst | 1 + 4 files changed, 60 insertions(+), 14 deletions(-) diff --git a/BALSAMIC/config/cluster.json b/BALSAMIC/config/cluster.json index 7bc561ed2..84c0fdd62 100644 --- a/BALSAMIC/config/cluster.json +++ b/BALSAMIC/config/cluster.json @@ -220,10 +220,14 @@ "time": "4:00:00", "n": 12 }, - "vep_somatic": { + "vep_somatic_snv": { "time":"18:00:00", "n": 24 }, + "vep_somatic_sv": { + "time":"12:00:00", + "n": 24 + }, "vep_germline": { "time":"06:00:00", "n": 10 diff --git a/BALSAMIC/constants/workflow_rules.py b/BALSAMIC/constants/workflow_rules.py index 5900605e5..bbd5c0d0a 100644 --- a/BALSAMIC/constants/workflow_rules.py +++ b/BALSAMIC/constants/workflow_rules.py @@ -116,7 +116,8 @@ "fastp", "multiqc", "collect_custom_qc_metrics", - "vep_somatic", + "vep_somatic_snv", + "vep_somatic_sv", "vep_germline", "tmb_calculation", "bcftools_filter_TNscope_umi_tumor_only", diff --git a/BALSAMIC/snakemake_rules/annotation/vep.rule b/BALSAMIC/snakemake_rules/annotation/vep.rule index 0f6a76ad3..c4d586c99 100644 --- a/BALSAMIC/snakemake_rules/annotation/vep.rule +++ b/BALSAMIC/snakemake_rules/annotation/vep.rule @@ -3,32 +3,31 @@ # VEP annotation module. 
Annotate all VCFs generated through VEP - -rule vep_somatic: +rule vep_somatic_snv: input: - vcf = vcf_dir + "{var_type}.somatic.{case_name}.{var_caller}.vcf.gz", - header = vcf_dir + "{var_type}.somatic.{case_name}.{var_caller}.sample_name_map", + vcf = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.vcf.gz", + header = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.sample_name_map", cosmic = config["reference"]["cosmic"] output: - vcf_all = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.all.vcf.gz", - vcf_summary = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.all.vcf.gz_summary.html", - bcftools_stats = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.all.stats" + vcf_all = temp(vep_dir + "SNV.somatic.{case_name}.{var_caller}.all.vcf.gz"), + vcf_summary = vep_dir + "SNV.somatic.{case_name}.{var_caller}.all.vcf.gz_summary.html", + bcftools_stats = vep_dir + "SNV.somatic.{case_name}.{var_caller}.all.stats" benchmark: - Path(benchmark_dir, "vep_somatic_{var_type}.somatic.{case_name}.{var_caller}.tsv").as_posix() + Path(benchmark_dir, "vep_somatic_SNV.somatic.{case_name}.{var_caller}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() params: housekeeper_id = {"id": "{case_name}", "tags": "annotated-somatic"}, ref_path = Path(config["reference"]["gnomad_variant"]).parent.as_posix(), - message_text = "{var_type}.somatic.{case_name}.{var_caller}.vcf.gz", - tmpvcf = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.tmp.vcf.gz", + message_text = "SNV.somatic.{case_name}.{var_caller}.vcf.gz", + tmpvcf = vep_dir + "SNV.somatic.{case_name}.{var_caller}.tmp.vcf.gz", vcfanno_toml = VCFANNO_TOML, vep_cache = config["reference"]["vep"], vep_defaults = params.vep.vep_filters threads: - get_threads(cluster_config, "vep_somatic") + get_threads(cluster_config, "vep_somatic_snv") message: - "Running vep annotation on {params.message_text}" + "Running vep annotation on 
{params.message_text}" shell: """ vep_path=$(dirname $(readlink -f $(which vep))); @@ -56,6 +55,47 @@ bcftools stats {output.vcf_all} > {output.bcftools_stats}; rm $tmpvcf; """ +rule vep_somatic_sv: + input: + vcf = vcf_dir + "SV.somatic.{case_name}.svdb.vcf.gz", + header = vcf_dir + "SV.somatic.{case_name}.svdb.sample_name_map", + output: + vcf_all = temp(vep_dir + "SV.somatic.{case_name}.svdb.all.vcf.gz"), + vcf_summary = vep_dir + "SV.somatic.{case_name}.svdb.all.vcf.gz_summary.html", + bcftools_stats = vep_dir + "SV.somatic.{case_name}.svdb.all.stats" + benchmark: + Path(benchmark_dir, "vep_somatic_SV.somatic.{case_name}.svdb.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() + params: + housekeeper_id = {"id": "{case_name}", "tags": "annotated-somatic"}, + message_text = "SV.somatic.{case_name}.svdb.vcf.gz", + vep_cache = config["reference"]["vep"], + vep_defaults = params.vep.vep_filters + threads: + get_threads(cluster_config, "vep_somatic_sv") + message: + "Running vep annotation on {params.message_text}" + shell: + """ +vep_path=$(dirname $(readlink -f $(which vep))); +export PERL5LIB=; + +bcftools reheader --threads {threads} -s {input.header} {input.vcf} | \ +bcftools view --threads {threads} -O v | \ +vep \ +--dir $vep_path \ +--dir_cache {params.vep_cache} \ +--dir_plugins $vep_path \ +--output_file {output.vcf_all} \ +--fork {threads} \ +{params.vep_defaults} \ + +tabix -p vcf -f {output.vcf_all}; + +bcftools stats {output.vcf_all} > {output.bcftools_stats}; + + """ rule tmb_calculation: input: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3783d0713..012f80c88 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -32,6 +32,7 @@ Removed * ``--qc-metrics/--no-qc-metrics`` flag from the ``balsamic report deliver`` command #833 * SV and CNV callers from annotation and filtering #871 +* vcfanno from SV annotation [8.2.8] -------- From cd97dbdc7d0ead3922eb54fbae36952baae210ac Mon Sep 17 
00:00:00 2001 From: Khurram Maqbool Date: Mon, 4 Apr 2022 17:15:18 +0200 Subject: [PATCH 40/58] fix: update svdb to 2.6.0 (#901) * update changelog * update svdb to 2.6.0 * update black linter to 22.3.0 * update changelog --- .github/workflows/black_linter.yml | 2 +- BALSAMIC/containers/varcall_py36/varcall_py36.yaml | 2 +- CHANGELOG.rst | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/black_linter.yml b/.github/workflows/black_linter.yml index 9686afda6..5e9cad8b6 100644 --- a/.github/workflows/black_linter.yml +++ b/.github/workflows/black_linter.yml @@ -11,4 +11,4 @@ jobs: - uses: psf/black@stable with: options: "--check --verbose" - version: "21.7b0" + version: "22.3.0" diff --git a/BALSAMIC/containers/varcall_py36/varcall_py36.yaml b/BALSAMIC/containers/varcall_py36/varcall_py36.yaml index 6fab44964..dec8bbdd7 100644 --- a/BALSAMIC/containers/varcall_py36/varcall_py36.yaml +++ b/BALSAMIC/containers/varcall_py36/varcall_py36.yaml @@ -9,6 +9,6 @@ dependencies: - bioconda::gatk=3.8 - bioconda::vardict=2019.06.04=pl526_0 - bioconda::vardict-java=1.7 - - bioconda::svdb=2.5.1 + - bioconda::svdb=2.6.0 - conda-forge::libiconv - conda-forge::r-base=3.6.3 diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 012f80c88..f3d90721e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -20,6 +20,8 @@ Changed: * Merge QC metric extraction workflows #833 * Changed the base-image for balsamic container to 4.10.3-alpine #869 +* updated SVdb to 2.6.0 #871 + Fixed: ^^^^^^ From bd70c3a65c313a7d8f824347c33fe6c9a3f647c1 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 4 Apr 2022 21:07:53 +0200 Subject: [PATCH 41/58] feat: Option to use PON with TGA cnvkit (#851) * add new pon option for config case * add option to consider pon ref in cnvkit t-only * add pon to models * remove unused pon option for snv * add changelog * add pon bindpath * fix black * fix black for config case * add tests for pon file * add test pon file * add pon to conftest * add echo 
for pon file exists * add new tests for pon path * add test * fix black * add small test * fix black and code smell * fix bind_path * add new pon tests * fix conftest * fix black * fix code smell * test codecov * add check path existence * undo changes * restructure pon_cnn from config analysis to panel Co-authored-by: Vadym Ivanchuk --- BALSAMIC/commands/config/case.py | 8 +++ BALSAMIC/commands/run/analysis.py | 2 + .../variant_calling/cnvkit_single.rule | 28 +++++---- .../variant_calling/somatic_tumor_only.rule | 12 +--- BALSAMIC/utils/models.py | 8 +++ CHANGELOG.rst | 2 + tests/commands/config/test_config_sample.py | 32 +++++++++++ tests/commands/run/test_run_analysis.py | 19 +++++++ tests/conftest.py | 57 ++++++++++++++++++- .../references/panel/test_panel_ponn.cnn | 6 ++ 10 files changed, 152 insertions(+), 22 deletions(-) create mode 100644 tests/test_data/references/panel/test_panel_ponn.cnn diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py index bb172b29a..b52d11824 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -75,6 +75,12 @@ required=False, help="Background set of valid variants for UMI", ) +@click.option( + "--pon-cnn", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="Panel of normal reference (.cnn) for cnvkit", +) @click.option( "--balsamic-cache", type=click.Path(exists=True, resolve_path=True), @@ -140,6 +146,7 @@ def case_config( quality_trim, panel_bed, background_variants, + pon_cnn, analysis_dir, tumor, normal, @@ -196,6 +203,7 @@ def case_config( panel={ "capture_kit": panel_bed, "chrom": get_panel_chrom(panel_bed), + "pon_cnn": pon_cnn, } if panel_bed else None, diff --git a/BALSAMIC/commands/run/analysis.py b/BALSAMIC/commands/run/analysis.py index ec0c41de6..7c9c9b491 100644 --- a/BALSAMIC/commands/run/analysis.py +++ b/BALSAMIC/commands/run/analysis.py @@ -238,6 +238,8 @@ def analysis(
bind_path.append(sample_config.get("panel").get("capture_kit")) if "background_variants" in sample_config: bind_path.append(sample_config.get("background_variants")) + if "pon_cnn" in sample_config: + bind_path.append(sample_config.get("panel").get("pon_cnn")) bind_path.append(BALSAMIC_SCRIPTS) bind_path.append(sample_config["analysis"]["analysis_dir"]) bind_path.extend(get_fastq_bind_path(sample_config["analysis"]["fastq_path"])) diff --git a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule index 6b7ffaf09..c3844ddc4 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule @@ -1,6 +1,14 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 + +def get_pon_cnn(config): + if "pon_cnn" in config["panel"]: + return os.path.abspath(config["panel"]["pon_cnn"]) + else: + return None + + rule cnvkit_single: input: access_bed = config["reference"]["access_regions"], @@ -33,7 +41,8 @@ rule cnvkit_single: min_mapq= params.common.min_mapq, case_name = config["analysis"]["case_id"], sample_id = "TUMOR", - genome_version = GENOME_VERSION + genome_version = GENOME_VERSION, + pon = " " if get_pon_cnn(config) is None else get_pon_cnn(config) message: ("Run CNVkit pipeline for sample {params.case_name}," "while tumor purity/ploidy calculated using PureCN") @@ -66,18 +75,17 @@ cnvkit.py coverage {input.bamT} \ --processes {threads} \ --output {params.cnv_dir}/tumor.antitargetcoverage.cnn; -# Compile a coverage reference from the given list of files -cnvkit.py reference --output {params.cnv_dir}/FlatReference.cnn \ ---fasta {input.fasta} \ ---targets {params.cnv_dir}/targets.bed \ ---antitargets {params.cnv_dir}/antitarget_bed; # Combine the uncorrected target and antitarget coverage tables (.cnn) and # correct for biases in regional coverage and GC content, according to the given reference -cnvkit.py fix 
{params.cnv_dir}/tumor.targetcoverage.cnn \ -{params.cnv_dir}/tumor.antitargetcoverage.cnn \ -{params.cnv_dir}/FlatReference.cnn \ ---output {output.cnr}; +if [[ ! -f "{params.pon}" ]]; then +cnvkit.py reference --output {params.cnv_dir}/FlatReference.cnn --fasta {input.fasta} --targets {params.cnv_dir}/targets.bed --antitargets {params.cnv_dir}/antitarget_bed; +cnvkit.py fix {params.cnv_dir}/tumor.targetcoverage.cnn {params.cnv_dir}/tumor.antitargetcoverage.cnn {params.cnv_dir}/FlatReference.cnn --output {output.cnr}; +else +echo "PON reference exists- Using it for coverage correction" +cnvkit.py fix {params.cnv_dir}/tumor.targetcoverage.cnn {params.cnv_dir}/tumor.antitargetcoverage.cnn {params.pon} --output {output.cnr}; +fi + # Infer copy number segments from the given coverage table # segmentattion methods (-m): cbs: reccommended for mid-size target panels and exomes diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule index 73c1a5f2d..1369c2f37 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule @@ -1,15 +1,6 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -def get_pon(config): - """ return pon cli string, complete with file """ - if "PON" in config["analysis"]: - return os.path.abspath(config["analysis"]["PON"]) - else: - return None - - - rule vardict_tumor_only: input: fa = config["reference"]["reference_genome"], @@ -105,7 +96,6 @@ rule sentieon_TNhaplotyper_tumor_only: Path(benchmark_dir,'sentieon_TNhaplotyper_tumor_only_' + config["analysis"]["case_id"] + ".tsv").as_posix() params: tumor = "TUMOR", - pon = " " if get_pon(config) is None else " ".join(["--pon", get_pon(config)]), tmpdir= tempfile.mkdtemp(prefix=tmp_dir), sentieon_exec = config["SENTIEON_EXEC"], sentieon_lic = config["SENTIEON_LICENSE"], @@ -127,7 +117,7 @@ export 
SENTIEON_LICENSE={params.sentieon_lic}; -i {input.bam} \ --interval {input.interval} \ --algo TNhaplotyper \ ---tumor_sample {params.tumor} {params.pon} \ +--tumor_sample {params.tumor} \ --cosmic {input.cosmic} \ --dbsnp {input.dbsnp} {output.vcf}; diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index 6d3965923..9ee9ba8c1 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -352,6 +352,7 @@ class PanelModel(BaseModel): Attributes: capture_kit : Field(str(Path)); string representation of path to PANEL BED file chrom : Field(list(str)); list of chromosomes in PANEL BED + pon_cnn: Field(optional); Path where PON reference .cnn file is stored Raises: ValueError: @@ -361,11 +362,18 @@ class PanelModel(BaseModel): capture_kit: Optional[FilePath] chrom: Optional[List[str]] + pon_cnn: Optional[FilePath] @validator("capture_kit") def path_as_abspath_str(cls, value): return Path(value).resolve().as_posix() + @validator("pon_cnn") + def pon_abspath_as_str(cls, value): + if value: + return Path(value).resolve().as_posix() + return None + class PonBalsamicConfigModel(BaseModel): """Summarizes config models in preparation for export diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f3d90721e..18726ade1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,7 @@ Added: * Snakemake workflow to create canfam3 reference #843 * Call umi variants using TNscope in bed defined regions #821 * UMI duplication metrics to report in multiqc_picard_dups.json #844 +* Option to use PON reference in cnv calling for TGA tumor-only cases * QC default validation conditions (for not defined capture kits) #855 * SVdb to the varcall_py36 container #871 * SVdb to WGS workflow #871 @@ -33,6 +34,7 @@ Removed ^^^^^^^ * ``--qc-metrics/--no-qc-metrics`` flag from the ``balsamic report deliver`` command #833 +* Unused pon option for SNV calling with TNhaplotyper tumor-only * SV and CNV callers from annotation and filtering #871 * vcfanno from SV annotation diff --git 
a/tests/commands/config/test_config_sample.py b/tests/commands/config/test_config_sample.py index b16a6173b..922fde841 100644 --- a/tests/commands/config/test_config_sample.py +++ b/tests/commands/config/test_config_sample.py @@ -249,3 +249,35 @@ def test_config_graph_failed( ) assert case_result.exit_code == 1 + + +def test_pon_cnn_file( + invoke_cli, sample_fastq, analysis_dir, balsamic_cache, panel_bed_file +): + + # GIVEN CLI arguments including optional pon reference '.cnn' file + case_id = "test_sample_cnv" + tumor = sample_fastq["tumor"] + pon_file = "tests/test_data/references/panel/test_panel_ponn.cnn" + + result = invoke_cli( + [ + "config", + "case", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + "--analysis-dir", + analysis_dir, + "--pon-cnn", + pon_file, + "--balsamic-cache", + balsamic_cache, + ], + ) + # THEN program exits and checks for filepath + assert result.exit_code == 0 + assert Path(pon_file).exists() diff --git a/tests/commands/run/test_run_analysis.py b/tests/commands/run/test_run_analysis.py index 59146c119..b3d1d8e2e 100644 --- a/tests/commands/run/test_run_analysis.py +++ b/tests/commands/run/test_run_analysis.py @@ -92,3 +92,22 @@ def test_run_analysis_create_dir(invoke_cli, tumor_only_config): ) # THEN it should abort with error assert Path(re.sub("/$", ".1/", log_dir)).exists() + + +def test_run_analysis_ponpath(invoke_cli, tumor_only_pon_config): + # GIVEN a tumor-only with pon file in the config file + # WHEN running analysis + + with open(tumor_only_pon_config) as fh: + sample_config = json.load(fh) + + bind_path = ["/path_to_dummy/ash/"] + pon_fl = sample_config["panel"].get("pon_cnn") + pon_path = Path(pon_fl).resolve() + + if "pon_cnn" in sample_config["panel"]: + bind_path.append(str(pon_path)) + + # THEN it checks for existence of paths + assert pon_path.exists() + assert str(pon_path) in bind_path diff --git a/tests/conftest.py b/tests/conftest.py index cff2d544a..f1113daea 100644 --- 
a/tests/conftest.py +++ b/tests/conftest.py @@ -47,6 +47,7 @@ def config_files(): "analysis_single_umi": "BALSAMIC/config/analysis_single_umi.json", "panel_bed_file": "tests/test_data/references/panel/panel.bed", "background_variant_file": "tests/test_data/references/panel/background_variants.txt", + "pon_cnn": "tests/test_data/references/panel/test_panel_ponn.cnn", "pon_fastq_path": "tests/test_data/fastq/", } @@ -96,6 +97,11 @@ def background_variant_file(): return "tests/test_data/references/panel/background_variants.txt" +@pytest.fixture(scope="session") +def pon_cnn(): + return "tests/test_data/references/panel/test_panel_ponn.cnn" + + @pytest.fixture(scope="session") def sentieon_license(tmp_path_factory): """ @@ -336,7 +342,7 @@ def tumor_normal_wgs_config( @pytest.fixture(scope="session") def tumor_only_config( - tmpdir_factory, + tmp_path_factory, sample_fastq, balsamic_cache, background_variant_file, @@ -430,6 +436,55 @@ def tumor_only_wgs_config( return Path(analysis_dir, case_id, case_id + ".json").as_posix() +@pytest.fixture(scope="session") +def tumor_only_pon_config( + tmp_path_factory, + sample_fastq, + balsamic_cache, + analysis_dir, + panel_bed_file, + sentieon_license, + sentieon_install_dir, + pon_cnn, +): + """ + invokes balsamic config sample -t xxx to create sample config + for tumor only + """ + case_id = "sample_tumor_only_pon" + tumor = sample_fastq["tumor"] + + with mock.patch.dict( + MOCKED_OS_ENVIRON, + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, + ): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "case", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + "--analysis-dir", + analysis_dir, + "--balsamic-cache", + balsamic_cache, + "--pon-cnn", + pon_cnn, + ], + ) + + return Path(analysis_dir, case_id, case_id + ".json").as_posix() + + @pytest.fixture(scope="session") def sample_config(): """ diff --git 
a/tests/test_data/references/panel/test_panel_ponn.cnn b/tests/test_data/references/panel/test_panel_ponn.cnn new file mode 100644 index 000000000..1a62208ab --- /dev/null +++ b/tests/test_data/references/panel/test_panel_ponn.cnn @@ -0,0 +1,6 @@ +chromosome start end gene log2 depth gc rmask spread +12 49445437 49445694 KMT2D 0.100978 1741.99 0.622568 0.0934444 +12 51204937 51204938 ATF1 0.175285 1145.69 0 0.1385 +12 52055418 52199527 Antitarget 0.146543 0.271576 0.412285 0 0.172095 +12 52345527 52345618 ACVR1B -0.84849 837.93 0.747253 0.335741 +12 52346118 52357667 Antitarget 0.321353 0.344461 0.424452 0 0.520977 From 998e68c88ff7b7e0ae416369844a01d8abda8fdc Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 4 Apr 2022 22:11:23 +0200 Subject: [PATCH 42/58] update changelog --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 98841f9a4..299781b4d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,4 +1,4 @@ -[X.X.X] +[9.0.0] ======= Added: From cb10e60d5e634f12530a3c071d9a9bd643352981 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Wed, 6 Apr 2022 13:40:57 +0200 Subject: [PATCH 43/58] refactor: redundant files and varaibles from balsamic (#903) * remove MSK_impact files from config * remove strelka and pindel from config analysis * remove strelka and pindel from balsamic env variables * remove strelka,mutect and baserecalibator from cluster json * remove strelka and pindel from constants * update changelog --- BALSAMIC/config/MSK_impact.json | 119 ---------------------- BALSAMIC/config/MSK_impact_noStrelka.json | 75 -------------- BALSAMIC/config/analysis.json | 12 --- BALSAMIC/config/balsamic_env.yaml | 1 - BALSAMIC/config/cluster.json | 28 ----- BALSAMIC/constants/common.py | 1 - CHANGELOG.rst | 3 +- tests/conftest.py | 3 - tests/test_data/BALSAMIC_env.yaml | 2 - 9 files changed, 2 insertions(+), 242 deletions(-) delete mode 100644 BALSAMIC/config/MSK_impact.json delete mode 100644 
BALSAMIC/config/MSK_impact_noStrelka.json diff --git a/BALSAMIC/config/MSK_impact.json b/BALSAMIC/config/MSK_impact.json deleted file mode 100644 index d45582e58..000000000 --- a/BALSAMIC/config/MSK_impact.json +++ /dev/null @@ -1,119 +0,0 @@ -{ - "version": - "0.1.1", - "reference_documents": - ["https://www.accessdata.fda.gov/cdrh_docs/reviews/DEN170058.pdf"], - "base_line": {}, - "filters": { - "set_1": { - "VF_ratio": "5", - "name": "'MSK-IMPACT high confidence'", - "in_mvl": "T", - "variantcaller": ["MUTECT2"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - }, - "set_2": { - "VF_ratio": "5", - "name": "'MSK-IMPACT low confidence'", - "in_mvl": "F", - "variantcaller": ["MUTECT2"], - "TUMOR": { - "DP": "20", - "AD": "10", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - }, - "set_3": { - "VF_ratio": "5", - "name": "'Discovery High confidence'", - "in_mvl": "T", - "variantcaller": ["MUTECT2", "VARDICT", "STRELKA"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", 
"frameshift", "non-frameshift"] - } - }, - "set_4": { - "VF_ratio": "5", - "name": "'Discovery low confidence'", - "in_mvl": "F", - "variantcaller": ["MUTECT2", "VARDICT", "STRELKA"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - }, - "set_5": { - "VF_ratio": "1", - "name": "'Discovery extra'", - "in_mvl": "F", - "variantcaller": ["MUTECT2", "VARDICT", "STRELKA"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - } - } -} diff --git a/BALSAMIC/config/MSK_impact_noStrelka.json b/BALSAMIC/config/MSK_impact_noStrelka.json deleted file mode 100644 index 1db906629..000000000 --- a/BALSAMIC/config/MSK_impact_noStrelka.json +++ /dev/null @@ -1,75 +0,0 @@ -{ - "version": - "0.1.1", - "reference_documents": - ["https://www.accessdata.fda.gov/cdrh_docs/reviews/DEN170058.pdf"], - "base_line": {}, - "filters": { - "set_1": { - "VF_ratio": "5", - "name": "'High confidence set (in MSK-IMPACT)'", - "in_mvl": "T", - "variantcaller": ["MUTECT2"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" 
- ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - }, - "set_2": { - "VF_ratio": "5", - "name": "'Low confidence set (not in MSK-IMPACT)'", - "in_mvl": "F", - "variantcaller": ["MUTECT2"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - }, - "set_3": { - "VF_ratio": "5", - "name": "'Discovery High confidence (in MSK-IMPACT)'", - "in_mvl": "T", - "variantcaller": ["MUTECT2", "VARDICT"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - } - } -} diff --git a/BALSAMIC/config/analysis.json b/BALSAMIC/config/analysis.json index d09911f63..3e46e0b1d 100644 --- a/BALSAMIC/config/analysis.json +++ b/BALSAMIC/config/analysis.json @@ -22,14 +22,6 @@ "mutation": "somatic", "type": "SNV" }, - "pindel": { - "mutation": "somatic", - "type": "SV" - }, - "strelka": { - "mutation": "somatic", - "type": "SNV" - }, "mutect": { "mutation": "somatic", "type": "SNV" @@ -58,10 +50,6 @@ "mutation": "germline", "type": "SNV" }, - "strelka_germline": { - "mutation": "germline", - "type": "SNV" - }, "vcfmerge":{ "mutation": "somatic", "type": "SNV" diff --git a/BALSAMIC/config/balsamic_env.yaml b/BALSAMIC/config/balsamic_env.yaml index 720f27d9a..c3ecb6b34 100644 --- a/BALSAMIC/config/balsamic_env.yaml +++ b/BALSAMIC/config/balsamic_env.yaml @@ -21,7 +21,6 
@@ varcall_py36: - libiconv - svdb varcall_py27: - - strelka - manta varcall_cnvkit: - cnvkit diff --git a/BALSAMIC/config/cluster.json b/BALSAMIC/config/cluster.json index 84c0fdd62..edd6965f1 100644 --- a/BALSAMIC/config/cluster.json +++ b/BALSAMIC/config/cluster.json @@ -11,10 +11,6 @@ "time": "00:15:00", "n": 1 }, - "BaseRecalibrator": { - "time": "15:00:00", - "n": 10 - }, "CollectAlignmentSummaryMetrics": { "time": "03:30:00", "n": 8 @@ -35,10 +31,6 @@ "time": "03:30:00", "n": 8 }, - "RealignerTargetCreator": { - "time": "10:00:00", - "n": 10 - }, "bwa_mem": { "time": "08:00:00", "n": 16 @@ -100,18 +92,6 @@ "time": "00:15:00", "n": 4 }, - "mutect2_merge": { - "time": "01:30:00", - "n": 8 - }, - "mutect2_tumor_normal": { - "time": "24:00:00", - "n": 12 - }, - "mutect2_tumor_only": { - "time": "24:00:00", - "n": 12 - }, "sambamba_exon_depth": { "time": "02:30:00", "n": 8 @@ -180,14 +160,6 @@ "time": "06:00:00", "n": 10 }, - "strelka_germline": { - "time": "08:00:00", - "n": 10 - }, - "strelka_tumor_normal": { - "time": "10:00:00", - "n": 10 - }, "vardict_merge": { "time": "01:30:00", "n": 5 diff --git a/BALSAMIC/constants/common.py b/BALSAMIC/constants/common.py index a2455581a..eec892a84 100644 --- a/BALSAMIC/constants/common.py +++ b/BALSAMIC/constants/common.py @@ -74,7 +74,6 @@ "gatk": "varcall_py36", "vardict": "varcall_py36", "svdb": "varcall_py36", - "strelka": "varcall_py27", "manta": "varcall_py27", "cnvkit": "varcall_cnvkit", "delly": "delly", diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 98841f9a4..d20adbee5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -39,7 +39,8 @@ Removed * Unused pon option for SNV calling with TNhaplotyper tumor-only * SV and CNV callers from annotation and filtering #871 * vcfanno from SV annotation - +* Removed `MSK_impact` and `MSK_impact_noStrelka` json files from config +* Cleanup of `strelka`, `pindel` , `mutect2` variables from BALSAMIC [8.2.8] -------- diff --git a/tests/conftest.py b/tests/conftest.py index 
f1113daea..c28f51a44 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -515,8 +515,6 @@ def sample_config(): "vcf": { "manta": {"mutation": "somatic", "type": "SV"}, "vardict": {"mutation": "somatic", "type": "SNV"}, - "pindel": {"mutation": "somatic", "type": "SV"}, - "strelka": {"mutation": "somatic", "type": "SNV"}, "mutect": {"mutation": "somatic", "type": "SNV"}, "tnscope": {"mutation": "somatic", "type": "SNV"}, "tnsnv": {"mutation": "somatic", "type": "SNV"}, @@ -524,7 +522,6 @@ def sample_config(): "dnascope": {"mutation": "germline", "type": "SNV"}, "manta_germline": {"mutation": "germline", "type": "SV"}, "haplotypecaller": {"mutation": "germline", "type": "SNV"}, - "strelka_germline": {"mutation": "germline", "type": "SNV"}, }, "samples": { "S1_R": { diff --git a/tests/test_data/BALSAMIC_env.yaml b/tests/test_data/BALSAMIC_env.yaml index c108004f4..0e9038004 100644 --- a/tests/test_data/BALSAMIC_env.yaml +++ b/tests/test_data/BALSAMIC_env.yaml @@ -1,6 +1,5 @@ D_BALSAMIC-py27_test: - python -- strelka - manta - bcftools - tabix @@ -21,7 +20,6 @@ D_BALSAMIC-py36_test: - ensembl-vep - cnvkit - cutadapt -- pindel - multiqc - bedtools - svdb From b27c519a768f7c2e193e47a867e37e1b76a69da0 Mon Sep 17 00:00:00 2001 From: ivadym Date: Mon, 11 Apr 2022 16:23:06 +0200 Subject: [PATCH 44/58] feat: extract additional WGS QC metrics (#907) --- BALSAMIC/assets/scripts/collect_qc_metrics.py | 11 +++++--- BALSAMIC/constants/quality_check_reporting.py | 11 +++++++- BALSAMIC/utils/models.py | 11 +++++++- CHANGELOG.rst | 1 + .../qc/multiqc_data/multiqc_data.json | 26 +++++++++++++++++++ 5 files changed, 55 insertions(+), 5 deletions(-) mode change 100644 => 100755 BALSAMIC/assets/scripts/collect_qc_metrics.py diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py old mode 100644 new mode 100755 index 46377bd0d..015735f7b --- a/BALSAMIC/assets/scripts/collect_qc_metrics.py +++ 
b/BALSAMIC/assets/scripts/collect_qc_metrics.py @@ -49,6 +49,7 @@ def collect_qc_metrics( def capture_kit_resolve_type(capture_kit: str): """Resolves the capture_kit type (NoneType or String)""" + if capture_kit == "None": return None else: @@ -67,9 +68,12 @@ def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str: A source file that was used to produce a specific metric """ - # Use case: splits multiqc_picard_dups into ['multiqc', 'picard', 'dup'] in order to retrieve the - # ["report_data_sources"]["Picard"]["DuplicationMetrics"] values from multiqc_data.json - subtool_name = tool[:-1].split("_") + if tool == "multiqc_general_stats": + subtool_name = ["multiqc", "FastQC", "all_sections"] + else: + # Use case: splits multiqc_picard_dups into ['multiqc', 'picard', 'dup'] in order to retrieve the + # ["report_data_sources"]["Picard"]["DuplicationMetrics"] values from multiqc_data.json + subtool_name = tool[:-1].split("_") # Nested json fetching for source_tool in multiqc_data["report_data_sources"]: @@ -139,6 +143,7 @@ def extract(data, output_metrics, sample=None, source=None): if isinstance(data, dict): for k in data: + # Ignore UMI and reverse reads metrics if "umi" not in k: if k in requested_metrics: output_metrics.append( diff --git a/BALSAMIC/constants/quality_check_reporting.py b/BALSAMIC/constants/quality_check_reporting.py index dba06df8e..64b6a80ef 100644 --- a/BALSAMIC/constants/quality_check_reporting.py +++ b/BALSAMIC/constants/quality_check_reporting.py @@ -90,5 +90,14 @@ "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, }, }, - "wgs": {"FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}}, + "wgs": { + "MEAN_INSERT_SIZE": {"condition": None}, + "MEDIAN_COVERAGE": {"condition": None}, + "FastQC_mqc-generalstats-fastqc-percent_duplicates": {"condition": None}, + "PCT_15X": {"condition": None}, + "PCT_30X": {"condition": None}, + "PCT_60X": {"condition": None}, + "PCT_100X": {"condition": 
None}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, + }, } diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index dc0dd89c8..ec92b496e 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -741,6 +741,15 @@ class MetricModel(BaseModel): value: Any = ... condition: Optional[MetricConditionModel] = ... + @validator("name") + def validate_name(cls, name, values): + """Updates the name if the source is FastQC""" + + if "fastqc-percent_duplicates" in name: + return "PERCENT_DUPLICATION_R" + values["input"].split("_")[-2] + + return name + class MetricValidationModel(BaseModel): """Defines the metric validation model @@ -755,7 +764,7 @@ class MetricValidationModel(BaseModel): metrics: List[MetricModel] @validator("metrics", each_item=True) - def check_squares(cls, metric): + def validate_metrics(cls, metric): """Checks if a metric meets its filtering condition""" if metric.condition and not VALID_OPS[metric.condition.norm]( diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 976609584..13155ab92 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -15,6 +15,7 @@ Added: * Snakemake rule for creating `.cgh` files from `CNVkit` outputs #858 * SVdb to TGA workflow #871 * SVdb merge SV and CNV #871 +* Additional WGS metrics to be stored in ``_metrics_deliverables.yaml`` #907 Changed: ^^^^^^^^ diff --git a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json index e8b15f614..b06d8b626 100755 --- a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json +++ b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json @@ -11,6 +11,14 @@ "DuplicationMetrics": { "concatenated_tumor_XXXXXX_R": "tests/test_data/qc_files/analysis/bam/concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt" } + }, + "FastQC": { + "all_sections": { + "concatenated_tumor_XXXXXX_R_2": 
"tests/test_data/qc_files/analysis/fastqc/concatenated_tumor_XXXXXX_R_2_fastqc.zip", + "concatenated_normal_XXXXXX_R_1": "tests/test_data/qc_files/analysis/fastqc/concatenated_normal_XXXXXX_R_1_fastqc.zip", + "concatenated_normal_XXXXXX_R_2": "tests/test_data/qc_files/analysis/fastqc/concatenated_normal_XXXXXX_R_2_fastqc.zip", + "concatenated_tumor_XXXXXX_R_1": "tests/test_data/qc_files/analysis/fastqc/concatenated_tumor_XXXXXX_R_1_fastqc.zip" + } } }, "report_saved_raw_data": { @@ -76,6 +84,24 @@ "READ_PAIR_DUPLICATES": 18741892.0, "PERCENT_DUPLICATION": 0.391429 } + }, + "multiqc_general_stats": { + "concatenated_tumor_XXXXXX_R_2": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 15.03521942842923, + "FastQC_mqc-generalstats-fastqc-total_sequences": 600529762.0 + }, + "concatenated_normal_XXXXXX_R_1": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 14.426654287440797, + "FastQC_mqc-generalstats-fastqc-total_sequences": 464581551.0 + }, + "concatenated_normal_XXXXXX_R_2": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 14.214689357571501, + "FastQC_mqc-generalstats-fastqc-total_sequences": 464581551.0 + }, + "concatenated_tumor_XXXXXX_R_1": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 15.213739762327492, + "FastQC_mqc-generalstats-fastqc-total_sequences": 600529762.0 + } } } } From 096df69883a3643e165198cd472d1ecacc6f848f Mon Sep 17 00:00:00 2001 From: Khurram Maqbool Date: Wed, 13 Apr 2022 13:21:23 +0200 Subject: [PATCH 45/58] fix: remove stat files and add count files for scout upload VCFs (#899) * update changelog * update changelog * remove bcftools_stats * add bcftools_stats * update changelog * add bcftools +counts to end point VCFs * update black linter * remove bcftools counts for VCFs not to be uploaded to scout * remove bcftools counts for VCFs not to be uploaded to scout * update changelog --- .../varcaller_filter_tumor_normal.rule | 25 ++++++++++++------- .../varcaller_filter_tumor_only.rule | 25 
++++++++++++------- .../annotation/varcaller_sv_filter.rule | 9 ++++--- .../varcaller_wgs_filter_tumor_normal.rule | 16 +++++++----- .../varcaller_wgs_filter_tumor_only.rule | 16 +++++++----- BALSAMIC/snakemake_rules/annotation/vep.rule | 12 --------- CHANGELOG.rst | 2 ++ 7 files changed, 60 insertions(+), 45 deletions(-) diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule index 9a29e5869..24723d72b 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule @@ -9,7 +9,8 @@ rule bcftools_filter_vardict_tumor_normal: vcf = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.pass.vcf.gz", + vcf_pass_vardict = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.pass.stats" benchmark: Path(benchmark_dir, 'bcftools_filter_vardict_tumor_normal_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -42,9 +43,11 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_vardict} -O z {output.vcf_filtered}; -tabix -p vcf -f {output.vcf_pass}; +tabix -p vcf -f {output.vcf_pass_vardict}; + +bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts}; """ @@ -53,7 +56,7 @@ rule bcftools_filter_tnhaplotyper_tumor_normal: vcf = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.vcf.gz", - vcf_pass = vep_dir + 
"{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", + vcf_pass_tnhaplotyper = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", benchmark: Path(benchmark_dir, 'bcftools_filter_tnhaplotyper_tumor_normal' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -74,9 +77,10 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_tnhaplotyper} -O z {output.vcf_filtered}; + +tabix -p vcf -f {output.vcf_pass_tnhaplotyper}; -tabix -p vcf -f {output.vcf_pass}; """ @@ -85,7 +89,8 @@ rule bcftools_filter_TNscope_umi_tumor_normal: vcf = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.pass.vcf.gz", + vcf_pass_TNscope_umi = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.pass.stats" benchmark: Path(benchmark_dir, 'bcftools_filter_TNscope_umi_tumor_normal' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -106,7 +111,9 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_TNscope_umi} -O z {output.vcf_filtered}; + +tabix -p vcf -f {output.vcf_pass_TNscope_umi}; -tabix -p vcf -f {output.vcf_pass}; +bcftools +counts {output.vcf_pass_TNscope_umi} > {output.bcftools_counts}; """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule index 986c68d46..5ba6919e3 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule 
+++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule @@ -9,7 +9,8 @@ rule bcftools_filter_vardict_tumor_only: vcf = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.pass.vcf.gz", + vcf_pass_vardict = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.pass.stats" benchmark: Path(benchmark_dir, 'bcftools_filter_vardict_tumor_only_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -40,9 +41,11 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_vardict} -O z {output.vcf_filtered}; -tabix -p vcf -f {output.vcf_pass}; +tabix -p vcf -f {output.vcf_pass_vardict}; + +bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts}; """ @@ -51,7 +54,7 @@ rule bcftools_filter_tnhaplotyper_tumor_only: vcf = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", + vcf_pass_tnhaplotyper = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", benchmark: Path(benchmark_dir, 'bcftools_filter_tnhaplotyper_tumor_only' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -72,9 +75,10 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_tnhaplotyper} -O z {output.vcf_filtered}; + +tabix -p vcf -f {output.vcf_pass_tnhaplotyper}; -tabix -p vcf -f 
{output.vcf_pass}; """ @@ -83,7 +87,8 @@ rule bcftools_filter_TNscope_umi_tumor_only: vcf = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.pass.vcf.gz", + vcf_pass_TNscope_umi = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.pass.stats" benchmark: Path(benchmark_dir, 'bcftools_filter_TNscope_umi_tumor_only' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -104,7 +109,9 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_TNscope_umi} -O z {output.vcf_filtered}; + +tabix -p vcf -f {output.vcf_pass_TNscope_umi}; -tabix -p vcf -f {output.vcf_pass}; +bcftools +counts {output.vcf_pass_TNscope_umi} > {output.bcftools_counts}; """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule index efa975278..e1588a79a 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule @@ -6,7 +6,8 @@ rule bcftools_filter_svdb: input: vcf = vep_dir + "SV.somatic.{case_name}.svdb.all.vcf.gz", output: - vcf_sv_pass_svdb = vep_dir + "SV.somatic.{case_name}.svdb.all.filtered.pass.vcf.gz", + vcf_pass_svdb = vep_dir + "SV.somatic.{case_name}.svdb.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "SV.somatic.{case_name}.svdb.all.filtered.pass.stats" benchmark: benchmark_dir + "bcftools_filter_svdb_SV.somatic.{case_name}.svdb.vep.tsv" singularity: @@ -20,7 +21,9 @@ rule bcftools_filter_svdb: "Filtering svdb merged Manta and Delly results for PASS variants using 
bcftools for sample '{params.case_name}' " shell: """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass_svdb} -O z {input.vcf}; +bcftools view --threads {threads} -f .,PASS -o {output.vcf_pass_svdb} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass_svdb}; +tabix -p vcf -f {output.vcf_pass_svdb}; + +bcftools +counts {output.vcf_pass_svdb} > {output.bcftools_counts}; """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule index 59b9c0435..23a229a0d 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule @@ -9,7 +9,8 @@ rule bcftools_filter_tnscope_tumor_normal: vcf = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.pass.vcf.gz", + vcf_pass_tnscope = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.pass.stats" benchmark: Path(benchmark_dir, 'bcftools_filter_tnscope_tumor_normal_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -38,9 +39,11 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_tnscope} -O z {output.vcf_filtered}; -tabix -p vcf -f {output.vcf_pass}; +tabix -p vcf -f {output.vcf_pass_tnscope}; + +bcftools +counts {output.vcf_pass_tnscope} > {output.bcftools_counts}; """ @@ -50,7 +53,7 @@ rule bcftools_filter_tnhaplotyper_tumor_normal: wgs_calling_file = config["reference"]["wgs_calling_interval"] output: vcf_filtered = vep_dir + 
"{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", + vcf_pass_tnhaplotyper = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", benchmark: Path(benchmark_dir, 'bcftools_filter_tnhaplotyper_tumor_normal_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -73,7 +76,8 @@ bcftools view -f PASS --threads {threads} --regions-file {input.wgs_calling_file tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS --threads {threads} -O z -o {output.vcf_pass} {output.vcf_filtered} +bcftools view -f PASS --threads {threads} -O z -o {output.vcf_pass_tnhaplotyper} {output.vcf_filtered} + +tabix -p vcf -f {output.vcf_pass_tnhaplotyper} -tabix -p vcf -f {output.vcf_pass} """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule index a95af1602..d6bd339ed 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule @@ -54,7 +54,7 @@ rule bcftools_filter_tnhaplotyper_tumor_only: wgs_calling_file = config["reference"]["wgs_calling_interval"] output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", + vcf_pass_tnhaplotyper = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", benchmark: Path(benchmark_dir, 'bcftools_filter_tnhaplotyper_tumor_only_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -89,9 +89,10 @@ bcftools view -f PASS --threads {threads} --regions-file {input.wgs_calling_file tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS --threads {threads} -O z -o {output.vcf_pass} {output.vcf_filtered} +bcftools view 
-f PASS --threads {threads} -O z -o {output.vcf_pass_tnhaplotyper} {output.vcf_filtered} + +tabix -p vcf -f {output.vcf_pass_tnhaplotyper} -tabix -p vcf -f {output.vcf_pass} """ @@ -101,7 +102,8 @@ rule bcftools_intersect_tumor_only: tnhaplotyper = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.vcf.gz" output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.pass.vcf.gz", + vcf_pass_tnscope = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.pass.stats" benchmark: Path(benchmark_dir, 'bcftools_intersect_tumor_only_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -122,9 +124,11 @@ cp {params.vcf_dir}/0002.vcf.gz {output.vcf_filtered}; tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_tnscope} -O z {output.vcf_filtered}; + +tabix -p vcf -f {output.vcf_pass_tnscope}; -tabix -p vcf -f {output.vcf_pass}; +bcftools +counts {output.vcf_pass_tnscope} > {output.bcftools_counts}; rm -r {params.vcf_dir} """ diff --git a/BALSAMIC/snakemake_rules/annotation/vep.rule b/BALSAMIC/snakemake_rules/annotation/vep.rule index c4d586c99..d31cda315 100644 --- a/BALSAMIC/snakemake_rules/annotation/vep.rule +++ b/BALSAMIC/snakemake_rules/annotation/vep.rule @@ -10,8 +10,6 @@ rule vep_somatic_snv: cosmic = config["reference"]["cosmic"] output: vcf_all = temp(vep_dir + "SNV.somatic.{case_name}.{var_caller}.all.vcf.gz"), - vcf_summary = vep_dir + "SNV.somatic.{case_name}.{var_caller}.all.vcf.gz_summary.html", - bcftools_stats = vep_dir + "SNV.somatic.{case_name}.{var_caller}.all.stats" benchmark: Path(benchmark_dir, "vep_somatic_SNV.somatic.{case_name}.{var_caller}.tsv").as_posix() singularity: @@ -50,8 +48,6 @@ vep \ tabix -p vcf 
-f {output.vcf_all}; -bcftools stats {output.vcf_all} > {output.bcftools_stats}; - rm $tmpvcf; """ @@ -61,8 +57,6 @@ rule vep_somatic_sv: header = vcf_dir + "SV.somatic.{case_name}.svdb.sample_name_map", output: vcf_all = temp(vep_dir + "SV.somatic.{case_name}.svdb.all.vcf.gz"), - vcf_summary = vep_dir + "SV.somatic.{case_name}.svdb.all.vcf.gz_summary.html", - bcftools_stats = vep_dir + "SV.somatic.{case_name}.svdb.all.stats" benchmark: Path(benchmark_dir, "vep_somatic_SV.somatic.{case_name}.svdb.tsv").as_posix() singularity: @@ -92,9 +86,6 @@ vep \ {params.vep_defaults} \ tabix -p vcf -f {output.vcf_all}; - -bcftools stats {output.vcf_all} > {output.bcftools_stats}; - """ rule tmb_calculation: @@ -160,8 +151,6 @@ rule vep_germline: cosmic = config["reference"]["cosmic"] output: vcf_all = vep_dir + "{var_type}.germline.{sample}.{var_caller}.vcf.gz", - vcf_summary = vep_dir + "{var_type}.germline.{sample}.{var_caller}.vcf.gz_summary.html", - bcftools_stats = vep_dir + "{var_type}.germline.{sample}.{var_caller}.all.stats" benchmark: Path(benchmark_dir, "vep_germline_{var_type}.germline.{sample}.{var_caller}.tsv").as_posix() singularity: @@ -192,5 +181,4 @@ vep \ tabix -p vcf -f {output.vcf_all}; -bcftools stats {output.vcf_all} > {output.bcftools_stats}; """ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 72fba2f56..d921c3f84 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -15,6 +15,7 @@ Added: * Snakemake rule for creating `.cgh` files from `CNVkit` outputs #858 * SVdb to TGA workflow #871 * SVdb merge SV and CNV #871 +* bcftools counts to varcall filter rules #898 * Additional WGS metrics to be stored in ``_metrics_deliverables.yaml`` #907 Changed: @@ -39,6 +40,7 @@ Removed * vcfanno from SV annotation * Removed `MSK_impact` and `MSK_impact_noStrelka` json files from config * Cleanup of `strelka`, `pindel` , `mutect2` variables from BALSAMIC +* bcftools_stats from vep #898 [8.2.10] -------- From 580f4e2646baefd5f3f30563316f618078953d1d Mon Sep 17 00:00:00 2001 
From: ashwini06 Date: Tue, 19 Apr 2022 08:57:49 +0200 Subject: [PATCH 46/58] docs: balsamic methods description in readthedocs (#906) * add balsamic method description * add svdb to softwares * update changelog * adding more text * add wgs sentieon descr * add sentieon filter text * add spell and grammer checks * update changelog * spell and grammer checks for balsamic filters * small text replacemnets * add svdb citation * fix review comments-1 * fix review comments-2 * fix review comments-3 * edit methods text * few more text edits --- CHANGELOG.rst | 2 + docs/balsamic_filters.rst | 98 ++++++++++++++++++++++++++++++-------- docs/balsamic_methods.rst | 88 ++++++++++++++++++++++++++++++++++ docs/bioinfo_softwares.rst | 6 +++ docs/index.rst | 1 + 5 files changed, 175 insertions(+), 20 deletions(-) create mode 100644 docs/balsamic_methods.rst diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d921c3f84..c4935033f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -15,6 +15,8 @@ Added: * Snakemake rule for creating `.cgh` files from `CNVkit` outputs #858 * SVdb to TGA workflow #871 * SVdb merge SV and CNV #871 +* Readthedocs for BALSAMIC method descriptions #892 +* Readthedocs for BALSAMIC variant filters for WGS somatic callers #892 * bcftools counts to varcall filter rules #898 * Additional WGS metrics to be stored in ``_metrics_deliverables.yaml`` #907 diff --git a/docs/balsamic_filters.rst b/docs/balsamic_filters.rst index ac774b2c0..89cfbf4d2 100644 --- a/docs/balsamic_filters.rst +++ b/docs/balsamic_filters.rst @@ -3,9 +3,9 @@ BALSAMIC Variant Calling Algorithms *********************************** In BALSAMIC, various bioinfo tools are integrated for reporting somatic and germline variants. Also, the choice of these tools differs between the type of analysis, -for eg: `Target Genome Analysis (TGA)` or `Whole Genome Sequencing (WGS)`. Various filters (Pre-call filtering and Post-call filtering) are applied at different levels to report high-confidence variant calls. 
+e.g.: `Target Genome Analysis (TGA)` or analysis of `Whole Genome Sequencing (WGS)`. Various filters (Pre-call and Post-call filtering) are applied at different levels to report high-confidence variant calls. -* **Pre-call filtering** is where the variant-calling tool decides not to call a variant line to the VCF file, if the default filters did not pass the criteria. The set of default filters differs between the various variant-calling algorithms. +**Pre-call filtering** is where the variant-calling tool decides not to add a variant to the VCF file if the default filters of the variant-caller did not pass the filter criteria. The set of default filters differs between the various variant-calling algorithms. To know more about the pre-call filters used by the variant callers, please have a look at the VCF header of the particular variant-calling results. For example: @@ -13,26 +13,30 @@ For example: .. figure:: images/vcf_filters.png :width: 500px - Pre-call filters applied by the `Vardict` variant-caller is listed out in the VCF header + Pre-call filters applied by the `Vardict` variant-caller is listed in the VCF header. -In the VCF file, `FILTER` status is `PASS` if this position has passed all filters, i.e., a call is made at this position. Otherwise, -if the site has not passed all filters, a semicolon-separated list of codes for filters that fail. e.g., `p8;pSTD` might -indicate that at this site, the mean position in reads is less than 8 and position in reads has a standard deviation of 0. +In the VCF file, the `FILTER` status is `PASS` if this position has passed all filters, i.e., a call is made at this position. Contrary, +if the site has not passed any of the filters, a semicolon-separated list of those failed filter(s) will be appended to the `FILTER` column instead of `PASS`. E.g., `p8;pSTD` might +indicate that at this site, the mean position in reads is less than 8, and the position in reads has a standard deviation of 0. + + +.. note:: -.. 
important:: In BALSAMIC, this VCF file is named as `*.all.vcf.gz` (eg: `SNV.somatic..vardict.all.vcf.gz`) + .. figure:: images/filter_status.png :width: 500px Vardict Variant calls with different 'FILTER' status underlined in white line (`NM4.5`, `PASS`, `p8;pSTD`) -* **Post-call filtering** is where a variant is further filtered with criteria such as quality, depth, VAF etc with more stringent thresholds. + +**Post-call filtering** is where a variant is further filtered with quality, depth, VAF, etc., with more stringent thresholds. For `Post-call filtering`, in BALSAMIC we have applied various filtering criteria (`Vardict_filtering`_, `TNscope filtering (Tumor_normal)`_ ) depending on the analysis-type (TGS/WGS) and sample-type(tumor-only/tumor-normal). -.. important:: +.. note:: In BALSAMIC, this VCF file is named as `*.all.filtered.pass.vcf.gz` (eg: `SNV.somatic..vardict.all.filtered.pass.vcf.gz`) **Targeted Genome Analysis** @@ -41,7 +45,6 @@ For `Post-call filtering`, in BALSAMIC we have applied various filtering criteri Somatic Callers for reporting SNVs/INDELS ****************************************** - **Vardict** =========== @@ -51,7 +54,7 @@ These high-confidence variant calls are the final list of variants uploaded to S **Vardict_filtering** ^^^^^^^^^^^^^^^^^^^^^^ -Following are the set of criterias applied for filtering vardict results. Applies for both tumor-normal and tumor-only samples +Following is the set of criteria applied for filtering vardict results. It is used for both tumor-normal and tumor-only samples. *Mean Mapping Quality (MQ)*: Refers to the root mean square (RMS) mapping quality of all the reads spanning the given variant site. @@ -88,7 +91,7 @@ Following are the set of criterias applied for filtering vardict results. Applie GNOMADAF_popmax <= 0.005 (or) GNOMADAF_popmax == "." .. important:: - Additionally, for tumor-normal cases; the variant is excluded if it marked as 'germline' in the `STATUS` column of vcf file. 
+ Additionally, the variant is excluded for tumor-normal cases if marked as 'germline' in the `STATUS` column of the VCF file. **Whole Genome Sequencing (WGS)** ********************************** @@ -96,19 +99,20 @@ Following are the set of criterias applied for filtering vardict results. Applie **Sentieon's TNscope** ======================= -BALSAMIC utilizes `TNscope` algorithm for the variant calling of somatic SNV/INDELS in WGS samples. +BALSAMIC utilizes the `TNscope` algorithm for calling somatic SNVs and INDELS in WGS samples. The `TNscope `_ algorithm performs the somatic variant calling on the tumor-normal or the tumor-only samples, using a Haplotyper algorithm. **TNscope filtering (Tumor_normal)** ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The following filters are applied to the variants in TNscope raw VCF file (`SNV.somatic.$CASE_ID.tnscope.all.vcf.gz`). The variants scored as `PASS` are included in the final vcf file (`SNV.somatic.$CASE_ID.tnscope.all.filtered.pass.vcf.gz`). *Total Depth (DP)*: Refers to the overall read depth from all target samples supporting the variant call :: - DP(tumor) >= 10 || DP(normal) >= 10 + DP(tumor) >= 10 (or) DP(normal) >= 10 -*Allelic Depth (AD)*: Total reads supporting the ALT allele in tumor sample +*Allelic Depth (AD)*: Total reads supporting the ALT allele in the tumor sample :: @@ -129,6 +133,8 @@ The `TNscope `_ algor **TNscope filtering (tumor_only)** ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The somatic variants in TNscope raw VCF file (`SNV.somatic.$CASE_ID.tnscope.all.vcf.gz`) are filtered out for the genomic regions that are not reliable (eg: centromeric regions, non-chromosome contigs) to enhance the computation time. This WGS interval region file is collected from gatk_bundles ``_ +and following filters are applied. 
The variants that scored as `PASS` are considered for `Merging of TNscope and TNhaplotyper results (tumor_only)`_ *Total Depth (DP)*: Refers to the overall read depth supporting the variant call @@ -136,7 +142,7 @@ The `TNscope `_ algor DP(tumor) >= 10 -*Allelic Depth (AD)*: Total reads supporting the ALT allele in tumor sample +*Allelic Depth (AD)*: Total reads supporting the ALT allele in the tumor sample :: @@ -176,6 +182,58 @@ The `TNscope `_ algor SOR < 3 +**TNhaplotyper filtering (tumor_only)** +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The somatic variants in TNhaplotyper raw VCF file (`SNV.somatic.$CASE_ID.tnhaplotyper.all.vcf.gz`) are filtered out for the genomic regions that are not reliable (eg: centromeric regions, non-chromosome contigs) to enhance the computation time. This WGS interval region file is collected from gatk_bundles ``_ +and following filters are applied. The variants that scored as `PASS` are considered for `Merging of TNscope and TNhaplotyper results (tumor_only)`_ + + +*Total Depth (DP)*: Refers to the overall read depth from all target samples supporting the variant call + +:: + + DP(tumor) >= 10 (or) DP(normal) >= 10 + +*Allelic Depth (AD)*: Total reads supporting the ALT allele in the tumor sample + +:: + + AD(tumor) >= 3 + +*Allelic Frequency (AF)*: Fraction of the reads supporting the alternate allele + +:: + + Minimum AF(tumor) >= 0.05 + Maximum AF(tumor) < 1 + +*GNOMADAF_POPMAX*: Maximum Allele Frequency across populations + +:: + + GNOMADAF_popmax <= 0.001 (or) GNOMADAF_popmax == "." 
+ +*Normalized base quality scores*: The sum of base quality scores for each allele (QSS) is divided by the allelic depth of alt and ref alleles (AD) + +:: + + SUM(QSS)/SUM(AD) >= 20 + +*Read Counts*: Count of reads in a given (F1R2, F2R1) pair orientation supporting the alternate allele and reference alleles + +:: + + ALT_F1R2 > 0, ALT_F2R1 > 0 + REF_F1R2 > 0, REF_F2R1 > 0 + + +**Merging of TNscope and TNhaplotyper results (tumor_only)** +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The filtered somatic variants from `TNscope filtering (tumor_only)`_ and `TNhaplotyper filtering (tumor_only)`_ are merged using the `bcftools` intersect command to reduce the number of reported somatic variants for tumor-only samples. +Next, the somatic variants that are called by both variant-callers are reported as the final filtered list of variants (`SNV.somatic.{CASE_ID}.tnscope.all.filtered.pass.vcf.gz`). +The final VCF constitutes a high confidence set of somatic variants, which is delivered to the customer either by scout or caesar filesystem. + **Target Genome Analysis with UMI's into account** ************************************************** @@ -192,16 +250,16 @@ The following filter applies for both tumor-normal and tumor-only samples. minreads = 3,1,1 -Which means that at least `3` UMI tag groups should be ideally considered from both DNA strands, where a minimum of atleast `1` UMI tag group should exist in each of the single-stranded consensus reads. +It means that at least `3` UMI tag groups should be ideally considered from both DNA strands, where a minimum of at least `1` UMI tag group should exist in each of the single-stranded consensus reads. -*min_init_tumor_lod* : Log odds is the likelihood that the candidate mutation is real over the likelihood that the candidate mutation is a sequencing error before any read-based filters are applied. -minimum log odds for the candidate selection. 
TNscope default: `4` +*min_init_tumor_lod*: Log odds is the likelihood that the candidate mutation is real over the likelihood that the candidate mutation is a sequencing error before any read-based filters are applied. +Minimum log-odds for the candidate selection. TNscope default: `4`. In our UMI-workflow we reduced this setting to `0.5` :: min_init_tumor_lod = 0.5 -*min_tumor_lod* : minimum log odds in the final call of variants. TNscope default: `6.3` +*min_tumor_lod*: minimum log odds in the final call of variants. TNscope default: `6.3`. In our UMI-workflow we reduced this setting to `4.0` :: diff --git a/docs/balsamic_methods.rst b/docs/balsamic_methods.rst new file mode 100644 index 000000000..5c9ccd90e --- /dev/null +++ b/docs/balsamic_methods.rst @@ -0,0 +1,88 @@ +======================== +BALSAMIC METHODS +======================== + +Target Genome Analysis +~~~~~~~~~~~~~~~~~~~~~~ + +BALSAMIC :superscript:`1` (**version** = 8.2.8) was used to analyze the data from raw FASTQ files. +We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. +Adapter sequences and low-quality bases were trimmed using fastp v0.20.1 :superscript:`3`. +Trimmed reads were mapped to the reference genome hg19 using BWA MEM v0.7.15 :superscript:`4`. +The resulted SAM files were converted to BAM files and sorted using samtools v1.12 :superscript:`5`. +Duplicated reads were marked using Picard tools MarkDuplicate v2.25.0 :superscript:`6` +and promptly quality controlled using CollectHsMetrics, CollectInsertSizeMetrics and CollectAlignmentSummaryMetrics functionalities. +Results of the quality controlled steps were summarized by MultiQC v1.11 :superscript:`7`. +Small somatic mutations (SNVs and INDELs) were called for each sample using VarDict v2019.06.04 :superscript:`8`. 
+Apart from the Vardict filters to report the variants, the called variants were further filtered using the criteria +(*MQ >= 40, DP >= 100, VD >= 5, Minimum AF >= 0.007, Maximum AF < 1, GNOMADAF_popmax <= 0.005*). +Only those variants that fulfilled the filtering criteria and scored as `PASS` in the VCF file were reported. +Structural variants were called using Manta v1.6.0 :superscript:`9` and Delly v0.8.7 :superscript:`10`. +Copy number aberrations were called using CNVkit v0.9.4 :superscript:`11`. +The variant calls from CNVkit, Manta and Delly were merged using SVDB v2.6.0 :superscript:`12`. +All variants were annotated using Ensembl VEP v100.2 :superscript:`13`. We used vcfanno v0.3.3 :superscript:`14` +to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`. + +Whole Genome Analysis +~~~~~~~~~~~~~~~~~~~~~ +BALSAMIC :superscript:`1` (**version** = 8.2.8) was used to analyze the data from raw FASTQ files. +We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. +Adapter sequences and low-quality bases were trimmed using fastp v0.20.1 :superscript:`3`. +Trimmed reads were mapped to the reference genome hg19 using sentieon-tools :superscript:`15`. +The resulting SAM files were converted to BAM files and sorted using samtools v1.12 :superscript:`5`. +Duplicated reads were marked using Picard tools MarkDuplicates v2.25.0 :superscript:`6` +and promptly quality controlled using CollectMultipleMetrics and CollectWgsMetrics functionalities. +Results of the quality controlled steps were summarized by MultiQC v1.11 :superscript:`7`. +Small somatic mutations (SNVs and INDELs) were called for each sample using Sentieon TNscope and TNhaplotyper :superscript:`16`.
+The called variants were further filtered using the criteria (DP(tumor,normal) >= 10; AD(tumor) >= 3; AF(tumor) >= 0.05, Maximum AF(tumor) < 1; GNOMADAF_popmax <= 0.001; normalized base quality scores >= 20, read_counts of alt,ref alleles > 0). +The filtered variants from TNscope and TNhaplotyper were merged using bcftools isec functionality to reduce the number of variants for tumor-only samples. +Structural variants were called using Manta v1.6.0 :superscript:`9` and Delly v0.8.7 :superscript:`10`. +Copy number aberrations were called using ascatNgs v4.5.0 :superscript:`17` for tumor-normal samples. +The structural variant calls from Manta, Delly and ascatNgs were merged using SVDB v2.6.0 :superscript:`12`. +All variants were finally annotated using Ensembl VEP v100.2 :superscript:`13`. We used vcfanno v0.3.3 :superscript:`14` +to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`. + +============================= +UMI Data Analysis +============================= + +BALSAMIC :superscript:`1` (**version** = 8.2.8) was used to analyze the data from raw FASTQ files. +We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. +Adapter sequences and low-quality bases were trimmed using fastp v0.20.1 :superscript:`3`. +UMI tag extraction and consensus generation were performed using Sentieon tools v202010.02 :superscript:`15`. +The alignment of UMI extracted and consensus called reads to the human reference genome (hg19) was done by bwa-mem and +samtools using Sentieon utils. Consensus reads were filtered based on the number of minimum reads supporting each UMI tag group. +We applied a criteria filter of minimum reads `3,1,1`. It means that at least three UMI tag groups should be ideally considered from both DNA strands, +where a minimum of at least one UMI tag group should exist in each single-stranded consensus read.
+The filtered consensus reads were quality controlled using Picard CollectHsMetrics v2.25.0 :superscript:`6`. Results of the quality controlled steps were summarized by MultiQC v1.11 :superscript:`7`. +For each sample, somatic mutations were called using Sentieon TNscope :superscript:`16`, with non-default parameters for passing the final list of variants +(--min_tumor_allele_frac 0.0005, --filter_t_alt_frac 0.0005, --min_init_tumor_lod 0.5, --min_tumor_lod 4, --max_error_per_read 5, --pcr_indel_model NONE, GNOMADAF_popmax <= 0.001). +All variants were finally annotated using Ensembl VEP v100.2 :superscript:`13`. We used vcfanno v0.3.3 :superscript:`14` to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`. +For exact parameters used for each software, please refer to https://github.com/Clinical-Genomics/BALSAMIC. +We used three commercially available products from SeraCare [Material numbers: 0710-067110 :superscript:`19`, 0710-067211 :superscript:`20`, 0710-067312 :superscript:`21`] for validating the efficiency of the UMI workflow in identifying 14 mutation sites at known allelic frequencies. + + +**References** +~~~~~~~~~~~~~~~~ + +1. Foroughi-Asl, H., Jeggari, A., Maqbool, K., Ivanchuk, V., Elhami, K., & Wirta, V. BALSAMIC: Bioinformatic Analysis pipeLine for SomAtic MutatIons in Cancer (Version v8.2.8) [Computer software]. https://github.com/Clinical-Genomics/BALSAMIC +2. Babraham Bioinformatics - FastQC A Quality Control tool for High Throughput Sequence Data. Accessed June 22, 2020. https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ +3. Chen S, Zhou Y, Chen Y, Gu J. fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics. 2018;34(17):i884-i890. doi:10.1093/bioinformatics/bty560 +4. Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXiv:1303.3997v2 [q-bio.GN] +5. Li H, Handsaker B, Wysoker A, Fennell T, Ruan J., Homer N., Marth G., Abecasis G., Durbin R.
and 1000 Genome Project Data Processing Subgroup (2009) The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics, 25, 2078-9. doi: 10.1093/bioinformatics/btp352 +6. Picard Tools - By Broad Institute. Accessed June 22, 2020. https://broadinstitute.github.io/picard/ +7. Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016;32(19):3047-3048. doi:10.1093/bioinformatics/btw354 +8. Lai Z, Markovets A, Ahdesmaki M, Chapman B, Hofmann O, McEwen R, Johnson J, Dougherty B, Barrett JC, and Dry JR. VarDict: a novel and versatile variant caller for next-generation sequencing in cancer research. Nucleic Acids Res. 2016. https://doi.org/10.1093/nar/gkw227 +9. Chen, X. et al. (2016) Manta: rapid detection of structural variants and indels for germline and cancer sequencing applications. Bioinformatics, 32, 1220-1222. doi:10.1093/bioinformatics/btv710 +10. Tobias Rausch, Thomas Zichner, Andreas Schlattl, Adrian M. Stuetz, Vladimir Benes, Jan O. Korbel. DELLY: structural variant discovery by integrated paired-end and split-read analysis. Bioinformatics. 2012 Sep 15;28(18):i333-i339. https://doi.org/10.1093/bioinformatics/bts378 +11. Talevich, E, Shain, A.H, Botton, T, & Bastian, B.C. CNVkit: Genome-wide copy number detection and visualization from targeted sequencing. PLOS Computational Biology. 2016, 12(4):e1004873. https://doi.org/10.1371/journal.pcbi.1004873 +12. Jesper Eisfeldt et.al. TIDDIT, an efficient and comprehensive structural variant caller for massive parallel sequencing data. F1000 research. 2017. doi: 10.12688/f1000research.11168.2 +13. McLaren W, Gil L, Hunt SE, et al. The Ensembl Variant Effect Predictor. Genome Biology. 2016;17(1):122. +14. Pedersen BS, Layer RM, Quinlan AR. Vcfanno: fast, flexible annotation of genetic variants. Genome Biology. 2016;17(1):118. doi:10.1186/s13059-016-0973-5 +15. Donald Freed, Rafael Aldana, Jessica A. Weber, Jeremy S. 
Edwards. The Sentieon Genomics Tools - A fast and accurate solution to variant calling from next-generation sequence data. bioRxiv. 2017. doi: https://doi.org/10.1101/115717 +16. Donald Freed, Renke Pan, Rafael Aldana. TNscope: Accurate Detection of Somatic Mutations with Haplotype-based Variant Candidate Detection and Machine Learning Filtering. bioRxiv. doi: https://doi.org/10.1101/250647 +17. Keiran MR, Peter VL, David CW, David J, Andrew M, Adam PB, Jon WT, Patrick T, Serena Nik-Zainal, Peter J C. ascatNgs: Identifying Somatically Acquired Copy-Number Alterations from Whole-Genome Sequencing Data. Curr Protoc Bioinformatics. 2016. doi:https://doi.org/10.1002/cpbi.17 +18. Karczewski, K.J., Francioli, L.C., Tiao, G. et al. The mutational constraint spectrum quantified from variation in 141,456 humans. Nature 581, 434–443 (2020). https://doi.org/10.1038/s41586-020-2308-7 +19. https://www.seracare.com/Seraseq-ctDNA-Complete-Reference-Material-AF1-0710-0671/ +20. https://www.seracare.com/Seraseq-ctDNA-Complete-Reference-Material-AF05-0710-0672/ +21.
https://www.seracare.com/Seraseq-ctDNA-Complete-Reference-Material-AF01-0710-0673/ diff --git a/docs/bioinfo_softwares.rst b/docs/bioinfo_softwares.rst index a3192f9ab..00a17a036 100644 --- a/docs/bioinfo_softwares.rst +++ b/docs/bioinfo_softwares.rst @@ -114,6 +114,12 @@ sentieon-tools :Article: `Bioinformatics` ``_ :Version: `202010.02` +svdb +~~~~ +:Source code: `Github` ``_ +:Article: `F1000Res` ``_ +:Version: `2.6.0` + tabix ~~~~~ :Source code: `GitHub` ``_ diff --git a/docs/index.rst b/docs/index.rst index c4e72d940..17389a66d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -17,6 +17,7 @@ :maxdepth: 1 balsamic_filters + balsamic_methods bioinfo_softwares From b272c95008afaf3668f3072264fb43c64f094491 Mon Sep 17 00:00:00 2001 From: Khurram Maqbool Date: Tue, 19 Apr 2022 09:22:49 +0200 Subject: [PATCH 47/58] feat: add ascatngs copynumber (#914) * update changelog * add ascatngs copynumber file * add ascat to copynumber output * fix fpdf2 to version 2.4.6 --- .../variant_calling/somatic_sv_tumor_normal.rule | 5 ++++- CHANGELOG.rst | 1 + requirements-dev.txt | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule index ee632501b..cb00d426b 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule @@ -124,6 +124,7 @@ rule ascat_tumor_normal: chryloci= config["reference"]["ascat_chryloci"], output: final_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.vcf.gz", + ascat_copynumber = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".copynumber.txt.gz", sample_statistics = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.samplestatistics.txt", ascat_plots= expand( vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat." 
+ "{output_suffix}" + ".png", @@ -162,6 +163,8 @@ ascat.pl \ cp {params.tmpdir}/{params.tumor}.copynumber.caveman.vcf.gz {output.final_vcf}; +cp {params.tmpdir}/{params.tumor}.copynumber.txt.gz {output.ascat_copynumber} + cp {params.tmpdir}/{params.tumor}.samplestatistics.txt {output.sample_statistics}; cp {params.tmpdir}/{params.tumor}.ASCATprofile.png {output.ascat_plots[0]}; @@ -231,7 +234,7 @@ rule svdb_merge_tumor_normal: threads: get_threads(cluster_config, "svdb_merge_tumor_normal") message: - "Merging Manta and Delly results for PASS variants using svdb for sample '{params.case_name}' " + "Merging SV and CNV results for all variants using svdb for sample '{params.case_name}' " shell: """ svdb --merge --no_intra --bnd_distance 5000 --overlap 0.80 \ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c4935033f..b0d4f65fa 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -19,6 +19,7 @@ Added: * Readthedocs for BALSAMIC variant filters for WGS somatic callers #892 * bcftools counts to varcall filter rules #898 * Additional WGS metrics to be stored in ``_metrics_deliverables.yaml`` #907 +* ascatNGS copynumber file #897 Changed: ^^^^^^^^ diff --git a/requirements-dev.txt b/requirements-dev.txt index c2fedd5f7..f6d3737c3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,4 +4,4 @@ coveralls pylint black==22.3.0 pillow>=8.4.0 -fpdf2>=2.4.6 +fpdf2==2.4.6 From e8385cd34d7e0c46542cb66dbcd325b4c522aadc Mon Sep 17 00:00:00 2001 From: ivadym Date: Tue, 19 Apr 2022 10:56:10 +0200 Subject: [PATCH 48/58] refactor: remove QC delivery report wokflow (#913) --- BALSAMIC/assets/scripts/collect_qc_metrics.py | 3 +- BALSAMIC/commands/report/deliver.py | 70 ------------- ...ality_check_reporting.py => qc_metrics.py} | 52 ---------- BALSAMIC/utils/qc_metrics.py | 14 --- BALSAMIC/utils/qc_report.py | 98 ------------------- CHANGELOG.rst | 3 + tests/commands/report/test_deliver.py | 15 +-- tests/utils/test_qc_metrics.py | 36 +------ 8 files changed, 6 insertions(+), 285 
deletions(-) rename BALSAMIC/constants/{quality_check_reporting.py => qc_metrics.py} (61%) delete mode 100644 BALSAMIC/utils/qc_report.py diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py index 015735f7b..e376430f4 100755 --- a/BALSAMIC/assets/scripts/collect_qc_metrics.py +++ b/BALSAMIC/assets/scripts/collect_qc_metrics.py @@ -7,8 +7,7 @@ import click import yaml -from BALSAMIC.constants.quality_check_reporting import METRICS - +from BALSAMIC.constants.qc_metrics import METRICS from BALSAMIC.utils.models import MetricModel diff --git a/BALSAMIC/commands/report/deliver.py b/BALSAMIC/commands/report/deliver.py index 7fdc9dea5..81b050d5e 100644 --- a/BALSAMIC/commands/report/deliver.py +++ b/BALSAMIC/commands/report/deliver.py @@ -5,7 +5,6 @@ import yaml import click import snakemake -import datetime import subprocess from pathlib import Path @@ -16,7 +15,6 @@ from BALSAMIC.utils.cli import convert_deliverables_tags from BALSAMIC.utils.rule import get_result_dir from BALSAMIC.utils.exc import BalsamicError -from BALSAMIC.utils.qc_report import render_html, report_data_population from BALSAMIC.constants.workflow_params import VCF_DICT from BALSAMIC.constants.workflow_rules import DELIVERY_RULES @@ -33,25 +31,6 @@ required=True, help="Sample config file. Output of balsamic config sample", ) -@click.option( - "--sample-id-map", - required=False, - help=( - "Separated internal sample ID with external ID. Use comma for" - "multiple samples. These IDs MUST exist in sample-config." - "Syntax: internal_id:sample_type:external_id" - ". e.g. ACC1:tumor:KS454,ACC2:normal:KS556" - ), -) -@click.option( - "--case-id-map", - required=False, - help=( - "Separated internal case ID with external ID." - "Syntax: gene_panel_name:external_id" - ". e.g. 
gmck-solid:KSK899:apptag" - ), -) @click.option( "-a", "--analysis-type", @@ -92,8 +71,6 @@ def deliver( rules_to_deliver, delivery_mode, disable_variant_caller, - sample_id_map, - case_id_map, ): """ cli for deliver sub-command. @@ -127,45 +104,9 @@ def deliver( if analysis_type else sample_config_dict["analysis"]["analysis_type"] ) - sequencing_type = sample_config_dict["analysis"]["sequencing_type"] reference_genome = sample_config_dict["reference"]["reference_genome"] snakefile = get_snakefile(analysis_type, reference_genome) - balsamic_qc_report = None - if sequencing_type != "wgs" and sample_id_map and case_id_map: - case_id_map = case_id_map.split(":") - sample_id_map = sample_id_map.split(",") - sample_map = dict() - sample_type = dict() - for sample in sample_id_map: - lims_id = sample.split(":")[0] - sample_map[lims_id] = sample.split(":")[1] - sample_type[lims_id] = sample.split(":")[2] - - meta = dict() - meta["sample_map"] = sample_map - meta["sample_type"] = sample_type - meta["now"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") - meta["config_date"] = sample_config_dict["analysis"]["config_creation_date"] - meta["internal_case_id"] = case_name - meta["gene_panel_name"] = case_id_map[0] - meta["case_name"] = case_id_map[1] - meta["apptag"] = case_id_map[2] - - collected_qc = read_yaml( - os.path.join( - sample_config_dict["analysis"]["result"], - "qc", - sample_config_dict["analysis"]["case_id"] - + "_metrics_deliverables.yaml", - ) - ) - meta = report_data_population(collected_qc=collected_qc, meta=meta) - balsamic_qc_report = os.path.join( - yaml_write_directory, case_name + "_qc_report.html" - ) - balsamic_qc_report = render_html(meta=meta, html_out=balsamic_qc_report) - report_file_name = os.path.join( yaml_write_directory, sample_config_dict["analysis"]["case_id"] + "_report.html" ) @@ -246,17 +187,6 @@ def deliver( "id": case_name, } ) - # Add balsamic_qc_report - if balsamic_qc_report: - delivery_json["files"].append( - { - "path": 
balsamic_qc_report, - "step": "balsamic_delivery", - "format": get_file_extension(balsamic_qc_report), - "tag": ["coverage-qc-report"], - "id": case_name, - } - ) write_json(delivery_json, delivery_file_name) with open(delivery_file_name + ".yaml", "w") as fn: diff --git a/BALSAMIC/constants/quality_check_reporting.py b/BALSAMIC/constants/qc_metrics.py similarity index 61% rename from BALSAMIC/constants/quality_check_reporting.py rename to BALSAMIC/constants/qc_metrics.py index 64b6a80ef..509940587 100644 --- a/BALSAMIC/constants/quality_check_reporting.py +++ b/BALSAMIC/constants/qc_metrics.py @@ -1,55 +1,3 @@ -REPORT_MODEL = { - "qc": { - "MEDIAN_TARGET_COVERAGE": { - "sv": "Mediansekvensdjup [x]", - "en": "Median sequencing depth [x]", - "decimal": 0, - }, - "FOLD_80_BASE_PENALTY": { - "sv": "Fold 80 base penalty", - "en": "Fold 80 base penalty", - "decimal": 2, - }, - "MEAN_INSERT_SIZE": { - "sv": "Fragmentlängd, medel [baspar]", - "en": "Mean insert size [base pair]", - "decimal": 2, - }, - }, - "coverage": { - "PCT_TARGET_BASES_50X": { - "sv": "Täckningsgrad [50X]", - "en": "Target coverage [50X]", - "decimal": 2, - "as_percent": True, - }, - "PCT_TARGET_BASES_100X": { - "sv": "Täckningsgrad [100X]", - "en": "Target coverage [100X]", - "decimal": 2, - "as_percent": True, - }, - "PCT_TARGET_BASES_250X": { - "sv": "Täckningsgrad [250X]", - "en": "Target coverage [250X]", - "decimal": 2, - "as_percent": True, - }, - "PCT_TARGET_BASES_500X": { - "sv": "Täckningsgrad [500X]", - "en": "Target coverage [500X]", - "decimal": 2, - "as_percent": True, - }, - "PCT_TARGET_BASES_1000X": { - "sv": "Täckningsgrad [1000X]", - "en": "Target coverage [1000X]", - "decimal": 2, - "as_percent": True, - }, - }, -} - METRICS = { "targeted": { "default": { diff --git a/BALSAMIC/utils/qc_metrics.py b/BALSAMIC/utils/qc_metrics.py index 3d69bafad..17d4b281f 100644 --- a/BALSAMIC/utils/qc_metrics.py +++ b/BALSAMIC/utils/qc_metrics.py @@ -1,20 +1,6 @@ -from typing import Union - from 
BALSAMIC.utils.models import MetricValidationModel -def get_qc_metric_value( - metrics: dict, sample_id: str, metric_name: str -) -> Union[float, None]: - """Extracts the metrics value associated to a specific sample_id and metric_name""" - - for metric in metrics: - if metric["id"] == sample_id and metric["name"] == metric_name: - return metric["value"] - - return None - - def validate_qc_metrics(metrics: dict) -> dict: """Returns a set of validated QC metrics""" diff --git a/BALSAMIC/utils/qc_report.py b/BALSAMIC/utils/qc_report.py deleted file mode 100644 index 9f2907404..000000000 --- a/BALSAMIC/utils/qc_report.py +++ /dev/null @@ -1,98 +0,0 @@ -from markdown import Markdown -from jinja2 import Environment, FileSystemLoader -from datetime import datetime -from pathlib import Path - -from BALSAMIC import __version__ as balsamic_version -from BALSAMIC.constants.quality_check_reporting import REPORT_MODEL -from BALSAMIC.utils.qc_metrics import get_qc_metric_value - - -def report_data_population(collected_qc: list, meta: dict, lang: str = "sv") -> dict: - """populates a metadata dictionary that contains qc and case/sample information""" - meta = { - **meta, - **{ - "title": "Kvalitetsrapport", - "subtitle": "Klinisk sekvensering av cancerprover", - "footnote": "Slut på rapporten", - "bioinformatic": f"BALSAMIC version {balsamic_version}", - "qc_table_content": {}, - "coverage_table_content": {}, - }, - } - - meta["qc_table_header"] = [v[lang] for x, v in REPORT_MODEL["qc"].items()] - meta["coverage_table_header"] = [ - v[lang] for x, v in REPORT_MODEL["coverage"].items() - ] - - for sample in collected_qc: - lims_id = sample["id"] - sample_qc = [meta["sample_map"][lims_id], meta["sample_type"][lims_id]] - sample_cov = [meta["sample_map"][lims_id], meta["sample_type"][lims_id]] - - sample_qc = sample_qc + parse_collected_qc( - collected_qc=collected_qc, model_param="qc", sample_id=lims_id - ) - sample_cov = sample_cov + parse_collected_qc( - 
collected_qc=collected_qc, model_param="coverage", sample_id=lims_id - ) - - meta["qc_table_content"][lims_id] = sample_qc - meta["coverage_table_content"][lims_id] = sample_cov - - return meta - - -def parse_collected_qc(collected_qc: list, model_param: str, sample_id: str) -> list: - """parses collect qc and returns model_param""" - parsed_qc = list() - - for qc_item, qc_value in REPORT_MODEL[model_param].items(): - decimal_point = qc_value["decimal"] - qc_to_report = get_qc_metric_value(collected_qc, sample_id, qc_item) - if "as_percent" in qc_value: - qc_to_report = qc_to_report * 100 - qc_to_report = str(round(qc_to_report, decimal_point)) - if "as_percent" in qc_value: - qc_to_report = f"{qc_to_report} %" - parsed_qc.append(qc_to_report) - - return parsed_qc - - -def render_html(meta: dict, html_out: str): - """renders html report from template""" - - p = Path(__file__).parents[1] - template_path = Path(p, "assets", "report_template").as_posix() - - report_body = render_body(meta=meta, template_path=template_path) - - md_template = Markdown(extensions=["meta", "tables", "def_list", "fenced_code"]) - - markdown_text = md_template.convert(source=report_body) - - env = Environment(loader=FileSystemLoader(template_path), autoescape=False) - - template = env.get_template("balsamic_report.html") - - html_report = template.render(body=markdown_text, meta=meta) - - with open(html_out, "w") as f: - f.write(html_report) - return html_out - - -def render_body( - meta: dict, template_path: str, body_template_md: str = "balsamic_report.md" -) -> str: - """renders text body of the report from a markdown template""" - env = Environment(loader=FileSystemLoader(template_path), autoescape=False) - - template = env.get_template(body_template_md) - - report_body = template.render(meta=meta) - - return report_body diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b0d4f65fa..289c72c16 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -33,6 +33,7 @@ Fixed: * Automate balsamic 
version for readthedocs install page #888 * ``collect_qc_metrics.py`` failing for WGS cases with empty ``capture_kit`` argument #850 * QC metric validation for different panel bed version #855 +* Fixed development version of ``fpdf2`` to ``2.4.6`` #878 Removed ^^^^^^^ @@ -44,6 +45,8 @@ Removed * Removed `MSK_impact` and `MSK_impact_noStrelka` json files from config * Cleanup of `strelka`, `pindel` , `mutect2` variables from BALSAMIC * bcftools_stats from vep #898 +* QC delivery report workflow (generating the ``_qc_report.html`` file) #878 +* ``--sample-id-map`` and ``--case-id-map`` flags from the ``balsamic report deliver`` command #878 [8.2.10] -------- diff --git a/tests/commands/report/test_deliver.py b/tests/commands/report/test_deliver.py index 97e910c73..43477c106 100644 --- a/tests/commands/report/test_deliver.py +++ b/tests/commands/report/test_deliver.py @@ -31,10 +31,6 @@ def test_deliver_tumor_only_panel( "deliver", "--sample-config", tumor_only_config, - "--sample-id-map", - "tumor:tumor:KS454", - "--case-id-map", - "gmck-solid:KSK899:apptag", "--disable-variant-caller", "cnvkit", ] @@ -93,16 +89,7 @@ def test_deliver_tumor_normal_panel( ), caplog.at_level(logging.DEBUG): # WHEN running analysis result = invoke_cli( - [ - "report", - "deliver", - "--sample-config", - tumor_normal_config, - "--sample-id-map", - "tumor:tumor:KS454,normal:normal:KS999", - "--case-id-map", - "gmck-solid:KSK899:apptag", - ] + ["report", "deliver", "--sample-config", tumor_normal_config] ) # THEN it should run without any error diff --git a/tests/utils/test_qc_metrics.py b/tests/utils/test_qc_metrics.py index 760080f4f..46657301e 100644 --- a/tests/utils/test_qc_metrics.py +++ b/tests/utils/test_qc_metrics.py @@ -1,38 +1,4 @@ -from BALSAMIC.utils.qc_metrics import ( - validate_qc_metrics, - get_qc_metric_value, -) - - -def test_get_qc_metric_value(qc_extracted_metrics): - """test QC metric value extraction""" - - # GIVEN the input parameters - sample_id = "tumor" - 
metric_name = "MEDIAN_TARGET_COVERAGE" - - # GIVEN an expected value - expected_value = 2393.0 - - # WHEN calling the function - metric_value = get_qc_metric_value(qc_extracted_metrics, sample_id, metric_name) - - # THEN check if the retrieved value corresponds to the expected one - assert metric_value == expected_value - - -def test_get_qc_metric_value_invalid_metric(qc_extracted_metrics): - """test QC metric value extraction for an invalid metric name""" - - # GIVEN the input parameters - sample_id = "tumor" - metric_name = "NOT_A_METRIC" - - # WHEN calling the function - metric_value = get_qc_metric_value(qc_extracted_metrics, sample_id, metric_name) - - # THEN check if the retrieved value is None - assert metric_value is None +from BALSAMIC.utils.qc_metrics import validate_qc_metrics def test_validate_qc_metrics(qc_extracted_metrics): From 9ea3dcfdfb87bfa08787ddf2f845cdad3d1bd5a5 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Wed, 20 Apr 2022 10:36:12 +0200 Subject: [PATCH 49/58] feat: add qc only workflow (#847) * feat: add qc only workflow * conftest qc config * formatting * fixing code smells and attemps to reduce duplication * formatting * remove qc_config container version * formatting * fix qc test * fix qc test * test fixing * test fixing * formatting * draft pytest qc * feat: add qc only workflow * conftest qc config * formatting * fixing code smells and attemps to reduce duplication * formatting * remove qc_config container version * formatting * fix qc test * fix qc test * test fixing * test fixing * formatting * draft pytest qc * black linting + removing test_config_qc_graph_value_error * add container version in qc * add tests for QC graph generation and ValueError * black * upgrade black because of click update and remove unused config_dick in conftest * update black in github action * remove benchmark plot for qc * address duplication * changelog and conftest update * Apply suggestions from code 
review Co-authored-by: ashwini06 * balck upgrade in changelog * remove variable germline_call_samples as only qc * Update BALSAMIC/workflows/QC.smk Remove chromlist Co-authored-by: ashwini06 * Revert "changelog and conftest update" This reverts commit fabddafe5bb4b9e016fb17b911668215bbbc2c14. * remove umiworkflow and vcf from qc * remove bedchrom from qc workflow * update to CHANGELOG from black version * add canfam3 to workflow qc test * remove unused import * remove wgs from qc workflow and modify command to qc_panel * add qc_panel to click * adapt conftest to qc_panel * adapt to qc_panel * use qc_panel also in get_snakemake instead of only qc and models.py comments * modidy test_utils to qc_panel * qc metrics * keep chr in refGene as ref fasta uses chr * add more memory to picard for canfam3 because dogs have 38 chromosomes * no analysis_specific_results in QC.sml * stop removing chr with canfam3 references Co-authored-by: ashwini06 --- BALSAMIC/commands/config/base.py | 2 + BALSAMIC/commands/config/qc.py | 213 ++++++++++++++ BALSAMIC/constants/common.py | 2 +- .../quality_control/picard.rule | 9 +- BALSAMIC/utils/cli.py | 2 + BALSAMIC/utils/models.py | 7 +- BALSAMIC/workflows/QC.smk | 154 ++++++++++ BALSAMIC/workflows/balsamic.smk | 12 +- BALSAMIC/workflows/reference-canfam3.smk | 31 +- CHANGELOG.rst | 37 +-- tests/commands/config/test_config_qc.py | 264 ++++++++++++++++++ tests/conftest.py | 83 ++++++ tests/test_workflow.py | 63 ++++- tests/utils/test_utils.py | 10 +- 14 files changed, 827 insertions(+), 62 deletions(-) create mode 100644 BALSAMIC/commands/config/qc.py create mode 100644 BALSAMIC/workflows/QC.smk create mode 100644 tests/commands/config/test_config_qc.py diff --git a/BALSAMIC/commands/config/base.py b/BALSAMIC/commands/config/base.py index 7744c765a..f0530ca7b 100644 --- a/BALSAMIC/commands/config/base.py +++ b/BALSAMIC/commands/config/base.py @@ -3,6 +3,7 @@ from BALSAMIC.commands.config.case import case_config as case_command from 
BALSAMIC.commands.config.pon import pon_config as pon_command +from BALSAMIC.commands.config.qc import qc_config as qc_command @click.group() @@ -14,3 +15,4 @@ def config(context): config.add_command(case_command) config.add_command(pon_command) +config.add_command(qc_command) diff --git a/BALSAMIC/commands/config/qc.py b/BALSAMIC/commands/config/qc.py new file mode 100644 index 000000000..e56f7f6f1 --- /dev/null +++ b/BALSAMIC/commands/config/qc.py @@ -0,0 +1,213 @@ +import os +import json +import logging +from pathlib import Path + +import click + +from BALSAMIC import __version__ as balsamic_version +from BALSAMIC.utils.cli import ( + get_sample_dict, + get_panel_chrom, + get_bioinfo_tools_version, + create_fastq_symlink, + generate_graph, +) +from BALSAMIC.constants.common import ( + CONTAINERS_CONDA_ENV_PATH, + BIOINFO_TOOL_ENV, +) +from BALSAMIC.utils.models import BalsamicConfigModel + + +LOG = logging.getLogger(__name__) + + +@click.command( + "qc_panel", + short_help="Create a sample config file for panel cases to perform QC", +) +@click.option( + "--case-id", + required=True, + help="Sample id that is used for reporting, \ + naming the analysis jobs, and analysis path", +) +@click.option( + "--quality-trim/--no-quality-trim", + default=True, + show_default=True, + is_flag=True, + help="Trim low quality reads in fastq", +) +@click.option( + "--adapter-trim/--no-adapter-trim", + default=True, + show_default=True, + is_flag=True, + help="Trim adapters from reads in fastq", +) +@click.option( + "-p", + "--panel-bed", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="Panel bed file for variant calling.", +) +@click.option( + "--balsamic-cache", + type=click.Path(exists=True, resolve_path=True), + required=True, + help="Path to BALSAMIC cache", +) +@click.option( + "--container-version", + show_default=True, + default=balsamic_version, + type=click.Choice(["develop", "master", balsamic_version]), + help="Container for BALSAMIC version 
to download", +) +@click.option( + "--analysis-dir", + type=click.Path(exists=True, resolve_path=True), + required=True, + help="Root analysis path to store analysis logs and results. \ + The final path will be analysis-dir/sample-id", +) +@click.option( + "-t", + "--tumor", + type=click.Path(exists=True, resolve_path=True), + required=True, + multiple=True, + help="Fastq files for tumor sample.", +) +@click.option( + "-n", + "--normal", + type=click.Path(exists=True, resolve_path=True), + required=False, + multiple=True, + help="Fastq files for normal sample.", +) +@click.option( + "--umi/--no-umi", + default=True, + show_default=True, + is_flag=True, + help=("UMI processing steps for samples with UMI tags."), +) +@click.option( + "--umi-trim-length", + default=5, + show_default=True, + type=int, + help="Trim N bases from reads in fastq", +) +@click.option("--tumor-sample-name", help="Tumor sample name") +@click.option("--normal-sample-name", help="Normal sample name") +@click.option( + "-g", + "--genome-version", + default="hg19", + type=click.Choice(["hg19", "hg38", "canfam3"]), + help=( + "Genome version to prepare reference. 
Path to genome" + "will be /genome_version" + ), +) +@click.pass_context +def qc_config( + context, + case_id, + umi, + umi_trim_length, + adapter_trim, + quality_trim, + panel_bed, + analysis_dir, + tumor, + normal, + tumor_sample_name, + normal_sample_name, + genome_version, + balsamic_cache, + container_version, +): + + if container_version: + balsamic_version = container_version + + try: + samples = get_sample_dict( + tumor=tumor, + normal=normal, + tumor_sample_name=tumor_sample_name, + normal_sample_name=normal_sample_name, + ) + except AttributeError: + LOG.error(f"File name is invalid, use convention [SAMPLE_ID]_R_[1,2].fastq.gz") + raise click.Abort() + + reference_config = os.path.join( + balsamic_cache, balsamic_version, genome_version, "reference.json" + ) + with open(reference_config, "r") as f: + reference_dict = json.load(f)["reference"] + + config_collection_dict = BalsamicConfigModel( + QC={ + "quality_trim": quality_trim, + "adapter_trim": adapter_trim, + "umi_trim": umi if panel_bed else False, + "umi_trim_length": umi_trim_length, + }, + analysis={ + "case_id": case_id, + "analysis_dir": analysis_dir, + "analysis_type": "qc_panel", + "sequencing_type": "targeted" if panel_bed else "wgs", + }, + reference=reference_dict, + singularity=os.path.join(balsamic_cache, balsamic_version, "containers"), + samples=samples, + bioinfo_tools=BIOINFO_TOOL_ENV, + bioinfo_tools_version=get_bioinfo_tools_version( + BIOINFO_TOOL_ENV, CONTAINERS_CONDA_ENV_PATH + ), + panel={ + "capture_kit": panel_bed, + "chrom": get_panel_chrom(panel_bed), + } + if panel_bed + else None, + umiworkflow=False, + ).dict(by_alias=True, exclude_none=True) + LOG.info("QC config file generated successfully") + + Path.mkdir( + Path(config_collection_dict["analysis"]["fastq_path"]), + parents=True, + exist_ok=True, + ) + LOG.info("Directories created successfully") + + create_fastq_symlink( + casefiles=(tumor + normal), + symlink_dir=Path(config_collection_dict["analysis"]["fastq_path"]), 
+ ) + LOG.info(f"Symlinks generated successfully") + + config_path = Path(analysis_dir) / case_id / (case_id + "_QC.json") + with open(config_path, "w+") as fh: + fh.write(json.dumps(config_collection_dict, indent=4)) + LOG.info(f"QC config file saved successfully - {config_path}") + + try: + generate_graph(config_collection_dict, config_path) + LOG.info(f"BALSAMIC QC Workflow has been configured successfully!") + except ValueError: + LOG.error( + f'BALSAMIC QC dag graph generation failed - {config_collection_dict["analysis"]["dag"]}', + ) + raise click.Abort() diff --git a/BALSAMIC/constants/common.py b/BALSAMIC/constants/common.py index eec892a84..ecf9eca5e 100644 --- a/BALSAMIC/constants/common.py +++ b/BALSAMIC/constants/common.py @@ -36,7 +36,7 @@ # Analysis related constants MUTATION_CLASS = ["somatic", "germline"] MUTATION_TYPE = ["SNV", "SV", "CNV"] -ANALYSIS_TYPES = ["paired", "single", "qc", "pon"] +ANALYSIS_TYPES = ["paired", "single", "qc_panel", "pon"] WORKFLOW_SOLUTION = ["BALSAMIC", "Sentieon", "DRAGEN", "Sentieon_umi"] SEQUENCING_TYPE = ["wgs", "targeted"] diff --git a/BALSAMIC/snakemake_rules/quality_control/picard.rule b/BALSAMIC/snakemake_rules/quality_control/picard.rule index 86f660dd5..6a159629b 100644 --- a/BALSAMIC/snakemake_rules/quality_control/picard.rule +++ b/BALSAMIC/snakemake_rules/quality_control/picard.rule @@ -1,7 +1,10 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 - +if "canfam3" in config['reference']['reference_genome']: + memory = "20g" +else: + memory = "16g" rule picard_CollectHsMetrics: input: @@ -16,7 +19,7 @@ rule picard_CollectHsMetrics: singularity: Path(singularity_image, config["bioinfo_tools"].get("picard") + ".sif").as_posix() params: - mem = "16g", + mem = memory, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), baitsetname = os.path.basename(config["panel"]["capture_kit"]), sample = '{sample}' @@ -103,7 +106,7 @@ rule picard_CollectInsertSizeMetrics: "Calculating picard InsertSize metrics for sample 
'{params.sample}'" shell: """ -mkdir -p {params.tmpdir}; +mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index 213f41ebf..7780e2c13 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -277,6 +277,8 @@ def get_snakefile(analysis_type, reference_genome="hg19"): if analysis_type == "pon": snakefile = Path(p, "workflows", "PON.smk") + if "qc_panel" in analysis_type: + snakefile = Path(p, "workflows", "QC.smk") return str(snakefile) diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index ec92b496e..44263d141 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -198,10 +198,11 @@ class AnalysisModel(BaseModel): Attributes: case_id : Field(required); string case identifier - analysis_type : Field(required); string literal [single, paired, pon] + analysis_type : Field(required); string literal [single, paired, pon, qc_panel] single : if only tumor samples are provided paired : if both tumor and normal samples are provided pon : panel of normal analysis + qc_panel : QC analysis only sequencing_type : Field(required); string literal [targeted, wgs] targeted : if capture kit was used to enrich specific genomic regions wgs : if whole genome sequencing was performed @@ -217,7 +218,7 @@ class AnalysisModel(BaseModel): Raises: ValueError: - When analysis_type is set to any value other than [single, paired, qc, pon] + When analysis_type is set to any value other than [single, paired, pon, qc_panel] When sequencing_type is set to any value other than [wgs, targeted] """ @@ -427,7 +428,7 @@ class BalsamicConfigModel(BaseModel): """ QC: QCModel - vcf: VCFModel + vcf: Optional[VCFModel] analysis: AnalysisModel samples: Dict[str, SampleInstanceModel] reference: Dict[str, Path] diff --git a/BALSAMIC/workflows/QC.smk b/BALSAMIC/workflows/QC.smk new file mode 100644 index 000000000..7e475e464 --- /dev/null +++ 
b/BALSAMIC/workflows/QC.smk @@ -0,0 +1,154 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 +import os +import logging +import tempfile + +from pathlib import Path +from yapf.yapflib.yapf_api import FormatFile + +from snakemake.exceptions import RuleException, WorkflowError + +from BALSAMIC.utils.exc import BalsamicError + +from BALSAMIC.utils.cli import (write_json, check_executable, generate_h5) + +from BALSAMIC.utils.models import BalsamicWorkflowConfig + +from BALSAMIC.utils.rule import (get_rule_output, get_result_dir, + get_sample_type, get_picard_mrkdup, get_script_path, + get_threads, get_sequencing_type, get_capture_kit) + +from BALSAMIC.constants.common import (RULE_DIRECTORY); +from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS + + +shell.executable("/bin/bash") +shell.prefix("set -eo pipefail; ") + +LOG = logging.getLogger(__name__) +logging.getLogger("filelock").setLevel("WARN") + +# Create a temporary directory with trailing / +tmp_dir = os.path.join(get_result_dir(config), "tmp", "" ) +Path.mkdir(Path(tmp_dir), exist_ok=True) + +benchmark_dir = config["analysis"]["benchmark"] +fastq_dir = get_result_dir(config) + "/fastq/" +bam_dir = get_result_dir(config) + "/bam/" +fastqc_dir = get_result_dir(config) + "/fastqc/" +result_dir = get_result_dir(config) + "/" +qc_dir = get_result_dir(config) + "/qc/" +delivery_dir = get_result_dir(config) + "/delivery/" + +singularity_image = config['singularity']['image'] + +# picarddup flag +picarddup = get_picard_mrkdup(config) + +# parse parameters as constants to workflows +params = BalsamicWorkflowConfig.parse_obj(WORKFLOW_PARAMS) + +# Capture kit name +if config["analysis"]["sequencing_type"] != "wgs": + capture_kit = os.path.split(config["panel"]["capture_kit"])[1] + +# Sample names for tumor or normal +tumor_sample = get_sample_type(config["samples"], "tumor")[0] +if "paired" in config['analysis']['analysis_type']: + normal_sample = get_sample_type(config["samples"], "normal")[0] + +# 
Set case id/name +case_id = config["analysis"]["case_id"] + +# explicitly check if cluster_config dict has zero keys. +if len(cluster_config.keys()) == 0: + cluster_config = config + +# Add reference assembly if not defined for backward compatibility +if 'genome_version' not in config["reference"]: + GENOME_VERSION = 'hg19' ## if hg19 convention works, replace accordingly + LOG.info('Genome version was not found in config. Setting it to %s', GENOME_VERSION) + + +# Set temporary dir environment variable +os.environ['TMPDIR'] = get_result_dir(config) + +analysis_type = config['analysis']["analysis_type"] + +rules_to_include = [ + "snakemake_rules/quality_control/fastp.rule", + "snakemake_rules/quality_control/fastqc.rule", + "snakemake_rules/quality_control/multiqc.rule", + "snakemake_rules/variant_calling/mergetype_tumor.rule", + "snakemake_rules/quality_control/picard.rule", + "snakemake_rules/quality_control/sambamba_depth.rule", + "snakemake_rules/quality_control/mosdepth.rule", + "snakemake_rules/align/bwa_mem.rule" +] + +if "paired" in config['analysis']['analysis_type']: + rules_to_include.append("snakemake_rules/variant_calling/mergetype_normal.rule") + + + +# for r in rules_to_include: +for r in rules_to_include: + include: Path(RULE_DIRECTORY, r).as_posix() +LOG.info(f"The following rules will be included in the workflow: {rules_to_include}") + +# Define common and analysis specific outputs +quality_control_results = [result_dir + "qc/" + "multiqc_report.html"] + +if 'delivery' in config: + wildcard_dict = {"sample": list(config["samples"].keys())+["tumor", "normal"], + "case_name": config["analysis"]["case_id"], + "allow_missing": True + } + + if 'rules_to_deliver' in config: + rules_to_deliver = config['rules_to_deliver'].split(",") + else: + rules_to_deliver = ['multiqc'] + + output_files_ready = [('path', 'path_index', 'step', 'tag', 'id', 'format')] + + for my_rule in set(rules_to_deliver): + try: + housekeeper_id = getattr(rules, 
my_rule).params.housekeeper_id + except (ValueError, AttributeError, RuleException, WorkflowError) as e: + LOG.warning("Cannot deliver step (rule) {}: {}".format(my_rule, e)) + continue + + LOG.info("Delivering step (rule) {} {}.".format(my_rule, housekeeper_id)) + files_to_deliver = get_rule_output(rules=rules, rule_name=my_rule, output_file_wildcards=wildcard_dict) + LOG.debug("The following files added to delivery: {}".format(files_to_deliver)) + output_files_ready.extend(files_to_deliver) + + output_files_ready = [dict(zip(output_files_ready[0], value)) for value in output_files_ready[1:]] + delivery_ready = os.path.join(get_result_dir(config), + "delivery_report", + config["analysis"]["case_id"] + "_delivery_ready.hk") + write_json(output_files_ready, delivery_ready) + FormatFile(delivery_ready) + +rule all: + input: + quality_control_results + output: + finish_file = os.path.join(get_result_dir(config), "analysis_finish") + params: + tmp_dir = tmp_dir, + run: + import datetime + import shutil + + # Delete a temporal directory tree + try: + shutil.rmtree(params.tmp_dir) + except OSError as e: + print ("Error: %s - %s." 
% (e.filename, e.strerror)) + + # Finish timestamp file + with open(str(output.finish_file), mode="w") as finish_file: + finish_file.write("%s\n" % datetime.datetime.now()) diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 78eb1c845..071e9d743 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -27,7 +27,7 @@ from BALSAMIC.constants.common import (SENTIEON_DNASCOPE, SENTIEON_TNSCOPE, RULE_DIRECTORY, VCFANNO_TOML, MUTATION_TYPE); from BALSAMIC.constants.variant_filters import COMMON_SETTINGS,VARDICT_SETTINGS,SENTIEON_VARCALL_SETTINGS; from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS, VARCALL_PARAMS -from BALSAMIC.constants.workflow_rules import SNAKEMAKE_RULES +from BALSAMIC.constants.workflow_rules import SNAKEMAKE_RULES shell.executable("/bin/bash") @@ -51,7 +51,7 @@ vep_dir = get_result_dir(config) + "/vep/" qc_dir = get_result_dir(config) + "/qc/" delivery_dir = get_result_dir(config) + "/delivery/" -umi_dir = get_result_dir(config) + "/umi/" +umi_dir = get_result_dir(config) + "/umi/" umi_qc_dir = qc_dir + "umi_qc/" singularity_image = config['singularity']['image'] @@ -95,7 +95,7 @@ try: config["SENTIEON_TNSCOPE"] = SENTIEON_TNSCOPE config["SENTIEON_DNASCOPE"] = SENTIEON_DNASCOPE - + except KeyError as error: LOG.error("Set environment variables SENTIEON_LICENSE, SENTIEON_INSTALL_DIR, SENTIEON_EXEC " "to run SENTIEON variant callers") @@ -147,7 +147,7 @@ for m in MUTATION_TYPE: sequencing_type=config["analysis"]["sequencing_type"], mutation_class="germline") - germline_caller = germline_caller + germline_caller_balsamic + germline_caller_sentieon + germline_caller = germline_caller + germline_caller_balsamic + germline_caller_sentieon somatic_caller_balsamic = get_variant_callers(config=config, @@ -244,7 +244,7 @@ if config["analysis"]["sequencing_type"] != "wgs": analysis_specific_results.append(expand(vep_dir + "{vcf}.all.filtered.pass.ranked.vcf.gz", vcf=get_vcf(config, 
["vardict"], [config["analysis"]["case_id"]]))) - analysis_specific_results.append(expand(vcf_dir + "CNV.somatic.{case_name}.{var_caller}.vcf2cytosure.cgh", + analysis_specific_results.append(expand(vcf_dir + "CNV.somatic.{case_name}.{var_caller}.vcf2cytosure.cgh", case_name=config["analysis"]["case_id"], var_caller=["cnvkit"])) @@ -281,7 +281,7 @@ if 'benchmark_plots' in config: # Make individual plot per job for log_file in Path(log_dir).glob("*.err"): log_file_list = log_file.name.split(".") - job_name = ".".join(log_file_list[0:4]) + job_name = ".".join(log_file_list[0:4]) job_id = log_file_list[4].split("_")[1] h5_file = generate_h5(job_name, job_id, log_file.parent) benchmark_plot = Path(benchmark_dir, job_name + ".pdf") diff --git a/BALSAMIC/workflows/reference-canfam3.smk b/BALSAMIC/workflows/reference-canfam3.smk index def91c8c2..91699c777 100644 --- a/BALSAMIC/workflows/reference-canfam3.smk +++ b/BALSAMIC/workflows/reference-canfam3.smk @@ -8,7 +8,7 @@ from pathlib import Path from copy import deepcopy from BALSAMIC.utils.rule import get_script_path -from BALSAMIC.utils.rule import get_reference_output_files +from BALSAMIC.utils.rule import get_reference_output_files from BALSAMIC.utils.models import ReferenceMeta from BALSAMIC.constants.reference import REFERENCE_FILES as REFERENCE_MODEL from BALSAMIC.utils.cli import get_md5 @@ -27,7 +27,7 @@ basedir = os.path.join(config['output']) genome_dir = os.path.join(basedir, "genome") # Set temporary dir environment variable -os.environ['TMPDIR'] = basedir +os.environ['TMPDIR'] = basedir REFERENCE_FILES = deepcopy(REFERENCE_MODEL) @@ -36,7 +36,7 @@ REFERENCE_FILES[genome_ver]['basedir'] = basedir reference_file_model = ReferenceMeta.parse_obj(REFERENCE_FILES[genome_ver]) reference_genome_url = reference_file_model.reference_genome genome_chrom_size_url = reference_file_model.genome_chrom_size -refgene_txt_url = reference_file_model.refgene_txt +refgene_txt_url = reference_file_model.refgene_txt refgene_sql_url 
= reference_file_model.refgene_sql check_md5 = os.path.join(basedir, "reference.json.md5") @@ -45,11 +45,11 @@ shell.executable("/bin/bash") shell.prefix("set -eo pipefail; ") singularity_image_path = config['singularity']['image_path'] -singularity_images = [Path(singularity_image_path, image_name + ".sif").as_posix() for image_name in config["singularity"]["containers"].keys()] +singularity_images = [Path(singularity_image_path, image_name + ".sif").as_posix() for image_name in config["singularity"]["containers"].keys()] ########################################################## # Generating Reference files for BALSAMIC pipeline -# Writing reference json file +# Writing reference json file ########################################################## rule all: @@ -71,7 +71,7 @@ rule all: os.path.join(basedir, "reference.json.log") run: import json - from datetime import datetime + from datetime import datetime today = datetime.now().strftime('%Y-%m-%d %H:%M:%S') @@ -87,7 +87,7 @@ rule all: with open(str(output.reference_json), "w") as fh: json.dump(ref_json, fh, indent=4) - + create_md5(ref_json['reference'], output.check_md5) with open(str(output.finished), mode='w') as finish_file: @@ -105,7 +105,7 @@ rule download_container: shell(cmd) ########################################################## -# Download the reference genome, variant db +# Download the reference genome, variant db ########################################################## download_content = [reference_genome_url, genome_chrom_size_url, refgene_txt_url, refgene_sql_url] @@ -129,7 +129,7 @@ rule download_reference: ref.write_md5 ########################################################## -# Preprocess refseq file by fetching relevant columns and +# Preprocess refseq file by fetching relevant columns and # standardize the chr column ########################################################## @@ -146,7 +146,7 @@ rule prepare_refgene: log: refgene_sql = os.path.join(basedir, "genome", 
"refgene_sql.log"), refgene_txt = os.path.join(basedir, "genome", "refgene_txt.log") - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("bedtools") + ".sif").as_posix() + singularity: Path(singularity_image_path, config["bioinfo_tools"].get("bedtools") + ".sif").as_posix() shell: """ header=$(awk -f {params.refgene_sql_awk} {input.refgene_sql}); @@ -154,11 +154,10 @@ header=$(awk -f {params.refgene_sql_awk} {input.refgene_sql}); | csvcut -t -c chrom,exonStarts,exonEnds,name,score,strand,exonCount,txStart,txEnd,name2 \ | csvformat -T \ | bedtools expand -c 2,3 \ -| awk '$1~/chr[1-9]/ && $1!~/[_]/' | cut -c 4- | sort -k1,1 -k2,2n > {output.bed}; +| awk '$1~/chr[1-9]/ && $1!~/[_]/' | sort -k1,1 -k2,2n > {output.bed}; -awk -v OFS=\"\\t\" '$3!~/_/ {{ gsub(\"chr\",\"\",$3); $1=$13; print }}' {input.refgene_txt} \ +awk -v OFS=\"\\t\" '$3!~/_/ {{ gsub(\"chr\",\"chr\",$3); $1=$13; print }}' {input.refgene_txt} \ | cut -f 1-11 > {output.refflat}; -sed -i 's/chr//g' {input.refgene_txt}; """ ########################################################## @@ -173,7 +172,7 @@ rule bwa_index: expand(reference_genome_url.get_output_file + "{ext}", ext=['.amb','.ann','.bwt','.pac','.sa']) log: reference_genome_url.get_output_file + ".bwa_index.log" - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("bwa") + ".sif").as_posix() + singularity: Path(singularity_image_path, config["bioinfo_tools"].get("bwa") + ".sif").as_posix() shell: """ bwa index -a bwtsw {input.reference_genome} 2> {log}; @@ -191,7 +190,7 @@ rule samtools_index_fasta: reference_genome_url.get_output_file + ".fai" log: reference_genome_url.get_output_file + ".faidx.log" - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("samtools") + ".sif").as_posix() + singularity: Path(singularity_image_path, config["bioinfo_tools"].get("samtools") + ".sif").as_posix() shell: """ samtools faidx {input.reference_genome} 2> {log}; @@ -210,7 +209,7 @@ rule picard_ref_dict: 
reference_genome_url.get_output_file.replace("fasta","dict") log: reference_genome_url.get_output_file + ".ref_dict.log" - singularity: Path(singularity_image_path, config["bioinfo_tools"].get("picard") + ".sif").as_posix() + singularity: Path(singularity_image_path, config["bioinfo_tools"].get("picard") + ".sif").as_posix() shell: """ picard CreateSequenceDictionary REFERENCE={input.reference_genome} OUTPUT={output} 2> {log}; diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 289c72c16..37e0eb8b1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,8 +12,8 @@ Added: * SVdb to the varcall_py36 container #871 * SVdb to WGS workflow #871 * Docker container for vcf2cytosure #858 -* Snakemake rule for creating `.cgh` files from `CNVkit` outputs #858 -* SVdb to TGA workflow #871 +* Snakemake rule for creating `.cgh` files from `CNVkit` outputs #858 +* SVdb to TGA workflow #871 * SVdb merge SV and CNV #871 * Readthedocs for BALSAMIC method descriptions #892 * Readthedocs for BALSAMIC variant filters for WGS somatic callers #892 @@ -26,7 +26,8 @@ Changed: * Merge QC metric extraction workflows #833 * Changed the base-image for balsamic container to 4.10.3-alpine #869 -* updated SVdb to 2.6.0 #871 +* Updated SVdb to 2.6.0 #871 +* Upgrade black to 22.3.0 Fixed: ^^^^^^ @@ -94,7 +95,7 @@ Changed: Added: ^^^^^^ * Added the readthedocs page for BALSAMIC variant-calling filters #867 -* Project requirements (setup.py) to build the docs #874 +* Project requirements (setup.py) to build the docs #874 * Generate cram from umi-consensus called bam files #865 Changed: @@ -147,11 +148,11 @@ Added: Fixed: ^^^^^^ -* Add default for gender if ``purecn`` captures dual gender values #824 +* Add default for gender if ``purecn`` captures dual gender values #824 Changed: ^^^^^^^^ -* Updated ``purecn`` and its dependencies to latest versions +* Updated ``purecn`` and its dependencies to latest versions [8.2.2] ------- @@ -181,7 +182,7 @@ Added: * Added various basic filters to all variant callers 
irregardless of their delivery status #750 * BALSAMIC container #728 -* BALSAMIC reference generation via cluster submission for both reference and container #686 +* BALSAMIC reference generation via cluster submission for both reference and container #686 * Container specific tests #770 * BALSAMIC quality control metrics extraction and validation #754 * Delly is added as a submodule and removed from rest of the conda environments #787 @@ -219,12 +220,12 @@ Fixed: * Bumped version for ``bcftools`` in cnvkit container * Fixed issues #776 and #777 with correct install paths for gatk and manta * Fixed issue #782 for missing AF in the vcf INFO field -* Fixed issues #748 #749 with correct sample names +* Fixed issues #748 #749 with correct sample names * Fixed issue #767 for ascatngs hardcoded values -* Fixed missing output option in bcftools filters for tnhaplotyper #793 +* Fixed missing output option in bcftools filters for tnhaplotyper #793 * Fixed issue #795 with increasing resources for vep and filter SV prior to vep * Building ``wheel`` for ``cryptography`` bug inside BALSAMIC container #801 -* Fixed badget for docker container master and develop status +* Fixed badget for docker container master and develop status * ReadtheDocs building failure due to dependencies, fixed by locking versions #773 * Dev requirements installation for Sphinx docs (Github Action) #812 * Changed path for main Dockerfile version in ``.bumpversion.cfg`` @@ -238,7 +239,7 @@ Added: * Workflow to check PR tiltes to make easier to tell PR intents #724 * ``bcftools stats`` to calculate Ti/Tv for all post annotate germline and somatic calls #93 * Added reference download date to ``reference.json`` #726 -* ``ascatngs`` hg38 references to constants #683 +* ``ascatngs`` hg38 references to constants #683 * Added ClinVar as a source to download and to be annotated with VCFAnno #737 Changed: @@ -300,7 +301,7 @@ Added: * Individual rules (i.e. ngs filters) for cnv and sv callers. 
Only Manta will be delivered and added to the list of output files. #708 * Added "targeted" and "wgs" tags to variant callers to provide another layer of separation. #708 * ``manta`` convert inversion #709 -* Sentieon version to bioinformatic tool version parsing #685 +* Sentieon version to bioinformatic tool version parsing #685 * added ``CITATION.cff`` to cite BALSAMIC @@ -309,9 +310,9 @@ Changed: * Upgrade to latest sentieon version 202010.02 * New name ``MarkDuplicates`` to ``picard_markduplicates`` in ``bwa_mem`` rule and ``cluster.json`` -* New name rule ``GATK_contest`` to ``gatk_contest`` +* New name rule ``GATK_contest`` to ``gatk_contest`` * Avoid running pytest github actions workflow on ``docs/**`` and ``CHANGELOG.rst`` changes -* Updated ``snakemake`` to ``v6.5.3`` #501 +* Updated ``snakemake`` to ``v6.5.3`` #501 * Update ``GNOMAD`` URL * Split Tumor-only ``cnvkit batch`` into individual commands * Improved TMB calculation issue #51 @@ -329,7 +330,7 @@ Fixed: * post-processing of the umi consensus in handling BI tags * vcf-filtered-clinical tag files will have all variants including PASS * Refactor snakemake ``annotate`` rules according to snakemake etiquette #636 -* Refactor snakemake ``align`` rules according to snakemake etiquette #636 +* Refactor snakemake ``align`` rules according to snakemake etiquette #636 * Refactor snakemake ``fastqc`` ``vep`` contest and ``mosdepth`` rules according to ``snakemake`` etiquette #636 * Order of columns in QC and coverage report issue #601 * ``delly`` not showing in workflow at runtime #644 @@ -525,7 +526,7 @@ Fixed: ^^^^^^ * umi_workflow config json is set as true for panel and wgs as false. -* Rename umiconsensus bam file headers from {samplenames} to TUMOR/NORMAL. +* Rename umiconsensus bam file headers from {samplenames} to TUMOR/NORMAL. 
* Documentation autobuild on RTFD @@ -549,7 +550,7 @@ Removed Fixed ^^^^^ -* Fixed issue 577 with missing ``tumor.merged.bam`` and ``normal.merged.bam`` +* Fixed issue 577 with missing ``tumor.merged.bam`` and ``normal.merged.bam`` * Issue 448 with lingering tmp_dir. It is not deleted after analysis is properly finished. Changed @@ -609,7 +610,7 @@ Changed * Update FastQC to 0.11.9 PR #532 * Update BCFTools to 1.11 PR #537 * Update Samtools to 1.11 PR #537 -* Increase resources and runtime for various workflows in PRs #482 +* Increase resources and runtime for various workflows in PRs #482 * Python package dependenicies versions fixed in PR #480 * QoL changes to workflow in series of PR #471 * Series of documentation updates in PRs #489 #553 diff --git a/tests/commands/config/test_config_qc.py b/tests/commands/config/test_config_qc.py new file mode 100644 index 000000000..1fea387a6 --- /dev/null +++ b/tests/commands/config/test_config_qc.py @@ -0,0 +1,264 @@ +import os +import json +import graphviz +import logging +from unittest import mock +from pathlib import Path +import pytest +from BALSAMIC.utils.cli import generate_graph + +qc_json = "_QC.json" + + +def test_qc_normal_config( + invoke_cli, + sample_fastq, + tmp_path, + balsamic_cache, + panel_bed_file, +): + # GIVEN a case ID, fastq files, and an analysis dir + test_analysis_dir = tmp_path / "test_analysis_dir" + test_analysis_dir.mkdir() + case_id = "sample_tumor_normal" + tumor = sample_fastq["tumor"] + normal = sample_fastq["normal"] + + # WHEN creating a case analysis + with mock.patch.dict( + "os.environ", + ): + result = invoke_cli( + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "-n", + normal, + "--case-id", + case_id, + "--analysis-dir", + test_analysis_dir, + "--balsamic-cache", + balsamic_cache, + "--tumor-sample-name", + "ACC1", + "--normal-sample-name", + "ACC2", + ], + ) + + # THEN a config should be created and exist + assert result.exit_code == 0 + assert 
Path(test_analysis_dir, case_id, case_id + qc_json).exists() + # load json file and check if dag exists + qc_config = json.load(open(Path(test_analysis_dir, case_id, case_id + qc_json))) + # assert if config json dag file is created + assert Path(qc_config["analysis"]["dag"]).exists() + assert "BALSAMIC QC Workflow has been configured successfully!" in result.output + + +def test_qc_tumor_only_config( + invoke_cli, + sample_fastq, + tmp_path, + balsamic_cache, + panel_bed_file, + sentieon_license, + sentieon_install_dir, +): + # GIVEN a case ID, fastq files, and an analysis dir + test_analysis_dir = tmp_path / "test_analysis_dir" + test_analysis_dir.mkdir() + case_id = "sample_tumor_only" + tumor = sample_fastq["tumor"] + + # WHEN creating a case analysis + with mock.patch.dict( + "os.environ", + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, + ): + result = invoke_cli( + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + "--analysis-dir", + test_analysis_dir, + "--balsamic-cache", + balsamic_cache, + "--tumor-sample-name", + "ACC1", + ], + ) + + # THEN a config should be created and exist + assert result.exit_code == 0 + assert Path(test_analysis_dir, case_id, case_id + qc_json).exists() + # load json file and check if dag exists + qc_config = json.load(open(Path(test_analysis_dir, case_id, case_id + qc_json))) + # assert if config json dag file is created + assert Path(qc_config["analysis"]["dag"]).exists() + + +def test_qc_config_bad_filename( + invoke_cli, + tmp_path_factory, + analysis_dir, + panel_bed_file, + balsamic_cache, +): + # GIVEN existing fastq file with wrong naming convention + faulty_fastq_dir = tmp_path_factory.mktemp("error_fastq") + fastq_file_name_tumor = "tumor_error.fastq.gz" + Path(faulty_fastq_dir / fastq_file_name_tumor).touch() + + case_id1 = "faulty_tumor" + tumor = Path(faulty_fastq_dir / fastq_file_name_tumor).as_posix() + # Invoke CLI 
command using file as argument + case_result = invoke_cli( + [ + "config", + "qc_panel", + "-t", + tumor, + "-p", + panel_bed_file, + "--case-id", + case_id1, + "--analysis-dir", + analysis_dir, + "--balsamic-cache", + balsamic_cache, + ], + ) + + # THEN run should abort + assert case_result.exit_code == 1 + + +def test_qc_run_without_permissions( + invoke_cli, + no_write_perm_path, + sample_fastq, + panel_bed_file, + balsamic_cache, +): + # GIVEN CLI arguments including an analysis_dir without write permissions + case_id = "sample_tumor_only" + tumor = sample_fastq["tumor"] + + result = invoke_cli( + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + "--analysis-dir", + no_write_perm_path, + "--balsamic-cache", + balsamic_cache, + ], + ) + # THEN program exits before completion + assert result.exit_code == 1 + + +def test_qc_config_failed(invoke_cli, tmp_path, balsamic_cache, panel_bed_file): + # GIVEN a case ID, fastq files, and an analysis dir + test_analysis_dir = tmp_path / "test_analysis_dir" + test_analysis_dir.mkdir() + case_id = "sample_qc" + + # WHEN creating a case analysis + result = invoke_cli( + [ + "config", + "qc_panel", + "--case-id", + case_id, + "-p", + panel_bed_file, + "--analysis-dir", + test_analysis_dir, + "--balsamic-cache", + balsamic_cache, + ] + ) + + # THEN a config should not be created and exit + assert "Error: Missing option" in result.output + assert result.exit_code == 2 + + +def test_config_qc_graph_failed( + invoke_cli, sample_fastq, analysis_dir, balsamic_cache, panel_bed_file +): + # GIVEN an analysis config + case_id = "sample_tumor_only" + tumor = sample_fastq["tumor"] + + with mock.patch.object(graphviz, "Source") as mocked: + mocked.return_value = None + case_result = invoke_cli( + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + "--analysis-dir", + analysis_dir, + "--balsamic-cache", + balsamic_cache, + ], + ) + + assert 
case_result.exit_code == 1 + + +def test_config_qc_graph_failed_value_error( + invoke_cli, sample_fastq, analysis_dir, balsamic_cache, panel_bed_file +): + # GIVEN an analysis config + case_id = "sample_tumor_only" + tumor = sample_fastq["tumor"] + + with mock.patch.object(graphviz, "Source", side_effect=ValueError) as mocked: + mocked.return_value = None + case_result = invoke_cli( + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + "--analysis-dir", + analysis_dir, + "--balsamic-cache", + balsamic_cache, + ], + ) + + assert "BALSAMIC QC dag graph generation failed" in case_result.output diff --git a/tests/conftest.py b/tests/conftest.py index c28f51a44..9286aaf1a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -288,6 +288,50 @@ def tumor_normal_config( return Path(analysis_dir, case_id, case_id + ".json").as_posix() +@pytest.fixture(scope="session") +def tumor_normal_qc_config( + tmp_path_factory, + sample_fastq, + analysis_dir, + balsamic_cache, + panel_bed_file, +): + """ + invokes balsamic config sample -t xxx -n xxx to create sample config + for tumor-normal + """ + case_id = "sample_tumor_normal" + tumor = sample_fastq["tumor"] + normal = sample_fastq["normal"] + + with mock.patch.dict(MOCKED_OS_ENVIRON): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "-n", + normal, + "--case-id", + case_id, + "--analysis-dir", + analysis_dir, + "--balsamic-cache", + balsamic_cache, + "--tumor-sample-name", + "ACC1", + "--normal-sample-name", + "ACC2", + ], + ) + return Path(analysis_dir, case_id, case_id + "_QC.json").as_posix() + + @pytest.fixture(name="helpers") def fixture_config_helpers(): """Helper fixture for case config files""" @@ -436,6 +480,45 @@ def tumor_only_wgs_config( return Path(analysis_dir, case_id, case_id + ".json").as_posix() +@pytest.fixture(scope="session") +def tumor_only_qc_config( + tmpdir_factory, + sample_fastq, + 
balsamic_cache, + analysis_dir, + panel_bed_file, +): + """ + invokes balsamic config sample -t xxx to create sample config + for tumor only + """ + case_id = "sample_tumor_only" + tumor = sample_fastq["tumor"] + + with mock.patch.dict( + MOCKED_OS_ENVIRON, + ): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + "--analysis-dir", + analysis_dir, + "--balsamic-cache", + balsamic_cache, + ], + ) + return Path(analysis_dir, case_id, case_id + "_QC.json").as_posix() + + @pytest.fixture(scope="session") def tumor_only_pon_config( tmp_path_factory, diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 1c5020d9c..90c35117e 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -46,27 +46,66 @@ def test_workflow_tumor_only(tumor_only_config, sentieon_install_dir, sentieon_l assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) -def test_workflow_qc( - tumor_normal_config, tumor_only_config, sentieon_install_dir, sentieon_license -): +def test_workflow_qc_tumor_only(tumor_only_qc_config): + # GIVEN a sample config dict and snakefile - workflow = "qc" + workflow = "qc_panel" reference_genome = "hg19" snakefile = get_snakefile(workflow, reference_genome) + config_json = tumor_only_qc_config + + # WHEN invoking snakemake module with dryrun option + # THEN it should return true + with mock.patch.dict( + MOCKED_OS_ENVIRON, + ): + assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) + + +def test_workflow_qc_tumor_only_canfam(tumor_only_qc_config): + + # GIVEN a sample config dict and snakefile + workflow = "qc_panel" + reference_genome = "canfam3" + snakefile = get_snakefile(workflow, reference_genome) + config_json = tumor_only_qc_config + + # WHEN invoking snakemake module with dryrun option + # THEN it should return true + with mock.patch.dict( + MOCKED_OS_ENVIRON, + ): + assert 
snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) + + +def test_workflow_qc_normal(tumor_normal_qc_config): + # GIVEN a sample config dict and snakefile + workflow = "qc_panel" + reference_genome = "hg19" + snakefile = get_snakefile(workflow, reference_genome) + config_json = tumor_normal_qc_config # WHEN invoking snakemake module with dryrun option # THEN it should return true with mock.patch.dict( MOCKED_OS_ENVIRON, - { - "SENTIEON_LICENSE": sentieon_license, - "SENTIEON_INSTALL_DIR": sentieon_install_dir, - }, ): - for config_json in (tumor_normal_config, tumor_only_config): - assert snakemake.snakemake( - snakefile, configfiles=[config_json], dryrun=True - ) + assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) + + +def test_workflow_qc_normal_canfam3(tumor_normal_qc_config): + # GIVEN a sample config dict and snakefile + workflow = "qc_panel" + reference_genome = "canfam3" + snakefile = get_snakefile(workflow, reference_genome) + config_json = tumor_normal_qc_config + + # WHEN invoking snakemake module with dryrun option + # THEN it should return true + with mock.patch.dict( + MOCKED_OS_ENVIRON, + ): + assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) def test_workflow_sentieon( diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 3b029c597..f0ee2ef0b 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -352,7 +352,7 @@ def test_get_snakefile(): ("paired", "targeted"), ("single", "wgs"), ("single", "targeted"), - ("qc", ""), + ("qc_panel", "targeted"), ("generate_ref", ""), ("pon", ""), ] @@ -363,8 +363,10 @@ def test_get_snakefile(): snakefile = get_snakefile(analysis_type, reference_genome) pipeline = "" - - if sequencing_type in ["targeted", "wgs", "qc"]: + if sequencing_type in ["targeted", "wgs"] and analysis_type in [ + "single", + "paired", + ]: pipeline = "BALSAMIC/workflows/balsamic.smk" elif analysis_type == "generate_ref" and 
reference_genome != "canfam3": pipeline = "BALSAMIC/workflows/reference.smk" @@ -372,6 +374,8 @@ def test_get_snakefile(): pipeline = "BALSAMIC/workflows/reference-canfam3.smk" elif analysis_type == "pon": pipeline = "BALSAMIC/workflows/PON.smk" + elif "qc" in analysis_type: + pipeline = "BALSAMIC/workflows/QC.smk" # THEN it should return the snakefile path # THEN assert file exists From 74e5475e655d7c99ae58f2de666c152beecfbf73 Mon Sep 17 00:00:00 2001 From: Khurram Maqbool Date: Wed, 20 Apr 2022 15:04:05 +0200 Subject: [PATCH 50/58] docs: add annotation resources (#916) * add ascat to copynumber file * update changelog * add balsamic annotation resources to docs * update changelog * fix review suggestion * modify description for CLNACC * modify description for COSMIC_CNT * modify description for COSMIC_CNT --- .../somatic_sv_tumor_normal.rule | 2 +- CHANGELOG.rst | 1 + docs/balsamic_annotation.rst | 254 ++++++++++++++++++ docs/index.rst | 1 + 4 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 docs/balsamic_annotation.rst diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule index cb00d426b..85d970f77 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule @@ -124,7 +124,7 @@ rule ascat_tumor_normal: chryloci= config["reference"]["ascat_chryloci"], output: final_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.vcf.gz", - ascat_copynumber = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".copynumber.txt.gz", + ascat_copynumber = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.copynumber.txt.gz", sample_statistics = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.samplestatistics.txt", ascat_plots= expand( vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat." 
+ "{output_suffix}" + ".png", diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 37e0eb8b1..cd51e27a8 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -20,6 +20,7 @@ Added: * bcftools counts to varcall filter rules #898 * Additional WGS metrics to be stored in ``_metrics_deliverables.yaml`` #907 * ascatNGS copynumber file #897 +* ReadtheDocs for BALSAMIC annotation resources #916 Changed: ^^^^^^^^ diff --git a/docs/balsamic_annotation.rst b/docs/balsamic_annotation.rst new file mode 100644 index 000000000..dd51e1dcc --- /dev/null +++ b/docs/balsamic_annotation.rst @@ -0,0 +1,254 @@ +*********************************** +BALSAMIC Annotation Resources +*********************************** + +BALSAMIC annotates somatic single nucleotide variants (SNVs) using ``ensembl-vep`` and ``vcfanno``. Somatic structural variants (SVs), somatic copy-number variants (CNVs) and germline single nucleotide variants are annotated using only ``ensembl-vep``. All SVs and CNVs are merged using ``SVDB`` before annotating for `Target Genome Analysis (TGA)` or `Whole Genome Sequencing (WGS)` analyses. + +`BALSAMIC` adds the following annotation from `gnomAD` database using ``vcfanno``. + +.. list-table:: gnomAD + :widths: 50 50 + :header-rows: 1 + + * - VCF tag + - description + * - GNOMADAF_popmax + - maximum allele frequency across populations + * - GNOMADAF + - fraction of the reads supporting the alternate allele, allelic frequency + +`BALSAMIC` adds the following annotation from `ClinVar` database using ``vcfanno``. + +.. list-table:: ClinVar + :widths: 50 50 + :header-rows: 1 + + * - VCF tag + - description + * - CLNACC + - Variant Accession and Versions + * - CLNREVSTAT + - ClinVar review status for the Variation ID + * - CLNSIG + - Clinical significance for this single variant + * - CLNVCSO + - Sequence Ontology id for variant type + * - CLNVC + - Variant type + * - ORIGIN + - Allele origin + +The values for `ORIGIN` are described below: + +.. 
list-table:: ORIGIN + :widths: 25 25 + :header-rows: 1 + + * - Value + - Annotation + * - 0 + - unknown + * - 1 + - germline + * - 2 + - somatic + * - 4 + - inherited + * - 8 + - paternal + * - 16 + - maternal + * - 32 + - *de-novo* + * - 64 + - biparental + * - 128 + - uniparental + * - 256 + - not-tested + * - 512 + - tested-inconclusive + * - 1073741824 + - other + +`BALSAMIC` uses `ensembl-vep` to add the following annotation from `COSMIC` database. + +.. list-table:: COSMIC + :widths: 50 50 + :header-rows: 1 + + * - VCF tag + - description + * - COSMIC_CDS + - CDS annotation + * - COSMIC_GENE + - gene name + * - COSMIC_STRAND + - strand + * - COSMIC_CNT + - number of samples with this mutation in the `COSMIC` database + * - COSMIC_AA + - peptide annotation + + +Where relevant, `BALSAMIC` uses `ensembl-vep` to annotate somatic and germline SNVs and somatic SVs/CNVs from `1000genomes (phase3)`, `ClinVar`, `ESP`, `HGMD-PUBLIC`, `dbSNP`, `gencode`, `gnomAD`, `polyphen`, `refseq`, and `sift` databases. +The following annotations are added by `ensembl-vep`. + +.. list-table:: ensembl-vep + :widths: 10 60 + :header-rows: 1 + + * - Annotation + - description + * - Allele + - the variant allele used to calculate the consequence + * - Gene + - Ensembl stable ID of affected gene + * - Feature + - Ensembl stable ID of feature + * - Feature type + - type of feature. Currently one of Transcript, RegulatoryFeature, MotifFeature.
+ * - Consequence + - consequence type of this variant + * - Position in cDNA + - relative position of base pair in cDNA sequence + * - Position in CDS + - relative position of base pair in coding sequence + * - Position in protein + - relative position of amino acid in protein + * - Amino acid change + - only given if the variant affects the protein-coding sequence + * - Codon change + - the alternative codons with the variant base in upper case + * - Co-located variation + - identifier of any existing variants + * - VARIANT_CLASS + - Sequence Ontology variant class + * - SYMBOL + - the gene symbol + * - SYMBOL_SOURCE + - the source of the gene symbol + * - STRAND + - the DNA strand (1 or -1) on which the transcript/feature lies + * - ENSP + - the Ensembl protein identifier of the affected transcript + * - FLAGS + - | transcript quality flags: + | cds_start_NF: CDS 5' incomplete + | cds_end_NF: CDS 3' incomplete + * - SWISSPROT + - Best match UniProtKB/Swiss-Prot accession of protein product + * - TREMBL + - Best match UniProtKB/TrEMBL accession of protein product + * - UNIPARC + - Best match UniParc accession of protein product + * - HGVSc + - the HGVS coding sequence name + * - HGVSp + - the HGVS protein sequence name + * - HGVSg + - the HGVS genomic sequence name + * - HGVS_OFFSET + - Indicates by how many bases the HGVS notations for this variant have been shifted + * - SIFT + - the SIFT prediction and/or score, with both given as prediction(score) + * - PolyPhen + - the PolyPhen prediction and/or score + * - MOTIF_NAME + - The source and identifier of a transcription factor binding profile aligned at this position + * - MOTIF_POS + - The relative position of the variation in the aligned TFBP + * - HIGH_INF_POS + - A flag indicating if the variant falls in a high information position of a transcription factor binding profile (TFBP) + * - MOTIF_SCORE_CHANGE + - The difference in motif score of the reference and variant sequences for the TFBP + * - CANONICAL + - 
a flag indicating if the transcript is denoted as the canonical transcript for this gene + * - CCDS + - the CCDS identifier for this transcript, where applicable + * - INTRON + - the intron number (out of total number) + * - EXON + - the exon number (out of total number) + * - DOMAINS + - the source and identifier of any overlapping protein domains + * - DISTANCE + - Shortest distance from variant to transcript + * - AF + - Frequency of existing variant in 1000 Genomes + * - AFR_AF + - Frequency of existing variant in 1000 Genomes combined African population + * - AMR_AF + - Frequency of existing variant in 1000 Genomes combined American population + * - EUR_AF + - Frequency of existing variant in 1000 Genomes combined European population + * - EAS_AF + - Frequency of existing variant in 1000 Genomes combined East Asian population + * - SAS_AF + - Frequency of existing variant in 1000 Genomes combined South Asian population + * - AA_AF + - Frequency of existing variant in NHLBI-ESP African American population + * - EA_AF + - Frequency of existing variant in NHLBI-ESP European American population + * - gnomAD_AF + - Frequency of existing variant in gnomAD exomes combined population + * - gnomAD_AFR_AF + - Frequency of existing variant in gnomAD exomes African/American population + * - gnomAD_AMR_AF + - Frequency of existing variant in gnomAD exomes American population + * - gnomAD_ASJ_AF + - Frequency of existing variant in gnomAD exomes Ashkenazi Jewish population + * - gnomAD_EAS_AF + - Frequency of existing variant in gnomAD exomes East Asian population + * - gnomAD_FIN_AF + - Frequency of existing variant in gnomAD exomes Finnish population + * - gnomAD_NFE_AF + - Frequency of existing variant in gnomAD exomes Non-Finnish European population + * - gnomAD_OTH_AF + - Frequency of existing variant in gnomAD exomes other combined populations + * - gnomAD_SAS_AF + - Frequency of existing variant in gnomAD exomes South Asian population + * - MAX_AF + - Maximum
observed allele frequency in 1000 Genomes, ESP and gnomAD + * - MAX_AF_POPS + - Populations in which maximum allele frequency was observed + * - CLIN_SIG + - ClinVar clinical significance of the dbSNP variant + * - BIOTYPE + - Biotype of transcript or regulatory feature + * - APPRIS + - Annotates alternatively spliced transcripts as primary or alternate based on a range of computational methods. NB: not available for GRCh37 + * - TSL + - Transcript support level. NB: not available for GRCh37 + * - PUBMED + - Pubmed ID(s) of publications that cite existing variant + * - SOMATIC + - Somatic status of existing variant(s); multiple values correspond to multiple values in the Existing_variation field + * - PHENO + - Indicates if existing variant is associated with a phenotype, disease or trait; multiple values correspond to multiple values in the Existing_variation field + * - GENE_PHENO + - Indicates if overlapped gene is associated with a phenotype, disease or trait + * - BAM_EDIT + - Indicates success or failure of edit using BAM file + * - GIVEN_REF + - Reference allele from input + * - REFSEQ_MATCH + - | the RefSeq transcript match status; contains a number of flags indicating whether this RefSeq transcript matches the underlying reference sequence and/or an Ensembl transcript (more information): + - rseq_3p_mismatch: signifies a mismatch between the RefSeq transcript and the underlying primary genome assembly sequence. Specifically, there is a mismatch in the 3' UTR of the RefSeq model with respect to the primary genome assembly (e.g. GRCh37/GRCh38). + - rseq_5p_mismatch: signifies a mismatch between the RefSeq transcript and the underlying primary genome assembly sequence. Specifically, there is a mismatch in the 5' UTR of the RefSeq model with respect to the primary genome assembly. + - rseq_cds_mismatch: signifies a mismatch between the RefSeq transcript and the underlying primary genome assembly sequence. 
Specifically, there is a mismatch in the CDS of the RefSeq model with respect to the primary genome assembly. + - rseq_ens_match_cds: signifies that for the RefSeq transcript there is an overlapping Ensembl model that is identical across the CDS region only. A CDS match is defined as follows: the CDS and peptide sequences are identical and the genomic coordinates of every translatable exon match. Useful related attributes are: rseq_ens_match_wt and rseq_ens_no_match. + - rseq_ens_match_wt: signifies that for the RefSeq transcript there is an overlapping Ensembl model that is identical across the whole transcript. A whole transcript match is defined as follows: 1) In the case that both models are coding, the transcript, CDS and peptide sequences are all identical and the genomic coordinates of every exon match. 2) In the case that both transcripts are non-coding the transcript sequences and the genomic coordinates of every exon are identical. No comparison is made between a coding and a non-coding transcript. Useful related attributes are: rseq_ens_match_cds and rseq_ens_no_match. + - rseq_ens_no_match: signifies that for the RefSeq transcript there is no overlapping Ensembl model that is identical across either the whole transcript or the CDS. This is caused by differences between the transcript, CDS or peptide sequences or between the exon genomic coordinates. Useful related attributes are: rseq_ens_match_wt and rseq_ens_match_cds. + - rseq_mrna_match: signifies an exact match between the RefSeq transcript and the underlying primary genome assembly sequence (based on a match between the transcript stable id and an accession in the RefSeq mRNA file). An exact match occurs when the underlying genomic sequence of the model can be perfectly aligned to the mRNA sequence post polyA clipping. + - rseq_mrna_nonmatch: signifies a non-match between the RefSeq transcript and the underlying primary genome assembly sequence. 
A non-match is deemed to have occurred if the underlying genomic sequence does not have a perfect alignment to the mRNA sequence post polyA clipping. It can also signify that no comparison was possible as the model stable id may not have had a corresponding entry in the RefSeq mRNA file (sometimes happens when accessions are retired or changed). When a non-match occurs one or several of the following transcript attributes will also be present to provide more detail on the nature of the non-match: rseq_5p_mismatch, rseq_cds_mismatch, rseq_3p_mismatch, rseq_nctran_mismatch, rseq_no_comparison + - rseq_nctran_mismatch: signifies a mismatch between the RefSeq transcript and the underlying primary genome assembly sequence. This is a comparison between the entire underlying genomic sequence of the RefSeq model to the mRNA in the case of RefSeq models that are non-coding. + - rseq_no_comparison: signifies that no alignment was carried out between the underlying primary genome assembly sequence and a corresponding RefSeq mRNA. The reason for this is generally that no corresponding, unversioned accession was found in the RefSeq mRNA file for the transcript stable id. This sometimes happens when accessions are retired or replaced. A second possibility is that the sequences were too long and problematic to align (though this is rare). + * - CHECK_REF + - Reports variants where the input reference does not match the expected reference + * - HGNC_ID + - A unique ID provided by the HGNC for each gene with an approved symbol + * - MANE + - indicating if the transcript is the MANE Select or MANE Plus Clinical transcript for the gene. + * - miRNA + - Reports where the variant lies in the miRNA secondary structure. 
\ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 17389a66d..9503955ea 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,6 +16,7 @@ :hidden: :maxdepth: 1 + balsamic_annotation balsamic_filters balsamic_methods bioinfo_softwares From cb8e5b64037e4d84dd0cf4b32ab5516189e06482 Mon Sep 17 00:00:00 2001 From: Khurram Maqbool Date: Fri, 22 Apr 2022 17:27:36 +0200 Subject: [PATCH 51/58] feat: update delly (#920) * update delly to 0.9.1 * update changelog * update changelog * fix Dockerfile * fix Dockerfile * remove OMP * add OMP * remove OMP * add OMP --- BALSAMIC/containers/delly/Dockerfile | 8 +++++--- BALSAMIC/containers/delly/delly.yaml | 2 +- CHANGELOG.rst | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/BALSAMIC/containers/delly/Dockerfile b/BALSAMIC/containers/delly/Dockerfile index 89fe792ec..5d17634fa 100644 --- a/BALSAMIC/containers/delly/Dockerfile +++ b/BALSAMIC/containers/delly/Dockerfile @@ -28,11 +28,13 @@ RUN apt-get update && apt-get install -y \ # set environment ENV BOOST_ROOT /usr ENV PATH="/opt/delly/bin:${PATH}" +ENV OMP_NUM_THREADS 2 # install delly RUN cd /opt \ && git clone --recursive https://github.com/dellytools/delly.git \ && cd /opt/delly/ \ - && git checkout v0.8.7 \ - && make STATIC=1 all \ - && make install + && git checkout v0.9.1 \ + && make STATIC=1 PARALLEL=1 all \ + && make install \ + diff --git a/BALSAMIC/containers/delly/delly.yaml b/BALSAMIC/containers/delly/delly.yaml index 5ef59fcbe..1689329d4 100644 --- a/BALSAMIC/containers/delly/delly.yaml +++ b/BALSAMIC/containers/delly/delly.yaml @@ -1 +1 @@ -- delly=0.8.7 +- delly=0.9.1 diff --git a/CHANGELOG.rst b/CHANGELOG.rst index cd51e27a8..352cc112f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -29,6 +29,7 @@ Changed: * Changed the base-image for balsamic container to 4.10.3-alpine #869 * Updated SVdb to 2.6.0 #871 * Upgrade black to 22.3.0 +* updated delly to 0.9.1 #920 Fixed: ^^^^^^ From 
4e2bdd7d013707edbd7551d3a999e7f64a5e969a Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Tue, 26 Apr 2022 15:21:26 +0200 Subject: [PATCH 52/58] fix: Change of gnomad pop freq value for UMI workflow (#919) * add new gnomad pop freq for umi workflow * edit balsamic filters docs * update changelog * fix ident * fix typo * add pop_freq_umi to model attributes * fix indentation * add review suggestions * add PR number to changelog --- BALSAMIC/constants/variant_filters.py | 5 ++ .../varcaller_filter_tumor_normal.rule | 2 +- .../varcaller_filter_tumor_only.rule | 2 +- BALSAMIC/utils/models.py | 2 + CHANGELOG.rst | 1 + docs/balsamic_filters.rst | 46 ++++++++++++++++++- 6 files changed, 54 insertions(+), 4 deletions(-) diff --git a/BALSAMIC/constants/variant_filters.py b/BALSAMIC/constants/variant_filters.py index 44e78a746..3ad66fe0b 100644 --- a/BALSAMIC/constants/variant_filters.py +++ b/BALSAMIC/constants/variant_filters.py @@ -47,6 +47,11 @@ "filter_name": "balsamic_high_pop_freq", "field": "INFO", }, + "pop_freq_umi": { + "tag_value": 0.02, + "filter_name": "balsamic_umi_high_pop_freq", + "field": "INFO", + }, "qss": { "tag_value": 20, "filter_name": "balsamic_low_quality_scores", diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule index 24723d72b..d3457faf9 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule @@ -96,7 +96,7 @@ rule bcftools_filter_TNscope_umi_tumor_normal: singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - pop_freq = [COMMON_FILTERS.pop_freq.tag_value, COMMON_FILTERS.pop_freq.filter_name], + pop_freq = [SENTIEON_CALLER.pop_freq_umi.tag_value, SENTIEON_CALLER.pop_freq_umi.filter_name], housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' 
threads: diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule index 5ba6919e3..247177bf2 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule @@ -94,7 +94,7 @@ rule bcftools_filter_TNscope_umi_tumor_only: singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - pop_freq = [COMMON_FILTERS.pop_freq.tag_value, COMMON_FILTERS.pop_freq.filter_name], + pop_freq = [SENTIEON_CALLER.pop_freq_umi.tag_value, SENTIEON_CALLER.pop_freq_umi.filter_name], housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' threads: diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index 44263d141..173d7ad37 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -56,6 +56,7 @@ class VarCallerFilter(BaseModel): MQ: VCFAttributes (optional); minimum mapping quality DP: VCFAttributes (optional); minimum read depth pop_freq: VCFAttributes (optional); maximum gnomad_af + pop_freq_umi: VCFAttributes (optional); maximum gnomad_af for UMI workflow strand_reads: VCFAttributes (optional); minimum strand specific read counts qss: VCFAttributes (optional); minimum sum of base quality scores sor: VCFAttributes (optional); minimum symmetrical log-odds ratio @@ -71,6 +72,7 @@ class VarCallerFilter(BaseModel): MQ: Optional[VCFAttributes] DP: Optional[VCFAttributes] pop_freq: Optional[VCFAttributes] + pop_freq_umi: Optional[VCFAttributes] strand_reads: Optional[VCFAttributes] qss: Optional[VCFAttributes] sor: Optional[VCFAttributes] diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 352cc112f..e5a0c7fd6 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -29,6 +29,7 @@ Changed: * Changed the base-image for balsamic container to 4.10.3-alpine #869 * Updated SVdb to 2.6.0 #871 * Upgrade 
black to 22.3.0 +* For UMI workflow, post filter `gnomad_pop_freq` value is changed from `0.005` to `0.02` #919 * updated delly to 0.9.1 #920 Fixed: diff --git a/docs/balsamic_filters.rst b/docs/balsamic_filters.rst index 89cfbf4d2..313f8d582 100644 --- a/docs/balsamic_filters.rst +++ b/docs/balsamic_filters.rst @@ -37,7 +37,35 @@ indicate that at this site, the mean position in reads is less than 8, and the p For `Post-call filtering`, in BALSAMIC we have applied various filtering criteria (`Vardict_filtering`_, `TNscope filtering (Tumor_normal)`_ ) depending on the analysis-type (TGS/WGS) and sample-type(tumor-only/tumor-normal). .. note:: - In BALSAMIC, this VCF file is named as `*.all.filtered.pass.vcf.gz` (eg: `SNV.somatic..vardict.all.filtered.pass.vcf.gz`) + In BALSAMIC, this VCF file is named as `*.all.filtered.vcf.gz` (eg: `SNV.somatic..vardict.all.filtered.vcf.gz`) + + +Only those variants that fulfill the pre-call and post-call filters are scored as `PASS` in the `STATUS` column of the VCF file. We filter those `PASS` variants and deliver a final list of variants to the customer either via `Scout` or `Caesar` + +.. note:: + In BALSAMIC, this VCF file is named as `*.all.filtered.pass.vcf.gz` (eg: `SNV.somatic..vardict.all.filtered.pass.vcf.gz`) + + +.. list-table:: Description of VCF files + :widths: 30 50 20 + :header-rows: 1 + + * - VCF file name + - Description + - Delivered to the customer + * - .vcf.gz + - Unannotated VCF file with pre-call filters included in the STATUS column + - Yes (Caesar) + * - .all.vcf.gz + - Annotated VCF file with pre-call filters included in the STATUS column + - No + * - .all.filtered.vcf.gz + - Annotated VCF file with pre-call and post-call filters included in the STATUS column + - No + * - .all.filtered.pass.vcf.gz + - Annotated and filtered VCF file by excluding all filters that did not meet the pre and post-call filter criteria. 
Includes only variants with the `PASS` STATUS + - Yes (Caesar and Scout) + **Targeted Genome Analysis** ############################# @@ -265,10 +293,24 @@ Minimum log-odds for the candidate selection. TNscope default: `4`. In our UMI-w min_tumor_lod = 4.0 +*min_tumor_allele_frac*: Set the minimum tumor AF to be considered as a potential variant site. + +:: + + min_tumor_allele_frac = 0.0005 + +*interval_padding*: Adding an extra 100bp to each end of the target region in the bed file before variant calling. +:: + interval_padding = 100 + **Post-call Filters** *GNOMADAF_POPMAX*: Maximum Allele Frequency across populations :: - GNOMADAF_popmax <= 0.001 (or) GNOMADAF_popmax == "." + GNOMADAF_popmax <= 0.02 (or) GNOMADAF_popmax == "." + +.. attention:: + BALSAMIC <= v8.2.10 uses GNOMADAF_popmax <= 0.005. From BALSAMIC v9.0.0, this setting is changed to 0.02 to reduce the stringency. + From 23773a5f27bcc229f9342ec346470bdfe1b1cc5e Mon Sep 17 00:00:00 2001 From: Khurram Maqbool Date: Wed, 27 Apr 2022 12:31:48 +0200 Subject: [PATCH 53/58] feat: add Delly CNV for tumor only workflow (#923) * update changelog * add tumor only cnv analysis and fix messeges in rules * fix messege text * update BALSAMIC documentation --- BALSAMIC/config/analysis.json | 6 +- BALSAMIC/constants/reference.py | 48 ++++++++++++++ BALSAMIC/constants/workflow_params.py | 9 ++- .../snakemake_rules/annotation/rankscore.rule | 4 +- .../varcaller_filter_tumor_normal.rule | 6 +- .../varcaller_filter_tumor_only.rule | 4 +- .../annotation/varcaller_sv_filter.rule | 2 +- .../varcaller_wgs_filter_tumor_normal.rule | 4 +- .../varcaller_wgs_filter_tumor_only.rule | 4 +- .../annotation/vcf2cytosure_convert.rule | 2 +- BALSAMIC/snakemake_rules/annotation/vep.rule | 8 +-- .../umi/generate_AF_tables.rule | 2 +- .../umi/mergetype_normal_umi.rule | 6 +- .../umi/mergetype_tumor_umi.rule | 3 +- BALSAMIC/snakemake_rules/umi/qc_umi.rule | 4 +- .../umi/sentieon_consensuscall.rule | 6 +- .../umi/sentieon_umiextract.rule | 4
+- .../umi/sentieon_varcall_tnscope.rule | 2 +- .../umi/sentieon_varcall_tnscope_tn.rule | 6 +- .../variant_calling/cnvkit_paired.rule | 2 +- .../variant_calling/cnvkit_single.rule | 3 +- .../variant_calling/germline.rule | 7 +-- .../variant_calling/germline_sv.rule | 2 +- .../variant_calling/mergetype_normal.rule | 2 +- .../variant_calling/mergetype_tumor.rule | 2 +- .../variant_calling/sentieon_germline.rule | 2 +- .../sentieon_split_snv_sv.rule | 2 +- .../variant_calling/sentieon_t_varcall.rule | 6 +- .../variant_calling/sentieon_tn_varcall.rule | 2 +- .../somatic_sv_tumor_normal.rule | 24 +++---- .../somatic_sv_tumor_only.rule | 59 ++++++++++++++---- .../variant_calling/somatic_tumor_normal.rule | 7 +-- .../variant_calling/somatic_tumor_only.rule | 5 +- .../variant_calling/split_bed.rule | 3 +- BALSAMIC/utils/models.py | 7 ++- BALSAMIC/workflows/reference.smk | 10 ++- CHANGELOG.rst | 15 ++--- docs/balsamic_methods.rst | 12 ++-- docs/bioinfo_softwares.rst | 2 +- tests/conftest.py | 3 + .../references/genome/delly_mappability.gz | Bin 0 -> 66 bytes .../genome/delly_mappability.gz.fai | 10 +++ .../genome/delly_mappability.gz.gzi | Bin 0 -> 5626 bytes tests/test_data/references/reference.json | 1 + 44 files changed, 216 insertions(+), 102 deletions(-) create mode 100644 tests/test_data/references/genome/delly_mappability.gz create mode 100644 tests/test_data/references/genome/delly_mappability.gz.fai create mode 100644 tests/test_data/references/genome/delly_mappability.gz.gzi diff --git a/BALSAMIC/config/analysis.json b/BALSAMIC/config/analysis.json index 3e46e0b1d..1caec394e 100644 --- a/BALSAMIC/config/analysis.json +++ b/BALSAMIC/config/analysis.json @@ -54,10 +54,14 @@ "mutation": "somatic", "type": "SNV" }, - "delly":{ + "dellysv":{ "mutation": "somatic", "type": "SV" }, + "dellycnv":{ + "mutation": "somatic", + "type": "CNV" + }, "ascat": { "mutation": "somatic", "type": "SV" diff --git a/BALSAMIC/constants/reference.py b/BALSAMIC/constants/reference.py 
index 911005878..d27b6c0fc 100644 --- a/BALSAMIC/constants/reference.py +++ b/BALSAMIC/constants/reference.py @@ -133,6 +133,30 @@ "output_file": "delly_exclusion.tsv", "output_path": "genome", }, + "delly_mappability": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh38.delly.blacklist.gz", + "file_type": "text", + "gzip": False, + "genome_version": "hg38", + "output_file": "delly_mappability.gz", + "output_path": "genome", + }, + "delly_mappability_gindex": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh38.delly.blacklist.gz.gzi", + "file_type": "text", + "gzip": False, + "genome_version": "hg38", + "output_file": "delly_mappability.gz.gzi", + "output_path": "genome", + }, + "delly_mappability_findex": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh38.delly.blacklist.gz.fai", + "file_type": "text", + "gzip": False, + "genome_version": "hg38", + "output_file": "delly_mappability.gz.fai", + "output_path": "genome", + }, "ascat_gccorrection": { "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/35465e2644f76f2d59427a9b379d34ecea71f259/cancer/references/hg38_SnpGcCorrections.tsv.gz", "file_type": "text", @@ -287,6 +311,30 @@ "output_file": "delly_exclusion.tsv", "output_path": "genome", }, + "delly_mappability": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh37.delly.blacklist.gz", + "file_type": "text", + "gzip": False, + "genome_version": "hg19", + "output_file": "delly_mappability.gz", + "output_path": "genome", + }, + "delly_mappability_gindex": { + "url": 
"https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh37.delly.blacklist.gz.gzi", + "file_type": "text", + "gzip": False, + "genome_version": "hg19", + "output_file": "delly_mappability.gz.gzi", + "output_path": "genome", + }, + "delly_mappability_findex": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh37.delly.blacklist.gz.fai", + "file_type": "text", + "gzip": False, + "genome_version": "hg19", + "output_file": "delly_mappability.gz.fai", + "output_path": "genome", + }, "ascat_gccorrection": { "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/12a6c760fd542c02de2cda286b6245e46f4b6a97/cancer/references/GRCh37_SnpGcCorrections.tsv.gz", "file_type": "text", diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index 575457923..f65780c87 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -71,13 +71,20 @@ "sequencing_type": ["targeted"], "workflow_solution": ["BALSAMIC"], }, - "delly": { + "dellysv": { "mutation": "somatic", "type": "SV", "analysis_type": ["paired", "single"], "sequencing_type": ["targeted", "wgs"], "workflow_solution": ["BALSAMIC"], }, + "dellycnv": { + "mutation": "somatic", + "type": "CNV", + "analysis_type": ["single"], + "sequencing_type": ["targeted", "wgs"], + "workflow_solution": ["BALSAMIC"], + }, "ascat": { "mutation": "somatic", "type": "CNV", diff --git a/BALSAMIC/snakemake_rules/annotation/rankscore.rule b/BALSAMIC/snakemake_rules/annotation/rankscore.rule index 0d23dad80..1982b0f1c 100644 --- a/BALSAMIC/snakemake_rules/annotation/rankscore.rule +++ b/BALSAMIC/snakemake_rules/annotation/rankscore.rule @@ -19,11 +19,11 @@ rule genmod_score_vardict: threads: get_threads(cluster_config, 'genmod_score_vardict') message: - ("Score annotated vardict variants using 
genmod" - "and compress vcf using bcftools on {params.case_name}") + ("Scoring annotated vardict variants using genmod for {params.case_name}") shell: """ genmod score -r -c {input.rankscore} {input.vcf} | \ + bcftools view -o {output.vcf_pass} -O z; tabix -p vcf -f {output.vcf_pass}; diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule index d3457faf9..532702769 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule @@ -28,7 +28,7 @@ rule bcftools_filter_vardict_tumor_normal: threads: get_threads(cluster_config, 'bcftools_filter_vardict_tumor_normal') message: - "Filtering vardict tumor-normal annotated variants using bcftools on {params.case_name}" + "Filtering vardict tumor-normal annotated variants using bcftools for {params.case_name}" shell: """ bcftools view {input.vcf} \ @@ -68,7 +68,7 @@ rule bcftools_filter_tnhaplotyper_tumor_normal: threads: get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_normal') message: - "Filtering tnhaplotyper tumor-normal annotated variants using bcftools on {params.case_name}" + "Filtering tnhaplotyper tumor-normal annotated variants using bcftools for {params.case_name}" shell: """ bcftools view {input.vcf} \ @@ -102,7 +102,7 @@ rule bcftools_filter_TNscope_umi_tumor_normal: threads: get_threads(cluster_config, 'bcftools_filter_TNscope_umi_tumor_normal') message: - "Filtering TNscope_umi tumor-normal annotated variants using bcftools on {params.case_name}" + "Filtering TNscope_umi tumor-normal annotated variants using bcftools for {params.case_name}" shell: """ bcftools view {input.vcf} \ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule index 247177bf2..7c90f12f8 100644 --- 
a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule @@ -66,7 +66,7 @@ rule bcftools_filter_tnhaplotyper_tumor_only: threads: get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_only') message: - "Filtering tnhaplotyper tumor-only annotated variants using bcftools on {params.case_name}" + "Filtering tnhaplotyper tumor-only annotated variants using bcftools for {params.case_name}" shell: """ bcftools view {input.vcf} \ @@ -100,7 +100,7 @@ rule bcftools_filter_TNscope_umi_tumor_only: threads: get_threads(cluster_config, 'bcftools_filter_TNscope_umi_tumor_only') message: - "Filtering TNscope_umi tumor-only annotated variants using bcftools on {params.case_name}" + "Filtering TNscope_umi tumor-only annotated variants using bcftools for {params.case_name}" shell: """ bcftools view {input.vcf} \ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule index e1588a79a..cb7529d83 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule @@ -18,7 +18,7 @@ rule bcftools_filter_svdb: threads: get_threads(cluster_config, "bcftools_filter_svdb") message: - "Filtering svdb merged Manta and Delly results for PASS variants using bcftools for sample '{params.case_name}' " + "Filtering merged structural and copy number variants using bcftools for {params.case_name}" shell: """ bcftools view --threads {threads} -f .,PASS -o {output.vcf_pass_svdb} -O z {input.vcf}; diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule index 23a229a0d..017937cd3 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule @@ 
-26,7 +26,7 @@ rule bcftools_filter_tnscope_tumor_normal: threads: get_threads(cluster_config, 'bcftools_filter_tnscope_tumor_normal') message: - "Filtering wgs tumor-normal tnscope annotated variants using bcftools on {params.case_name}" + "Filtering WGS tumor-normal tnscope annotated variants using bcftools for {params.case_name}" shell: """ bcftools view {input.vcf} \ @@ -65,7 +65,7 @@ rule bcftools_filter_tnhaplotyper_tumor_normal: threads: get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_normal') message: - "Filtering wgs tumor-normal tnhaplotyper annotated variants using bcftools on {params.case_name}" + "Filtering WGS tumor-normal tnhaplotyper annotated variants using bcftools for {params.case_name}" shell: """ grep -v '^@' {input.wgs_calling_file} > {input.wgs_calling_file}.bed diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule index d6bd339ed..4a73b17be 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule @@ -28,7 +28,7 @@ rule bcftools_filter_tnscope_tumor_only: threads: get_threads(cluster_config, 'bcftools_filter_tnscope_tumor_only') message: - "Filtering wgs tumor-only tnscope annotated variants using bcftools on {params.case_name}" + "Filtering WGS tumor-only tnscope annotated variants using bcftools for {params.case_name}" shell: """ grep -v '^@' {input.wgs_calling_file} > {input.wgs_calling_file}.bed @@ -72,7 +72,7 @@ rule bcftools_filter_tnhaplotyper_tumor_only: threads: get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_only') message: - "Filtering wgs tumor-only tnhaplotyper annotated variants using bcftools on {params.case_name}" + "Filtering WGS tumor-only tnhaplotyper annotated variants using bcftools for {params.case_name}" shell: """ grep -v '^@' {input.wgs_calling_file} > 
{input.wgs_calling_file}.bed diff --git a/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule b/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule index 54cdaa2eb..ab6e51c5b 100644 --- a/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule +++ b/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule @@ -17,7 +17,7 @@ rule vcf2cytosure_convert: params: case_name = config["analysis"]["case_id"], housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv-somatic"}, - message: "Convert VCF file with CNVs to the .CGH format using vcf2cytosure for sample {params.case_name}" + message: "Converting CNVs from VCF to the CGH format using vcf2cytosure for {params.case_name}" shell: """ vcf2cytosure --vcf {input.cnv_vcf} --cn {input.cnv_cnr} --out {output.cgh_file} --bins 1 diff --git a/BALSAMIC/snakemake_rules/annotation/vep.rule b/BALSAMIC/snakemake_rules/annotation/vep.rule index d31cda315..a79526cc6 100644 --- a/BALSAMIC/snakemake_rules/annotation/vep.rule +++ b/BALSAMIC/snakemake_rules/annotation/vep.rule @@ -25,7 +25,7 @@ rule vep_somatic_snv: threads: get_threads(cluster_config, "vep_somatic_snv") message: - "Running vep annotation on {params.message_text}" + "Running vep annotation for single nuceotide variants on {params.message_text}" shell: """ vep_path=$(dirname $(readlink -f $(which vep))); @@ -69,7 +69,7 @@ rule vep_somatic_sv: threads: get_threads(cluster_config, "vep_somatic_sv") message: - "Running vep annotation on {params.message_text}" + "Running vep annotation for structural and copy number variants on {params.message_text}" shell: """ vep_path=$(dirname $(readlink -f $(which vep))); @@ -106,7 +106,7 @@ rule tmb_calculation: threads: get_threads(cluster_config, "vep") message: - "Calculating TMB for {params.message_text}" + "Calculating TMB score for {params.message_text}" shell: """ mkdir -p {params.tmpdir}; @@ -163,7 +163,7 @@ rule vep_germline: threads: get_threads(cluster_config, 'vep_germline') message: - 
"Running vep annotation on {params.sample}" + "Running vep annotation on germline variants for {params.sample}" shell: """ vep_path=$(dirname $(readlink -f $(which vep))); diff --git a/BALSAMIC/snakemake_rules/umi/generate_AF_tables.rule b/BALSAMIC/snakemake_rules/umi/generate_AF_tables.rule index 56a0f62c9..4faf00b11 100644 --- a/BALSAMIC/snakemake_rules/umi/generate_AF_tables.rule +++ b/BALSAMIC/snakemake_rules/umi/generate_AF_tables.rule @@ -19,7 +19,7 @@ rule bcftools_query_generatebackgroundaf_umitable: threads: get_threads(cluster_config, "bcftools_query_generatebackgroundaf_umitable") message: - "Creating Allelic frequency table from VCF file for sample {params.case_name}" + "Creating Allelic frequency table from VCF file for {params.case_name}" shell: """ bcftools query \ diff --git a/BALSAMIC/snakemake_rules/umi/mergetype_normal_umi.rule b/BALSAMIC/snakemake_rules/umi/mergetype_normal_umi.rule index 55f07a4c7..ab2f5b401 100644 --- a/BALSAMIC/snakemake_rules/umi/mergetype_normal_umi.rule +++ b/BALSAMIC/snakemake_rules/umi/mergetype_normal_umi.rule @@ -21,12 +21,14 @@ rule mergeBam_normal_umiconsensus: threads: get_threads(cluster_config, "mergeBam_normal_umiconsensus") message: - ("Replace ReadGroups using picard for normal sample {params.sample} " - "and convert bam to cram format") + ("Replacing ReadGroups using picard and converting from bam to cram format for {params.sample}") shell: """ picard AddOrReplaceReadGroups {params.picard} INPUT={input.bam} OUTPUT={output.bam}; + samtools index {output.bam}; + samtools view -h -T {input.fasta} --threads {threads} -C -o {output.cram} {output.bam}; + samtools index {output.cram}; """ diff --git a/BALSAMIC/snakemake_rules/umi/mergetype_tumor_umi.rule b/BALSAMIC/snakemake_rules/umi/mergetype_tumor_umi.rule index 7915d6b00..f294f2e82 100644 --- a/BALSAMIC/snakemake_rules/umi/mergetype_tumor_umi.rule +++ b/BALSAMIC/snakemake_rules/umi/mergetype_tumor_umi.rule @@ -21,8 +21,7 @@ rule mergeBam_tumor_umiconsensus: 
threads: get_threads(cluster_config, "mergeBam_tumor_umiconsensus") message: - ("Replace ReadGroups using picard for tumor sample {params.sample} " - "and convert bam to cram") + ("Replacing ReadGroups using picard and converting from bam to cram for {params.sample}") shell: """ picard AddOrReplaceReadGroups {params.picard} INPUT={input.bam} OUTPUT={output.bam}; diff --git a/BALSAMIC/snakemake_rules/umi/qc_umi.rule b/BALSAMIC/snakemake_rules/umi/qc_umi.rule index e71eec2a4..a4b09f124 100644 --- a/BALSAMIC/snakemake_rules/umi/qc_umi.rule +++ b/BALSAMIC/snakemake_rules/umi/qc_umi.rule @@ -18,7 +18,7 @@ rule picard_umiaware: threads: get_threads(cluster_config, "picard_umiaware") message: - "Picard Umiaware mark dups for sample {params.sample_id}" + "Marking duplicates using Picardtools with UmiAware for {params.sample_id}" shell: """ picard UmiAwareMarkDuplicatesWithMateCigar \ @@ -47,7 +47,7 @@ rule picard_collecthsmetrics_umi: threads: get_threads(cluster_config, "CollectHsMetrics") message: - "Collect HSmetrics using Picardtools for {params.sample_id}" + "Collecting HSmetrics using Picardtools for {params.sample_id}" shell: """ picard BedToIntervalList \ diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule b/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule index e6f61bdc6..460833320 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule @@ -22,7 +22,7 @@ rule sentieon_consensuscall_umi: threads: get_threads(cluster_config, "sentieon_consensuscall_umi") message: - "Consensus molecule creation using sentieon for sample {params.sample_id}" + "Calling consensus molecules using sentieon for {params.sample_id}" shell: """ export LD_PRELOAD={params.sentieon_install_dir}/lib/libjemalloc.so.1 @@ -63,7 +63,7 @@ rule sentieon_bwa_umiconsensus: threads: get_threads(cluster_config, "sentieon_bwa_umiconsensus") message: - "Mapping of consensus reads with the sentieon bwa mem, 
sorting for sample {params.sample_id}" + "Mapping consensus reads and sorting using sentieon bwa-mem for {params.sample_id}" shell: """ export LD_PRELOAD={params.sentieon_install_dir}/lib/libjemalloc.so.1 @@ -106,7 +106,7 @@ rule sentieon_consensusfilter_umi: threads: get_threads(cluster_config, "sentieon_consensusfilter_umi") message: - "Filtering consensus reads based on XZ tag for sample {params.sample_id}" + "Filtering consensus reads based on XZ tag for {params.sample_id}" shell: """ samtools view -h {input} | \ diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_umiextract.rule b/BALSAMIC/snakemake_rules/umi/sentieon_umiextract.rule index 747dff6ea..71e5e2f73 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_umiextract.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_umiextract.rule @@ -22,7 +22,7 @@ rule sentieon_umiextract: threads: get_threads(cluster_config, "sentieon_umiextract") message: - "UMI tag extraction using sentieon for sample {params.sample_id}" + "Extracing UMI tags using sentieon for {params.sample_id}" shell: """ export LD_PRELOAD={params.sentieon_install_dir}/lib/libjemalloc.so.1 @@ -59,7 +59,7 @@ rule sentieon_bwa_umiextract: threads: get_threads(cluster_config, "sentieon_bwa_umiextract") message: - "Aligning of UMI extracted reads with sentieon bwa mem, sorting for sample {params.sample_id}" + "Aligning UMI extracted reads and sorting using sentieon bwa-mem for {params.sample_id}" shell: """ export LD_PRELOAD={params.sentieon_install_dir}/lib/libjemalloc.so.1 diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule index 0552421a6..cff784b12 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule @@ -33,7 +33,7 @@ rule sentieon_tnscope_umi: threads: get_threads(cluster_config, "sentieon_tnscope_umi") message: - "Calling SNVs using TNscope for sample {params.tumor}" + "Calling single 
nucleotide variants using TNscope for {params.tumor}" shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule index 4f9b3bfa1..64ded84bb 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule @@ -30,12 +30,12 @@ rule sentieon_tnscope_umi_tn: pcr_model = params.common.pcr_model, padding = params.tnscope_umi.padding, tumor = "TUMOR", - normal = "NORMAL" + normal = "NORMAL", + case_name= config["analysis"]["case_id"] threads: get_threads(cluster_config, "sentieon_tnscope_umi") message: - "Calling SNVs using TNscope for sample: {params.tumor}" - " versus sample {params.normal}" + "Calling single nucleotide variants using TNscope for {params.case_name}" shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_paired.rule b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_paired.rule index c8c834ab4..c2ee61797 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_paired.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_paired.rule @@ -35,7 +35,7 @@ rule cnvkit_paired: sample_id = "TUMOR", genome = GENOME_VERSION message: - "Run CNVkit pipeline for sample {params.case_name} while tumor purity/ploidy calculated using PureCN" + "Calling CNVs using CNVkit and calculating tumor purity/ploidy using PureCN for {params.case_name}" shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule index c3844ddc4..9a3dc08a8 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule @@ -44,8 +44,7 @@ rule cnvkit_single: genome_version = GENOME_VERSION, pon = " " if get_pon_cnn(config) is None else get_pon_cnn(config) message: - ("Run CNVkit 
pipeline for sample {params.case_name}," - "while tumor purity/ploidy calculated using PureCN") + ("Calling CNVs using CNVkit and calculating tumor purity/ploidy using PureCN for {params.case_name}") shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/germline.rule b/BALSAMIC/snakemake_rules/variant_calling/germline.rule index 8af31a0f1..d8229520f 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/germline.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/germline.rule @@ -21,8 +21,7 @@ rule gatk_haplotypecaller: threads: get_threads(cluster_config,'gatk_haplotypecaller') message: - ("Calling germline variants using gatk haplotypecaller for " - "targeted-panel sample {params.sample}") + ("Calling germline variants using gatk haplotypecaller for {params.sample}") shell: """ mkdir -p {params.tmpdir}; @@ -52,7 +51,7 @@ rule haplotypecaller_merge: tmpdir = tempfile.mkdtemp(prefix = tmp_dir), sample = '{sample_type}' message: - "Concatenate haplotyper outputs of multiple chr vcfs using bcftools for sample {params.sample}" + "Concatenating haplotyper outputs from multiple VCF files using bcftools for {params.sample}" shell: """ mkdir -p {params.tmpdir}; @@ -83,7 +82,7 @@ rule sentieon_DNAscope: threads: get_threads(cluster_config, 'sentieon_DNAscope') message: - "Calling germline variants using Sentieon DNAscope for sample {params.sample}" + "Calling germline variants using Sentieon DNAscope for {params.sample}" shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/germline_sv.rule b/BALSAMIC/snakemake_rules/variant_calling/germline_sv.rule index 11c5ed4ab..4eed3880e 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/germline_sv.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/germline_sv.rule @@ -20,7 +20,7 @@ rule manta_germline: threads: get_threads(cluster_config,"manta_germline") message: - "Calling germline variants using manta for sample {params.sample}" + "Calling germline 
variants using manta for {params.sample}" shell: """ configManta.py \ diff --git a/BALSAMIC/snakemake_rules/variant_calling/mergetype_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/mergetype_normal.rule index 488e43b86..4f1b26a3d 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/mergetype_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/mergetype_normal.rule @@ -26,7 +26,7 @@ rule mergeBam_normal: threads: get_threads(cluster_config, "mergeBam_normal") message: - "Replace bam header using Picard tools for normal sample {params.sample}" + "Replacing bam header using Picardtools for {params.sample}" shell: """ picard AddOrReplaceReadGroups {params.picard} INPUT={input.bam} OUTPUT={output.bam}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/mergetype_tumor.rule b/BALSAMIC/snakemake_rules/variant_calling/mergetype_tumor.rule index 2d3fe24ed..8ac8bbf3b 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/mergetype_tumor.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/mergetype_tumor.rule @@ -26,7 +26,7 @@ rule mergeBam_tumor: threads: get_threads(cluster_config, "mergeBam_tumor") message: - "Replace bam header using Picard tools for tumor sample {params.sample}" + "Replacing bam header using Picardtools for {params.sample}" shell: """ picard AddOrReplaceReadGroups {params.picard} INPUT={input.bam} OUTPUT={output.bam}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_germline.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_germline.rule index 8f32aa665..4d39675ae 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_germline.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_germline.rule @@ -23,7 +23,7 @@ rule sentieon_DNAscope: threads: get_threads(cluster_config, 'sentieon_DNAscope') message: - "Calling germline variants using Sentieon DNAscope for sample {params.sample}" + "Calling germline variants using Sentieon DNAscope for {params.sample}" shell: """ mkdir -p {params.tmpdir}; diff --git 
a/BALSAMIC/snakemake_rules/variant_calling/sentieon_split_snv_sv.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_split_snv_sv.rule index ca8189759..0751a0512 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_split_snv_sv.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_split_snv_sv.rule @@ -20,7 +20,7 @@ rule bcftools_view_split_variant: threads: get_threads(cluster_config, 'bcftools_view_split_variant') message: - "Split tnscope snv and sv variants using bcftools for sample {params.case_name}" + "Split tnscope snv and sv variants using bcftools for {params.case_name}" shell: """ export TMPDIR={params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule index f97ed8189..1c9750b5e 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule @@ -31,7 +31,7 @@ rule sentieon_base_calibration: threads: get_threads(cluster_config, 'sentieon_base_calibration') message: - "Base recalibration using sentieon tools for sample {params.sample}" + "Recalibrating bases using sentieon tools for {params.sample}" shell: """ mkdir -p {params.tmpdir}; @@ -96,7 +96,7 @@ rule sentieon_TNhaplotyper_tumor_only: threads: get_threads(cluster_config, 'sentieon_TNhaplotyper_tumor_only') message: - "Calling SNVs using sentieon TNhaplotyper for sample {params.case_name}" + "Calling SNVs using sentieon TNhaplotyper for {params.case_name}" shell: """ mkdir -p {params.tmpdir}; @@ -145,7 +145,7 @@ rule sentieon_TNscope_tumor_only: threads: get_threads(cluster_config, 'sentieon_TNscope_tumor_only') message: - "Calling SNVs using sentieon TNscope for sample {params.case_name}" + "Calling SNVs using sentieon TNscope for {params.case_name}" shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule 
b/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule index f2d372091..f2b756710 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule @@ -24,7 +24,7 @@ rule sentieon_base_calibration: threads: get_threads(cluster_config, 'sentieon_base_calibration') message: - "Base recalibration using Sentieon tools for sample {params.sample}" + "Base recalibration using Sentieon tools for {params.sample}" shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule index 85d970f77..d08d06914 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule @@ -54,8 +54,7 @@ echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap} rm -rf {params.tmpdir}; """ - -rule delly_tumor_normal: +rule delly_sv_tumor_normal: input: fa = config["reference"]["reference_genome"], bamN = bam_dir + normal_bam, @@ -63,9 +62,9 @@ rule delly_tumor_normal: excl = config["reference"]["delly_exclusion_converted"], output: final = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", - namemap = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.sample_name_map", + namemap = vcf_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".dellysv.sample_name_map", benchmark: - Path(benchmark_dir, 'delly_tumor_normal_' + config["analysis"]["case_id"] + ".tsv") + Path(benchmark_dir, 'delly_sv_tumor_normal_' + config["analysis"]["case_id"] + ".tsv") singularity: Path(singularity_image, config["bioinfo_tools"].get("delly") + ".sif").as_posix() params: @@ -91,12 +90,11 @@ echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap} rm -rf {params.tmpdir}; """ - rule bcftools_bcf2vcf_delly: input: bcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", output: - vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.vcf.gz", + vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".dellysv.vcf.gz", benchmark: Path(benchmark_dir, 'bcftools_bcf2vcf_delly_' + config["analysis"]["case_id"] + ".tsv") singularity: @@ -106,7 +104,7 @@ rule bcftools_bcf2vcf_delly: threads: get_threads(cluster_config, "bcftools_bcf2vcf_delly") message: - ("Convert bcf to vcf for structural variants called using delly for {params.case_name}") + ("Converting BCF from delly to VCF for {params.case_name}") shell: """ bcftools view --threads {threads} -f PASS -O z -o {output.vcf} {input.bcf}; @@ -114,7 +112,6 @@ bcftools view --threads {threads} -f PASS -O z -o {output.vcf} {input.bcf}; tabix -p vcf -f {output.vcf}; """ - rule ascat_tumor_normal: input: fa = config["reference"]["reference_genome"] , @@ -139,12 +136,12 @@ rule ascat_tumor_normal: tmpdir = tempfile.mkdtemp(prefix=tmp_dir), tumor = "TUMOR", normal = "NORMAL", - genome = GENOME_VERSION + genome = GENOME_VERSION, + case_name = config["analysis"]["case_id"] threads: get_threads(cluster_config, "ascat_tumor_normal") message: - ("Call copy number variants using ascatNGS for {input.bamT} vs {input.bamN} files then " - "filter somatic variants and finally convert to compressed vcf file") + ("Calling copy number variants using ascatNGS for {params.case_name}") shell: """ 
export LD_LIBRARY_PATH=:/opt/wtsi-cgp/lib; @@ -186,7 +183,6 @@ echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap} rm -rf {params.tmpdir}; """ - rule ascat_tumor_normal_merge_output: input: sample_statistics = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.samplestatistics.txt", @@ -204,7 +200,7 @@ rule ascat_tumor_normal_merge_output: threads: get_threads(cluster_config, "ascat_tumor_normal_merge_output") message: - "Merge the ascatNgs output plots together with the sample statistics into a single PDF" + "Merging the output plots and the sample statistics from ascatNGS into a single PDF" shell: """ python {params.merge_ascat_output_script} {output.ascat_output_pdf} {input.sample_statistics} {input.ascat_plots} @@ -234,7 +230,7 @@ rule svdb_merge_tumor_normal: threads: get_threads(cluster_config, "svdb_merge_tumor_normal") message: - "Merging SV and CNV results for all variants using svdb for sample '{params.case_name}' " + "Merging structural and copy number variants using SVDB for {params.case_name}" shell: """ svdb --merge --no_intra --bnd_distance 5000 --overlap 0.80 \ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule index 3787002a4..0572b732a 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule @@ -50,17 +50,16 @@ echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; rm -rf {params.tmpdir}; """ - -rule delly_tumor_only: +rule delly_sv_tumor_only: input: fa = config["reference"]["reference_genome"], bamT = bam_dir + tumor_bam, excl = config["reference"]["delly_exclusion_converted"], output: bcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", - namemap= vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.sample_name_map", + namemap= vcf_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".dellysv.sample_name_map", benchmark: - benchmark_dir + 'delly_tumor_only_' + config["analysis"]["case_id"] + ".tsv" + benchmark_dir + 'delly_sv_tumor_only_' + config["analysis"]["case_id"] + ".tsv" singularity: Path(singularity_image, config["bioinfo_tools"].get("delly") + ".sif").as_posix() params: @@ -71,8 +70,7 @@ rule delly_tumor_only: threads: get_threads(cluster_config, "delly_tumor_only") message: - ("Calling structural variants using delly for {params.case_name}," - "filter somatic variants and finally convert from bcf to compressed vcf file") + ("Calling structural variants using delly for {params.case_name}") shell: """ delly call -x {input.excl} -o {output.bcf} -g {input.fa} {input.bamT} @@ -82,11 +80,44 @@ echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; rm -rf {params.tmpdir}; """ -rule bcftools_bcf2vcf_delly: +rule delly_cnv_tumor_only: input: + fa = config["reference"]["reference_genome"], + bamT = bam_dir + tumor_bam, bcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", + map = config["reference"]["delly_mappability"], + output: + cnv = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", + namemap= vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".dellycnv.sample_name_map", + benchmark: + benchmark_dir + 'delly_cnv_tumor_only_' + config["analysis"]["case_id"] + ".tsv" + singularity: + Path(singularity_image, config["bioinfo_tools"].get("delly") + ".sif").as_posix() + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + runmode = "local", + tumor = "TUMOR", + case_name = config["analysis"]["case_id"] + threads: + get_threads(cluster_config, "delly_tumor_only") + message: + ("Calling copy number variants using delly for {params.case_name}") + shell: + """ +delly cnv -m {input.map} -g {input.fa} -o {output.cnv} -l {input.bcf} {input.bamT} + +echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; + +rm -rf {params.tmpdir}; + """ + +rule bcftools_bcf2vcf_delly: + input: + bcf_sv = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", + bcf_cnv= vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", output: - vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.vcf.gz", + vcf_sv = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".dellysv.vcf.gz", + vcf_cnv= vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".dellycnv.vcf.gz", benchmark: Path(benchmark_dir, 'bcftools_bcf2vcf_delly_' + config["analysis"]["case_id"] + ".tsv") singularity: @@ -96,12 +127,16 @@ rule bcftools_bcf2vcf_delly: threads: get_threads(cluster_config, "bcftools_bcf2vcf_delly") message: - ("Convert bcf to vcf for structural variants called using delly for {params.case_name}") + ("Converting BCF from delly to VCF for {params.case_name}") shell: """ -bcftools view --threads {threads} -f PASS -O z -o {output.vcf} {input.bcf}; +bcftools view --threads {threads} -f PASS -O z -o {output.vcf_sv} {input.bcf_sv}; + +bcftools view --threads {threads} -f PASS -O z -o {output.vcf_cnv} {input.bcf_cnv} + +tabix -p vcf -f {output.vcf_sv}; -tabix -p vcf -f {output.vcf}; +tabix -p vcf -f {output.vcf_cnv} """ rule svdb_merge_tumor_only: @@ -127,7 +162,7 @@ rule svdb_merge_tumor_only: threads: get_threads(cluster_config, "svdb_merge_tumor_only") message: - "Merging Manta and Delly results for PASS variants using svdb for sample '{params.case_name}' " + "Merging structural and copy number variants using SVDB for {params.case_name}" shell: """ svdb --merge --no_intra --bnd_distance 5000 --overlap 0.80 \ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule index e7a4de20f..532460b78 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule @@ -25,7 +25,7 @@ rule vardict_tumor_normal: threads: get_threads(cluster_config, "vardict_tumor_normal") message: - "Calling variants using vardict for sample {params.case_name}" + "Calling variants using vardict for {params.case_name}" shell: """ mkdir -p {params.tmpdir}; @@ -63,8 +63,7 @@ rule vardict_merge: threads: get_threads(cluster_config,"vardict_merge") message: - ("Merging all chromosomes vardict results into " - "single vcf using bcftools for sample 
{params.case_name}") + ("Merging multiple VCFs from vardict into single VCF using bcftools for {params.case_name}") shell: """ mkdir -p {params.tmpdir}; @@ -103,7 +102,7 @@ rule sentieon_TNhaplotyper: threads: get_threads(cluster_config, 'sentieon_TNhaplotyper') message: - "Calling variants using TNhaplotyper for sample {params.case_name}" + "Calling single nucleotide variants using TNhaplotyper for {params.case_name}" shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule index 1369c2f37..37992d751 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule @@ -22,7 +22,7 @@ rule vardict_tumor_only: threads: get_threads(cluster_config, "vardict_tumor_only") message: - "Calling variants using vardict for sample {params.case_name}" + "Calling single nucleotide variants using vardict for {params.case_name}" shell: """ export PERL5LIB=; @@ -64,8 +64,7 @@ rule vardict_merge: threads: get_threads(cluster_config,"vardict_merge") message: - ("Merging all chromosomes vardict results into " - "single vcf using bcftools for sample {params.case_name}") + ("Merging multiple VCFs from vardict into single VCF using bcftools for {params.case_name}") shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule b/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule index 856a95a51..747d7604e 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule @@ -19,8 +19,7 @@ rule bedtools_splitbed_by_chrom: split_bed_dir = vcf_dir + "split_bed/", origin_bed = capture_kit, message: - ("Capturing reference genome chromosome size and splitting the panel bed per chromosome" - "Extend the region by 100bp on each direction, sort and merge the overlapping intervals 
using bedtools") + ("Splitting the panel bed per chromosome, flanking regions by 100bp and merging into single VCF using bedtools") shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index 173d7ad37..606a3bb97 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -188,9 +188,10 @@ class VCFModel(BaseModel): TNscope_umi: VarcallerAttribute manta_germline: VarcallerAttribute manta: VarcallerAttribute - delly: VarcallerAttribute + dellysv: VarcallerAttribute cnvkit: VarcallerAttribute ascat: VarcallerAttribute + dellycnv: VarcallerAttribute svdb: VarcallerAttribute @@ -538,6 +539,7 @@ class ReferenceMeta(BaseModel): rankscore: ReferenceUrlsModel. Optional rankscore model access_regions: ReferenceUrlsModel. Optional field for accessible genome regions delly_exclusion: ReferenceUrlsModel. Optional field for genome exclusion regions + delly_mappability: ReferenceUrlsModel. Optional field for genome mappability ascat_gccorrection: ReferenceUrlsModel. Optional field for genome gc correction bins ascat_chryloci: ReferenceUrlsModel. Optional field for chromosome Y loci clinvar: ReferenceUrlsModel. 
Optional field for clinvar reference @@ -560,6 +562,9 @@ class ReferenceMeta(BaseModel): rankscore: Optional[ReferenceUrlsModel] access_regions: Optional[ReferenceUrlsModel] delly_exclusion: Optional[ReferenceUrlsModel] + delly_mappability: Optional[ReferenceUrlsModel] + delly_mappability_gindex: Optional[ReferenceUrlsModel] + delly_mappability_findex: Optional[ReferenceUrlsModel] ascat_gccorrection: Optional[ReferenceUrlsModel] ascat_chryloci: Optional[ReferenceUrlsModel] clinvar: Optional[ReferenceUrlsModel] diff --git a/BALSAMIC/workflows/reference.smk b/BALSAMIC/workflows/reference.smk index c73d06e59..6d5cfa0eb 100644 --- a/BALSAMIC/workflows/reference.smk +++ b/BALSAMIC/workflows/reference.smk @@ -66,6 +66,9 @@ refgene_sql_url = reference_file_model.refgene_sql rankscore_url = reference_file_model.rankscore access_regions_url = reference_file_model.access_regions delly_exclusion_url = reference_file_model.delly_exclusion +delly_mappability_url = reference_file_model.delly_mappability +delly_mappability_gindex_url = reference_file_model.delly_mappability_gindex +delly_mappability_findex_url = reference_file_model.delly_mappability_findex ascat_gccorrection_url = reference_file_model.ascat_gccorrection ascat_chryloci_url = reference_file_model.ascat_chryloci clinvar_url = reference_file_model.clinvar @@ -112,6 +115,9 @@ rule all: access_regions = access_regions_url.get_output_file, delly_exclusion = delly_exclusion_url.get_output_file, delly_exclusion_converted = delly_exclusion_url.get_output_file.replace(".tsv", "_converted.tsv"), + delly_mappability= delly_mappability_url.get_output_file, + delly_mappability_gindex= delly_mappability_gindex_url.get_output_file, + delly_mappability_findex= delly_mappability_findex_url.get_output_file, ascat_gccorrection = ascat_gccorrection_url.get_output_file, ascat_chryloci = ascat_chryloci_url.get_output_file, clinvar = clinvar_url.get_output_file + ".gz", @@ -147,6 +153,7 @@ rule all: "access_regions": 
input.access_regions, "delly_exclusion" : input.delly_exclusion, "delly_exclusion_converted" : input.delly_exclusion_converted, + "delly_mappability": input.delly_mappability, "ascat_gccorrection" : input.ascat_gccorrection, "ascat_chryloci" : input.ascat_chryloci, "clinvar": input.clinvar, @@ -180,7 +187,8 @@ download_content = [reference_genome_url, dbsnp_url, hc_vcf_1kg_url, wgs_calling_url, genome_chrom_size_url, gnomad_url, gnomad_tbi_url, cosmicdb_url, refgene_txt_url, refgene_sql_url, rankscore_url, access_regions_url, - delly_exclusion_url, ascat_gccorrection_url, ascat_chryloci_url, clinvar_url] + delly_exclusion_url, delly_mappability_url, delly_mappability_gindex_url, + delly_mappability_findex_url, ascat_gccorrection_url, ascat_chryloci_url, clinvar_url] rule download_reference: output: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e5a0c7fd6..d162b9b12 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,18 +9,19 @@ Added: * UMI duplication metrics to report in multiqc_picard_dups.json #844 * Option to use PON reference in cnv calling for TGA tumor-only cases * QC default validation conditions (for not defined capture kits) #855 -* SVdb to the varcall_py36 container #871 -* SVdb to WGS workflow #871 +* SVdb to the varcall_py36 container #872 +* SVdb to WGS workflow #873 * Docker container for vcf2cytosure #858 * Snakemake rule for creating `.cgh` files from `CNVkit` outputs #858 -* SVdb to TGA workflow #871 -* SVdb merge SV and CNV #871 +* SVdb to TGA workflow #879 +* SVdb merge SV and CNV #886 * Readthedocs for BALSAMIC method descriptions #892 * Readthedocs for BALSAMIC variant filters for WGS somatic callers #892 * bcftools counts to varcall filter rules #898 * Additional WGS metrics to be stored in ``_metrics_deliverables.yaml`` #907 -* ascatNGS copynumber file #897 +* ascatNGS copynumber file #914 * ReadtheDocs for BALSAMIC annotation resources #916 +* Delly CNV for tumor only workflow Changed: ^^^^^^^^ @@ -44,8 +45,8 @@ Removed * 
``--qc-metrics/--no-qc-metrics`` flag from the ``balsamic report deliver`` command #833 * Unused pon option for SNV calling with TNhaplotyper tumor-only -* SV and CNV callers from annotation and filtering #871 -* vcfanno from SV annotation +* SV and CNV callers from annotation and filtering #889 +* vcfanno and COSMIC from SV annotation #891 * Removed `MSK_impact` and `MSK_impact_noStrelka` json files from config * Cleanup of `strelka`, `pindel` , `mutect2` variables from BALSAMIC * bcftools_stats from vep #898 diff --git a/docs/balsamic_methods.rst b/docs/balsamic_methods.rst index 5c9ccd90e..34f1c9061 100644 --- a/docs/balsamic_methods.rst +++ b/docs/balsamic_methods.rst @@ -5,7 +5,7 @@ BALSAMIC METHODS Target Genome Analysis ~~~~~~~~~~~~~~~~~~~~~~ -BALSAMIC :superscript:`1` (**version** = 8.2.8) was used to analyze the data from raw FASTQ files. +BALSAMIC :superscript:`1` (**version** = 8.2.10) was used to analyze the data from raw FASTQ files. We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. Adapter sequences and low-quality bases were trimmed using fastp v0.20.1 :superscript:`3`. Trimmed reads were mapped to the reference genome hg19 using BWA MEM v0.7.15 :superscript:`4`. @@ -17,7 +17,7 @@ Small somatic mutations (SNVs and INDELs) were called for each sample using VarD Apart from the Vardict filters to report the variants, the called-variants were also further second filtered using the criteria (*MQ >= 40, DP >= 100, VD >= 5, Minimum AF >= 0.007, Maximum AF < 1, GNOMADAF_popmax <= 0.005*). Only those variants that fulfilled the filtering criteria and scored as `PASS` in the VCF file were reported. -Structural variants were called using Manta v1.6.0 :superscript:`9` and Delly v0.8.7 :superscript:`10`. +Structural variants were called using Manta v1.6.0 :superscript:`9` and Delly v0.9.1 :superscript:`10`. Copy number aberrations were called using CNVkit v0.9.4 :superscript:`11`. 
The variant calls from CNVkit, Manta and Delly were merged using SVDB v2.6.0 :superscript:`12`. All variants were annotated using Ensembl VEP v100.2 :superscript:`13`. We used vcfanno v0.3.3 :superscript:`14` @@ -25,7 +25,7 @@ to annotate somatic variants for their population allele frequency from gnomAD v Whole Genome Analysis ~~~~~~~~~~~~~~~~~~~~~ -BALSAMIC :superscript:`1` (**version** = 8.2.8) was used to analyze the data from raw FASTQ files. +BALSAMIC :superscript:`1` (**version** = 8.2.10) was used to analyze the data from raw FASTQ files. We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. Adapter sequences and low-quality bases were trimmed using fastp v0.20.1 :superscript:`3`. Trimmed reads were mapped to the reference genome hg19 using sentieon-tools :superscript:`15`. @@ -36,7 +36,7 @@ Results of the quality controlled steps were summarized by MultiQC v1.11 :supers Small somatic mutations (SNVs and INDELs) were called for each sample using Sentieon TNscope and TNhaplotyper :superscript:`16`. The called-variants were also further second filtered using the criteria (DP(tumor,normal) >= 10; AD(tumor) >= 3; AF(tumor) >= 0.05, Maximum AF(tumor < 1; GNOMADAF_popmax <= 0.001; normalized base quality scores >= 20, read_counts of alt,ref alle > 0). The filtered variants from TNscope and TNhaplotyper were merged using bcftools isec functionality to reduce the number of variants for tumor-only samples. -Structural variants were called using Manta v1.6.0 :superscript:`9` and Delly v0.8.7 :superscript:`10`. +Structural variants were called using Manta v1.6.0 :superscript:`9` and Delly v0.9.1 :superscript:`10`. Copy number aberrations were called using ascatNgs v4.5.0 :superscript:`17` for tumor-normal samples. The structural variant calls from Manta, Delly and ascatNgs were merged using SVDB v2.6.0 :superscript:`12` All variants were finally annotated using Ensembl VEP v100.2 :superscript:`13`. 
We used vcfanno v0.3.3 :superscript:`14` @@ -46,7 +46,7 @@ to annotate somatic variants for their population allele frequency from gnomAD v UMI Data Analysis ============================= -BALSAMIC :superscript:`1` (**version** = 8.2.8) was used to analyze the data from raw FASTQ files. +BALSAMIC :superscript:`1` (**version** = 8.2.10) was used to analyze the data from raw FASTQ files. We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. Adapter sequences and low-quality bases were trimmed using fastp v0.20.1 :superscript:`3`. UMI tag extraction and consensus generation were performed using Sentieon tools v202010.02 :superscript:`15`. @@ -65,7 +65,7 @@ We used three commercially available products from SeraCare [Material numbers: 0 **References** ~~~~~~~~~~~~~~~~ -1. Foroughi-Asl, H., Jeggari, A., Maqbool, K., Ivanchuk, V., Elhami, K., & Wirta, V. BALSAMIC: Bioinformatic Analysis pipeLine for SomAtic MutatIons in Cancer (Version v8.2.8) [Computer software]. https://github.com/Clinical-Genomics/BALSAMIC +1. Foroughi-Asl, H., Jeggari, A., Maqbool, K., Ivanchuk, V., Elhami, K., & Wirta, V. BALSAMIC: Bioinformatic Analysis pipeLine for SomAtic MutatIons in Cancer (Version v8.2.10) [Computer software]. https://github.com/Clinical-Genomics/BALSAMIC 2. Babraham Bioinformatics - FastQC A Quality Control tool for High Throughput Sequence Data. Accessed June 22, 2020. https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ 3. Chen S, Zhou Y, Chen Y, Gu J. fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics. 2018;34(17):i884-i890. doi:10.1093/bioinformatics/bty560 4. Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 
arXiv:1303.3997v2 [q-bio.GN] diff --git a/docs/bioinfo_softwares.rst b/docs/bioinfo_softwares.rst index 00a17a036..29e14d4a8 100644 --- a/docs/bioinfo_softwares.rst +++ b/docs/bioinfo_softwares.rst @@ -46,7 +46,7 @@ delly ~~~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `0.8.7` +:Version: `0.9.1` ensembl-vep ~~~~~~~~~~~ diff --git a/tests/conftest.py b/tests/conftest.py index 9286aaf1a..25ca26a09 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -75,6 +75,9 @@ def reference(): "access_regions": "tests/test_data/references/genome/access-5k-mappable.hg19.bed", "delly_exclusion": "tests/test_data/references/genome/delly_exclusion.tsv", "delly_exclusion_converted": "tests/test_data/references/genome/delly_exclusion_converted.tsv", + "delly_mappability": "tests/test_data/references/genome/delly_mappability.gz", + "delly_mappability_gindex": "tests/test_data/references/genome/delly_mappability.gz.gzi", + "delly_mappability_findex": "tests/test_data/references/genome/delly_mappability.fai", "ascat_gccorrection": "tests/test_data/references/genome/GRCh37_SnpGcCorrections.tsv", "ascat_chryloci": "tests/test_data/references/genome/GRCh37_Y.loci", "clinvar": "tests/test_data/references/genome/clinvar.vcf.gz", diff --git a/tests/test_data/references/genome/delly_mappability.gz b/tests/test_data/references/genome/delly_mappability.gz new file mode 100644 index 0000000000000000000000000000000000000000..f7083a45167a0919cf51543d8af5308b1ea3397c GIT binary patch literal 66 zcmb2|=3rp}f&Xj_PR>jWstlWrpL|l5?8%Z`62&01@wLG@Mg|6XG*!|J%wSC*0s!)V B41@px literal 0 HcmV?d00001 diff --git a/tests/test_data/references/genome/delly_mappability.gz.fai b/tests/test_data/references/genome/delly_mappability.gz.fai new file mode 100644 index 000000000..af9ba0e09 --- /dev/null +++ b/tests/test_data/references/genome/delly_mappability.gz.fai @@ -0,0 +1,10 @@ +1 248956422 3 50 51 +2 242193529 253935557 50 51 +3 198295559 500972960 50 51 +4 190214555 703234434 50 51 
+5 181538259 897253284 50 51 +6 170805979 1082422312 50 51 +7 159345973 1256644414 50 51 +8 145138636 1419177310 50 51 +9 138394717 1567218722 50 51 +10 133797422 1708381338 50 51 diff --git a/tests/test_data/references/genome/delly_mappability.gz.gzi b/tests/test_data/references/genome/delly_mappability.gz.gzi new file mode 100644 index 0000000000000000000000000000000000000000..076cb8ee78cfc3bf24d924669b296765effe57dd GIT binary patch literal 5626 zcmXZgc~lR37zXfPM2pfwAxcCfm8DV=Dp873q#_m7#jTJfON1<`=yGv|x|K>HKWnKJ zLXuEXBBG>}Xrst>&wb{Zf8O^q&wI|CGv_-qXTG8=5srx?gev+i)D(OAmjoL7Kk1)6 z&{PD|KYF4`kEXv%qIr-)clSc0T}*%LjmG~o{Z$H0M^E88qAqE8#wfZ|23}%Cf02b_ z7SW&kz^}H_9ddB?LAqTY&Nxc9^@STx(ya<`auVI52y33DKPkasV)~;pYu2qK>Z0H&dSj(P%GZcRBNWa#EeO>74VQ{q@{c1Rz=t;lSf>$r2tAvH5;zz$2 ziTuPq`nfh7l1W#Nfsv@K+=HsVs)Avl^QeV2j6xLZm-<=HK_NVWd!9GFs?J02CdOF`6PS`}>nhHmT(l@8U0b%rw z>2StgI?n=DilTEZ;r3|yx)pr=5S=pvPCY_rTf-}l(OEO$@Dud4S#Wh6ooNGiCeRtS z@bx76>TLLM8hynMHcY243oouFF@0$c@{p_a#ksI!4t>D^ex65*9bv6}`usfj+8sLG z2_9HTpL2#!7SU(t!x0bZGYeqF5<1NV9{q$)T?m&~(5GGD6EEl#H~398o$L;G)zV3e zVCj1L)MEH!Bc130w=~lUOJM6(I^Gkm`$GTg1@HPs$9cm+Kk3+|u(*fto)?{52D{17 zCw$-*Ir_LStgTGP_`!Qs=wpAtWoq<4%i-3c^wAaYRxSF-N;q7bJ}i8g0(9uV{gDqJ zOCJh=qx9*6f$)Hd^k1vsbH;S^YS?`;eP9i&Z%*$If=#XHsI~AmYdUfroNG%*tcTki z=zSaD04F+J_>e{{p!aS>Ugb*f*#x_K(7QLoP2TjbEpUk+9TozoucUWwg?$6*9oyji zAbNW!>=aCgZifeNp||aTl|t#QJK@}&bVwL18%}T81#gI?H}8gz9Hckxf!`jXH|~Wc zkJG{7aC98KVISO_NUx88ho#c%BH_q%dTkWkc99O+FU)7qYYxEP+4SmY_)#vs>MuC- z79DsHetw4zI0U;C(*A$Lxew_-55otZ&?}F?&Xx3vqp-;oPj$h(DTp2 z(k8U?IrxV;?UW8bv!dsnhi}ZH9mR0GJ?(G-HgcrrUW6|=({nDtX)d(=Ww_Xlwz~q` zdeF14!lvG|Z3dj{L)&D+KFjG@*I+k)dS({fzJ|8WhTVhd89DIu5ZdZGJZU>^nG5HK z(H41dcsM=%1}qarPrC_=qv@%);L=01c|Kfll%8@M);ms{-GS?4>B)EDhD6%505%ZQ zCimdWh4iFC*t3H+z7NYv^+pvL6`?k0(S{FT3wwHEF+6iEJ>enj7fX+S1kW#|$32GQ zKhOpxaOxLYzZ6#JruE9;<-h2$PvAa1q)CVS8($vdRR65OqbStEzD1#hrWSl8POUw zu$Kv~UJENtp@+PMYp2qK>)=6_wAwp(sWm;Q9$sWa4{U$~?C1gS;mWzRY9oByiB@TX zy=$_1Zd#@jwvVKxyI}WdTIwr&{t(^!8|<7w_v(f_lWEECaB&*l^9L+S 
zr+fT_OD@t9zu;4uG=KQxbLii{ksBA#zeE!MpKxfVe@ehN<)kqu`q2aKX-t0?e&oG8 z=x#~myVld+dcp1c>94)vvSW0Y6l@encS^(J1p12%yeOIeEDOt|(j9%^(`V^+Ie3Yf zZj*<_m+98Nuuc};q5u!hqdzIaV{g$PmEd=`>1JhE{vQ3IAFNSCH}!|V71NC>aN1+~ zy((N*MmG$AEP2KRhTza0$6*U+^?V23)oMjf8lK)=y|H#O0( zhr$;=($$)9K^y&Q7~K7temNX2@YuI)NeQhS(xQEW11qVgZ88)!zLHepK9C(zzG8?Xlp)cFP z`(o)!_ONO?eQ^%#pF>}m3s>dQVh6ZSK7HO1w!TZJ&w~x`)90Mv%t!QDXSlwUJ~JPl zQ% Date: Thu, 28 Apr 2022 16:25:58 +0200 Subject: [PATCH 54/58] feat: add delly CNV read-depth profile (#924) * update changelog * update changelog * add copy-number profile to delly tumor only * add copy-number profile to delly tumor only --- .../snakemake_rules/variant_calling/somatic_sv_tumor_only.rule | 3 ++- CHANGELOG.rst | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule index 0572b732a..ac11b2ae5 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule @@ -88,6 +88,7 @@ rule delly_cnv_tumor_only: map = config["reference"]["delly_mappability"], output: cnv = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", + rd = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".delly.cov.gz", namemap= vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".dellycnv.sample_name_map", benchmark: benchmark_dir + 'delly_cnv_tumor_only_' + config["analysis"]["case_id"] + ".tsv" @@ -104,7 +105,7 @@ rule delly_cnv_tumor_only: ("Calling copy number variants using delly for {params.case_name}") shell: """ -delly cnv -m {input.map} -g {input.fa} -o {output.cnv} -l {input.bcf} {input.bamT} +delly cnv -m {input.map} -g {input.fa} -c {output.rd} -o {output.cnv} -l {input.bcf} {input.bamT} echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d162b9b12..2a4a8fdfd 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -21,7 +21,8 @@ Added: * Additional WGS metrics to be stored in ``_metrics_deliverables.yaml`` #907 * ascatNGS copynumber file #914 * ReadtheDocs for BALSAMIC annotation resources #916 -* Delly CNV for tumor only workflow +* Delly CNV for tumor only workflow #923 +* Delly CNV Read-depth profiles for tumor only workflows #924 Changed: ^^^^^^^^ From 91aeda9cb9d157058d91e09f6c103bda2edc3d41 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Fri, 29 Apr 2022 13:15:51 +0200 Subject: [PATCH 55/58] refactor: Remove gatk haplotypcaller (#922) * remove gatk haplotypecallers * remove haplotypecaller and tnsnv from cluster json * remove haplotypecaller from models * remove haplotypecaller from workflow params * remove haplotypcaller and tnsnv from analysis json * remove unused callers from tests * update changelog --- BALSAMIC/config/analysis.json | 8 --- BALSAMIC/config/cluster.json | 16 ----- .../variant_calling/germline.rule | 63 ------------------- BALSAMIC/utils/models.py | 1 - CHANGELOG.rst | 1 + tests/conftest.py | 2 - tests/test_data/config.json | 24 ------- 7 files changed, 1 insertion(+), 114 deletions(-) diff --git a/BALSAMIC/config/analysis.json b/BALSAMIC/config/analysis.json index 1caec394e..bbfd71a83 100644 --- a/BALSAMIC/config/analysis.json +++ b/BALSAMIC/config/analysis.json @@ -30,10 +30,6 @@ "mutation": "somatic", "type": "SNV" }, - 
"tnsnv": { - "mutation": "somatic", - "type": "SNV" - }, "tnhaplotyper": { "mutation": "somatic", "type": "SNV" @@ -46,10 +42,6 @@ "mutation": "germline", "type": "SV" }, - "haplotypecaller": { - "mutation": "germline", - "type": "SNV" - }, "vcfmerge":{ "mutation": "somatic", "type": "SNV" diff --git a/BALSAMIC/config/cluster.json b/BALSAMIC/config/cluster.json index edd6965f1..d1e1a6a17 100644 --- a/BALSAMIC/config/cluster.json +++ b/BALSAMIC/config/cluster.json @@ -52,14 +52,6 @@ "time": "12:00:00", "n": 5 }, - "gatk_haplotypecaller": { - "time": "03:00:00", - "n": 10 - }, - "haplotypecaller_merge": { - "time": "01:30:00", - "n": 8 - }, "manta_germline": { "time": "05:00:00", "n": 16 @@ -128,14 +120,6 @@ "time": "24:00:00", "n": 24 }, - "sentieon_TNsnv": { - "time": "24:00:00", - "n": 24 - }, - "sentieon_TNsnv_tumor_only": { - "time": "24:00:00", - "n": 24 - }, "sentieon_align_sort": { "time": "24:00:00", "n": 24 diff --git a/BALSAMIC/snakemake_rules/variant_calling/germline.rule b/BALSAMIC/snakemake_rules/variant_calling/germline.rule index d8229520f..97fe45bc0 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/germline.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/germline.rule @@ -1,69 +1,6 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 - - -rule gatk_haplotypecaller: - input: - fa = config["reference"]["reference_genome"], - bam = bam_dir + "{sample_type}.merged.bam", - bed = vcf_dir + "split_bed/{bedchrom}." 
+ capture_kit, - output: - vcf_dir + "haplotypecaller/split_vcf/{sample_type}.{bedchrom}_haplotypecaller.vcf.gz" - benchmark: - Path(benchmark_dir,'gatk_haplotypecaller_' + "{sample_type}.{bedchrom}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("gatk") + ".sif").as_posix() - params: - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - sample = '{sample_type}', - gatk_path = '/opt/conda/opt/gatk-3.8' - threads: - get_threads(cluster_config,'gatk_haplotypecaller') - message: - ("Calling germline variants using gatk haplotypecaller for {params.sample}") - shell: - """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; - -java -jar -Djava.io.tmpdir={params.tmpdir} -Xms8G -Xmx32G {params.gatk_path}/GenomeAnalysisTK.jar \ --T HaplotypeCaller \ --R {input.fa} \ --I {input.bam} \ --L {input.bed} \ -| bgzip > {output}; - -rm -rf {params.tmpdir}; - """ - - -rule haplotypecaller_merge: - input: - expand(vcf_dir + "haplotypecaller/split_vcf/{{sample_type}}.{chrom}_haplotypecaller.vcf.gz", chrom=chromlist) - output: - vcf_dir + "SNV.germline.{sample_type}.haplotypecaller.vcf.gz" - benchmark: - Path(benchmark_dir, 'haplotypecaller_merge_' + "SNV.germline.{sample_type}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("gatk") + ".sif").as_posix() - params: - tmpdir = tempfile.mkdtemp(prefix = tmp_dir), - sample = '{sample_type}' - message: - "Concatenating haplotyper outputs from multiple VCF files using bcftools for {params.sample}" - shell: - """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; - -bcftools concat {input} | bcftools sort --temp-dir {params.tmpdir} - | bgzip > {output}; -tabix -f -p vcf {output}; - -rm -rf {params.tmpdir}; - """ - - rule sentieon_DNAscope: input: bam = bam_dir + "{sample_type}.merged.bam", diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index 606a3bb97..dc300a54b 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -184,7 
+184,6 @@ class VCFModel(BaseModel): tnscope: VarcallerAttribute dnascope: VarcallerAttribute tnhaplotyper: VarcallerAttribute - haplotypecaller: VarcallerAttribute TNscope_umi: VarcallerAttribute manta_germline: VarcallerAttribute manta: VarcallerAttribute diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2a4a8fdfd..af9926c52 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -53,6 +53,7 @@ Removed * bcftools_stats from vep #898 * QC delivery report workflow (generating the ``_qc_report.html`` file) #878 * ``--sample-id-map`` and ``--case-id-map`` flags from the ``balsamic report deliver`` command #878 +* Removed `gatk_haplotypecaller` for reporting panel germline variants #918 [8.2.10] -------- diff --git a/tests/conftest.py b/tests/conftest.py index 25ca26a09..bfbd55af8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -603,11 +603,9 @@ def sample_config(): "vardict": {"mutation": "somatic", "type": "SNV"}, "mutect": {"mutation": "somatic", "type": "SNV"}, "tnscope": {"mutation": "somatic", "type": "SNV"}, - "tnsnv": {"mutation": "somatic", "type": "SNV"}, "tnhaplotyper": {"mutation": "somatic", "type": "SNV"}, "dnascope": {"mutation": "germline", "type": "SNV"}, "manta_germline": {"mutation": "germline", "type": "SV"}, - "haplotypecaller": {"mutation": "germline", "type": "SNV"}, }, "samples": { "S1_R": { diff --git a/tests/test_data/config.json b/tests/test_data/config.json index a9e5ba822..3619e0789 100644 --- a/tests/test_data/config.json +++ b/tests/test_data/config.json @@ -24,36 +24,12 @@ "merged": "manta_germline.vcf.gz", "type": "SV" }, - "strelka_germline": { - "default": ["variants.vcf.gz", "germline.S1.vcf.gz"], - "mutation": "germline", - "merged": "strelka_germline.vcf.gz", - "type": "SNV" - }, - "strelka": { - "default": ["somatic.snvs.vcf.gz", "somatic.indels.vcf.gz"], - "mutation": "somatic", - "merged": "strelka.vcf.gz", - "type": "SNV" - }, "mutect": { "default": "mutect.vcf.gz", "mutation": "somatic", "merged": "mutect.vcf.gz", "type": 
"SNV" }, - "freebayes": { - "default": "freebayes.vcf.gz", - "mutation": "germline", - "merged": "freebayes.vcf.gz", - "type": "SNV" - }, - "haplotypecaller": { - "default": "haplotypecaller.vcf.gz", - "mutation": "germline", - "merged": "haplotypecaller.vcf.gz", - "type": "SNV" - }, "vardict": { "default": "vardict.vcf.gz", "mutation": "somatic", From 8142fac90c81f6bec957f2d36709dcadcd9b28c3 Mon Sep 17 00:00:00 2001 From: ashwini06 Date: Mon, 2 May 2022 09:30:07 +0200 Subject: [PATCH 56/58] refactor: balsamic containers (#921) * update align_qc base image * update align_qc tool versions * add tabix version * remove csvkit from align_qc * remove csvkit frm bioinfo_tool env * update align _qc container tool versions in readthedocs * add samtools versions to tests * update changelog * update base image in coverage_qc container * update tool versions in cover_qc container * update tool versions in bioinfo softwares docs * update changelog * update base image in container varcall_cnvkit * update cnvkit version * update purecn version and lock bcftools and tabix versions * update docs and changelog * update base image in varcall_py36 container * update tools in varcall_py36 * update samtools version in docs * update changelog * update base image of annotate container * update ensembl vep in annotate container * update readthedocs for vep version * update changelog * fix typo in varcall_py27 --- BALSAMIC/config/balsamic_env.yaml | 1 - BALSAMIC/constants/common.py | 1 - BALSAMIC/containers/align_qc/Dockerfile | 4 +-- BALSAMIC/containers/align_qc/align_qc.yaml | 15 ++++------ BALSAMIC/containers/annotate/Dockerfile | 4 +-- BALSAMIC/containers/annotate/annotate.yaml | 6 ++-- BALSAMIC/containers/coverage_qc/Dockerfile | 4 +-- .../containers/coverage_qc/coverage_qc.yaml | 4 +-- BALSAMIC/containers/varcall_cnvkit/Dockerfile | 4 +-- .../varcall_cnvkit/varcall_cnvkit.sh | 2 +- .../varcall_cnvkit/varcall_cnvkit.yaml | 6 ++-- BALSAMIC/containers/varcall_py27/Dockerfile | 2 +- 
BALSAMIC/containers/varcall_py36/Dockerfile | 4 +-- .../containers/varcall_py36/varcall_py36.yaml | 6 ++-- CHANGELOG.rst | 3 ++ docs/balsamic_methods.rst | 30 +++++++++---------- docs/bioinfo_softwares.rst | 28 +++++++---------- tests/utils/test_utils.py | 2 +- 18 files changed, 58 insertions(+), 68 deletions(-) diff --git a/BALSAMIC/config/balsamic_env.yaml b/BALSAMIC/config/balsamic_env.yaml index c3ecb6b34..ce56b91ce 100644 --- a/BALSAMIC/config/balsamic_env.yaml +++ b/BALSAMIC/config/balsamic_env.yaml @@ -6,7 +6,6 @@ align_qc: - picard - multiqc - fastp - - csvkit annotate: - ensembl-vep - vcfanno diff --git a/BALSAMIC/constants/common.py b/BALSAMIC/constants/common.py index ecf9eca5e..44a75a7a0 100644 --- a/BALSAMIC/constants/common.py +++ b/BALSAMIC/constants/common.py @@ -63,7 +63,6 @@ "picard": "align_qc", "multiqc": "align_qc", "fastp": "align_qc", - "csvkit": "align_qc", "ensembl-vep": "annotate", "genmod": "annotate", "vcfanno": "annotate", diff --git a/BALSAMIC/containers/align_qc/Dockerfile b/BALSAMIC/containers/align_qc/Dockerfile index fe942c7d3..e49620ab6 100644 --- a/BALSAMIC/containers/align_qc/Dockerfile +++ b/BALSAMIC/containers/align_qc/Dockerfile @@ -1,6 +1,6 @@ -FROM continuumio/miniconda3:4.9.2-alpine +FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL base.image="continuumio/miniconda3:4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" diff --git a/BALSAMIC/containers/align_qc/align_qc.yaml b/BALSAMIC/containers/align_qc/align_qc.yaml index 4e0e7444a..2fbde5eb4 100644 --- a/BALSAMIC/containers/align_qc/align_qc.yaml +++ b/BALSAMIC/containers/align_qc/align_qc.yaml @@ -3,14 +3,11 @@ channels: dependencies: - bioconda::bedtools=2.30.0 - - bioconda::bwa=0.7.15 + - bioconda::bwa=0.7.17 - bioconda::fastqc=0.11.9 - - bioconda::samtools=1.12 + - 
bioconda::samtools=1.15.1 - bioconda::tabix=0.2.6 - - bioconda::picard=2.25.0 - - bioconda::multiqc=1.11 - - bioconda::fastp=0.20.1 - - conda-forge::csvkit=1.0.4 - - conda-forge::libiconv - - conda-forge::fontconfig - - conda-forge::r-base=4.1.1 + - bioconda::picard=2.27.1 + - bioconda::multiqc=1.12 + - bioconda::fastp=0.23.2 + - conda-forge::r-base=4.1.3 diff --git a/BALSAMIC/containers/annotate/Dockerfile b/BALSAMIC/containers/annotate/Dockerfile index f30c8c301..8c25633ab 100644 --- a/BALSAMIC/containers/annotate/Dockerfile +++ b/BALSAMIC/containers/annotate/Dockerfile @@ -1,6 +1,6 @@ -FROM continuumio/miniconda3:4.9.2-alpine +FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL base.image="continuumio/miniconda3:4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" diff --git a/BALSAMIC/containers/annotate/annotate.yaml b/BALSAMIC/containers/annotate/annotate.yaml index 8b30f319e..40fe8cb5c 100644 --- a/BALSAMIC/containers/annotate/annotate.yaml +++ b/BALSAMIC/containers/annotate/annotate.yaml @@ -1,13 +1,11 @@ channels: - - anaconda - defaults - - conda-forge dependencies: - anaconda::python=3.7 - - bioconda::ensembl-vep=100.2 - - bioconda::bcftools=1.10 - conda-forge::libopenblas=0.3.20 + - bioconda::ensembl-vep=104.3 + - bioconda::bcftools=1.10 - bioconda::vcfanno=0.3.3 - anaconda::gxx_linux-64=7.3.0 - anaconda::pip=20.2.4 diff --git a/BALSAMIC/containers/coverage_qc/Dockerfile b/BALSAMIC/containers/coverage_qc/Dockerfile index fe942c7d3..e49620ab6 100644 --- a/BALSAMIC/containers/coverage_qc/Dockerfile +++ b/BALSAMIC/containers/coverage_qc/Dockerfile @@ -1,6 +1,6 @@ -FROM continuumio/miniconda3:4.9.2-alpine +FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL base.image="continuumio/miniconda3:4.10.3-alpine" LABEL 
about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" diff --git a/BALSAMIC/containers/coverage_qc/coverage_qc.yaml b/BALSAMIC/containers/coverage_qc/coverage_qc.yaml index e796ce73a..70540ce90 100644 --- a/BALSAMIC/containers/coverage_qc/coverage_qc.yaml +++ b/BALSAMIC/containers/coverage_qc/coverage_qc.yaml @@ -3,5 +3,5 @@ channels: - conda-forge dependencies: - - bioconda::sambamba=0.6.6 - - bioconda::mosdepth=0.2.9 + - bioconda::sambamba=0.8.2 + - bioconda::mosdepth=0.3.3 diff --git a/BALSAMIC/containers/varcall_cnvkit/Dockerfile b/BALSAMIC/containers/varcall_cnvkit/Dockerfile index f30c8c301..8c25633ab 100644 --- a/BALSAMIC/containers/varcall_cnvkit/Dockerfile +++ b/BALSAMIC/containers/varcall_cnvkit/Dockerfile @@ -1,6 +1,6 @@ -FROM continuumio/miniconda3:4.9.2-alpine +FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL base.image="continuumio/miniconda3:4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" diff --git a/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.sh b/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.sh index cfd487f09..1a649ce4b 100644 --- a/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.sh +++ b/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.sh @@ -1,2 +1,2 @@ conda env update -n base --file ${1}.yaml --prune -pip install --no-cache-dir cnvkit==0.9.4 biopython==1.76 +pip install --no-cache-dir cnvkit==0.9.9 biopython==1.79 diff --git a/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.yaml b/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.yaml index 45b5882f1..465b8b479 100644 --- a/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.yaml +++ b/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.yaml @@ -12,6 +12,6 @@ dependencies: - 
bioconda::bioconductor-genomicranges=1.46.0 - bioconda::bioconductor-dnacopy=1.68.0 - bioconda::bioconductor-variantannotation=1.40.0 - - bioconda::bioconductor-purecn=2.0.1 - - bioconda::bcftools>=1.13 - - bioconda::tabix>=0.2.6 + - bioconda::bioconductor-purecn=2.0.2 + - bioconda::bcftools=1.13 + - bioconda::tabix=0.2.6 diff --git a/BALSAMIC/containers/varcall_py27/Dockerfile b/BALSAMIC/containers/varcall_py27/Dockerfile index 856d2eef3..367c8c646 100644 --- a/BALSAMIC/containers/varcall_py27/Dockerfile +++ b/BALSAMIC/containers/varcall_py27/Dockerfile @@ -1,6 +1,6 @@ FROM continuumio/miniconda:4.7.12 -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL base.image="continuumio/miniconda:4.7.12" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" diff --git a/BALSAMIC/containers/varcall_py36/Dockerfile b/BALSAMIC/containers/varcall_py36/Dockerfile index fe942c7d3..e49620ab6 100644 --- a/BALSAMIC/containers/varcall_py36/Dockerfile +++ b/BALSAMIC/containers/varcall_py36/Dockerfile @@ -1,6 +1,6 @@ -FROM continuumio/miniconda3:4.9.2-alpine +FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL base.image="continuumio/miniconda3:4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" diff --git a/BALSAMIC/containers/varcall_py36/varcall_py36.yaml b/BALSAMIC/containers/varcall_py36/varcall_py36.yaml index dec8bbdd7..0505a41e2 100644 --- a/BALSAMIC/containers/varcall_py36/varcall_py36.yaml +++ b/BALSAMIC/containers/varcall_py36/varcall_py36.yaml @@ -7,8 +7,8 @@ dependencies: - bioconda::tabix=0.2.6 - bioconda::samtools=1.11 - bioconda::gatk=3.8 - - bioconda::vardict=2019.06.04=pl526_0 - - bioconda::vardict-java=1.7 + - bioconda::vardict=2019.06.04 + - 
bioconda::vardict-java=1.8.3 - bioconda::svdb=2.6.0 - conda-forge::libiconv - - conda-forge::r-base=3.6.3 + - conda-forge::r-base=4.1.1 diff --git a/CHANGELOG.rst b/CHANGELOG.rst index af9926c52..ab198ec48 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -33,6 +33,9 @@ Changed: * Upgrade black to 22.3.0 * For UMI workflow, post filter `gnomad_pop_freq` value is changed from `0.005` to `0.02` #919 * updated delly to 0.9.1 #920 +* container base_image (align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py36) to 4.10.3-alpine #921 +* update container (align_qc, annotate, coverage_qc, varcall_cnvkit,varcall_py36) bioinfo tool versions #921 +* update tool versions (align_qc, annotate, coverage_qc, varcall_cnvkit) in methods and softwares docs #921 Fixed: ^^^^^^ diff --git a/docs/balsamic_methods.rst b/docs/balsamic_methods.rst index 34f1c9061..de08e90fe 100644 --- a/docs/balsamic_methods.rst +++ b/docs/balsamic_methods.rst @@ -7,39 +7,39 @@ Target Genome Analysis BALSAMIC :superscript:`1` (**version** = 8.2.10) was used to analyze the data from raw FASTQ files. We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. -Adapter sequences and low-quality bases were trimmed using fastp v0.20.1 :superscript:`3`. -Trimmed reads were mapped to the reference genome hg19 using BWA MEM v0.7.15 :superscript:`4`. -The resulted SAM files were converted to BAM files and sorted using samtools v1.12 :superscript:`5`. -Duplicated reads were marked using Picard tools MarkDuplicate v2.25.0 :superscript:`6` +Adapter sequences and low-quality bases were trimmed using fastp v0.23.2 :superscript:`3`. +Trimmed reads were mapped to the reference genome hg19 using BWA MEM v0.7.17 :superscript:`4`. +The resulted SAM files were converted to BAM files and sorted using samtools v1.15.1 :superscript:`5`. 
+Duplicated reads were marked using Picard tools MarkDuplicate v2.27.1 :superscript:`6` and promptly quality controlled using CollectHsMetrics, CollectInsertSizeMetrics and CollectAlignmentSummaryMetrics functionalities. -Results of the quality controlled steps were summarized by MultiQC v1.11 :superscript:`7`. +Results of the quality controlled steps were summarized by MultiQC v1.12 :superscript:`7`. Small somatic mutations (SNVs and INDELs) were called for each sample using VarDict v2019.06.04 :superscript:`8`. Apart from the Vardict filters to report the variants, the called-variants were also further second filtered using the criteria (*MQ >= 40, DP >= 100, VD >= 5, Minimum AF >= 0.007, Maximum AF < 1, GNOMADAF_popmax <= 0.005*). Only those variants that fulfilled the filtering criteria and scored as `PASS` in the VCF file were reported. Structural variants were called using Manta v1.6.0 :superscript:`9` and Delly v0.9.1 :superscript:`10`. -Copy number aberrations were called using CNVkit v0.9.4 :superscript:`11`. +Copy number aberrations were called using CNVkit v0.9.9 :superscript:`11`. The variant calls from CNVkit, Manta and Delly were merged using SVDB v2.6.0 :superscript:`12`. -All variants were annotated using Ensembl VEP v100.2 :superscript:`13`. We used vcfanno v0.3.3 :superscript:`14` +All variants were annotated using Ensembl VEP v104.3 :superscript:`13`. We used vcfanno v0.3.3 :superscript:`14` to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`. Whole Genome Analysis ~~~~~~~~~~~~~~~~~~~~~ BALSAMIC :superscript:`1` (**version** = 8.2.10) was used to analyze the data from raw FASTQ files. We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. -Adapter sequences and low-quality bases were trimmed using fastp v0.20.1 :superscript:`3`. +Adapter sequences and low-quality bases were trimmed using fastp v0.23.2 :superscript:`3`. 
Trimmed reads were mapped to the reference genome hg19 using sentieon-tools :superscript:`15`. -The resulted SAM files were converted to BAM files and sorted using samtools v1.12 :superscript:`5`. -Duplicated reads were marked using Picard tools MarkDuplicate v2.25.0 :superscript:`6` +The resulted SAM files were converted to BAM files and sorted using samtools v1.15.1 :superscript:`5`. +Duplicated reads were marked using Picard tools MarkDuplicate v2.27.1 :superscript:`6` and promptly quality controlled using CollectMultipleMetrics and CollectWgsMetrics functionalities. -Results of the quality controlled steps were summarized by MultiQC v1.11 :superscript:`7`. +Results of the quality controlled steps were summarized by MultiQC v1.12 :superscript:`7`. Small somatic mutations (SNVs and INDELs) were called for each sample using Sentieon TNscope and TNhaplotyper :superscript:`16`. The called-variants were also further second filtered using the criteria (DP(tumor,normal) >= 10; AD(tumor) >= 3; AF(tumor) >= 0.05, Maximum AF(tumor < 1; GNOMADAF_popmax <= 0.001; normalized base quality scores >= 20, read_counts of alt,ref alle > 0). The filtered variants from TNscope and TNhaplotyper were merged using bcftools isec functionality to reduce the number of variants for tumor-only samples. Structural variants were called using Manta v1.6.0 :superscript:`9` and Delly v0.9.1 :superscript:`10`. Copy number aberrations were called using ascatNgs v4.5.0 :superscript:`17` for tumor-normal samples. The structural variant calls from Manta, Delly and ascatNgs were merged using SVDB v2.6.0 :superscript:`12` -All variants were finally annotated using Ensembl VEP v100.2 :superscript:`13`. We used vcfanno v0.3.3 :superscript:`14` +All variants were finally annotated using Ensembl VEP v104.3 :superscript:`13`. We used vcfanno v0.3.3 :superscript:`14` to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`. 
============================= @@ -48,16 +48,16 @@ UMI Data Analysis BALSAMIC :superscript:`1` (**version** = 8.2.10) was used to analyze the data from raw FASTQ files. We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. -Adapter sequences and low-quality bases were trimmed using fastp v0.20.1 :superscript:`3`. +Adapter sequences and low-quality bases were trimmed using fastp v0.23.2 :superscript:`3`. UMI tag extraction and consensus generation were performed using Sentieon tools v202010.02 :superscript:`15`. The alignment of UMI extracted and consensus called reads to the human reference genome (hg19) was done by bwa-mem and samtools using Sentieon utils. Consensus reads were filtered based on the number of minimum reads supporting each UMI tag group. We applied a criteria filter of minimum reads `3,1,1`. It means that at least three UMI tag groups should be ideally considered from both DNA strands, where a minimum of at least one UMI tag group should exist in each single-stranded consensus read. -The filtered consensus reads were quality controlled using Picard CollectHsMetrics v2.25.0 :superscript:`5`. Results of the quality controlled steps were summarized by MultiQC v1.11 :superscript:`6`. +The filtered consensus reads were quality controlled using Picard CollectHsMetrics v2.27.1 :superscript:`5`. Results of the quality controlled steps were summarized by MultiQC v1.12 :superscript:`6`. For each sample, somatic mutations were called using Sentieon TNscope :superscript:`16`, with non-default parameters for passing the final list of variants (--min_tumor_allele_frac 0.0005, --filter_t_alt_frac 0.0005, --min_init_tumor_lod 0.5, min_tumor_lod 4, --max_error_per_read 5 --pcr_indel_model NONE, GNOMADAF_popmax <= 0.001). -All variants were finally annotated using Ensembl VEP v100.2 :superscript:`7`. We used vcfanno v0.3.3 :superscript:`8` to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`. 
+All variants were finally annotated using Ensembl VEP v104.3 :superscript:`7`. We used vcfanno v0.3.3 :superscript:`8` to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`. For exact parameters used for each software, please refer to https://github.com/Clinical-Genomics/BALSAMIC. We used three commercially available products from SeraCare [Material numbers: 0710-067110 :superscript:`19`, 0710-067211 :superscript:`20`, 0710-067312 :superscript:`21`] for validating the efficiency of the UMI workflow in identifying 14 mutation sites at known allelic frequencies. diff --git a/docs/bioinfo_softwares.rst b/docs/bioinfo_softwares.rst index 29e14d4a8..2de11c2fe 100644 --- a/docs/bioinfo_softwares.rst +++ b/docs/bioinfo_softwares.rst @@ -16,7 +16,7 @@ bcftools ~~~~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `>1.9` +:Version: `>=1.10` bedtools ~~~~~~~~ @@ -28,19 +28,13 @@ bwa ~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `0.7.15` +:Version: `0.7.17` cnvkit ~~~~~~ :Source code: `GitHub` ``_ :Article: `PLOS Computational Biology` ``_ -:Version: `0.9.4` - -csvkit -~~~~~~ -:Source code: `GitHub` ``_ -:Article: `-` -:Version: `1.0.4` +:Version: `0.9.9` delly ~~~~~~~ @@ -52,13 +46,13 @@ ensembl-vep ~~~~~~~~~~~ :Source code: `GitHub` ``_ :Article: `Genome Biology` ``_ -:Version: `100.2` +:Version: `104.3` fastp ~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `0.20.1` +:Version: `0.23.2` fastqc ~~~~~~ @@ -82,31 +76,31 @@ multiqc ~~~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `1.11` +:Version: `1.12` mosdepth ~~~~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `0.2.9` +:Version: `0.3.3` picard ~~~~~~ :Source code: `GitHub` ``_ :Article: `-` -:Version: `2.25.0` +:Version: `2.27.1` sambamba ~~~~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `0.6.6` +:Version: `0.8.2` 
samtools ~~~~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `1.12` +:Version: `>1.11` sentieon-tools ~~~~~~~~~~~~~~ @@ -124,7 +118,7 @@ tabix ~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `0.2.6` +:Version: `1.11` vardict ~~~~~~~ diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index f0ee2ef0b..77bf50a55 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -187,7 +187,7 @@ def test_get_bioinfo_tools_version(): # THEN assert it is a dictionary and versions are correct assert isinstance(bioinfo_tools_dict, dict) - assert set(observed_versions).issubset(set(["1.12", "1.11", "1.9"])) + assert set(observed_versions).issubset(set(["1.15.1", "1.12", "1.11", "1.9"])) def test_get_delivery_id(): From 5120e1799e8aead018eedd005682517b318af449 Mon Sep 17 00:00:00 2001 From: ivadym Date: Mon, 2 May 2022 12:55:44 +0200 Subject: [PATCH 57/58] refactor: Update the list of files to be stored and delivered (#915) --- BALSAMIC/assets/scripts/collect_qc_metrics.py | 4 +- BALSAMIC/constants/workflow_rules.py | 38 +++--- .../varcaller_filter_tumor_normal.rule | 1 - .../varcaller_filter_tumor_only.rule | 1 - .../varcaller_wgs_filter_tumor_normal.rule | 1 - .../varcaller_wgs_filter_tumor_only.rule | 2 - BALSAMIC/snakemake_rules/annotation/vep.rule | 5 +- .../quality_control/fastp.rule | 9 +- .../umi/sentieon_varcall_tnscope.rule | 5 +- .../umi/sentieon_varcall_tnscope_tn.rule | 5 +- .../variant_calling/sentieon_t_varcall.rule | 5 +- .../variant_calling/sentieon_tn_varcall.rule | 5 +- .../somatic_sv_tumor_normal.rule | 37 ++--- .../somatic_sv_tumor_only.rule | 14 +- .../variant_calling/somatic_tumor_normal.rule | 9 +- .../variant_calling/somatic_tumor_only.rule | 9 +- BALSAMIC/workflows/balsamic.smk | 129 +++++++++++------- CHANGELOG.rst | 2 + tests/commands/report/test_deliver.py | 6 +- tests/conftest.py | 48 ++++++- tests/helpers.py | 32 +++++ tests/utils/test_utils.py | 32 +++++ 22 files 
changed, 275 insertions(+), 124 deletions(-) diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py index e376430f4..33834ec10 100755 --- a/BALSAMIC/assets/scripts/collect_qc_metrics.py +++ b/BALSAMIC/assets/scripts/collect_qc_metrics.py @@ -51,8 +51,8 @@ def capture_kit_resolve_type(capture_kit: str): if capture_kit == "None": return None - else: - return capture_kit + + return capture_kit def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str: diff --git a/BALSAMIC/constants/workflow_rules.py b/BALSAMIC/constants/workflow_rules.py index bbd5c0d0a..4f77c2731 100644 --- a/BALSAMIC/constants/workflow_rules.py +++ b/BALSAMIC/constants/workflow_rules.py @@ -113,31 +113,31 @@ DELIVERY_RULES = [ - "fastp", "multiqc", "collect_custom_qc_metrics", - "vep_somatic_snv", - "vep_somatic_sv", - "vep_germline", - "tmb_calculation", - "bcftools_filter_TNscope_umi_tumor_only", - "bcftools_filter_TNscope_umi_tumor_normal", - "bcftools_filter_vardict_tumor_only", - "bcftools_filter_vardict_tumor_normal", - "bcftools_filter_tnscope_tumor_only", - "bcftools_filter_tnscope_tumor_normal", - "bcftools_filter_tnhaplotyper_tumor_only", - "bcftools_filter_tnhaplotyper_tumor_normal", - "bcftools_filter_svdb", - "bcftools_intersect_tumor_only", - "bcftools_filter_TNscope_umi_tumor_only", - "genmod_score_vardict", "mergeBam_tumor", "mergeBam_normal", "mergeBam_tumor_umiconsensus", "mergeBam_normal_umiconsensus", - "cnvkit_paired", + "vep_germline", + "svdb_merge_tumor_only", + "svdb_merge_tumor_normal", + "sentieon_TNscope_tumor_only", + "sentieon_TNscope", + "vardict_merge", + "sentieon_tnscope_umi", + "sentieon_tnscope_umi_tn", + "ascat_tumor_normal", + "ascat_tumor_normal_merge_output", + "delly_cnv_tumor_only", "cnvkit_single", + "cnvkit_paired", "vcf2cytosure_convert", - "ascat_tumor_normal_merge_output", + "bcftools_filter_svdb", + "bcftools_intersect_tumor_only", + "bcftools_filter_tnscope_tumor_normal", + 
"bcftools_filter_vardict_tumor_only", + "bcftools_filter_vardict_tumor_normal", + "bcftools_filter_TNscope_umi_tumor_only", + "bcftools_filter_TNscope_umi_tumor_normal", ] diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule index 532702769..a6068ea4e 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule @@ -63,7 +63,6 @@ rule bcftools_filter_tnhaplotyper_tumor_normal: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: pop_freq = [COMMON_FILTERS.pop_freq.tag_value, COMMON_FILTERS.pop_freq.filter_name], - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_normal') diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule index 7c90f12f8..90e338f76 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule @@ -61,7 +61,6 @@ rule bcftools_filter_tnhaplotyper_tumor_only: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: pop_freq = [COMMON_FILTERS.pop_freq.tag_value, COMMON_FILTERS.pop_freq.filter_name], - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_only') diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule index 017937cd3..4473d2974 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule +++ 
b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule @@ -60,7 +60,6 @@ rule bcftools_filter_tnhaplotyper_tumor_normal: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: pop_freq = [SENTIEON_CALLER.pop_freq.tag_value, SENTIEON_CALLER.pop_freq.filter_name], - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_normal') diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule index 4a73b17be..79befa5e7 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule @@ -23,7 +23,6 @@ rule bcftools_filter_tnscope_tumor_only: strand_reads = [SENTIEON_CALLER.strand_reads.tag_value, SENTIEON_CALLER.strand_reads.filter_name], qss = [SENTIEON_CALLER.qss.tag_value, SENTIEON_CALLER.qss.filter_name], sor = [SENTIEON_CALLER.sor.tag_value, SENTIEON_CALLER.sor.filter_name], - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_tnscope_tumor_only') @@ -67,7 +66,6 @@ rule bcftools_filter_tnhaplotyper_tumor_only: pop_freq = [SENTIEON_CALLER.pop_freq.tag_value, SENTIEON_CALLER.pop_freq.filter_name], strand_reads = [SENTIEON_CALLER.strand_reads.tag_value, SENTIEON_CALLER.strand_reads.filter_name], qss = [SENTIEON_CALLER.qss.tag_value, SENTIEON_CALLER.qss.filter_name], - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_only') diff --git a/BALSAMIC/snakemake_rules/annotation/vep.rule b/BALSAMIC/snakemake_rules/annotation/vep.rule index a79526cc6..ad30ab891 100644 --- 
a/BALSAMIC/snakemake_rules/annotation/vep.rule +++ b/BALSAMIC/snakemake_rules/annotation/vep.rule @@ -15,7 +15,6 @@ rule vep_somatic_snv: singularity: Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() params: - housekeeper_id = {"id": "{case_name}", "tags": "annotated-somatic"}, ref_path = Path(config["reference"]["gnomad_variant"]).parent.as_posix(), message_text = "SNV.somatic.{case_name}.{var_caller}.vcf.gz", tmpvcf = vep_dir + "SNV.somatic.{case_name}.{var_caller}.tmp.vcf.gz", @@ -62,7 +61,6 @@ rule vep_somatic_sv: singularity: Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() params: - housekeeper_id = {"id": "{case_name}", "tags": "annotated-somatic"}, message_text = "SV.somatic.{case_name}.svdb.vcf.gz", vep_cache = config["reference"]["vep"], vep_defaults = params.vep.vep_filters @@ -100,7 +98,6 @@ rule tmb_calculation: params: af_cutoff = "0.05", bed = config["panel"]["capture_kit"] if "panel" in config else "", - housekeeper_id = {"id": "{case_name}", "tags": "stat-somatic"}, message_text = "{var_type}.somatic.{case_name}.{var_caller}.all", tmpdir = tempfile.mkdtemp(prefix=tmp_dir), threads: @@ -156,7 +153,7 @@ rule vep_germline: singularity: Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() params: - housekeeper_id = {"id": "{sample}", "tags": "annotated-germline"}, + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "annotated-germline"}, sample = '{sample}', vep_cache = config["reference"]["vep"], vep_defaults = params.vep.vep_filters diff --git a/BALSAMIC/snakemake_rules/quality_control/fastp.rule b/BALSAMIC/snakemake_rules/quality_control/fastp.rule index 0c1bd57fe..06296f4fc 100644 --- a/BALSAMIC/snakemake_rules/quality_control/fastp.rule +++ b/BALSAMIC/snakemake_rules/quality_control/fastp.rule @@ -32,8 +32,8 @@ rule fastp_umi: read1=config["analysis"]["fastq_path"] + "{sample}" + "_1.fastq.gz", 
read2=config["analysis"]["fastq_path"] + "{sample}" + "_2.fastq.gz", output: - read1 = fastq_dir + "{sample}_1.umi_optimized.fastq.gz", - read2 = fastq_dir + "{sample}_2.umi_optimized.fastq.gz", + read1 = temp(fastq_dir + "{sample}_1.umi_optimized.fastq.gz"), + read2 = temp(fastq_dir + "{sample}_2.umi_optimized.fastq.gz"), json = qc_dir + "fastp/{sample}_fastp_umi.json", html = qc_dir + "fastp/{sample}_fastp_umi.html", benchmark: @@ -73,8 +73,8 @@ rule fastp: read1 = fastq_dir + "{sample}_1.umi_optimized.fastq.gz", read2 = fastq_dir + "{sample}_2.umi_optimized.fastq.gz" output: - read1 = fastq_dir + "{sample}_1.fp.fastq.gz", - read2 = fastq_dir + "{sample}_2.fp.fastq.gz", + read1 = temp(fastq_dir + "{sample}_1.fp.fastq.gz"), + read2 = temp(fastq_dir + "{sample}_2.fp.fastq.gz"), json = qc_dir + "fastp/{sample}_fastp.json", html = qc_dir + "fastp/{sample}_fastp.html" benchmark: @@ -82,7 +82,6 @@ rule fastp: singularity: Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix() params: - housekeeper_id = {"id": "{sample}", "tags": "quality-trimmed-fastq"}, tmpdir = tmp_dir, umi = " ".join(fastp_param_umi), minimum_length = config["QC"]["min_seq_length"], diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule index cff784b12..6dac5db99 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule @@ -12,11 +12,12 @@ rule sentieon_tnscope_umi: bed = config["panel"]["capture_kit"], dbsnp = config["reference"]["dbsnp"] output: - vcf = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".TNscope_umi.vcf.gz", + vcf_tnscope_umi = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".TNscope_umi.vcf.gz", namemap = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".TNscope_umi.sample_name_map" benchmark: Path(benchmark_dir, "sentieon_tnscope_umi_" + config["analysis"]["case_id"] + ".tsv").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), sentieon_exec = config["SENTIEON_EXEC"], sentieon_lic = config["SENTIEON_LICENSE"], @@ -58,7 +59,7 @@ export SENTIEON_LICENSE={params.sentieon_lic}; --max_error_per_read {params.error_rate} \ --pcr_indel_model {params.pcr_model} \ --prune_factor {params.prune_factor} \ -{output.vcf}; +{output.vcf_tnscope_umi}; echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; """ diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule index 64ded84bb..7205afc2c 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule @@ -12,11 +12,12 @@ rule sentieon_tnscope_umi_tn: bed = config["panel"]["capture_kit"], dbsnp = config["reference"]["dbsnp"] output: - vcf = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".TNscope_umi.vcf.gz", + vcf_tnscope_umi = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".TNscope_umi.vcf.gz", namemap = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".TNscope_umi.sample_name_map" benchmark: Path(benchmark_dir, "sentieon_tnscope_umi_" + config["analysis"]["case_id"] + ".tsv").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), sentieon_exec = config["SENTIEON_EXEC"], sentieon_lic = config["SENTIEON_LICENSE"], @@ -62,7 +63,7 @@ export SENTIEON_LICENSE={params.sentieon_lic}; --max_error_per_read {params.error_rate} \ --pcr_indel_model {params.pcr_model} \ --prune_factor {params.prune_factor} \ -{output.vcf}; +{output.vcf_tnscope_umi}; echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap}; """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule index 1c9750b5e..75a8cefcf 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule @@ -127,12 +127,13 @@ rule sentieon_TNscope_tumor_only: bam = expand(bam_dir + "tumor.merged.bam"), recal = expand(bam_dir + "tumor.merged.recal_data.table") output: - vcf = vcf_dir + "sentieon_tnscope" + "/" + "ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", + vcf_tnscope = vcf_dir + "sentieon_tnscope" + "/" + "ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", namemap_snv = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.sample_name_map", namemap_sv = vcf_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope.sample_name_map", benchmark: Path(benchmark_dir, "sentieon_TNscope_tumor_only_" + config["analysis"]["case_id"] + ".tsv").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), tumor = "TUMOR", tumor_options = VARCALL_PARAMS["tnscope"]["tumor"], @@ -162,7 +163,7 @@ export SENTIEON_LICENSE={params.sentieon_lic}; --tumor_sample {params.tumor} {params.pon} \ --dbsnp {input.dbsnp} \ --pcr_indel_mode {params.pcr_model} \ -{params.tumor_options} {output.vcf}; +{params.tumor_options} {output.vcf_tnscope}; echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap_snv}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule index f2b756710..91113ff1b 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule @@ -162,12 +162,13 @@ rule sentieon_TNscope: recalT = expand(bam_dir + "tumor.merged.recal_data.table"), recalN = expand(bam_dir + "normal.merged.recal_data.table"), output: - vcf_all = vcf_dir + "sentieon_tnscope/ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", + vcf_tnscope = vcf_dir + "sentieon_tnscope/ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", namemap_snv = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.sample_name_map", namemap_sv = vcf_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope.sample_name_map", benchmark: Path(benchmark_dir, 'sentieon_TNscope_' + config[ "analysis" ][ "case_id" ] + ".tsv").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), tumor = "TUMOR", normal = "NORMAL", @@ -211,7 +212,7 @@ intermediate_vcf={params.tmpdir}/tn_sentieon_varcall_file -r {input.ref} \ --algo TNModelApply \ -m {params.sentieon_ml_tnscope} \ --v $intermediate_vcf {output.vcf_all}; +-v $intermediate_vcf {output.vcf_tnscope}; echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap_snv}; cp {output.namemap_snv} {output.namemap_sv} diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule index d08d06914..6c5fe40fb 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule @@ -122,17 +122,20 @@ rule ascat_tumor_normal: output: final_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.vcf.gz", ascat_copynumber = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.copynumber.txt.gz", - sample_statistics = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.samplestatistics.txt", - ascat_plots= expand( - vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat." + "{output_suffix}" + ".png", - output_suffix=["ascatprofile", "rawprofile", "ASPCF", "tumor", "germline", "sunrise"] - ), + sample_statistics = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.samplestatistics.txt"), + plot_ascat_profile = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.ascatprofile.png"), + plot_raw_profile = temp(vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".ascat.rawprofile.png"), + plot_aspcf = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.ASPCF.png"), + plot_tumor = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.tumor.png"), + plot_germline = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.germline.png"), + plot_sunrise = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.sunrise.png"), namemap = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.sample_name_map", benchmark: benchmark_dir + 'ascat_tumor_normal_' + config["analysis"]["case_id"] + "_ascat.tsv" singularity: Path(singularity_image, config["bioinfo_tools"].get("ascatNgs") + ".sif").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), tumor = "TUMOR", normal = "NORMAL", @@ -164,17 +167,17 @@ cp {params.tmpdir}/{params.tumor}.copynumber.txt.gz {output.ascat_copynumber} cp {params.tmpdir}/{params.tumor}.samplestatistics.txt {output.sample_statistics}; -cp {params.tmpdir}/{params.tumor}.ASCATprofile.png {output.ascat_plots[0]}; +cp {params.tmpdir}/{params.tumor}.ASCATprofile.png {output.plot_ascat_profile}; -cp {params.tmpdir}/{params.tumor}.rawprofile.png {output.ascat_plots[1]}; +cp {params.tmpdir}/{params.tumor}.rawprofile.png {output.plot_raw_profile}; -cp {params.tmpdir}/{params.tumor}.ASPCF.png {output.ascat_plots[2]}; +cp {params.tmpdir}/{params.tumor}.ASPCF.png {output.plot_aspcf}; -cp {params.tmpdir}/{params.tumor}.tumour.png {output.ascat_plots[3]}; +cp {params.tmpdir}/{params.tumor}.tumour.png {output.plot_tumor}; -cp {params.tmpdir}/{params.tumor}.germline.png {output.ascat_plots[4]}; +cp {params.tmpdir}/{params.tumor}.germline.png {output.plot_germline}; -cp {params.tmpdir}/{params.tumor}.sunrise.png {output.ascat_plots[5]}; +cp {params.tmpdir}/{params.tumor}.sunrise.png {output.plot_sunrise}; tabix -p vcf -f 
{output.final_vcf}; @@ -191,9 +194,9 @@ rule ascat_tumor_normal_merge_output: output_suffix=["ascatprofile", "rawprofile", "ASPCF", "tumor", "germline", "sunrise"] ), output: - ascat_output_pdf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.output.pdf" + ascat_pdf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.output.pdf" params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, merge_ascat_output_script= get_script_path("create_pdf.py"), singularity: Path(singularity_image, "balsamic.sif").as_posix() @@ -203,7 +206,7 @@ rule ascat_tumor_normal_merge_output: "Merging the output plots and the sample statistics from ascatNGS into a single PDF" shell: """ -python {params.merge_ascat_output_script} {output.ascat_output_pdf} {input.sample_statistics} {input.ascat_plots} +python {params.merge_ascat_output_script} {output.ascat_pdf} {input.sample_statistics} {input.ascat_plots} """ rule svdb_merge_tumor_normal: @@ -215,13 +218,14 @@ rule svdb_merge_tumor_normal: vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".{caller}.vcf.gz", caller=somatic_caller_cnv) output: - svdb_vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.vcf.gz", + vcf_svdb = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.vcf.gz", namemap = vcf_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".svdb.sample_name_map", benchmark: Path(benchmark_dir, 'svdb_merge_tumor_normal_' + config["analysis"]["case_id"] + ".tsv") singularity: Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tumor = get_sample_type(config["samples"], "tumor"), normal = get_sample_type(config["samples"], "normal"), case_name = config["analysis"]["case_id"], @@ -236,7 +240,8 @@ rule svdb_merge_tumor_normal: svdb --merge --no_intra --bnd_distance 5000 --overlap 0.80 \ --vcf {params.vcf} \ --priority {params.svdb_priority} | \ -bgzip -l 9 -c > {output.svdb_vcf}; +bgzip -l 9 -c > {output.vcf_svdb}; +tabix -p vcf -f {output.vcf_svdb}; echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap}; """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule index ac11b2ae5..4a3620f8a 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule @@ -87,8 +87,8 @@ rule delly_cnv_tumor_only: bcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", map = config["reference"]["delly_mappability"], output: - cnv = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", - rd = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".delly.cov.gz", + cnv_delly = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", + rd_delly = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".dellycnv.cov.gz", namemap= vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".dellycnv.sample_name_map", benchmark: benchmark_dir + 'delly_cnv_tumor_only_' + config["analysis"]["case_id"] + ".tsv" @@ -96,6 +96,7 @@ rule delly_cnv_tumor_only: Path(singularity_image, config["bioinfo_tools"].get("delly") + ".sif").as_posix() params: tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + housekeeper_id= {"id": config["analysis"]["case_id"],"tags": "clinical"}, runmode = "local", tumor = "TUMOR", case_name = config["analysis"]["case_id"] @@ -105,7 +106,7 @@ rule delly_cnv_tumor_only: ("Calling copy number variants using delly for {params.case_name}") shell: """ -delly cnv -m {input.map} -g {input.fa} -c {output.rd} -o {output.cnv} -l {input.bcf} {input.bamT} +delly cnv -m {input.map} -g {input.fa} -c {output.rd_delly} -o {output.cnv_delly} -l {input.bcf} {input.bamT} echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; @@ -149,13 +150,14 @@ rule svdb_merge_tumor_only: vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".{caller}.vcf.gz", caller=somatic_caller_cnv) output: - svdb_vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.vcf.gz", + vcf_svdb = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.vcf.gz", namemap = vcf_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".svdb.sample_name_map", benchmark: Path(benchmark_dir, 'svdb_merge_tumor_only_' + config["analysis"]["case_id"] + ".tsv") singularity: Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tumor = get_sample_type(config["samples"], "tumor"), case_name = config["analysis"]["case_id"], vcf= lambda wildcards, input:[input[index] + ":" + svdb_callers_prio[index] for index in range(0,len(input))], @@ -169,6 +171,8 @@ rule svdb_merge_tumor_only: svdb --merge --no_intra --bnd_distance 5000 --overlap 0.80 \ --vcf {params.vcf} \ --priority {params.svdb_priority} | \ -bgzip -l 9 -c > {output.svdb_vcf}; +bgzip -l 9 -c > {output.vcf_svdb}; +tabix -p vcf -f {output.vcf_svdb}; + echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule index 532460b78..95d14e31c 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule @@ -50,10 +50,11 @@ rule vardict_merge: input: expand(vcf_dir + "vardict/split_vcf/{chrom}_vardict.vcf.gz", chrom=chromlist) output: - vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz", + vcf_vardict = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz", yaml = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.yaml", namemap = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".vardict.sample_name_map" params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), case_name = config["analysis"]["case_id"], benchmark: @@ -68,13 +69,13 @@ rule vardict_merge: """ mkdir -p {params.tmpdir}; -bcftools concat {input} | bcftools sort --temp-dir {params.tmpdir} - | bgzip > {output.vcf}; -tabix -f -p vcf {output.vcf}; +bcftools concat {input} | bcftools sort --temp-dir {params.tmpdir} - | bgzip > {output.vcf_vardict}; +tabix -f -p vcf {output.vcf_vardict}; echo -e \"{params.case_name}\\tTUMOR\\n{params.case_name}-match\\tNORMAL\" > {output.namemap}; echo -e \"{params.case_name}\" > {output.namemap}.tumor; echo -e \"{params.case_name}-match\" > {output.namemap}.normal; -echo '{{ vcf: {{ vardict: {{ name: vardict, path: {output.vcf} }} }} }}' > {output.yaml}; +echo '{{ vcf: {{ vardict: {{ name: vardict, path: {output.vcf_vardict} }} }} }}' > {output.yaml}; rm -rf {params.tmpdir}; """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule index 37992d751..267fd091a 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule @@ -53,12 +53,13 @@ rule vardict_merge: output: namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.sample_name_map", yaml = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.yaml", - vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz" + vcf_vardict = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".vardict.vcf.gz" benchmark: Path(benchmark_dir, 'vardict_merge_' + config["analysis"]["case_id"] + ".tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("vardict") + ".sif").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), case_name = config["analysis"]["case_id"], threads: @@ -72,12 +73,12 @@ export TMPDIR={params.tmpdir}; bcftools concat {input} \ | bcftools sort --temp-dir {params.tmpdir} - \ -| bgzip > {output.vcf}; -tabix -f -p vcf {output.vcf}; +| bgzip > {output.vcf_vardict}; +tabix -f -p vcf {output.vcf_vardict}; echo -e \"{params.case_name}\\tTUMOR\" > {output.namemap}; echo -e \"{params.case_name}\" > {output.namemap}.tumor; -echo '{{ vcf: {{ vardict: {{ name: vardict , path: {output.vcf} }} }} }}' > {output.yaml}; +echo '{{ vcf: {{ vardict: {{ name: vardict , path: {output.vcf_vardict} }} }} }}' > {output.yaml}; """ diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 071e9d743..a36e91bff 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -40,6 +40,11 @@ logging.getLogger("filelock").setLevel("WARN") tmp_dir = os.path.join(get_result_dir(config), "tmp", "" ) Path.mkdir(Path(tmp_dir), exist_ok=True) +# Set case id/name +case_id = config["analysis"]["case_id"] + +# Directories +analysis_dir = config["analysis"]["analysis_dir"] + "/" +case_id + "/" benchmark_dir = config["analysis"]["benchmark"] fastq_dir = get_result_dir(config) + "/fastq/" bam_dir = get_result_dir(config) + "/bam/" @@ -50,7 +55,6 @@ vcf_dir = get_result_dir(config) + "/vcf/" vep_dir = get_result_dir(config) + "/vep/" qc_dir = get_result_dir(config) + "/qc/" delivery_dir = get_result_dir(config) + "/delivery/" - umi_dir = get_result_dir(config) + "/umi/" umi_qc_dir = qc_dir + "umi_qc/" @@ -76,9 +80,6 @@ tumor_sample = get_sample_type(config["samples"], "tumor")[0] if 
config['analysis']['analysis_type'] == "paired": normal_sample = get_sample_type(config["samples"], "normal")[0] -# Set case id/name -case_id = config["analysis"]["case_id"] - # explicitly check if cluster_config dict has zero keys. if len(cluster_config.keys()) == 0: cluster_config = config @@ -231,47 +232,81 @@ for r in rules_to_include: # Define common and analysis specific outputs quality_control_results = [ + os.path.join(qc_dir,case_id + "_metrics_deliverables.yaml"), os.path.join(qc_dir, "multiqc_report.html"), - os.path.join(qc_dir, case_id + "_metrics_deliverables.yaml"), + os.path.join(qc_dir, "multiqc_data/multiqc_data.json") ] -analysis_specific_results = [expand(vep_dir + "{vcf}.vcf.gz", - vcf=get_vcf(config, germline_caller, germline_call_samples)), - expand(vep_dir + "{vcf}.all.vcf.gz", - vcf=get_vcf(config, somatic_caller, [config["analysis"]["case_id"]]))] +# Analysis results +analysis_specific_results = [] -if config["analysis"]["sequencing_type"] != "wgs": - analysis_specific_results.append(expand(vep_dir + "{vcf}.all.filtered.pass.ranked.vcf.gz", - vcf=get_vcf(config, ["vardict"], [config["analysis"]["case_id"]]))) +# Germline SNVs/SVs +analysis_specific_results.extend( + expand(vep_dir + "{vcf}.vcf.gz", vcf=get_vcf(config, germline_caller, germline_call_samples)) +) - analysis_specific_results.append(expand(vcf_dir + "CNV.somatic.{case_name}.{var_caller}.vcf2cytosure.cgh", - case_name=config["analysis"]["case_id"], - var_caller=["cnvkit"])) +# Raw VCFs +analysis_specific_results.extend( + expand(vcf_dir + "{vcf}.vcf.gz", vcf=get_vcf(config, somatic_caller, [case_id])) +) - analysis_specific_results.append(expand(umi_qc_dir + "{sample}.umi.mean_family_depth", sample=config["samples"])) - - if background_variant_file: - analysis_specific_results.extend([expand(umi_qc_dir + "{case_name}.{var_caller}.AFtable.txt", - case_name=config["analysis"]["case_id"], - var_caller=["TNscope_umi"])]), +# Filtered and passed post annotation VCFs 
+analysis_specific_results.extend( + expand(vep_dir + "{vcf}.all.filtered.pass.vcf.gz", vcf=get_vcf(config, somatic_caller, [case_id])) +) -#Calculate TMB per somatic variant caller -analysis_specific_results.extend(expand(vep_dir + "{vcf}.balsamic_stat", - vcf=get_vcf(config, somatic_caller_tmb, [config["analysis"]["case_id"]]))) +# TMB +analysis_specific_results.extend( + expand(vep_dir + "{vcf}.balsamic_stat", vcf=get_vcf(config, somatic_caller_tmb, [case_id])) +) -#Gather all the filtered and PASSed variants post annotation -analysis_specific_results.extend([expand(vep_dir + "{vcf}.all.filtered.pass.vcf.gz", - vcf=get_vcf(config, somatic_caller, [config["analysis"]["case_id"]]))]) - -LOG.info(f"Following outputs will be delivered {analysis_specific_results}") +# TGA specific files +if config["analysis"]["sequencing_type"] != "wgs": + # CNVkit + analysis_specific_results.append(cnv_dir + "tumor.merged.cns") + analysis_specific_results.extend(expand(cnv_dir + "tumor.merged-{plot}", plot=["diagram.pdf", "scatter.pdf"])) + analysis_specific_results.append(cnv_dir + case_id +".gene_metrics") + # vcf2cytosure + analysis_specific_results.extend(expand( + vcf_dir + "CNV.somatic.{case_name}.{var_caller}.vcf2cytosure.cgh", + case_name=case_id, + var_caller=["cnvkit"] + )) + # VarDict + analysis_specific_results.extend( + expand(vep_dir + "{vcf}.all.filtered.pass.ranked.vcf.gz", vcf=get_vcf(config, ["vardict"], [case_id])) + ) + # UMI + analysis_specific_results.extend(expand(umi_qc_dir + "{sample}.umi.mean_family_depth",sample=config["samples"])) + if background_variant_file: + analysis_specific_results.extend( + expand(umi_qc_dir + "{case_name}.{var_caller}.AFtable.txt", case_name=case_id, var_caller=["TNscope_umi"]) + ) +# AscatNgs +if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "paired": + analysis_specific_results.extend( + expand(vcf_dir + "{vcf}.output.pdf", vcf=get_vcf(config, ["ascat"], [case_id])) + ) + 
analysis_specific_results.extend( + expand(vcf_dir + "{vcf}.copynumber.txt.gz", vcf=get_vcf(config, ["ascat"], [case_id])) + ) + +# Delly CNV +if config['analysis']['analysis_type'] == "single": + analysis_specific_results.extend( + expand(vcf_dir + "{vcf}.cov.gz",vcf=get_vcf(config,["dellycnv"],[case_id])) + ) + +# Dragen if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "single": if "dragen" in config: - analysis_specific_results.extend([Path(result_dir, "dragen", "SNV.somatic." + config["analysis"]["case_id"] + ".dragen_tumor.bam").as_posix(), - Path(result_dir, "dragen", "SNV.somatic." + config["analysis"]["case_id"] + ".dragen.vcf.gz").as_posix()]) + analysis_specific_results.extend([ + Path(result_dir, "dragen", "SNV.somatic." + case_id + ".dragen_tumor.bam").as_posix(), + Path(result_dir, "dragen", "SNV.somatic." + case_id + ".dragen.vcf.gz").as_posix() + ]) -if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "paired": - analysis_specific_results.append(expand(vcf_dir + "{vcf}.output.pdf", vcf=get_vcf(config, ["ascat"], [config["analysis"]["case_id"]]))) +LOG.info(f"Following outputs will be delivered {analysis_specific_results}") if 'benchmark_plots' in config: log_dir = config["analysis"]["log"] @@ -303,20 +338,20 @@ if 'benchmark_plots' in config: for plots in my_rule_plots: plots.unlink() - - if 'delivery' in config: - wildcard_dict = {"sample": list(config["samples"].keys())+["tumor", "normal"], - "case_name": config["analysis"]["case_id"], - "allow_missing": True - } + wildcard_dict = { + "sample": list(config["samples"].keys())+["tumor", "normal"], + "case_name": case_id, + "allow_missing": True + } if config['analysis']["analysis_type"] in ["paired", "single"]: - wildcard_dict.update({"var_type": ["CNV", "SNV", "SV"], - "var_class": ["somatic", "germline"], - "var_caller": somatic_caller + germline_caller, - "bedchrom": config["panel"]["chrom"] if "panel" in config 
else [], - }) + wildcard_dict.update({ + "var_type": ["CNV", "SNV", "SV"], + "var_class": ["somatic", "germline"], + "var_caller": somatic_caller + germline_caller, + "bedchrom": config["panel"]["chrom"] if "panel" in config else [], + }) if 'rules_to_deliver' in config: rules_to_deliver = config['rules_to_deliver'].split(",") @@ -338,9 +373,7 @@ if 'delivery' in config: output_files_ready.extend(files_to_deliver) output_files_ready = [dict(zip(output_files_ready[0], value)) for value in output_files_ready[1:]] - delivery_ready = os.path.join(get_result_dir(config), - "delivery_report", - config["analysis"]["case_id"] + "_delivery_ready.hk") + delivery_ready = os.path.join(get_result_dir(config), "delivery_report", case_id + "_delivery_ready.hk") write_json(output_files_ready, delivery_ready) FormatFile(delivery_ready) @@ -359,7 +392,7 @@ rule all: # Perform validation of extracted QC metrics try: - validate_qc_metrics(read_yaml(input[1])) + validate_qc_metrics(read_yaml(input[0])) except ValueError as val_exc: LOG.error(val_exc) raise BalsamicError diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ab198ec48..f3c926236 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -36,6 +36,7 @@ Changed: * container base_image (align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py36) to 4.10.3-alpine #921 * update container (align_qc, annotate, coverage_qc, varcall_cnvkit,varcall_py36) bioinfo tool versions #921 * update tool versions (align_qc, annotate, coverage_qc, varcall_cnvkit) in methods and softwares docs #921 +* Updated the list of files to be stored and delivered #848 Fixed: ^^^^^^ @@ -43,6 +44,7 @@ Fixed: * ``collect_qc_metrics.py`` failing for WGS cases with empty ``capture_kit`` argument #850 * QC metric validation for different panel bed version #855 * Fixed development version of ``fpdf2`` to ``2.4.6`` #878 +* Added missing svdb index file #848 Removed ^^^^^^^ diff --git a/tests/commands/report/test_deliver.py b/tests/commands/report/test_deliver.py index 
43477c106..6a5e52754 100644 --- a/tests/commands/report/test_deliver.py +++ b/tests/commands/report/test_deliver.py @@ -58,17 +58,17 @@ def test_deliver_tumor_normal_panel( # Actual delivery files dummies with and without index cnv_result_dir = Path(helpers.result_dir, "cnv") cnv_result_dir.mkdir(parents=True, exist_ok=True) - actual_delivery_file = Path(cnv_result_dir, "tumor.merged.cnr") + actual_delivery_file = Path(cnv_result_dir, "tumor.merged.cns") actual_delivery_file.touch() vep_result_dir = Path(helpers.result_dir, "vep") vep_result_dir.mkdir(parents=True, exist_ok=True) touch_vcf_delivery_file = Path( - vep_result_dir, "SNV.somatic." + helpers.case_id + ".vardict.all.vcf.gz" + vep_result_dir, "SNV.somatic." + helpers.case_id + ".vardict.vcf.gz" ) touch_vcf_delivery_file.touch() touch_vcf_delivery_file_index = Path( - vep_result_dir, "SNV.somatic." + helpers.case_id + ".vardict.all.vcf.gz.tbi" + vep_result_dir, "SNV.somatic." + helpers.case_id + ".vardict.vcf.gz.tbi" ) touch_vcf_delivery_file_index.touch() diff --git a/tests/conftest.py b/tests/conftest.py index bfbd55af8..f5eb9aefb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,7 @@ from click.testing import CliRunner from BALSAMIC.utils.cli import read_yaml -from .helpers import ConfigHelper +from .helpers import ConfigHelper, Map from BALSAMIC.commands.base import cli from BALSAMIC import __version__ as balsamic_version @@ -673,3 +673,49 @@ def qc_requested_metrics(): def qc_extracted_metrics(metrics_yaml_path): """Extracted and formatted QC metrics""" return read_yaml(metrics_yaml_path) + + +@pytest.fixture(scope="function") +def snakemake_fastqc_rule(tumor_only_config, helpers): + """FastQC snakemake mock rule""" + + helpers.read_config(tumor_only_config) + fastq_path = os.path.join( + helpers.analysis_dir, + helpers.case_id, + "analysis", + "fastq", + "concatenated_tumor_XXXXXX_R_{read}.fastq.gz", + ) + + return Map( + { + "fastqc": Map( + { + "params": Map( + { + "housekeeper_id": 
{ + "id": "sample_tumor_only", + "tags": "quality-trimmed-seq", + } + } + ), + "output": Map( + { + "_names": Map({"fastqc": fastq_path}), + "fastqc": fastq_path, + } + ), + "rule": Map( + { + "name": "fastq", + "output": [ + fastq_path, + ], + "temp_output": set(), + } + ), + } + ) + } + ) diff --git a/tests/helpers.py b/tests/helpers.py index 5d8c17a4f..fae159606 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -16,3 +16,35 @@ def read_config(self, balsamic_config): self.analysis_dir = sample_config["analysis"]["analysis_dir"] self.result_dir = sample_config["analysis"]["result"] self.delivery_dir = Path(self.result_dir, "delivery_report").as_posix() + + +class Map(dict): + """Mock class to use dot notation to access values of a dictionary""" + + def __init__(self, *args, **kwargs): + super(Map, self).__init__(*args, **kwargs) + for arg in args: + if isinstance(arg, dict): + for k, v in arg.items(): + self[k] = v + + if kwargs: + for k, v in kwargs.items(): + self[k] = v + + def __getattr__(self, attr): + return self.get(attr) + + def __setattr__(self, key, value): + self.__setitem__(key, value) + + def __setitem__(self, key, value): + super(Map, self).__setitem__(key, value) + self.__dict__.update({key: value}) + + def __delattr__(self, item): + self.__delitem__(item) + + def __delitem__(self, key): + super(Map, self).__delitem__(key) + del self.__dict__[key] diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 77bf50a55..92afad384 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -1,4 +1,5 @@ import json +import os import subprocess import pytest import sys @@ -59,7 +60,9 @@ get_threads, get_delivery_id, get_reference_output_files, + get_rule_output, ) +from tests.helpers import Map def test_get_variant_callers_wrong_analysis_type(tumor_normal_config): @@ -1045,3 +1048,32 @@ def test_create_md5(tmp_path): # THEN md5 file exists assert dummy_file.exists() + + +def test_get_rule_output(snakemake_fastqc_rule): + 
"""Tests retrieval of existing output files from a specific workflow""" + + # GIVEN a snakemake fastqc rule object, a rule name and a list of associated wildcards + rules = snakemake_fastqc_rule + rule_name = "fastqc" + output_file_wildcards = { + "sample": ["concatenated_tumor_XXXXXX_R", "tumor", "normal"], + "case_name": "sample_tumor_only", + } + + # THEN retrieve the output files + output_files = get_rule_output(rules, rule_name, output_file_wildcards) + + # THEN check that the fastq files has been picked up by the function and that the tags has been correctly created + assert len(output_files) == 2 + for file in output_files: + # Expected file names + assert ( + os.path.basename(file[0]) == "concatenated_tumor_XXXXXX_R_1.fastq.gz" + or os.path.basename(file[0]) == "concatenated_tumor_XXXXXX_R_2.fastq.gz" + ) + # Expected tags + assert ( + file[3] == "1,fastqc,quality-trimmed-seq-fastqc" + or file[3] == "2,fastqc,quality-trimmed-seq-fastqc" + ) From e628c426f3d10735411c3cd9cf4d2dcadd9b86bf Mon Sep 17 00:00:00 2001 From: ivadym Date: Mon, 2 May 2022 15:28:55 +0200 Subject: [PATCH 58/58] feat: bcftools counts QC validation (#925) --- BALSAMIC/assets/scripts/collect_qc_metrics.py | 73 +++++++++++++--- BALSAMIC/constants/qc_metrics.py | 1 + BALSAMIC/constants/workflow_rules.py | 1 + .../quality_control/multiqc.rule | 25 +----- .../quality_control/qc_metrics.rule | 36 ++++++++ CHANGELOG.rst | 2 + tests/conftest.py | 8 ++ tests/scripts/test_collect_qc_metrics.py | 86 +++++++++++++++++++ ....somatic.case.svdb.all.filtered.pass.stats | 6 ++ 9 files changed, 202 insertions(+), 36 deletions(-) create mode 100644 BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule create mode 100644 tests/test_data/qc_files/analysis/vep/SNV.somatic.case.svdb.all.filtered.pass.stats diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py index 33834ec10..e3a9e13f8 100755 --- a/BALSAMIC/assets/scripts/collect_qc_metrics.py +++ 
b/BALSAMIC/assets/scripts/collect_qc_metrics.py @@ -16,11 +16,13 @@ ) @click.argument("output_path", type=click.Path(exists=False), required=True) @click.argument("multiqc_data_path", type=click.Path(exists=True), required=True) +@click.argument("counts_path", nargs=-1, type=click.Path(exists=True), required=False) @click.argument("sequencing_type", required=True) @click.argument("capture_kit", required=True) def collect_qc_metrics( output_path: Path, multiqc_data_path: Path, + counts_path: List[Path], sequencing_type: str, capture_kit: str, ): @@ -29,17 +31,23 @@ def collect_qc_metrics( Args: output_path: Path; destination path for the extracted YAML formatted metrics multiqc_data_path: Path; multiqc JSON path from which the metrics will be extracted + counts_path: Path; list of variant caller specific files containing the number of variants sequencing_type: str; analysis sequencing type capture_kit: str; capture kit used for targeted analysis ("None" for WGS) """ + # MultiQC metrics + metrics = get_multiqc_metrics( + multiqc_data_path, sequencing_type, capture_kit_resolve_type(capture_kit) + ) + + # Number of variants + for count in counts_path: + metrics += get_variant_metrics(count) + with open(output_path, "w") as fn: yaml.dump( - get_multiqc_metrics( - multiqc_data_path, - sequencing_type, - capture_kit_resolve_type(capture_kit), - ), + metrics, fn, sort_keys=False, default_flow_style=False, @@ -111,26 +119,26 @@ def get_qc_supported_capture_kit(capture_kit, metrics: List[str]) -> str: def get_requested_metrics( - metrics: dict, sequencing_type: str, capture_kit: Union[str, None] + metrics: dict, analysis_type: str, capture_kit: Union[str, None] ) -> dict: """Parses the defined and requested metrics and returns them as a dictionary""" - requested_metrics = metrics[sequencing_type] + requested_metrics = metrics[analysis_type] if capture_kit: - requested_metrics = metrics[sequencing_type]["default"] + requested_metrics = metrics[analysis_type]["default"] 
supported_capture_kit = get_qc_supported_capture_kit( - capture_kit, metrics[sequencing_type] + capture_kit, metrics[analysis_type] ) if supported_capture_kit: - requested_metrics.update(metrics[sequencing_type][supported_capture_kit]) + requested_metrics.update(metrics[analysis_type][supported_capture_kit]) return requested_metrics def get_multiqc_metrics( multiqc_data_path: Path, sequencing_type: str, capture_kit: Union[str, None] -) -> dict: - """Extracts the requested metrics from a multiqc JSON file and returns them as a dictionary""" +) -> list: + """Extracts and returns the requested metrics from a multiqc JSON file""" with open(multiqc_data_path, "r") as f: multiqc_data = json.load(f) @@ -164,5 +172,46 @@ def extract(data, output_metrics, sample=None, source=None): return extract(multiqc_data["report_saved_raw_data"], []) +def extract_number_variants(counts: list) -> dict: + """Formats the number of SNPs, Indels, and total number of sites""" + + variant_metrics = dict() + + for count in counts: + # Transforms string "Number of sites: 125" into a key value object {"NUMBER_OF_SITES": 125} + count = count.split(":") + if len(count) > 1: + variant_metrics.update( + {count[0].strip().upper().replace(" ", "_"): int(count[1].strip())} + ) + + return variant_metrics + + +def get_variant_metrics(counts_path: list) -> list: + """Retrieves the variant metrics and returns them as a MetricModel list""" + + output_metrics = list() + + with open(counts_path, "r") as input_file: + counts = input_file.read().split("\n") + + variant_metrics = extract_number_variants(counts) + requested_metrics = get_requested_metrics(METRICS, "variants", None) + for metric in requested_metrics: + output_metrics.append( + MetricModel( + id=os.path.basename(counts_path).split(".")[2], # case_id + input=os.path.basename(counts_path), + name=metric, + step="collect_custom_qc_metrics", + value=variant_metrics[metric], + condition=requested_metrics[metric]["condition"], + ).dict() + ) + + return 
output_metrics + + if __name__ == "__main__": collect_qc_metrics() diff --git a/BALSAMIC/constants/qc_metrics.py b/BALSAMIC/constants/qc_metrics.py index 509940587..f2acc4a8a 100644 --- a/BALSAMIC/constants/qc_metrics.py +++ b/BALSAMIC/constants/qc_metrics.py @@ -48,4 +48,5 @@ "PCT_100X": {"condition": None}, "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, }, + "variants": {"NUMBER_OF_SITES": {"condition": {"norm": "lt", "threshold": 10000}}}, } diff --git a/BALSAMIC/constants/workflow_rules.py b/BALSAMIC/constants/workflow_rules.py index 4f77c2731..0304d68ed 100644 --- a/BALSAMIC/constants/workflow_rules.py +++ b/BALSAMIC/constants/workflow_rules.py @@ -5,6 +5,7 @@ "snakemake_rules/quality_control/fastp.rule", "snakemake_rules/quality_control/fastqc.rule", "snakemake_rules/quality_control/multiqc.rule", + "snakemake_rules/quality_control/qc_metrics.rule", "snakemake_rules/variant_calling/mergetype_tumor.rule", ], "align": [], diff --git a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule index d31976f64..aa1605095 100644 --- a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule +++ b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule @@ -30,7 +30,6 @@ if config["analysis"]["sequencing_type"] == 'wgs': if config['analysis']['analysis_type'] == "paired": multiqc_input.append(bam_dir+"normal.merged.recal_data.table") - else: # fastqc metrics multiqc_input.extend(expand(fastqc_dir + "{sample}_{read_num}_fastqc.zip", sample=config["samples"], read_num=[1, 2])) @@ -57,9 +56,9 @@ else: multiqc_input.extend(expand(bam_dir + "{sample}.samtools.{stats}.txt", sample=config["samples"], stats=['flagstats', 'idxstats', 'stats'])) if config["umiworkflow"]: + # UMI picard metrics multiqc_input.extend(expand(umi_qc_dir + "{sample}.umi.collect_hsmetric", sample=config["samples"])) multiqc_input.extend(expand(umi_qc_dir + "{sample}.umi.metrics", sample=config["samples"])) - rule multiqc: input: 
@@ -90,25 +89,3 @@ multiqc --force --outdir {params.qc_dir} \ chmod -R 777 {params.qc_dir}; """ - - -rule collect_custom_qc_metrics: - input: - json = qc_dir + "multiqc_data/multiqc_data.json" - output: - yaml = qc_dir + config["analysis"]["case_id"] + "_metrics_deliverables.yaml" - params: - collect_qc_metrics_script = get_script_path("collect_qc_metrics.py"), - sequencing_type = get_sequencing_type(config), - capture_kit = get_capture_kit(config), - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "qc-metrics"} - singularity: - Path(singularity_image, "balsamic.sif").as_posix() - threads: - get_threads(cluster_config, "collect_custom_qc_metrics") - message: - "Extract the manually specified QC metric for validation and delivery" - shell: - """ -python {params.collect_qc_metrics_script} {output.yaml} {input.json} {params.sequencing_type} {params.capture_kit} - """ diff --git a/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule b/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule new file mode 100644 index 000000000..b5ac0851d --- /dev/null +++ b/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule @@ -0,0 +1,36 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +bcftools_counts_input = [vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.all.filtered.pass.stats"] + +if config["analysis"]["sequencing_type"] == 'wgs': + bcftools_counts_input.append(vep_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.all.filtered.pass.stats") + +else: + bcftools_counts_input.append(vep_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.all.filtered.pass.stats") + + if config["umiworkflow"]: + # bcftools counts + bcftools_counts_input.append(vep_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".TNscope_umi.all.filtered.pass.stats") + +rule collect_custom_qc_metrics: + input: + bcftools_counts = bcftools_counts_input, + json = qc_dir + "multiqc_data/multiqc_data.json" + output: + yaml = qc_dir + config["analysis"]["case_id"] + "_metrics_deliverables.yaml" + params: + collect_qc_metrics_script = get_script_path("collect_qc_metrics.py"), + sequencing_type = get_sequencing_type(config), + capture_kit = get_capture_kit(config), + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "qc-metrics"} + singularity: + Path(singularity_image, "balsamic.sif").as_posix() + threads: + get_threads(cluster_config, "collect_custom_qc_metrics") + message: + "Extract the manually specified QC metric for validation and delivery" + shell: + """ +python {params.collect_qc_metrics_script} {output.yaml} {input.json} {input.bcftools_counts} {params.sequencing_type} {params.capture_kit} + """ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f3c926236..fe53e422e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -23,6 +23,7 @@ Added: * ReadtheDocs for BALSAMIC annotation resources #916 * Delly CNV for tumor only workflow #923 * Delly CNV Read-depth profiles for tumor only workflows #924 +* New metric to be extracted and validated: ``NUMBER_OF_SITES`` (``bcftools`` counts) #925 Changed: ^^^^^^^^ @@ -37,6 +38,7 @@ Changed: * update container (align_qc, annotate, coverage_qc, varcall_cnvkit,varcall_py36) bioinfo tool versions #921 * update tool versions (align_qc, annotate, coverage_qc, varcall_cnvkit) in methods and softwares docs #921 * Updated the list of files to be stored and delivered #848 +* Moved ``collect_custom_qc_metrics`` rule from ``multiqc.rule`` #925 Fixed: ^^^^^^ diff --git a/tests/conftest.py b/tests/conftest.py index f5eb9aefb..351b5dcdd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -645,6 +645,14 @@ def metrics_yaml_path(analysis_path): ) +@pytest.fixture(scope="session") +def 
bcftools_counts_path(analysis_path): + """svdb.all.filtered.pass.stats test path""" + return os.path.join( + analysis_path, "vep", "SNV.somatic.case.svdb.all.filtered.pass.stats" + ) + + @pytest.fixture(scope="session") def qc_requested_metrics(): """Raw requested metrics""" diff --git a/tests/scripts/test_collect_qc_metrics.py b/tests/scripts/test_collect_qc_metrics.py index 9b2cb7e16..1f62034ef 100644 --- a/tests/scripts/test_collect_qc_metrics.py +++ b/tests/scripts/test_collect_qc_metrics.py @@ -1,4 +1,5 @@ import json +import os.path from pathlib import Path from BALSAMIC.assets.scripts.collect_qc_metrics import ( @@ -8,6 +9,8 @@ get_qc_supported_capture_kit, get_requested_metrics, capture_kit_resolve_type, + extract_number_variants, + get_variant_metrics, ) @@ -147,6 +150,58 @@ def test_get_multiqc_metrics_filtering_umi(multiqc_data_path): assert "umi" not in metric["input"] +def test_extract_number_variants(): + """tests number of variants formatting""" + + # GIVEN a raw input list of variant metrics + counts = [ + "Number of samples: 2", + "Number of SNPs: 111", + "Number of INDELs: 14", + "Number of MNPs: 0", + "Number of sites: 125", + "", + ] + + # GIVEN an expected output after arranging the input list + expected_variants_metrics = { + "NUMBER_OF_SAMPLES": 2, + "NUMBER_OF_SNPS": 111, + "NUMBER_OF_INDELS": 14, + "NUMBER_OF_MNPS": 0, + "NUMBER_OF_SITES": 125, + } + + # WHEN performing the extraction of variant metrics + variant_metrics = extract_number_variants(counts) + + # THEN verify that the number of variants has been correctly retrieved + assert expected_variants_metrics == variant_metrics + + +def test_get_variant_metrics(bcftools_counts_path): + """tests variant metrics retrieval""" + + # GIVEN an SVDB bcftools counts path + + # GIVEN an expected MetricsModel dictionary + expected_output_metris = { + "header": None, + "id": "case", + "input": os.path.basename(bcftools_counts_path), + "name": "NUMBER_OF_SITES", + "step": 
"collect_custom_qc_metrics", + "value": 125, + "condition": {"norm": "lt", "threshold": 10000.0}, + } + + # WHEN extracting the number of variants + output_metrics = get_variant_metrics(bcftools_counts_path) + + # THEN check that the output metrics has been correctly shaped + assert expected_output_metris == output_metrics[0] + + def test_collect_qc_metrics_targeted(tmp_path, multiqc_data_path, cli_runner): """tests qc metrics yaml file generation for targeted analysis""" @@ -187,3 +242,34 @@ def test_collect_qc_metrics_wgs(tmp_path, multiqc_data_path, cli_runner): # THEN check if the YAML is correctly created and there are no errors assert result.exit_code == 0 assert Path(output_path).exists() + + +def test_collect_qc_metrics_counts( + tmp_path, multiqc_data_path, bcftools_counts_path, cli_runner +): + """tests qc metrics yaml file generation for targeted analysis and providing a bcftools counts path""" + + # GIVEN the output, multiqc metrics and bcftools counts paths + output_path = tmp_path / "sample_tumor_only_metrics_deliverables.yaml" + + # GIVEN a sequencing type and a capture kit + seq_type = "targeted" + capture_kit = "gmsmyeloid_5.2_hg19_design.bed" + + # WHEN invoking the python script + result = cli_runner.invoke( + collect_qc_metrics, + [ + str(output_path), + multiqc_data_path, + bcftools_counts_path, # multiple counts path regarding different variant callers + bcftools_counts_path, + bcftools_counts_path, + seq_type, + capture_kit, + ], + ) + + # THEN check if the YAML is correctly created and there are no errors + assert result.exit_code == 0 + assert Path(output_path).exists() diff --git a/tests/test_data/qc_files/analysis/vep/SNV.somatic.case.svdb.all.filtered.pass.stats b/tests/test_data/qc_files/analysis/vep/SNV.somatic.case.svdb.all.filtered.pass.stats new file mode 100644 index 000000000..2a56b7719 --- /dev/null +++ b/tests/test_data/qc_files/analysis/vep/SNV.somatic.case.svdb.all.filtered.pass.stats @@ -0,0 +1,6 @@ +Number of samples: 2 
+Number of SNPs: 111 +Number of INDELs: 14 +Number of MNPs: 0 +Number of others: 0 +Number of sites: 125