diff --git a/.github/workflows/docker_build_push.yml b/.github/workflows/docker_build_push.yml index 860760f08..379b72e2c 100644 --- a/.github/workflows/docker_build_push.yml +++ b/.github/workflows/docker_build_push.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: true matrix: - container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py36, balsamic, delly] + container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py36, balsamic, delly, vcf2cytosure] steps: - name: Set up QEMU uses: docker/setup-qemu-action@v1 diff --git a/.github/workflows/docker_build_push_release.yml b/.github/workflows/docker_build_push_release.yml index f4e2c566c..d44c84e06 100644 --- a/.github/workflows/docker_build_push_release.yml +++ b/.github/workflows/docker_build_push_release.yml @@ -11,7 +11,7 @@ jobs: strategy: fail-fast: true matrix: - container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py36, balsamic, delly] + container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py36, balsamic, delly, vcf2cytosure] steps: - name: Set up QEMU uses: docker/setup-qemu-action@v1 diff --git a/.github/workflows/docker_build_test_pull_request.yml b/.github/workflows/docker_build_test_pull_request.yml index 1b7c86b80..f4b5f4379 100644 --- a/.github/workflows/docker_build_test_pull_request.yml +++ b/.github/workflows/docker_build_test_pull_request.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py36, balsamic, delly] + container-name: [align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py27, varcall_py36, balsamic, delly, vcf2cytosure] steps: - name: Git checkout id: git_checkout diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py new file mode 100755 index 000000000..e3a9e13f8 --- /dev/null +++ b/BALSAMIC/assets/scripts/collect_qc_metrics.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python +import json +import os +from pathlib import Path +from typing import List, Union + +import click +import yaml + +from BALSAMIC.constants.qc_metrics import METRICS +from BALSAMIC.utils.models import MetricModel + + +@click.command( + short_help="Extract the manually specified QC metrics", +) +@click.argument("output_path", type=click.Path(exists=False), required=True) +@click.argument("multiqc_data_path", type=click.Path(exists=True), required=True) +@click.argument("counts_path", nargs=-1, type=click.Path(exists=True), required=False) +@click.argument("sequencing_type", required=True) +@click.argument("capture_kit", required=True) +def collect_qc_metrics( + output_path: Path, + multiqc_data_path: Path, + counts_path: List[Path], + sequencing_type: str, + capture_kit: str, +): + """Extracts the requested metrics from a JSON multiqc file and saves them to a YAML file + + Args: + output_path: Path; destination path for the extracted YAML formatted metrics + multiqc_data_path: Path; multiqc JSON path from which the metrics will be extracted + counts_path: Path; list of variant caller specific files containing the number of variants + sequencing_type: str; analysis sequencing type + capture_kit: str; capture kit used for targeted analysis ("None" for WGS) + """ + + # MultiQC metrics + metrics = get_multiqc_metrics( + multiqc_data_path, sequencing_type, capture_kit_resolve_type(capture_kit) + ) + + # Number of variants + for count in counts_path: + metrics += 
get_variant_metrics(count) + + with open(output_path, "w") as fn: + yaml.dump( + metrics, + fn, + sort_keys=False, + default_flow_style=False, + ) + + +def capture_kit_resolve_type(capture_kit: str): + """Resolves the capture_kit type (NoneType or String)""" + + if capture_kit == "None": + return None + + return capture_kit + + +def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str: + """Extracts the metrics data source associated with a specific sample and tool + + Args: + multiqc_data: dict; raw data from the multiqc_data.json file + sample: str; sample ID + tool: str; QC analysis tools applied during the workflow (e.g. "multiqc_picard_dups") + + Returns: + A source file that was used to produce a specific metric + """ + + if tool == "multiqc_general_stats": + subtool_name = ["multiqc", "FastQC", "all_sections"] + else: + # Use case: splits multiqc_picard_dups into ['multiqc', 'picard', 'dup'] in order to retrieve the + # ["report_data_sources"]["Picard"]["DuplicationMetrics"] values from multiqc_data.json + subtool_name = tool[:-1].split("_") + + # Nested json fetching + for source_tool in multiqc_data["report_data_sources"]: + # source_tool: Picard, fastp, FastQC, etc. + for source_subtool in multiqc_data["report_data_sources"][source_tool]: + # source_subtool (for Picard): AlignmentSummaryMetrics, HsMetrics, DuplicationMetric, etc. + if ( + subtool_name[1].lower() in source_tool.lower() + and subtool_name[2].lower() in source_subtool.lower() + ): + try: + return os.path.basename( + multiqc_data["report_data_sources"][source_tool][ + source_subtool + ][sample] + ) + except KeyError: + # Deletes par orientation information from the sample name (insertSize metrics) + sample = sample.rsplit("_", 1)[0] + return os.path.basename( + multiqc_data["report_data_sources"][source_tool][ + source_subtool + ][sample] + ) + + +def get_qc_supported_capture_kit(capture_kit, metrics: List[str]) -> str: + """Returns a BALSAMIC supported panel bed name associated to a specific capture_kit parameter""" + available_panel_beds = [] + + for k in metrics: + if k != "default": + available_panel_beds.append(k) + + return next((i for i in available_panel_beds if i in capture_kit), None) + + +def get_requested_metrics( + metrics: dict, analysis_type: str, capture_kit: Union[str, None] +) -> dict: + """Parses the defined and requested metrics and returns them as a dictionary""" + + requested_metrics = metrics[analysis_type] + if capture_kit: + requested_metrics = metrics[analysis_type]["default"] + supported_capture_kit = get_qc_supported_capture_kit( + capture_kit, metrics[analysis_type] + ) + if supported_capture_kit: + requested_metrics.update(metrics[analysis_type][supported_capture_kit]) + + return requested_metrics + + +def get_multiqc_metrics( + multiqc_data_path: Path, sequencing_type: str, capture_kit: Union[str, None] +) -> list: + """Extracts and returns the requested metrics from a multiqc JSON file""" + + with open(multiqc_data_path, "r") as f: + multiqc_data = json.load(f) + + requested_metrics = get_requested_metrics(METRICS, sequencing_type, capture_kit) + + def extract(data, output_metrics, sample=None, source=None): + """Recursively fetch metrics data from a nested multiqc JSON""" + + if isinstance(data, dict): + for k in data: + # Ignore UMI and reverse reads metrics + if "umi" not in k: + if k in requested_metrics: + output_metrics.append( + MetricModel( + id=sample.split("_")[1], + input=get_multiqc_data_source( + multiqc_data, sample, source + ), + name=k, + 
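The source lookup in get_multiqc_data_source() hinges on a small string transformation; a self-contained worked example of just that step, using a tool name from the comments above:

```python
# Worked example of the key derivation in get_multiqc_data_source():
# dropping the trailing "s" and splitting on "_" turns a multiqc table
# name into the tool/subtool terms searched in
# multiqc_data["report_data_sources"].
tool = "multiqc_picard_dups"
subtool_name = tool[:-1].split("_")
assert subtool_name == ["multiqc", "picard", "dup"]

# "picard" is then matched case-insensitively against source tools such
# as "Picard", and "dup" against subtools such as "DuplicationMetrics".
```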
step=source, + value=data[k], + condition=requested_metrics[k]["condition"], + ).dict() + ) + extract(data[k], output_metrics, k, sample) + + return output_metrics + + return extract(multiqc_data["report_saved_raw_data"], []) + + +def extract_number_variants(counts: list) -> dict: + """Formats the number of SNPs, Indels, and total number of sites""" + + variant_metrics = dict() + + for count in counts: + # Transforms string "Number of sites: 125" into a key value object {"NUMBER_OF_SITES": 125} + count = count.split(":") + if len(count) > 1: + variant_metrics.update( + {count[0].strip().upper().replace(" ", "_"): int(count[1].strip())} + ) + + return variant_metrics + + +def get_variant_metrics(counts_path: list) -> list: + """Retrieves the variant metrics and returns them as a MetricModel list""" + + output_metrics = list() + + with open(counts_path, "r") as input_file: + counts = input_file.read().split("\n") + + variant_metrics = extract_number_variants(counts) + requested_metrics = get_requested_metrics(METRICS, "variants", None) + for metric in requested_metrics: + output_metrics.append( + MetricModel( + id=os.path.basename(counts_path).split(".")[2], # case_id + input=os.path.basename(counts_path), + name=metric, + step="collect_custom_qc_metrics", + value=variant_metrics[metric], + condition=requested_metrics[metric]["condition"], + ).dict() + ) + + return output_metrics + + +if __name__ == "__main__": + collect_qc_metrics() diff --git a/BALSAMIC/commands/config/base.py b/BALSAMIC/commands/config/base.py index 7744c765a..f0530ca7b 100644 --- a/BALSAMIC/commands/config/base.py +++ b/BALSAMIC/commands/config/base.py @@ -3,6 +3,7 @@ from BALSAMIC.commands.config.case import case_config as case_command from BALSAMIC.commands.config.pon import pon_config as pon_command +from BALSAMIC.commands.config.qc import qc_config as qc_command @click.group() @@ -14,3 +15,4 @@ def config(context): config.add_command(case_command) config.add_command(pon_command) +config.add_command(qc_command) diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py index a4b0eab9d..b52d11824 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -75,6 +75,12 @@ required=False, help="Background set of valid variants for UMI", ) +@click.option( + "--pon-cnn", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="Panel of normal reference (.cnn) for cnvkit", +) @click.option( "--balsamic-cache", type=click.Path(exists=True, resolve_path=True), @@ -124,7 +130,7 @@ "-g", "--genome-version", default="hg19", - type=click.Choice(["hg19", "hg38"]), + type=click.Choice(["hg19", "hg38", "canfam3"]), help=( "Genome version to prepare reference. 
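extract_number_variants() assumes the line format emitted by `bcftools +counts`, which is what the new bcftools_counts rule outputs contain; a minimal sketch of the parsing, with invented numbers:

```python
# Minimal sketch of extract_number_variants() on bcftools +counts-style
# input; the counts below are made up for illustration.
counts = [
    "Number of samples: 1",
    "Number of SNPs: 98",
    "Number of INDELs: 27",
    "Number of sites: 125",
]

variant_metrics = {}
for count in counts:
    parts = count.split(":")
    if len(parts) > 1:
        # "Number of sites: 125" -> {"NUMBER_OF_SITES": 125}
        variant_metrics[parts[0].strip().upper().replace(" ", "_")] = int(
            parts[1].strip()
        )

assert variant_metrics["NUMBER_OF_SITES"] == 125
```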
Path to genome" "will be /genome_version" @@ -140,6 +146,7 @@ def case_config( quality_trim, panel_bed, background_variants, + pon_cnn, analysis_dir, tumor, normal, @@ -196,6 +203,7 @@ def case_config( panel={ "capture_kit": panel_bed, "chrom": get_panel_chrom(panel_bed), + "pon_cnn": pon_cnn, } if panel_bed else None, diff --git a/BALSAMIC/commands/config/qc.py b/BALSAMIC/commands/config/qc.py new file mode 100644 index 000000000..e56f7f6f1 --- /dev/null +++ b/BALSAMIC/commands/config/qc.py @@ -0,0 +1,213 @@ +import os +import json +import logging +from pathlib import Path + +import click + +from BALSAMIC import __version__ as balsamic_version +from BALSAMIC.utils.cli import ( + get_sample_dict, + get_panel_chrom, + get_bioinfo_tools_version, + create_fastq_symlink, + generate_graph, +) +from BALSAMIC.constants.common import ( + CONTAINERS_CONDA_ENV_PATH, + BIOINFO_TOOL_ENV, +) +from BALSAMIC.utils.models import BalsamicConfigModel + + +LOG = logging.getLogger(__name__) + + +@click.command( + "qc_panel", + short_help="Create a sample config file for panel cases to perform QC", +) +@click.option( + "--case-id", + required=True, + help="Sample id that is used for reporting, \ + naming the analysis jobs, and analysis path", +) +@click.option( + "--quality-trim/--no-quality-trim", + default=True, + show_default=True, + is_flag=True, + help="Trim low quality reads in fastq", +) +@click.option( + "--adapter-trim/--no-adapter-trim", + default=True, + show_default=True, + is_flag=True, + help="Trim adapters from reads in fastq", +) +@click.option( + "-p", + "--panel-bed", + type=click.Path(exists=True, resolve_path=True), + required=False, + help="Panel bed file for variant calling.", +) +@click.option( + "--balsamic-cache", + type=click.Path(exists=True, resolve_path=True), + required=True, + help="Path to BALSAMIC cache", +) +@click.option( + "--container-version", + show_default=True, + default=balsamic_version, + type=click.Choice(["develop", "master", balsamic_version]), + help="Container for BALSAMIC version to download", +) +@click.option( + "--analysis-dir", + type=click.Path(exists=True, resolve_path=True), + required=True, + help="Root analysis path to store analysis logs and results. \ + The final path will be analysis-dir/sample-id", +) +@click.option( + "-t", + "--tumor", + type=click.Path(exists=True, resolve_path=True), + required=True, + multiple=True, + help="Fastq files for tumor sample.", +) +@click.option( + "-n", + "--normal", + type=click.Path(exists=True, resolve_path=True), + required=False, + multiple=True, + help="Fastq files for normal sample.", +) +@click.option( + "--umi/--no-umi", + default=True, + show_default=True, + is_flag=True, + help=("UMI processing steps for samples with UMI tags."), +) +@click.option( + "--umi-trim-length", + default=5, + show_default=True, + type=int, + help="Trim N bases from reads in fastq", +) +@click.option("--tumor-sample-name", help="Tumor sample name") +@click.option("--normal-sample-name", help="Normal sample name") +@click.option( + "-g", + "--genome-version", + default="hg19", + type=click.Choice(["hg19", "hg38", "canfam3"]), + help=( + "Genome version to prepare reference. 
Path to genome" + "will be /genome_version" + ), +) +@click.pass_context +def qc_config( + context, + case_id, + umi, + umi_trim_length, + adapter_trim, + quality_trim, + panel_bed, + analysis_dir, + tumor, + normal, + tumor_sample_name, + normal_sample_name, + genome_version, + balsamic_cache, + container_version, +): + + if container_version: + balsamic_version = container_version + + try: + samples = get_sample_dict( + tumor=tumor, + normal=normal, + tumor_sample_name=tumor_sample_name, + normal_sample_name=normal_sample_name, + ) + except AttributeError: + LOG.error(f"File name is invalid, use convention [SAMPLE_ID]_R_[1,2].fastq.gz") + raise click.Abort() + + reference_config = os.path.join( + balsamic_cache, balsamic_version, genome_version, "reference.json" + ) + with open(reference_config, "r") as f: + reference_dict = json.load(f)["reference"] + + config_collection_dict = BalsamicConfigModel( + QC={ + "quality_trim": quality_trim, + "adapter_trim": adapter_trim, + "umi_trim": umi if panel_bed else False, + "umi_trim_length": umi_trim_length, + }, + analysis={ + "case_id": case_id, + "analysis_dir": analysis_dir, + "analysis_type": "qc_panel", + "sequencing_type": "targeted" if panel_bed else "wgs", + }, + reference=reference_dict, + singularity=os.path.join(balsamic_cache, balsamic_version, "containers"), + samples=samples, + bioinfo_tools=BIOINFO_TOOL_ENV, + bioinfo_tools_version=get_bioinfo_tools_version( + BIOINFO_TOOL_ENV, CONTAINERS_CONDA_ENV_PATH + ), + panel={ + "capture_kit": panel_bed, + "chrom": get_panel_chrom(panel_bed), + } + if panel_bed + else None, + umiworkflow=False, + ).dict(by_alias=True, exclude_none=True) + LOG.info("QC config file generated successfully") + + Path.mkdir( + Path(config_collection_dict["analysis"]["fastq_path"]), + parents=True, + exist_ok=True, + ) + LOG.info("Directories created successfully") + + create_fastq_symlink( + casefiles=(tumor + normal), + symlink_dir=Path(config_collection_dict["analysis"]["fastq_path"]), + ) + LOG.info(f"Symlinks generated successfully") + + config_path = Path(analysis_dir) / case_id / (case_id + "_QC.json") + with open(config_path, "w+") as fh: + fh.write(json.dumps(config_collection_dict, indent=4)) + LOG.info(f"QC config file saved successfully - {config_path}") + + try: + generate_graph(config_collection_dict, config_path) + LOG.info(f"BALSAMIC QC Workflow has been configured successfully!") + except ValueError: + LOG.error( + f'BALSAMIC QC dag graph generation failed - {config_collection_dict["analysis"]["dag"]}', + ) + raise click.Abort() diff --git a/BALSAMIC/commands/init/base.py b/BALSAMIC/commands/init/base.py index 6adfc5d10..78032b302 100644 --- a/BALSAMIC/commands/init/base.py +++ b/BALSAMIC/commands/init/base.py @@ -57,11 +57,11 @@ is_flag=True, help="Force re-downloading all containers", ) -@click.option("-c", "--cosmic-key", required=True, help="cosmic db authentication key") +@click.option("-c", "--cosmic-key", required=False, help="cosmic db authentication key") @click.option( "-s", "--snakefile", - default=get_snakefile("generate_ref"), + default=None, type=click.Path(), show_default=True, help="snakefile for reference generation", @@ -77,7 +77,7 @@ "-g", "--genome-version", default="hg19", - type=click.Choice(["hg19", "hg38"]), + type=click.Choice(["hg19", "hg38", "canfam3"]), help=( "Genome version to prepare reference. 
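Since qc_panel is a plain click command, it can also be exercised programmatically; a hedged sketch using click's test runner, where every path is a placeholder that must exist on disk because the options are declared with click.Path(exists=True):

```python
# Hedged sketch of driving `balsamic config qc_panel` through click's
# test runner; all paths below are placeholders and must exist.
from click.testing import CliRunner

from BALSAMIC.commands.config.qc import qc_config

runner = CliRunner()
result = runner.invoke(
    qc_config,
    [
        "--case-id", "case001",
        "--analysis-dir", "/tmp/analysis",
        "--balsamic-cache", "/tmp/balsamic_cache",
        "--genome-version", "canfam3",
        # fastq names must follow the [SAMPLE_ID]_R_[1,2].fastq.gz convention
        "-t", "/tmp/fastq/case001_R_1.fastq.gz",
        "-t", "/tmp/fastq/case001_R_2.fastq.gz",
    ],
)
print(result.exit_code, result.output)
```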
Path to genome" "will be /genome_version" @@ -212,6 +212,10 @@ def initialize( ) raise click.Abort() + if genome_version in ["hg38", "hg19"] and not cosmic_key: + LOG.error("cosmic db authentication key required with hg38 and hg19") + raise click.Abort() + # resolve outdir to absolute path outdir = Path(outdir).resolve() container_outdir = Path(outdir, balsamic_version, "containers") @@ -261,6 +265,10 @@ def initialize( write_json(config_dict, config_json) LOG.info("Reference generation workflow configured successfully - %s" % config_json) + snakefile = ( + snakefile if snakefile else get_snakefile("generate_ref", genome_version) + ) + with CaptureStdout() as graph_dot: snakemake.snakemake( snakefile=snakefile, diff --git a/BALSAMIC/commands/report/deliver.py b/BALSAMIC/commands/report/deliver.py index e258f9127..81b050d5e 100644 --- a/BALSAMIC/commands/report/deliver.py +++ b/BALSAMIC/commands/report/deliver.py @@ -5,20 +5,16 @@ import yaml import click import snakemake -import datetime import subprocess from pathlib import Path -from BALSAMIC.constants.quality_check_reporting import METRICS_TO_DELIVER -from BALSAMIC.utils.cli import get_file_extension +from BALSAMIC.utils.cli import get_file_extension, read_yaml from BALSAMIC.utils.cli import write_json from BALSAMIC.utils.cli import get_snakefile from BALSAMIC.utils.cli import SnakeMake from BALSAMIC.utils.cli import convert_deliverables_tags -from BALSAMIC.utils.rule import get_result_dir, get_capture_kit +from BALSAMIC.utils.rule import get_result_dir from BALSAMIC.utils.exc import BalsamicError -from BALSAMIC.utils.qc_metrics import get_qc_metrics_json, extract_metrics_for_delivery -from BALSAMIC.utils.qc_report import render_html, report_data_population from BALSAMIC.constants.workflow_params import VCF_DICT from BALSAMIC.constants.workflow_rules import DELIVERY_RULES @@ -35,25 +31,6 @@ required=True, help="Sample config file. Output of balsamic config sample", ) -@click.option( - "--sample-id-map", - required=False, - help=( - "Separated internal sample ID with external ID. Use comma for" - "multiple samples. These IDs MUST exist in sample-config." - "Syntax: internal_id:sample_type:external_id" - ". e.g. ACC1:tumor:KS454,ACC2:normal:KS556" - ), -) -@click.option( - "--case-id-map", - required=False, - help=( - "Separated internal case ID with external ID." - "Syntax: gene_panel_name:external_id" - ". e.g. gmck-solid:KSK899:apptag" - ), -) @click.option( "-a", "--analysis-type", @@ -86,14 +63,6 @@ help=f"Run workflow with selected variant caller(s) disable. Use comma to remove multiple variant callers. Valid " f"values are: {list(VCF_DICT.keys())}", ) -@click.option( - "--qc-metrics/--no-qc-metrics", - default=True, - show_default=True, - is_flag=True, - help=f"Generates a YAML file of quality control metrics. " - f"Currently retrieved metrics: {', '.join(list(set(METRICS_TO_DELIVER['targeted'] + METRICS_TO_DELIVER['wgs'])))}", -) @click.pass_context def deliver( context, @@ -102,9 +71,6 @@ def deliver( rules_to_deliver, delivery_mode, disable_variant_caller, - sample_id_map, - case_id_map, - qc_metrics, ): """ cli for deliver sub-command. 
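The relaxed --cosmic-key option is backstopped by the explicit check added below it; condensed into a standalone function, it shows why canfam3 can run keyless while the human builds cannot:

```python
# Condensed form of the new guard in `balsamic init`: COSMIC credentials
# are enforced only for the human genome builds.
from typing import Optional


def validate_cosmic_key(genome_version: str, cosmic_key: Optional[str]) -> None:
    if genome_version in ["hg38", "hg19"] and not cosmic_key:
        raise ValueError("cosmic db authentication key required with hg38 and hg19")


validate_cosmic_key("canfam3", None)   # passes: no key needed
validate_cosmic_key("hg19", "secret")  # passes: key supplied
```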
@@ -138,40 +104,8 @@ def deliver( if analysis_type else sample_config_dict["analysis"]["analysis_type"] ) - sequencing_type = sample_config_dict["analysis"]["sequencing_type"] - snakefile = get_snakefile(analysis_type, sequencing_type) - - balsamic_qc_report = None - if sequencing_type != "wgs" and sample_id_map and case_id_map: - case_id_map = case_id_map.split(":") - sample_id_map = sample_id_map.split(",") - sample_map = dict() - sample_type = dict() - for sample in sample_id_map: - lims_id = sample.split(":")[0] - sample_map[lims_id] = sample.split(":")[1] - sample_type[lims_id] = sample.split(":")[2] - - meta = dict() - meta["sample_map"] = sample_map - meta["sample_type"] = sample_type - meta["now"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") - meta["config_date"] = sample_config_dict["analysis"]["config_creation_date"] - meta["internal_case_id"] = case_name - meta["gene_panel_name"] = case_id_map[0] - meta["case_name"] = case_id_map[1] - meta["apptag"] = case_id_map[2] - - collected_qc = get_qc_metrics_json( - sample_config_dict["analysis"]["result"], - sequencing_type, - get_capture_kit(sample_config_dict), - ) - meta = report_data_population(collected_qc=collected_qc, meta=meta) - balsamic_qc_report = os.path.join( - yaml_write_directory, case_name + "_qc_report.html" - ) - balsamic_qc_report = render_html(meta=meta, html_out=balsamic_qc_report) + reference_genome = sample_config_dict["reference"]["reference_genome"] + snakefile = get_snakefile(analysis_type, reference_genome) report_file_name = os.path.join( yaml_write_directory, sample_config_dict["analysis"]["case_id"] + "_report.html" @@ -253,41 +187,6 @@ def deliver( "id": case_name, } ) - # Add balsamic_qc_report - if balsamic_qc_report: - delivery_json["files"].append( - { - "path": balsamic_qc_report, - "step": "balsamic_delivery", - "format": get_file_extension(balsamic_qc_report), - "tag": ["coverage-qc-report"], - "id": case_name, - } - ) - - # Add output metrics delivery to report - if qc_metrics: - metric_delivery_report = os.path.join( - yaml_write_directory, case_name + "_metrics_deliverables.yaml" - ) - metrics = extract_metrics_for_delivery( - sample_config_dict["analysis"]["result"], sequencing_type - ) - - with open(metric_delivery_report, "w") as fn: - yaml.dump(metrics, fn, default_flow_style=False) - - LOG.info(f"Created metrics delivery file: {metric_delivery_report}") - - delivery_json["files"].append( - { - "path": metric_delivery_report, - "step": "balsamic_delivery", - "format": get_file_extension(metric_delivery_report), - "tag": ["qc-metrics-yaml"], - "id": case_name, - } - ) write_json(delivery_json, delivery_file_name) with open(delivery_file_name + ".yaml", "w") as fn: diff --git a/BALSAMIC/commands/report/status.py b/BALSAMIC/commands/report/status.py index d8c427053..5658246f1 100644 --- a/BALSAMIC/commands/report/status.py +++ b/BALSAMIC/commands/report/status.py @@ -53,8 +53,8 @@ def status(context, sample_config, show_only_missing, print_files): result_dir = get_result_dir(sample_config_dict) analysis_type = sample_config_dict["analysis"]["analysis_type"] - sequencing_type = sample_config_dict["analysis"]["sequencing_type"] - snakefile = get_snakefile(analysis_type, sequencing_type) + reference_genome = sample_config_dict["reference"]["reference_genome"] + snakefile = get_snakefile(analysis_type, reference_genome) if os.path.isfile(os.path.join(result_dir, "analysis_finish")): snakemake.snakemake( diff --git a/BALSAMIC/commands/run/analysis.py b/BALSAMIC/commands/run/analysis.py index 
2bd4896a8..30a83d702 100644 --- a/BALSAMIC/commands/run/analysis.py +++ b/BALSAMIC/commands/run/analysis.py @@ -210,7 +210,6 @@ def analysis( resultpath = sample_config["analysis"]["result"] benchmarkpath = sample_config["analysis"]["benchmark"] case_name = sample_config["analysis"]["case_id"] - sequencing_type = sample_config["analysis"]["sequencing_type"] if run_analysis: # if not dry run, then create (new) log/script directory @@ -239,6 +238,8 @@ def analysis( bind_path.append(sample_config.get("panel").get("capture_kit")) if "background_variants" in sample_config: bind_path.append(sample_config.get("background_variants")) + if "pon_cnn" in sample_config: + bind_path.append(sample_config.get("panel").get("pon_cnn")) bind_path.append(BALSAMIC_SCRIPTS) bind_path.append(sample_config["analysis"]["analysis_dir"]) bind_path.extend(get_fastq_bind_path(sample_config["analysis"]["fastq_path"])) @@ -252,9 +253,7 @@ def analysis( ).as_posix() + "/" ) - balsamic_run.snakefile = ( - snake_file if snake_file else get_snakefile(analysis_type, sequencing_type) - ) + balsamic_run.snakefile = snake_file if snake_file else get_snakefile(analysis_type) balsamic_run.configfile = sample_config_path balsamic_run.run_mode = run_mode balsamic_run.cluster_config = cluster_config diff --git a/BALSAMIC/config/MSK_impact.json b/BALSAMIC/config/MSK_impact.json deleted file mode 100644 index d45582e58..000000000 --- a/BALSAMIC/config/MSK_impact.json +++ /dev/null @@ -1,119 +0,0 @@ -{ - "version": - "0.1.1", - "reference_documents": - ["https://www.accessdata.fda.gov/cdrh_docs/reviews/DEN170058.pdf"], - "base_line": {}, - "filters": { - "set_1": { - "VF_ratio": "5", - "name": "'MSK-IMPACT high confidence'", - "in_mvl": "T", - "variantcaller": ["MUTECT2"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - }, - "set_2": { - "VF_ratio": "5", - "name": "'MSK-IMPACT low confidence'", - "in_mvl": "F", - "variantcaller": ["MUTECT2"], - "TUMOR": { - "DP": "20", - "AD": "10", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - }, - "set_3": { - "VF_ratio": "5", - "name": "'Discovery High confidence'", - "in_mvl": "T", - "variantcaller": ["MUTECT2", "VARDICT", "STRELKA"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - }, - "set_4": { - "VF_ratio": "5", - "name": "'Discovery low confidence'", - "in_mvl": "F", - "variantcaller": ["MUTECT2", "VARDICT", "STRELKA"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", 
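Note that `balsamic config case` stores the new PON reference inside the nested "panel" section, while the bind-path hunk above tests for "pon_cnn" at the top level of sample_config; a sketch of a guard that matches the nested layout (sample_config stands in for a loaded case config):

```python
# Sketch of a pon_cnn bind-path guard matching the nested layout written
# by `balsamic config case` (panel={"capture_kit": ..., "chrom": ...,
# "pon_cnn": ...}); sample_config is a stand-in for the loaded config.
sample_config = {"panel": {"capture_kit": "panel.bed", "pon_cnn": "pon.cnn"}}

bind_path = []
if sample_config.get("panel", {}).get("pon_cnn"):
    bind_path.append(sample_config["panel"]["pon_cnn"])

assert bind_path == ["pon.cnn"]
```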
"nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - }, - "set_5": { - "VF_ratio": "1", - "name": "'Discovery extra'", - "in_mvl": "F", - "variantcaller": ["MUTECT2", "VARDICT", "STRELKA"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - } - } -} diff --git a/BALSAMIC/config/MSK_impact_noStrelka.json b/BALSAMIC/config/MSK_impact_noStrelka.json deleted file mode 100644 index 1db906629..000000000 --- a/BALSAMIC/config/MSK_impact_noStrelka.json +++ /dev/null @@ -1,75 +0,0 @@ -{ - "version": - "0.1.1", - "reference_documents": - ["https://www.accessdata.fda.gov/cdrh_docs/reviews/DEN170058.pdf"], - "base_line": {}, - "filters": { - "set_1": { - "VF_ratio": "5", - "name": "'High confidence set (in MSK-IMPACT)'", - "in_mvl": "T", - "variantcaller": ["MUTECT2"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - }, - "set_2": { - "VF_ratio": "5", - "name": "'Low confidence set (not in MSK-IMPACT)'", - "in_mvl": "F", - "variantcaller": ["MUTECT2"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - }, - "set_3": { - "VF_ratio": "5", - "name": "'Discovery High confidence (in MSK-IMPACT)'", - "in_mvl": "T", - "variantcaller": ["MUTECT2", "VARDICT"], - "TUMOR": { - "DP": "20", - "AD": "8", - "AF_max": "1", - "AF_min": "0.01" - }, - "annotation": { - "SNV": [ - "missense_variant", "nonsynonymous_variant", "stop_gained", - "stop_lost", "start_lost", "splice_acceptor_variant", - "splice_donor_variant", "splice_donor_5th_base_variant", - "splice_site_variant", "splicing_variant" - ], - "INDEL": - ["frameshift_variant", "frameshift", "non-frameshift"] - } - } - } -} diff --git a/BALSAMIC/config/analysis.json b/BALSAMIC/config/analysis.json index da437c5c7..bbfd71a83 100644 --- a/BALSAMIC/config/analysis.json +++ b/BALSAMIC/config/analysis.json @@ -22,14 +22,6 @@ "mutation": "somatic", "type": "SNV" }, - "pindel": { - "mutation": "somatic", - "type": "SV" - }, - "strelka": { - "mutation": "somatic", - "type": "SNV" - }, "mutect": { "mutation": "somatic", "type": "SNV" @@ -38,10 +30,6 @@ "mutation": "somatic", "type": "SNV" }, - "tnsnv": { - "mutation": "somatic", - "type": "SNV" - }, "tnhaplotyper": { "mutation": "somatic", "type": "SNV" @@ -54,25 +42,25 @@ "mutation": "germline", "type": "SV" }, - "haplotypecaller": { - 
"mutation": "germline", - "type": "SNV" - }, - "strelka_germline": { - "mutation": "germline", - "type": "SNV" - }, "vcfmerge":{ "mutation": "somatic", "type": "SNV" }, - "delly":{ + "dellysv":{ "mutation": "somatic", "type": "SV" }, + "dellycnv":{ + "mutation": "somatic", + "type": "CNV" + }, "ascat": { "mutation": "somatic", "type": "SV" + }, + "svdb": { + "mutation": "somatic", + "type": "SV" } } } diff --git a/BALSAMIC/config/balsamic_env.yaml b/BALSAMIC/config/balsamic_env.yaml index 3c49b6ffb..ce56b91ce 100644 --- a/BALSAMIC/config/balsamic_env.yaml +++ b/BALSAMIC/config/balsamic_env.yaml @@ -6,7 +6,6 @@ align_qc: - picard - multiqc - fastp - - csvkit annotate: - ensembl-vep - vcfanno @@ -19,8 +18,8 @@ varcall_py36: - gatk - vardict - libiconv + - svdb varcall_py27: - - strelka - manta varcall_cnvkit: - cnvkit @@ -30,4 +29,6 @@ vcf_merge: delly: - delly ascatNgs: - -ascat + - ascat +vcf2cytosure: + - vcf2cytosure diff --git a/BALSAMIC/config/cluster.json b/BALSAMIC/config/cluster.json index a32679a8a..d1e1a6a17 100644 --- a/BALSAMIC/config/cluster.json +++ b/BALSAMIC/config/cluster.json @@ -11,10 +11,6 @@ "time": "00:15:00", "n": 1 }, - "BaseRecalibrator": { - "time": "15:00:00", - "n": 10 - }, "CollectAlignmentSummaryMetrics": { "time": "03:30:00", "n": 8 @@ -35,10 +31,6 @@ "time": "03:30:00", "n": 8 }, - "RealignerTargetCreator": { - "time": "10:00:00", - "n": 10 - }, "bwa_mem": { "time": "08:00:00", "n": 16 @@ -60,14 +52,6 @@ "time": "12:00:00", "n": 5 }, - "gatk_haplotypecaller": { - "time": "03:00:00", - "n": 10 - }, - "haplotypecaller_merge": { - "time": "01:30:00", - "n": 8 - }, "manta_germline": { "time": "05:00:00", "n": 16 @@ -100,18 +84,6 @@ "time": "00:15:00", "n": 4 }, - "mutect2_merge": { - "time": "01:30:00", - "n": 8 - }, - "mutect2_tumor_normal": { - "time": "24:00:00", - "n": 12 - }, - "mutect2_tumor_only": { - "time": "24:00:00", - "n": 12 - }, "sambamba_exon_depth": { "time": "02:30:00", "n": 8 @@ -148,14 +120,6 @@ "time": "24:00:00", "n": 24 }, - "sentieon_TNsnv": { - "time": "24:00:00", - "n": 24 - }, - "sentieon_TNsnv_tumor_only": { - "time": "24:00:00", - "n": 24 - }, "sentieon_align_sort": { "time": "24:00:00", "n": 24 @@ -180,14 +144,6 @@ "time": "06:00:00", "n": 10 }, - "strelka_germline": { - "time": "08:00:00", - "n": 10 - }, - "strelka_tumor_normal": { - "time": "10:00:00", - "n": 10 - }, "vardict_merge": { "time": "01:30:00", "n": 5 @@ -220,10 +176,14 @@ "time": "4:00:00", "n": 12 }, - "vep_somatic": { + "vep_somatic_snv": { "time":"18:00:00", "n": 24 }, + "vep_somatic_sv": { + "time":"12:00:00", + "n": 24 + }, "vep_germline": { "time":"06:00:00", "n": 10 @@ -259,5 +219,21 @@ "ascat_tumor_normal_merge_output": { "time": "00:15:00", "n": 1 + }, + "collect_custom_qc_metrics": { + "time": "00:15:00", + "n": 1 + }, + "svdb_merge_tumor_normal": { + "time": "01:00:00", + "n": 8 + }, + "svdb_merge_tumor_only": { + "time": "01:00:00", + "n": 8 + }, + "bcftools_filter_svdb": { + "time": "01:00:00", + "n": 8 } } diff --git a/BALSAMIC/constants/common.py b/BALSAMIC/constants/common.py index bd0494ff0..44a75a7a0 100644 --- a/BALSAMIC/constants/common.py +++ b/BALSAMIC/constants/common.py @@ -36,7 +36,7 @@ # Analysis related constants MUTATION_CLASS = ["somatic", "germline"] MUTATION_TYPE = ["SNV", "SV", "CNV"] -ANALYSIS_TYPES = ["paired", "single", "qc", "pon"] +ANALYSIS_TYPES = ["paired", "single", "qc_panel", "pon"] WORKFLOW_SOLUTION = ["BALSAMIC", "Sentieon", "DRAGEN", "Sentieon_umi"] SEQUENCING_TYPE = ["wgs", "targeted"] @@ -52,6 +52,7 @@ "delly", "ascatNgs", 
"balsamic", + "vcf2cytosure", } BIOINFO_TOOL_ENV = { @@ -62,7 +63,6 @@ "picard": "align_qc", "multiqc": "align_qc", "fastp": "align_qc", - "csvkit": "align_qc", "ensembl-vep": "annotate", "genmod": "annotate", "vcfanno": "annotate", @@ -72,12 +72,13 @@ "tabix": "varcall_py36", "gatk": "varcall_py36", "vardict": "varcall_py36", - "strelka": "varcall_py27", + "svdb": "varcall_py36", "manta": "varcall_py27", "cnvkit": "varcall_cnvkit", "delly": "delly", "ascatNgs": "ascatNgs", "sentieon": "sentieon", + "vcf2cytosure": "vcf2cytosure", } VALID_OPS = { diff --git a/BALSAMIC/constants/qc_metrics.py b/BALSAMIC/constants/qc_metrics.py new file mode 100644 index 000000000..f2acc4a8a --- /dev/null +++ b/BALSAMIC/constants/qc_metrics.py @@ -0,0 +1,52 @@ +METRICS = { + "targeted": { + "default": { + "MEAN_INSERT_SIZE": {"condition": None}, + "PERCENT_DUPLICATION": {"condition": None}, + "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 500}}, + "PCT_TARGET_BASES_50X": {"condition": None}, + "PCT_TARGET_BASES_100X": {"condition": None}, + "PCT_TARGET_BASES_250X": {"condition": None}, + "PCT_TARGET_BASES_500X": {"condition": None}, + "PCT_TARGET_BASES_1000X": {"condition": None}, + "MEAN_TARGET_COVERAGE": {"condition": None}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, + "PCT_OFF_BAIT": {"condition": None}, + }, + "gicfdna": { + "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 1000}}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.6}}, + }, + "gmcksolid": { + "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 500}}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, + }, + "gmsmyeloid": { + "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 1000}}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.6}}, + }, + "lymphoma": { + "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 1000}}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.6}}, + }, + "gmslymphoid": { + "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 1000}}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.6}}, + }, + "twistexome": { + "MEDIAN_TARGET_COVERAGE": {"condition": {"norm": "gt", "threshold": 100}}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, + }, + }, + "wgs": { + "MEAN_INSERT_SIZE": {"condition": None}, + "MEDIAN_COVERAGE": {"condition": None}, + "FastQC_mqc-generalstats-fastqc-percent_duplicates": {"condition": None}, + "PCT_15X": {"condition": None}, + "PCT_30X": {"condition": None}, + "PCT_60X": {"condition": None}, + "PCT_100X": {"condition": None}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, + }, + "variants": {"NUMBER_OF_SITES": {"condition": {"norm": "lt", "threshold": 10000}}}, +} diff --git a/BALSAMIC/constants/quality_check_reporting.py b/BALSAMIC/constants/quality_check_reporting.py deleted file mode 100644 index 3516cccad..000000000 --- a/BALSAMIC/constants/quality_check_reporting.py +++ /dev/null @@ -1,163 +0,0 @@ -REPORT_MODEL = { - "qc": { - "MEDIAN_TARGET_COVERAGE": { - "sv": "Mediansekvensdjup [x]", - "en": "Median sequencing depth [x]", - "decimal": 0, - }, - "FOLD_80_BASE_PENALTY": { - "sv": "Fold 80 base penalty", - "en": "Fold 80 base penalty", - "decimal": 2, - }, - "MEAN_INSERT_SIZE": { - "sv": "Fragmentlängd, medel [baspar]", - "en": "Mean insert size [base pair]", - "decimal": 2, - }, - }, - "coverage": { 
- "PCT_TARGET_BASES_50X": { - "sv": "Täckningsgrad [50X]", - "en": "Target coverage [50X]", - "decimal": 2, - "as_percent": True, - }, - "PCT_TARGET_BASES_100X": { - "sv": "Täckningsgrad [100X]", - "en": "Target coverage [100X]", - "decimal": 2, - "as_percent": True, - }, - "PCT_TARGET_BASES_250X": { - "sv": "Täckningsgrad [250X]", - "en": "Target coverage [250X]", - "decimal": 2, - "as_percent": True, - }, - "PCT_TARGET_BASES_500X": { - "sv": "Täckningsgrad [500X]", - "en": "Target coverage [500X]", - "decimal": 2, - "as_percent": True, - }, - "PCT_TARGET_BASES_1000X": { - "sv": "Täckningsgrad [1000X]", - "en": "Target coverage [1000X]", - "decimal": 2, - "as_percent": True, - }, - }, -} - -METRIC_FILES = { - "picard_insertSize": "multiqc_picard_insertSize.json", - "picard_dups": "multiqc_picard_dups.json", - "picard_HsMetrics": "multiqc_picard_HsMetrics.json", - "picard_wgsmetrics": "multiqc_picard_wgsmetrics.json", -} - -METRICS = { - "qc": { - "targeted": { - "default": { - METRIC_FILES["picard_insertSize"]: { - "MEAN_INSERT_SIZE": {"condition": None}, - }, - METRIC_FILES["picard_dups"]: { - "PERCENT_DUPLICATION": {"condition": None} - }, - METRIC_FILES["picard_HsMetrics"]: { - "MEDIAN_TARGET_COVERAGE": {"condition": None}, - "PCT_TARGET_BASES_50X": {"condition": None}, - "PCT_TARGET_BASES_100X": {"condition": None}, - "PCT_TARGET_BASES_250X": {"condition": None}, - "PCT_TARGET_BASES_500X": {"condition": None}, - "PCT_TARGET_BASES_1000X": {"condition": None}, - "MEAN_TARGET_COVERAGE": {"condition": None}, - "FOLD_80_BASE_PENALTY": {"condition": None}, - "PCT_OFF_BAIT": {"condition": None}, - }, - }, - "gicfdna_3.1_hg19_design.bed": { - METRIC_FILES["picard_HsMetrics"]: { - "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 1000} - }, - "FOLD_80_BASE_PENALTY": { - "condition": {"norm": "lt", "threshold": 1.6} - }, - } - }, - "gmcksolid_4.1_hg19_design.bed": { - METRIC_FILES["picard_HsMetrics"]: { - "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 500} - }, - "FOLD_80_BASE_PENALTY": { - "condition": {"norm": "lt", "threshold": 1.8} - }, - } - }, - "gmsmyeloid_5.2_hg19_design.bed": { - METRIC_FILES["picard_HsMetrics"]: { - "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 1000} - }, - "FOLD_80_BASE_PENALTY": { - "condition": {"norm": "lt", "threshold": 1.6} - }, - } - }, - "lymphoma_6.1_hg19_design.bed": { - METRIC_FILES["picard_HsMetrics"]: { - "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 1000} - }, - "FOLD_80_BASE_PENALTY": { - "condition": {"norm": "lt", "threshold": 1.6} - }, - } - }, - "gmslymphoid_7.1_hg19_design.bed": { - METRIC_FILES["picard_HsMetrics"]: { - "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 1000} - }, - "FOLD_80_BASE_PENALTY": { - "condition": {"norm": "lt", "threshold": 1.6} - }, - } - }, - "twistexomerefseq_9.1_hg19_design.bed": { - METRIC_FILES["picard_HsMetrics"]: { - "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 100} - }, - "FOLD_80_BASE_PENALTY": { - "condition": {"norm": "lt", "threshold": 1.8} - }, - } - }, - }, - "wgs": { - METRIC_FILES["picard_wgsmetrics"]: { - "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}} - }, - }, - } -} - -METRICS_TO_DELIVER = { - "targeted": [ - "MEAN_INSERT_SIZE", - "PERCENT_DUPLICATION", - "MEAN_TARGET_COVERAGE", - "MEDIAN_TARGET_COVERAGE", - "FOLD_80_BASE_PENALTY", - "PCT_OFF_BAIT", - ], - "wgs": [ - "FOLD_80_BASE_PENALTY", - ], -} diff --git 
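The flat METRICS constant that replaces this file resolves panel-specific conditions by substring match against the capture-kit path; a minimal sketch mirroring get_requested_metrics() and get_qc_supported_capture_kit(), with an example bed file name:

```python
# Minimal sketch of condition resolution for a targeted run; copying
# "default" first avoids mutating the module-level constant.
from BALSAMIC.constants.qc_metrics import METRICS

capture_kit = "gmcksolid_4.1_hg19_design.bed"  # example capture kit

requested = dict(METRICS["targeted"]["default"])
panel_key = next(
    (k for k in METRICS["targeted"] if k != "default" and k in capture_kit), None
)
if panel_key:  # "gmcksolid" matches the bed file name
    requested.update(METRICS["targeted"][panel_key])

print(requested["MEDIAN_TARGET_COVERAGE"]["condition"])
# {'norm': 'gt', 'threshold': 500}
```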
a/BALSAMIC/constants/reference.py b/BALSAMIC/constants/reference.py index 7b6e08f74..d27b6c0fc 100644 --- a/BALSAMIC/constants/reference.py +++ b/BALSAMIC/constants/reference.py @@ -1,6 +1,6 @@ # reference related constants VALID_REF_FORMAT = ["fasta", "vcf", "text", "gtf", "gff"] -VALID_GENOME_VER = ["hg19", "hg38"] +VALID_GENOME_VER = ["hg19", "hg38", "canfam3"] # reference files REFERENCE_FILES = { @@ -133,6 +133,30 @@ "output_file": "delly_exclusion.tsv", "output_path": "genome", }, + "delly_mappability": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh38.delly.blacklist.gz", + "file_type": "text", + "gzip": False, + "genome_version": "hg38", + "output_file": "delly_mappability.gz", + "output_path": "genome", + }, + "delly_mappability_gindex": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh38.delly.blacklist.gz.gzi", + "file_type": "text", + "gzip": False, + "genome_version": "hg38", + "output_file": "delly_mappability.gz.gzi", + "output_path": "genome", + }, + "delly_mappability_findex": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh38.delly.blacklist.gz.fai", + "file_type": "text", + "gzip": False, + "genome_version": "hg38", + "output_file": "delly_mappability.gz.fai", + "output_path": "genome", + }, "ascat_gccorrection": { "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/35465e2644f76f2d59427a9b379d34ecea71f259/cancer/references/hg38_SnpGcCorrections.tsv.gz", "file_type": "text", @@ -287,6 +311,30 @@ "output_file": "delly_exclusion.tsv", "output_path": "genome", }, + "delly_mappability": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh37.delly.blacklist.gz", + "file_type": "text", + "gzip": False, + "genome_version": "hg19", + "output_file": "delly_mappability.gz", + "output_path": "genome", + }, + "delly_mappability_gindex": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh37.delly.blacklist.gz.gzi", + "file_type": "text", + "gzip": False, + "genome_version": "hg19", + "output_file": "delly_mappability.gz.gzi", + "output_path": "genome", + }, + "delly_mappability_findex": { + "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/86aab2d10c5ffc009bc8c68ad077ab7283d8fe06/cancer/references/GRCh37.delly.blacklist.gz.fai", + "file_type": "text", + "gzip": False, + "genome_version": "hg19", + "output_file": "delly_mappability.gz.fai", + "output_path": "genome", + }, "ascat_gccorrection": { "url": "https://raw.githubusercontent.com/Clinical-Genomics/reference-files/12a6c760fd542c02de2cda286b6245e46f4b6a97/cancer/references/GRCh37_SnpGcCorrections.tsv.gz", "file_type": "text", @@ -312,4 +360,38 @@ "output_path": "variants", }, }, + "canfam3": { + "reference_genome": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/bigZips/canFam3.fa.gz", + "file_type": "fasta", + "gzip": True, + "genome_version": "canfam3", + "output_file": "canFam3.fasta", + "output_path": "genome", + }, + "refgene_txt": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/database/refGene.txt.gz", + "file_type": "text", + "gzip": True, + "genome_version": "canfam3", + 
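The new canfam3 entries carry the same fields as the hg19/hg38 ones, so the download targets can be listed straight from the constant (assumes a BALSAMIC checkout on PYTHONPATH):

```python
# Print where each canfam3 reference lands relative to the output dir,
# using the output_path/output_file fields defined in REFERENCE_FILES.
from BALSAMIC.constants.reference import REFERENCE_FILES

for name, ref in REFERENCE_FILES["canfam3"].items():
    print(f"{name}: {ref['output_path']}/{ref['output_file']} (gzip={ref['gzip']})")
```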
"output_file": "canfam3_refGene.txt", + "output_path": "genome", + }, + "refgene_sql": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/database/refGene.sql", + "file_type": "text", + "gzip": False, + "genome_version": "canfam3", + "output_file": "canfam3_refGene.sql", + "output_path": "genome", + }, + "genome_chrom_size": { + "url": "https://hgdownload.cse.ucsc.edu/goldenPath/canFam3/bigZips/canFam3.chrom.sizes", + "file_type": "text", + "gzip": False, + "genome_version": "canfam3", + "output_file": "canfam3.chrom.sizes", + "output_path": "genome", + }, + }, } diff --git a/BALSAMIC/constants/variant_filters.py b/BALSAMIC/constants/variant_filters.py index 44e78a746..3ad66fe0b 100644 --- a/BALSAMIC/constants/variant_filters.py +++ b/BALSAMIC/constants/variant_filters.py @@ -47,6 +47,11 @@ "filter_name": "balsamic_high_pop_freq", "field": "INFO", }, + "pop_freq_umi": { + "tag_value": 0.02, + "filter_name": "balsamic_umi_high_pop_freq", + "field": "INFO", + }, "qss": { "tag_value": 20, "filter_name": "balsamic_low_quality_scores", diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index 206c541e2..f65780c87 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -71,11 +71,18 @@ "sequencing_type": ["targeted"], "workflow_solution": ["BALSAMIC"], }, - "delly": { + "dellysv": { "mutation": "somatic", "type": "SV", "analysis_type": ["paired", "single"], - "sequencing_type": ["wgs", "targeted"], + "sequencing_type": ["targeted", "wgs"], + "workflow_solution": ["BALSAMIC"], + }, + "dellycnv": { + "mutation": "somatic", + "type": "CNV", + "analysis_type": ["single"], + "sequencing_type": ["targeted", "wgs"], "workflow_solution": ["BALSAMIC"], }, "ascat": { @@ -85,6 +92,13 @@ "sequencing_type": ["wgs"], "workflow_solution": ["BALSAMIC"], }, + "svdb": { + "mutation": "somatic", + "type": "SV", + "analysis_type": ["paired", "single"], + "sequencing_type": ["targeted", "wgs"], + "workflow_solution": ["BALSAMIC"], + }, } WORKFLOW_PARAMS = { @@ -137,6 +151,7 @@ "init_tumorLOD": 0.5, "error_rate": 5, "prunefactor": 3, + "padding": 100, "disable_detect": "sv", }, } diff --git a/BALSAMIC/constants/workflow_rules.py b/BALSAMIC/constants/workflow_rules.py index 87f231178..0304d68ed 100644 --- a/BALSAMIC/constants/workflow_rules.py +++ b/BALSAMIC/constants/workflow_rules.py @@ -5,11 +5,15 @@ "snakemake_rules/quality_control/fastp.rule", "snakemake_rules/quality_control/fastqc.rule", "snakemake_rules/quality_control/multiqc.rule", + "snakemake_rules/quality_control/qc_metrics.rule", "snakemake_rules/variant_calling/mergetype_tumor.rule", ], "align": [], "varcall": ["snakemake_rules/variant_calling/germline_sv.rule"], - "annotate": ["snakemake_rules/annotation/vep.rule"], + "annotate": [ + "snakemake_rules/annotation/vep.rule", + "snakemake_rules/annotation/varcaller_sv_filter.rule", + ], }, "single_targeted": { "qc": [ @@ -36,8 +40,8 @@ ], "annotate": [ "snakemake_rules/annotation/rankscore.rule", - "snakemake_rules/annotation/varcaller_sv_filter.rule", "snakemake_rules/annotation/varcaller_filter_tumor_only.rule", + "snakemake_rules/annotation/vcf2cytosure_convert.rule", ], }, "paired_targeted": { @@ -68,8 +72,8 @@ ], "annotate": [ "snakemake_rules/annotation/rankscore.rule", - "snakemake_rules/annotation/varcaller_sv_filter.rule", "snakemake_rules/annotation/varcaller_filter_tumor_normal.rule", + "snakemake_rules/annotation/vcf2cytosure_convert.rule", ], }, "single_wgs": { @@ -87,7 +91,6 @@ ], "annotate": [ 
"snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule", - "snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_only.rule", ], }, "paired_wgs": { @@ -105,38 +108,37 @@ ], "annotate": [ "snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule", - "snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_normal.rule", ], }, } DELIVERY_RULES = [ - "fastp", "multiqc", - "vep_somatic", - "vep_germline", - "tmb_calculation", - "bcftools_filter_TNscope_umi_tumor_only", - "bcftools_filter_TNscope_umi_tumor_normal", - "bcftools_filter_vardict_tumor_only", - "bcftools_filter_vardict_tumor_normal", - "bcftools_filter_tnscope_tumor_only", - "bcftools_filter_tnscope_tumor_normal", - "bcftools_filter_tnhaplotyper_tumor_only", - "bcftools_filter_tnhaplotyper_tumor_normal", - "bcftools_filter_manta", - "bcftools_filter_delly", - "bcftools_filter_ascat", - "bcftools_filter_cnvkit", - "bcftools_intersect_tumor_only", - "bcftools_filter_TNscope_umi_tumor_only", - "genmod_score_vardict", + "collect_custom_qc_metrics", "mergeBam_tumor", "mergeBam_normal", "mergeBam_tumor_umiconsensus", "mergeBam_normal_umiconsensus", - "cnvkit_paired", - "cnvkit_single", + "vep_germline", + "svdb_merge_tumor_only", + "svdb_merge_tumor_normal", + "sentieon_TNscope_tumor_only", + "sentieon_TNscope", + "vardict_merge", + "sentieon_tnscope_umi", + "sentieon_tnscope_umi_tn", + "ascat_tumor_normal", "ascat_tumor_normal_merge_output", + "delly_cnv_tumor_only", + "cnvkit_single", + "cnvkit_paired", + "vcf2cytosure_convert", + "bcftools_filter_svdb", + "bcftools_intersect_tumor_only", + "bcftools_filter_tnscope_tumor_normal", + "bcftools_filter_vardict_tumor_only", + "bcftools_filter_vardict_tumor_normal", + "bcftools_filter_TNscope_umi_tumor_only", + "bcftools_filter_TNscope_umi_tumor_normal", ] diff --git a/BALSAMIC/containers/align_qc/Dockerfile b/BALSAMIC/containers/align_qc/Dockerfile index fe942c7d3..e49620ab6 100644 --- a/BALSAMIC/containers/align_qc/Dockerfile +++ b/BALSAMIC/containers/align_qc/Dockerfile @@ -1,6 +1,6 @@ -FROM continuumio/miniconda3:4.9.2-alpine +FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL base.image="continuumio/miniconda3:4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" diff --git a/BALSAMIC/containers/align_qc/align_qc.yaml b/BALSAMIC/containers/align_qc/align_qc.yaml index 4e0e7444a..2fbde5eb4 100644 --- a/BALSAMIC/containers/align_qc/align_qc.yaml +++ b/BALSAMIC/containers/align_qc/align_qc.yaml @@ -3,14 +3,11 @@ channels: dependencies: - bioconda::bedtools=2.30.0 - - bioconda::bwa=0.7.15 + - bioconda::bwa=0.7.17 - bioconda::fastqc=0.11.9 - - bioconda::samtools=1.12 + - bioconda::samtools=1.15.1 - bioconda::tabix=0.2.6 - - bioconda::picard=2.25.0 - - bioconda::multiqc=1.11 - - bioconda::fastp=0.20.1 - - conda-forge::csvkit=1.0.4 - - conda-forge::libiconv - - conda-forge::fontconfig - - conda-forge::r-base=4.1.1 + - bioconda::picard=2.27.1 + - bioconda::multiqc=1.12 + - bioconda::fastp=0.23.2 + - conda-forge::r-base=4.1.3 diff --git a/BALSAMIC/containers/annotate/Dockerfile b/BALSAMIC/containers/annotate/Dockerfile index f30c8c301..8c25633ab 100644 --- a/BALSAMIC/containers/annotate/Dockerfile +++ b/BALSAMIC/containers/annotate/Dockerfile @@ -1,6 +1,6 @@ -FROM continuumio/miniconda3:4.9.2-alpine +FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" 
+LABEL base.image="continuumio/miniconda3:4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" diff --git a/BALSAMIC/containers/annotate/annotate.yaml b/BALSAMIC/containers/annotate/annotate.yaml index 8b30f319e..40fe8cb5c 100644 --- a/BALSAMIC/containers/annotate/annotate.yaml +++ b/BALSAMIC/containers/annotate/annotate.yaml @@ -1,13 +1,11 @@ channels: - - anaconda - defaults - - conda-forge dependencies: - anaconda::python=3.7 - - bioconda::ensembl-vep=100.2 - - bioconda::bcftools=1.10 - conda-forge::libopenblas=0.3.20 + - bioconda::ensembl-vep=104.3 + - bioconda::bcftools=1.10 - bioconda::vcfanno=0.3.3 - anaconda::gxx_linux-64=7.3.0 - anaconda::pip=20.2.4 diff --git a/BALSAMIC/containers/coverage_qc/Dockerfile b/BALSAMIC/containers/coverage_qc/Dockerfile index fe942c7d3..e49620ab6 100644 --- a/BALSAMIC/containers/coverage_qc/Dockerfile +++ b/BALSAMIC/containers/coverage_qc/Dockerfile @@ -1,6 +1,6 @@ -FROM continuumio/miniconda3:4.9.2-alpine +FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL base.image="continuumio/miniconda3:4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" diff --git a/BALSAMIC/containers/coverage_qc/coverage_qc.yaml b/BALSAMIC/containers/coverage_qc/coverage_qc.yaml index e796ce73a..70540ce90 100644 --- a/BALSAMIC/containers/coverage_qc/coverage_qc.yaml +++ b/BALSAMIC/containers/coverage_qc/coverage_qc.yaml @@ -3,5 +3,5 @@ channels: - conda-forge dependencies: - - bioconda::sambamba=0.6.6 - - bioconda::mosdepth=0.2.9 + - bioconda::sambamba=0.8.2 + - bioconda::mosdepth=0.3.3 diff --git a/BALSAMIC/containers/delly/Dockerfile b/BALSAMIC/containers/delly/Dockerfile index 89fe792ec..5d17634fa 100644 --- a/BALSAMIC/containers/delly/Dockerfile +++ b/BALSAMIC/containers/delly/Dockerfile @@ -28,11 +28,13 @@ RUN apt-get update && apt-get install -y \ # set environment ENV BOOST_ROOT /usr ENV PATH="/opt/delly/bin:${PATH}" +ENV OMP_NUM_THREADS 2 # install delly RUN cd /opt \ && git clone --recursive https://github.com/dellytools/delly.git \ && cd /opt/delly/ \ - && git checkout v0.8.7 \ - && make STATIC=1 all \ - && make install + && git checkout v0.9.1 \ + && make STATIC=1 PARALLEL=1 all \ + && make install \ + diff --git a/BALSAMIC/containers/delly/delly.yaml b/BALSAMIC/containers/delly/delly.yaml index 5ef59fcbe..1689329d4 100644 --- a/BALSAMIC/containers/delly/delly.yaml +++ b/BALSAMIC/containers/delly/delly.yaml @@ -1 +1 @@ -- delly=0.8.7 +- delly=0.9.1 diff --git a/BALSAMIC/containers/varcall_cnvkit/Dockerfile b/BALSAMIC/containers/varcall_cnvkit/Dockerfile index f30c8c301..8c25633ab 100644 --- a/BALSAMIC/containers/varcall_cnvkit/Dockerfile +++ b/BALSAMIC/containers/varcall_cnvkit/Dockerfile @@ -1,6 +1,6 @@ -FROM continuumio/miniconda3:4.9.2-alpine +FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL base.image="continuumio/miniconda3:4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" diff --git a/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.sh b/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.sh index cfd487f09..1a649ce4b 100644 --- 
a/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.sh +++ b/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.sh @@ -1,2 +1,2 @@ conda env update -n base --file ${1}.yaml --prune -pip install --no-cache-dir cnvkit==0.9.4 biopython==1.76 +pip install --no-cache-dir cnvkit==0.9.9 biopython==1.79 diff --git a/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.yaml b/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.yaml index 45b5882f1..465b8b479 100644 --- a/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.yaml +++ b/BALSAMIC/containers/varcall_cnvkit/varcall_cnvkit.yaml @@ -12,6 +12,6 @@ dependencies: - bioconda::bioconductor-genomicranges=1.46.0 - bioconda::bioconductor-dnacopy=1.68.0 - bioconda::bioconductor-variantannotation=1.40.0 - - bioconda::bioconductor-purecn=2.0.1 - - bioconda::bcftools>=1.13 - - bioconda::tabix>=0.2.6 + - bioconda::bioconductor-purecn=2.0.2 + - bioconda::bcftools=1.13 + - bioconda::tabix=0.2.6 diff --git a/BALSAMIC/containers/varcall_py27/Dockerfile b/BALSAMIC/containers/varcall_py27/Dockerfile index 856d2eef3..367c8c646 100644 --- a/BALSAMIC/containers/varcall_py27/Dockerfile +++ b/BALSAMIC/containers/varcall_py27/Dockerfile @@ -1,6 +1,6 @@ FROM continuumio/miniconda:4.7.12 -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL base.image="continuumio/miniconda:4.7.12" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" diff --git a/BALSAMIC/containers/varcall_py36/Dockerfile b/BALSAMIC/containers/varcall_py36/Dockerfile index fe942c7d3..e49620ab6 100644 --- a/BALSAMIC/containers/varcall_py36/Dockerfile +++ b/BALSAMIC/containers/varcall_py36/Dockerfile @@ -1,6 +1,6 @@ -FROM continuumio/miniconda3:4.9.2-alpine +FROM continuumio/miniconda3:4.10.3-alpine -LABEL base_image="continuumio/miniconda3:4.9.2-alpine" +LABEL base.image="continuumio/miniconda3:4.10.3-alpine" LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" LABEL about.documentation="https://balsamic.readthedocs.io/" LABEL about.license="MIT License (MIT)" diff --git a/BALSAMIC/containers/varcall_py36/varcall_py36.yaml b/BALSAMIC/containers/varcall_py36/varcall_py36.yaml index 55528ce44..0505a41e2 100644 --- a/BALSAMIC/containers/varcall_py36/varcall_py36.yaml +++ b/BALSAMIC/containers/varcall_py36/varcall_py36.yaml @@ -2,12 +2,13 @@ channels: - defaults dependencies: - - anaconda::python=3.6 + - anaconda::python=3.8 - bioconda::bcftools=1.11 - bioconda::tabix=0.2.6 - bioconda::samtools=1.11 - bioconda::gatk=3.8 - - bioconda::vardict=2019.06.04=pl526_0 - - bioconda::vardict-java=1.7 + - bioconda::vardict=2019.06.04 + - bioconda::vardict-java=1.8.3 + - bioconda::svdb=2.6.0 - conda-forge::libiconv - - conda-forge::r-base=3.6.3 + - conda-forge::r-base=4.1.1 diff --git a/BALSAMIC/containers/vcf2cytosure/Dockerfile b/BALSAMIC/containers/vcf2cytosure/Dockerfile new file mode 100644 index 000000000..0c5c7370d --- /dev/null +++ b/BALSAMIC/containers/vcf2cytosure/Dockerfile @@ -0,0 +1,20 @@ +FROM continuumio/miniconda3:4.10.3-alpine + +LABEL base.image="continuumio/miniconda3:4.10.3-alpine" +LABEL about.home="https://github.com/Clinical-Genomics/BALSAMIC" +LABEL about.documentation="https://balsamic.readthedocs.io/" +LABEL about.license="MIT License (MIT)" +LABEL about.maintainer="Ashwini Jeggari ashwini dot jeggari at scilifelab dot se" +LABEL about.description="Bioinformatic analysis pipeline for somatic mutations in cancer" + +ARG CONTAINER_NAME +ENV 
PATH="/opt/${CONTAINER_NAME}/bin:${PATH}" +ENV PYTHONPATH="/opt/${CONTAINER_NAME}" + + +RUN apk add --no-cache bash gcc git python3 + +RUN cd /opt \ + && git clone https://github.com/NBISweden/vcf2cytosure.git \ + && cd /opt/${CONTAINER_NAME}/ \ + && pip install --no-cache-dir . diff --git a/BALSAMIC/containers/vcf2cytosure/__init__.py b/BALSAMIC/containers/vcf2cytosure/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/BALSAMIC/containers/vcf2cytosure/vcf2cytosure.yaml b/BALSAMIC/containers/vcf2cytosure/vcf2cytosure.yaml new file mode 100644 index 000000000..892ae60cb --- /dev/null +++ b/BALSAMIC/containers/vcf2cytosure/vcf2cytosure.yaml @@ -0,0 +1 @@ +- vcf2cytosure=0.7.1 diff --git a/BALSAMIC/snakemake_rules/annotation/rankscore.rule b/BALSAMIC/snakemake_rules/annotation/rankscore.rule index 0d23dad80..1982b0f1c 100644 --- a/BALSAMIC/snakemake_rules/annotation/rankscore.rule +++ b/BALSAMIC/snakemake_rules/annotation/rankscore.rule @@ -19,11 +19,11 @@ rule genmod_score_vardict: threads: get_threads(cluster_config, 'genmod_score_vardict') message: - ("Score annotated vardict variants using genmod" - "and compress vcf using bcftools on {params.case_name}") + ("Scoring annotated vardict variants using genmod for {params.case_name}") shell: """ genmod score -r -c {input.rankscore} {input.vcf} | \ + bcftools view -o {output.vcf_pass} -O z; tabix -p vcf -f {output.vcf_pass}; diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule index 9a29e5869..a6068ea4e 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule @@ -9,7 +9,8 @@ rule bcftools_filter_vardict_tumor_normal: vcf = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.pass.vcf.gz", + vcf_pass_vardict = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.pass.stats" benchmark: Path(benchmark_dir, 'bcftools_filter_vardict_tumor_normal_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -27,7 +28,7 @@ rule bcftools_filter_vardict_tumor_normal: threads: get_threads(cluster_config, 'bcftools_filter_vardict_tumor_normal') message: - "Filtering vardict tumor-normal annotated variants using bcftools on {params.case_name}" + "Filtering vardict tumor-normal annotated variants using bcftools for {params.case_name}" shell: """ bcftools view {input.vcf} \ @@ -42,9 +43,11 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_vardict} -O z {output.vcf_filtered}; -tabix -p vcf -f {output.vcf_pass}; +tabix -p vcf -f {output.vcf_pass_vardict}; + +bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts}; """ @@ -53,19 +56,18 @@ rule bcftools_filter_tnhaplotyper_tumor_normal: vcf = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", + vcf_pass_tnhaplotyper = vep_dir + 
"{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", benchmark: Path(benchmark_dir, 'bcftools_filter_tnhaplotyper_tumor_normal' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: pop_freq = [COMMON_FILTERS.pop_freq.tag_value, COMMON_FILTERS.pop_freq.filter_name], - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_normal') message: - "Filtering tnhaplotyper tumor-normal annotated variants using bcftools on {params.case_name}" + "Filtering tnhaplotyper tumor-normal annotated variants using bcftools for {params.case_name}" shell: """ bcftools view {input.vcf} \ @@ -74,9 +76,10 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_tnhaplotyper} -O z {output.vcf_filtered}; + +tabix -p vcf -f {output.vcf_pass_tnhaplotyper}; -tabix -p vcf -f {output.vcf_pass}; """ @@ -85,19 +88,20 @@ rule bcftools_filter_TNscope_umi_tumor_normal: vcf = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.pass.vcf.gz", + vcf_pass_TNscope_umi = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.pass.stats" benchmark: Path(benchmark_dir, 'bcftools_filter_TNscope_umi_tumor_normal' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - pop_freq = [COMMON_FILTERS.pop_freq.tag_value, COMMON_FILTERS.pop_freq.filter_name], + pop_freq = [SENTIEON_CALLER.pop_freq_umi.tag_value, SENTIEON_CALLER.pop_freq_umi.filter_name], housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_TNscope_umi_tumor_normal') message: - "Filtering TNscope_umi tumor-normal annotated variants using bcftools on {params.case_name}" + "Filtering TNscope_umi tumor-normal annotated variants using bcftools for {params.case_name}" shell: """ bcftools view {input.vcf} \ @@ -106,7 +110,9 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_TNscope_umi} -O z {output.vcf_filtered}; + +tabix -p vcf -f {output.vcf_pass_TNscope_umi}; -tabix -p vcf -f {output.vcf_pass}; +bcftools +counts {output.vcf_pass_TNscope_umi} > {output.bcftools_counts}; """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule index 986c68d46..90e338f76 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule @@ -9,7 +9,8 @@ rule bcftools_filter_vardict_tumor_only: vcf = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.vcf.gz", - vcf_pass = vep_dir + 
"{var_type}.somatic.{case_name}.vardict.all.filtered.pass.vcf.gz", + vcf_pass_vardict = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "{var_type}.somatic.{case_name}.vardict.all.filtered.pass.stats" benchmark: Path(benchmark_dir, 'bcftools_filter_vardict_tumor_only_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -40,9 +41,11 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_vardict} -O z {output.vcf_filtered}; -tabix -p vcf -f {output.vcf_pass}; +tabix -p vcf -f {output.vcf_pass_vardict}; + +bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts}; """ @@ -51,19 +54,18 @@ rule bcftools_filter_tnhaplotyper_tumor_only: vcf = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", + vcf_pass_tnhaplotyper = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", benchmark: Path(benchmark_dir, 'bcftools_filter_tnhaplotyper_tumor_only' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: pop_freq = [COMMON_FILTERS.pop_freq.tag_value, COMMON_FILTERS.pop_freq.filter_name], - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_only') message: - "Filtering tnhaplotyper tumor-only annotated variants using bcftools on {params.case_name}" + "Filtering tnhaplotyper tumor-only annotated variants using bcftools for {params.case_name}" shell: """ bcftools view {input.vcf} \ @@ -72,9 +74,10 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_tnhaplotyper} -O z {output.vcf_filtered}; + +tabix -p vcf -f {output.vcf_pass_tnhaplotyper}; -tabix -p vcf -f {output.vcf_pass}; """ @@ -83,19 +86,20 @@ rule bcftools_filter_TNscope_umi_tumor_only: vcf = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.pass.vcf.gz", + vcf_pass_TNscope_umi = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "{var_type}.somatic.{case_name}.TNscope_umi.all.filtered.pass.stats" benchmark: Path(benchmark_dir, 'bcftools_filter_TNscope_umi_tumor_only' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - pop_freq = [COMMON_FILTERS.pop_freq.tag_value, COMMON_FILTERS.pop_freq.filter_name], + pop_freq = [SENTIEON_CALLER.pop_freq_umi.tag_value, SENTIEON_CALLER.pop_freq_umi.filter_name], housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_TNscope_umi_tumor_only') message: - "Filtering TNscope_umi tumor-only annotated variants using bcftools on {params.case_name}" + "Filtering 
TNscope_umi tumor-only annotated variants using bcftools for {params.case_name}" shell: """ bcftools view {input.vcf} \ @@ -104,7 +108,9 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_TNscope_umi} -O z {output.vcf_filtered}; + +tabix -p vcf -f {output.vcf_pass_TNscope_umi}; -tabix -p vcf -f {output.vcf_pass}; +bcftools +counts {output.vcf_pass_TNscope_umi} > {output.bcftools_counts}; """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule index cb52cf100..cb7529d83 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule @@ -1,75 +1,29 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -# NGS filters for various scenarios +# NGS filters for merged SVs and CNVs - - - -rule bcftools_filter_manta: +rule bcftools_filter_svdb: input: - vcf = vep_dir + "{var_type}.somatic.{case_name}.manta.all.vcf.gz", + vcf = vep_dir + "SV.somatic.{case_name}.svdb.all.vcf.gz", output: - vcf_sv_pass = vep_dir + "{var_type}.somatic.{case_name}.manta.all.filtered.pass.vcf.gz", + vcf_pass_svdb = vep_dir + "SV.somatic.{case_name}.svdb.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "SV.somatic.{case_name}.svdb.all.filtered.pass.stats" benchmark: - Path(benchmark_dir, 'bcftools_filter_manta_' + "{var_type}.somatic.{case_name}.tsv").as_posix() + benchmark_dir + "bcftools_filter_svdb_SV.somatic.{case_name}.svdb.vep.tsv" singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = '{case_name}', - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"} - threads: - get_threads(cluster_config, 'bcftools_filter_manta') - message: - "Filtering Manta results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; - -tabix -p vcf -f {output.vcf_sv_pass}; - """ - - -rule bcftools_filter_cnvkit: - input: - vcf = vep_dir + "{var_type}.somatic.{case_name}.cnvkit.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "{var_type}.somatic.{case_name}.cnvkit.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + 'bcftools_filter_' + "{var_type}.somatic.{case_name}.cnvkit.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, threads: - get_threads(cluster_config, 'bcftools_filter_cnvkit') + get_threads(cluster_config, "bcftools_filter_svdb") message: - "Filtering CNVkit results for PASS variants using bcftools for sample '{params.case_name}' " + "Filtering merged structural and copy number variants using bcftools for {params.case_name}" shell: """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass}; - """ +bcftools view --threads {threads} -f .,PASS -o {output.vcf_pass_svdb} -O z {input.vcf}; +tabix -p vcf -f {output.vcf_pass_svdb}; -rule bcftools_filter_delly: - input: - vcf = vep_dir + 
"{var_type}.somatic.{case_name}.delly.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "{var_type}.somatic.{case_name}.delly.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + 'bcftools_filter_' + "{var_type}.somatic.{case_name}.delly.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - threads: - get_threads(cluster_config, 'bcftools_filter_delly') - message: - "Filtering Delly results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass}; +bcftools +counts {output.vcf_pass_svdb} > {output.bcftools_counts}; """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_normal.rule deleted file mode 100644 index 5f2fbb596..000000000 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_normal.rule +++ /dev/null @@ -1,77 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 -# NGS filters for various scenarios - - - - -rule bcftools_filter_manta: - input: - vcf = vep_dir + "SV.somatic.{case_name}.manta.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "SV.somatic.{case_name}.manta.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + "bcftools_filter_manta_SV.somatic.{case_name}.manta.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, - var_caller = "manta" - threads: - get_threads(cluster_config, "bcftools_filter_manta") - message: - "Filtering Manta results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass}; - """ - - -rule bcftools_filter_delly: - input: - vcf = vep_dir + "SV.somatic.{case_name}.delly.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "SV.somatic.{case_name}.delly.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + "bcftools_filter_delly_SV.somatic.{case_name}.delly.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - var_caller = "delly" - threads: - get_threads(cluster_config, "bcftools_filter_delly") - message: - "Filtering Delly results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass}; - """ - - -rule bcftools_filter_ascat: - input: - vcf = vep_dir + "CNV.somatic.{case_name}.ascat.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "CNV.somatic.{case_name}.ascat.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + "bcftools_filter_ascat_CNV.somatic.{case_name}.ascat.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - var_caller = "ascat" - threads: - 
get_threads(cluster_config, "bcftools_filter_ascat") - message: - "Filtering Ascat results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass}; - """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_only.rule deleted file mode 100644 index 081ca681c..000000000 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_wgs_filter_tumor_only.rule +++ /dev/null @@ -1,53 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 -# NGS filters for various scenarios - - - - -rule bcftools_filter_manta: - input: - vcf = vep_dir + "SV.somatic.{case_name}.manta.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "SV.somatic.{case_name}.manta.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + "bcftools_filter_manta_SV.somatic.{case_name}.manta.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, - var_caller = "manta" - threads: - get_threads(cluster_config, "bcftools_filter_manta") - message: - "Filtering Manta results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass}; - """ - - -rule bcftools_filter_delly: - input: - vcf = vep_dir + "SV.somatic.{case_name}.delly.all.vcf.gz", - output: - vcf_sv_pass = vep_dir + "SV.somatic.{case_name}.delly.all.filtered.pass.vcf.gz", - benchmark: - benchmark_dir + "bcftools_filter_delly_SV.somatic.{case_name}.delly.vep.tsv" - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = "{case_name}", - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - var_caller = "delly" - threads: - get_threads(cluster_config, "bcftools_filter_delly") - message: - "Filtering Delly results for PASS variants using bcftools for sample '{params.case_name}' " - shell: - """ -bcftools view --threads {threads} -f .,PASS -o {output.vcf_sv_pass} -O z {input.vcf}; -tabix -p vcf -f {output.vcf_sv_pass}; - """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule index 59b9c0435..4473d2974 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule @@ -9,7 +9,8 @@ rule bcftools_filter_tnscope_tumor_normal: vcf = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.vcf.gz", output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.pass.vcf.gz", + vcf_pass_tnscope = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.pass.stats" benchmark: Path(benchmark_dir, 'bcftools_filter_tnscope_tumor_normal_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -25,7 +26,7 @@ rule bcftools_filter_tnscope_tumor_normal: threads: get_threads(cluster_config, 
'bcftools_filter_tnscope_tumor_normal') message: - "Filtering wgs tumor-normal tnscope annotated variants using bcftools on {params.case_name}" + "Filtering WGS tumor-normal tnscope annotated variants using bcftools for {params.case_name}" shell: """ bcftools view {input.vcf} \ @@ -38,9 +39,11 @@ bcftools view {input.vcf} \ tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_tnscope} -O z {output.vcf_filtered}; -tabix -p vcf -f {output.vcf_pass}; +tabix -p vcf -f {output.vcf_pass_tnscope}; + +bcftools +counts {output.vcf_pass_tnscope} > {output.bcftools_counts}; """ @@ -50,19 +53,18 @@ rule bcftools_filter_tnhaplotyper_tumor_normal: wgs_calling_file = config["reference"]["wgs_calling_interval"] output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", + vcf_pass_tnhaplotyper = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", benchmark: Path(benchmark_dir, 'bcftools_filter_tnhaplotyper_tumor_normal_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: pop_freq = [SENTIEON_CALLER.pop_freq.tag_value, SENTIEON_CALLER.pop_freq.filter_name], - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_normal') message: - "Filtering wgs tumor-normal tnhaplotyper annotated variants using bcftools on {params.case_name}" + "Filtering WGS tumor-normal tnhaplotyper annotated variants using bcftools for {params.case_name}" shell: """ grep -v '^@' {input.wgs_calling_file} > {input.wgs_calling_file}.bed @@ -73,7 +75,8 @@ bcftools view -f PASS --threads {threads} --regions-file {input.wgs_calling_file tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS --threads {threads} -O z -o {output.vcf_pass} {output.vcf_filtered} +bcftools view -f PASS --threads {threads} -O z -o {output.vcf_pass_tnhaplotyper} {output.vcf_filtered} + +tabix -p vcf -f {output.vcf_pass_tnhaplotyper} -tabix -p vcf -f {output.vcf_pass} """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule index a95af1602..79befa5e7 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule @@ -23,12 +23,11 @@ rule bcftools_filter_tnscope_tumor_only: strand_reads = [SENTIEON_CALLER.strand_reads.tag_value, SENTIEON_CALLER.strand_reads.filter_name], qss = [SENTIEON_CALLER.qss.tag_value, SENTIEON_CALLER.qss.filter_name], sor = [SENTIEON_CALLER.sor.tag_value, SENTIEON_CALLER.sor.filter_name], - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_tnscope_tumor_only') message: - "Filtering wgs tumor-only tnscope annotated variants using bcftools on {params.case_name}" + "Filtering WGS tumor-only tnscope annotated variants using bcftools for {params.case_name}" shell: """ grep -v '^@' {input.wgs_calling_file} > {input.wgs_calling_file}.bed @@ -54,7 +53,7 @@ rule bcftools_filter_tnhaplotyper_tumor_only: wgs_calling_file = 
config["reference"]["wgs_calling_interval"] output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", + vcf_pass_tnhaplotyper = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.pass.vcf.gz", benchmark: Path(benchmark_dir, 'bcftools_filter_tnhaplotyper_tumor_only_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -67,12 +66,11 @@ rule bcftools_filter_tnhaplotyper_tumor_only: pop_freq = [SENTIEON_CALLER.pop_freq.tag_value, SENTIEON_CALLER.pop_freq.filter_name], strand_reads = [SENTIEON_CALLER.strand_reads.tag_value, SENTIEON_CALLER.strand_reads.filter_name], qss = [SENTIEON_CALLER.qss.tag_value, SENTIEON_CALLER.qss.filter_name], - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_only') message: - "Filtering wgs tumor-only tnhaplotyper annotated variants using bcftools on {params.case_name}" + "Filtering WGS tumor-only tnhaplotyper annotated variants using bcftools for {params.case_name}" shell: """ grep -v '^@' {input.wgs_calling_file} > {input.wgs_calling_file}.bed @@ -89,9 +87,10 @@ bcftools view -f PASS --threads {threads} --regions-file {input.wgs_calling_file tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS --threads {threads} -O z -o {output.vcf_pass} {output.vcf_filtered} +bcftools view -f PASS --threads {threads} -O z -o {output.vcf_pass_tnhaplotyper} {output.vcf_filtered} + +tabix -p vcf -f {output.vcf_pass_tnhaplotyper} -tabix -p vcf -f {output.vcf_pass} """ @@ -101,7 +100,8 @@ rule bcftools_intersect_tumor_only: tnhaplotyper = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.all.filtered.vcf.gz" output: vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.vcf.gz", - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.pass.vcf.gz", + vcf_pass_tnscope = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.pass.vcf.gz", + bcftools_counts = vep_dir + "{var_type}.somatic.{case_name}.tnscope.all.filtered.pass.stats" benchmark: Path(benchmark_dir, 'bcftools_intersect_tumor_only_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: @@ -122,9 +122,11 @@ cp {params.vcf_dir}/0002.vcf.gz {output.vcf_filtered}; tabix -p vcf -f {output.vcf_filtered}; -bcftools view -f PASS -o {output.vcf_pass} -O z {output.vcf_filtered}; +bcftools view -f PASS -o {output.vcf_pass_tnscope} -O z {output.vcf_filtered}; + +tabix -p vcf -f {output.vcf_pass_tnscope}; -tabix -p vcf -f {output.vcf_pass}; +bcftools +counts {output.vcf_pass_tnscope} > {output.bcftools_counts}; rm -r {params.vcf_dir} """ diff --git a/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule b/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule new file mode 100644 index 000000000..ab6e51c5b --- /dev/null +++ b/BALSAMIC/snakemake_rules/annotation/vcf2cytosure_convert.rule @@ -0,0 +1,24 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + + +rule vcf2cytosure_convert: + input: + cnv_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf.gz", + cnv_cnr = cnv_dir + "tumor.merged" + ".cnr" + output: + cgh_file = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf2cytosure.cgh" + benchmark: + Path(benchmark_dir, 'vcf2cytosure_convert.' 
+ config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("vcf2cytosure") + ".sif").as_posix() + threads: + get_threads(cluster_config, "vcf2cytosure_convert") + params: + case_name = config["analysis"]["case_id"], + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv-somatic"}, + message: "Converting CNVs from VCF to the CGH format using vcf2cytosure for {params.case_name}" + shell: + """ +vcf2cytosure --vcf {input.cnv_vcf} --cn {input.cnv_cnr} --out {output.cgh_file} --bins 1 + """ diff --git a/BALSAMIC/snakemake_rules/annotation/vep.rule b/BALSAMIC/snakemake_rules/annotation/vep.rule index 0f6a76ad3..ad30ab891 100644 --- a/BALSAMIC/snakemake_rules/annotation/vep.rule +++ b/BALSAMIC/snakemake_rules/annotation/vep.rule @@ -3,32 +3,28 @@ # VEP annotation module. Annotate all VCFs generated through VEP - -rule vep_somatic: +rule vep_somatic_snv: input: - vcf = vcf_dir + "{var_type}.somatic.{case_name}.{var_caller}.vcf.gz", - header = vcf_dir + "{var_type}.somatic.{case_name}.{var_caller}.sample_name_map", + vcf = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.vcf.gz", + header = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.sample_name_map", cosmic = config["reference"]["cosmic"] output: - vcf_all = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.all.vcf.gz", - vcf_summary = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.all.vcf.gz_summary.html", - bcftools_stats = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.all.stats" + vcf_all = temp(vep_dir + "SNV.somatic.{case_name}.{var_caller}.all.vcf.gz"), benchmark: - Path(benchmark_dir, "vep_somatic_{var_type}.somatic.{case_name}.{var_caller}.tsv").as_posix() + Path(benchmark_dir, "vep_somatic_SNV.somatic.{case_name}.{var_caller}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() params: - housekeeper_id = {"id": "{case_name}", "tags": "annotated-somatic"}, ref_path = Path(config["reference"]["gnomad_variant"]).parent.as_posix(), - message_text = "{var_type}.somatic.{case_name}.{var_caller}.vcf.gz", - tmpvcf = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.tmp.vcf.gz", + message_text = "SNV.somatic.{case_name}.{var_caller}.vcf.gz", + tmpvcf = vep_dir + "SNV.somatic.{case_name}.{var_caller}.tmp.vcf.gz", vcfanno_toml = VCFANNO_TOML, vep_cache = config["reference"]["vep"], vep_defaults = params.vep.vep_filters threads: - get_threads(cluster_config, "vep_somatic") + get_threads(cluster_config, "vep_somatic_snv") message: - "Running vep annotation on {params.message_text}" + "Running vep annotation for single nuceotide variants on {params.message_text}" shell: """ vep_path=$(dirname $(readlink -f $(which vep))); @@ -51,11 +47,44 @@ vep \ tabix -p vcf -f {output.vcf_all}; -bcftools stats {output.vcf_all} > {output.bcftools_stats}; - rm $tmpvcf; """ +rule vep_somatic_sv: + input: + vcf = vcf_dir + "SV.somatic.{case_name}.svdb.vcf.gz", + header = vcf_dir + "SV.somatic.{case_name}.svdb.sample_name_map", + output: + vcf_all = temp(vep_dir + "SV.somatic.{case_name}.svdb.all.vcf.gz"), + benchmark: + Path(benchmark_dir, "vep_somatic_SV.somatic.{case_name}.svdb.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() + params: + message_text = "SV.somatic.{case_name}.svdb.vcf.gz", + vep_cache = config["reference"]["vep"], + vep_defaults = params.vep.vep_filters + threads: + get_threads(cluster_config, 
"vep_somatic_sv") + message: + "Running vep annotation for structural and copy number variants on {params.message_text}" + shell: + """ +vep_path=$(dirname $(readlink -f $(which vep))); +export PERL5LIB=; + +bcftools reheader --threads {threads} -s {input.header} {input.vcf} | \ +bcftools view --threads {threads} -O v | \ +vep \ +--dir $vep_path \ +--dir_cache {params.vep_cache} \ +--dir_plugins $vep_path \ +--output_file {output.vcf_all} \ +--fork {threads} \ +{params.vep_defaults} \ + +tabix -p vcf -f {output.vcf_all}; + """ rule tmb_calculation: input: @@ -69,13 +98,12 @@ rule tmb_calculation: params: af_cutoff = "0.05", bed = config["panel"]["capture_kit"] if "panel" in config else "", - housekeeper_id = {"id": "{case_name}", "tags": "stat-somatic"}, message_text = "{var_type}.somatic.{case_name}.{var_caller}.all", tmpdir = tempfile.mkdtemp(prefix=tmp_dir), threads: get_threads(cluster_config, "vep") message: - "Calculating TMB for {params.message_text}" + "Calculating TMB score for {params.message_text}" shell: """ mkdir -p {params.tmpdir}; @@ -120,21 +148,19 @@ rule vep_germline: cosmic = config["reference"]["cosmic"] output: vcf_all = vep_dir + "{var_type}.germline.{sample}.{var_caller}.vcf.gz", - vcf_summary = vep_dir + "{var_type}.germline.{sample}.{var_caller}.vcf.gz_summary.html", - bcftools_stats = vep_dir + "{var_type}.germline.{sample}.{var_caller}.all.stats" benchmark: Path(benchmark_dir, "vep_germline_{var_type}.germline.{sample}.{var_caller}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() params: - housekeeper_id = {"id": "{sample}", "tags": "annotated-germline"}, + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "annotated-germline"}, sample = '{sample}', vep_cache = config["reference"]["vep"], vep_defaults = params.vep.vep_filters threads: get_threads(cluster_config, 'vep_germline') message: - "Running vep annotation on {params.sample}" + "Running vep annotation on germline variants for {params.sample}" shell: """ vep_path=$(dirname $(readlink -f $(which vep))); @@ -152,5 +178,4 @@ vep \ tabix -p vcf -f {output.vcf_all}; -bcftools stats {output.vcf_all} > {output.bcftools_stats}; """ diff --git a/BALSAMIC/snakemake_rules/quality_control/fastp.rule b/BALSAMIC/snakemake_rules/quality_control/fastp.rule index 0c1bd57fe..06296f4fc 100644 --- a/BALSAMIC/snakemake_rules/quality_control/fastp.rule +++ b/BALSAMIC/snakemake_rules/quality_control/fastp.rule @@ -32,8 +32,8 @@ rule fastp_umi: read1=config["analysis"]["fastq_path"] + "{sample}" + "_1.fastq.gz", read2=config["analysis"]["fastq_path"] + "{sample}" + "_2.fastq.gz", output: - read1 = fastq_dir + "{sample}_1.umi_optimized.fastq.gz", - read2 = fastq_dir + "{sample}_2.umi_optimized.fastq.gz", + read1 = temp(fastq_dir + "{sample}_1.umi_optimized.fastq.gz"), + read2 = temp(fastq_dir + "{sample}_2.umi_optimized.fastq.gz"), json = qc_dir + "fastp/{sample}_fastp_umi.json", html = qc_dir + "fastp/{sample}_fastp_umi.html", benchmark: @@ -73,8 +73,8 @@ rule fastp: read1 = fastq_dir + "{sample}_1.umi_optimized.fastq.gz", read2 = fastq_dir + "{sample}_2.umi_optimized.fastq.gz" output: - read1 = fastq_dir + "{sample}_1.fp.fastq.gz", - read2 = fastq_dir + "{sample}_2.fp.fastq.gz", + read1 = temp(fastq_dir + "{sample}_1.fp.fastq.gz"), + read2 = temp(fastq_dir + "{sample}_2.fp.fastq.gz"), json = qc_dir + "fastp/{sample}_fastp.json", html = qc_dir + "fastp/{sample}_fastp.html" benchmark: @@ -82,7 +82,6 @@ rule fastp: singularity: Path(singularity_image, 
config["bioinfo_tools"].get("fastp") + ".sif").as_posix() params: - housekeeper_id = {"id": "{sample}", "tags": "quality-trimmed-fastq"}, tmpdir = tmp_dir, umi = " ".join(fastp_param_umi), minimum_length = config["QC"]["min_seq_length"], diff --git a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule index 4922e2ee2..aa1605095 100644 --- a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule +++ b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule @@ -30,7 +30,6 @@ if config["analysis"]["sequencing_type"] == 'wgs': if config['analysis']['analysis_type'] == "paired": multiqc_input.append(bam_dir+"normal.merged.recal_data.table") - else: # fastqc metrics multiqc_input.extend(expand(fastqc_dir + "{sample}_{read_num}_fastqc.zip", sample=config["samples"], read_num=[1, 2])) @@ -57,10 +56,9 @@ else: multiqc_input.extend(expand(bam_dir + "{sample}.samtools.{stats}.txt", sample=config["samples"], stats=['flagstats', 'idxstats', 'stats'])) if config["umiworkflow"]: + # UMI picard metrics multiqc_input.extend(expand(umi_qc_dir + "{sample}.umi.collect_hsmetric", sample=config["samples"])) - - - + multiqc_input.extend(expand(umi_qc_dir + "{sample}.umi.metrics", sample=config["samples"])) rule multiqc: input: @@ -83,6 +81,11 @@ rule multiqc: shell: """ echo -e \"{params.dir_list}\" > {params.qc_dir}/dir_list; -multiqc --force --outdir {params.qc_dir} --exclude {params.exclude_module} --data-format json -l {params.qc_dir}/dir_list; + +multiqc --force --outdir {params.qc_dir} \ +--exclude {params.exclude_module} \ +--data-format json \ +-l {params.qc_dir}/dir_list; + chmod -R 777 {params.qc_dir}; """ diff --git a/BALSAMIC/snakemake_rules/quality_control/picard.rule b/BALSAMIC/snakemake_rules/quality_control/picard.rule index 86f660dd5..6a159629b 100644 --- a/BALSAMIC/snakemake_rules/quality_control/picard.rule +++ b/BALSAMIC/snakemake_rules/quality_control/picard.rule @@ -1,7 +1,10 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 - +if "canfam3" in config['reference']['reference_genome']: + memory = "20g" +else: + memory = "16g" rule picard_CollectHsMetrics: input: @@ -16,7 +19,7 @@ rule picard_CollectHsMetrics: singularity: Path(singularity_image, config["bioinfo_tools"].get("picard") + ".sif").as_posix() params: - mem = "16g", + mem = memory, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), baitsetname = os.path.basename(config["panel"]["capture_kit"]), sample = '{sample}' @@ -103,7 +106,7 @@ rule picard_CollectInsertSizeMetrics: "Calculating picard InsertSize metrics for sample '{params.sample}'" shell: """ -mkdir -p {params.tmpdir}; +mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ diff --git a/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule b/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule new file mode 100644 index 000000000..b5ac0851d --- /dev/null +++ b/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule @@ -0,0 +1,36 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +bcftools_counts_input = [vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.all.filtered.pass.stats"] + +if config["analysis"]["sequencing_type"] == 'wgs': + bcftools_counts_input.append(vep_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.all.filtered.pass.stats") + +else: + bcftools_counts_input.append(vep_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".vardict.all.filtered.pass.stats") + + if config["umiworkflow"]: + # bcftools counts + bcftools_counts_input.append(vep_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".TNscope_umi.all.filtered.pass.stats") + +rule collect_custom_qc_metrics: + input: + bcftools_counts = bcftools_counts_input, + json = qc_dir + "multiqc_data/multiqc_data.json" + output: + yaml = qc_dir + config["analysis"]["case_id"] + "_metrics_deliverables.yaml" + params: + collect_qc_metrics_script = get_script_path("collect_qc_metrics.py"), + sequencing_type = get_sequencing_type(config), + capture_kit = get_capture_kit(config), + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "qc-metrics"} + singularity: + Path(singularity_image, "balsamic.sif").as_posix() + threads: + get_threads(cluster_config, "collect_custom_qc_metrics") + message: + "Extract the manually specified QC metric for validation and delivery" + shell: + """ +python {params.collect_qc_metrics_script} {output.yaml} {input.json} {input.bcftools_counts} {params.sequencing_type} {params.capture_kit} + """ diff --git a/BALSAMIC/snakemake_rules/umi/generate_AF_tables.rule b/BALSAMIC/snakemake_rules/umi/generate_AF_tables.rule index 56a0f62c9..4faf00b11 100644 --- a/BALSAMIC/snakemake_rules/umi/generate_AF_tables.rule +++ b/BALSAMIC/snakemake_rules/umi/generate_AF_tables.rule @@ -19,7 +19,7 @@ rule bcftools_query_generatebackgroundaf_umitable: threads: get_threads(cluster_config, "bcftools_query_generatebackgroundaf_umitable") message: - "Creating Allelic frequency table from VCF file for sample {params.case_name}" + "Creating Allelic frequency table from VCF file for {params.case_name}" shell: """ bcftools query \ diff --git a/BALSAMIC/snakemake_rules/umi/mergetype_normal_umi.rule b/BALSAMIC/snakemake_rules/umi/mergetype_normal_umi.rule index 55f07a4c7..ab2f5b401 100644 --- a/BALSAMIC/snakemake_rules/umi/mergetype_normal_umi.rule +++ b/BALSAMIC/snakemake_rules/umi/mergetype_normal_umi.rule @@ -21,12 +21,14 @@ rule mergeBam_normal_umiconsensus: threads: get_threads(cluster_config, "mergeBam_normal_umiconsensus") message: - ("Replace ReadGroups using picard for normal sample {params.sample} " - "and convert bam to cram format") + ("Replacing ReadGroups using picard and converting from bam to cram format for {params.sample}") shell: """ picard AddOrReplaceReadGroups {params.picard} INPUT={input.bam} OUTPUT={output.bam}; + samtools index {output.bam}; + samtools view -h -T {input.fasta} --threads {threads} -C -o {output.cram} {output.bam}; + samtools index {output.cram}; """ diff --git a/BALSAMIC/snakemake_rules/umi/mergetype_tumor_umi.rule b/BALSAMIC/snakemake_rules/umi/mergetype_tumor_umi.rule index 7915d6b00..f294f2e82 100644 --- a/BALSAMIC/snakemake_rules/umi/mergetype_tumor_umi.rule +++ b/BALSAMIC/snakemake_rules/umi/mergetype_tumor_umi.rule @@ -21,8 +21,7 @@ rule mergeBam_tumor_umiconsensus: threads: get_threads(cluster_config, "mergeBam_tumor_umiconsensus") message: - ("Replace ReadGroups using picard for tumor sample {params.sample} " - "and convert bam to cram") + ("Replacing ReadGroups using picard and converting from bam to cram for {params.sample}") shell: """ picard AddOrReplaceReadGroups {params.picard} INPUT={input.bam} OUTPUT={output.bam}; diff --git a/BALSAMIC/snakemake_rules/umi/qc_umi.rule b/BALSAMIC/snakemake_rules/umi/qc_umi.rule index 2eb891e9f..a4b09f124 100644 --- a/BALSAMIC/snakemake_rules/umi/qc_umi.rule +++ b/BALSAMIC/snakemake_rules/umi/qc_umi.rule @@ -2,8 +2,6 @@ # coding: utf-8 
## UmiAwareMarkDuplicatesWithMateCigar - umimetrics - - rule picard_umiaware: input: bam = umi_dir + "{sample}_consensusfiltered_umi.bam" @@ -20,7 +18,7 @@ threads: get_threads(cluster_config, "picard_umiaware") message: - "Picard Umiaware mark dups for sample {params.sample_id}" + "Marking duplicates using Picardtools with UmiAware for {params.sample_id}" shell: """ picard UmiAwareMarkDuplicatesWithMateCigar \ @@ -29,9 +27,8 @@ O={output.bam} \ M={output.duplicates} \ UMI_METRICS={output.umimetrics}; """ -## CollectHSmetrics - median target coverage-required - +## CollectHSmetrics - median target coverage-required rule picard_collecthsmetrics_umi: input: bam = umi_dir + "{sample}_consensusfiltered_umi.bam", @@ -50,7 +47,7 @@ threads: get_threads(cluster_config, "CollectHsMetrics") message: - "Collect HSmetrics using Picardtools for {params.sample_id}" + "Collecting HSmetrics using Picardtools for {params.sample_id}" shell: """ picard BedToIntervalList \ @@ -68,9 +65,8 @@ COVERAGE_CAP=50000 \ BAIT_SET_NAME={params.baitsetname} \ METRIC_ACCUMULATION_LEVEL=ALL_READS; """ -## SUM(Reads in each family)/ the number of families after correction, collapsing on supporting reads. - +## SUM(Reads in each family)/ the number of families after correction, collapsing on supporting reads. rule samtools_view_calculatemeanfamilydepth_umi: input: bam = umi_dir + "{sample}_consensusfiltered_umi.bam" diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule b/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule index e6f61bdc6..460833320 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule @@ -22,7 +22,7 @@ rule sentieon_consensuscall_umi: threads: get_threads(cluster_config, "sentieon_consensuscall_umi") message: - "Consensus molecule creation using sentieon for sample {params.sample_id}" + "Calling consensus molecules using sentieon for {params.sample_id}" shell: """ export LD_PRELOAD={params.sentieon_install_dir}/lib/libjemalloc.so.1 @@ -63,7 +63,7 @@ rule sentieon_bwa_umiconsensus: threads: get_threads(cluster_config, "sentieon_bwa_umiconsensus") message: - "Mapping of consensus reads with the sentieon bwa mem, sorting for sample {params.sample_id}" + "Mapping consensus reads and sorting using sentieon bwa-mem for {params.sample_id}" shell: """ export LD_PRELOAD={params.sentieon_install_dir}/lib/libjemalloc.so.1 @@ -106,7 +106,7 @@ rule sentieon_consensusfilter_umi: threads: get_threads(cluster_config, "sentieon_consensusfilter_umi") message: - "Filtering consensus reads based on XZ tag for sample {params.sample_id}" + "Filtering consensus reads based on XZ tag for {params.sample_id}" shell: """ samtools view -h {input} | \ diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_umiextract.rule b/BALSAMIC/snakemake_rules/umi/sentieon_umiextract.rule index 747dff6ea..71e5e2f73 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_umiextract.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_umiextract.rule @@ -22,7 +22,7 @@ rule sentieon_umiextract: threads: get_threads(cluster_config, "sentieon_umiextract") message: - "UMI tag extraction using sentieon for sample {params.sample_id}" + "Extracting UMI tags using sentieon for {params.sample_id}" shell: """ export LD_PRELOAD={params.sentieon_install_dir}/lib/libjemalloc.so.1 @@ -59,7 +59,7 @@ rule sentieon_bwa_umiextract: threads: get_threads(cluster_config, "sentieon_bwa_umiextract") message: - "Aligning of UMI
extracted reads with sentieon bwa mem, sorting for sample {params.sample_id}" + "Aligning UMI extracted reads and sorting using sentieon bwa-mem for {params.sample_id}" shell: """ export LD_PRELOAD={params.sentieon_install_dir}/lib/libjemalloc.so.1 diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule index 399e9be17..6dac5db99 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule @@ -12,11 +12,12 @@ rule sentieon_tnscope_umi: bed = config["panel"]["capture_kit"], dbsnp = config["reference"]["dbsnp"] output: - vcf = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".TNscope_umi.vcf.gz", + vcf_tnscope_umi = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".TNscope_umi.vcf.gz", namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".TNscope_umi.sample_name_map" benchmark: Path(benchmark_dir, "sentieon_tnscope_umi_" + config["analysis"]["case_id"] + ".tsv").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), sentieon_exec = config["SENTIEON_EXEC"], sentieon_lic = config["SENTIEON_LICENSE"], @@ -27,12 +28,13 @@ rule sentieon_tnscope_umi: init_tumor_lod = params.tnscope_umi.init_tumorLOD, error_rate = params.tnscope_umi.error_rate, prune_factor = params.tnscope_umi.prunefactor, + padding = params.tnscope_umi.padding, tumor = "TUMOR", pcr_model = params.common.pcr_model threads: get_threads(cluster_config, "sentieon_tnscope_umi") message: - "Calling SNVs using TNscope for sample {params.tumor}" + "Calling single nucleotide variants using TNscope for {params.tumor}" shell: """ mkdir -p {params.tmpdir}; @@ -44,6 +46,8 @@ export SENTIEON_LICENSE={params.sentieon_lic}; -t {threads} \ -r {input.ref_fa} \ -i {input.bam} \ +--interval {input.bed} \ +--interval_padding {params.padding} \ --algo {params.algo} \ --tumor_sample {params.tumor} \ --dbsnp {input.dbsnp} \ @@ -55,7 +59,7 @@ export SENTIEON_LICENSE={params.sentieon_lic}; --max_error_per_read {params.error_rate} \ --pcr_indel_model {params.pcr_model} \ --prune_factor {params.prune_factor} \ -{output.vcf}; +{output.vcf_tnscope_umi}; echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; """ diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule index 0aa0b95fe..7205afc2c 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule @@ -12,11 +12,12 @@ rule sentieon_tnscope_umi_tn: bed = config["panel"]["capture_kit"], dbsnp = config["reference"]["dbsnp"] output: - vcf = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".TNscope_umi.vcf.gz", + vcf_tnscope_umi = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".TNscope_umi.vcf.gz", namemap = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".TNscope_umi.sample_name_map" benchmark: Path(benchmark_dir, "sentieon_tnscope_umi_" + config["analysis"]["case_id"] + ".tsv").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), sentieon_exec = config["SENTIEON_EXEC"], sentieon_lic = config["SENTIEON_LICENSE"], @@ -28,13 +29,14 @@ rule sentieon_tnscope_umi_tn: error_rate = params.tnscope_umi.error_rate, prune_factor = params.tnscope_umi.prunefactor, pcr_model = params.common.pcr_model, + padding = params.tnscope_umi.padding, tumor = "TUMOR", - normal = "NORMAL" + normal = "NORMAL", + case_name= config["analysis"]["case_id"] threads: get_threads(cluster_config, "sentieon_tnscope_umi") message: - "Calling SNVs using TNscope for sample: {params.tumor}" - " versus sample {params.normal}" + "Calling single nucleotide variants using TNscope for {params.case_name}" shell: """ mkdir -p {params.tmpdir}; @@ -47,6 +49,8 @@ export SENTIEON_LICENSE={params.sentieon_lic}; -r {input.ref_fa} \ -i {input.bamT} \ -i {input.bamN} \ +--interval {input.bed} \ +--interval_padding {params.padding} \ --algo {params.algo} \ --tumor_sample {params.tumor} \ --normal_sample {params.normal} \ @@ -59,7 +63,7 @@ export SENTIEON_LICENSE={params.sentieon_lic}; --max_error_per_read {params.error_rate} \ --pcr_indel_model {params.pcr_model} \ --prune_factor {params.prune_factor} \ -{output.vcf}; +{output.vcf_tnscope_umi}; echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap}; """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_paired.rule b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_paired.rule index c8c834ab4..c2ee61797 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_paired.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_paired.rule @@ -35,7 +35,7 @@ rule cnvkit_paired: sample_id = "TUMOR", genome = GENOME_VERSION message: - "Run CNVkit pipeline for sample {params.case_name} while tumor purity/ploidy calculated using PureCN" + "Calling CNVs using CNVkit and calculating tumor purity/ploidy using PureCN for {params.case_name}" shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule index 6b7ffaf09..9a3dc08a8 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_single.rule @@ -1,6 +1,14 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 + +def get_pon_cnn(config): + if "pon_cnn" in config["panel"]: + return os.path.abspath(config["panel"]["pon_cnn"]) + else: + return None + + rule cnvkit_single: input: access_bed = config["reference"]["access_regions"], @@ -33,10 +41,10 @@ rule cnvkit_single: min_mapq= params.common.min_mapq, case_name = config["analysis"]["case_id"], sample_id = "TUMOR", - genome_version = GENOME_VERSION + genome_version = GENOME_VERSION, + pon = " " if get_pon_cnn(config) is None else get_pon_cnn(config) message: - ("Run CNVkit pipeline for sample {params.case_name}," - "while tumor purity/ploidy calculated using PureCN") + ("Calling CNVs using CNVkit and calculating tumor purity/ploidy using PureCN for {params.case_name}") shell: """ mkdir -p {params.tmpdir}; @@ -66,18 +74,17 @@ cnvkit.py coverage {input.bamT} \ --processes {threads} \ --output {params.cnv_dir}/tumor.antitargetcoverage.cnn; -# Compile a coverage reference from the given list of files -cnvkit.py reference 
--output {params.cnv_dir}/FlatReference.cnn \ ---fasta {input.fasta} \ ---targets {params.cnv_dir}/targets.bed \ ---antitargets {params.cnv_dir}/antitarget_bed; # Combine the uncorrected target and antitarget coverage tables (.cnn) and # correct for biases in regional coverage and GC content, according to the given reference -cnvkit.py fix {params.cnv_dir}/tumor.targetcoverage.cnn \ -{params.cnv_dir}/tumor.antitargetcoverage.cnn \ -{params.cnv_dir}/FlatReference.cnn \ ---output {output.cnr}; +if [[ ! -f "{params.pon}" ]]; then +cnvkit.py reference --output {params.cnv_dir}/FlatReference.cnn --fasta {input.fasta} --targets {params.cnv_dir}/targets.bed --antitargets {params.cnv_dir}/antitarget_bed; +cnvkit.py fix {params.cnv_dir}/tumor.targetcoverage.cnn {params.cnv_dir}/tumor.antitargetcoverage.cnn {params.cnv_dir}/FlatReference.cnn --output {output.cnr}; +else +echo "PON reference exists - using it for coverage correction" +cnvkit.py fix {params.cnv_dir}/tumor.targetcoverage.cnn {params.cnv_dir}/tumor.antitargetcoverage.cnn {params.pon} --output {output.cnr}; +fi + # Infer copy number segments from the given coverage table # segmentation methods (-m): cbs: recommended for mid-size target panels and exomes diff --git a/BALSAMIC/snakemake_rules/variant_calling/germline.rule b/BALSAMIC/snakemake_rules/variant_calling/germline.rule index 8af31a0f1..97fe45bc0 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/germline.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/germline.rule @@ -1,70 +1,6 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 - - -rule gatk_haplotypecaller: - input: - fa = config["reference"]["reference_genome"], - bam = bam_dir + "{sample_type}.merged.bam", - bed = vcf_dir + "split_bed/{bedchrom}." + capture_kit, - output: - vcf_dir + "haplotypecaller/split_vcf/{sample_type}.{bedchrom}_haplotypecaller.vcf.gz" - benchmark: - Path(benchmark_dir,'gatk_haplotypecaller_' + "{sample_type}.{bedchrom}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("gatk") + ".sif").as_posix() - params: - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - sample = '{sample_type}', - gatk_path = '/opt/conda/opt/gatk-3.8' - threads: - get_threads(cluster_config,'gatk_haplotypecaller') - message: - ("Calling germline variants using gatk haplotypecaller for " - "targeted-panel sample {params.sample}") - shell: - """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; - -java -jar -Djava.io.tmpdir={params.tmpdir} -Xms8G -Xmx32G {params.gatk_path}/GenomeAnalysisTK.jar \ --T HaplotypeCaller \ --R {input.fa} \ --I {input.bam} \ --L {input.bed} \ -| bgzip > {output}; - -rm -rf {params.tmpdir}; - """ - - -rule haplotypecaller_merge: - input: - expand(vcf_dir + "haplotypecaller/split_vcf/{{sample_type}}.{chrom}_haplotypecaller.vcf.gz", chrom=chromlist) - output: - vcf_dir + "SNV.germline.{sample_type}.haplotypecaller.vcf.gz" - benchmark: - Path(benchmark_dir, 'haplotypecaller_merge_' + "SNV.germline.{sample_type}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("gatk") + ".sif").as_posix() - params: - tmpdir = tempfile.mkdtemp(prefix = tmp_dir), - sample = '{sample_type}' - message: - "Concatenate haplotyper outputs of multiple chr vcfs using bcftools for sample {params.sample}" - shell: - """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; - -bcftools concat {input} | bcftools sort --temp-dir {params.tmpdir} - | bgzip > {output}; -tabix -f -p vcf {output}; - -rm -rf {params.tmpdir}; - """ - - rule
sentieon_DNAscope: input: bam = bam_dir + "{sample_type}.merged.bam", @@ -83,7 +19,7 @@ rule sentieon_DNAscope: threads: get_threads(cluster_config, 'sentieon_DNAscope') message: - "Calling germline variants using Sentieon DNAscope for sample {params.sample}" + "Calling germline variants using Sentieon DNAscope for {params.sample}" shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/germline_sv.rule b/BALSAMIC/snakemake_rules/variant_calling/germline_sv.rule index 11c5ed4ab..4eed3880e 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/germline_sv.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/germline_sv.rule @@ -20,7 +20,7 @@ rule manta_germline: threads: get_threads(cluster_config,"manta_germline") message: - "Calling germline variants using manta for sample {params.sample}" + "Calling germline variants using manta for {params.sample}" shell: """ configManta.py \ diff --git a/BALSAMIC/snakemake_rules/variant_calling/mergetype_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/mergetype_normal.rule index 488e43b86..4f1b26a3d 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/mergetype_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/mergetype_normal.rule @@ -26,7 +26,7 @@ rule mergeBam_normal: threads: get_threads(cluster_config, "mergeBam_normal") message: - "Replace bam header using Picard tools for normal sample {params.sample}" + "Replacing bam header using Picardtools for {params.sample}" shell: """ picard AddOrReplaceReadGroups {params.picard} INPUT={input.bam} OUTPUT={output.bam}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/mergetype_tumor.rule b/BALSAMIC/snakemake_rules/variant_calling/mergetype_tumor.rule index 2d3fe24ed..8ac8bbf3b 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/mergetype_tumor.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/mergetype_tumor.rule @@ -26,7 +26,7 @@ rule mergeBam_tumor: threads: get_threads(cluster_config, "mergeBam_tumor") message: - "Replace bam header using Picard tools for tumor sample {params.sample}" + "Replacing bam header using Picardtools for {params.sample}" shell: """ picard AddOrReplaceReadGroups {params.picard} INPUT={input.bam} OUTPUT={output.bam}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_germline.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_germline.rule index 8f32aa665..4d39675ae 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_germline.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_germline.rule @@ -23,7 +23,7 @@ rule sentieon_DNAscope: threads: get_threads(cluster_config, 'sentieon_DNAscope') message: - "Calling germline variants using Sentieon DNAscope for sample {params.sample}" + "Calling germline variants using Sentieon DNAscope for {params.sample}" shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_split_snv_sv.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_split_snv_sv.rule index ca8189759..0751a0512 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_split_snv_sv.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_split_snv_sv.rule @@ -20,7 +20,7 @@ rule bcftools_view_split_variant: threads: get_threads(cluster_config, 'bcftools_view_split_variant') message: - "Split tnscope snv and sv variants using bcftools for sample {params.case_name}" + "Split tnscope snv and sv variants using bcftools for {params.case_name}" shell: """ export TMPDIR={params.tmpdir}; diff --git 
a/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule index f97ed8189..75a8cefcf 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule @@ -31,7 +31,7 @@ rule sentieon_base_calibration: threads: get_threads(cluster_config, 'sentieon_base_calibration') message: - "Base recalibration using sentieon tools for sample {params.sample}" + "Recalibrating bases using sentieon tools for {params.sample}" shell: """ mkdir -p {params.tmpdir}; @@ -96,7 +96,7 @@ rule sentieon_TNhaplotyper_tumor_only: threads: get_threads(cluster_config, 'sentieon_TNhaplotyper_tumor_only') message: - "Calling SNVs using sentieon TNhaplotyper for sample {params.case_name}" + "Calling SNVs using sentieon TNhaplotyper for {params.case_name}" shell: """ mkdir -p {params.tmpdir}; @@ -127,12 +127,13 @@ rule sentieon_TNscope_tumor_only: bam = expand(bam_dir + "tumor.merged.bam"), recal = expand(bam_dir + "tumor.merged.recal_data.table") output: - vcf = vcf_dir + "sentieon_tnscope" + "/" + "ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", + vcf_tnscope = vcf_dir + "sentieon_tnscope" + "/" + "ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", namemap_snv = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.sample_name_map", namemap_sv = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".tnscope.sample_name_map", benchmark: Path(benchmark_dir, "sentieon_TNscope_tumor_only_" + config["analysis"]["case_id"] + ".tsv").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), tumor = "TUMOR", tumor_options = VARCALL_PARAMS["tnscope"]["tumor"], @@ -145,7 +146,7 @@ rule sentieon_TNscope_tumor_only: threads: get_threads(cluster_config, 'sentieon_TNscope_tumor_only') message: - "Calling SNVs using sentieon TNscope for sample {params.case_name}" + "Calling SNVs using sentieon TNscope for {params.case_name}" shell: """ mkdir -p {params.tmpdir}; @@ -162,7 +163,7 @@ export SENTIEON_LICENSE={params.sentieon_lic}; --tumor_sample {params.tumor} {params.pon} \ --dbsnp {input.dbsnp} \ --pcr_indel_mode {params.pcr_model} \ -{params.tumor_options} {output.vcf}; +{params.tumor_options} {output.vcf_tnscope}; echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap_snv}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule index f2d372091..91113ff1b 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule @@ -24,7 +24,7 @@ rule sentieon_base_calibration: threads: get_threads(cluster_config, 'sentieon_base_calibration') message: - "Base recalibration using Sentieon tools for sample {params.sample}" + "Base recalibration using Sentieon tools for {params.sample}" shell: """ mkdir -p {params.tmpdir}; @@ -162,12 +162,13 @@ rule sentieon_TNscope: recalT = expand(bam_dir + "tumor.merged.recal_data.table"), recalN = expand(bam_dir + "normal.merged.recal_data.table"), output: - vcf_all = vcf_dir + "sentieon_tnscope/ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", + vcf_tnscope = vcf_dir + "sentieon_tnscope/ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", namemap_snv = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope.sample_name_map", namemap_sv = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".tnscope.sample_name_map", benchmark: Path(benchmark_dir, 'sentieon_TNscope_' + config[ "analysis" ][ "case_id" ] + ".tsv").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), tumor = "TUMOR", normal = "NORMAL", @@ -211,7 +212,7 @@ intermediate_vcf={params.tmpdir}/tn_sentieon_varcall_file -r {input.ref} \ --algo TNModelApply \ -m {params.sentieon_ml_tnscope} \ --v $intermediate_vcf {output.vcf_all}; +-v $intermediate_vcf {output.vcf_tnscope}; echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap_snv}; cp {output.namemap_snv} {output.namemap_sv} diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule index c9bb6b186..6c5fe40fb 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule @@ -54,8 +54,7 @@ echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap} rm -rf {params.tmpdir}; """ - -rule delly_tumor_normal: +rule delly_sv_tumor_normal: input: fa = config["reference"]["reference_genome"], bamN = bam_dir + normal_bam, @@ -63,9 +62,9 @@ rule delly_tumor_normal: excl = config["reference"]["delly_exclusion_converted"], output: final = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", - namemap = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.sample_name_map", + namemap = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".dellysv.sample_name_map", benchmark: - Path(benchmark_dir, 'delly_tumor_normal_' + config["analysis"]["case_id"] + ".tsv") + Path(benchmark_dir, 'delly_sv_tumor_normal_' + config["analysis"]["case_id"] + ".tsv") singularity: Path(singularity_image, config["bioinfo_tools"].get("delly") + ".sif").as_posix() params: @@ -91,12 +90,11 @@ echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap} rm -rf {params.tmpdir}; """ - rule bcftools_bcf2vcf_delly: input: bcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", output: - vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.vcf.gz", + vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".dellysv.vcf.gz", benchmark: Path(benchmark_dir, 'bcftools_bcf2vcf_delly_' + config["analysis"]["case_id"] + ".tsv") singularity: @@ -106,7 +104,7 @@ rule bcftools_bcf2vcf_delly: threads: get_threads(cluster_config, "bcftools_bcf2vcf_delly") message: - ("Convert bcf to vcf for structural variants called using delly for {params.case_name}") + ("Converting BCF from delly to VCF for {params.case_name}") shell: """ bcftools view --threads {threads} -f PASS -O z -o {output.vcf} {input.bcf}; @@ -114,7 +112,6 @@ bcftools view --threads {threads} -f PASS -O z -o {output.vcf} {input.bcf}; tabix -p vcf -f {output.vcf}; """ - rule ascat_tumor_normal: input: fa = config["reference"]["reference_genome"] , @@ -124,26 +121,30 @@ rule ascat_tumor_normal: chryloci= config["reference"]["ascat_chryloci"], output: final_vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.vcf.gz", - sample_statistics = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.samplestatistics.txt", - ascat_plots= expand( - vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".ascat." + "{output_suffix}" + ".png", - output_suffix=["ascatprofile", "rawprofile", "ASPCF", "tumor", "germline", "sunrise"] - ), + ascat_copynumber = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.copynumber.txt.gz", + sample_statistics = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.samplestatistics.txt"), + plot_ascat_profile = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.ascatprofile.png"), + plot_raw_profile = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.rawprofile.png"), + plot_aspcf = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.ASPCF.png"), + plot_tumor = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.tumor.png"), + plot_germline = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.germline.png"), + plot_sunrise = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.sunrise.png"), namemap = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.sample_name_map", benchmark: benchmark_dir + 'ascat_tumor_normal_' + config["analysis"]["case_id"] + "_ascat.tsv" singularity: Path(singularity_image, config["bioinfo_tools"].get("ascatNgs") + ".sif").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), tumor = "TUMOR", normal = "NORMAL", - genome = GENOME_VERSION + genome = GENOME_VERSION, + case_name = config["analysis"]["case_id"] threads: get_threads(cluster_config, "ascat_tumor_normal") message: - ("Call copy number variants using ascatNGS for {input.bamT} vs {input.bamN} files then " - "filter somatic variants and finally convert to compressed vcf file") + ("Calling copy number variants using ascatNGS for {params.case_name}") shell: """ export LD_LIBRARY_PATH=:/opt/wtsi-cgp/lib; @@ -162,19 +163,21 @@ ascat.pl \ cp {params.tmpdir}/{params.tumor}.copynumber.caveman.vcf.gz {output.final_vcf}; +cp {params.tmpdir}/{params.tumor}.copynumber.txt.gz {output.ascat_copynumber} + cp {params.tmpdir}/{params.tumor}.samplestatistics.txt {output.sample_statistics}; -cp {params.tmpdir}/{params.tumor}.ASCATprofile.png {output.ascat_plots[0]}; +cp {params.tmpdir}/{params.tumor}.ASCATprofile.png {output.plot_ascat_profile}; -cp {params.tmpdir}/{params.tumor}.rawprofile.png {output.ascat_plots[1]}; +cp {params.tmpdir}/{params.tumor}.rawprofile.png {output.plot_raw_profile}; -cp {params.tmpdir}/{params.tumor}.ASPCF.png {output.ascat_plots[2]}; +cp {params.tmpdir}/{params.tumor}.ASPCF.png {output.plot_aspcf}; -cp {params.tmpdir}/{params.tumor}.tumour.png {output.ascat_plots[3]}; +cp {params.tmpdir}/{params.tumor}.tumour.png {output.plot_tumor}; -cp {params.tmpdir}/{params.tumor}.germline.png {output.ascat_plots[4]}; +cp {params.tmpdir}/{params.tumor}.germline.png {output.plot_germline}; -cp {params.tmpdir}/{params.tumor}.sunrise.png {output.ascat_plots[5]}; +cp {params.tmpdir}/{params.tumor}.sunrise.png {output.plot_sunrise}; tabix -p vcf -f {output.final_vcf}; @@ -183,7 +186,6 @@ echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap} rm -rf {params.tmpdir}; """ - rule ascat_tumor_normal_merge_output: input: sample_statistics = vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".ascat.samplestatistics.txt", @@ -192,17 +194,54 @@ rule ascat_tumor_normal_merge_output: output_suffix=["ascatprofile", "rawprofile", "ASPCF", "tumor", "germline", "sunrise"] ), output: - ascat_output_pdf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.output.pdf" + ascat_pdf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".ascat.output.pdf" params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, merge_ascat_output_script= get_script_path("create_pdf.py"), singularity: Path(singularity_image, "balsamic.sif").as_posix() threads: get_threads(cluster_config, "ascat_tumor_normal_merge_output") message: - "Merge the ascatNgs output plots together with the sample statistics into a single PDF" + "Merging the output plots and the sample statistics from ascatNGS into a single PDF" shell: """ -python {params.merge_ascat_output_script} {output.ascat_output_pdf} {input.sample_statistics} {input.ascat_plots} +python {params.merge_ascat_output_script} {output.ascat_pdf} {input.sample_statistics} {input.ascat_plots} + """ + +rule svdb_merge_tumor_normal: + input: + vcf = expand( + vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".{caller}.vcf.gz", + caller=somatic_caller_sv) + + expand( + vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".{caller}.vcf.gz", + caller=somatic_caller_cnv) + output: + vcf_svdb = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.vcf.gz", + namemap = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.sample_name_map", + benchmark: + Path(benchmark_dir, 'svdb_merge_tumor_normal_' + config["analysis"]["case_id"] + ".tsv") + singularity: + Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix() + params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, + tumor = get_sample_type(config["samples"], "tumor"), + normal = get_sample_type(config["samples"], "normal"), + case_name = config["analysis"]["case_id"], + vcf= lambda wildcards, input:[input[index] + ":" + svdb_callers_prio[index] for index in range(0,len(input))], + svdb_priority= ",".join(svdb_callers_prio) + threads: + get_threads(cluster_config, "svdb_merge_tumor_normal") + message: + "Merging structural and copy number variants using SVDB for {params.case_name}" + shell: + """ +svdb --merge --no_intra --bnd_distance 5000 --overlap 0.80 \ +--vcf {params.vcf} \ +--priority {params.svdb_priority} | \ +bgzip -l 9 -c > {output.vcf_svdb}; +tabix -p vcf -f {output.vcf_svdb}; + +echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap}; """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule index 2bf3655f4..4a3620f8a 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule @@ -50,17 +50,16 @@ echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; rm -rf {params.tmpdir}; """ - -rule delly_tumor_only: +rule delly_sv_tumor_only: input: fa = config["reference"]["reference_genome"], bamT = bam_dir + tumor_bam, excl = config["reference"]["delly_exclusion_converted"], output: bcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", - namemap= vcf_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".delly.sample_name_map", + namemap= vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".dellysv.sample_name_map", benchmark: - benchmark_dir + 'delly_tumor_only_' + config["analysis"]["case_id"] + ".tsv" + benchmark_dir + 'delly_sv_tumor_only_' + config["analysis"]["case_id"] + ".tsv" singularity: Path(singularity_image, config["bioinfo_tools"].get("delly") + ".sif").as_posix() params: @@ -71,8 +70,7 @@ rule delly_tumor_only: threads: get_threads(cluster_config, "delly_tumor_only") message: - ("Calling structural variants using delly for {params.case_name}," - "filter somatic variants and finally convert from bcf to compressed vcf file") + ("Calling structural variants using delly for {params.case_name}") shell: """ delly call -x {input.excl} -o {output.bcf} -g {input.fa} {input.bamT} @@ -82,11 +80,46 @@ echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; rm -rf {params.tmpdir}; """ -rule bcftools_bcf2vcf_delly: +rule delly_cnv_tumor_only: input: + fa = config["reference"]["reference_genome"], + bamT = bam_dir + tumor_bam, bcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", + map = config["reference"]["delly_mappability"], output: - vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.vcf.gz", + cnv_delly = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", + rd_delly = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".dellycnv.cov.gz", + namemap= vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".dellycnv.sample_name_map", + benchmark: + benchmark_dir + 'delly_cnv_tumor_only_' + config["analysis"]["case_id"] + ".tsv" + singularity: + Path(singularity_image, config["bioinfo_tools"].get("delly") + ".sif").as_posix() + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + housekeeper_id= {"id": config["analysis"]["case_id"],"tags": "clinical"}, + runmode = "local", + tumor = "TUMOR", + case_name = config["analysis"]["case_id"] + threads: + get_threads(cluster_config, "delly_tumor_only") + message: + ("Calling copy number variants using delly for {params.case_name}") + shell: + """ +delly cnv -m {input.map} -g {input.fa} -c {output.rd_delly} -o {output.cnv_delly} -l {input.bcf} {input.bamT} + +echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; + +rm -rf {params.tmpdir}; + """ + +rule bcftools_bcf2vcf_delly: + input: + bcf_sv = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", + bcf_cnv= vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".delly.bcf", + output: + vcf_sv = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".dellysv.vcf.gz", + vcf_cnv= vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".dellycnv.vcf.gz", benchmark: Path(benchmark_dir, 'bcftools_bcf2vcf_delly_' + config["analysis"]["case_id"] + ".tsv") singularity: @@ -96,12 +129,50 @@ rule bcftools_bcf2vcf_delly: threads: get_threads(cluster_config, "bcftools_bcf2vcf_delly") message: - ("Convert bcf to vcf for structural variants called using delly for {params.case_name}") + ("Converting BCF from delly to VCF for {params.case_name}") shell: """ -bcftools view --threads {threads} -f PASS -O z -o {output.vcf} {input.bcf}; +bcftools view --threads {threads} -f PASS -O z -o {output.vcf_sv} {input.bcf_sv}; -tabix -p vcf -f {output.vcf}; +bcftools view --threads {threads} -f PASS -O z -o {output.vcf_cnv} {input.bcf_cnv} + +tabix -p vcf -f {output.vcf_sv}; + +tabix -p vcf -f {output.vcf_cnv} """ +rule svdb_merge_tumor_only: + input: + vcf = expand( + vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".{caller}.vcf.gz", + caller=somatic_caller_sv) + + expand( + vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".{caller}.vcf.gz", + caller=somatic_caller_cnv) + output: + vcf_svdb = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.vcf.gz", + namemap = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.sample_name_map", + benchmark: + Path(benchmark_dir, 'svdb_merge_tumor_only_' + config["analysis"]["case_id"] + ".tsv") + singularity: + Path(singularity_image, config["bioinfo_tools"].get("svdb") + ".sif").as_posix() + params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, + tumor = get_sample_type(config["samples"], "tumor"), + case_name = config["analysis"]["case_id"], + vcf= lambda wildcards, input:[input[index] + ":" + svdb_callers_prio[index] for index in range(0,len(input))], + svdb_priority= ",".join(svdb_callers_prio) + threads: + get_threads(cluster_config, "svdb_merge_tumor_only") + message: + "Merging structural and copy number variants using SVDB for {params.case_name}" + shell: + """ +svdb --merge --no_intra --bnd_distance 5000 --overlap 0.80 \ +--vcf {params.vcf} \ +--priority {params.svdb_priority} | \ +bgzip -l 9 -c > {output.vcf_svdb}; +tabix -p vcf -f {output.vcf_svdb}; +echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; + """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule index e7a4de20f..95d14e31c 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule @@ -25,7 +25,7 @@ rule vardict_tumor_normal: threads: get_threads(cluster_config, "vardict_tumor_normal") message: - "Calling variants using vardict for sample {params.case_name}" + "Calling variants using vardict for {params.case_name}" shell: """ mkdir -p {params.tmpdir}; @@ -50,10 +50,11 @@ rule vardict_merge: input: expand(vcf_dir + "vardict/split_vcf/{chrom}_vardict.vcf.gz", chrom=chromlist) output: - vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz", + vcf_vardict = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz", yaml = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.yaml", namemap = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".vardict.sample_name_map" params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), case_name = config["analysis"]["case_id"], benchmark: @@ -63,19 +64,18 @@ rule vardict_merge: threads: get_threads(cluster_config,"vardict_merge") message: - ("Merging all chromosomes vardict results into " - "single vcf using bcftools for sample {params.case_name}") + ("Merging multiple VCFs from vardict into single VCF using bcftools for {params.case_name}") shell: """ mkdir -p {params.tmpdir}; -bcftools concat {input} | bcftools sort --temp-dir {params.tmpdir} - | bgzip > {output.vcf}; -tabix -f -p vcf {output.vcf}; +bcftools concat {input} | bcftools sort --temp-dir {params.tmpdir} - | bgzip > {output.vcf_vardict}; +tabix -f -p vcf {output.vcf_vardict}; echo -e \"{params.case_name}\\tTUMOR\\n{params.case_name}-match\\tNORMAL\" > {output.namemap}; echo -e \"{params.case_name}\" > {output.namemap}.tumor; echo -e \"{params.case_name}-match\" > {output.namemap}.normal; -echo '{{ vcf: {{ vardict: {{ name: vardict, path: {output.vcf} }} }} }}' > {output.yaml}; +echo '{{ vcf: {{ vardict: {{ name: vardict, path: {output.vcf_vardict} }} }} }}' > {output.yaml}; rm -rf {params.tmpdir}; """ @@ -103,7 +103,7 @@ rule sentieon_TNhaplotyper: threads: get_threads(cluster_config, 'sentieon_TNhaplotyper') message: - "Calling variants using TNhaplotyper for sample {params.case_name}" + "Calling single nucleotide variants using TNhaplotyper for {params.case_name}" shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule index 73c1a5f2d..267fd091a 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule @@ -1,15 +1,6 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -def get_pon(config): - """ return pon cli string, complete with file """ - if "PON" in config["analysis"]: - return os.path.abspath(config["analysis"]["PON"]) - else: - return None - - - rule vardict_tumor_only: input: fa = config["reference"]["reference_genome"], @@ -31,7 +22,7 @@ rule vardict_tumor_only: threads: get_threads(cluster_config, "vardict_tumor_only") message: - "Calling variants using vardict for sample {params.case_name}" + "Calling single nucleotide variants using vardict for {params.case_name}" shell: """ export PERL5LIB=; @@ -62,19 +53,19 @@ rule vardict_merge: output: namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.sample_name_map", yaml = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.yaml", - vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz" + vcf_vardict = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".vardict.vcf.gz" benchmark: Path(benchmark_dir, 'vardict_merge_' + config["analysis"]["case_id"] + ".tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("vardict") + ".sif").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), case_name = config["analysis"]["case_id"], threads: get_threads(cluster_config,"vardict_merge") message: - ("Merging all chromosomes vardict results into " - "single vcf using bcftools for sample {params.case_name}") + ("Merging multiple VCFs from vardict into single VCF using bcftools for {params.case_name}") shell: """ mkdir -p {params.tmpdir}; @@ -82,12 +73,12 @@ export TMPDIR={params.tmpdir}; bcftools concat {input} \ | bcftools sort --temp-dir {params.tmpdir} - \ -| bgzip > {output.vcf}; -tabix -f -p vcf {output.vcf}; +| bgzip > {output.vcf_vardict}; +tabix -f -p vcf {output.vcf_vardict}; echo -e \"{params.case_name}\\tTUMOR\" > {output.namemap}; echo -e \"{params.case_name}\" > {output.namemap}.tumor; -echo '{{ vcf: {{ vardict: {{ name: vardict , path: {output.vcf} }} }} }}' > {output.yaml}; +echo '{{ vcf: {{ vardict: {{ name: vardict , path: {output.vcf_vardict} }} }} }}' > {output.yaml}; """ @@ -105,7 +96,6 @@ rule sentieon_TNhaplotyper_tumor_only: Path(benchmark_dir,'sentieon_TNhaplotyper_tumor_only_' + config["analysis"]["case_id"] + ".tsv").as_posix() params: tumor = "TUMOR", - pon = " " if get_pon(config) is None else " ".join(["--pon", get_pon(config)]), tmpdir= tempfile.mkdtemp(prefix=tmp_dir), sentieon_exec = config["SENTIEON_EXEC"], sentieon_lic = config["SENTIEON_LICENSE"], @@ -127,7 +117,7 @@ export SENTIEON_LICENSE={params.sentieon_lic}; -i {input.bam} \ --interval {input.interval} \ --algo TNhaplotyper \ ---tumor_sample {params.tumor} {params.pon} \ +--tumor_sample {params.tumor} \ --cosmic {input.cosmic} \ --dbsnp {input.dbsnp} {output.vcf}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule b/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule index 856a95a51..747d7604e 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule @@ -19,8 +19,7 @@ rule bedtools_splitbed_by_chrom: split_bed_dir = vcf_dir + "split_bed/", origin_bed = capture_kit, message: - ("Capturing reference genome chromosome size and splitting the panel bed per chromosome" - "Extend the region by 100bp on each direction, sort and merge the overlapping intervals using bedtools") + ("Splitting the panel bed per chromosome, flanking regions by 100bp and merging into single VCF using bedtools") shell: """ mkdir -p {params.tmpdir}; diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index 6dab1503c..7780e2c13 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -9,6 +9,7 @@ from pathlib import Path from io import StringIO from distutils.spawn import find_executable +import zlib import yaml import snakemake @@ -231,6 +232,15 @@ def write_json(json_out, output_config): raise error +def read_yaml(yaml_path): + """Retrieves data from a yaml file""" + if Path(yaml_path).exists(): + with open(yaml_path, "r") as fn: + return yaml.load(fn, Loader=yaml.SafeLoader) + else: + raise FileNotFoundError(f"The YAML file {yaml_path} was not found.") + + def iterdict(dic): """dictionary iteration - returns generator""" for key, value in dic.items(): @@ -251,17 +261,24 @@ def get_schedulerpy(): return scheduler -def 
@@ -251,17 +261,24 @@ def get_schedulerpy(): return scheduler -def get_snakefile(analysis_type, sequencing_type="targeted"): +def get_snakefile(analysis_type, reference_genome="hg19"): """ Return a string path for variant calling snakefile. """ p = Path(__file__).parents[1] snakefile = Path(p, "workflows", "balsamic.smk") + if analysis_type == "generate_ref": snakefile = Path(p, "workflows", "reference.smk") + if "canfam3" in reference_genome: + snakefile = Path(p, "workflows", "reference-canfam3.smk") + return str(snakefile) + if analysis_type == "pon": snakefile = Path(p, "workflows", "PON.smk") + if "qc_panel" in analysis_type: + snakefile = Path(p, "workflows", "QC.smk") return str(snakefile) @@ -583,7 +600,9 @@ def generate_graph(config_collection_dict, config_path): snakemake.snakemake( snakefile=get_snakefile( analysis_type=config_collection_dict["analysis"]["analysis_type"], - sequencing_type=config_collection_dict["analysis"]["sequencing_type"], + reference_genome=config_collection_dict["reference"][ + "reference_genome" + ], ), dryrun=True, configfiles=[config_path], @@ -683,3 +702,22 @@ def create_pon_fastq_symlink(pon_fastqs, symlink_dir): os.symlink(pon_fastq, pon_sym_file) except FileExistsError: LOG.info(f"File {pon_sym_file} exists, skipping") + + +# NOTE: computes a CRC32-based hex digest via zlib rather than a true MD5 +def get_md5(filename): + with open(filename, "rb") as fh: + hashed = 0 + while True: + s = fh.read(65536) + if not s: + break + hashed = zlib.crc32(s, hashed) + return "%08X" % (hashed & 0xFFFFFFFF) + + +def create_md5(reference, check_md5): + """Creates a checksum file for all reference data""" + with open(check_md5, "w") as fh: + for key, value in reference.items(): + if os.path.isfile(value): + fh.write(get_md5(value) + " " + value + "\n") diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index 9397c1892..dc300a54b 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -3,7 +3,7 @@ import os from datetime import datetime from pathlib import Path -from typing import Optional, List, Dict +from typing import Optional, List, Dict, Any from pydantic import BaseModel, validator, Field, AnyUrl, root_validator from pydantic.types import DirectoryPath, FilePath @@ -56,6 +56,7 @@ class VarCallerFilter(BaseModel): MQ: VCFAttributes (optional); minimum mapping quality DP: VCFAttributes (optional); minimum read depth pop_freq: VCFAttributes (optional); maximum gnomad_af + pop_freq_umi: VCFAttributes (optional); maximum gnomad_af for UMI workflow strand_reads: VCFAttributes (optional); minimum strand specific read counts qss: VCFAttributes (optional); minimum sum of base quality scores sor: VCFAttributes (optional); minimum symmetrical log-odds ratio @@ -71,6 +72,7 @@ MQ: Optional[VCFAttributes] DP: Optional[VCFAttributes] pop_freq: Optional[VCFAttributes] + pop_freq_umi: Optional[VCFAttributes] strand_reads: Optional[VCFAttributes] qss: Optional[VCFAttributes] sor: Optional[VCFAttributes] @@ -178,17 +180,18 @@ def sequencing_type_literal(cls, value) -> str: class VCFModel(BaseModel): """Contains VCF config""" - manta: VarcallerAttribute - cnvkit: VarcallerAttribute vardict: VarcallerAttribute tnscope: VarcallerAttribute dnascope: VarcallerAttribute tnhaplotyper: VarcallerAttribute - manta_germline: VarcallerAttribute - haplotypecaller: VarcallerAttribute TNscope_umi: VarcallerAttribute - delly: VarcallerAttribute + manta_germline: VarcallerAttribute + manta: VarcallerAttribute + dellysv: VarcallerAttribute + cnvkit: VarcallerAttribute ascat: VarcallerAttribute + dellycnv: VarcallerAttribute + svdb: VarcallerAttribute class
AnalysisModel(BaseModel): @@ -197,10 +200,11 @@ class AnalysisModel(BaseModel): Attributes: case_id : Field(required); string case identifier - analysis_type : Field(required); string literal [single, paired, pon] + analysis_type : Field(required); string literal [single, paired, pon, qc_panel] single : if only tumor samples are provided paired : if both tumor and normal samples are provided pon : panel of normal analysis + qc_panel : QC analysis only sequencing_type : Field(required); string literal [targeted, wgs] targeted : if capture kit was used to enrich specific genomic regions wgs : if whole genome sequencing was performed @@ -216,7 +220,7 @@ class AnalysisModel(BaseModel): Raises: ValueError: - When analysis_type is set to any value other than [single, paired, qc, pon] + When analysis_type is set to any value other than [single, paired, pon, qc_panel] When sequencing_type is set to any value other than [wgs, targeted] """ @@ -353,6 +357,7 @@ class PanelModel(BaseModel): Attributes: capture_kit : Field(str(Path)); string representation of path to PANEL BED file chrom : Field(list(str)); list of chromosomes in PANEL BED + pon_cnn: Field(optional); Path where PON reference .cnn file is stored Raises: ValueError: @@ -362,11 +367,18 @@ class PanelModel(BaseModel): capture_kit: Optional[FilePath] chrom: Optional[List[str]] + pon_cnn: Optional[FilePath] @validator("capture_kit") def path_as_abspath_str(cls, value): return Path(value).resolve().as_posix() + @validator("pon_cnn") + def pon_abspath_as_str(cls, value): + if value: + return Path(value).resolve().as_posix() + return None + class PonBalsamicConfigModel(BaseModel): """Summarizes config models in preparation for export @@ -418,7 +430,7 @@ class BalsamicConfigModel(BaseModel): """ QC: QCModel - vcf: VCFModel + vcf: Optional[VCFModel] analysis: AnalysisModel samples: Dict[str, SampleInstanceModel] reference: Dict[str, Path] @@ -526,6 +538,7 @@ class ReferenceMeta(BaseModel): rankscore: ReferenceUrlsModel. Optional rankscore model access_regions: ReferenceUrlsModel. Optional field for accessible genome regions delly_exclusion: ReferenceUrlsModel. Optional field for genome exclusion regions + delly_mappability: ReferenceUrlsModel. Optional field for genome mappability ascat_gccorrection: ReferenceUrlsModel. Optional field for genome gc correction bins ascat_chryloci: ReferenceUrlsModel. Optional field for chromosome Y loci clinvar: ReferenceUrlsModel. 
Optional field for clinvar reference @@ -548,6 +561,9 @@ class ReferenceMeta(BaseModel): rankscore: Optional[ReferenceUrlsModel] access_regions: Optional[ReferenceUrlsModel] delly_exclusion: Optional[ReferenceUrlsModel] + delly_mappability: Optional[ReferenceUrlsModel] + delly_mappability_gindex: Optional[ReferenceUrlsModel] + delly_mappability_findex: Optional[ReferenceUrlsModel] ascat_gccorrection: Optional[ReferenceUrlsModel] ascat_chryloci: Optional[ReferenceUrlsModel] clinvar: Optional[ReferenceUrlsModel] @@ -619,6 +635,7 @@ class UMIParamsTNscope(BaseModel): init_tumorLOD: float (required); minimum tumor log odds in the initial pass calling variants error_rate: int (required); allow error-rate to consider in calling prunefactor: int (required); pruning factor in the kmer graph + padding: int(required); amount to pad bed interval regions """ algo: str @@ -626,6 +643,7 @@ class UMIParamsTNscope(BaseModel): min_tumorLOD: int error_rate: int prunefactor: int + padding: int disable_detect: str @@ -697,64 +715,20 @@ class BalsamicWorkflowConfig(BaseModel): tnscope_umi: UMIParamsTNscope -class QCMetricModel(BaseModel): - """Defines the quality control metric model +class MetricConditionModel(BaseModel): + """Defines the metric condition model Attributes: - name: str (required); quality control metric name norm: string (optional); validation condition threshold: float (optional); validation cut off - value: float (required); metrics value - - Raises: - ValueError: when a metric does not meet its validation requirements """ - name: str norm: Optional[str] = None threshold: Optional[float] = None - value: float - - @root_validator() - def check_metric(cls, values): - """Checks if a metric meets its filtering condition""" - if ( - values["norm"] - and values["threshold"] - and not VALID_OPS[values["norm"]](values["value"], values["threshold"]) - ): - raise ValueError( - f"QC metric {values['name']}: {values['value']} validation has failed. " - f"(Condition: {values['norm']} {values['threshold']})." - ) - - LOG.info(f"QC metric {values['name']}: {values['value']} meets its condition.") - return values - - -class QCValidationModel(BaseModel): - """Defines the quality control validation model - - Attributes: - metrics: Dict(sample_name, list(QCMetricModel)) (required); quality control metric attributes - """ - metrics: Dict[str, List[QCMetricModel]] - - @property - def get_json(self): - """Restructures the metrics dictionary and returns a metric-value json object""" - metrics_json = {k: {} for k in self.metrics} - - for sample_name, metrics in self.metrics.items(): - for metric in metrics: - metrics_json[sample_name].update({metric.name: metric.value}) - - return metrics_json - -class DeliveryMetricModel(BaseModel): - """Defines the metric attributes model for delivery +class MetricModel(BaseModel): + """Defines the metric attributes model Attributes: header: str (optional); data @@ -762,7 +736,8 @@ class DeliveryMetricModel(BaseModel): input: str (required); input file name: str (required); metric name step: str (required); step that generated the metric - value: float (required); metric value + value: Any (required and can take None as a value); metric value + condition: MetricConditionModel (required and can take None as a value); metric validation condition """ header: Optional[str] @@ -770,4 +745,43 @@ class DeliveryMetricModel(BaseModel): input: str name: str step: str - value: float + value: Any = ... + condition: Optional[MetricConditionModel] = ... 
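+    # The Ellipsis (...) default marks a field as required in pydantic, while the
+    # Any / Optional annotations still accept None, matching the "required and can
+    # take None as a value" behaviour described in the docstring.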
+ + @validator("name") + def validate_name(cls, name, values): + """Updates the name if the source is FastQC""" + + if "fastqc-percent_duplicates" in name: + return "PERCENT_DUPLICATION_R" + values["input"].split("_")[-2] + + return name + + +class MetricValidationModel(BaseModel): + """Defines the metric validation model + + Attributes: + metrics: List[MetricModel] (required); metric model to validate + + Raises: + ValueError: when a metric does not meet its validation requirements + """ + + metrics: List[MetricModel] + + @validator("metrics", each_item=True) + def validate_metrics(cls, metric): + """Checks if a metric meets its filtering condition""" + + if metric.condition and not VALID_OPS[metric.condition.norm]( + metric.value, metric.condition.threshold + ): + raise ValueError( + f"QC metric {metric.name}: {metric.value} validation has failed. " + f"(Condition: {metric.condition.norm} {metric.condition.threshold}, ID: {metric.id})." + ) + + LOG.info(f"QC metric {metric.name}: {metric.value} meets its condition.") + + return metric diff --git a/BALSAMIC/utils/qc_metrics.py b/BALSAMIC/utils/qc_metrics.py index 704641245..17d4b281f 100644 --- a/BALSAMIC/utils/qc_metrics.py +++ b/BALSAMIC/utils/qc_metrics.py @@ -1,162 +1,7 @@ -import json -import os +from BALSAMIC.utils.models import MetricValidationModel -from BALSAMIC.constants.quality_check_reporting import ( - METRICS, - METRICS_TO_DELIVER, -) -from BALSAMIC.utils.models import QCValidationModel, DeliveryMetricModel +def validate_qc_metrics(metrics: dict) -> dict: + """Returns a set of validated QC metrics""" -def get_qc_available_panel_beds(metrics): - """Returns available panel beds file names for QC validation""" - available_beds = [] - - for k in metrics: - if k != "default": - available_beds.append(k) - - return available_beds - - -def merge_dicts(*dicts): - """Merges multiple dictionaries integrating by common keys""" - merged_dict = {} - - for d in dicts: - for key in d: - try: - # Overwrites the default values with panel specific ones - merged_dict[key].update(d[key]) - except KeyError: - merged_dict[key] = d[key] - - return merged_dict - - -def read_metrics(analysis_path, file_name): - """Extracts all the metrics from a specific QC file""" - with open(os.path.join(analysis_path, "qc", "multiqc_data", file_name), "r") as f: - raw_metrics = json.load(f) - - # Ignore the metrics associated with UMIs - filtered_raw_metrics = { - sample_name: metrics - for sample_name, metrics in raw_metrics.items() - if "umi" not in sample_name - } - - return filtered_raw_metrics - - -def update_metrics_dict(sample_id, metric, value, metrics_dict): - """Appends a {metric, value, condition} object to a dictionary""" - sample_name = "_".join([sample_id.split("_")[0], sample_id.split("_")[1]]) - - if sample_name not in metrics_dict: - metrics_dict[sample_name] = [] - - try: - norm = metric[1]["condition"]["norm"] - threshold = metric[1]["condition"]["threshold"] - except TypeError: - norm = None - threshold = None - - metrics_dict[sample_name].append( - {"name": metric[0], "norm": norm, "threshold": threshold, "value": value} - ) - - return metrics_dict - - -def get_qc_metrics_dict(analysis_path, requested_metrics): - """Returns a dictionary of the requested QC metrics along with their values and filtering conditions""" - metrics_dict = {} - - # Loop through MultiQC json files - for file_name, metrics in requested_metrics.items(): - raw_metrics = read_metrics(analysis_path, file_name) - for j in raw_metrics: - for k in metrics.items(): - 
metrics_dict = update_metrics_dict( - j, k, raw_metrics[j][k[0]], metrics_dict - ) - return metrics_dict - - -def get_qc_metrics_json(analysis_path, sequencing_type, panel_bed): - """Extracts the metrics of interest and returns them as a json object""" - if sequencing_type != "wgs" and panel_bed in get_qc_available_panel_beds( - METRICS["qc"][sequencing_type] - ): - metrics = merge_dicts( - METRICS["qc"][sequencing_type]["default"], - METRICS["qc"][sequencing_type][panel_bed], - ) - elif sequencing_type != "wgs": - metrics = METRICS["qc"][sequencing_type]["default"] - else: - metrics = METRICS["qc"][sequencing_type] - - qc_model = QCValidationModel.parse_obj( - {"metrics": get_qc_metrics_dict(analysis_path, metrics)} - ) - - return qc_model.get_json - - -def get_multiqc_data_source(data, sample, source_name): - """Extracts the metrics data source associated with sample and source names""" - - # Splits multiqc_picard_dups into ['multiqc', 'picard', 'dup'] in order to retrieve the - # ["report_data_sources"]["Picard"]["DuplicationMetrics"] values from multiqc_data.json - source = source_name[:-1].split("_") - - # Nested json fetching - for source_tool in data["report_data_sources"]: - for source_step in data["report_data_sources"][source_tool]: - if ( - source[1].lower() in source_tool.lower() - and source[2].lower() in source_step.lower() - ): - try: - return os.path.basename( - data["report_data_sources"][source_tool][source_step][sample] - ) - except KeyError: - # Deletes par orientation information from the sample name (insertSize metrics) - sample = sample.rsplit("_", 1)[0] - - return os.path.basename( - data["report_data_sources"][source_tool][source_step][sample] - ) - - -def extract_metrics_for_delivery(analysis_path, sequencing_type): - """Extracts the output metrics to be delivered""" - with open( - os.path.join(analysis_path, "qc", "multiqc_data", "multiqc_data.json"), "r" - ) as f: - raw_data = json.load(f) - - def extract(data, output_metrics, sample=None, source=None): - """Recursively fetch metrics information from nested multiQC JSON""" - if isinstance(data, dict): - for k in data: - if "umi" not in k: - if k in METRICS_TO_DELIVER[sequencing_type]: - output_metrics.append( - DeliveryMetricModel( - id=sample.split("_")[1], - input=get_multiqc_data_source(raw_data, sample, source), - name=k, - step=source, - value=data[k], - ).dict() - ) - extract(data[k], output_metrics, k, sample) - - return output_metrics - - return extract(raw_data["report_saved_raw_data"], []) + return MetricValidationModel(metrics=metrics).dict()["metrics"] diff --git a/BALSAMIC/utils/qc_report.py b/BALSAMIC/utils/qc_report.py deleted file mode 100644 index 405659821..000000000 --- a/BALSAMIC/utils/qc_report.py +++ /dev/null @@ -1,97 +0,0 @@ -from markdown import Markdown -from jinja2 import Environment, FileSystemLoader -from datetime import datetime -from pathlib import Path - -from BALSAMIC import __version__ as balsamic_version -from BALSAMIC.constants.quality_check_reporting import REPORT_MODEL - - -def report_data_population(collected_qc: dict, meta: dict, lang: str = "sv") -> dict: - """populates a metadata dictionary that contains qc and case/sample information""" - meta = { - **meta, - **{ - "title": "Kvalitetsrapport", - "subtitle": "Klinisk sekvensering av cancerprover", - "footnote": "Slut på rapporten", - "bioinformatic": f"BALSAMIC version {balsamic_version}", - "qc_table_content": {}, - "coverage_table_content": {}, - }, - } - - meta["qc_table_header"] = [v[lang] for x, v in 
REPORT_MODEL["qc"].items()] - meta["coverage_table_header"] = [ - v[lang] for x, v in REPORT_MODEL["coverage"].items() - ] - - for sample_id, analysis_results in collected_qc.items(): - lims_id = sample_id.split("_")[1] - sample_qc = [meta["sample_map"][lims_id], meta["sample_type"][lims_id]] - sample_cov = [meta["sample_map"][lims_id], meta["sample_type"][lims_id]] - - sample_qc = sample_qc + parse_collected_qc( - collected_qc=collected_qc, model_param="qc", sample_id=sample_id - ) - sample_cov = sample_cov + parse_collected_qc( - collected_qc=collected_qc, model_param="coverage", sample_id=sample_id - ) - - meta["qc_table_content"][lims_id] = sample_qc - meta["coverage_table_content"][lims_id] = sample_cov - - return meta - - -def parse_collected_qc(collected_qc: dict, model_param: str, sample_id: str) -> list: - """parses collect qc and returns model_param""" - parsed_qc = list() - - for qc_item, qc_value in REPORT_MODEL[model_param].items(): - decimal_point = qc_value["decimal"] - qc_to_report = collected_qc[sample_id][qc_item] - if "as_percent" in qc_value: - qc_to_report = qc_to_report * 100 - qc_to_report = str(round(qc_to_report, decimal_point)) - if "as_percent" in qc_value: - qc_to_report = f"{qc_to_report} %" - parsed_qc.append(qc_to_report) - - return parsed_qc - - -def render_html(meta: dict, html_out: str): - """renders html report from template""" - - p = Path(__file__).parents[1] - template_path = Path(p, "assets", "report_template").as_posix() - - report_body = render_body(meta=meta, template_path=template_path) - - md_template = Markdown(extensions=["meta", "tables", "def_list", "fenced_code"]) - - markdown_text = md_template.convert(source=report_body) - - env = Environment(loader=FileSystemLoader(template_path), autoescape=False) - - template = env.get_template("balsamic_report.html") - - html_report = template.render(body=markdown_text, meta=meta) - - with open(html_out, "w") as f: - f.write(html_report) - return html_out - - -def render_body( - meta: dict, template_path: str, body_template_md: str = "balsamic_report.md" -) -> str: - """renders text body of the report from a markdown template""" - env = Environment(loader=FileSystemLoader(template_path), autoescape=False) - - template = env.get_template(body_template_md) - - report_body = template.render(meta=meta) - - return report_body diff --git a/BALSAMIC/utils/rule.py b/BALSAMIC/utils/rule.py index 5a6144885..d7cb07d62 100644 --- a/BALSAMIC/utils/rule.py +++ b/BALSAMIC/utils/rule.py @@ -75,7 +75,7 @@ def get_variant_callers( WorkflowRunError if values are not valid """ - valid_variant_callers = set() + valid_variant_callers = list() if mutation_type not in MUTATION_TYPE: raise WorkflowRunError(f"{mutation_type} is not a valid mutation type.") @@ -99,7 +99,7 @@ def get_variant_callers( and workflow_solution in variant_caller_params.get("workflow_solution") and sequencing_type in variant_caller_params.get("sequencing_type") ): - valid_variant_callers.add(variant_caller_name) + valid_variant_callers.append(variant_caller_name) return list(valid_variant_callers) diff --git a/BALSAMIC/workflows/QC.smk b/BALSAMIC/workflows/QC.smk new file mode 100644 index 000000000..7e475e464 --- /dev/null +++ b/BALSAMIC/workflows/QC.smk @@ -0,0 +1,154 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 +import os +import logging +import tempfile + +from pathlib import Path +from yapf.yapflib.yapf_api import FormatFile + +from snakemake.exceptions import RuleException, WorkflowError + +from BALSAMIC.utils.exc import 
BalsamicError + +from BALSAMIC.utils.cli import (write_json, check_executable, generate_h5) + +from BALSAMIC.utils.models import BalsamicWorkflowConfig + +from BALSAMIC.utils.rule import (get_rule_output, get_result_dir, + get_sample_type, get_picard_mrkdup, get_script_path, + get_threads, get_sequencing_type, get_capture_kit) + +from BALSAMIC.constants.common import (RULE_DIRECTORY); +from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS + + +shell.executable("/bin/bash") +shell.prefix("set -eo pipefail; ") + +LOG = logging.getLogger(__name__) +logging.getLogger("filelock").setLevel("WARN") + +# Create a temporary directory with trailing / +tmp_dir = os.path.join(get_result_dir(config), "tmp", "" ) +Path.mkdir(Path(tmp_dir), exist_ok=True) + +benchmark_dir = config["analysis"]["benchmark"] +fastq_dir = get_result_dir(config) + "/fastq/" +bam_dir = get_result_dir(config) + "/bam/" +fastqc_dir = get_result_dir(config) + "/fastqc/" +result_dir = get_result_dir(config) + "/" +qc_dir = get_result_dir(config) + "/qc/" +delivery_dir = get_result_dir(config) + "/delivery/" + +singularity_image = config['singularity']['image'] + +# picarddup flag +picarddup = get_picard_mrkdup(config) + +# parse parameters as constants to workflows +params = BalsamicWorkflowConfig.parse_obj(WORKFLOW_PARAMS) + +# Capture kit name +if config["analysis"]["sequencing_type"] != "wgs": + capture_kit = os.path.split(config["panel"]["capture_kit"])[1] + +# Sample names for tumor or normal +tumor_sample = get_sample_type(config["samples"], "tumor")[0] +if "paired" in config['analysis']['analysis_type']: + normal_sample = get_sample_type(config["samples"], "normal")[0] + +# Set case id/name +case_id = config["analysis"]["case_id"] + +# explicitly check if cluster_config dict has zero keys. +if len(cluster_config.keys()) == 0: + cluster_config = config + +# Add reference assembly if not defined for backward compatibility +if 'genome_version' not in config["reference"]: + GENOME_VERSION = 'hg19' ## if hg19 convention works, replace accordingly + LOG.info('Genome version was not found in config. 
Setting it to %s', GENOME_VERSION) + + +# Set temporary dir environment variable +os.environ['TMPDIR'] = get_result_dir(config) + +analysis_type = config['analysis']["analysis_type"] + +rules_to_include = [ + "snakemake_rules/quality_control/fastp.rule", + "snakemake_rules/quality_control/fastqc.rule", + "snakemake_rules/quality_control/multiqc.rule", + "snakemake_rules/variant_calling/mergetype_tumor.rule", + "snakemake_rules/quality_control/picard.rule", + "snakemake_rules/quality_control/sambamba_depth.rule", + "snakemake_rules/quality_control/mosdepth.rule", + "snakemake_rules/align/bwa_mem.rule" +] + +if "paired" in config['analysis']['analysis_type']: + rules_to_include.append("snakemake_rules/variant_calling/mergetype_normal.rule") + + + +for r in rules_to_include: + include: Path(RULE_DIRECTORY, r).as_posix() +LOG.info(f"The following rules will be included in the workflow: {rules_to_include}") + +# Define common and analysis specific outputs +quality_control_results = [result_dir + "qc/" + "multiqc_report.html"] + +if 'delivery' in config: + wildcard_dict = {"sample": list(config["samples"].keys())+["tumor", "normal"], + "case_name": config["analysis"]["case_id"], + "allow_missing": True + } + + if 'rules_to_deliver' in config: + rules_to_deliver = config['rules_to_deliver'].split(",") + else: + rules_to_deliver = ['multiqc'] + + output_files_ready = [('path', 'path_index', 'step', 'tag', 'id', 'format')] + + for my_rule in set(rules_to_deliver): + try: + housekeeper_id = getattr(rules, my_rule).params.housekeeper_id + except (ValueError, AttributeError, RuleException, WorkflowError) as e: + LOG.warning("Cannot deliver step (rule) {}: {}".format(my_rule, e)) + continue + + LOG.info("Delivering step (rule) {} {}.".format(my_rule, housekeeper_id)) + files_to_deliver = get_rule_output(rules=rules, rule_name=my_rule, output_file_wildcards=wildcard_dict) + LOG.debug("The following files were added to delivery: {}".format(files_to_deliver)) + output_files_ready.extend(files_to_deliver) + + output_files_ready = [dict(zip(output_files_ready[0], value)) for value in output_files_ready[1:]] + delivery_ready = os.path.join(get_result_dir(config), + "delivery_report", + config["analysis"]["case_id"] + "_delivery_ready.hk") + write_json(output_files_ready, delivery_ready) + FormatFile(delivery_ready) + +rule all: + input: + quality_control_results + output: + finish_file = os.path.join(get_result_dir(config), "analysis_finish") + params: + tmp_dir = tmp_dir, + run: + import datetime + import shutil + + # Delete the temporary directory tree + try: + shutil.rmtree(params.tmp_dir) + except OSError as e: + print ("Error: %s - %s."
% (e.filename, e.strerror)) + + # Finish timestamp file + with open(str(output.finish_file), mode="w") as finish_file: + finish_file.write("%s\n" % datetime.datetime.now()) diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index f15fe175f..a36e91bff 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -13,7 +13,7 @@ from PyPDF2 import PdfFileMerger from BALSAMIC.utils.exc import BalsamicError -from BALSAMIC.utils.cli import (write_json, check_executable, generate_h5) +from BALSAMIC.utils.cli import (write_json, check_executable, generate_h5, read_yaml) from BALSAMIC.utils.models import VarCallerFilter, BalsamicWorkflowConfig @@ -27,7 +27,7 @@ from BALSAMIC.constants.common import (SENTIEON_DNASCOPE, SENTIEON_TNSCOPE, RULE_DIRECTORY, VCFANNO_TOML, MUTATION_TYPE); from BALSAMIC.constants.variant_filters import COMMON_SETTINGS,VARDICT_SETTINGS,SENTIEON_VARCALL_SETTINGS; from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS, VARCALL_PARAMS -from BALSAMIC.constants.workflow_rules import SNAKEMAKE_RULES +from BALSAMIC.constants.workflow_rules import SNAKEMAKE_RULES shell.executable("/bin/bash") @@ -40,6 +40,11 @@ logging.getLogger("filelock").setLevel("WARN") tmp_dir = os.path.join(get_result_dir(config), "tmp", "" ) Path.mkdir(Path(tmp_dir), exist_ok=True) +# Set case id/name +case_id = config["analysis"]["case_id"] + +# Directories +analysis_dir = config["analysis"]["analysis_dir"] + "/" +case_id + "/" benchmark_dir = config["analysis"]["benchmark"] fastq_dir = get_result_dir(config) + "/fastq/" bam_dir = get_result_dir(config) + "/bam/" @@ -50,8 +55,7 @@ vcf_dir = get_result_dir(config) + "/vcf/" vep_dir = get_result_dir(config) + "/vep/" qc_dir = get_result_dir(config) + "/qc/" delivery_dir = get_result_dir(config) + "/delivery/" - -umi_dir = get_result_dir(config) + "/umi/" +umi_dir = get_result_dir(config) + "/umi/" umi_qc_dir = qc_dir + "umi_qc/" singularity_image = config['singularity']['image'] @@ -76,9 +80,6 @@ tumor_sample = get_sample_type(config["samples"], "tumor")[0] if config['analysis']['analysis_type'] == "paired": normal_sample = get_sample_type(config["samples"], "normal")[0] -# Set case id/name -case_id = config["analysis"]["case_id"] - # explicitly check if cluster_config dict has zero keys. 
if len(cluster_config.keys()) == 0: cluster_config = config @@ -95,7 +96,7 @@ try: config["SENTIEON_TNSCOPE"] = SENTIEON_TNSCOPE config["SENTIEON_DNASCOPE"] = SENTIEON_DNASCOPE - + except KeyError as error: LOG.error("Set environment variables SENTIEON_LICENSE, SENTIEON_INSTALL_DIR, SENTIEON_EXEC " "to run SENTIEON variant callers") @@ -130,6 +131,8 @@ os.environ['TMPDIR'] = get_result_dir(config) # Extract variant callers for the workflow germline_caller = [] somatic_caller = [] +somatic_caller_cnv = [] +somatic_caller_sv = [] for m in MUTATION_TYPE: germline_caller_balsamic = get_variant_callers(config=config, analysis_type=config['analysis']['analysis_type'], @@ -145,7 +148,7 @@ for m in MUTATION_TYPE: sequencing_type=config["analysis"]["sequencing_type"], mutation_class="germline") - germline_caller = germline_caller + germline_caller_balsamic + germline_caller_sentieon + germline_caller = germline_caller + germline_caller_balsamic + germline_caller_sentieon somatic_caller_balsamic = get_variant_callers(config=config, @@ -170,6 +173,26 @@ for m in MUTATION_TYPE: mutation_class="somatic") somatic_caller = somatic_caller + somatic_caller_sentieon_umi + somatic_caller_balsamic + somatic_caller_sentieon +somatic_caller_sv = get_variant_callers(config=config, + analysis_type=config['analysis']['analysis_type'], + workflow_solution="BALSAMIC", + mutation_type="SV", + sequencing_type=config["analysis"]["sequencing_type"], + mutation_class="somatic") + +somatic_caller_cnv = get_variant_callers(config=config, + analysis_type=config['analysis']['analysis_type'], + workflow_solution="BALSAMIC", + mutation_type="CNV", + sequencing_type=config["analysis"]["sequencing_type"], + mutation_class="somatic") +somatic_caller_sv.remove("svdb") +svdb_callers_prio = somatic_caller_sv + somatic_caller_cnv + +for var_caller in svdb_callers_prio: + if var_caller in somatic_caller: + somatic_caller.remove(var_caller) + # Collect only snv callers for calculating tmb somatic_caller_tmb = [] for ws in ["BALSAMIC","Sentieon","Sentieon_umi"]: @@ -208,41 +231,82 @@ for r in rules_to_include: include: Path(RULE_DIRECTORY, r).as_posix() # Define common and analysis specific outputs -quality_control_results = [result_dir + "qc/" + "multiqc_report.html"] - -analysis_specific_results = [expand(vep_dir + "{vcf}.vcf.gz", - vcf=get_vcf(config, germline_caller, germline_call_samples)), - expand(vep_dir + "{vcf}.all.vcf.gz", - vcf=get_vcf(config, somatic_caller, [config["analysis"]["case_id"]]))] - +quality_control_results = [ + os.path.join(qc_dir,case_id + "_metrics_deliverables.yaml"), + os.path.join(qc_dir, "multiqc_report.html"), + os.path.join(qc_dir, "multiqc_data/multiqc_data.json") +] + +# Analysis results +analysis_specific_results = [] + +# Germline SNVs/SVs +analysis_specific_results.extend( + expand(vep_dir + "{vcf}.vcf.gz", vcf=get_vcf(config, germline_caller, germline_call_samples)) +) + +# Raw VCFs +analysis_specific_results.extend( + expand(vcf_dir + "{vcf}.vcf.gz", vcf=get_vcf(config, somatic_caller, [case_id])) +) + +# Filtered and passed post annotation VCFs +analysis_specific_results.extend( + expand(vep_dir + "{vcf}.all.filtered.pass.vcf.gz", vcf=get_vcf(config, somatic_caller, [case_id])) +) + +# TMB +analysis_specific_results.extend( + expand(vep_dir + "{vcf}.balsamic_stat", vcf=get_vcf(config, somatic_caller_tmb, [case_id])) +) + +# TGA specific files if config["analysis"]["sequencing_type"] != "wgs": - analysis_specific_results.append(expand(vep_dir + "{vcf}.all.filtered.pass.ranked.vcf.gz", - 
vcf=get_vcf(config, ["vardict"], [config["analysis"]["case_id"]]))) - - analysis_specific_results.append(expand(umi_qc_dir + "{sample}.umi.mean_family_depth", sample=config["samples"])) - + # CNVkit + analysis_specific_results.append(cnv_dir + "tumor.merged.cns") + analysis_specific_results.extend(expand(cnv_dir + "tumor.merged-{plot}", plot=["diagram.pdf", "scatter.pdf"])) + analysis_specific_results.append(cnv_dir + case_id +".gene_metrics") + # vcf2cytosure + analysis_specific_results.extend(expand( + vcf_dir + "CNV.somatic.{case_name}.{var_caller}.vcf2cytosure.cgh", + case_name=case_id, + var_caller=["cnvkit"] + )) + # VarDict + analysis_specific_results.extend( + expand(vep_dir + "{vcf}.all.filtered.pass.ranked.vcf.gz", vcf=get_vcf(config, ["vardict"], [case_id])) + ) + # UMI + analysis_specific_results.extend(expand(umi_qc_dir + "{sample}.umi.mean_family_depth",sample=config["samples"])) if background_variant_file: - analysis_specific_results.extend([expand(umi_qc_dir + "{case_name}.{var_caller}.AFtable.txt", - case_name=config["analysis"]["case_id"], - var_caller=["TNscope_umi"])]), - -#Calculate TMB per somatic variant caller -analysis_specific_results.extend(expand(vep_dir + "{vcf}.balsamic_stat", - vcf=get_vcf(config, somatic_caller_tmb, [config["analysis"]["case_id"]]))) - -#Gather all the filtered and PASSed variants post annotation -analysis_specific_results.extend([expand(vep_dir + "{vcf}.all.filtered.pass.vcf.gz", - vcf=get_vcf(config, somatic_caller, [config["analysis"]["case_id"]]))]) - -LOG.info(f"Following outputs will be delivered {analysis_specific_results}") + analysis_specific_results.extend( + expand(umi_qc_dir + "{case_name}.{var_caller}.AFtable.txt", case_name=case_id, var_caller=["TNscope_umi"]) + ) +# AscatNgs +if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "paired": + analysis_specific_results.extend( + expand(vcf_dir + "{vcf}.output.pdf", vcf=get_vcf(config, ["ascat"], [case_id])) + ) + analysis_specific_results.extend( + expand(vcf_dir + "{vcf}.copynumber.txt.gz", vcf=get_vcf(config, ["ascat"], [case_id])) + ) + +# Delly CNV +if config['analysis']['analysis_type'] == "single": + analysis_specific_results.extend( + expand(vcf_dir + "{vcf}.cov.gz",vcf=get_vcf(config,["dellycnv"],[case_id])) + ) + +# Dragen if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "single": if "dragen" in config: - analysis_specific_results.extend([Path(result_dir, "dragen", "SNV.somatic." + config["analysis"]["case_id"] + ".dragen_tumor.bam").as_posix(), - Path(result_dir, "dragen", "SNV.somatic." + config["analysis"]["case_id"] + ".dragen.vcf.gz").as_posix()]) + analysis_specific_results.extend([ + Path(result_dir, "dragen", "SNV.somatic." + case_id + ".dragen_tumor.bam").as_posix(), + Path(result_dir, "dragen", "SNV.somatic." 
+ case_id + ".dragen.vcf.gz").as_posix() + ]) -if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "paired": - analysis_specific_results.append(expand(vcf_dir + "{vcf}.output.pdf", vcf=get_vcf(config, ["ascat"], [config["analysis"]["case_id"]]))) +LOG.info(f"Following outputs will be delivered {analysis_specific_results}") if 'benchmark_plots' in config: log_dir = config["analysis"]["log"] @@ -252,7 +316,7 @@ if 'benchmark_plots' in config: # Make individual plot per job for log_file in Path(log_dir).glob("*.err"): log_file_list = log_file.name.split(".") - job_name = ".".join(log_file_list[0:4]) + job_name = ".".join(log_file_list[0:4]) job_id = log_file_list[4].split("_")[1] h5_file = generate_h5(job_name, job_id, log_file.parent) benchmark_plot = Path(benchmark_dir, job_name + ".pdf") @@ -274,20 +338,20 @@ if 'benchmark_plots' in config: for plots in my_rule_plots: plots.unlink() - - if 'delivery' in config: - wildcard_dict = {"sample": list(config["samples"].keys())+["tumor", "normal"], - "case_name": config["analysis"]["case_id"], - "allow_missing": True - } + wildcard_dict = { + "sample": list(config["samples"].keys())+["tumor", "normal"], + "case_name": case_id, + "allow_missing": True + } if config['analysis']["analysis_type"] in ["paired", "single"]: - wildcard_dict.update({"var_type": ["CNV", "SNV", "SV"], - "var_class": ["somatic", "germline"], - "var_caller": somatic_caller + germline_caller, - "bedchrom": config["panel"]["chrom"] if "panel" in config else [], - }) + wildcard_dict.update({ + "var_type": ["CNV", "SNV", "SV"], + "var_class": ["somatic", "germline"], + "var_caller": somatic_caller + germline_caller, + "bedchrom": config["panel"]["chrom"] if "panel" in config else [], + }) if 'rules_to_deliver' in config: rules_to_deliver = config['rules_to_deliver'].split(",") @@ -309,9 +373,7 @@ if 'delivery' in config: output_files_ready.extend(files_to_deliver) output_files_ready = [dict(zip(output_files_ready[0], value)) for value in output_files_ready[1:]] - delivery_ready = os.path.join(get_result_dir(config), - "delivery_report", - config["analysis"]["case_id"] + "_delivery_ready.hk") + delivery_ready = os.path.join(get_result_dir(config), "delivery_report", case_id + "_delivery_ready.hk") write_json(output_files_ready, delivery_ready) FormatFile(delivery_ready) @@ -319,23 +381,18 @@ rule all: input: quality_control_results + analysis_specific_results output: - qc_json_file = os.path.join(get_result_dir(config), "qc", "qc_metrics_summary.json"), finish_file = os.path.join(get_result_dir(config), "analysis_finish") params: - tmp_dir = tmp_dir, - result_dir = result_dir, - sequencing_type = get_sequencing_type(config), - panel_bed = get_capture_kit(config) + tmp_dir = tmp_dir run: import datetime import shutil - from BALSAMIC.utils.qc_metrics import get_qc_metrics_json + from BALSAMIC.utils.qc_metrics import validate_qc_metrics - # Save QC metrics to a JSON file + # Perform validation of extracted QC metrics try: - qc_metrics_summary = get_qc_metrics_json(params.result_dir, params.sequencing_type, params.panel_bed) - write_json(qc_metrics_summary, str(output.qc_json_file)) + validate_qc_metrics(read_yaml(input[0])) except ValueError as val_exc: LOG.error(val_exc) raise BalsamicError diff --git a/BALSAMIC/workflows/reference-canfam3.smk b/BALSAMIC/workflows/reference-canfam3.smk new file mode 100644 index 000000000..91699c777 --- /dev/null +++ b/BALSAMIC/workflows/reference-canfam3.smk @@ -0,0 +1,217 @@ +# syntax=python tabstop=4 
expandtab +# coding: utf-8 + +import os +import logging +from pathlib import Path + +from copy import deepcopy + +from BALSAMIC.utils.rule import get_script_path +from BALSAMIC.utils.rule import get_reference_output_files +from BALSAMIC.utils.models import ReferenceMeta +from BALSAMIC.constants.reference import REFERENCE_FILES as REFERENCE_MODEL +from BALSAMIC.utils.cli import get_md5 +from BALSAMIC.utils.cli import create_md5 + +LOG = logging.getLogger(__name__) + +# explicitly check if cluster_config dict has zero keys. +if len(cluster_config.keys()) == 0: + cluster_config = config + +genome_ver = config['genome_version'] + +# essential path reference files +basedir = os.path.join(config['output']) +genome_dir = os.path.join(basedir, "genome") + +# Set temporary dir environment variable +os.environ['TMPDIR'] = basedir + +REFERENCE_FILES = deepcopy(REFERENCE_MODEL) + +# initialize reference files +REFERENCE_FILES[genome_ver]['basedir'] = basedir +reference_file_model = ReferenceMeta.parse_obj(REFERENCE_FILES[genome_ver]) +reference_genome_url = reference_file_model.reference_genome +genome_chrom_size_url = reference_file_model.genome_chrom_size +refgene_txt_url = reference_file_model.refgene_txt +refgene_sql_url = reference_file_model.refgene_sql + +check_md5 = os.path.join(basedir, "reference.json.md5") + +shell.executable("/bin/bash") +shell.prefix("set -eo pipefail; ") + +singularity_image_path = config['singularity']['image_path'] +singularity_images = [Path(singularity_image_path, image_name + ".sif").as_posix() for image_name in config["singularity"]["containers"].keys()] + +########################################################## +# Generating Reference files for BALSAMIC pipeline +# Writing reference json file +########################################################## + +rule all: + input: + singularity_images, + reference_genome = reference_genome_url.get_output_file, + bwa_index = expand(reference_genome_url.get_output_file + "{ext}", ext=['.amb','.ann','.bwt','.pac','.sa']), + refgenome_fai = reference_genome_url.get_output_file + ".fai", + refgenome_dict = reference_genome_url.get_output_file.replace("fasta","dict"), + refseq_bed = refgene_txt_url.get_output_file.replace("txt", "flat") + ".bed", + refseq_flat = refgene_txt_url.get_output_file.replace("txt", "flat"), + refgene = refgene_txt_url.get_output_file, + genome_chrom_size = genome_chrom_size_url.get_output_file, + output: + finished = os.path.join(basedir,"reference.finished"), + reference_json = os.path.join(basedir, "reference.json"), + check_md5 = check_md5 + log: + os.path.join(basedir, "reference.json.log") + run: + import json + from datetime import datetime + + today = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + ref_json = dict() + ref_json['reference'] = { + "reference_genome": input.reference_genome, + "exon_bed": input.refseq_bed, + "refflat": input.refseq_flat, + "refGene": input.refgene, + "genome_chrom_size": input.genome_chrom_size, + "reference_access_date": today, + } + + with open(str(output.reference_json), "w") as fh: + json.dump(ref_json, fh, indent=4) + + create_md5(ref_json['reference'], output.check_md5) + + with open(str(output.finished), mode='w') as finish_file: + finish_file.write('%s\n' % today ) + +########################################################### +# Download all singularity container images from dockerhub +########################################################### + +rule download_container: + output: singularity_images + run: + for image_name, docker_path in
config["singularity"]["containers"].items(): + cmd = "singularity pull {}/{}.sif {}".format(config["singularity"]["image_path"], image_name, docker_path) + shell(cmd) + +########################################################## +# Download the reference genome, variant db +########################################################## +download_content = [reference_genome_url, genome_chrom_size_url, refgene_txt_url, refgene_sql_url] + +rule download_reference: + output: + expand("{output}", output=[ref.get_output_file for ref in download_content]) + run: + import requests + + for ref in download_content: + output_file = ref.get_output_file + log_file = output_file + ".log" + + cmd = "wget -a {} -O - {}".format(log_file, ref.url) + + if ref.gzip: + cmd += " | gunzip " + + cmd += " > {}".format(output_file) + shell(cmd) + ref.write_md5 + +########################################################## +# Preprocess refseq file by fetching relevant columns and +# standardize the chr column +########################################################## + +rule prepare_refgene: + input: + singularity_images, + refgene_txt = refgene_txt_url.get_output_file, + refgene_sql = refgene_sql_url.get_output_file, + params: + refgene_sql_awk = get_script_path('refseq_sql.awk'), + output: + refflat = refgene_txt_url.get_output_file.replace("txt", "flat"), + bed = refgene_txt_url.get_output_file.replace("txt", "flat") + ".bed", + log: + refgene_sql = os.path.join(basedir, "genome", "refgene_sql.log"), + refgene_txt = os.path.join(basedir, "genome", "refgene_txt.log") + singularity: Path(singularity_image_path, config["bioinfo_tools"].get("bedtools") + ".sif").as_posix() + shell: + """ +header=$(awk -f {params.refgene_sql_awk} {input.refgene_sql}); +(echo \"$header\"; cat {input.refgene_txt};) \ +| csvcut -t -c chrom,exonStarts,exonEnds,name,score,strand,exonCount,txStart,txEnd,name2 \ +| csvformat -T \ +| bedtools expand -c 2,3 \ +| awk '$1~/chr[1-9]/ && $1!~/[_]/' | sort -k1,1 -k2,2n > {output.bed}; + +awk -v OFS=\"\\t\" '$3!~/_/ {{ gsub(\"chr\",\"chr\",$3); $1=$13; print }}' {input.refgene_txt} \ +| cut -f 1-11 > {output.refflat}; + """ + +########################################################## +# Create BWA Index for reference genome +########################################################## + +rule bwa_index: + input: + singularity_img = singularity_images, + reference_genome = reference_genome_url.get_output_file + output: + expand(reference_genome_url.get_output_file + "{ext}", ext=['.amb','.ann','.bwt','.pac','.sa']) + log: + reference_genome_url.get_output_file + ".bwa_index.log" + singularity: Path(singularity_image_path, config["bioinfo_tools"].get("bwa") + ".sif").as_posix() + shell: + """ +bwa index -a bwtsw {input.reference_genome} 2> {log}; + """ + +########################################################## +# Create index for fasta file - .fai +########################################################## + +rule samtools_index_fasta: + input: + singularity_img = singularity_images, + reference_genome = reference_genome_url.get_output_file + output: + reference_genome_url.get_output_file + ".fai" + log: + reference_genome_url.get_output_file + ".faidx.log" + singularity: Path(singularity_image_path, config["bioinfo_tools"].get("samtools") + ".sif").as_posix() + shell: + """ +samtools faidx {input.reference_genome} 2> {log}; + """ + + +########################################################## +# create reference dictionary using picard +########################################################## + 
+rule picard_ref_dict: + input: + singularity_img = singularity_images, + reference_genome = reference_genome_url.get_output_file + output: + reference_genome_url.get_output_file.replace("fasta","dict") + log: + reference_genome_url.get_output_file + ".ref_dict.log" + singularity: Path(singularity_image_path, config["bioinfo_tools"].get("picard") + ".sif").as_posix() + shell: + """ +picard CreateSequenceDictionary REFERENCE={input.reference_genome} OUTPUT={output} 2> {log}; + """ + diff --git a/BALSAMIC/workflows/reference.smk b/BALSAMIC/workflows/reference.smk index 81ba9d0e4..6d5cfa0eb 100644 --- a/BALSAMIC/workflows/reference.smk +++ b/BALSAMIC/workflows/reference.smk @@ -2,7 +2,6 @@ # coding: utf-8 import os -import hashlib import logging from pathlib import Path @@ -12,6 +11,9 @@ from BALSAMIC.utils.rule import get_script_path from BALSAMIC.utils.rule import get_reference_output_files from BALSAMIC.utils.models import ReferenceMeta from BALSAMIC.constants.reference import REFERENCE_FILES as REFERENCE_MODEL +from BALSAMIC.utils.cli import get_md5 +from BALSAMIC.utils.cli import create_md5 + LOG = logging.getLogger(__name__) @@ -64,6 +66,9 @@ refgene_sql_url = reference_file_model.refgene_sql rankscore_url = reference_file_model.rankscore access_regions_url = reference_file_model.access_regions delly_exclusion_url = reference_file_model.delly_exclusion +delly_mappability_url = reference_file_model.delly_mappability +delly_mappability_gindex_url = reference_file_model.delly_mappability_gindex +delly_mappability_findex_url = reference_file_model.delly_mappability_findex ascat_gccorrection_url = reference_file_model.ascat_gccorrection ascat_chryloci_url = reference_file_model.ascat_chryloci clinvar_url = reference_file_model.clinvar @@ -76,21 +81,6 @@ check_md5 = os.path.join(basedir, "reference.json.md5") shell.executable("/bin/bash") shell.prefix("set -eo pipefail; ") -def get_md5(filename): - hash_md5 = hashlib.md5() - with open(str(filename), 'rb') as fh: - for chunk in iter(lambda: fh.read(4096), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() - - -def create_md5(reference, check_md5): - """ create a md5 file for all reference data""" - with open(check_md5, 'w') as fh: - for key, value in reference.items(): - if os.path.isfile(value): - fh.write( get_md5(value) + ' ' + value + '\n') - singularity_image_path = config['singularity']['image_path'] singularity_images = [Path(singularity_image_path, image_name + ".sif").as_posix() for image_name in config["singularity"]["containers"].keys()] @@ -125,6 +115,9 @@ rule all: access_regions = access_regions_url.get_output_file, delly_exclusion = delly_exclusion_url.get_output_file, delly_exclusion_converted = delly_exclusion_url.get_output_file.replace(".tsv", "_converted.tsv"), + delly_mappability= delly_mappability_url.get_output_file, + delly_mappability_gindex= delly_mappability_gindex_url.get_output_file, + delly_mappability_findex= delly_mappability_findex_url.get_output_file, ascat_gccorrection = ascat_gccorrection_url.get_output_file, ascat_chryloci = ascat_chryloci_url.get_output_file, clinvar = clinvar_url.get_output_file + ".gz", @@ -160,6 +153,7 @@ rule all: "access_regions": input.access_regions, "delly_exclusion" : input.delly_exclusion, "delly_exclusion_converted" : input.delly_exclusion_converted, + "delly_mappability": input.delly_mappability, "ascat_gccorrection" : input.ascat_gccorrection, "ascat_chryloci" : input.ascat_chryloci, "clinvar": input.clinvar, @@ -193,7 +187,8 @@ download_content = 
[reference_genome_url, dbsnp_url, hc_vcf_1kg_url, wgs_calling_url, genome_chrom_size_url, gnomad_url, gnomad_tbi_url, cosmicdb_url, refgene_txt_url, refgene_sql_url, rankscore_url, access_regions_url, - delly_exclusion_url, ascat_gccorrection_url, ascat_chryloci_url, clinvar_url] + delly_exclusion_url, delly_mappability_url, delly_mappability_gindex_url, + delly_mappability_findex_url, ascat_gccorrection_url, ascat_chryloci_url, clinvar_url] rule download_reference: output: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f252fb514..a3e1f4528 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,67 @@ +[9.0.0] +======= + +Added: +^^^^^^ + +* Snakemake workflow to create canfam3 reference #843 +* Call umi variants using TNscope in bed defined regions #821 +* UMI duplication metrics to report in multiqc_picard_dups.json #844 +* Option to use PON reference in cnv calling for TGA tumor-only cases +* QC default validation conditions (for not defined capture kits) #855 +* SVdb to the varcall_py36 container #872 +* SVdb to WGS workflow #873 +* Docker container for vcf2cytosure #858 +* Snakemake rule for creating `.cgh` files from `CNVkit` outputs #858 +* SVdb to TGA workflow #879 +* SVdb merge SV and CNV #886 +* Readthedocs for BALSAMIC method descriptions #892 +* Readthedocs for BALSAMIC variant filters for WGS somatic callers #892 +* bcftools counts to varcall filter rules #898 +* Additional WGS metrics to be stored in ``_metrics_deliverables.yaml`` #907 +* ascatNGS copynumber file #914 +* ReadtheDocs for BALSAMIC annotation resources #916 +* Delly CNV for tumor only workflow #923 +* Delly CNV Read-depth profiles for tumor only workflows #924 +* New metric to be extracted and validated: ``NUMBER_OF_SITES`` (``bcftools`` counts) #925 + +Changed: +^^^^^^^^ + +* Merge QC metric extraction workflows #833 +* Changed the base-image for balsamic container to 4.10.3-alpine #869 +* Updated SVdb to 2.6.0 #871 +* Upgrade black to 22.3.0 +* For UMI workflow, post filter `gnomad_pop_freq` value is changed from `0.005` to `0.02` #919 +* Updated delly to 0.9.1 #920 +* Updated container base_image (align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py36) to 4.10.3-alpine #921 +* Updated container (align_qc, annotate, coverage_qc, varcall_cnvkit, varcall_py36) bioinfo tool versions #921 +* Updated tool versions (align_qc, annotate, coverage_qc, varcall_cnvkit) in methods and software docs #921 +* Updated the list of files to be stored and delivered #848 +* Moved ``collect_custom_qc_metrics`` rule from ``multiqc.rule`` #925 + +Fixed: +^^^^^^ +* Automate balsamic version for readthedocs install page #888 +* ``collect_qc_metrics.py`` failing for WGS cases with empty ``capture_kit`` argument #850 +* QC metric validation for different panel bed version #855 +* Fixed development version of ``fpdf2`` to ``2.4.6`` #878 +* Added missing svdb index file #848 + +Removed: +^^^^^^^^ + +* ``--qc-metrics/--no-qc-metrics`` flag from the ``balsamic report deliver`` command #833 +* Unused pon option for SNV calling with TNhaplotyper tumor-only +* SV and CNV callers from annotation and filtering #889 +* vcfanno and COSMIC from SV annotation #891 +* Removed `MSK_impact` and `MSK_impact_noStrelka` json files from config +* Cleanup of `strelka`, `pindel`, `mutect2` variables from BALSAMIC +* bcftools_stats from vep #898 +* QC delivery report workflow (generating the ``_qc_report.html`` file) #878 +* ``--sample-id-map`` and ``--case-id-map`` flags from the ``balsamic report deliver`` command #878 +* `gatk_haplotypecaller` for
reporting panel germline variants #918 + [8.2.10] -------- @@ -24,6 +88,7 @@ Removed: Added: ^^^^^^ + * Added slurm qos tag `express` #885 * Included more text about UMI-workflow variant calling settings to the readthedocs #888 * Extend QCModel to include `n_base_limit` which outputs in config json `QC` dict @@ -38,12 +103,12 @@ Changed: * fastp default setting of `n_base_limit` is changed to `50` from `5` [8.2.8] -------- +-------- Added: ^^^^^^ * Added the readthedocs page for BALSAMIC variant-calling filters #867 -* Project requirements (setup.py) to build the docs #874 +* Project requirements (setup.py) to build the docs #874 * Generate cram from umi-consensus called bam files #865 Changed: @@ -72,13 +137,9 @@ Fixes: * Set right qos in scheduler command #856 - [8.2.5] ------- -Added: -^^^^^^ - * balsamic.sif container installation during cache generation #841 Fixed: @@ -100,11 +161,11 @@ Added: Fixed: ^^^^^^ -* Add default for gender if ``purecn`` captures dual gender values #824 +* Add default for gender if ``purecn`` captures dual gender values #824 Changed: ^^^^^^^^ -* Updated ``purecn`` and its dependencies to latest versions +* Updated ``purecn`` and its dependencies to latest versions [8.2.2] ------- @@ -134,7 +195,7 @@ Added: * Added various basic filters to all variant callers irregardless of their delivery status #750 * BALSAMIC container #728 -* BALSAMIC reference generation via cluster submission for both reference and container #686 +* BALSAMIC reference generation via cluster submission for both reference and container #686 * Container specific tests #770 * BALSAMIC quality control metrics extraction and validation #754 * Delly is added as a submodule and removed from rest of the conda environments #787 @@ -172,12 +233,12 @@ Fixed: * Bumped version for ``bcftools`` in cnvkit container * Fixed issues #776 and #777 with correct install paths for gatk and manta * Fixed issue #782 for missing AF in the vcf INFO field -* Fixed issues #748 #749 with correct sample names +* Fixed issues #748 #749 with correct sample names * Fixed issue #767 for ascatngs hardcoded values -* Fixed missing output option in bcftools filters for tnhaplotyper #793 +* Fixed missing output option in bcftools filters for tnhaplotyper #793 * Fixed issue #795 with increasing resources for vep and filter SV prior to vep * Building ``wheel`` for ``cryptography`` bug inside BALSAMIC container #801 -* Fixed badget for docker container master and develop status +* Fixed badge for docker container master and develop status * ReadtheDocs building failure due to dependencies, fixed by locking versions #773 * Dev requirements installation for Sphinx docs (Github Action) #812 * Changed path for main Dockerfile version in ``.bumpversion.cfg`` @@ -191,7 +252,7 @@ Added: * Workflow to check PR tiltes to make easier to tell PR intents #724 * ``bcftools stats`` to calculate Ti/Tv for all post annotate germline and somatic calls #93 * Added reference download date to ``reference.json`` #726 -* ``ascatngs`` hg38 references to constants #683 +* ``ascatngs`` hg38 references to constants #683 * Added ClinVar as a source to download and to be annotated with VCFAnno #737 Changed: @@ -253,7 +314,7 @@ Added: * Individual rules (i.e. ngs filters) for cnv and sv callers. Only Manta will be delivered and added to the list of output files. #708 * Added "targeted" and "wgs" tags to variant callers to provide another layer of separation.
#708 * ``manta`` convert inversion #709 -* Sentieon version to bioinformatic tool version parsing #685 +* Sentieon version to bioinformatic tool version parsing #685 * added ``CITATION.cff`` to cite BALSAMIC @@ -262,9 +323,9 @@ Changed: * Upgrade to latest sentieon version 202010.02 * New name ``MarkDuplicates`` to ``picard_markduplicates`` in ``bwa_mem`` rule and ``cluster.json`` -* New name rule ``GATK_contest`` to ``gatk_contest`` +* New name rule ``GATK_contest`` to ``gatk_contest`` * Avoid running pytest github actions workflow on ``docs/**`` and ``CHANGELOG.rst`` changes -* Updated ``snakemake`` to ``v6.5.3`` #501 +* Updated ``snakemake`` to ``v6.5.3`` #501 * Update ``GNOMAD`` URL * Split Tumor-only ``cnvkit batch`` into individual commands * Improved TMB calculation issue #51 @@ -282,7 +343,7 @@ Fixed: * post-processing of the umi consensus in handling BI tags * vcf-filtered-clinical tag files will have all variants including PASS * Refactor snakemake ``annotate`` rules according to snakemake etiquette #636 -* Refactor snakemake ``align`` rules according to snakemake etiquette #636 +* Refactor snakemake ``align`` rules according to snakemake etiquette #636 * Refactor snakemake ``fastqc`` ``vep`` contest and ``mosdepth`` rules according to ``snakemake`` etiquette #636 * Order of columns in QC and coverage report issue #601 * ``delly`` not showing in workflow at runtime #644 @@ -478,7 +539,7 @@ Fixed: ^^^^^^ * umi_workflow config json is set as true for panel and wgs as false. -* Rename umiconsensus bam file headers from {samplenames} to TUMOR/NORMAL. +* Rename umiconsensus bam file headers from {samplenames} to TUMOR/NORMAL. * Documentation autobuild on RTFD @@ -502,7 +563,7 @@ Removed Fixed ^^^^^ -* Fixed issue 577 with missing ``tumor.merged.bam`` and ``normal.merged.bam`` +* Fixed issue 577 with missing ``tumor.merged.bam`` and ``normal.merged.bam`` * Issue 448 with lingering tmp_dir. It is not deleted after analysis is properly finished. Changed @@ -562,7 +623,7 @@ Changed * Update FastQC to 0.11.9 PR #532 * Update BCFTools to 1.11 PR #537 * Update Samtools to 1.11 PR #537 -* Increase resources and runtime for various workflows in PRs #482 +* Increase resources and runtime for various workflows in PRs #482 * Python package dependenicies versions fixed in PR #480 * QoL changes to workflow in series of PR #471 * Series of documentation updates in PRs #489 #553 diff --git a/container_tests/varcall_py36/varcall_py36.sh b/container_tests/varcall_py36/varcall_py36.sh index 50d551eac..3011d1fa2 100644 --- a/container_tests/varcall_py36/varcall_py36.sh +++ b/container_tests/varcall_py36/varcall_py36.sh @@ -1,7 +1,7 @@ #!/bin/bash # Test if commands exist -valid_commands=( "bcftools" "samtools" "tabix" "vardict" "vardict-java" ) +valid_commands=( "bcftools" "samtools" "tabix" "vardict" "vardict-java" "svdb") for valid_command in "${valid_commands[@]}" do diff --git a/container_tests/vcf2cytosure/vcf2cytosure.sh b/container_tests/vcf2cytosure/vcf2cytosure.sh new file mode 100644 index 000000000..e953274c4 --- /dev/null +++ b/container_tests/vcf2cytosure/vcf2cytosure.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Test if commands exist + +valid_commands=( "pip" "vcf2cytosure" ) + +for valid_command in "${valid_commands[@]}" +do + if ! 
command -v "${valid_command}" &> /dev/null + then + echo "${valid_command} could not be found" + exit 1 + else + echo "${valid_command} command is found and valid" + fi +done diff --git a/docs/balsamic_annotation.rst b/docs/balsamic_annotation.rst new file mode 100644 index 000000000..dd51e1dcc --- /dev/null +++ b/docs/balsamic_annotation.rst @@ -0,0 +1,254 @@ +*********************************** +BALSAMIC Annotation Resources +*********************************** + +BALSAMIC annotates somatic single nucleotide variants (SNVs) using ``ensembl-vep`` and ``vcfanno``. Somatic structural variants (SVs), somatic copy-number variants (CNVs) and germline single nucleotide variants are annotated using only ``ensembl-vep``. All SVs and CNVs are merged using ``SVDB`` before annotating for `Target Genome Analysis (TGA)` or `Whole Genome Sequencing (WGS)` analyses. + +`BALSAMIC` adds the following annotations from the `gnomAD` database using ``vcfanno``. + +.. list-table:: gnomAD + :widths: 50 50 + :header-rows: 1 + + * - VCF tag + - description + * - GNOMADAF_popmax + - maximum allele frequency across populations + * - GNOMADAF + - overall allele frequency in the gnomAD database + +`BALSAMIC` adds the following annotations from the `ClinVar` database using ``vcfanno``. + +.. list-table:: ClinVar + :widths: 50 50 + :header-rows: 1 + + * - VCF tag + - description + * - CLNACC + - Variant Accession and Versions + * - CLNREVSTAT + - ClinVar review status for the Variation ID + * - CLNSIG + - Clinical significance for this single variant + * - CLNVCSO + - Sequence Ontology id for variant type + * - CLNVC + - Variant type + * - ORIGIN + - Allele origin + +The values for `ORIGIN` are described below: + +.. list-table:: ORIGIN + :widths: 25 25 + :header-rows: 1 + + * - Value + - Annotation + * - 0 + - unknown + * - 1 + - germline + * - 2 + - somatic + * - 4 + - inherited + * - 8 + - paternal + * - 16 + - maternal + * - 32 + - *de-novo* + * - 64 + - biparental + * - 128 + - uniparental + * - 256 + - not-tested + * - 512 + - tested-inconclusive + * - 1073741824 + - other + +`BALSAMIC` uses `ensembl-vep` to add the following annotations from the `COSMIC` database. + +.. list-table:: COSMIC + :widths: 50 50 + :header-rows: 1 + + * - VCF tag + - description + * - COSMIC_CDS + - CDS annotation + * - COSMIC_GENE + - gene name + * - COSMIC_STRAND + - strand + * - COSMIC_CNT + - number of samples with this mutation in the `COSMIC` database + * - COSMIC_AA + - peptide annotation + + +Where relevant, `BALSAMIC` uses `ensembl-vep` to annotate somatic and germline SNVs and somatic SVs/CNVs from `1000genomes (phase3)`, `ClinVar`, `ESP`, `HGMD-PUBLIC`, `dbSNP`, `gencode`, `gnomAD`, `polyphen`, `refseq`, and `sift` databases. +The following annotations are added by `ensembl-vep`. + +.. list-table:: ensembl-vep + :widths: 10 60 + :header-rows: 1 + + * - Annotation + - description + * - Allele + - the variant allele used to calculate the consequence + * - Gene + - Ensembl stable ID of affected gene + * - Feature + - Ensembl stable ID of feature + * - Feature type + - type of feature. Currently one of Transcript, RegulatoryFeature, MotifFeature.
+ * - Consequence + - consequence type of this variant + * - Position in cDNA + - relative position of base pair in cDNA sequence + * - Position in CDS + - relative position of base pair in coding sequence + * - Position in protein + - relative position of amino acid in protein + * - Amino acid change + - only given if the variant affects the protein-coding sequence + * - Codon change + - the alternative codons with the variant base in upper case + * - Co-located variation + - identifier of any existing variants + * - VARIANT_CLASS + - Sequence Ontology variant class + * - SYMBOL + - the gene symbol + * - SYMBOL_SOURCE + - the source of the gene symbol + * - STRAND + - the DNA strand (1 or -1) on which the transcript/feature lies + * - ENSP + - the Ensembl protein identifier of the affected transcript + * - FLAGS + - | transcript quality flags: + | cds_start_NF: CDS 5' incomplete + | cds_end_NF: CDS 3' incomplete + * - SWISSPROT + - Best match UniProtKB/Swiss-Prot accession of protein product + * - TREMBL + - Best match UniProtKB/TrEMBL accession of protein product + * - UNIPARC + - Best match UniParc accession of protein product + * - HGVSc + - the HGVS coding sequence name + * - HGVSp + - the HGVS protein sequence name + * - HGVSg + - the HGVS genomic sequence name + * - HGVS_OFFSET + - Indicates by how many bases the HGVS notations for this variant have been shifted + * - SIFT + - the SIFT prediction and/or score, with both given as prediction(score) + * - PolyPhen + - the PolyPhen prediction and/or score + * - MOTIF_NAME + - The source and identifier of a transcription factor binding profile aligned at this position + * - MOTIF_POS + - The relative position of the variation in the aligned TFBP + * - HIGH_INF_POS + - A flag indicating if the variant falls in a high information position of a transcription factor binding profile (TFBP) + * - MOTIF_SCORE_CHANGE + - The difference in motif score of the reference and variant sequences for the TFBP + * - CANONICAL + - a flag indicating if the transcript is denoted as the canonical transcript for this gene + * - CCDS + - the CCDS identifier for this transcript, where applicable + * - INTRON + - the intron number (out of total number) + * - EXON + - the exon number (out of total number) + * - DOMAINS + - the source and identifier of any overlapping protein domains + * - DISTANCE + - Shortest distance from variant to transcript + * - AF + - Frequency of existing variant in 1000 Genomes + * - AFR_AF + - Frequency of existing variant in 1000 Genomes combined African population + * - AMR_AF + - Frequency of existing variant in 1000 Genomes combined American population + * - EUR_AF + - Frequency of existing variant in 1000 Genomes combined European population + * - EAS_AF + - Frequency of existing variant in 1000 Genomes combined East Asian population + * - SAS_AF + - Frequency of existing variant in 1000 Genomes combined South Asian population + * - AA_AF + - Frequency of existing variant in NHLBI-ESP African American population + * - EA_AF + - Frequency of existing variant in NHLBI-ESP European American population + * - gnomAD_AF + - Frequency of existing variant in gnomAD exomes combined population + * - gnomAD_AFR_AF + - Frequency of existing variant in gnomAD exomes African/American population + * - gnomAD_AMR_AF + - Frequency of existing variant in gnomAD exomes American population + * - gnomAD_ASJ_AF + - Frequency of existing variant in gnomAD exomes Ashkenazi Jewish population + * - gnomAD_EAS_AF + - Frequency of existing variant in gnomAD
exomes East Asian population + * - gnomAD_FIN_AF + - Frequency of existing variant in gnomAD exomes Finnish population + * - gnomAD_NFE_AF + - Frequency of existing variant in gnomAD exomes Non-Finnish European population + * - gnomAD_OTH_AF + - Frequency of existing variant in gnomAD exomes other combined populations + * - gnomAD_SAS_AF + - Frequency of existing variant in gnomAD exomes South Asian population + * - MAX_AF + - Maximum observed allele frequency in 1000 Genomes, ESP and gnomAD + * - MAX_AF_POPS + - Populations in which maximum allele frequency was observed + * - CLIN_SIG + - ClinVar clinical significance of the dbSNP variant + * - BIOTYPE + - Biotype of transcript or regulatory feature + * - APPRIS + - Annotates alternatively spliced transcripts as primary or alternate based on a range of computational methods. NB: not available for GRCh37 + * - TSL + - Transcript support level. NB: not available for GRCh37 + * - PUBMED + - Pubmed ID(s) of publications that cite existing variant + * - SOMATIC + - Somatic status of existing variant(s); multiple values correspond to multiple values in the Existing_variation field + * - PHENO + - Indicates if existing variant is associated with a phenotype, disease or trait; multiple values correspond to multiple values in the Existing_variation field + * - GENE_PHENO + - Indicates if overlapped gene is associated with a phenotype, disease or trait + * - BAM_EDIT + - Indicates success or failure of edit using BAM file + * - GIVEN_REF + - Reference allele from input + * - REFSEQ_MATCH + - | the RefSeq transcript match status; contains a number of flags indicating whether this RefSeq transcript matches the underlying reference sequence and/or an Ensembl transcript (more information): + - rseq_3p_mismatch: signifies a mismatch between the RefSeq transcript and the underlying primary genome assembly sequence. Specifically, there is a mismatch in the 3' UTR of the RefSeq model with respect to the primary genome assembly (e.g. GRCh37/GRCh38). + - rseq_5p_mismatch: signifies a mismatch between the RefSeq transcript and the underlying primary genome assembly sequence. Specifically, there is a mismatch in the 5' UTR of the RefSeq model with respect to the primary genome assembly. + - rseq_cds_mismatch: signifies a mismatch between the RefSeq transcript and the underlying primary genome assembly sequence. Specifically, there is a mismatch in the CDS of the RefSeq model with respect to the primary genome assembly. + - rseq_ens_match_cds: signifies that for the RefSeq transcript there is an overlapping Ensembl model that is identical across the CDS region only. A CDS match is defined as follows: the CDS and peptide sequences are identical and the genomic coordinates of every translatable exon match. Useful related attributes are: rseq_ens_match_wt and rseq_ens_no_match. + - rseq_ens_match_wt: signifies that for the RefSeq transcript there is an overlapping Ensembl model that is identical across the whole transcript. A whole transcript match is defined as follows: 1) In the case that both models are coding, the transcript, CDS and peptide sequences are all identical and the genomic coordinates of every exon match. 2) In the case that both transcripts are non-coding the transcript sequences and the genomic coordinates of every exon are identical. No comparison is made between a coding and a non-coding transcript. Useful related attributes are: rseq_ens_match_cds and rseq_ens_no_match.
+ - rseq_ens_no_match: signifies that for the RefSeq transcript there is no overlapping Ensembl model that is identical across either the whole transcript or the CDS. This is caused by differences between the transcript, CDS or peptide sequences or between the exon genomic coordinates. Useful related attributes are: rseq_ens_match_wt and rseq_ens_match_cds. + - rseq_mrna_match: signifies an exact match between the RefSeq transcript and the underlying primary genome assembly sequence (based on a match between the transcript stable id and an accession in the RefSeq mRNA file). An exact match occurs when the underlying genomic sequence of the model can be perfectly aligned to the mRNA sequence post polyA clipping. + - rseq_mrna_nonmatch: signifies a non-match between the RefSeq transcript and the underlying primary genome assembly sequence. A non-match is deemed to have occurred if the underlying genomic sequence does not have a perfect alignment to the mRNA sequence post polyA clipping. It can also signify that no comparison was possible as the model stable id may not have had a corresponding entry in the RefSeq mRNA file (sometimes happens when accessions are retired or changed). When a non-match occurs one or several of the following transcript attributes will also be present to provide more detail on the nature of the non-match: rseq_5p_mismatch, rseq_cds_mismatch, rseq_3p_mismatch, rseq_nctran_mismatch, rseq_no_comparison + - rseq_nctran_mismatch: signifies a mismatch between the RefSeq transcript and the underlying primary genome assembly sequence. This is a comparison between the entire underlying genomic sequence of the RefSeq model to the mRNA in the case of RefSeq models that are non-coding. + - rseq_no_comparison: signifies that no alignment was carried out between the underlying primary genome assembly sequence and a corresponding RefSeq mRNA. The reason for this is generally that no corresponding, unversioned accession was found in the RefSeq mRNA file for the transcript stable id. This sometimes happens when accessions are retired or replaced. A second possibility is that the sequences were too long and problematic to align (though this is rare). + * - CHECK_REF + - Reports variants where the input reference does not match the expected reference + * - HGNC_ID + - A unique ID provided by the HGNC for each gene with an approved symbol + * - MANE + - indicating if the transcript is the MANE Select or MANE Plus Clinical transcript for the gene. + * - miRNA + - Reports where the variant lies in the miRNA secondary structure. \ No newline at end of file diff --git a/docs/balsamic_filters.rst b/docs/balsamic_filters.rst index ac774b2c0..313f8d582 100644 --- a/docs/balsamic_filters.rst +++ b/docs/balsamic_filters.rst @@ -3,9 +3,9 @@ BALSAMIC Variant Calling Algorithms *********************************** In BALSAMIC, various bioinfo tools are integrated for reporting somatic and germline variants. Also, the choice of these tools differs between the type of analysis, -for eg: `Target Genome Analysis (TGA)` or `Whole Genome Sequencing (WGS)`. Various filters (Pre-call filtering and Post-call filtering) are applied at different levels to report high-confidence variant calls. +e.g.: `Target Genome Analysis (TGA)` or analysis of `Whole Genome Sequencing (WGS)`. Various filters (Pre-call and Post-call filtering) are applied at different levels to report high-confidence variant calls. 
-* **Pre-call filtering** is where the variant-calling tool decides not to call a variant line to the VCF file, if the default filters did not pass the criteria. The set of default filters differs between the various variant-calling algorithms. +**Pre-call filtering** is where the variant-calling tool decides not to add a variant to the VCF file if the variant did not pass the variant-caller's default filter criteria. The set of default filters differs between the various variant-calling algorithms. To know more about the pre-call filters used by the variant callers, please have a look at the VCF header of the particular variant-calling results. For example: @@ -13,27 +13,59 @@ For example: .. figure:: images/vcf_filters.png :width: 500px - Pre-call filters applied by the `Vardict` variant-caller is listed out in the VCF header + Pre-call filters applied by the `Vardict` variant-caller are listed in the VCF header. -In the VCF file, `FILTER` status is `PASS` if this position has passed all filters, i.e., a call is made at this position. Otherwise, -if the site has not passed all filters, a semicolon-separated list of codes for filters that fail. e.g., `p8;pSTD` might -indicate that at this site, the mean position in reads is less than 8 and position in reads has a standard deviation of 0. +In the VCF file, the `FILTER` status is `PASS` if this position has passed all filters, i.e., a call is made at this position. Conversely, +if the site has failed one or more filters, a semicolon-separated list of the failed filters will be appended to the `FILTER` column instead of `PASS`. E.g., `p8;pSTD` might +indicate that at this site, the mean position in reads is less than 8, and the position in reads has a standard deviation of 0. + + .. note:: -.. important:: In BALSAMIC, this VCF file is named as `*.all.vcf.gz` (eg: `SNV.somatic..vardict.all.vcf.gz`) + .. figure:: images/filter_status.png :width: 500px Vardict Variant calls with different 'FILTER' status underlined in white line (`NM4.5`, `PASS`, `p8;pSTD`) -* **Post-call filtering** is where a variant is further filtered with criteria such as quality, depth, VAF etc with more stringent thresholds. + +**Post-call filtering** is where a variant is further filtered with quality, depth, VAF, etc., with more stringent thresholds. For `Post-call filtering`, in BALSAMIC we have applied various filtering criteria (`Vardict_filtering`_, `TNscope filtering (Tumor_normal)`_ ) depending on the analysis-type (TGS/WGS) and sample-type(tumor-only/tumor-normal). -.. important:: - In BALSAMIC, this VCF file is named as `*.all.filtered.pass.vcf.gz` (eg: `SNV.somatic..vardict.all.filtered.pass.vcf.gz`) +.. note:: + In BALSAMIC, this VCF file is named as `*.all.filtered.vcf.gz` (eg: `SNV.somatic..vardict.all.filtered.vcf.gz`) + + +Only those variants that fulfill the pre-call and post-call filters are scored as `PASS` in the `STATUS` column of the VCF file. We filter those `PASS` variants and deliver a final list of variants to the customer either via `Scout` or `Caesar`. + +.. note:: + In BALSAMIC, this VCF file is named as `*.all.filtered.pass.vcf.gz` (eg: `SNV.somatic..vardict.all.filtered.pass.vcf.gz`) + +
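+To make the `FILTER` semantics above concrete, the following minimal sketch (an illustration only, not part of the BALSAMIC codebase; the input file name is a hypothetical example of the naming scheme described above) tallies the `FILTER` values of a compressed VCF using only the Python standard library:
+
+.. code-block:: python
+
+    import gzip
+    from collections import Counter
+
+    def tally_filters(vcf_path: str) -> Counter:
+        """Count FILTER values (PASS vs. failed pre-call filters) in a VCF."""
+        counts = Counter()
+        with gzip.open(vcf_path, "rt") as vcf:
+            for line in vcf:
+                if line.startswith("#"):  # skip meta-information and header lines
+                    continue
+                filter_field = line.split("\t")[6]  # FILTER is the 7th VCF column
+                for status in filter_field.split(";"):  # failed filters are ';'-separated
+                    counts[status] += 1
+        return counts
+
+    # Hypothetical file name following the convention above
+    print(tally_filters("SNV.somatic.case_id.vardict.all.vcf.gz"))
+
+A high `PASS` fraction here does not by itself imply high quality; the post-call filters described in this document are still applied downstream.
+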
.. list-table:: Description of VCF files + :widths: 30 50 20 + :header-rows: 1 + + * - VCF file name + - Description + - Delivered to the customer + * - .vcf.gz + - Unannotated VCF file with pre-call filters included in the STATUS column + - Yes (Caesar) + * - .all.vcf.gz + - Annotated VCF file with pre-call filters included in the STATUS column + - No + * - .all.filtered.vcf.gz + - Annotated VCF file with pre-call and post-call filters included in the STATUS column + - No + * - .all.filtered.pass.vcf.gz + - Annotated and filtered VCF file containing only the variants that met both the pre-call and post-call filter criteria, i.e. only variants with the `PASS` STATUS + - Yes (Caesar and Scout) + **Targeted Genome Analysis** ############################# @@ -41,7 +73,6 @@ For `Post-call filtering`, in BALSAMIC we have applied various filtering criteri Somatic Callers for reporting SNVs/INDELS ****************************************** - **Vardict** =========== @@ -51,7 +82,7 @@ These high-confidence variant calls are the final list of variants uploaded to S **Vardict_filtering** ^^^^^^^^^^^^^^^^^^^^^^ -Following are the set of criterias applied for filtering vardict results. Applies for both tumor-normal and tumor-only samples +Following is the set of criteria applied for filtering vardict results. It is used for both tumor-normal and tumor-only samples. *Mean Mapping Quality (MQ)*: Refers to the root mean square (RMS) mapping quality of all the reads spanning the given variant site. @@ -88,7 +119,7 @@ Following are the set of criterias applied for filtering vardict results. Applie GNOMADAF_popmax <= 0.005 (or) GNOMADAF_popmax == "." .. important:: - Additionally, for tumor-normal cases; the variant is excluded if it marked as 'germline' in the `STATUS` column of vcf file. + Additionally, the variant is excluded for tumor-normal cases if marked as 'germline' in the `STATUS` column of the VCF file. **Whole Genome Sequencing (WGS)** ********************************** @@ -96,19 +127,20 @@ Following are the set of criterias applied for filtering vardict results. Applie **Sentieon's TNscope** ======================= -BALSAMIC utilizes `TNscope` algorithm for the variant calling of somatic SNV/INDELS in WGS samples. +BALSAMIC utilizes the `TNscope` algorithm for calling somatic SNVs and INDELS in WGS samples. The `TNscope `_ algorithm performs the somatic variant calling on the tumor-normal or the tumor-only samples, using a Haplotyper algorithm. **TNscope filtering (Tumor_normal)** ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The following filters are applied to the variants in the TNscope raw VCF file (`SNV.somatic.$CASE_ID.tnscope.all.vcf.gz`). The variants scored as `PASS` are included in the final VCF file (`SNV.somatic.$CASE_ID.tnscope.all.filtered.pass.vcf.gz`). *Total Depth (DP)*: Refers to the overall read depth from all target samples supporting the variant call :: - DP(tumor) >= 10 || DP(normal) >= 10 + DP(tumor) >= 10 (or) DP(normal) >= 10 -*Allelic Depth (AD)*: Total reads supporting the ALT allele in tumor sample +*Allelic Depth (AD)*: Total reads supporting the ALT allele in the tumor sample :: @@ -129,6 +161,8 @@ The `TNscope `_ algor **TNscope filtering (tumor_only)** ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The somatic variants in the TNscope raw VCF file (`SNV.somatic.$CASE_ID.tnscope.all.vcf.gz`) are filtered to exclude unreliable genomic regions (e.g. centromeric regions, non-chromosome contigs), which also reduces computation time.
This WGS interval region file is collected from gatk_bundles ``_ +and the following filters are applied. The variants scored as `PASS` are considered for `Merging of TNscope and TNhaplotyper results (tumor_only)`_ *Total Depth (DP)*: Refers to the overall read depth supporting the variant call @@ -136,7 +170,7 @@ The `TNscope `_ algor DP(tumor) >= 10 -*Allelic Depth (AD)*: Total reads supporting the ALT allele in tumor sample +*Allelic Depth (AD)*: Total reads supporting the ALT allele in the tumor sample :: @@ -176,6 +210,58 @@ The `TNscope `_ algor SOR < 3 +**TNhaplotyper filtering (tumor_only)** +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The somatic variants in the TNhaplotyper raw VCF file (`SNV.somatic.$CASE_ID.tnhaplotyper.all.vcf.gz`) are filtered to exclude unreliable genomic regions (e.g. centromeric regions, non-chromosome contigs), which also reduces computation time. This WGS interval region file is collected from gatk_bundles ``_ +and the following filters are applied. The variants scored as `PASS` are considered for `Merging of TNscope and TNhaplotyper results (tumor_only)`_ + + +*Total Depth (DP)*: Refers to the overall read depth from all target samples supporting the variant call + +:: + + DP(tumor) >= 10 (or) DP(normal) >= 10 + +*Allelic Depth (AD)*: Total reads supporting the ALT allele in the tumor sample + +:: + + AD(tumor) >= 3 + +*Allelic Frequency (AF)*: Fraction of the reads supporting the alternate allele + +:: + + Minimum AF(tumor) >= 0.05 + Maximum AF(tumor) < 1 + +*GNOMADAF_POPMAX*: Maximum Allele Frequency across populations + +:: + + GNOMADAF_popmax <= 0.001 (or) GNOMADAF_popmax == "." + +*Normalized base quality scores*: The sum of base quality scores for each allele (QSS) is divided by the allelic depth of alt and ref alleles (AD) + +:: + + SUM(QSS)/SUM(AD) >= 20 + +*Read Counts*: Count of reads in a given (F1R2, F2R1) pair orientation supporting the alternate allele and reference alleles + +:: + + ALT_F1R2 > 0, ALT_F2R1 > 0 + REF_F1R2 > 0, REF_F2R1 > 0 + + +**Merging of TNscope and TNhaplotyper results (tumor_only)** +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The filtered somatic variants from `TNscope filtering (tumor_only)`_ and `TNhaplotyper filtering (tumor_only)`_ are merged using the `bcftools` intersect command to reduce the number of reported somatic variants for tumor-only samples. +Next, the somatic variants that are called by both variant-callers are reported as the final filtered list of variants (`SNV.somatic.{CASE_ID}.tnscope.all.filtered.pass.vcf.gz`). +The final VCF constitutes a high-confidence set of somatic variants, which is delivered to the customer either via `Scout` or `Caesar`. **Target Genome Analysis with UMI's into account** ************************************************** @@ -192,25 +278,39 @@ The following filter applies for both tumor-normal and tumor-only samples. minreads = 3,1,1 -Which means that at least `3` UMI tag groups should be ideally considered from both DNA strands, where a minimum of atleast `1` UMI tag group should exist in each of the single-stranded consensus reads. +It means that at least `3` UMI tag groups should be ideally considered from both DNA strands, where a minimum of at least `1` UMI tag group should exist in each of the single-stranded consensus reads.
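+Expressed as code, the `3,1,1` rule is a simple conjunction. The sketch below is an illustration only (the group counts are hypothetical inputs, not values read from BALSAMIC output):
+
+.. code-block:: python
+
+    def passes_minreads(total_groups: int, strand1_groups: int, strand2_groups: int) -> bool:
+        """minreads = 3,1,1: at least 3 UMI tag groups in total, and at least 1
+        supporting each of the two single-stranded consensus reads."""
+        return total_groups >= 3 and strand1_groups >= 1 and strand2_groups >= 1
+
+    assert passes_minreads(3, 1, 2)      # meets all three thresholds
+    assert not passes_minreads(2, 1, 1)  # fails the total-groups threshold
+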
-*min_init_tumor_lod* : Log odds is the likelihood that the candidate mutation is real over the likelihood that the candidate mutation is a sequencing error before any read-based filters are applied. -minimum log odds for the candidate selection. TNscope default: `4` +*min_init_tumor_lod*: Log odds is the likelihood that the candidate mutation is real over the likelihood that the candidate mutation is a sequencing error before any read-based filters are applied. +Minimum log-odds for the candidate selection. TNscope default: `4`. In our UMI-workflow we reduced this setting to `0.5`. :: min_init_tumor_lod = 0.5 -*min_tumor_lod* : minimum log odds in the final call of variants. TNscope default: `6.3` +*min_tumor_lod*: minimum log odds in the final call of variants. TNscope default: `6.3`. In our UMI-workflow we reduced this setting to `4.0`. :: min_tumor_lod = 4.0 +*min_tumor_allele_frac*: Set the minimum tumor AF to be considered as a potential variant site. + +:: + + min_tumor_allele_frac = 0.0005 + +*interval_padding*: Adding an extra 100bp to each end of the target region in the bed file before variant calling. + +:: + + interval_padding = 100 + **Post-call Filters** *GNOMADAF_POPMAX*: Maximum Allele Frequency across populations :: - GNOMADAF_popmax <= 0.001 (or) GNOMADAF_popmax == "." + GNOMADAF_popmax <= 0.02 (or) GNOMADAF_popmax == "." + +.. attention:: + BALSAMIC <= v8.2.10 used GNOMADAF_popmax <= 0.005. From BALSAMIC v9.0.0, this setting is changed to 0.02 to reduce the stringency. + diff --git a/docs/balsamic_methods.rst b/docs/balsamic_methods.rst new file mode 100644 index 000000000..de08e90fe --- /dev/null +++ b/docs/balsamic_methods.rst @@ -0,0 +1,88 @@ +======================== +BALSAMIC METHODS +======================== + +Target Genome Analysis +~~~~~~~~~~~~~~~~~~~~~~ + +BALSAMIC :superscript:`1` (**version** = 8.2.10) was used to analyze the data from raw FASTQ files. +We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. +Adapter sequences and low-quality bases were trimmed using fastp v0.23.2 :superscript:`3`. +Trimmed reads were mapped to the reference genome hg19 using BWA MEM v0.7.17 :superscript:`4`. +The resulting SAM files were converted to BAM files and sorted using samtools v1.15.1 :superscript:`5`. +Duplicated reads were marked using Picard tools MarkDuplicate v2.27.1 :superscript:`6` +and subsequently quality controlled using CollectHsMetrics, CollectInsertSizeMetrics and CollectAlignmentSummaryMetrics functionalities. +Results of the quality control steps were summarized by MultiQC v1.12 :superscript:`7`. +Small somatic mutations (SNVs and INDELs) were called for each sample using VarDict v2019.06.04 :superscript:`8`. +In addition to the default VarDict filters for reporting variants, the called variants were further filtered using the criteria +(*MQ >= 40, DP >= 100, VD >= 5, Minimum AF >= 0.007, Maximum AF < 1, GNOMADAF_popmax <= 0.005*). +Only those variants that fulfilled the filtering criteria and scored as `PASS` in the VCF file were reported. +Structural variants were called using Manta v1.6.0 :superscript:`9` and Delly v0.9.1 :superscript:`10`. +Copy number aberrations were called using CNVkit v0.9.9 :superscript:`11`. +The variant calls from CNVkit, Manta and Delly were merged using SVDB v2.6.0 :superscript:`12`. +All variants were annotated using Ensembl VEP v104.3 :superscript:`13`. We used vcfanno v0.3.3 :superscript:`14` +to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`. +
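+Read as a per-variant predicate, the second-filter criteria above could be sketched as follows (an illustration, not the pipeline's implementation; the metric values are assumed to be already extracted from the VCF, and a missing GNOMADAF_popmax annotation, "." in the VCF, is represented as None):
+
+.. code-block:: python
+
+    from typing import Optional
+
+    def passes_vardict_second_filter(mq: float, dp: int, vd: int, af: float,
+                                     gnomadaf_popmax: Optional[float]) -> bool:
+        """MQ >= 40, DP >= 100, VD >= 5, 0.007 <= AF < 1,
+        and GNOMADAF_popmax <= 0.005 or missing ('.')."""
+        gnomad_ok = gnomadaf_popmax is None or gnomadaf_popmax <= 0.005
+        return mq >= 40 and dp >= 100 and vd >= 5 and 0.007 <= af < 1 and gnomad_ok
+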
+Whole Genome Analysis +~~~~~~~~~~~~~~~~~~~~~ +BALSAMIC :superscript:`1` (**version** = 8.2.10) was used to analyze the data from raw FASTQ files. +We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. +Adapter sequences and low-quality bases were trimmed using fastp v0.23.2 :superscript:`3`. +Trimmed reads were mapped to the reference genome hg19 using sentieon-tools :superscript:`15`. +The resulting SAM files were converted to BAM files and sorted using samtools v1.15.1 :superscript:`5`. +Duplicated reads were marked using Picard tools MarkDuplicate v2.27.1 :superscript:`6` +and subsequently quality controlled using CollectMultipleMetrics and CollectWgsMetrics functionalities. +Results of the quality control steps were summarized by MultiQC v1.12 :superscript:`7`. +Small somatic mutations (SNVs and INDELs) were called for each sample using Sentieon TNscope and TNhaplotyper :superscript:`16`. +The called variants were further filtered using the criteria (DP(tumor,normal) >= 10; AD(tumor) >= 3; Minimum AF(tumor) >= 0.05, Maximum AF(tumor) < 1; GNOMADAF_popmax <= 0.001; normalized base quality scores >= 20, read counts of alt and ref alleles > 0). +The filtered variants from TNscope and TNhaplotyper were merged using bcftools isec functionality to reduce the number of variants for tumor-only samples. +Structural variants were called using Manta v1.6.0 :superscript:`9` and Delly v0.9.1 :superscript:`10`. +Copy number aberrations were called using ascatNgs v4.5.0 :superscript:`17` for tumor-normal samples. +The structural variant calls from Manta, Delly and ascatNgs were merged using SVDB v2.6.0 :superscript:`12`. +All variants were finally annotated using Ensembl VEP v104.3 :superscript:`13`. We used vcfanno v0.3.3 :superscript:`14` +to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`. + +============================= +UMI Data Analysis +============================= + +BALSAMIC :superscript:`1` (**version** = 8.2.10) was used to analyze the data from raw FASTQ files. +We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. +Adapter sequences and low-quality bases were trimmed using fastp v0.23.2 :superscript:`3`. +UMI tag extraction and consensus generation were performed using Sentieon tools v202010.02 :superscript:`15`. +The alignment of UMI-extracted and consensus-called reads to the human reference genome (hg19) was done by bwa-mem and +samtools using Sentieon utils. Consensus reads were filtered based on the number of minimum reads supporting each UMI tag group. +We applied a filtering criterion of minimum reads `3,1,1`. It means that at least three UMI tag groups should be ideally considered from both DNA strands, +where a minimum of at least one UMI tag group should exist in each single-stranded consensus read. +The filtered consensus reads were quality controlled using Picard CollectHsMetrics v2.27.1 :superscript:`6`. Results of the quality control steps were summarized by MultiQC v1.12 :superscript:`7`. +For each sample, somatic mutations were called using Sentieon TNscope :superscript:`16`, with non-default parameters for passing the final list of variants +(--min_tumor_allele_frac 0.0005, --filter_t_alt_frac 0.0005, --min_init_tumor_lod 0.5, min_tumor_lod 4, --max_error_per_read 5 --pcr_indel_model NONE, GNOMADAF_popmax <= 0.001). +All variants were finally annotated using Ensembl VEP v104.3 :superscript:`13`. We used vcfanno v0.3.3 :superscript:`14` to annotate somatic variants for their population allele frequency from gnomAD v2.1.1 :superscript:`18`.
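+To illustrate how such non-default TNscope parameters are typically passed, here is a hedged sketch of a Sentieon driver invocation assembled in Python. The overall `sentieon driver --algo TNscope` layout is standard Sentieon usage, but the file paths, sample name, and output name below are hypothetical, and the exact BALSAMIC rule (including the GNOMADAF_popmax post-filter, which is applied separately after annotation) lives in the repository referenced just below:
+
+.. code-block:: python
+
+    import subprocess
+
+    cmd = [
+        "sentieon", "driver",
+        "-r", "hg19.fasta",                       # hypothetical reference path
+        "-i", "tumor.umi_consensusfiltered.bam",  # hypothetical consensus BAM
+        "--algo", "TNscope",
+        "--tumor_sample", "TUMOR",
+        "--min_tumor_allele_frac", "0.0005",
+        "--filter_t_alt_frac", "0.0005",
+        "--min_init_tumor_lod", "0.5",
+        "--min_tumor_lod", "4",
+        "--max_error_per_read", "5",
+        "--pcr_indel_model", "NONE",
+        "SNV.somatic.case_id.tnscope_umi.vcf.gz",  # hypothetical output name
+    ]
+    subprocess.run(cmd, check=True)
+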
+For the exact parameters used for each software, please refer to https://github.com/Clinical-Genomics/BALSAMIC. +We used three commercially available products from SeraCare [Material numbers: 0710-067110 :superscript:`19`, 0710-067211 :superscript:`20`, 0710-067312 :superscript:`21`] for validating the efficiency of the UMI workflow in identifying 14 mutation sites at known allelic frequencies. + + +**References** +~~~~~~~~~~~~~~~~ + +1. Foroughi-Asl, H., Jeggari, A., Maqbool, K., Ivanchuk, V., Elhami, K., & Wirta, V. BALSAMIC: Bioinformatic Analysis pipeLine for SomAtic MutatIons in Cancer (Version v8.2.10) [Computer software]. https://github.com/Clinical-Genomics/BALSAMIC +2. Babraham Bioinformatics - FastQC A Quality Control tool for High Throughput Sequence Data. Accessed June 22, 2020. https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ +3. Chen S, Zhou Y, Chen Y, Gu J. fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics. 2018;34(17):i884-i890. doi:10.1093/bioinformatics/bty560 +4. Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXiv:1303.3997v2 [q-bio.GN] +5. Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R and 1000 Genome Project Data Processing Subgroup (2009) The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics, 25, 2078-9. doi: 10.1093/bioinformatics/btp352 +6. Picard Tools - By Broad Institute. Accessed June 22, 2020. https://broadinstitute.github.io/picard/ +7. Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016;32(19):3047-3048. doi:10.1093/bioinformatics/btw354 +8. Lai Z, Markovets A, Ahdesmaki M, Chapman B, Hofmann O, McEwen R, Johnson J, Dougherty B, Barrett JC, and Dry JR. VarDict: a novel and versatile variant caller for next-generation sequencing in cancer research. Nucleic Acids Res. 2016. https://doi.org/10.1093/nar/gkw227 +9. Chen, X. et al. (2016) Manta: rapid detection of structural variants and indels for germline and cancer sequencing applications. Bioinformatics, 32, 1220-1222. doi:10.1093/bioinformatics/btv710 +10. Tobias Rausch, Thomas Zichner, Andreas Schlattl, Adrian M. Stuetz, Vladimir Benes, Jan O. Korbel. DELLY: structural variant discovery by integrated paired-end and split-read analysis. Bioinformatics. 2012 Sep 15;28(18):i333-i339. https://doi.org/10.1093/bioinformatics/bts378 +11. Talevich, E, Shain, A.H, Botton, T, & Bastian, B.C. CNVkit: Genome-wide copy number detection and visualization from targeted sequencing. PLOS Computational Biology. 2016, 12(4):e1004873. https://doi.org/10.1371/journal.pcbi.1004873 +12. Jesper Eisfeldt et al. TIDDIT, an efficient and comprehensive structural variant caller for massive parallel sequencing data. F1000Research. 2017. doi: 10.12688/f1000research.11168.2 +13. McLaren W, Gil L, Hunt SE, et al. The Ensembl Variant Effect Predictor. Genome Biology. 2016;17(1):122. +14. Pedersen BS, Layer RM, Quinlan AR. Vcfanno: fast, flexible annotation of genetic variants. Genome Biology. 2016;17(1):118. doi:10.1186/s13059-016-0973-5 +15. Donald Freed, Rafael Aldana, Jessica A. Weber, Jeremy S. Edwards. The Sentieon Genomics Tools - A fast and accurate solution to variant calling from next-generation sequence data. bioRxiv. 2017. doi: https://doi.org/10.1101/115717 +16. Donald Freed, Renke Pan, Rafael Aldana.
TNscope: Accurate Detection of Somatic Mutations with Haplotype-based Variant Candidate Detection and Machine Learning Filtering. bioRxiv. doi: https://doi.org/10.1101/250647 +17. Raine KM, Van Loo P, Wedge DC, Jones D, Menzies A, Butler AP, Teague JW, Tarpey P, Nik-Zainal S, Campbell PJ. ascatNgs: Identifying Somatically Acquired Copy-Number Alterations from Whole-Genome Sequencing Data. Curr Protoc Bioinformatics. 2016. doi: https://doi.org/10.1002/cpbi.17 +18. Karczewski, K.J., Francioli, L.C., Tiao, G. et al. The mutational constraint spectrum quantified from variation in 141,456 humans. Nature 581, 434–443 (2020). https://doi.org/10.1038/s41586-020-2308-7 +19. https://www.seracare.com/Seraseq-ctDNA-Complete-Reference-Material-AF1-0710-0671/ +20. https://www.seracare.com/Seraseq-ctDNA-Complete-Reference-Material-AF05-0710-0672/ +21. https://www.seracare.com/Seraseq-ctDNA-Complete-Reference-Material-AF01-0710-0673/ diff --git a/docs/bioinfo_softwares.rst b/docs/bioinfo_softwares.rst index a3192f9ab..2de11c2fe 100644 --- a/docs/bioinfo_softwares.rst +++ b/docs/bioinfo_softwares.rst @@ -16,7 +16,7 @@ bcftools ~~~~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `>1.9` +:Version: `>=1.10` bedtools ~~~~~~~~ @@ -28,37 +28,31 @@ bwa ~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `0.7.15` +:Version: `0.7.17` cnvkit ~~~~~~ :Source code: `GitHub` ``_ :Article: `PLOS Computational Biology` ``_ -:Version: `0.9.4` - -csvkit -~~~~~~ -:Source code: `GitHub` ``_ -:Article: `-` -:Version: `1.0.4` +:Version: `0.9.9` delly ~~~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `0.8.7` +:Version: `0.9.1` ensembl-vep ~~~~~~~~~~~ :Source code: `GitHub` ``_ :Article: `Genome Biology` ``_ -:Version: `100.2` +:Version: `104.3` fastp ~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `0.20.1` +:Version: `0.23.2` fastqc ~~~~~~ @@ -82,31 +76,31 @@ multiqc ~~~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `1.11` +:Version: `1.12` mosdepth ~~~~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `0.2.9` +:Version: `0.3.3` picard ~~~~~~ :Source code: `GitHub` ``_ :Article: `-` -:Version: `2.25.0` +:Version: `2.27.1` sambamba ~~~~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `0.6.6` +:Version: `0.8.2` samtools ~~~~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `1.12` +:Version: `>1.11` sentieon-tools ~~~~~~~~~~~~~~ @@ -114,11 +108,17 @@ :Article: `Bioinformatics` ``_ :Version: `202010.02` +svdb +~~~~ +:Source code: `GitHub` ``_ +:Article: `F1000Res` ``_ +:Version: `2.6.0` + tabix ~~~~~ :Source code: `GitHub` ``_ :Article: `Bioinformatics` ``_ -:Version: `0.2.6` +:Version: `1.11` vardict ~~~~~~~ diff --git a/docs/index.rst b/docs/index.rst index c4e72d940..9503955ea 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,7 +16,9 @@ :hidden: :maxdepth: 1 + balsamic_annotation balsamic_filters + balsamic_methods bioinfo_softwares diff --git a/docs/resources.rst b/docs/resources.rst index b9dbe305e..b8839a9f9 100644 --- a/docs/resources.rst +++ b/docs/resources.rst @@ -143,7 +143,7 @@ Methods and tools #. **Delly2**\ : An integrated structural variant prediction method that can discover, genotype and visualize deletions, tandem duplications, inversions and translocations https://github.com/dellytools/delly #. **PLINK**\ : PLINK: Whole genome data analysis toolset https://www.cog-genomics.org/plink2 #.
**freebayes**\ : a haplotype-based variant detector. https://github.com/ekg/freebayes -#. **ASCAT**\ : Allele-Specific Copy Number Analysis of Tumors, tumor purity and ploidy https://github.com/Crick-CancerGenomics/ascat +#. **AscatNGS**\ : Allele-Specific Copy Number Analysis of Tumors, tumor purity and ploidy https://github.com/cancerit/ascatNgs #. **MutationalPatterns**\ : R package for extracting and visualizing mutational patterns in base substitution catalogues https://github.com/UMCUGenetics/MutationalPatterns #. **desconstructSigs**\ : identification of mutational signatures within a single tumor sample https://github.com/raerose01/deconstructSigs #. **treeOmics**\ : Decrypting somatic mutation patterns to reveal the evolution of cancer @@ -190,3 +190,4 @@ Methods and tools #. **msisensor**\ : microsatellite instability detection using paired tumor-normal https://github.com/ding-lab/msisensor #. **MOSAIC**\ : MicrOSAtellite Instability Classifier https://github.com/ronaldhause/mosaic #. **MANTIS**\ : Microsatellite Analysis for Normal-Tumor InStability https://github.com/OSU-SRLab/MANTIS +#. **SVDB**\ : A toolkit for constructing and querying structural variant databases https://github.com/J35P312/SVDB diff --git a/requirements-dev.txt b/requirements-dev.txt index c2fedd5f7..f6d3737c3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,4 +4,4 @@ coveralls pylint black==22.3.0 pillow>=8.4.0 -fpdf2>=2.4.6 +fpdf2==2.4.6 diff --git a/tests/commands/config/test_config_qc.py b/tests/commands/config/test_config_qc.py new file mode 100644 index 000000000..1fea387a6 --- /dev/null +++ b/tests/commands/config/test_config_qc.py @@ -0,0 +1,264 @@ +import os +import json +import graphviz +import logging +from unittest import mock +from pathlib import Path +import pytest +from BALSAMIC.utils.cli import generate_graph + +qc_json = "_QC.json" + + +def test_qc_normal_config( + invoke_cli, + sample_fastq, + tmp_path, + balsamic_cache, + panel_bed_file, +): + # GIVEN a case ID, fastq files, and an analysis dir + test_analysis_dir = tmp_path / "test_analysis_dir" + test_analysis_dir.mkdir() + case_id = "sample_tumor_normal" + tumor = sample_fastq["tumor"] + normal = sample_fastq["normal"] + + # WHEN creating a case analysis + with mock.patch.dict( + "os.environ", + ): + result = invoke_cli( + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "-n", + normal, + "--case-id", + case_id, + "--analysis-dir", + test_analysis_dir, + "--balsamic-cache", + balsamic_cache, + "--tumor-sample-name", + "ACC1", + "--normal-sample-name", + "ACC2", + ], + ) + + # THEN a config should be created and exist + assert result.exit_code == 0 + assert Path(test_analysis_dir, case_id, case_id + qc_json).exists() + # load the json file and check that the dag exists + qc_config = json.load(open(Path(test_analysis_dir, case_id, case_id + qc_json))) + # assert that the config json dag file is created + assert Path(qc_config["analysis"]["dag"]).exists() + assert "BALSAMIC QC Workflow has been configured successfully!"
in result.output + + +def test_qc_tumor_only_config( + invoke_cli, + sample_fastq, + tmp_path, + balsamic_cache, + panel_bed_file, + sentieon_license, + sentieon_install_dir, +): + # GIVEN a case ID, fastq files, and an analysis dir + test_analysis_dir = tmp_path / "test_analysis_dir" + test_analysis_dir.mkdir() + case_id = "sample_tumor_only" + tumor = sample_fastq["tumor"] + + # WHEN creating a case analysis + with mock.patch.dict( + "os.environ", + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, + ): + result = invoke_cli( + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + "--analysis-dir", + test_analysis_dir, + "--balsamic-cache", + balsamic_cache, + "--tumor-sample-name", + "ACC1", + ], + ) + + # THEN a config should be created and exist + assert result.exit_code == 0 + assert Path(test_analysis_dir, case_id, case_id + qc_json).exists() + # load json file and check if dag exists + qc_config = json.load(open(Path(test_analysis_dir, case_id, case_id + qc_json))) + # assert if config json dag file is created + assert Path(qc_config["analysis"]["dag"]).exists() + + +def test_qc_config_bad_filename( + invoke_cli, + tmp_path_factory, + analysis_dir, + panel_bed_file, + balsamic_cache, +): + # GIVEN existing fastq file with wrong naming convention + faulty_fastq_dir = tmp_path_factory.mktemp("error_fastq") + fastq_file_name_tumor = "tumor_error.fastq.gz" + Path(faulty_fastq_dir / fastq_file_name_tumor).touch() + + case_id1 = "faulty_tumor" + tumor = Path(faulty_fastq_dir / fastq_file_name_tumor).as_posix() + # Invoke CLI command using file as argument + case_result = invoke_cli( + [ + "config", + "qc_panel", + "-t", + tumor, + "-p", + panel_bed_file, + "--case-id", + case_id1, + "--analysis-dir", + analysis_dir, + "--balsamic-cache", + balsamic_cache, + ], + ) + + # THEN run should abort + assert case_result.exit_code == 1 + + +def test_qc_run_without_permissions( + invoke_cli, + no_write_perm_path, + sample_fastq, + panel_bed_file, + balsamic_cache, +): + # GIVEN CLI arguments including an analysis_dir without write permissions + case_id = "sample_tumor_only" + tumor = sample_fastq["tumor"] + + result = invoke_cli( + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + "--analysis-dir", + no_write_perm_path, + "--balsamic-cache", + balsamic_cache, + ], + ) + # THEN program exits before completion + assert result.exit_code == 1 + + +def test_qc_config_failed(invoke_cli, tmp_path, balsamic_cache, panel_bed_file): + # GIVEN a case ID, fastq files, and an analysis dir + test_analysis_dir = tmp_path / "test_analysis_dir" + test_analysis_dir.mkdir() + case_id = "sample_qc" + + # WHEN creating a case analysis + result = invoke_cli( + [ + "config", + "qc_panel", + "--case-id", + case_id, + "-p", + panel_bed_file, + "--analysis-dir", + test_analysis_dir, + "--balsamic-cache", + balsamic_cache, + ] + ) + + # THEN a config should not be created and exit + assert "Error: Missing option" in result.output + assert result.exit_code == 2 + + +def test_config_qc_graph_failed( + invoke_cli, sample_fastq, analysis_dir, balsamic_cache, panel_bed_file +): + # GIVEN an analysis config + case_id = "sample_tumor_only" + tumor = sample_fastq["tumor"] + + with mock.patch.object(graphviz, "Source") as mocked: + mocked.return_value = None + case_result = invoke_cli( + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + 
"--analysis-dir", + analysis_dir, + "--balsamic-cache", + balsamic_cache, + ], + ) + + assert case_result.exit_code == 1 + + +def test_config_qc_graph_failed_value_error( + invoke_cli, sample_fastq, analysis_dir, balsamic_cache, panel_bed_file +): + # GIVEN an analysis config + case_id = "sample_tumor_only" + tumor = sample_fastq["tumor"] + + with mock.patch.object(graphviz, "Source", side_effect=ValueError) as mocked: + mocked.return_value = None + case_result = invoke_cli( + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + "--analysis-dir", + analysis_dir, + "--balsamic-cache", + balsamic_cache, + ], + ) + + assert "BALSAMIC QC dag graph generation failed" in case_result.output diff --git a/tests/commands/config/test_config_sample.py b/tests/commands/config/test_config_sample.py index b16a6173b..922fde841 100644 --- a/tests/commands/config/test_config_sample.py +++ b/tests/commands/config/test_config_sample.py @@ -249,3 +249,35 @@ def test_config_graph_failed( ) assert case_result.exit_code == 1 + + +def test_pon_cnn_file( + invoke_cli, sample_fastq, analysis_dir, balsamic_cache, panel_bed_file +): + + # GIVEN CLI arguments including optional pon reference '.cnn' file + case_id = "test_sample_cnv" + tumor = sample_fastq["tumor"] + pon_file = "tests/test_data/references/panel/test_panel_ponn.cnn" + + result = invoke_cli( + [ + "config", + "case", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + "--analysis-dir", + analysis_dir, + "--pon-cnn", + pon_file, + "--balsamic-cache", + balsamic_cache, + ], + ) + # THEN program exits and checks for filepath + assert result.exit_code == 0 + assert Path(pon_file).exists() diff --git a/tests/commands/init/test_init.py b/tests/commands/init/test_init.py index 2318f4753..74fcce209 100644 --- a/tests/commands/init/test_init.py +++ b/tests/commands/init/test_init.py @@ -71,6 +71,54 @@ def test_init_reference_no_write_perm(tmp_path, invoke_cli, no_write_perm_path): assert result.exit_code == 1 +def test_init_reference_no_cosmic_abort(tmp_path, invoke_cli): + # Given a path with no write permission + test_genome_version = "hg19" + test_container_version = "develop" + test_new_dir = tmp_path / "test_reference_dir" + test_new_dir.mkdir() + + # WHEN invoking config sample + result = invoke_cli( + [ + "init", + "-o", + str(test_new_dir), + "-v", + test_container_version, + "-g", + test_genome_version, + ] + ) + + # THEN it should create test_reference.json and exist with no error + assert result.exit_code == 1 + + +def test_init_reference_no_cosmic_run(tmp_path, invoke_cli): + # Given a path with no write permission + test_genome_version = "canfam3" + test_container_version = "develop" + test_new_dir = tmp_path / "test_reference_dir" + test_new_dir.mkdir() + + # WHEN invoking config sample + result = invoke_cli( + [ + "init", + "-o", + str(test_new_dir), + "-v", + test_container_version, + "-g", + test_genome_version, + ] + ) + + # THEN it should create test_reference.json and exist with no error + assert result.exit_code == 0 + + def test_init_reference_click_abort(invoke_cli, tmp_path): # Given test_reference output directory test_container_version = "develop" diff --git a/tests/commands/report/test_deliver.py b/tests/commands/report/test_deliver.py index 37090b83c..6a5e52754 100644 --- a/tests/commands/report/test_deliver.py +++ b/tests/commands/report/test_deliver.py @@ -31,10 +31,8 @@ def test_deliver_tumor_only_panel( "deliver", "--sample-config", tumor_only_config, - 
"--sample-id-map", - "tumor:tumor:KS454", - "--case-id-map", - "gmck-solid:KSK899:apptag", + "--disable-variant-caller", + "cnvkit", ] ) @@ -60,17 +58,17 @@ def test_deliver_tumor_normal_panel( # Actual delivery files dummies with and without index cnv_result_dir = Path(helpers.result_dir, "cnv") cnv_result_dir.mkdir(parents=True, exist_ok=True) - actual_delivery_file = Path(cnv_result_dir, "tumor.merged.cnr") + actual_delivery_file = Path(cnv_result_dir, "tumor.merged.cns") actual_delivery_file.touch() vep_result_dir = Path(helpers.result_dir, "vep") vep_result_dir.mkdir(parents=True, exist_ok=True) touch_vcf_delivery_file = Path( - vep_result_dir, "SNV.somatic." + helpers.case_id + ".vardict.all.vcf.gz" + vep_result_dir, "SNV.somatic." + helpers.case_id + ".vardict.vcf.gz" ) touch_vcf_delivery_file.touch() touch_vcf_delivery_file_index = Path( - vep_result_dir, "SNV.somatic." + helpers.case_id + ".vardict.all.vcf.gz.tbi" + vep_result_dir, "SNV.somatic." + helpers.case_id + ".vardict.vcf.gz.tbi" ) touch_vcf_delivery_file_index.touch() @@ -91,59 +89,10 @@ def test_deliver_tumor_normal_panel( ), caplog.at_level(logging.DEBUG): # WHEN running analysis result = invoke_cli( - [ - "report", - "deliver", - "--sample-config", - tumor_normal_config, - "--sample-id-map", - "tumor:tumor:KS454,normal:normal:KS999", - "--case-id-map", - "gmck-solid:KSK899:apptag", - ] + ["report", "deliver", "--sample-config", tumor_normal_config] ) # THEN it should run without any error assert result.exit_code == 0 assert actual_delivery_report.is_file() assert "following" in caplog.text - - -def test_deliver_metrics( - invoke_cli, - environ, - tumor_normal_config, - helpers, - sentieon_install_dir, - sentieon_license, - caplog, -): - - # GIVEN a tumor-normal config file - helpers.read_config(tumor_normal_config) - actual_metric_delivery_yaml = Path( - helpers.delivery_dir, helpers.case_id + "_metrics_deliverables.yaml" - ) - - with mock.patch.dict( - environ, - { - "SENTIEON_LICENSE": sentieon_license, - "SENTIEON_INSTALL_DIR": sentieon_install_dir, - }, - ), caplog.at_level(logging.DEBUG): - # WHEN running analysis - result = invoke_cli( - [ - "report", - "deliver", - "--sample-config", - tumor_normal_config, - "--qc-metrics", - ] - ) - - # THEN it should run without any error - assert result.exit_code == 0 - assert actual_metric_delivery_yaml.is_file() - assert "following" in caplog.text diff --git a/tests/commands/run/test_run_analysis.py b/tests/commands/run/test_run_analysis.py index 59146c119..b3d1d8e2e 100644 --- a/tests/commands/run/test_run_analysis.py +++ b/tests/commands/run/test_run_analysis.py @@ -92,3 +92,22 @@ def test_run_analysis_create_dir(invoke_cli, tumor_only_config): ) # THEN it should abort with error assert Path(re.sub("/$", ".1/", log_dir)).exists() + + +def test_run_analysis_ponpath(invoke_cli, tumor_only_pon_config): + # GIVEN a tumor-only with pon file in the config file + # WHEN running analysis + + with open(tumor_only_pon_config) as fh: + sample_config = json.load(fh) + + bind_path = ["/path_to_dummy/ash/"] + pon_fl = sample_config["panel"].get("pon_cnn") + pon_path = Path(pon_fl).resolve() + + if "pon_cnn" in sample_config["panel"]: + bind_path.append(str(pon_path)) + + # THEN it checks for existence of paths + assert pon_path.exists() + assert str(pon_path) in bind_path diff --git a/tests/conftest.py b/tests/conftest.py index 983a3c235..351b5dcdd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,9 @@ from pathlib import Path from functools import partial from 
click.testing import CliRunner -from .helpers import ConfigHelper + +from BALSAMIC.utils.cli import read_yaml +from .helpers import ConfigHelper, Map from BALSAMIC.commands.base import cli from BALSAMIC import __version__ as balsamic_version @@ -45,6 +47,7 @@ def config_files(): "analysis_single_umi": "BALSAMIC/config/analysis_single_umi.json", "panel_bed_file": "tests/test_data/references/panel/panel.bed", "background_variant_file": "tests/test_data/references/panel/background_variants.txt", + "pon_cnn": "tests/test_data/references/panel/test_panel_ponn.cnn", "pon_fastq_path": "tests/test_data/fastq/", } @@ -72,6 +75,9 @@ def reference(): "access_regions": "tests/test_data/references/genome/access-5k-mappable.hg19.bed", "delly_exclusion": "tests/test_data/references/genome/delly_exclusion.tsv", "delly_exclusion_converted": "tests/test_data/references/genome/delly_exclusion_converted.tsv", + "delly_mappability": "tests/test_data/references/genome/delly_mappability.gz", + "delly_mappability_gindex": "tests/test_data/references/genome/delly_mappability.gz.gzi", + "delly_mappability_findex": "tests/test_data/references/genome/delly_mappability.fai", "ascat_gccorrection": "tests/test_data/references/genome/GRCh37_SnpGcCorrections.tsv", "ascat_chryloci": "tests/test_data/references/genome/GRCh37_Y.loci", "clinvar": "tests/test_data/references/genome/clinvar.vcf.gz", @@ -94,6 +100,11 @@ def background_variant_file(): return "tests/test_data/references/panel/background_variants.txt" +@pytest.fixture(scope="session") +def pon_cnn(): + return "tests/test_data/references/panel/test_panel_ponn.cnn" + + @pytest.fixture(scope="session") def sentieon_license(tmp_path_factory): """ @@ -273,13 +284,57 @@ def tumor_normal_config( ], ) - qc_dir = Path(analysis_dir, case_id, "analysis", "qc", "multiqc_data") + qc_dir = Path(analysis_dir, case_id, "analysis", "qc") qc_dir.mkdir(parents=True, exist_ok=False) - copy_tree("tests/test_data/qc_files/analysis/qc/multiqc_data/", qc_dir.as_posix()) + copy_tree("tests/test_data/qc_files/analysis/qc/", qc_dir.as_posix()) return Path(analysis_dir, case_id, case_id + ".json").as_posix() +@pytest.fixture(scope="session") +def tumor_normal_qc_config( + tmp_path_factory, + sample_fastq, + analysis_dir, + balsamic_cache, + panel_bed_file, +): + """ + invokes balsamic config qc_panel -t xxx -n xxx to create a QC config + for tumor-normal + """ + case_id = "sample_tumor_normal" + tumor = sample_fastq["tumor"] + normal = sample_fastq["normal"] + + with mock.patch.dict(MOCKED_OS_ENVIRON): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "-n", + normal, + "--case-id", + case_id, + "--analysis-dir", + analysis_dir, + "--balsamic-cache", + balsamic_cache, + "--tumor-sample-name", + "ACC1", + "--normal-sample-name", + "ACC2", + ], + ) + return Path(analysis_dir, case_id, case_id + "_QC.json").as_posix() + + @pytest.fixture(name="helpers") def fixture_config_helpers(): """Helper fixture for case config files""" @@ -334,7 +389,7 @@ def tumor_normal_wgs_config( @pytest.fixture(scope="session") def tumor_only_config( - tmpdir_factory, + tmp_path_factory, sample_fastq, balsamic_cache, background_variant_file, @@ -378,9 +433,9 @@ ], ) - qc_dir = Path(analysis_dir, case_id, "analysis", "qc", "multiqc_data") + qc_dir = Path(analysis_dir, case_id, "analysis", "qc") qc_dir.mkdir(parents=True, exist_ok=False) - copy_tree("tests/test_data/qc_files/analysis/qc/multiqc_data/", qc_dir.as_posix()) + 
copy_tree("tests/test_data/qc_files/analysis/qc/", qc_dir.as_posix()) return Path(analysis_dir, case_id, case_id + ".json").as_posix() @@ -428,6 +483,94 @@ def tumor_only_wgs_config( return Path(analysis_dir, case_id, case_id + ".json").as_posix() +@pytest.fixture(scope="session") +def tumor_only_qc_config( + tmpdir_factory, + sample_fastq, + balsamic_cache, + analysis_dir, + panel_bed_file, +): + """ + invokes balsamic config sample -t xxx to create sample config + for tumor only + """ + case_id = "sample_tumor_only" + tumor = sample_fastq["tumor"] + + with mock.patch.dict( + MOCKED_OS_ENVIRON, + ): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "qc_panel", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + "--analysis-dir", + analysis_dir, + "--balsamic-cache", + balsamic_cache, + ], + ) + return Path(analysis_dir, case_id, case_id + "_QC.json").as_posix() + + +@pytest.fixture(scope="session") +def tumor_only_pon_config( + tmp_path_factory, + sample_fastq, + balsamic_cache, + analysis_dir, + panel_bed_file, + sentieon_license, + sentieon_install_dir, + pon_cnn, +): + """ + invokes balsamic config sample -t xxx to create sample config + for tumor only + """ + case_id = "sample_tumor_only_pon" + tumor = sample_fastq["tumor"] + + with mock.patch.dict( + MOCKED_OS_ENVIRON, + { + "SENTIEON_LICENSE": sentieon_license, + "SENTIEON_INSTALL_DIR": sentieon_install_dir, + }, + ): + runner = CliRunner() + runner.invoke( + cli, + [ + "config", + "case", + "-p", + panel_bed_file, + "-t", + tumor, + "--case-id", + case_id, + "--analysis-dir", + analysis_dir, + "--balsamic-cache", + balsamic_cache, + "--pon-cnn", + pon_cnn, + ], + ) + + return Path(analysis_dir, case_id, case_id + ".json").as_posix() + + @pytest.fixture(scope="session") def sample_config(): """ @@ -458,16 +601,11 @@ def sample_config(): "vcf": { "manta": {"mutation": "somatic", "type": "SV"}, "vardict": {"mutation": "somatic", "type": "SNV"}, - "pindel": {"mutation": "somatic", "type": "SV"}, - "strelka": {"mutation": "somatic", "type": "SNV"}, "mutect": {"mutation": "somatic", "type": "SNV"}, "tnscope": {"mutation": "somatic", "type": "SNV"}, - "tnsnv": {"mutation": "somatic", "type": "SNV"}, "tnhaplotyper": {"mutation": "somatic", "type": "SNV"}, "dnascope": {"mutation": "germline", "type": "SNV"}, "manta_germline": {"mutation": "germline", "type": "SV"}, - "haplotypecaller": {"mutation": "germline", "type": "SNV"}, - "strelka_germline": {"mutation": "germline", "type": "SNV"}, }, "samples": { "S1_R": { @@ -494,71 +632,98 @@ def analysis_path(): @pytest.fixture(scope="session") -def qc_metrics(): - """Sample data for QC model testing""" - return { - "qc": { - "targeted": { - "multiqc_picard_insertSize.json": { - "MEAN_INSERT_SIZE": {"condition": None} - }, - "multiqc_picard_HsMetrics.json": { - "MEDIAN_TARGET_COVERAGE": { - "condition": {"norm": "gt", "threshold": 500.0} - } - }, - }, - "wgs": { - "multiqc_picard_insertSize.json": { - "MEAN_INSERT_SIZE": {"condition": None} - }, - "multiqc_picard_dups.json": { - "PERCENT_DUPLICATION": {"condition": None} - }, - }, - } - } +def multiqc_data_path(analysis_path): + """multiqc_data.json test path""" + return os.path.join(analysis_path, "qc", "multiqc_data", "multiqc_data.json") @pytest.fixture(scope="session") -def qc_extracted_metrics(): - """Extracted metrics for QC model testing""" - return { - "metrics": { - "sample_1": [ - { - "name": "MEAN_INSERT_SIZE_1", - "norm": "lt", - "threshold": 1.0, - "value": 0.5, - }, - { - "name": 
"MEAN_INSERT_SIZE_2", - "norm": "lt", - "threshold": 1.0, - "value": 0.5, - }, - ], - "sample_2": [ - { - "name": "MEAN_INSERT_SIZE_1", - "norm": "lt", - "threshold": 1.0, - "value": 0.5, - }, - ], - } - } +def metrics_yaml_path(analysis_path): + """sample_tumor_only_metrics_deliverables.yaml test path""" + return os.path.join( + analysis_path, "qc", "sample_tumor_only_metrics_deliverables.yaml" + ) + + +@pytest.fixture(scope="session") +def bcftools_counts_path(analysis_path): + """svdb.all.filtered.pass.stats test path""" + return os.path.join( + analysis_path, "vep", "SNV.somatic.case.svdb.all.filtered.pass.stats" + ) @pytest.fixture(scope="session") -def qc_raw_targeted_metrics(): - """Raw metrics""" +def qc_requested_metrics(): + """Raw requested metrics""" return { - "default": { - "metrics_1.json": {"METRIC_1": 0.1, "METRIC_2": 0.2}, - "metrics_2.json": {"METRIC_3": 0.3}, + "targeted": { + "default": { + "METRIC_1": {"condition": None}, + "METRIC_2": {"condition": {"norm": "gt", "threshold": 2}}, + }, + "panel_1": { + "METRIC_3": {"condition": {"norm": "gt", "threshold": 3}}, + }, + "panel_2": { + "METRIC_1": {"condition": {"norm": "gt", "threshold": 1}}, + "METRIC_2": {"condition": {"norm": "gt", "threshold": 22}}, + "METRIC_4": {"condition": {"norm": "gt", "threshold": 4}}, + }, + }, + "wgs": { + "METRIC_1": {"condition": {"norm": "gt", "threshold": 1}}, }, - "panel_1.bed": {"metrics_2.json": {"METRIC_4": 0.4}}, - "panel_2.bed": {"metrics_1.json": {"METRIC_1": 0.5, "METRIC_4": 0.4}}, } + + +@pytest.fixture(scope="session") +def qc_extracted_metrics(metrics_yaml_path): + """Extracted and formatted QC metrics""" + return read_yaml(metrics_yaml_path) + + +@pytest.fixture(scope="function") +def snakemake_fastqc_rule(tumor_only_config, helpers): + """FastQC snakemake mock rule""" + + helpers.read_config(tumor_only_config) + fastq_path = os.path.join( + helpers.analysis_dir, + helpers.case_id, + "analysis", + "fastq", + "concatenated_tumor_XXXXXX_R_{read}.fastq.gz", + ) + + return Map( + { + "fastqc": Map( + { + "params": Map( + { + "housekeeper_id": { + "id": "sample_tumor_only", + "tags": "quality-trimmed-seq", + } + } + ), + "output": Map( + { + "_names": Map({"fastqc": fastq_path}), + "fastqc": fastq_path, + } + ), + "rule": Map( + { + "name": "fastq", + "output": [ + fastq_path, + ], + "temp_output": set(), + } + ), + } + ) + } + ) diff --git a/tests/helpers.py b/tests/helpers.py index 5d8c17a4f..fae159606 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -16,3 +16,35 @@ def read_config(self, balsamic_config): self.analysis_dir = sample_config["analysis"]["analysis_dir"] self.result_dir = sample_config["analysis"]["result"] self.delivery_dir = Path(self.result_dir, "delivery_report").as_posix() + + +class Map(dict): + """Mock class to use dot notation to access values of a dictionary""" + + def __init__(self, *args, **kwargs): + super(Map, self).__init__(*args, **kwargs) + for arg in args: + if isinstance(arg, dict): + for k, v in arg.items(): + self[k] = v + + if kwargs: + for k, v in kwargs.items(): + self[k] = v + + def __getattr__(self, attr): + return self.get(attr) + + def __setattr__(self, key, value): + self.__setitem__(key, value) + + def __setitem__(self, key, value): + super(Map, self).__setitem__(key, value) + self.__dict__.update({key: value}) + + def __delattr__(self, item): + self.__delitem__(item) + + def __delitem__(self, key): + super(Map, self).__delitem__(key) + del self.__dict__[key] diff --git a/tests/scripts/test_collect_qc_metrics.py 
b/tests/scripts/test_collect_qc_metrics.py new file mode 100644 index 000000000..1f62034ef --- /dev/null +++ b/tests/scripts/test_collect_qc_metrics.py @@ -0,0 +1,275 @@ +import json +import os.path +from pathlib import Path + +from BALSAMIC.assets.scripts.collect_qc_metrics import ( + get_multiqc_data_source, + get_multiqc_metrics, + collect_qc_metrics, + get_qc_supported_capture_kit, + get_requested_metrics, + capture_kit_resolve_type, + extract_number_variants, + get_variant_metrics, +) + + +def test_capture_kit_resolve_type(): + """test capture_kit type""" + + # GIVEN an expected output + capture_kit = "panel.bed" + + # THEN check if the extracted capture kit is correctly formatted + assert capture_kit_resolve_type("None") is None + assert capture_kit_resolve_type(capture_kit) == capture_kit + + +def test_get_qc_supported_capture_kit(qc_requested_metrics): + """test extraction of the capture kit name available for analysis""" + + # GIVEN a capture kit + capture_kit = "panel_1_v1.0_hg19_design.bed" + + # GIVEN an expected output + expected_output = "panel_1" + + # WHEN calling the function + supported_capture_kit = get_qc_supported_capture_kit( + capture_kit, qc_requested_metrics["targeted"] + ) + + # THEN check if the extracted bed file name corresponds to the expected one + assert supported_capture_kit == expected_output + + +def test_get_requested_metrics_targeted(qc_requested_metrics): + """test retrieval of the requested targeted metrics""" + + # GIVEN a sequencing type and a capture kit + seq_type = "targeted" + capture_kit = "panel_2_v1.0_hg19_design.bed" + + # GIVEN the expected output + expected_output = { + "METRIC_1": {"condition": {"norm": "gt", "threshold": 1}}, + "METRIC_2": {"condition": {"norm": "gt", "threshold": 22}}, + "METRIC_4": {"condition": {"norm": "gt", "threshold": 4}}, + } + + # WHEN calling the function + requested_metrics = get_requested_metrics( + qc_requested_metrics, seq_type, capture_kit + ) + + # THEN check if the requested targeted metrics are correctly retrieved + assert requested_metrics.items() == expected_output.items() + + +def test_get_requested_metrics_wgs(qc_requested_metrics): + """test extraction of the requested WGS metrics""" + + # GIVEN a sequencing type and a capture kit + seq_type = "wgs" + capture_kit = None + + # GIVEN the expected output + expected_output = { + "METRIC_1": {"condition": {"norm": "gt", "threshold": 1}}, + } + + # WHEN calling the function + requested_metrics = get_requested_metrics( + qc_requested_metrics, seq_type, capture_kit + ) + + # THEN check if the requested metrics are WGS specific + assert requested_metrics.items() == expected_output.items() + + +def test_get_multiqc_data_source(multiqc_data_path): + """test multiqc source extraction from multiqc_data.json analysis file""" + + # GIVEN input parameters and the multiqc data + sample = "concatenated_tumor_XXXXXX_R" + source_name_hs_metrics = "multiqc_picard_HsMetrics" + source_name_dup = "multiqc_picard_dups" + + with open(multiqc_data_path, "r") as f: + multiqc_data = json.load(f) + + # GIVEN an expected output + source_hs_metrics = "concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric" + source_dup = "concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt" + + # WHEN extracting the source of a specific sample and collection of metrics + out_source_hs_metrics = get_multiqc_data_source( + multiqc_data, sample, source_name_hs_metrics + ) + out_source_dup = get_multiqc_data_source(multiqc_data, sample, source_name_dup) + + # THEN check if the extracted source names correspond 
to the expected ones + assert source_hs_metrics == out_source_hs_metrics + assert source_dup == out_source_dup + + +def test_get_multiqc_metrics(multiqc_data_path, qc_extracted_metrics): + """test metrics retrieval from the multiqc_data.json file""" + + # GIVEN a sequencing type and a capture kit + seq_type = "targeted" + capture_kit = "lymphoma_6.1_hg19_design.bed" + + # WHEN calling the function + metrics = get_multiqc_metrics( + multiqc_data_path, + seq_type, + capture_kit, + ) + + # THEN check if the metrics are correctly retrieved + assert qc_extracted_metrics == metrics + + +def test_get_multiqc_metrics_filtering_umi(multiqc_data_path): + """tests that UMI data is filtered out when extracting metrics""" + + # GIVEN a sequencing type and a capture kit + seq_type = "targeted" + capture_kit = None + + # WHEN calling the function + metrics = get_multiqc_metrics( + multiqc_data_path, + seq_type, + capture_kit, + ) + + # THEN check if the UMI samples are filtered out + for metric in metrics: + assert "umi" not in metric["input"] + + +def test_extract_number_variants(): + """tests number of variants formatting""" + + # GIVEN a raw input list of variant metrics + counts = [ + "Number of samples: 2", + "Number of SNPs: 111", + "Number of INDELs: 14", + "Number of MNPs: 0", + "Number of sites: 125", + "", + ] + + # GIVEN an expected output after arranging the input list + expected_variants_metrics = { + "NUMBER_OF_SAMPLES": 2, + "NUMBER_OF_SNPS": 111, + "NUMBER_OF_INDELS": 14, + "NUMBER_OF_MNPS": 0, + "NUMBER_OF_SITES": 125, + } + + # WHEN performing the extraction of variant metrics + variant_metrics = extract_number_variants(counts) + + # THEN verify that the number of variants has been correctly retrieved + assert expected_variants_metrics == variant_metrics + + +def test_get_variant_metrics(bcftools_counts_path): + """tests variant metrics retrieval""" + + # GIVEN an SVDB bcftools counts path + + # GIVEN an expected MetricModel dictionary + expected_output_metrics = { + "header": None, + "id": "case", + "input": os.path.basename(bcftools_counts_path), + "name": "NUMBER_OF_SITES", + "step": "collect_custom_qc_metrics", + "value": 125, + "condition": {"norm": "lt", "threshold": 10000.0}, + } + + # WHEN extracting the number of variants + output_metrics = get_variant_metrics(bcftools_counts_path) + + # THEN check that the output metrics have been correctly shaped + assert expected_output_metrics == output_metrics[0] + + +def test_collect_qc_metrics_targeted(tmp_path, multiqc_data_path, cli_runner): + """tests qc metrics yaml file generation for targeted analysis""" + + # GIVEN the output and multiqc metrics paths + output_path = tmp_path / "sample_tumor_only_metrics_deliverables.yaml" + + # GIVEN a sequencing type and a capture kit + seq_type = "targeted" + capture_kit = "lymphoma_6.1_hg19_design.bed" + + # WHEN invoking the python script + result = cli_runner.invoke( + collect_qc_metrics, + [str(output_path), multiqc_data_path, seq_type, capture_kit], + ) + + # THEN check if the YAML is correctly created and there are no errors + assert result.exit_code == 0 + assert Path(output_path).exists() + + +def test_collect_qc_metrics_wgs(tmp_path, multiqc_data_path, cli_runner): + """tests qc metrics yaml file generation for wgs analysis""" + + # GIVEN the output and multiqc metrics paths + output_path = tmp_path / "sample_tumor_only_wgs_metrics_deliverables.yaml" + + # GIVEN a sequencing type and a capture kit + seq_type = "wgs" + capture_kit = "None" + + # WHEN invoking the python script + result = 
cli_runner.invoke( + collect_qc_metrics, + [str(output_path), multiqc_data_path, seq_type, capture_kit], + ) + + # THEN check if the YAML is correctly created and there are no errors + assert result.exit_code == 0 + assert Path(output_path).exists() + + +def test_collect_qc_metrics_counts( + tmp_path, multiqc_data_path, bcftools_counts_path, cli_runner +): + """tests qc metrics yaml file generation for targeted analysis and providing a bcftools counts path""" + + # GIVEN the output, multiqc metrics and bcftools counts paths + output_path = tmp_path / "sample_tumor_only_metrics_deliverables.yaml" + + # GIVEN a sequencing type and a capture kit + seq_type = "targeted" + capture_kit = "gmsmyeloid_5.2_hg19_design.bed" + + # WHEN invoking the python script + result = cli_runner.invoke( + collect_qc_metrics, + [ + str(output_path), + multiqc_data_path, + bcftools_counts_path, # multiple counts path regarding different variant callers + bcftools_counts_path, + bcftools_counts_path, + seq_type, + capture_kit, + ], + ) + + # THEN check if the YAML is correctly created and there are no errors + assert result.exit_code == 0 + assert Path(output_path).exists() diff --git a/tests/scripts/test_create_pdf.py b/tests/scripts/test_create_pdf.py index 849232bc9..d5cb85cf3 100644 --- a/tests/scripts/test_create_pdf.py +++ b/tests/scripts/test_create_pdf.py @@ -62,8 +62,6 @@ def test_create_pdf(tmp_path, cli_runner): # GIVEN the output path output_path = tmp_path / "ascat.output.pdf" - print(output_path) - # WHEN invoking the python script result = cli_runner.invoke( create_pdf, [str(output_path), statistics_path, plots_path[0], plots_path[1]] diff --git a/tests/test_data/BALSAMIC_env.yaml b/tests/test_data/BALSAMIC_env.yaml index b6172c820..0e9038004 100644 --- a/tests/test_data/BALSAMIC_env.yaml +++ b/tests/test_data/BALSAMIC_env.yaml @@ -1,6 +1,5 @@ D_BALSAMIC-py27_test: - python -- strelka - manta - bcftools - tabix @@ -21,6 +20,6 @@ D_BALSAMIC-py36_test: - ensembl-vep - cnvkit - cutadapt -- pindel - multiqc - bedtools +- svdb diff --git a/tests/test_data/config.json b/tests/test_data/config.json index a9e5ba822..3619e0789 100644 --- a/tests/test_data/config.json +++ b/tests/test_data/config.json @@ -24,36 +24,12 @@ "merged": "manta_germline.vcf.gz", "type": "SV" }, - "strelka_germline": { - "default": ["variants.vcf.gz", "germline.S1.vcf.gz"], - "mutation": "germline", - "merged": "strelka_germline.vcf.gz", - "type": "SNV" - }, - "strelka": { - "default": ["somatic.snvs.vcf.gz", "somatic.indels.vcf.gz"], - "mutation": "somatic", - "merged": "strelka.vcf.gz", - "type": "SNV" - }, "mutect": { "default": "mutect.vcf.gz", "mutation": "somatic", "merged": "mutect.vcf.gz", "type": "SNV" }, - "freebayes": { - "default": "freebayes.vcf.gz", - "mutation": "germline", - "merged": "freebayes.vcf.gz", - "type": "SNV" - }, - "haplotypecaller": { - "default": "haplotypecaller.vcf.gz", - "mutation": "germline", - "merged": "haplotypecaller.vcf.gz", - "type": "SNV" - }, "vardict": { "default": "vardict.vcf.gz", "mutation": "somatic", diff --git a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json index c1b9e8f05..b06d8b626 100755 --- a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json +++ b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json @@ -11,6 +11,14 @@ "DuplicationMetrics": { "concatenated_tumor_XXXXXX_R": "tests/test_data/qc_files/analysis/bam/concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt" } 
+ }, + "FastQC": { + "all_sections": { + "concatenated_tumor_XXXXXX_R_2": "tests/test_data/qc_files/analysis/fastqc/concatenated_tumor_XXXXXX_R_2_fastqc.zip", + "concatenated_normal_XXXXXX_R_1": "tests/test_data/qc_files/analysis/fastqc/concatenated_normal_XXXXXX_R_1_fastqc.zip", + "concatenated_normal_XXXXXX_R_2": "tests/test_data/qc_files/analysis/fastqc/concatenated_normal_XXXXXX_R_2_fastqc.zip", + "concatenated_tumor_XXXXXX_R_1": "tests/test_data/qc_files/analysis/fastqc/concatenated_tumor_XXXXXX_R_1_fastqc.zip" + } } }, "report_saved_raw_data": { @@ -27,7 +35,12 @@ "MIN_TARGET_COVERAGE": 0.0, "FOLD_80_BASE_PENALTY": 1.359189, "AT_DROPOUT": 6.093115, - "GC_DROPOUT": 0.027402 + "GC_DROPOUT": 0.027402, + "PCT_TARGET_BASES_50X": 1.0, + "PCT_TARGET_BASES_100X": 0.999987, + "PCT_TARGET_BASES_250X": 0.998445, + "PCT_TARGET_BASES_500X": 0.996675, + "PCT_TARGET_BASES_1000X": 0.992466 }, "concatenated_tumor_XXXXXX_R.consensusfiltered.umi": { "BAIT_SET": "concatenated_tumor_XXXXXX_R", @@ -41,7 +54,12 @@ "MIN_TARGET_COVERAGE": 0.0, "FOLD_80_BASE_PENALTY": 1.742114, "AT_DROPOUT": 12.048384, - "GC_DROPOUT": 0.150425 + "GC_DROPOUT": 0.150425, + "PCT_TARGET_BASES_50X": 0.999866, + "PCT_TARGET_BASES_100X": 0.99819, + "PCT_TARGET_BASES_250X": 0.996568, + "PCT_TARGET_BASES_500X": 0.994423, + "PCT_TARGET_BASES_1000X": 0.984181 } }, "multiqc_picard_insertSize": { @@ -66,6 +84,24 @@ "READ_PAIR_DUPLICATES": 18741892.0, "PERCENT_DUPLICATION": 0.391429 } + }, + "multiqc_general_stats": { + "concatenated_tumor_XXXXXX_R_2": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 15.03521942842923, + "FastQC_mqc-generalstats-fastqc-total_sequences": 600529762.0 + }, + "concatenated_normal_XXXXXX_R_1": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 14.426654287440797, + "FastQC_mqc-generalstats-fastqc-total_sequences": 464581551.0 + }, + "concatenated_normal_XXXXXX_R_2": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 14.214689357571501, + "FastQC_mqc-generalstats-fastqc-total_sequences": 464581551.0 + }, + "concatenated_tumor_XXXXXX_R_1": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 15.213739762327492, + "FastQC_mqc-generalstats-fastqc-total_sequences": 600529762.0 + } } } } diff --git a/tests/test_data/qc_files/analysis/qc/sample_tumor_normal_metrics_deliverables.yaml b/tests/test_data/qc_files/analysis/qc/sample_tumor_normal_metrics_deliverables.yaml new file mode 100644 index 000000000..91407b3be --- /dev/null +++ b/tests/test_data/qc_files/analysis/qc/sample_tumor_normal_metrics_deliverables.yaml @@ -0,0 +1,162 @@ +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_OFF_BAIT + step: multiqc_picard_HsMetrics + value: 0.161185 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: MEAN_TARGET_COVERAGE + step: multiqc_picard_HsMetrics + value: 636.23177 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: MEDIAN_TARGET_COVERAGE + step: multiqc_picard_HsMetrics + value: 597.0 + condition: + norm: gt + threshold: 500.0 +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: FOLD_80_BASE_PENALTY + step: multiqc_picard_HsMetrics + value: 1.469357 + condition: + norm: lt + threshold: 1.8 +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_50X + step: multiqc_picard_HsMetrics + value: 0.998388 + condition: null 
+- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_100X + step: multiqc_picard_HsMetrics + value: 0.99497 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_250X + step: multiqc_picard_HsMetrics + value: 0.965738 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_500X + step: multiqc_picard_HsMetrics + value: 0.679445 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_1000X + step: multiqc_picard_HsMetrics + value: 0.085208 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_OFF_BAIT + step: multiqc_picard_HsMetrics + value: 0.158226 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: MEAN_TARGET_COVERAGE + step: multiqc_picard_HsMetrics + value: 888.343586 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: MEDIAN_TARGET_COVERAGE + step: multiqc_picard_HsMetrics + value: 805.0 + condition: + norm: gt + threshold: 500.0 +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: FOLD_80_BASE_PENALTY + step: multiqc_picard_HsMetrics + value: 1.566744 + condition: + norm: lt + threshold: 1.8 +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_50X + step: multiqc_picard_HsMetrics + value: 0.998554 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_100X + step: multiqc_picard_HsMetrics + value: 0.997177 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_250X + step: multiqc_picard_HsMetrics + value: 0.979764 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_500X + step: multiqc_picard_HsMetrics + value: 0.874594 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_1000X + step: multiqc_picard_HsMetrics + value: 0.304354 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.insertsizemetric + name: MEAN_INSERT_SIZE + step: multiqc_picard_insertSize + value: 125.819455 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.insertsizemetric + name: MEAN_INSERT_SIZE + step: multiqc_picard_insertSize + value: 131.280203 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt + name: PERCENT_DUPLICATION + step: multiqc_picard_dups + value: 0.356228 + condition: null +- header: null + id: normal + input: concatenated_normal_XXXXXX_R.sorted.mrkdup.txt + name: PERCENT_DUPLICATION + step: multiqc_picard_dups + value: 0.255692 + condition: null diff --git a/tests/test_data/qc_files/analysis/qc/sample_tumor_only_metrics_deliverables.yaml b/tests/test_data/qc_files/analysis/qc/sample_tumor_only_metrics_deliverables.yaml new file mode 100644 index 000000000..a9a2def98 --- /dev/null +++ b/tests/test_data/qc_files/analysis/qc/sample_tumor_only_metrics_deliverables.yaml @@ -0,0 +1,81 @@ +- 
header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_OFF_BAIT + step: multiqc_picard_HsMetrics + value: 0.364546 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: MEAN_TARGET_COVERAGE + step: multiqc_picard_HsMetrics + value: 2314.698853 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: MEDIAN_TARGET_COVERAGE + step: multiqc_picard_HsMetrics + value: 2393.0 + condition: + norm: gt + threshold: 1000.0 +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: FOLD_80_BASE_PENALTY + step: multiqc_picard_HsMetrics + value: 1.359189 + condition: + norm: lt + threshold: 1.6 +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_50X + step: multiqc_picard_HsMetrics + value: 1.0 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_100X + step: multiqc_picard_HsMetrics + value: 0.999987 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_250X + step: multiqc_picard_HsMetrics + value: 0.998445 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_500X + step: multiqc_picard_HsMetrics + value: 0.996675 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric + name: PCT_TARGET_BASES_1000X + step: multiqc_picard_HsMetrics + value: 0.992466 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.insertsizemetric + name: MEAN_INSERT_SIZE + step: multiqc_picard_insertSize + value: 201.813054 + condition: null +- header: null + id: tumor + input: concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt + name: PERCENT_DUPLICATION + step: multiqc_picard_dups + value: 0.391429 + condition: null diff --git a/tests/test_data/qc_files/analysis/vep/SNV.somatic.case.svdb.all.filtered.pass.stats b/tests/test_data/qc_files/analysis/vep/SNV.somatic.case.svdb.all.filtered.pass.stats new file mode 100644 index 000000000..2a56b7719 --- /dev/null +++ b/tests/test_data/qc_files/analysis/vep/SNV.somatic.case.svdb.all.filtered.pass.stats @@ -0,0 +1,6 @@ +Number of samples: 2 +Number of SNPs: 111 +Number of INDELs: 14 +Number of MNPs: 0 +Number of others: 0 +Number of sites: 125 diff --git a/tests/test_data/references/genome/delly_mappability.gz b/tests/test_data/references/genome/delly_mappability.gz new file mode 100644 index 000000000..f7083a451 Binary files /dev/null and b/tests/test_data/references/genome/delly_mappability.gz differ diff --git a/tests/test_data/references/genome/delly_mappability.gz.fai b/tests/test_data/references/genome/delly_mappability.gz.fai new file mode 100644 index 000000000..af9ba0e09 --- /dev/null +++ b/tests/test_data/references/genome/delly_mappability.gz.fai @@ -0,0 +1,10 @@ +1 248956422 3 50 51 +2 242193529 253935557 50 51 +3 198295559 500972960 50 51 +4 190214555 703234434 50 51 +5 181538259 897253284 50 51 +6 170805979 1082422312 50 51 +7 159345973 1256644414 50 51 +8 145138636 1419177310 50 51 +9 138394717 1567218722 50 51 +10 133797422 1708381338 50 51 diff --git a/tests/test_data/references/genome/delly_mappability.gz.gzi b/tests/test_data/references/genome/delly_mappability.gz.gzi new file mode 100644 index 
000000000..076cb8ee7 Binary files /dev/null and b/tests/test_data/references/genome/delly_mappability.gz.gzi differ diff --git a/tests/test_data/references/panel/test_panel_ponn.cnn b/tests/test_data/references/panel/test_panel_ponn.cnn new file mode 100644 index 000000000..1a62208ab --- /dev/null +++ b/tests/test_data/references/panel/test_panel_ponn.cnn @@ -0,0 +1,6 @@ +chromosome start end gene log2 depth gc rmask spread +12 49445437 49445694 KMT2D 0.100978 1741.99 0.622568 0.0934444 +12 51204937 51204938 ATF1 0.175285 1145.69 0 0.1385 +12 52055418 52199527 Antitarget 0.146543 0.271576 0.412285 0 0.172095 +12 52345527 52345618 ACVR1B -0.84849 837.93 0.747253 0.335741 +12 52346118 52357667 Antitarget 0.321353 0.344461 0.424452 0 0.520977 diff --git a/tests/test_data/references/reference.json b/tests/test_data/references/reference.json index aa611fcc2..f851bee0a 100644 --- a/tests/test_data/references/reference.json +++ b/tests/test_data/references/reference.json @@ -16,6 +16,7 @@ "exon_bed": "tests/test_data/references/genome/refseq.flat.bed", "delly_exclusion": "tests/test_data/references/genome/delly_exclusion.tsv", "delly_exclusion_converted": "tests/test_data/references/genome/delly_exclusion_converted.tsv", + "delly_mappability": "tests/test_data/references/genome/delly_mappability.gz", "ascat_gccorrection": "tests/test_data/references/genome/GRCh37_SnpGcCorrections.tsv", "ascat_chryloci": "tests/test_data/references/genome/GRCh37_Y.loci", "clinvar": "tests/test_data/references/genome/clinvar.vcf.gz" diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 3ed867920..90c35117e 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -11,7 +11,8 @@ def test_workflow_tumor_normal( ): # GIVEN a sample config dict and snakefile workflow = "paired" - snakefile = get_snakefile(workflow) + reference_genome = "hg19" + snakefile = get_snakefile(workflow, reference_genome) config_json = tumor_normal_config # WHEN invoking snakemake module with dryrun option @@ -29,7 +30,8 @@ def test_workflow_tumor_normal( def test_workflow_tumor_only(tumor_only_config, sentieon_install_dir, sentieon_license): # GIVEN a sample config dict and snakefile workflow = "single" - snakefile = get_snakefile(workflow) + reference_genome = "hg19" + snakefile = get_snakefile(workflow, reference_genome) config_json = tumor_only_config # WHEN invoking snakemake module with dryrun option @@ -44,26 +46,66 @@ def test_workflow_tumor_only(tumor_only_config, sentieon_install_dir, sentieon_l assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) -def test_workflow_qc( - tumor_normal_config, tumor_only_config, sentieon_install_dir, sentieon_license -): +def test_workflow_qc_tumor_only(tumor_only_qc_config): + # GIVEN a sample config dict and snakefile - workflow = "qc" - snakefile = get_snakefile(workflow) + workflow = "qc_panel" + reference_genome = "hg19" + snakefile = get_snakefile(workflow, reference_genome) + config_json = tumor_only_qc_config + + # WHEN invoking snakemake module with dryrun option + # THEN it should return true + with mock.patch.dict( + MOCKED_OS_ENVIRON, + ): + assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) + + +def test_workflow_qc_tumor_only_canfam(tumor_only_qc_config): + + # GIVEN a sample config dict and snakefile + workflow = "qc_panel" + reference_genome = "canfam3" + snakefile = get_snakefile(workflow, reference_genome) + config_json = tumor_only_qc_config + + # WHEN invoking snakemake module with dryrun option + # THEN it 
should return true + with mock.patch.dict( + MOCKED_OS_ENVIRON, + ): + assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) + + +def test_workflow_qc_normal(tumor_normal_qc_config): + # GIVEN a sample config dict and snakefile + workflow = "qc_panel" + reference_genome = "hg19" + snakefile = get_snakefile(workflow, reference_genome) + config_json = tumor_normal_qc_config # WHEN invoking snakemake module with dryrun option # THEN it should return true with mock.patch.dict( MOCKED_OS_ENVIRON, - { - "SENTIEON_LICENSE": sentieon_license, - "SENTIEON_INSTALL_DIR": sentieon_install_dir, - }, ): - for config_json in (tumor_normal_config, tumor_only_config): - assert snakemake.snakemake( - snakefile, configfiles=[config_json], dryrun=True - ) + assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) + + +def test_workflow_qc_normal_canfam3(tumor_normal_qc_config): + # GIVEN a sample config dict and snakefile + workflow = "qc_panel" + reference_genome = "canfam3" + snakefile = get_snakefile(workflow, reference_genome) + config_json = tumor_normal_qc_config + + # WHEN invoking snakemake module with dryrun option + # THEN it should return true + with mock.patch.dict( + MOCKED_OS_ENVIRON, + ): + assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) def test_workflow_sentieon( @@ -74,7 +116,6 @@ def test_workflow_sentieon( ): # GIVEN a sample config dict and snakefile workflows = [("single", tumor_only_wgs_config), ("paired", tumor_normal_wgs_config)] - sequencing_type = "wgs" # WHEN invoking snakemake module with dryrun option # THEN it should return true @@ -88,5 +129,6 @@ def test_workflow_sentieon( for workflow in workflows: analysis_type = workflow[0] config = workflow[1] - snakefile = get_snakefile(analysis_type, sequencing_type) + reference_genome = "hg19" + snakefile = get_snakefile(analysis_type, reference_genome) assert snakemake.snakemake(snakefile, configfiles=[config], dryrun=True) diff --git a/tests/utils/test_models.py b/tests/utils/test_models.py index eb2be2cac..aa056e3f6 100644 --- a/tests/utils/test_models.py +++ b/tests/utils/test_models.py @@ -22,9 +22,9 @@ ParamsCommon, ParamsVardict, ParamsVEP, - QCMetricModel, - QCValidationModel, - DeliveryMetricModel, + MetricModel, + MetricConditionModel, + MetricValidationModel, ) @@ -354,6 +354,7 @@ def test_umiparams_tnscope(): "min_tumorLOD": 6, "error_rate": 5, "prunefactor": 3, + "padding": 30, "disable_detect": "abc", } @@ -367,6 +368,7 @@ def test_umiparams_tnscope(): assert test_tnscope_params_built.error_rate == 5 assert test_tnscope_params_built.prunefactor == 3 assert test_tnscope_params_built.disable_detect == "abc" + assert test_tnscope_params_built.padding == 30 def test_params_vardict(): @@ -403,116 +405,104 @@ def test_params_vep(): assert test_vep_built.vep_filters == "all defaults params" -def test_qc_metric_model_pass(qc_extracted_metrics): - """test QCMetricModel attribute parsing and positive validation""" +def test_metric_condition_model(): + """test MetricConditionModel attributes parsing""" # GIVEN input attributes - metric = qc_extracted_metrics["metrics"]["sample_1"][0] + metric_condition = {"norm": "gt", "threshold": 1} - # WHEN building the QC metric model - model = QCMetricModel(**metric) + # WHEN building the metric condition model + metrics_model = MetricConditionModel(**metric_condition) # THEN assert retrieved values from the created model - assert model.dict().items() == metric.items() + assert metrics_model.dict().items() == 

-def test_qc_metric_model_norm_fail(qc_extracted_metrics):
-    """test QCMetricModel ValueError raising for an operator that it is not accepted"""
+def test_metric_model_pass_validation():
+    """test MetricModel attributes parsing"""

-    # GIVEN incorrect input attributes
-    metric = copy.deepcopy(qc_extracted_metrics["metrics"]["sample_1"][0])
-    metric["norm"] = "higher"
+    # GIVEN input attributes
+    metrics = {
+        "header": None,
+        "id": "tumor",
+        "input": "concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric",
+        "name": "MEDIAN_TARGET_COVERAGE",
+        "step": "multiqc_picard_HsMetrics",
+        "value": 2393.0,
+        "condition": {"norm": "gt", "threshold": 1000.0},
+    }

-    # THEN model raises an error due to a non accepted norm
-    try:
-        QCMetricModel(**metric)
-    except KeyError as key_exc:
-        assert metric["norm"] in str(key_exc)
+    # WHEN building the metric model
+    metric_model = MetricModel(**metrics)

+    # THEN assert retrieved values from the created model
+    assert metric_model.dict().items() == metrics.items()

-def test_qc_metric_model_condition_fail(qc_extracted_metrics):
-    """test QCMetricModel for an overly restrictive metric condition"""

-    # GIVEN input attributes with a value that does not meet the filtering condition
-    metric = copy.deepcopy(qc_extracted_metrics["metrics"]["sample_1"][0])
-    metric["value"] = 10.0
+def test_metric_model_fail_validation():
+    """test MetricModel behaviour for an incorrect input"""

-    # THEN check that the model filters the metric according to its norm
-    with pytest.raises(ValueError) as val_exc:
-        QCMetricModel(**metric)
-    assert (
-        f"QC metric {metric['name']}: {metric['value']} validation has failed. "
-        f"(Condition: {metric['norm']} {metric['threshold']})" in str(val_exc.value)
-    )
+    # GIVEN a non-accepted input
+    invalid_input = {"header": None, "id": "tumor"}
+
+    # THEN the model raises an error due to an incomplete input
+    with pytest.raises(ValueError) as input_exc:
+        MetricModel(**invalid_input)
+    assert "field required" in str(input_exc.value)


-def test_qc_validation_model_pass(qc_extracted_metrics):
-    """test QCValidationModel attribute parsing and validation"""
+def test_metric_validation_model_pass(qc_extracted_metrics):
+    """test MetricValidationModel attribute parsing and positive validation"""

-    # WHEN building the QC validation model
-    model = QCValidationModel(**qc_extracted_metrics)
+    # WHEN building the MetricValidationModel
+    model = MetricValidationModel(metrics=qc_extracted_metrics)

     # THEN assert retrieved values from the created model
-    assert model.dict().items() == qc_extracted_metrics.items()
+    assert model.dict()["metrics"] == qc_extracted_metrics


-def test_qc_validation_model_condition_fail(qc_extracted_metrics):
-    """test QCValidationModel for multiple metrics with failing conditions"""
+def test_metric_validation_model_fail(qc_extracted_metrics):
+    """test MetricValidationModel for an overly restrictive metric condition"""

-    # GIVEN input attributes that does not meet the specified conditions
+    # GIVEN input attributes with a value that does not meet the filtering condition
     metrics = copy.deepcopy(qc_extracted_metrics)
-    metrics["metrics"]["sample_1"][0]["value"] = 10.0
-    metrics["metrics"]["sample_2"][0]["value"] = 10.0
+    metrics[3]["value"] = 2.0

-    # THEN check that the model filters the metrics according to its norm
+    # THEN check that the model filters the metric according to its norm
     with pytest.raises(ValueError) as val_exc:
-        QCValidationModel(**metrics)
-    assert "2 validation errors for QCValidationModel" in str(val_exc.value)
-
-
-def test_qc_validation_model_get_json(qc_extracted_metrics):
-    """test metric-value json extraction and metric filtering for passing conditions"""
-
-    # GIVEN expected output
-    output_metrics = {
-        "sample_1": {"MEAN_INSERT_SIZE_1": 0.5, "MEAN_INSERT_SIZE_2": 0.5},
-        "sample_2": {"MEAN_INSERT_SIZE_1": 0.5},
-    }
-
-    # WHEN building the QC validation model
-    validation_model = QCValidationModel(**qc_extracted_metrics)
-
-    # THEN check if the extracted metrics and its structure meets the expected one
-    assert validation_model.get_json.items() == output_metrics.items()
-
+        MetricValidationModel(metrics=metrics)
+    assert (
+        f"QC metric {metrics[3]['name']}: {metrics[3]['value']} validation has failed. "
+        f"(Condition: {metrics[3]['condition']['norm']} {metrics[3]['condition']['threshold']}, ID: {metrics[3]['id']})"
+        in str(val_exc.value)
+    )


-def test_delivery_metric_model_pass_validation():
-    """test DeliveryMetricModel attributes parsing"""
-    # GIVEN input attributes
-    metrics = {
-        "header": None,
-        "id": "005",
-        "input": "S1_005.sorted.mrkdup.txt",
-        "name": "MEAN_INSERT_SIZE",
-        "step": "multiqc_rule",
-        "value": 0.5,
-    }
+def test_multiple_metric_validation_model_fail(qc_extracted_metrics):
+    """test MetricValidationModel for multiple metrics with failing conditions"""

-    # WHEN building the delivery metric model
-    metrics_model = DeliveryMetricModel(**metrics)
+    # GIVEN input attributes that do not meet the specified conditions
+    metrics = copy.deepcopy(qc_extracted_metrics)
+    metrics[2]["value"] = 999.0
+    metrics[3]["value"] = 2

-    # THEN assert retrieved values from the created model
-    assert metrics_model.dict().items() == metrics.items()
+    # THEN check that the model filters the metrics according to its norm
+    with pytest.raises(ValueError) as val_exc:
+        MetricValidationModel(metrics=metrics)
+    assert "2 validation errors for MetricValidationModel" in str(val_exc.value)
+    assert metrics[2]["name"] in str(val_exc.value)
+    assert metrics[3]["name"] in str(val_exc.value)


-def test_delivery_metric_model_fail_validation():
-    """test DeliveryMetricModel behaviour for an incorrect input"""
+def test_metric_validation_model_norm_fail(qc_extracted_metrics):
+    """test MetricValidationModel ValueError raising for an operator that is not accepted"""

-    # GIVEN a non accepted input
-    invalid_input = {"name": "MEAN_INSERT_SIZE"}
+    # GIVEN a metric with an incorrect norm attribute
+    metrics = copy.deepcopy(qc_extracted_metrics)
+    metrics[3]["condition"]["norm"] = "lower"

-    # THEN the model raises an error due to an incomplete input
-    with pytest.raises(ValueError) as input_exc:
-        DeliveryMetricModel(**invalid_input)
-        assert f"field required" in str(input_exc.value)
+    # THEN the model raises an error due to a non-accepted norm
+    try:
+        MetricValidationModel(metrics=metrics)
+    except KeyError as key_exc:
+        assert metrics[3]["condition"]["norm"] in str(key_exc)
diff --git a/tests/utils/test_qc_metrics.py b/tests/utils/test_qc_metrics.py
index bdfdc23a3..46657301e 100644
--- a/tests/utils/test_qc_metrics.py
+++ b/tests/utils/test_qc_metrics.py
@@ -1,254 +1,11 @@
-import json
-import os
+from BALSAMIC.utils.qc_metrics import validate_qc_metrics

-from pydantic import ValidationError

-from BALSAMIC.utils.qc_metrics import (
-    get_qc_metrics_json,
-    read_metrics,
-    update_metrics_dict,
-    get_qc_metrics_dict,
-    get_qc_available_panel_beds,
-    merge_dicts,
-    get_multiqc_data_source,
-    extract_metrics_for_delivery,
-)
-
-
-def test_get_qc_available_panel_beds(qc_raw_targeted_metrics):
-    """test extraction of the panel beds available for QC validation"""
-
-    # GIVEN an expected output
-    expected_output = ["panel_1.bed", "panel_2.bed"]
-
-    # WHEN calling the function
-    available_panel_beds = get_qc_available_panel_beds(qc_raw_targeted_metrics)
-
-    # THEN check if the extracted bed file names correspond to the expected ones
-    assert available_panel_beds == expected_output
-
-
-def test_merge_dicts(qc_raw_targeted_metrics):
-    """test dictionary merging and requirements overwriting by panel BED specific conditions"""
-
-    # GIVEN an expected output
-    expected_output = {
-        "metrics_1.json": {"METRIC_1": 0.5, "METRIC_2": 0.2, "METRIC_4": 0.4},
-        "metrics_2.json": {"METRIC_3": 0.3},
-    }
-
-    # WHEN calling the function
-    merged_dict = merge_dicts(
-        qc_raw_targeted_metrics["default"],
-        qc_raw_targeted_metrics["panel_2.bed"],
-    )
-
-    # THEN check if the extracted output meets the merged dictionary
-    assert merged_dict.items() == expected_output.items()
-
-
-def test_read_metrics(analysis_path):
-    """test metric extraction from a specific QC file"""
-
-    # GIVEN a QC file name
-    file_name = "multiqc_picard_dups.json"
-
-    # GIVEN an expected output
-    expected_output = {
-        "concatenated_tumor_XXXXXX_R": {
-            "LIBRARY": "Unknown Library",
-            "UNPAIRED_READS_EXAMINED": 11860.0,
-            "READ_PAIRS_EXAMINED": 20440841.0,
-            "SECONDARY_OR_SUPPLEMENTARY_RDS": 4333388.0,
-            "UNMAPPED_READS": 19824.0,
-            "UNPAIRED_READ_DUPLICATES": 10178.0,
-            "READ_PAIR_DUPLICATES": 14680829.0,
-            "READ_PAIR_OPTICAL_DUPLICATES": 0.0,
-            "PERCENT_DUPLICATION": 0.718251,
-            "ESTIMATED_LIBRARY_SIZE": 5951948.0,
-        }
-    }
-
-    # WHEN calling the function
-    raw_metrics = read_metrics(analysis_path, file_name)
-
-    # THEN check if the extracted metrics correspond to the expected ones
-    assert raw_metrics.items() == expected_output.items()
-
-
-def test_update_metrics_dict(qc_extracted_metrics):
-    """test adding metrics to a nested dictionary"""
-
-    # GIVEN input parameters
-    sample_id = "sample_"
-    metric = ["MEAN_INSERT_SIZE", {"condition": {"norm": "lt", "threshold": 1.0}}]
-    value = 0.5
-
-    # WHEN adding a metric to an empty dictionary
-    metric[0] = "MEAN_INSERT_SIZE_1"
-    m_dict = update_metrics_dict(sample_id + "1", metric, value, {})
-
-    # WHEN appending a metric to an already created dictionary
-    metric[0] = "MEAN_INSERT_SIZE_2"
-    m_dict = update_metrics_dict(sample_id + "1", metric, value, m_dict)
-
-    # WHEN appending a metric from another sample to a dictionary
-    metric[0] = "MEAN_INSERT_SIZE_1"
-    m_dict = update_metrics_dict(sample_id + "2", metric, value, m_dict)
-
-    # THEN check if the dictionary is updated correctly
-    assert m_dict.items() == qc_extracted_metrics["metrics"].items()
-
-
-def test_get_qc_metrics_dict(analysis_path, qc_metrics):
-    """test QC metric extraction and its structure"""
-
-    # GIVEN a sequencing type
-    seq_type = "targeted"
-
-    # GIVEN an expected output
-    expected_output = {
-        "concatenated_tumor": [
-            {
-                "name": "MEAN_INSERT_SIZE",
-                "norm": None,
-                "threshold": None,
-                "value": 74.182602,
-            },
-            {
-                "name": "MEDIAN_TARGET_COVERAGE",
-                "norm": "gt",
-                "threshold": 500.0,
-                "value": 461.0,
-            },
-        ]
-    }
-
-    # WHEN calling the function
-    metrics_dict = get_qc_metrics_dict(analysis_path, qc_metrics["qc"][seq_type])
-
-    # THEN check if the extracted metrics and its structure meets the expected one
-    assert metrics_dict.items() == expected_output.items()
-
-
-def test_get_qc_metrics_json_wgs(analysis_path):
-    """test JSON object generation for a WGS run"""
-
-    # GIVEN a sequencing type
-    seq_type = "wgs"
-    capture_kit = None
-
-    # GIVEN retrieved WGS metrics
-    output_metrics = {"concatenated_tumor": {"FOLD_80_BASE_PENALTY": 1.238604}}
-
-    # WHEN calling the function
-    qc_metrics = get_qc_metrics_json(analysis_path, seq_type, capture_kit)
-
-    # THEN check if the obtained metrics are WGS specific
-    assert qc_metrics.items() == output_metrics.items()
-
-
-def test_get_qc_metrics_json_targeted(analysis_path):
-    """test JSON object generation for a custom bed file"""
-
-    # GIVEN a sequencing type
-    seq_type = "targeted"
-    capture_kit = "lymphoma_6.1_hg19_design.bed"
-
-    # THEN check if the obtained metrics are following the panel bed specific requirements
-    try:
-        get_qc_metrics_json(analysis_path, seq_type, capture_kit)
-    except ValidationError as val_err:
-        assert (
-            "2 validation errors for QCValidationModel" in str(val_err)
-            and "MEDIAN_TARGET_COVERAGE" in str(val_err)
-            and "FOLD_80_BASE_PENALTY" in str(val_err)
-        )
-
-
-def test_get_multiqc_data_source(analysis_path):
-    """test multiQC source extraction from multiqc_data.json analysis file"""
-
-    # GIVEN input parameters
-    sample = "concatenated_tumor_XXXXXX_R"
-    source_name_hs_metrics = "multiqc_picard_HsMetrics"
-    source_name_dup = "multiqc_picard_dups"
-
-    with open(
-        os.path.join(analysis_path, "qc", "multiqc_data", "multiqc_data.json"), "r"
-    ) as f:
-        raw_data = json.load(f)
-
-    # GIVEN an expected output
-    source_hs_metrics = "concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric"
-    source_dup = "concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt"
-
-    # WHEN extracting the source of a specific sample and collection of metrics
-    out_source_hs_metrics = get_multiqc_data_source(
-        raw_data, sample, source_name_hs_metrics
-    )
-    out_source_dup = get_multiqc_data_source(raw_data, sample, source_name_dup)
-
-    # THEN check if the extracted source names correspond to the expected ones
-    assert source_hs_metrics == out_source_hs_metrics
-    assert source_dup == out_source_dup
-
-
-def test_extract_metrics_for_delivery(analysis_path):
-    """test output metrics retrieving"""
-
-    # GIVEN a sequencing type
-    seq_type = "targeted"
-
-    # GIVEN an expected output
-    n_metrics = 6  # Number of expected metric
-
-    hs_metric = {
-        "header": None,
-        "id": "tumor",
-        "input": "concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric",
-        "name": "PCT_OFF_BAIT",
-        "step": "multiqc_picard_HsMetrics",
-        "value": 0.364546,
-    }
-
-    ins_size_metric = {
-        "header": None,
-        "id": "tumor",
-        "input": "concatenated_tumor_XXXXXX_R.sorted.insertsizemetric",
-        "name": "MEAN_INSERT_SIZE",
-        "step": "multiqc_picard_insertSize",
-        "value": 201.813054,
-    }
-
-    dups_metric = {
-        "header": None,
-        "id": "tumor",
-        "input": "concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt",
-        "name": "PERCENT_DUPLICATION",
-        "step": "multiqc_picard_dups",
-        "value": 0.391429,
-    }
-
-    # WHEN calling the function
-    metrics = extract_metrics_for_delivery(analysis_path, seq_type)
-
-    # THEN check if the metrics are correctly retrieved
-    assert len(metrics) == n_metrics
-    assert (
-        hs_metric in metrics and ins_size_metric in metrics and dups_metric in metrics
-    )
-
-
-def test_extract_metrics_for_delivery_filtering_umi(analysis_path):
-    """test umi discarding when extracting metrics"""
-
-    # GIVEN a sequencing type
-    seq_type = "targeted"
+def test_validate_qc_metrics(qc_extracted_metrics):
+    """test QC metric validation"""

     # WHEN calling the function
-    metrics = extract_metrics_for_delivery(analysis_path, seq_type)
+    validated_metrics_pass = validate_qc_metrics(qc_extracted_metrics)
-    # THEN check if the umi samples are filtered out
-    for metric in metrics:
-        assert "umi" not in metric["input"]
+    # THEN check if the obtained metrics are correctly parsed and validated
+    assert validated_metrics_pass == qc_extracted_metrics
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
index 5bb12da65..92afad384 100644
--- a/tests/utils/test_utils.py
+++ b/tests/utils/test_utils.py
@@ -1,4 +1,5 @@
 import json
+import os
 import subprocess
 import pytest
 import sys
@@ -43,6 +44,9 @@
     check_executable,
     job_id_dump_to_yaml,
     generate_h5,
+    get_md5,
+    create_md5,
+    read_yaml,
 )

 from BALSAMIC.utils.rule import (
@@ -56,7 +60,9 @@
     get_threads,
     get_delivery_id,
     get_reference_output_files,
+    get_rule_output,
 )
+from tests.helpers import Map


 def test_get_variant_callers_wrong_analysis_type(tumor_normal_config):
@@ -184,7 +190,7 @@ def test_get_bioinfo_tools_version():

     # THEN assert it is a dictionary and versions are correct
     assert isinstance(bioinfo_tools_dict, dict)
-    assert set(observed_versions).issubset(set(["1.12", "1.11", "1.9"]))
+    assert set(observed_versions).issubset(set(["1.15.1", "1.12", "1.11", "1.9"]))


 def test_get_delivery_id():
@@ -349,28 +355,36 @@ def test_get_snakefile():
         ("paired", "targeted"),
         ("single", "wgs"),
         ("single", "targeted"),
-        ("qc", ""),
+        ("qc_panel", "targeted"),
         ("generate_ref", ""),
         ("pon", ""),
     ]

     # WHEN asking to see snakefile for paired
-    for analysis_type, sequencing_type in workflow:
-        snakefile = get_snakefile(analysis_type, sequencing_type)
-        pipeline = ""
-
-        if sequencing_type in ["targeted", "wgs", "qc"]:
-            pipeline = "BALSAMIC/workflows/balsamic.smk"
-        elif analysis_type == "generate_ref":
-            pipeline = "BALSAMIC/workflows/reference.smk"
-        elif analysis_type == "pon":
-            pipeline = "BALSAMIC/workflows/PON.smk"
-
-        # THEN it should return the snakefile path
-        # THEN assert file exists
-        assert snakefile.startswith("/")
-        assert pipeline in snakefile
-        assert Path(snakefile).is_file()
+    for reference_genome in ["hg19", "hg38", "canfam3"]:
+        for analysis_type, sequencing_type in workflow:
+            snakefile = get_snakefile(analysis_type, reference_genome)
+
+            pipeline = ""
+            if sequencing_type in ["targeted", "wgs"] and analysis_type in [
+                "single",
+                "paired",
+            ]:
+                pipeline = "BALSAMIC/workflows/balsamic.smk"
+            elif analysis_type == "generate_ref" and reference_genome != "canfam3":
+                pipeline = "BALSAMIC/workflows/reference.smk"
+            elif analysis_type == "generate_ref" and reference_genome == "canfam3":
+                pipeline = "BALSAMIC/workflows/reference-canfam3.smk"
+            elif analysis_type == "pon":
+                pipeline = "BALSAMIC/workflows/PON.smk"
+            elif "qc" in analysis_type:
+                pipeline = "BALSAMIC/workflows/QC.smk"
+
+            # THEN it should return the snakefile path
+            # THEN assert file exists
+            assert snakefile.startswith("/")
+            assert pipeline in snakefile
+            assert Path(snakefile).is_file()


 def test_get_chrom(config_files):
@@ -543,6 +557,65 @@ def test_write_json_error(tmp_path):
         assert write_json(ref_json, output_json)


+def test_read_yaml(metrics_yaml_path):
+    """test data extraction from a saved YAML file"""
+
+    # GIVEN an expected output
+    n_metrics = 11  # Number of expected metrics
+
+    hs_metric = {
+        "header": None,
+        "id": "tumor",
+        "input": "concatenated_tumor_XXXXXX_R.sorted.mrkdup.hsmetric",
+        "name": "MEDIAN_TARGET_COVERAGE",
+        "step": "multiqc_picard_HsMetrics",
+        "value": 2393.0,
+        "condition": {"norm": "gt", "threshold": 1000.0},
+    }
+
+    ins_size_metric = {
+        "header": None,
+        "id": "tumor",
+        "input": "concatenated_tumor_XXXXXX_R.sorted.insertsizemetric",
+        "name": "MEAN_INSERT_SIZE",
+        "step": "multiqc_picard_insertSize",
+        "value": 201.813054,
+        "condition": None,
+    }
+
+    dups_metric = {
+        "header": None,
+        "id": "tumor",
+        "input": "concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt",
+        "name": "PERCENT_DUPLICATION",
+        "step": "multiqc_picard_dups",
+        "value": 0.391429,
+        "condition": None,
+    }
+
+    # WHEN calling the function
+    requested_metrics = read_yaml(metrics_yaml_path)
+
+    # THEN check if the data are correctly retrieved from the YAML
+    assert len(requested_metrics) == n_metrics
+    assert hs_metric in requested_metrics
+    assert ins_size_metric in requested_metrics
+    assert dups_metric in requested_metrics
+
+
+def test_read_yaml_error():
+    """test data extraction from an incorrect YAML path"""
+
+    # GIVEN an invalid path
+    yaml_path = "NOT_A_PATH"
+
+    # THEN assert that the FileNotFoundError is raised
+    try:
+        read_yaml(yaml_path)
+    except FileNotFoundError as file_exc:
+        assert f"The YAML file {yaml_path} was not found." in str(file_exc)
+
+
 def test_get_threads(config_files):
     # GIVEN cluster config file and rule name
     cluster_config = json.load(open(config_files["cluster_json"], "r"))
@@ -940,3 +1013,67 @@ def test_generate_h5_capture_no_output(tmp_path):
     actual_output = generate_h5(dummy_job_name, dummy_job_id, dummy_path)

     assert actual_output == None
+
+
+def test_get_md5(tmp_path):
+
+    # GIVEN a dummy file
+    dummy_dir = tmp_path / "md5"
+    dummy_dir.mkdir()
+    dummy_file = dummy_dir / "dummy_file.dump"
+    dummy_file.write_text("Awesome Text")
+
+    # THEN the returned md5 should be the expected checksum
+    assert get_md5(dummy_file) == "3945B39E"
+
+
+def test_create_md5(tmp_path):
+
+    # GIVEN a path to an md5 file and reference dummy files
+    ref_dir = tmp_path / "references"
+    ref_dir.mkdir()
+    dummy_ref_file1 = ref_dir / "reference_file1.dump"
+    dummy_ref_file1.write_text("Test reference1")
+    dummy_ref_file2 = ref_dir / "reference_file2.dump"
+    dummy_ref_file2.write_text("Test reference2")
+    dummy_reference_dict = {
+        "reference_dummy1": str(dummy_ref_file1),
+        "reference_dummy2": str(dummy_ref_file2),
+    }
+    dummy_dir = tmp_path / "md5"
+    dummy_dir.mkdir()
+    dummy_file = dummy_dir / "dummy_file.dump"
+
+    create_md5(dummy_reference_dict, dummy_file)
+
+    # THEN the md5 file exists
+    assert dummy_file.exists()
+
+
+def test_get_rule_output(snakemake_fastqc_rule):
+    """Tests retrieval of existing output files from a specific workflow"""
+
+    # GIVEN a snakemake fastqc rule object, a rule name and the associated wildcards
+    rules = snakemake_fastqc_rule
+    rule_name = "fastqc"
+    output_file_wildcards = {
+        "sample": ["concatenated_tumor_XXXXXX_R", "tumor", "normal"],
+        "case_name": "sample_tumor_only",
+    }
+
+    # WHEN retrieving the output files
+    output_files = get_rule_output(rules, rule_name, output_file_wildcards)
+
+    # THEN check that the fastq files have been picked up by the function and that the tags have been correctly created
+    assert len(output_files) == 2
+    for file in output_files:
+        # Expected file names
+        assert (
+            os.path.basename(file[0]) == "concatenated_tumor_XXXXXX_R_1.fastq.gz"
+            or os.path.basename(file[0]) == "concatenated_tumor_XXXXXX_R_2.fastq.gz"
+        )
+        # Expected tags
+        assert (
+            file[3] == "1,fastqc,quality-trimmed-seq-fastqc"
+            or file[3] == "2,fastqc,quality-trimmed-seq-fastqc"
+        )