diff --git a/BALSAMIC/assets/scripts/igh_dux4_detection.sh b/BALSAMIC/assets/scripts/igh_dux4_detection.sh new file mode 100644 index 000000000..1bb39cf21 --- /dev/null +++ b/BALSAMIC/assets/scripts/igh_dux4_detection.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +# Check if at least 3 arguments are provided +if [ "$#" -lt 3 ]; then + echo "Usage: $0 [normal_bam]" + exit 1 +fi + +# Assign variables +genome_version="$1" +output_vcf="$2" +tumor_bam="$3" +normal_bam="$4" +output_vcf_tmp=$(echo $output_vcf | sed 's/.gz$//;') + +# Print given arguments +echo "genome_version: $genome_version" +echo "output vcf: $output_vcf" +echo "tumor bam: $tumor_bam" +echo "normal bam: $normal_bam" + +# Set chr positions depending on the genome version +if [ "$genome_version" = "hg19" ]; then + igh_chr="14" + igh_pos="106032614" + dux4_chr="4" + dux4_pos="190988100" +elif [ "$genome_version" = "hg38" ]; then + igh_chr="14" + igh_pos="105586437" + dux4_chr="4" + dux4_pos="190173000" +else + echo "Invalid genome version. Accepted values: hg19, hg38. Given: $genome_version" + exit 1 +fi + + +# Define functions +get_supporting_reads() { + # Get number of supporting reads for IGH::DUX4 rearrangement in a given BAM file + local bam="$1" + if [ "$genome_version" = "hg19" ]; then + local supporting_reads=$(samtools view -F 1024 -c \ + -e '(rnext == "4" && pnext > 190988100 && pnext < 191007000) || (rnext == "10" && pnext > 135477000 && pnext < 135500000) || (rnext == "GL000228.1" && pnext > 70000 && pnext < 115000) || ([SA] =~ "10,1354[789][0-9]{4}") || ([SA] =~ "4,19(09[8-9][0-9]|100[0-7])[0-9]{3}" || [SA] =~ "GL000228.1,([7-9][0-9]{4}|1[0-1][0-5][0-9]{3})")' \ + $bam 14:106032614-107288051 ) + elif [ "$genome_version" = "hg38" ]; then + local supporting_reads=$(samtools view -F 1024 -c \ + -e '(rnext == "4" && pnext > 190173000 && pnext < 190176000) || ([SA] =~ "4,19017[345][0-9]{3}")' \ + $bam chr14:105586437-106879844 ) + fi + echo $supporting_reads +} + +# Set information for tumor and normal +supporting_reads_tumor=$(get_supporting_reads $tumor_bam) +samples_header="TUMOR" +samples_field="${supporting_reads_tumor}" +if [ -n "$normal_bam" ]; then + supporting_reads_normal=$(get_supporting_reads $normal_bam) + samples_header="NORMAL\tTUMOR" + samples_field="${supporting_reads_normal}\t${supporting_reads_tumor}" +fi + + +# If supporting reads are found in the tumor, set filter to PASS. Otherwise add: no_supporting_reads +if [ "$supporting_reads_tumor" -gt 0 ]; then + vcf_filter="PASS" +else + vcf_filter="no_supporting_reads" +fi + +echo "supporting reads tumor: $supporting_reads_tumor" +echo "supporting reads normal: $supporting_reads_normal" +echo "vcf filter: $vcf_filter" + +# Write vcf entry +{ + echo '##fileformat=VCFv4.2' + echo '##ALT=' + echo '##INFO=' + echo '##INFO=' + echo '##FORMAT=' + echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t${samples_header}" + echo -e "${igh_chr}\t${igh_pos}\tsamtools_igh_dux4\tN\tN[${dux4_chr}:${dux4_pos}[\t.\t${vcf_filter}\tSVTYPE=BND;IMPRECISE;\tDV\t${samples_field}" +} >> $output_vcf_tmp + +bgzip $output_vcf_tmp +tabix -p vcf $output_vcf diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py index 03e934b6d..0c4444bbc 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -21,6 +21,7 @@ OPTION_CASE_ID, OPTION_CLINICAL_SNV_OBSERVATIONS, OPTION_CLINICAL_SV_OBSERVATIONS, + OPTION_EXOME, OPTION_FASTQ_PATH, OPTION_GENDER, OPTION_GENOME_INTERVAL, @@ -70,6 +71,7 @@ @OPTION_CASE_ID @OPTION_CLINICAL_SNV_OBSERVATIONS @OPTION_CLINICAL_SV_OBSERVATIONS +@OPTION_EXOME @OPTION_FASTQ_PATH @OPTION_GENDER @OPTION_GENOME_VERSION @@ -101,6 +103,7 @@ def case_config( case_id: str, clinical_snv_observations: Path, clinical_sv_observations: Path, + exome: bool, fastq_path: Path, gender: Gender, genome_version: GenomeVersion, @@ -219,6 +222,7 @@ def case_config( container_conda_env_path=CONTAINERS_DIR, ), panel={ + "exome": exome, "capture_kit": panel_bed, "chrom": get_panel_chrom(panel_bed), "pon_cnn": pon_cnn, diff --git a/BALSAMIC/commands/options.py b/BALSAMIC/commands/options.py index 575cdae9b..41a4bdec1 100644 --- a/BALSAMIC/commands/options.py +++ b/BALSAMIC/commands/options.py @@ -22,7 +22,7 @@ from BALSAMIC.constants.constants import LOG_LEVELS, LogLevel from BALSAMIC.constants.rules import DELIVERY_RULES from BALSAMIC.constants.workflow_params import VCF_DICT -from BALSAMIC.utils.cli import validate_cache_version +from BALSAMIC.utils.cli import validate_cache_version, validate_exome_option OPTION_ADAPTER_TRIM = click.option( "--adapter-trim/--no-adapter-trim", @@ -189,6 +189,14 @@ help="Enable dragen variant caller", ) +OPTION_EXOME = click.option( + "--exome", + is_flag=True, + default=False, + help="Assign exome parameters to TGA workflow", + callback=validate_exome_option, +) + OPTION_FASTQ_PATH = click.option( "--fastq-path", type=click.Path(exists=True, resolve_path=True), diff --git a/BALSAMIC/constants/cluster_analysis.json b/BALSAMIC/constants/cluster_analysis.json index df3b90de4..3f6d1a5a3 100644 --- a/BALSAMIC/constants/cluster_analysis.json +++ b/BALSAMIC/constants/cluster_analysis.json @@ -411,5 +411,9 @@ "samtools_qc": { "time": "04:00:00", "n": 16 + }, + "igh_dux4_detection": { + "time": "02:00:00", + "n": 1 } } diff --git a/BALSAMIC/constants/variant_filters.py b/BALSAMIC/constants/variant_filters.py index df8fe2d40..8c3a22535 100644 --- a/BALSAMIC/constants/variant_filters.py +++ b/BALSAMIC/constants/variant_filters.py @@ -10,16 +10,8 @@ "description": "General purpose filters used for filtering any variant caller", } -# Configuration of VARDICT settings -VARDICT_SETTINGS = { - "AD": {"tag_value": 5, "filter_name": "balsamic_low_tumor_ad", "field": "INFO"}, - "DP": { - "tag_value": 100, - "filter_name": "balsamic_low_tumor_dp", - "field": "INFO", - }, - "MQ": {"tag_value": 40, "filter_name": "balsamic_low_mq", "field": "INFO"}, - "AF_min": {"tag_value": 0.007, "filter_name": "balsamic_low_af", "field": "INFO"}, +# Configuration of common VARDICT settings +VARDICT_SETTINGS_COMMON = { "pop_freq": { "tag_value": 0.005, "filter_name": "balsamic_high_pop_freq", @@ -35,12 +27,35 @@ "filter_name": "Frq", "field": "INFO", }, + "MQ": {"tag_value": 30, "filter_name": "balsamic_low_mq", "field": "INFO"}, + "AF_min": {"tag_value": 0.007, "filter_name": "balsamic_low_af", "field": "INFO"}, + "AD": {"tag_value": 5, "filter_name": "balsamic_low_tumor_ad", "field": "INFO"}, "varcaller_name": "VarDict", "filter_type": "general", - "analysis_type": "tumor_only", + "analysis_type": "tumor_only,tumor_normal", "description": "General purpose filters used for filtering VarDict", } +# Configuration of VARDICT settings for smaller panels +VARDICT_SETTINGS_PANEL = { + **VARDICT_SETTINGS_COMMON, + "DP": { + "tag_value": 100, + "filter_name": "balsamic_low_tumor_dp", + "field": "INFO", + }, +} + +# Configuration of VARDICT settings for exomes +VARDICT_SETTINGS_EXOME = { + **VARDICT_SETTINGS_COMMON, + "DP": { + "tag_value": 20, + "filter_name": "balsamic_low_tumor_dp", + "field": "INFO", + }, +} + # Configuration for SENTIEON settings: SENTIEON_VARCALL_SETTINGS = { "AD": {"tag_value": 3, "filter_name": "balsamic_low_tumor_ad", "field": "FORMAT"}, @@ -50,6 +65,11 @@ "field": "FORMAT", }, "AF_min": {"tag_value": 0.05, "filter_name": "balsamic_low_af", "field": "FORMAT"}, + "high_normal_tumor_af_frac": { + "tag_value": 0.3, + "filter_name": "high_normal_tumor_af_frac", + "field": "FORMAT", + }, "pop_freq": { "tag_value": 0.001, "filter_name": "balsamic_high_pop_freq", diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index 396c92e73..0bf5156c3 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -99,6 +99,13 @@ "sequencing_type": ["wgs"], "workflow_solution": ["BALSAMIC"], }, + "igh_dux4": { + "mutation": "somatic", + "mutation_type": "SV", + "analysis_type": ["single", "paired"], + "sequencing_type": ["wgs"], + "workflow_solution": ["BALSAMIC"], + }, "svdb": { "mutation": "somatic", "mutation_type": "SV", diff --git a/BALSAMIC/models/config.py b/BALSAMIC/models/config.py index 7df595883..1c17c8ec3 100644 --- a/BALSAMIC/models/config.py +++ b/BALSAMIC/models/config.py @@ -49,6 +49,7 @@ class SampleInstanceModel(BaseModel): class PanelModel(BaseModel): """Holds attributes of PANEL BED file if provided Attributes: + exome: (bool); optional parameter for targeted analyses to use exome parameters capture_kit : Field(str(Path)); string representation of path to PANEL BED file chrom : Field(list(str)); list of chromosomes in PANEL BED pon_cnn: Field(optional); Path where PON reference .cnn file is stored @@ -59,6 +60,7 @@ class PanelModel(BaseModel): """ + exome: Optional[bool] = False capture_kit: Annotated[Optional[str], AfterValidator(is_file)] = None chrom: Optional[List[str]] = None pon_cnn: Annotated[Optional[str], AfterValidator(is_file)] = None @@ -102,6 +104,7 @@ class VCFModel(BaseModel): dellycnv: VarcallerAttribute tiddit: VarcallerAttribute cnvpytor: VarcallerAttribute + igh_dux4: VarcallerAttribute svdb: VarcallerAttribute diff --git a/BALSAMIC/models/params.py b/BALSAMIC/models/params.py index f7bbbe99b..4b4849c01 100644 --- a/BALSAMIC/models/params.py +++ b/BALSAMIC/models/params.py @@ -218,6 +218,7 @@ class VarCallerFilter(BaseModel): Attributes: AD: VCFAttributes (required); minimum allelic depth AF_min: VCFAttributes (optional); minimum allelic fraction + high_normal_tumor_af_frac: VCFAttributes (optional); maximum normal allele frequency / tumor allele frequency MQ: VCFAttributes (optional); minimum mapping quality DP: VCFAttributes (optional); minimum read depth pop_freq: VCFAttributes (optional); maximum gnomad allele frequency @@ -238,6 +239,7 @@ class VarCallerFilter(BaseModel): AD: Optional[VCFAttributes] = None AF_min: Optional[VCFAttributes] = None + high_normal_tumor_af_frac: Optional[VCFAttributes] = None MQ: Optional[VCFAttributes] = None DP: Optional[VCFAttributes] = None pop_freq: Optional[VCFAttributes] = None diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_quality_filter.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_quality_filter.rule index 96459c0cd..04d5d6948 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_quality_filter.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_quality_filter.rule @@ -58,6 +58,8 @@ elif config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["anal AD = [SENTIEON_CALLER.AD.tag_value, SENTIEON_CALLER.AD.filter_name], DP = [SENTIEON_CALLER.DP.tag_value, SENTIEON_CALLER.DP.filter_name], AF_min = [SENTIEON_CALLER.AF_min.tag_value, SENTIEON_CALLER.AF_min.filter_name], + high_normal_tumor_af_frac_filter_name=SENTIEON_CALLER.high_normal_tumor_af_frac.filter_name, + high_normal_tumor_af_frac_value=SENTIEON_CALLER.high_normal_tumor_af_frac.tag_value, case_name = config["analysis"]["case_id"], threads: get_threads(cluster_config, 'bcftools_quality_filter_tnscope_tumor_normal') @@ -65,10 +67,12 @@ elif config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["anal "Quality filtering WGS tumor-normal tnscope variants using bcftools for {params.case_name}" shell: """ -bcftools view -f PASS,triallelic_site {input.vcf_snv} \ +bcftools view {input.vcf_snv} \ | bcftools filter --threads {threads} --include 'SUM(FORMAT/AD[0:0]+FORMAT/AD[0:1]) >= {params.DP[0]} || SUM(FORMAT/AD[1:0]+FORMAT/AD[1:1]) >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' \ | bcftools filter --threads {threads} --include 'FORMAT/AD[0:1] >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' \ | bcftools filter --threads {threads} --include 'FORMAT/AF[0] >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' \ +| bcftools annotate -x FILTER/alt_allele_in_normal \ +| bcftools filter --threads {threads} --exclude 'sum(FORMAT/AF[1])/sum(FORMAT/AF[0])>{params.high_normal_tumor_af_frac_value}' --soft-filter '{params.high_normal_tumor_af_frac_filter_name}' --mode '+' \ | bcftools view -f PASS,triallelic_site -O z -o {output.vcf_snv_research}; tabix -p vcf -f {output.vcf_snv_research}; @@ -179,14 +183,18 @@ tabix -p vcf -f {output.vcf_filtered}; Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: case_name = config["analysis"]["case_id"], + high_normal_tumor_af_frac_filter_name=SENTIEON_CALLER.high_normal_tumor_af_frac.filter_name, + high_normal_tumor_af_frac_value=SENTIEON_CALLER.high_normal_tumor_af_frac.tag_value, threads: get_threads(cluster_config,'bcftools_quality_filter_TNscope_umi_tumor_normal') message: "Quality filtering TNscope_umi tumor-normal annotated variants using bcftools for {params.case_name} " shell: """ -bcftools view {input.vcf} | \ -bcftools view -f PASS,triallelic_site -o {output.vcf_filtered} -O z; +bcftools view {input.vcf} \ +| bcftools annotate -x FILTER/alt_allele_in_normal \ +| bcftools filter --threads {threads} --exclude 'sum(FORMAT/AF[1])/sum(FORMAT/AF[0])>{params.high_normal_tumor_af_frac_value}' --soft-filter '{params.high_normal_tumor_af_frac_filter_name}' --mode '+' \ +| bcftools view -f PASS,triallelic_site -o {output.vcf_filtered} -O z; tabix -p vcf -f {output.vcf_filtered}; """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule index 6e18d05fb..50bd6cc9a 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule @@ -402,6 +402,33 @@ else: rm {input.ascat_cnv}; """ + +rule igh_dux4_detection_tumor_normal: + input: + fa = config["reference"]["reference_genome"], + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name=normal_sample), + output: + vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".igh_dux4.vcf.gz", + benchmark: + benchmark_dir + 'igh_dux4_detection_tumor_normal_' + config["analysis"]["case_id"] + ".tsv" + singularity: + Path(singularity_image, config["bioinfo_tools"].get("samtools") + ".sif").as_posix() + params: + genome_version = config["reference"]["genome_version"], + custom_sv_detection_script = get_script_path("igh_dux4_detection.sh"), + case_name = config["analysis"]["case_id"], + threads: + get_threads(cluster_config, "igh_dux4_detection") + message: + "Detecting IGH::DUX4 rearrangement for {params.case_name} using samtools." + shell: + """ + bash {params.custom_sv_detection_script} {params.genome_version} {output.vcf} {input.bamT} {input.bamN} + """ + + + rule svdb_merge_tumor_normal: input: vcf = expand( diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule index fa2bc7c9b..a0e032b95 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule @@ -288,6 +288,31 @@ tabix -p vcf -f {output.delly_sv}; tabix -p vcf -f {output.delly_cnv}; """ + +rule igh_dux4_detection_tumor_only: + input: + fa = config["reference"]["reference_genome"], + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample) + output: + vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".igh_dux4.vcf.gz", + benchmark: + benchmark_dir + 'igh_dux4_detection_tumor_only_' + config["analysis"]["case_id"] + ".tsv" + singularity: + Path(singularity_image, config["bioinfo_tools"].get("samtools") + ".sif").as_posix() + params: + genome_version = config["reference"]["genome_version"], + custom_sv_detection_script = get_script_path("igh_dux4_detection.sh"), + case_name = config["analysis"]["case_id"], + threads: + get_threads(cluster_config, "igh_dux4_detection") + message: + "Detecting IGH::DUX4 rearrangement for {params.case_name} using samtools." + shell: + """ + bash {params.custom_sv_detection_script} {params.genome_version} {output.vcf} {input.bamT} + """ + + rule svdb_merge_tumor_only: input: vcf = expand( diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule index f8ba2a2e1..1f6734c76 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule @@ -32,7 +32,7 @@ mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx90G\"'; -vardict-java -U -u -I 600 -G {input.fa} -f {params.af} -N {params.case_name} \ +vardict-java -I 600 -G {input.fa} -f {params.af} -N {params.case_name} \ -b \"{input.bamT}|{input.bamN}\" \ -th {threads} \ {params.col_info} {input.bed} \ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule index cf8ea7148..b701dc276 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule @@ -31,7 +31,7 @@ mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx45G\"'; -vardict-java -u -I 600 \ +vardict-java -I 600 \ -G {input.fa} \ -f {params.af} \ -N {params.case_name} \ diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index 11616e197..6b66c70e8 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -488,6 +488,15 @@ def get_analysis_fastq_files_directory(case_dir: str, fastq_path: str) -> str: return Path(fastq_path).as_posix() +def validate_exome_option(ctx: click.Context, _param: click.Parameter, exome: bool): + """Validate that a panel-bed has been supplied together with the exome option.""" + if exome and not ctx.params.get("panel_bed"): + raise click.BadParameter( + "If --exome is provided, --panel-bed must also be provided." + ) + return exome + + def validate_cache_version( _ctx: click.Context, _param: click.Parameter, version: str ) -> str: diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 57c414ca1..49a5f34e0 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -16,7 +16,9 @@ from BALSAMIC.constants.variant_filters import ( COMMON_SETTINGS, SENTIEON_VARCALL_SETTINGS, SVDB_FILTER_SETTINGS, - VARDICT_SETTINGS, + VARDICT_SETTINGS_PANEL, + VARDICT_SETTINGS_EXOME, + VARDICT_SETTINGS_COMMON, MANTA_FILTER_SETTINGS, ) from BALSAMIC.constants.workflow_params import VARCALL_PARAMS, WORKFLOW_PARAMS, SLEEP_BEFORE_START @@ -115,7 +117,13 @@ else: # Varcaller filter settings COMMON_FILTERS = VarCallerFilter.model_validate(COMMON_SETTINGS) -VARDICT = VarCallerFilter.model_validate(VARDICT_SETTINGS) + +# Set VarDict settings depending on if panel is exome or not +VARDICT = VarCallerFilter.model_validate(VARDICT_SETTINGS_PANEL) +if config_model.panel and config_model.panel.exome: + VARDICT = VarCallerFilter.model_validate(VARDICT_SETTINGS_EXOME) + + SENTIEON_CALLER = VarCallerFilter.model_validate(SENTIEON_VARCALL_SETTINGS) SVDB_FILTERS = VarCallerFilter.model_validate(SVDB_FILTER_SETTINGS) MANTA_FILTERS = VarCallerFilter.model_validate(MANTA_FILTER_SETTINGS) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 275707231..4d840e100 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,27 @@ +[15.0.0] +------- + +Added: +^^^^^^ +* high_normal_tumor_af_frac filter in bcftools for TNscope T+N filtering out more than 30% TINC https://github.com/Clinical-Genomics/BALSAMIC/pull/1289 +* New option for exome samples `--exome` with modified bcftools filters compared to standard targeted workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1414 +* Custom samtools script for the detection of IGH::DUX4 rearrangements https://github.com/Clinical-Genomics/BALSAMIC/pull/1397 + +Changed: +^^^^^^^^ +* Reduced stringency of minimum MQ for all TGA to 30 from 40 https://github.com/Clinical-Genomics/BALSAMIC/pull/1414 +* Removed -u flag from VarDict T+N and T only rules to remove calling only in reverse reads of overlapping mates https://github.com/Clinical-Genomics/BALSAMIC/pull/1414 +* Removed -U flag to VarDict T+N rule to start calling SVs https://github.com/Clinical-Genomics/BALSAMIC/pull/1414 + +Removed: +^^^^^^^^ +* alt_allele_in_normal filter from TNscope T+N workflows https://github.com/Clinical-Genomics/BALSAMIC/pull/1289 + +Fixed: +^^^^^^ +* initial filter keeping only PASS or triallelic-site from T+N bcftools quality filter rule has been removed https://github.com/Clinical-Genomics/BALSAMIC/pull/1424 + + [14.0.1] ------- @@ -36,6 +60,7 @@ Fixed: ^^^^^^ * Missing `__init__.py` in `snakemake_rules` folders https://github.com/Clinical-Genomics/BALSAMIC/pull/1383 + [13.0.0] ------- diff --git a/docs/balsamic_filters.rst b/docs/balsamic_filters.rst index 0e2b89bae..676b1a671 100644 --- a/docs/balsamic_filters.rst +++ b/docs/balsamic_filters.rst @@ -2,7 +2,7 @@ Calling and filtering variants *********************************** -In BALSAMIC, various bioinfo tools are integrated for reporting somatic and germline variants summarized in the table below. The choice of these tools differs between the type of analysis, `Target Genome Analysis (TGA)` or analysis of `Whole Genome Sequencing (WGS)`. +In BALSAMIC, various bioinfo tools are integrated for reporting somatic and germline variants summarized in the table below. The choice of these tools differs between the type of analysis; `Whole Genome Sequencing (WGS)`, or `Target Genome Analysis (TGA)` and `Target Genome Analysis (TGA) with UMI-analysis activated`. .. list-table:: SNV and small-Indel callers @@ -20,12 +20,7 @@ In BALSAMIC, various bioinfo tools are integrated for reporting somatic and germ - germline - SNV, InDel * - TNscope - - WGS - - tumor-normal, tumor-only - - somatic - - SNV, InDel - * - TNScope_umi - - TGA + - WGS, TGA (with UMIs activated) - tumor-normal, tumor-only - somatic - SNV, InDel @@ -109,15 +104,21 @@ Somatic Callers for reporting SNVs/INDELS The results of `Vardict` variant calling are further post-filtered based on several criteria (`Vardict_filtering`_) to retrieve high-confidence variant calls. These high-confidence variant calls are the final list of variants uploaded to Scout or available in the delivered VCF file in Caesar. + +There are two slightly different post-processing filters activated depending on if the sample is an exome or a smaller panel as these tend to have very different sequencing depths. + **Vardict_filtering** ^^^^^^^^^^^^^^^^^^^^^^ + Following is the set of criteria applied for filtering vardict results. It is used for both tumor-normal and tumor-only samples. +**Post-call Quality Filters for panels** + *Mean Mapping Quality (MQ)*: Refers to the root mean square (RMS) mapping quality of all the reads spanning the given variant site. :: - MQ >= 40 + MQ >= 30 *Total Depth (DP)*: Refers to the overall read depth supporting the called variant. @@ -136,12 +137,44 @@ Following is the set of criteria applied for filtering vardict results. It is us :: Minimum AF >= 0.007 - Maximum AF < 1 + +**Post-call Quality Filters for exomes** + + +*Mean Mapping Quality (MQ)*: Refers to the root mean square (RMS) mapping quality of all the reads spanning the given variant site. + +:: + + MQ >= 30 + +*Total Depth (DP)*: Refers to the overall read depth supporting the called variant. + +:: + + DP >= 20 + +*Variant depth (VD)*: Total reads supporting the ALT allele + +:: + + VD >= 5 + +*Allelic Frequency (AF)*: Fraction of the reads supporting the alternate allele + +:: + + Minimum AF >= 0.007 + +**Note:** +**Additionally, the variant is excluded for tumor-normal cases if marked as 'germline' in the `STATUS` column of the VCF file.** **Attention:** **BALSAMIC <= v8.2.7 uses minimum AF 1% (0.01). From Balsamic v8.2.8, minimum VAF is changed to 0.7% (0.007)** +**Post-call Observation database Filters** + + *GNOMADAF_POPMAX*: Maximum Allele Frequency across populations :: @@ -160,46 +193,73 @@ Following is the set of criteria applied for filtering vardict results. It is us Frq <= 0.01 (or) Frq == "." -**Note:** -**Additionally, the variant is excluded for tumor-normal cases if marked as 'germline' in the `STATUS` column of the VCF file.** -**Whole Genome Sequencing (WGS)** -********************************** +**Target Genome Analysis with UMI's into account** +************************************************** **Sentieon's TNscope** ======================= +`UMI workflow `_ performs the variant calling of SNVs/INDELS using the `TNscope` algorithm from UMI consensus-called reads. +The following filter applies for both tumor-normal and tumor-only samples. -BALSAMIC utilizes the `TNscope` algorithm for calling somatic SNVs and INDELS in WGS samples. -The `TNscope `_ algorithm performs the somatic variant calling on the tumor-normal or the tumor-only samples, using a Haplotyper algorithm. +**Pre-call Filters** -**TNscope filtering (Tumor_normal)** -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following filters are applied to the variants in TNscope raw VCF file (`SNV.somatic..tnscope.all.vcf.gz`). The variants scored as `PASS` or `triallelic_sites` are included in the final vcf file (`SNV.somatic..tnscope..filtered.pass.vcf.gz`). +*minreads*: Filtering of consensus called reads based on the minimum reads supporting each UMI tag group -*Total Depth (DP)*: Refers to the overall read depth from all target samples supporting the variant call +:: + + minreads = 3,1,1 + +It means that at least `3` read-pairs need to support the UMI-group (based on the UMI-tag and the aligned genomic positions), and with at least 1 read-pair from each strand (F1R2 and F2R1). + +*min_init_tumor_lod*: Initial Log odds for the that the variant exists in the tumor. :: - DP(tumor) >= 10 (or) DP(normal) >= 10 + min_init_tumor_lod = 0.5 -*Allelic Depth (AD)*: Total reads supporting the ALT allele in the tumor sample +*min_tumor_lod*: Minimum log odds in the final call of variant in the tumor. :: - AD(tumor) >= 3 + min_tumor_lod = 4.0 -*Allelic Frequency (AF)*: Fraction of the reads supporting the alternate allele +*min_tumor_allele_frac*: Set the minimum tumor AF to be considered as potential variant site. :: - Minimum AF(tumor) >= 0.05 - Maximum AF(tumor) < 1 + min_tumor_allele_frac = 0.0005 + +*interval_padding*: Adding an extra 100bp to each end of the target region in the bed file before variant calling. + +:: + + interval_padding = 100 + +**Post-call Quality Filters** + +*alt_allele_in_normal*: Default filter set by TNscope was considered too stringent in filtering tumor in normal and is removed. + +:: + + bcftools annotate -x FILTER/alt_allele_in_normal + + +*Relative tumor AF in normal*: Allows for maximum Tumor-In-Normal-Contamination of 30%. + +:: + + excludes variant if: AF(normal) / AF(tumor) > 0.3 + +**Post-call Observation database Filters** *GNOMADAF_POPMAX*: Maximum Allele Frequency across populations :: - GNOMADAF_popmax <= 0.001 (or) GNOMADAF_popmax == "." + GNOMADAF_popmax <= 0.02 (or) GNOMADAF_popmax == "." + +*SWEGENAF*: SweGen Allele Frequency :: @@ -211,38 +271,94 @@ The following filters are applied to the variants in TNscope raw VCF file (`SNV. Frq <= 0.01 (or) Frq == "." -**TNscope filtering (tumor_only)** -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The somatic variants in TNscope raw VCF file (`SNV.somatic..tnscope.all.vcf.gz`) are filtered out for the genomic regions that are not reliable (eg: centromeric regions, non-chromosome contigs) to enhance the computation time. This WGS interval region file is collected from gatk_bundles ``_ -and following filters are applied. The variants scored as `PASS` or `triallelic_sites` are included in the final vcf file (`SNV.somatic..tnscope..filtered.pass.vcf.gz`). +The variants scored as `PASS` or `triallelic_sites` are included in the final vcf file (`SNV.somatic..tnscope..filtered.pass.vcf.gz`). -*Total Depth (DP)*: Refers to the overall read depth supporting the variant call +**Whole Genome Sequencing (WGS)** +********************************** + +**Sentieon's TNscope** +======================= + +BALSAMIC utilizes the `TNscope` algorithm for calling somatic SNVs and INDELS in WGS samples. +The `TNscope `_ algorithm performs the somatic variant calling on the tumor-normal or the tumor-only samples. + +**TNscope filtering (Tumor_normal)** +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Pre-call Filters** + +*Apply TNscope trained MachineLearning Model*: Sets MLrejected on variants with ML_PROB below 0.32. :: + ML model: SentieonTNscopeModel_GiAB_HighAF_LowFP-201711.05.model is applied - DP(tumor) >= 10 +*min_init_tumor_lod*: Initial Log odds for the that the variant exists in the tumor. + +:: + + min_init_tumor_lod = 1 + + +*min_tumor_lod*: Minimum log odds in the final call of variant in the tumor. + +:: + + min_tumor_lod = 8 + + +*min_init_normal_lod*: Initial Log odds for the that the variant exists in the normal. + +:: + + min_init_normal_lod = 0.5 + + +*min_normal_lod*: Minimum log odds in the final call of variant in the normal. + +:: + + min_normal_lod = 1 + +**Post-call Quality Filters** + +*Total Depth (DP)*: Refers to the overall read depth from all target samples supporting the variant call + +:: + + DP(tumor) >= 10 (or) DP(normal) >= 10 *Allelic Depth (AD)*: Total reads supporting the ALT allele in the tumor sample :: - AD(tumor) > 3 + AD(tumor) >= 3 *Allelic Frequency (AF)*: Fraction of the reads supporting the alternate allele :: - Minimum AF(tumor) > 0.05 - Maximum AF(tumor) < 1 + Minimum AF(tumor) >= 0.05 -*GNOMADAF_POPMAX*: Maximum Allele Frequency across populations +*alt_allele_in_normal*: Default filter set by TNscope was considered too stringent in filtering tumor in normal and is removed. :: - GNOMADAF_popmax <= 0.001 (or) GNOMADAF_popmax == "." + bcftools annotate -x FILTER/alt_allele_in_normal -*Normalized base quality scores*: The sum of base quality scores for each allele (QSS) is divided by the allelic depth of alt and ref alleles (AD) +*Relative tumor AF in normal*: Allows for maximum Tumor-In-Normal-Contamination of 30%. + +:: + + excludes variant if: AF(normal) / AF(tumor) > 0.3 + +**Post-call Observation database Filters** + +*GNOMADAF_POPMAX*: Maximum Allele Frequency across populations + +:: + + GNOMADAF_popmax <= 0.001 (or) GNOMADAF_popmax == "." :: @@ -254,75 +370,76 @@ and following filters are applied. The variants scored as `PASS` or `triallelic_ Frq <= 0.01 (or) Frq == "." -:: +The variants scored as `PASS` or `triallelic_sites` are included in the final vcf file (`SNV.somatic..tnscope..filtered.pass.vcf.gz`). - SUM(QSS)/SUM(AD) >= 20 +**TNscope filtering (tumor_only)** +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -*Read Counts*: Count of reads in a given (F1R2, F2R1) pair orientation supporting the alternate allele and reference alleles +**Pre-call Filters** + +*min_init_tumor_lod*: Initial Log odds for the that the variant exists in the tumor. :: - ALT_F1R2 > 0, ALT_F2R1 > 0 - REF_F1R2 > 0, REF_F2R1 > 0 + min_init_tumor_lod = 1 -*SOR*: Symmetric Odds Ratio of 2x2 contingency table to detect strand bias + +*min_tumor_lod*: Minimum log odds in the final call of variant in the tumor. :: - SOR < 3 + min_tumor_lod = 8 -**Target Genome Analysis with UMI's into account** -************************************************** +The somatic variants in TNscope raw VCF file (`SNV.somatic..tnscope.all.vcf.gz`) are filtered out for the genomic regions that are not reliable (eg: centromeric regions, non-chromosome contigs) to enhance the computation time. This WGS interval region file is collected from gatk_bundles ``_. -**Sentieon's TNscope** -======================= -`UMI workflow `_ performs the variant calling of SNVs/INDELS using the `TNscope` algorithm from UMI consensus-called reads. -The following filter applies for both tumor-normal and tumor-only samples. +**Post-call Quality Filters** -**Pre-call Filters** -*minreads*: Filtering of consensus called reads based on the minimum reads supporting each UMI tag group +*Total Depth (DP)*: Refers to the overall read depth supporting the variant call :: - minreads = 3,1,1 - -It means that at least `3` UMI tag groups should be ideally considered from both DNA strands, where a minimum of at least `1` UMI tag group should exist in each of the single-stranded consensus reads. + DP(tumor) >= 10 -*min_init_tumor_lod*: Log odds is the likelihood that the candidate mutation is real over the likelihood that the candidate mutation is a sequencing error before any read-based filters are applied. -Minimum log-odds for the candidate selection. TNscope default: `4`. In our UMI-workflow we reduced this setting to `0.5` +*Allelic Depth (AD)*: Total reads supporting the ALT allele in the tumor sample :: - min_init_tumor_lod = 0.5 + AD(tumor) > 3 -*min_tumor_lod*: minimum log odds in the final call of variants. TNscope default: `6.3`. In our UMI-workflow we reduced this setting to `4.0` +*Allelic Frequency (AF)*: Fraction of the reads supporting the alternate allele :: - min_tumor_lod = 4.0 + Minimum AF(tumor) > 0.05 -*min_tumor_allele_frac*: Set the minimum tumor AF to be considered as potential variant site. +:: + + SUM(QSS)/SUM(AD) >= 20 + +*Read Counts*: Count of reads in a given (F1R2, F2R1) pair orientation supporting the alternate allele and reference alleles :: - min_tumor_allele_frac = 0.0005 + ALT_F1R2 > 0, ALT_F2R1 > 0 + REF_F1R2 > 0, REF_F2R1 > 0 -*interval_padding*: Adding an extra 100bp to each end of the target region in the bed file before variant calling. +*SOR*: Symmetric Odds Ratio of 2x2 contingency table to detect strand bias :: - interval_padding = 100 + SOR < 3 -**Post-call Filters** +**Post-call Observation database Filters** *GNOMADAF_POPMAX*: Maximum Allele Frequency across populations :: - GNOMADAF_popmax <= 0.02 (or) GNOMADAF_popmax == "." + GNOMADAF_popmax <= 0.001 (or) GNOMADAF_popmax == "." -*SWEGENAF*: SweGen Allele Frequency + +*Normalized base quality scores*: The sum of base quality scores for each allele (QSS) is divided by the allelic depth of alt and ref alleles (AD) :: @@ -334,9 +451,11 @@ Minimum log-odds for the candidate selection. TNscope default: `4`. In our UMI-w Frq <= 0.01 (or) Frq == "." +The variants scored as `PASS` or `triallelic_sites` are included in the final vcf file (`SNV.somatic..tnscope..filtered.pass.vcf.gz`). **Attention:** **BALSAMIC <= v8.2.10 uses GNOMAD_popmax <= 0.005. From Balsamic v9.0.0, this settings is changed to 0.02, to reduce the stringency.** **BALSAMIC >= v11.0.0 removes unmapped reads from the bam and cram files for all the workflows.** +**BALSAMIC >= v13.0.0 keeps unmapped reads in bam and cram files for all the workflows.** diff --git a/docs/balsamic_sv_cnv.rst b/docs/balsamic_sv_cnv.rst index c32a8b2e8..f3598792b 100644 --- a/docs/balsamic_sv_cnv.rst +++ b/docs/balsamic_sv_cnv.rst @@ -44,11 +44,21 @@ Depending on the sequencing type, BALSAMIC is currently running the following st - tumor-only - somatic - CNV + * - igh_dux4 (see note below) + - WGS + - tumor-normal, tumor-only + - somatic + - SV Further details about a specific caller can be found in the links for the repositories containing the documentation for SV and CNV callers along with the links for the articles are listed in `bioinfo softwares `_. +Note that igh_dux4 is not a variant caller itself. This is a custom script that uses samtools to detect read pairs supporting IGH::DUX4 rearrangements. In short, the command identifies discordant reads mapping to the IGH region and to either DUX4 or its homologous DUX4-like regions (see references for details). The inclusion of this feature aims to alleviate the failure of callers to detect this rearrangement. It is important to note, however, that the reported breakpoints are fixed to the IGH and DUX4 coordinates and are, therefore, imprecise and uncertain. Therefore, we advise caution when interpreting this information. + + It is mandatory to provide the gender of the sample from BALSAMIC version >= 10.0.0 For CNV analysis. + + **Pre-merge Filtrations** ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -82,6 +92,9 @@ Manta calls are filtered using bcftools to only keep variants that have evidence * - Manta - low_pr_sr_count - SUM(FORMAT/PR[0:1]+FORMAT/SR[0:1]) < 4.0 + * - igh_dux4 + - samtools_igh_dux4 + - DV < 1 Further information regarding the TIDDIT tumor normal filtration: As translocation variants are represented by 2 BNDs in the VCF which allows for mixed assignment of soft-filters, a requirement for assigning soft-filters to translocations is that neither BND is PASS. @@ -117,12 +130,13 @@ Further information regarding the TIDDIT tumor normal filtration: As translocati | 3. ascat | 4. dellycnv | 5. tiddit + | 6. igh_dux4 - | 1. manta | 2. dellysv | 3. dellycnv | 4. tiddit | 5. cnvpytor - + | 6. igh_dux4 The merged `SNV.somatic..svdb.vcf.gz` file retains all the information for the variants from the caller in which the variants are identified, which are then annotated using `ensembl-vep`. diff --git a/docs/resources.rst b/docs/resources.rst index 90ec6eb04..5fe527ad0 100644 --- a/docs/resources.rst +++ b/docs/resources.rst @@ -58,6 +58,8 @@ Relevant publications #. **Two case studies and a pipeline (unpublished)**: Noll, A. C., Miller, N. A., Smith, L. D., Yoo, B., Fiedler, S., Cooley, L. D., … Kingsmore, S. F. (2016). Clinical detection of deletion structural variants in whole-genome sequences. Npj Genomic Medicine, 1(1), 16026. https://doi.org/10.1038/npjgenmed.2016.26 #. **Review on driver gene methods**: Tokheim, C. J., Papadopoulos, N., Kinzler, K. W., Vogelstein, B., & Karchin, R. (2016). Evaluating the evaluation of cancer driver genes. Proceedings of the National Academy of Sciences, 113(50), 14330–14335. https://doi.org/10.1073/pnas.1616440113 +#. **Detection of IGH::DUX4 rearrangement**: Rezayee, F., Eisfeldt, J., Skaftason, A., Öfverholm, I., Sayyab, S., Syvänen, A. C., … & Barbany, G. (2023). Feasibility to use whole-genome sequencing as a sole diagnostic method to detect genomic aberrations in pediatric B-cell acute lymphoblastic leukemia. Frontiers in Oncology, 13, 1217712. https://doi.org/10.3389/fonc.2023.1217712 + *Resource, or general notable papers including resource and KB papers related to cancer genomics* #. **GIAB**: Zook, J. M., Catoe, D., McDaniel, J., Vang, L., Spies, N., Sidow, A., … Salit, M. (2016). Extensive sequencing of seven human genomes to characterize benchmark reference materials. Scientific Data, 3, 160025. https://doi.org/10.1038/sdata.2016.25 diff --git a/tests/commands/config/test_config_sample.py b/tests/commands/config/test_config_sample.py index 728a5156e..9185e7afa 100644 --- a/tests/commands/config/test_config_sample.py +++ b/tests/commands/config/test_config_sample.py @@ -458,8 +458,83 @@ def test_config_with_gens_arguments_for_tga( panel_bed_file, ], ) - # THEN a config should be created and exist + # THEN config should fail with error message assert result.exit_code == 2 assert ( "GENS is currently not compatible with TGA analysis, only WGS." in result.output ) + + +def test_config_wgs_with_exome( + invoke_cli, + tumor_sample_name: str, + analysis_dir: str, + balsamic_cache: str, + fastq_dir_tumor_only: str, + case_id_tumor_only: str, +): + """Test balsamic config case with --exome argument for WGS.""" + + # GIVEN CLI arguments including optional GENS input-files + + # WHEN invoking the config case command + result = invoke_cli( + [ + "config", + "case", + "--case-id", + case_id_tumor_only, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_tumor_only, + "--balsamic-cache", + balsamic_cache, + "--tumor-sample-name", + tumor_sample_name, + "--exome", + ], + ) + # THEN config should fail with error message + assert result.exit_code == 2 + assert "If --exome is provided, --panel-bed must also be provided." in result.output + + +def test_config_tga_with_exome( + invoke_cli, + tumor_sample_name: str, + analysis_dir: str, + balsamic_cache: str, + fastq_dir_tumor_only: str, + case_id_tumor_only: str, + panel_bed_file: str, +): + """Test balsamic config case with GENS arguments for TGA.""" + + # GIVEN CLI arguments including optional GENS input-files + + # WHEN invoking the config case command + result = invoke_cli( + [ + "config", + "case", + "--case-id", + case_id_tumor_only, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_tumor_only, + "--balsamic-cache", + balsamic_cache, + "--tumor-sample-name", + tumor_sample_name, + "-p", + panel_bed_file, + "--exome", + ], + ) + # THEN a config should be created and exist + assert result.exit_code == 0 + assert Path( + analysis_dir, case_id_tumor_only, f"{case_id_tumor_only}.{FileType.JSON}" + ).exists() diff --git a/tests/test_data/config.json b/tests/test_data/config.json index 507aa8aef..d2b815d73 100644 --- a/tests/test_data/config.json +++ b/tests/test_data/config.json @@ -181,6 +181,20 @@ "BALSAMIC" ] }, + "igh_dux4": { + "mutation": "somatic", + "mutation_type": "SV", + "analysis_type": [ + "single", + "paired" + ], + "sequencing_type": [ + "wgs" + ], + "workflow_solution": [ + "BALSAMIC" + ] + }, "svdb": { "mutation": "somatic", "mutation_type": "SV", diff --git a/tests/test_workflow.py b/tests/test_workflow.py index c9b84f7c4..ebf5fef36 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -1,4 +1,5 @@ from unittest import mock +import logging import snakemake from BALSAMIC.constants.analysis import AnalysisWorkflow @@ -8,13 +9,17 @@ def test_workflow_tumor_only_tga_hg19( - tumor_only_config, sentieon_install_dir, sentieon_license + tumor_only_config, + sentieon_install_dir, + sentieon_license, + caplog, ): # GIVEN a sample config dict and a snakefile analysis_type = "single" analysis_workflow = "balsamic" snakefile = get_snakefile(analysis_type, analysis_workflow) config_json = tumor_only_config + caplog.set_level(logging.INFO) # WHEN invoking snakemake module with dry run option # THEN the snakemake workflow for TGA, hg19-tumor-only should run successfully. @@ -27,15 +32,22 @@ def test_workflow_tumor_only_tga_hg19( ): assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) + # THEN the following rules should not be included + assert "igh_dux4_detection_tumor_only" not in caplog.text + def test_workflow_tumor_normal_tga_hg19( - tumor_normal_config, sentieon_install_dir, sentieon_license + tumor_normal_config, + sentieon_install_dir, + sentieon_license, + caplog, ): # GIVEN a sample config dict and a snakefile analysis_type = "paired" analysis_workflow = "balsamic" snakefile = get_snakefile(analysis_type, analysis_workflow) config_json = tumor_normal_config + caplog.set_level(logging.INFO) # WHEN invoking snakemake module with dry run option # THEN the snakemake workflow for TGA, hg19-tumor-normal should run successfully. @@ -48,15 +60,22 @@ def test_workflow_tumor_normal_tga_hg19( ): assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) + # THEN the following rules should not be included + assert "igh_dux4_detection_tumor_normal" not in caplog.text + def test_workflow_tumor_only_wgs_hg19( - tumor_only_wgs_config, sentieon_install_dir, sentieon_license + tumor_only_wgs_config, + sentieon_install_dir, + sentieon_license, + caplog, ): # GIVEN a sample config dict and a snakefile analysis_type = "single" analysis_workflow = "balsamic" snakefile = get_snakefile(analysis_type, analysis_workflow) config_json = tumor_only_wgs_config + caplog.set_level(logging.INFO) # WHEN invoking snakemake module with dry run option # THEN the snakemake workflow for WGS, hg19-tumor-only should run successfully. @@ -69,15 +88,22 @@ def test_workflow_tumor_only_wgs_hg19( ): assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) + # THEN the following rules should be included + assert "igh_dux4_detection_tumor_only" in caplog.text + def test_workflow_tumor_normal_wgs_hg19( - tumor_normal_wgs_config, sentieon_install_dir, sentieon_license + tumor_normal_wgs_config, + sentieon_install_dir, + sentieon_license, + caplog, ): # GIVEN a sample config dict and a snakefile analysis_type = "paired" analysis_workflow = "balsamic" snakefile = get_snakefile(analysis_type, analysis_workflow) config_json = tumor_normal_wgs_config + caplog.set_level(logging.INFO) # WHEN invoking snakemake module with dry run option # THEN the snakemake workflow for WGS, hg19-tumor-normal should run successfully. @@ -90,6 +116,9 @@ def test_workflow_tumor_normal_wgs_hg19( ): assert snakemake.snakemake(snakefile, configfiles=[config_json], dryrun=True) + # THEN the following rules should be included + assert "igh_dux4_detection_tumor_normal" in caplog.text + def test_workflow_qc_tumor_only_hg19( tumor_only_config_qc, sentieon_install_dir, sentieon_license diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 916c5fe20..785b428bb 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -12,6 +12,7 @@ from _pytest.logging import LogCaptureFixture from _pytest.tmpdir import TempPathFactory +from BALSAMIC.commands.config.case import case_config from BALSAMIC.constants.analysis import BIOINFO_TOOL_ENV, SampleType, SequencingType from BALSAMIC.constants.cache import CacheVersion from BALSAMIC.constants.cluster import ClusterConfigType @@ -38,6 +39,7 @@ get_snakefile, job_id_dump_to_yaml, validate_cache_version, + validate_exome_option, ) from BALSAMIC.utils.exc import BalsamicError, WorkflowRunError from BALSAMIC.utils.io import ( @@ -1001,6 +1003,23 @@ def test_get_fastp_parameters(balsamic_model: ConfigModel): assert "--disable_adapter_trimming" in fastp_params_tga["fastp_trim_adapter"] +def test_validate_exome_option(panel_bed_file: str): + # GIVEN that a panel bedfile has been supplied and exome parameter set to true + ctx = click.Context(case_config) + ctx.params["panel_bed"] = panel_bed_file + # WHEN validating exome option + # THEN exome argument should be correctly set + assert validate_exome_option(ctx, click.Parameter, True) == True + + # GIVEN that a panel bedfile has NOT been supplied and exome parameter set to true + ctx.params["panel_bed"] = None + + # WHEN validating exome option + # THEN a bad parameter error should be raised + with pytest.raises(click.BadParameter): + validate_exome_option(ctx, click.Parameter, True) + + def test_validate_cache_version_develop(): """Test develop cache version validation."""