Skip to content

Commit

Permalink
Merge pull request #1422 from Clinical-Genomics/release_v15.0.0
Browse files Browse the repository at this point in the history
This is a relatively small update to Balsamic with some significant changes to VarDict filters and settings, which influences all TGA workflows and in particular exome-analyses, as well as a new feature with a custom solution for detecting IGH::DUX4 rearrangements in WGS. 

#### Added

* new option for exome samples --exome with modified bcftools filters compared to standard targeted workflow #1414
* custom samtools rule for detection of reads supporting IGH::DUX4 rearrangements #1397
* high_normal_tumor_af_frac filter in bcftools for TNscope WGS T+N analysis, and UMI T+N analysis which allows for 30% of tumor in the normal. #1289

#### Changed

* reduced stringency of targeted none-exome bcftools filters for min MQ #1414
* removed -u flag from VarDict T+N and T only rules #1414

#### Removed

* removed -U flag to VarDict T+N rule to start calling SVs in VarDict (required for FLT3) #1414
* alt_allele_in_normal set by TNscope for WGS T+N analysis, and UMI T+N analysis #1289
  • Loading branch information
mathiasbio authored Apr 10, 2024
2 parents b94831b + 28d7e1e commit 2d6965b
Show file tree
Hide file tree
Showing 23 changed files with 604 additions and 93 deletions.
89 changes: 89 additions & 0 deletions BALSAMIC/assets/scripts/igh_dux4_detection.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/bin/bash

# Check if at least 3 arguments are provided
if [ "$#" -lt 3 ]; then
echo "Usage: $0 <genome_version> <output_vcf> <tumor_bam> [normal_bam]"
exit 1
fi

# Assign variables
genome_version="$1"
output_vcf="$2"
tumor_bam="$3"
normal_bam="$4"
output_vcf_tmp=$(echo $output_vcf | sed 's/.gz$//;')

# Print given arguments
echo "genome_version: $genome_version"
echo "output vcf: $output_vcf"
echo "tumor bam: $tumor_bam"
echo "normal bam: $normal_bam"

# Set chr positions depending on the genome version
if [ "$genome_version" = "hg19" ]; then
igh_chr="14"
igh_pos="106032614"
dux4_chr="4"
dux4_pos="190988100"
elif [ "$genome_version" = "hg38" ]; then
igh_chr="14"
igh_pos="105586437"
dux4_chr="4"
dux4_pos="190173000"
else
echo "Invalid genome version. Accepted values: hg19, hg38. Given: $genome_version"
exit 1
fi


# Define functions
get_supporting_reads() {
# Get number of supporting reads for IGH::DUX4 rearrangement in a given BAM file
local bam="$1"
if [ "$genome_version" = "hg19" ]; then
local supporting_reads=$(samtools view -F 1024 -c \
-e '(rnext == "4" && pnext > 190988100 && pnext < 191007000) || (rnext == "10" && pnext > 135477000 && pnext < 135500000) || (rnext == "GL000228.1" && pnext > 70000 && pnext < 115000) || ([SA] =~ "10,1354[789][0-9]{4}") || ([SA] =~ "4,19(09[8-9][0-9]|100[0-7])[0-9]{3}" || [SA] =~ "GL000228.1,([7-9][0-9]{4}|1[0-1][0-5][0-9]{3})")' \
$bam 14:106032614-107288051 )
elif [ "$genome_version" = "hg38" ]; then
local supporting_reads=$(samtools view -F 1024 -c \
-e '(rnext == "4" && pnext > 190173000 && pnext < 190176000) || ([SA] =~ "4,19017[345][0-9]{3}")' \
$bam chr14:105586437-106879844 )
fi
echo $supporting_reads
}

# Set information for tumor and normal
supporting_reads_tumor=$(get_supporting_reads $tumor_bam)
samples_header="TUMOR"
samples_field="${supporting_reads_tumor}"
if [ -n "$normal_bam" ]; then
supporting_reads_normal=$(get_supporting_reads $normal_bam)
samples_header="NORMAL\tTUMOR"
samples_field="${supporting_reads_normal}\t${supporting_reads_tumor}"
fi


# If supporting reads are found in the tumor, set filter to PASS. Otherwise add: no_supporting_reads
if [ "$supporting_reads_tumor" -gt 0 ]; then
vcf_filter="PASS"
else
vcf_filter="no_supporting_reads"
fi

echo "supporting reads tumor: $supporting_reads_tumor"
echo "supporting reads normal: $supporting_reads_normal"
echo "vcf filter: $vcf_filter"

# Write vcf entry
{
echo '##fileformat=VCFv4.2'
echo '##ALT=<ID=BND,Description="Break end">'
echo '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">'
echo '##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">'
echo '##FORMAT=<ID=DV,Number=1,Type=Integer,Description="Number of paired-ends that support the event">'
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t${samples_header}"
echo -e "${igh_chr}\t${igh_pos}\tsamtools_igh_dux4\tN\tN[${dux4_chr}:${dux4_pos}[\t.\t${vcf_filter}\tSVTYPE=BND;IMPRECISE;\tDV\t${samples_field}"
} >> $output_vcf_tmp

bgzip $output_vcf_tmp
tabix -p vcf $output_vcf
4 changes: 4 additions & 0 deletions BALSAMIC/commands/config/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
OPTION_CASE_ID,
OPTION_CLINICAL_SNV_OBSERVATIONS,
OPTION_CLINICAL_SV_OBSERVATIONS,
OPTION_EXOME,
OPTION_FASTQ_PATH,
OPTION_GENDER,
OPTION_GENOME_INTERVAL,
Expand Down Expand Up @@ -70,6 +71,7 @@
@OPTION_CASE_ID
@OPTION_CLINICAL_SNV_OBSERVATIONS
@OPTION_CLINICAL_SV_OBSERVATIONS
@OPTION_EXOME
@OPTION_FASTQ_PATH
@OPTION_GENDER
@OPTION_GENOME_VERSION
Expand Down Expand Up @@ -101,6 +103,7 @@ def case_config(
case_id: str,
clinical_snv_observations: Path,
clinical_sv_observations: Path,
exome: bool,
fastq_path: Path,
gender: Gender,
genome_version: GenomeVersion,
Expand Down Expand Up @@ -219,6 +222,7 @@ def case_config(
container_conda_env_path=CONTAINERS_DIR,
),
panel={
"exome": exome,
"capture_kit": panel_bed,
"chrom": get_panel_chrom(panel_bed),
"pon_cnn": pon_cnn,
Expand Down
10 changes: 9 additions & 1 deletion BALSAMIC/commands/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from BALSAMIC.constants.constants import LOG_LEVELS, LogLevel
from BALSAMIC.constants.rules import DELIVERY_RULES
from BALSAMIC.constants.workflow_params import VCF_DICT
from BALSAMIC.utils.cli import validate_cache_version
from BALSAMIC.utils.cli import validate_cache_version, validate_exome_option

OPTION_ADAPTER_TRIM = click.option(
"--adapter-trim/--no-adapter-trim",
Expand Down Expand Up @@ -189,6 +189,14 @@
help="Enable dragen variant caller",
)

OPTION_EXOME = click.option(
"--exome",
is_flag=True,
default=False,
help="Assign exome parameters to TGA workflow",
callback=validate_exome_option,
)

OPTION_FASTQ_PATH = click.option(
"--fastq-path",
type=click.Path(exists=True, resolve_path=True),
Expand Down
4 changes: 4 additions & 0 deletions BALSAMIC/constants/cluster_analysis.json
Original file line number Diff line number Diff line change
Expand Up @@ -411,5 +411,9 @@
"samtools_qc": {
"time": "04:00:00",
"n": 16
},
"igh_dux4_detection": {
"time": "02:00:00",
"n": 1
}
}
42 changes: 31 additions & 11 deletions BALSAMIC/constants/variant_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,8 @@
"description": "General purpose filters used for filtering any variant caller",
}

# Configuration of VARDICT settings
VARDICT_SETTINGS = {
"AD": {"tag_value": 5, "filter_name": "balsamic_low_tumor_ad", "field": "INFO"},
"DP": {
"tag_value": 100,
"filter_name": "balsamic_low_tumor_dp",
"field": "INFO",
},
"MQ": {"tag_value": 40, "filter_name": "balsamic_low_mq", "field": "INFO"},
"AF_min": {"tag_value": 0.007, "filter_name": "balsamic_low_af", "field": "INFO"},
# Configuration of common VARDICT settings
VARDICT_SETTINGS_COMMON = {
"pop_freq": {
"tag_value": 0.005,
"filter_name": "balsamic_high_pop_freq",
Expand All @@ -35,12 +27,35 @@
"filter_name": "Frq",
"field": "INFO",
},
"MQ": {"tag_value": 30, "filter_name": "balsamic_low_mq", "field": "INFO"},
"AF_min": {"tag_value": 0.007, "filter_name": "balsamic_low_af", "field": "INFO"},
"AD": {"tag_value": 5, "filter_name": "balsamic_low_tumor_ad", "field": "INFO"},
"varcaller_name": "VarDict",
"filter_type": "general",
"analysis_type": "tumor_only",
"analysis_type": "tumor_only,tumor_normal",
"description": "General purpose filters used for filtering VarDict",
}

# Configuration of VARDICT settings for smaller panels
VARDICT_SETTINGS_PANEL = {
**VARDICT_SETTINGS_COMMON,
"DP": {
"tag_value": 100,
"filter_name": "balsamic_low_tumor_dp",
"field": "INFO",
},
}

# Configuration of VARDICT settings for exomes
VARDICT_SETTINGS_EXOME = {
**VARDICT_SETTINGS_COMMON,
"DP": {
"tag_value": 20,
"filter_name": "balsamic_low_tumor_dp",
"field": "INFO",
},
}

# Configuration for SENTIEON settings:
SENTIEON_VARCALL_SETTINGS = {
"AD": {"tag_value": 3, "filter_name": "balsamic_low_tumor_ad", "field": "FORMAT"},
Expand All @@ -50,6 +65,11 @@
"field": "FORMAT",
},
"AF_min": {"tag_value": 0.05, "filter_name": "balsamic_low_af", "field": "FORMAT"},
"high_normal_tumor_af_frac": {
"tag_value": 0.3,
"filter_name": "high_normal_tumor_af_frac",
"field": "FORMAT",
},
"pop_freq": {
"tag_value": 0.001,
"filter_name": "balsamic_high_pop_freq",
Expand Down
7 changes: 7 additions & 0 deletions BALSAMIC/constants/workflow_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@
"sequencing_type": ["wgs"],
"workflow_solution": ["BALSAMIC"],
},
"igh_dux4": {
"mutation": "somatic",
"mutation_type": "SV",
"analysis_type": ["single", "paired"],
"sequencing_type": ["wgs"],
"workflow_solution": ["BALSAMIC"],
},
"svdb": {
"mutation": "somatic",
"mutation_type": "SV",
Expand Down
3 changes: 3 additions & 0 deletions BALSAMIC/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class SampleInstanceModel(BaseModel):
class PanelModel(BaseModel):
"""Holds attributes of PANEL BED file if provided
Attributes:
exome: (bool); optional parameter for targeted analyses to use exome parameters
capture_kit : Field(str(Path)); string representation of path to PANEL BED file
chrom : Field(list(str)); list of chromosomes in PANEL BED
pon_cnn: Field(optional); Path where PON reference .cnn file is stored
Expand All @@ -59,6 +60,7 @@ class PanelModel(BaseModel):
"""

exome: Optional[bool] = False
capture_kit: Annotated[Optional[str], AfterValidator(is_file)] = None
chrom: Optional[List[str]] = None
pon_cnn: Annotated[Optional[str], AfterValidator(is_file)] = None
Expand Down Expand Up @@ -102,6 +104,7 @@ class VCFModel(BaseModel):
dellycnv: VarcallerAttribute
tiddit: VarcallerAttribute
cnvpytor: VarcallerAttribute
igh_dux4: VarcallerAttribute
svdb: VarcallerAttribute


Expand Down
2 changes: 2 additions & 0 deletions BALSAMIC/models/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ class VarCallerFilter(BaseModel):
Attributes:
AD: VCFAttributes (required); minimum allelic depth
AF_min: VCFAttributes (optional); minimum allelic fraction
high_normal_tumor_af_frac: VCFAttributes (optional); maximum normal allele frequency / tumor allele frequency
MQ: VCFAttributes (optional); minimum mapping quality
DP: VCFAttributes (optional); minimum read depth
pop_freq: VCFAttributes (optional); maximum gnomad allele frequency
Expand All @@ -238,6 +239,7 @@ class VarCallerFilter(BaseModel):

AD: Optional[VCFAttributes] = None
AF_min: Optional[VCFAttributes] = None
high_normal_tumor_af_frac: Optional[VCFAttributes] = None
MQ: Optional[VCFAttributes] = None
DP: Optional[VCFAttributes] = None
pop_freq: Optional[VCFAttributes] = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,21 @@ elif config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["anal
AD = [SENTIEON_CALLER.AD.tag_value, SENTIEON_CALLER.AD.filter_name],
DP = [SENTIEON_CALLER.DP.tag_value, SENTIEON_CALLER.DP.filter_name],
AF_min = [SENTIEON_CALLER.AF_min.tag_value, SENTIEON_CALLER.AF_min.filter_name],
high_normal_tumor_af_frac_filter_name=SENTIEON_CALLER.high_normal_tumor_af_frac.filter_name,
high_normal_tumor_af_frac_value=SENTIEON_CALLER.high_normal_tumor_af_frac.tag_value,
case_name = config["analysis"]["case_id"],
threads:
get_threads(cluster_config, 'bcftools_quality_filter_tnscope_tumor_normal')
message:
"Quality filtering WGS tumor-normal tnscope variants using bcftools for {params.case_name}"
shell:
"""
bcftools view -f PASS,triallelic_site {input.vcf_snv} \
bcftools view {input.vcf_snv} \
| bcftools filter --threads {threads} --include 'SUM(FORMAT/AD[0:0]+FORMAT/AD[0:1]) >= {params.DP[0]} || SUM(FORMAT/AD[1:0]+FORMAT/AD[1:1]) >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' \
| bcftools filter --threads {threads} --include 'FORMAT/AD[0:1] >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' \
| bcftools filter --threads {threads} --include 'FORMAT/AF[0] >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' \
| bcftools annotate -x FILTER/alt_allele_in_normal \
| bcftools filter --threads {threads} --exclude 'sum(FORMAT/AF[1])/sum(FORMAT/AF[0])>{params.high_normal_tumor_af_frac_value}' --soft-filter '{params.high_normal_tumor_af_frac_filter_name}' --mode '+' \
| bcftools view -f PASS,triallelic_site -O z -o {output.vcf_snv_research};

tabix -p vcf -f {output.vcf_snv_research};
Expand Down Expand Up @@ -179,14 +183,18 @@ tabix -p vcf -f {output.vcf_filtered};
Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix()
params:
case_name = config["analysis"]["case_id"],
high_normal_tumor_af_frac_filter_name=SENTIEON_CALLER.high_normal_tumor_af_frac.filter_name,
high_normal_tumor_af_frac_value=SENTIEON_CALLER.high_normal_tumor_af_frac.tag_value,
threads:
get_threads(cluster_config,'bcftools_quality_filter_TNscope_umi_tumor_normal')
message:
"Quality filtering TNscope_umi tumor-normal annotated variants using bcftools for {params.case_name} "
shell:
"""
bcftools view {input.vcf} | \
bcftools view -f PASS,triallelic_site -o {output.vcf_filtered} -O z;
bcftools view {input.vcf} \
| bcftools annotate -x FILTER/alt_allele_in_normal \
| bcftools filter --threads {threads} --exclude 'sum(FORMAT/AF[1])/sum(FORMAT/AF[0])>{params.high_normal_tumor_af_frac_value}' --soft-filter '{params.high_normal_tumor_af_frac_filter_name}' --mode '+' \
| bcftools view -f PASS,triallelic_site -o {output.vcf_filtered} -O z;

tabix -p vcf -f {output.vcf_filtered};
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,33 @@ else:
rm {input.ascat_cnv};
"""


rule igh_dux4_detection_tumor_normal:
input:
fa = config["reference"]["reference_genome"],
bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample),
bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name=normal_sample),
output:
vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".igh_dux4.vcf.gz",
benchmark:
benchmark_dir + 'igh_dux4_detection_tumor_normal_' + config["analysis"]["case_id"] + ".tsv"
singularity:
Path(singularity_image, config["bioinfo_tools"].get("samtools") + ".sif").as_posix()
params:
genome_version = config["reference"]["genome_version"],
custom_sv_detection_script = get_script_path("igh_dux4_detection.sh"),
case_name = config["analysis"]["case_id"],
threads:
get_threads(cluster_config, "igh_dux4_detection")
message:
"Detecting IGH::DUX4 rearrangement for {params.case_name} using samtools."
shell:
"""
bash {params.custom_sv_detection_script} {params.genome_version} {output.vcf} {input.bamT} {input.bamN}
"""



rule svdb_merge_tumor_normal:
input:
vcf = expand(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,31 @@ tabix -p vcf -f {output.delly_sv};
tabix -p vcf -f {output.delly_cnv};
"""


rule igh_dux4_detection_tumor_only:
input:
fa = config["reference"]["reference_genome"],
bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample)
output:
vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".igh_dux4.vcf.gz",
benchmark:
benchmark_dir + 'igh_dux4_detection_tumor_only_' + config["analysis"]["case_id"] + ".tsv"
singularity:
Path(singularity_image, config["bioinfo_tools"].get("samtools") + ".sif").as_posix()
params:
genome_version = config["reference"]["genome_version"],
custom_sv_detection_script = get_script_path("igh_dux4_detection.sh"),
case_name = config["analysis"]["case_id"],
threads:
get_threads(cluster_config, "igh_dux4_detection")
message:
"Detecting IGH::DUX4 rearrangement for {params.case_name} using samtools."
shell:
"""
bash {params.custom_sv_detection_script} {params.genome_version} {output.vcf} {input.bamT}
"""


rule svdb_merge_tumor_only:
input:
vcf = expand(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx90G\"';
vardict-java -U -u -I 600 -G {input.fa} -f {params.af} -N {params.case_name} \
vardict-java -I 600 -G {input.fa} -f {params.af} -N {params.case_name} \
-b \"{input.bamT}|{input.bamN}\" \
-th {threads} \
{params.col_info} {input.bed} \
Expand Down
Loading

0 comments on commit 2d6965b

Please sign in to comment.