Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Release v15.0.0 #1422

Merged
merged 6 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions BALSAMIC/assets/scripts/igh_dux4_detection.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/bin/bash

# Check if at least 3 arguments are provided
if [ "$#" -lt 3 ]; then
echo "Usage: $0 <genome_version> <output_vcf> <tumor_bam> [normal_bam]"
exit 1
fi

# Assign variables
genome_version="$1"
output_vcf="$2"
tumor_bam="$3"
normal_bam="$4"
output_vcf_tmp=$(echo $output_vcf | sed 's/.gz$//;')

# Print given arguments
echo "genome_version: $genome_version"
echo "output vcf: $output_vcf"
echo "tumor bam: $tumor_bam"
echo "normal bam: $normal_bam"

# Set chr positions depending on the genome version
if [ "$genome_version" = "hg19" ]; then
igh_chr="14"
igh_pos="106032614"
dux4_chr="4"
dux4_pos="190988100"
elif [ "$genome_version" = "hg38" ]; then
igh_chr="14"
igh_pos="105586437"
dux4_chr="4"
dux4_pos="190173000"
else
echo "Invalid genome version. Accepted values: hg19, hg38. Given: $genome_version"
exit 1
fi


# Define functions
get_supporting_reads() {
# Get number of supporting reads for IGH::DUX4 rearrangement in a given BAM file
local bam="$1"
if [ "$genome_version" = "hg19" ]; then
local supporting_reads=$(samtools view -F 1024 -c \
-e '(rnext == "4" && pnext > 190988100 && pnext < 191007000) || (rnext == "10" && pnext > 135477000 && pnext < 135500000) || (rnext == "GL000228.1" && pnext > 70000 && pnext < 115000) || ([SA] =~ "10,1354[789][0-9]{4}") || ([SA] =~ "4,19(09[8-9][0-9]|100[0-7])[0-9]{3}" || [SA] =~ "GL000228.1,([7-9][0-9]{4}|1[0-1][0-5][0-9]{3})")' \
$bam 14:106032614-107288051 )
elif [ "$genome_version" = "hg38" ]; then
local supporting_reads=$(samtools view -F 1024 -c \
-e '(rnext == "4" && pnext > 190173000 && pnext < 190176000) || ([SA] =~ "4,19017[345][0-9]{3}")' \
$bam chr14:105586437-106879844 )
fi
echo $supporting_reads
}

# Set information for tumor and normal
supporting_reads_tumor=$(get_supporting_reads $tumor_bam)
samples_header="TUMOR"
samples_field="${supporting_reads_tumor}"
if [ -n "$normal_bam" ]; then
supporting_reads_normal=$(get_supporting_reads $normal_bam)
samples_header="NORMAL\tTUMOR"
samples_field="${supporting_reads_normal}\t${supporting_reads_tumor}"
fi


# If supporting reads are found in the tumor, set filter to PASS. Otherwise add: no_supporting_reads
if [ "$supporting_reads_tumor" -gt 0 ]; then
vcf_filter="PASS"
else
vcf_filter="no_supporting_reads"
fi

echo "supporting reads tumor: $supporting_reads_tumor"
echo "supporting reads normal: $supporting_reads_normal"
echo "vcf filter: $vcf_filter"

# Write vcf entry
{
echo '##fileformat=VCFv4.2'
echo '##ALT=<ID=BND,Description="Break end">'
echo '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">'
echo '##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">'
echo '##FORMAT=<ID=DV,Number=1,Type=Integer,Description="Number of paired-ends that support the event">'
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t${samples_header}"
echo -e "${igh_chr}\t${igh_pos}\tsamtools_igh_dux4\tN\tN[${dux4_chr}:${dux4_pos}[\t.\t${vcf_filter}\tSVTYPE=BND;IMPRECISE;\tDV\t${samples_field}"
} >> $output_vcf_tmp

bgzip $output_vcf_tmp
tabix -p vcf $output_vcf
4 changes: 4 additions & 0 deletions BALSAMIC/commands/config/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
OPTION_CASE_ID,
OPTION_CLINICAL_SNV_OBSERVATIONS,
OPTION_CLINICAL_SV_OBSERVATIONS,
OPTION_EXOME,
OPTION_FASTQ_PATH,
OPTION_GENDER,
OPTION_GENOME_INTERVAL,
Expand Down Expand Up @@ -70,6 +71,7 @@
@OPTION_CASE_ID
@OPTION_CLINICAL_SNV_OBSERVATIONS
@OPTION_CLINICAL_SV_OBSERVATIONS
@OPTION_EXOME
@OPTION_FASTQ_PATH
@OPTION_GENDER
@OPTION_GENOME_VERSION
Expand Down Expand Up @@ -101,6 +103,7 @@ def case_config(
case_id: str,
clinical_snv_observations: Path,
clinical_sv_observations: Path,
exome: bool,
fastq_path: Path,
gender: Gender,
genome_version: GenomeVersion,
Expand Down Expand Up @@ -219,6 +222,7 @@ def case_config(
container_conda_env_path=CONTAINERS_DIR,
),
panel={
"exome": exome,
"capture_kit": panel_bed,
"chrom": get_panel_chrom(panel_bed),
"pon_cnn": pon_cnn,
Expand Down
10 changes: 9 additions & 1 deletion BALSAMIC/commands/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from BALSAMIC.constants.constants import LOG_LEVELS, LogLevel
from BALSAMIC.constants.rules import DELIVERY_RULES
from BALSAMIC.constants.workflow_params import VCF_DICT
from BALSAMIC.utils.cli import validate_cache_version
from BALSAMIC.utils.cli import validate_cache_version, validate_exome_option

OPTION_ADAPTER_TRIM = click.option(
"--adapter-trim/--no-adapter-trim",
Expand Down Expand Up @@ -189,6 +189,14 @@
help="Enable dragen variant caller",
)

OPTION_EXOME = click.option(
"--exome",
is_flag=True,
default=False,
help="Assign exome parameters to TGA workflow",
callback=validate_exome_option,
)

OPTION_FASTQ_PATH = click.option(
"--fastq-path",
type=click.Path(exists=True, resolve_path=True),
Expand Down
4 changes: 4 additions & 0 deletions BALSAMIC/constants/cluster_analysis.json
Original file line number Diff line number Diff line change
Expand Up @@ -411,5 +411,9 @@
"samtools_qc": {
"time": "04:00:00",
"n": 16
},
"igh_dux4_detection": {
"time": "02:00:00",
"n": 1
}
}
42 changes: 31 additions & 11 deletions BALSAMIC/constants/variant_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,8 @@
"description": "General purpose filters used for filtering any variant caller",
}

# Configuration of VARDICT settings
VARDICT_SETTINGS = {
"AD": {"tag_value": 5, "filter_name": "balsamic_low_tumor_ad", "field": "INFO"},
"DP": {
"tag_value": 100,
"filter_name": "balsamic_low_tumor_dp",
"field": "INFO",
},
"MQ": {"tag_value": 40, "filter_name": "balsamic_low_mq", "field": "INFO"},
"AF_min": {"tag_value": 0.007, "filter_name": "balsamic_low_af", "field": "INFO"},
# Configuration of common VARDICT settings
VARDICT_SETTINGS_COMMON = {
"pop_freq": {
"tag_value": 0.005,
"filter_name": "balsamic_high_pop_freq",
Expand All @@ -35,12 +27,35 @@
"filter_name": "Frq",
"field": "INFO",
},
"MQ": {"tag_value": 30, "filter_name": "balsamic_low_mq", "field": "INFO"},
"AF_min": {"tag_value": 0.007, "filter_name": "balsamic_low_af", "field": "INFO"},
"AD": {"tag_value": 5, "filter_name": "balsamic_low_tumor_ad", "field": "INFO"},
"varcaller_name": "VarDict",
"filter_type": "general",
"analysis_type": "tumor_only",
"analysis_type": "tumor_only,tumor_normal",
"description": "General purpose filters used for filtering VarDict",
}

# Configuration of VARDICT settings for smaller panels
VARDICT_SETTINGS_PANEL = {
**VARDICT_SETTINGS_COMMON,
"DP": {
"tag_value": 100,
"filter_name": "balsamic_low_tumor_dp",
"field": "INFO",
},
}

# Configuration of VARDICT settings for exomes
VARDICT_SETTINGS_EXOME = {
**VARDICT_SETTINGS_COMMON,
"DP": {
"tag_value": 20,
"filter_name": "balsamic_low_tumor_dp",
"field": "INFO",
},
}

# Configuration for SENTIEON settings:
SENTIEON_VARCALL_SETTINGS = {
"AD": {"tag_value": 3, "filter_name": "balsamic_low_tumor_ad", "field": "FORMAT"},
Expand All @@ -50,6 +65,11 @@
"field": "FORMAT",
},
"AF_min": {"tag_value": 0.05, "filter_name": "balsamic_low_af", "field": "FORMAT"},
"high_normal_tumor_af_frac": {
"tag_value": 0.3,
"filter_name": "high_normal_tumor_af_frac",
"field": "FORMAT",
},
"pop_freq": {
"tag_value": 0.001,
"filter_name": "balsamic_high_pop_freq",
Expand Down
7 changes: 7 additions & 0 deletions BALSAMIC/constants/workflow_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@
"sequencing_type": ["wgs"],
"workflow_solution": ["BALSAMIC"],
},
"igh_dux4": {
"mutation": "somatic",
"mutation_type": "SV",
"analysis_type": ["single", "paired"],
"sequencing_type": ["wgs"],
"workflow_solution": ["BALSAMIC"],
},
"svdb": {
"mutation": "somatic",
"mutation_type": "SV",
Expand Down
3 changes: 3 additions & 0 deletions BALSAMIC/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class SampleInstanceModel(BaseModel):
class PanelModel(BaseModel):
"""Holds attributes of PANEL BED file if provided
Attributes:
exome: (bool); optional parameter for targeted analyses to use exome parameters
capture_kit : Field(str(Path)); string representation of path to PANEL BED file
chrom : Field(list(str)); list of chromosomes in PANEL BED
pon_cnn: Field(optional); Path where PON reference .cnn file is stored
Expand All @@ -59,6 +60,7 @@ class PanelModel(BaseModel):

"""

exome: Optional[bool] = False
capture_kit: Annotated[Optional[str], AfterValidator(is_file)] = None
chrom: Optional[List[str]] = None
pon_cnn: Annotated[Optional[str], AfterValidator(is_file)] = None
Expand Down Expand Up @@ -102,6 +104,7 @@ class VCFModel(BaseModel):
dellycnv: VarcallerAttribute
tiddit: VarcallerAttribute
cnvpytor: VarcallerAttribute
igh_dux4: VarcallerAttribute
svdb: VarcallerAttribute


Expand Down
2 changes: 2 additions & 0 deletions BALSAMIC/models/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ class VarCallerFilter(BaseModel):
Attributes:
AD: VCFAttributes (required); minimum allelic depth
AF_min: VCFAttributes (optional); minimum allelic fraction
high_normal_tumor_af_frac: VCFAttributes (optional); maximum normal allele frequency / tumor allele frequency
MQ: VCFAttributes (optional); minimum mapping quality
DP: VCFAttributes (optional); minimum read depth
pop_freq: VCFAttributes (optional); maximum gnomad allele frequency
Expand All @@ -238,6 +239,7 @@ class VarCallerFilter(BaseModel):

AD: Optional[VCFAttributes] = None
AF_min: Optional[VCFAttributes] = None
high_normal_tumor_af_frac: Optional[VCFAttributes] = None
MQ: Optional[VCFAttributes] = None
DP: Optional[VCFAttributes] = None
pop_freq: Optional[VCFAttributes] = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,21 @@ elif config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["anal
AD = [SENTIEON_CALLER.AD.tag_value, SENTIEON_CALLER.AD.filter_name],
DP = [SENTIEON_CALLER.DP.tag_value, SENTIEON_CALLER.DP.filter_name],
AF_min = [SENTIEON_CALLER.AF_min.tag_value, SENTIEON_CALLER.AF_min.filter_name],
high_normal_tumor_af_frac_filter_name=SENTIEON_CALLER.high_normal_tumor_af_frac.filter_name,
high_normal_tumor_af_frac_value=SENTIEON_CALLER.high_normal_tumor_af_frac.tag_value,
case_name = config["analysis"]["case_id"],
threads:
get_threads(cluster_config, 'bcftools_quality_filter_tnscope_tumor_normal')
message:
"Quality filtering WGS tumor-normal tnscope variants using bcftools for {params.case_name}"
shell:
"""
bcftools view -f PASS,triallelic_site {input.vcf_snv} \
bcftools view {input.vcf_snv} \
| bcftools filter --threads {threads} --include 'SUM(FORMAT/AD[0:0]+FORMAT/AD[0:1]) >= {params.DP[0]} || SUM(FORMAT/AD[1:0]+FORMAT/AD[1:1]) >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' \
| bcftools filter --threads {threads} --include 'FORMAT/AD[0:1] >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' \
| bcftools filter --threads {threads} --include 'FORMAT/AF[0] >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' \
| bcftools annotate -x FILTER/alt_allele_in_normal \
| bcftools filter --threads {threads} --exclude 'sum(FORMAT/AF[1])/sum(FORMAT/AF[0])>{params.high_normal_tumor_af_frac_value}' --soft-filter '{params.high_normal_tumor_af_frac_filter_name}' --mode '+' \
| bcftools view -f PASS,triallelic_site -O z -o {output.vcf_snv_research};

tabix -p vcf -f {output.vcf_snv_research};
Expand Down Expand Up @@ -179,14 +183,18 @@ tabix -p vcf -f {output.vcf_filtered};
Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix()
params:
case_name = config["analysis"]["case_id"],
high_normal_tumor_af_frac_filter_name=SENTIEON_CALLER.high_normal_tumor_af_frac.filter_name,
high_normal_tumor_af_frac_value=SENTIEON_CALLER.high_normal_tumor_af_frac.tag_value,
threads:
get_threads(cluster_config,'bcftools_quality_filter_TNscope_umi_tumor_normal')
message:
"Quality filtering TNscope_umi tumor-normal annotated variants using bcftools for {params.case_name} "
shell:
"""
bcftools view {input.vcf} | \
bcftools view -f PASS,triallelic_site -o {output.vcf_filtered} -O z;
bcftools view {input.vcf} \
| bcftools annotate -x FILTER/alt_allele_in_normal \
| bcftools filter --threads {threads} --exclude 'sum(FORMAT/AF[1])/sum(FORMAT/AF[0])>{params.high_normal_tumor_af_frac_value}' --soft-filter '{params.high_normal_tumor_af_frac_filter_name}' --mode '+' \
| bcftools view -f PASS,triallelic_site -o {output.vcf_filtered} -O z;

tabix -p vcf -f {output.vcf_filtered};
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,33 @@ else:
rm {input.ascat_cnv};
"""


rule igh_dux4_detection_tumor_normal:
input:
fa = config["reference"]["reference_genome"],
bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample),
bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name=normal_sample),
output:
vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".igh_dux4.vcf.gz",
benchmark:
benchmark_dir + 'igh_dux4_detection_tumor_normal_' + config["analysis"]["case_id"] + ".tsv"
singularity:
Path(singularity_image, config["bioinfo_tools"].get("samtools") + ".sif").as_posix()
params:
genome_version = config["reference"]["genome_version"],
custom_sv_detection_script = get_script_path("igh_dux4_detection.sh"),
case_name = config["analysis"]["case_id"],
threads:
get_threads(cluster_config, "igh_dux4_detection")
message:
"Detecting IGH::DUX4 rearrangement for {params.case_name} using samtools."
shell:
"""
bash {params.custom_sv_detection_script} {params.genome_version} {output.vcf} {input.bamT} {input.bamN}
"""



rule svdb_merge_tumor_normal:
input:
vcf = expand(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,31 @@ tabix -p vcf -f {output.delly_sv};
tabix -p vcf -f {output.delly_cnv};
"""


rule igh_dux4_detection_tumor_only:
input:
fa = config["reference"]["reference_genome"],
bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample)
output:
vcf = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".igh_dux4.vcf.gz",
benchmark:
benchmark_dir + 'igh_dux4_detection_tumor_only_' + config["analysis"]["case_id"] + ".tsv"
singularity:
Path(singularity_image, config["bioinfo_tools"].get("samtools") + ".sif").as_posix()
params:
genome_version = config["reference"]["genome_version"],
custom_sv_detection_script = get_script_path("igh_dux4_detection.sh"),
case_name = config["analysis"]["case_id"],
threads:
get_threads(cluster_config, "igh_dux4_detection")
message:
"Detecting IGH::DUX4 rearrangement for {params.case_name} using samtools."
shell:
"""
bash {params.custom_sv_detection_script} {params.genome_version} {output.vcf} {input.bamT}
"""


rule svdb_merge_tumor_only:
input:
vcf = expand(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx90G\"';

vardict-java -U -u -I 600 -G {input.fa} -f {params.af} -N {params.case_name} \
vardict-java -I 600 -G {input.fa} -f {params.af} -N {params.case_name} \
-b \"{input.bamT}|{input.bamN}\" \
-th {threads} \
{params.col_info} {input.bed} \
Expand Down
Loading
Loading