fix: add postprocess bam and memory to vardict #1332

Merged (19 commits) on Nov 28, 2023
10 changes: 7 additions & 3 deletions BALSAMIC/constants/cluster_analysis.json
@@ -15,6 +15,10 @@
"time": "01:00:00",
"n": 4
},
"postprocess_bam": {
"time": "03:00:00",
"n": 12
},
"finalize_gens_outputfiles": {
"time": "01:00:00",
"n": 2
@@ -129,7 +133,7 @@
"n": 8
},
"samtools_sort_index": {
"time": "01:30:00",
"time": "02:30:00",
"n": 16
},
"sentieon_DNAscope": {
@@ -186,11 +190,11 @@
},
"vardict_tumor_normal": {
"time": "12:00:00",
"n": 10
"n": 18
},
"vardict_tumor_only": {
"time": "10:00:00",
"n": 10
"n": 9
},
"sentieon_bwa_umiextract": {
"time": "8:00:00",
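The "time" and "n" values above feed the scheduler submission and the get_threads calls used by the rules further down. A minimal sketch of that lookup, assuming the cluster configuration is loaded as a plain dict from this JSON (the actual BALSAMIC helper may resolve defaults differently):

import json
from pathlib import Path


def get_threads(cluster_config: dict, rule_name: str, default: int = 8) -> int:
    """Return the core count ("n") configured for a rule, or a default if absent."""
    # Assumption: the real BALSAMIC helper may handle missing rules differently.
    return cluster_config.get(rule_name, {}).get("n", default)


cluster_config = json.loads(Path("BALSAMIC/constants/cluster_analysis.json").read_text())
print(get_threads(cluster_config, "postprocess_bam"))       # 12 with the values above
print(get_threads(cluster_config, "vardict_tumor_normal"))  # 18 with the values above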
2 changes: 2 additions & 0 deletions BALSAMIC/constants/rules.py
@@ -69,6 +69,7 @@
"align": [
"snakemake_rules/umi/sentieon_umiextract.rule",
"snakemake_rules/umi/sentieon_consensuscall.rule",
"snakemake_rules/align/postprocess_bam.rule",
],
"varcall": [
"snakemake_rules/variant_calling/germline.rule",
@@ -100,6 +101,7 @@
"align": [
"snakemake_rules/umi/sentieon_umiextract.rule",
"snakemake_rules/umi/sentieon_consensuscall.rule",
"snakemake_rules/align/postprocess_bam.rule",
],
"varcall": [
"snakemake_rules/variant_calling/germline.rule",
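These lists are consumed when the workflow Snakefile includes the rule files. A hedged sketch of that pattern, where the variable names (rules_to_include, RULE_DIRECTORY) are illustrative rather than BALSAMIC's exact ones:

from pathlib import Path

RULE_DIRECTORY = Path("BALSAMIC")
rules_to_include = [
    "snakemake_rules/umi/sentieon_umiextract.rule",
    "snakemake_rules/umi/sentieon_consensuscall.rule",
    "snakemake_rules/align/postprocess_bam.rule",
]

# Snakemake allows include statements inside a loop in the Snakefile.
for rule_file in rules_to_include:
    include: (RULE_DIRECTORY / rule_file).as_posix()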
2 changes: 1 addition & 1 deletion BALSAMIC/models/config.py
@@ -403,7 +403,7 @@ def get_final_bam_name(
final_bam_suffix = "dedup"
elif self.analysis.sequencing_type == SequencingType.TARGETED:
# Only dedup is necessary for TGA
final_bam_suffix = "dedup"
final_bam_suffix = "dedup_sorted"
else:
# For WGS the bamfiles are realigned
final_bam_suffix = "dedup.realign"
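The change above only touches the TGA branch of get_final_bam_name. A minimal standalone sketch of the suffix selection after this PR (the surrounding method in BALSAMIC/models/config.py contains additional branches not shown in the hunk):

from enum import Enum


class SequencingType(str, Enum):
    TARGETED = "targeted"
    WGS = "wgs"


def final_bam_suffix(sequencing_type: SequencingType) -> str:
    # The branch preceding this one in the real method (outside the hunk) still
    # returns "dedup"; only the TARGETED case changes in this PR.
    if sequencing_type == SequencingType.TARGETED:
        return "dedup_sorted"  # TGA bams are now sorted after deduplication
    return "dedup.realign"     # WGS bams are additionally realigned


assert final_bam_suffix(SequencingType.TARGETED) == "dedup_sorted"
assert final_bam_suffix(SequencingType.WGS) == "dedup.realign"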
47 changes: 47 additions & 0 deletions BALSAMIC/snakemake_rules/align/postprocess_bam.rule
@@ -0,0 +1,47 @@
# vim: syntax=python tabstop=4 expandtab
# coding: utf-8

# The following rule post-processes the deduplicated, sorted BAM: it filters reads with samtools and collapses the read groups with Picard AddOrReplaceReadGroups ahead of VarDict variant calling



rule postprocess_bam:
input:
bam = Path(bam_dir,"{sample_type}.{sample}.dedup_sorted.bam").as_posix()
output:
postprocessed_bam = Path(bam_dir, "{sample_type}.{sample}.dedup_sorted_postprocessed.bam").as_posix(),
benchmark:
Path(benchmark_dir,"postprocess_bam_{sample_type}.{sample}.tsv").as_posix()
singularity:
Path(singularity_image,config["bioinfo_tools"].get("picard") + ".sif").as_posix()
params:
tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
sample_id = "{sample}"
threads:
get_threads(cluster_config, "postprocess_bam")
message:
"Samtools remove unmapped reads and collapse readgroups for sample: {params.sample_id}"
shell:
"""
mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};

samtools view --threads {threads} -O BAM -f 4 {input.bam} \
-o {params.tmpdir}/{wildcards.sample_type}.um.bam ;

samtools index {params.tmpdir}/{wildcards.sample_type}.um.bam ;

samtools view --threads {threads} -O BAM -F 4 {params.tmpdir}/{wildcards.sample_type}.um.bam \
-o {params.tmpdir}/{wildcards.sample_type}.m.bam ;

samtools index {params.tmpdir}/{wildcards.sample_type}.m.bam ;

picard -Xmx75g AddOrReplaceReadGroups -RGPU ILLUMINAi -RGID {wildcards.sample_type} -RGSM {wildcards.sample_type} -RGPL ILLUMINAi -RGLB ILLUMINAi -MAX_RECORDS_IN_RAM 1000000 -CREATE_INDEX true -CREATE_MD5_FILE true \
-TMP_DIR {params.tmpdir} \
-INPUT {params.tmpdir}/{wildcards.sample_type}.m.bam \
-OUTPUT {output.postprocessed_bam};

samtools index {output.postprocessed_bam};

rm -rf {params.tmpdir};
"""
52 changes: 37 additions & 15 deletions BALSAMIC/snakemake_rules/align/sentieon_alignment.rule
@@ -85,15 +85,37 @@ shell_bam_files=$(echo {input.bam_files} | sed 's/ / -i /g') ;
--metrics {output.metrics} \
{output.bam};


sed 's/^LIBRARY/\\n## METRICS CLASS\tpicard\.sam\.DuplicationMetrics\\nLIBRARY/' -i {output.metrics}
"""

rule samtools_sort_index:
input:
bam = Path(bam_dir,"{sample_type}.{sample}.dedup.bam").as_posix(),
output:
bam = Path(bam_dir,"{sample_type}.{sample}.dedup_sorted.bam").as_posix(),
benchmark:
Path(benchmark_dir,"samtools_sort_index_{sample_type}_{sample}.tsv").as_posix()
singularity:
Path(singularity_image,config["bioinfo_tools"].get("samtools") + ".sif").as_posix()
params:
sample_id="{sample}"
threads:
get_threads(cluster_config, "samtools_sort_index")
message:
"Calculating alignment stats for sample: {params.sample_id}"
shell:
"""
samtools sort --threads {threads} -o {output.bam} {input.bam};
samtools index -@ {threads} {output.bam};
"""


rule sentieon_realign:
input:
ref = config["reference"]["reference_genome"],
mills = config["reference"]["mills_1kg"],
bam = Path(bam_dir, "{sample_type}.{sample}.dedup.bam").as_posix(),
bam = Path(bam_dir, "{sample_type}.{sample}.dedup_sorted.bam").as_posix(),
indel_1kg = config["reference"]["known_indel_1kg"]
output:
bam = Path(bam_dir, "{sample_type}.{sample}.dedup.realign.bam").as_posix()
@@ -110,18 +132,18 @@ rule sentieon_realign:
"INDEL realignment using sentieon realigner for sample: {params.sample_id}"
shell:
"""
mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export SENTIEON_TMPDIR={params.tmpdir};
export SENTIEON_LICENSE={params.sentieon_lic};

{params.sentieon_exec} driver \
-r {input.ref} \
-t {threads} \
-i {input.bam} \
--algo Realigner \
-k {input.mills} \
-k {input.indel_1kg} \
{output};
"""
mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export SENTIEON_TMPDIR={params.tmpdir};
export SENTIEON_LICENSE={params.sentieon_lic};
{params.sentieon_exec} driver \
-r {input.ref} \
-t {threads} \
-i {input.bam} \
--algo Realigner \
-k {input.mills} \
-k {input.indel_1kg} \
{output};
"""

17 changes: 9 additions & 8 deletions BALSAMIC/snakemake_rules/concatenation/concatenation.rule
@@ -4,10 +4,12 @@
rule concatenate:
"""Merge fastq files per lane into a single forward and reverse fastq."""
input:
fwd_fastqs = lambda wildcards: config_model.get_all_fastqs_for_sample(
sample_name = wildcards.sample, fastq_types = [FastqName.FWD]),
rev_fastqs = lambda wildcards: config_model.get_all_fastqs_for_sample(
sample_name = wildcards.sample, fastq_types = [FastqName.REV])
fastqs_fwd=lambda wildcards: config_model.get_all_fastqs_for_sample(
sample_name=wildcards.sample, fastq_types=[FastqName.FWD]
),
fastqs_rev=lambda wildcards: config_model.get_all_fastqs_for_sample(
sample_name=wildcards.sample, fastq_types=[FastqName.REV]
),
output:
concat_fwd = fastq_dir + "{sample}_concat_R_1.fp.fastq.gz",
concat_rev = fastq_dir + "{sample}_concat_R_2.fp.fastq.gz"
@@ -16,13 +18,12 @@ rule concatenate:
params:
fastq_dir = fastq_dir,
sample = "{sample}",
read = "{read}"
threads:
get_threads(cluster_config, "concatenate")
message:
"Sample {params.sample} and read {params.read} FASTQ concatenation"
"Sample {params.sample} FASTQ concatenation"
shell:
"""
cat {input.fwd_fastqs} > {output.concat_fwd}
cat {input.rev_fastqs} > {output.concat_rev}
cat {input.fastqs_fwd} > {output.concat_fwd}
cat {input.fastqs_rev} > {output.concat_rev}
"""
@@ -6,8 +6,8 @@
rule vardict_tumor_normal:
input:
fa = config["reference"]["reference_genome"],
bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample),
bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample),
bamN = expand(bam_dir + "normal.{sample}.dedup_sorted_postprocessed.bam", sample=normal_sample),
bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_postprocessed.bam", sample=tumor_sample),
bed = vcf_dir + "split_bed/{bedchrom}." + capture_kit,
output:
temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz")
@@ -30,7 +30,7 @@ rule vardict_tumor_normal:
"""
mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx32G\"';
export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx90G\"';

vardict-java -U -u -I 600 -G {input.fa} -f {params.af} -N {params.case_name} \
-b \"{input.bamT}|{input.bamN}\" \
@@ -4,7 +4,7 @@
rule vardict_tumor_only:
input:
fa = config["reference"]["reference_genome"],
bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample),
bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_postprocessed.bam", sample=tumor_sample),
bed = vcf_dir + "split_bed/{bedchrom}." + capture_kit,
output:
temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz")
@@ -29,7 +29,7 @@ export PERL5LIB=;

mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx48G\"';
export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx45G\"';

vardict-java -u -I 600 \
-G {input.fa} \
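Both VarDict rules now take the postprocessed BAM via Snakemake's expand instead of config_model.get_final_bam_name. A small illustration of how that input resolves, with a hypothetical bam_dir and sample name:

from snakemake.io import expand

bam_dir = "analysis/bam/"  # hypothetical
tumor_sample = "ACC1"      # hypothetical

bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_postprocessed.bam", sample=tumor_sample)
print(bamT)  # ['analysis/bam/tumor.ACC1.dedup_sorted_postprocessed.bam']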
2 changes: 2 additions & 0 deletions CHANGELOG.rst
@@ -30,6 +30,7 @@ Added:
* CNVs from PureCN to TGA workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1278
* Command-line arguments and rules for creation of GENS files https://github.com/Clinical-Genomics/BALSAMIC/pull/1279
* Somatic and germline Loqusdb annotation to ReadtheDocs https://github.com/Clinical-Genomics/BALSAMIC/pull/1317
* Postprocess step before VarDict in TGA https://github.com/Clinical-Genomics/BALSAMIC/pull/1332

Changed:
^^^^^^^^
@@ -66,6 +67,7 @@ Changed:
* Split analysis model into config and params models https://github.com/Clinical-Genomics/BALSAMIC/pull/1306
* Renamed name in sample column of final clinical vcfs https://github.com/Clinical-Genomics/BALSAMIC/pull/1310
* Update Gens HK tags https://github.com/Clinical-Genomics/BALSAMIC/pull/1319
* Increased memory and threads for VarDict https://github.com/Clinical-Genomics/BALSAMIC/pull/1332
* Updated ReadtheDocs with GENS and structural pipeline changes https://github.com/Clinical-Genomics/BALSAMIC/pull/1327

Fixed:
2 changes: 1 addition & 1 deletion tests/models/test_config_models.py
@@ -364,7 +364,7 @@ def test_get_final_bam_name(balsamic_model: ConfigModel):
)

# Then retrieved final bam names should match the expected format and be identical regardless of request parameter
expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.bam"
expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup_sorted.bam"
assert expected_final_bam_name == bam_name_sample_name
assert bam_name_sample_name == bam_name_sample_type
