fix: add postprocess bam and memory to vardict #1332

Merged (19 commits) on Nov 28, 2023
10 changes: 7 additions & 3 deletions BALSAMIC/constants/cluster_analysis.json
@@ -15,6 +15,10 @@
"time": "01:00:00",
"n": 4
},
"postprocess_bam": {
"time": "03:00:00",
"n": 12
},
"finalize_gens_outputfiles": {
"time": "01:00:00",
"n": 2
@@ -129,7 +133,7 @@
"n": 8
},
"samtools_sort_index": {
"time": "01:30:00",
"time": "02:30:00",
"n": 16
},
"sentieon_DNAscope": {
@@ -186,11 +190,11 @@
},
"vardict_tumor_normal": {
"time": "12:00:00",
"n": 10
"n": 18
},
"vardict_tumor_only": {
"time": "10:00:00",
"n": 10
"n": 9
},
"sentieon_bwa_umiextract": {
"time": "8:00:00",
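The "time" and "n" values above feed the scheduler submission and the get_threads calls used by the rules further down. A minimal sketch of that lookup, assuming the cluster configuration is loaded as a plain dict from this JSON (the actual BALSAMIC helper may resolve defaults differently):

import json
from pathlib import Path


def get_threads(cluster_config: dict, rule_name: str, default: int = 8) -> int:
    """Return the core count ("n") configured for a rule, or a default if absent."""
    # Assumption: the real BALSAMIC helper may handle missing rules differently.
    return cluster_config.get(rule_name, {}).get("n", default)


cluster_config = json.loads(Path("BALSAMIC/constants/cluster_analysis.json").read_text())
print(get_threads(cluster_config, "postprocess_bam"))       # 12 with the values above
print(get_threads(cluster_config, "vardict_tumor_normal"))  # 18 with the values above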
2 changes: 2 additions & 0 deletions BALSAMIC/constants/rules.py
@@ -69,6 +69,7 @@
"align": [
"snakemake_rules/umi/sentieon_umiextract.rule",
"snakemake_rules/umi/sentieon_consensuscall.rule",
"snakemake_rules/align/postprocess_bam.rule",
],
"varcall": [
"snakemake_rules/variant_calling/germline.rule",
@@ -100,6 +101,7 @@
"align": [
"snakemake_rules/umi/sentieon_umiextract.rule",
"snakemake_rules/umi/sentieon_consensuscall.rule",
"snakemake_rules/align/postprocess_bam.rule",
],
"varcall": [
"snakemake_rules/variant_calling/germline.rule",
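These lists are consumed when the workflow Snakefile includes the rule files. A hedged sketch of that pattern, where the variable names (rules_to_include, RULE_DIRECTORY) are illustrative rather than BALSAMIC's exact ones:

from pathlib import Path

RULE_DIRECTORY = Path("BALSAMIC")
rules_to_include = [
    "snakemake_rules/umi/sentieon_umiextract.rule",
    "snakemake_rules/umi/sentieon_consensuscall.rule",
    "snakemake_rules/align/postprocess_bam.rule",
]

# Snakemake allows include statements inside a loop in the Snakefile.
for rule_file in rules_to_include:
    include: (RULE_DIRECTORY / rule_file).as_posix()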
2 changes: 1 addition & 1 deletion BALSAMIC/models/config.py
@@ -403,7 +403,7 @@ def get_final_bam_name(
final_bam_suffix = "dedup"
elif self.analysis.sequencing_type == SequencingType.TARGETED:
# Only dedup is necessary for TGA
final_bam_suffix = "dedup"
final_bam_suffix = "dedup_sorted"
else:
# For WGS the bamfiles are realigned
final_bam_suffix = "dedup.realign"
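The change above only touches the TGA branch of get_final_bam_name. A minimal standalone sketch of the suffix selection after this PR (the surrounding method in BALSAMIC/models/config.py contains additional branches not shown in the hunk):

from enum import Enum


class SequencingType(str, Enum):
    TARGETED = "targeted"
    WGS = "wgs"


def final_bam_suffix(sequencing_type: SequencingType) -> str:
    # The branch preceding this one in the real method (outside the hunk) still
    # returns "dedup"; only the TARGETED case changes in this PR.
    if sequencing_type == SequencingType.TARGETED:
        return "dedup_sorted"  # TGA bams are now sorted after deduplication
    return "dedup.realign"     # WGS bams are additionally realigned


assert final_bam_suffix(SequencingType.TARGETED) == "dedup_sorted"
assert final_bam_suffix(SequencingType.WGS) == "dedup.realign"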
47 changes: 47 additions & 0 deletions BALSAMIC/snakemake_rules/align/postprocess_bam.rule
@@ -0,0 +1,47 @@
# vim: syntax=python tabstop=4 expandtab
# coding: utf-8

# The following rule post-processes the deduplicated, sorted BAM: it filters reads with samtools and collapses the read groups with Picard AddOrReplaceReadGroups ahead of VarDict variant calling



rule postprocess_bam:
input:
bam = Path(bam_dir,"{sample_type}.{sample}.dedup_sorted.bam").as_posix()
output:
postprocessed_bam = Path(bam_dir, "{sample_type}.{sample}.dedup_sorted_postprocessed.bam").as_posix(),
benchmark:
Path(benchmark_dir,"postprocess_bam_{sample_type}.{sample}.tsv").as_posix()
singularity:
Path(singularity_image,config["bioinfo_tools"].get("picard") + ".sif").as_posix()
params:
tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
sample_id = "{sample}"
threads:
get_threads(cluster_config, "postprocess_bam")
message:
"Samtools remove unmapped reads and collapse readgroups for sample: {params.sample_id}"
shell:
"""
mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};

samtools view --threads {threads} -O BAM -f 4 {input.bam} \
-o {params.tmpdir}/{wildcards.sample_type}.um.bam ;

samtools index {params.tmpdir}/{wildcards.sample_type}.um.bam ;

samtools view --threads {threads} -O BAM -F 4 {params.tmpdir}/{wildcards.sample_type}.um.bam \
-o {params.tmpdir}/{wildcards.sample_type}.m.bam ;

samtools index {params.tmpdir}/{wildcards.sample_type}.m.bam ;

picard -Xmx75g AddOrReplaceReadGroups -RGPU ILLUMINAi -RGID {wildcards.sample_type} -RGSM {wildcards.sample_type} -RGPL ILLUMINAi -RGLB ILLUMINAi -MAX_RECORDS_IN_RAM 1000000 -CREATE_INDEX true -CREATE_MD5_FILE true \
-TMP_DIR {params.tmpdir} \
-INPUT {params.tmpdir}/{wildcards.sample_type}.m.bam \
-OUTPUT {output.postprocessed_bam};

samtools index {output.postprocessed_bam};

rm -rf {params.tmpdir};
"""
52 changes: 37 additions & 15 deletions BALSAMIC/snakemake_rules/align/sentieon_alignment.rule
@@ -85,15 +85,37 @@ shell_bam_files=$(echo {input.bam_files} | sed 's/ / -i /g') ;
--metrics {output.metrics} \
{output.bam};


sed 's/^LIBRARY/\\n## METRICS CLASS\tpicard\.sam\.DuplicationMetrics\\nLIBRARY/' -i {output.metrics}
"""

rule samtools_sort_index:
input:
bam = Path(bam_dir,"{sample_type}.{sample}.dedup.bam").as_posix(),
output:
bam = Path(bam_dir,"{sample_type}.{sample}.dedup_sorted.bam").as_posix(),
benchmark:
Path(benchmark_dir,"samtools_sort_index_{sample_type}_{sample}.tsv").as_posix()
singularity:
Path(singularity_image,config["bioinfo_tools"].get("samtools") + ".sif").as_posix()
params:
sample_id="{sample}"
threads:
get_threads(cluster_config, "samtools_sort_index")
message:
"Calculating alignment stats for sample: {params.sample_id}"
shell:
"""
samtools sort --threads {threads} -o {output.bam} {input.bam};
samtools index -@ {threads} {output.bam};
"""


rule sentieon_realign:
input:
ref = config["reference"]["reference_genome"],
mills = config["reference"]["mills_1kg"],
bam = Path(bam_dir, "{sample_type}.{sample}.dedup.bam").as_posix(),
bam = Path(bam_dir, "{sample_type}.{sample}.dedup_sorted.bam").as_posix(),
indel_1kg = config["reference"]["known_indel_1kg"]
output:
bam = Path(bam_dir, "{sample_type}.{sample}.dedup.realign.bam").as_posix()
@@ -110,18 +132,18 @@ rule sentieon_realign:
"INDEL realignment using sentieon realigner for sample: {params.sample_id}"
shell:
"""
mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export SENTIEON_TMPDIR={params.tmpdir};
export SENTIEON_LICENSE={params.sentieon_lic};

{params.sentieon_exec} driver \
-r {input.ref} \
-t {threads} \
-i {input.bam} \
--algo Realigner \
-k {input.mills} \
-k {input.indel_1kg} \
{output};
"""
mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export SENTIEON_TMPDIR={params.tmpdir};
export SENTIEON_LICENSE={params.sentieon_lic};
{params.sentieon_exec} driver \
-r {input.ref} \
-t {threads} \
-i {input.bam} \
--algo Realigner \
-k {input.mills} \
-k {input.indel_1kg} \
{output};
"""

17 changes: 9 additions & 8 deletions BALSAMIC/snakemake_rules/concatenation/concatenation.rule
@@ -4,10 +4,12 @@
rule concatenate:
"""Merge fastq files per lane into a single forward and reverse fastq."""
input:
fwd_fastqs = lambda wildcards: config_model.get_all_fastqs_for_sample(
sample_name = wildcards.sample, fastq_types = [FastqName.FWD]),
rev_fastqs = lambda wildcards: config_model.get_all_fastqs_for_sample(
sample_name = wildcards.sample, fastq_types = [FastqName.REV])
fastqs_fwd=lambda wildcards: config_model.get_all_fastqs_for_sample(
sample_name=wildcards.sample, fastq_types=[FastqName.FWD]
),
fastqs_rev=lambda wildcards: config_model.get_all_fastqs_for_sample(
sample_name=wildcards.sample, fastq_types=[FastqName.REV]
),
output:
concat_fwd = fastq_dir + "{sample}_concat_R_1.fp.fastq.gz",
concat_rev = fastq_dir + "{sample}_concat_R_2.fp.fastq.gz"
@@ -16,13 +18,12 @@ rule concatenate:
params:
fastq_dir = fastq_dir,
sample = "{sample}",
read = "{read}"
threads:
get_threads(cluster_config, "concatenate")
message:
"Sample {params.sample} and read {params.read} FASTQ concatenation"
"Sample {params.sample} FASTQ concatenation"
shell:
"""
cat {input.fwd_fastqs} > {output.concat_fwd}
cat {input.rev_fastqs} > {output.concat_rev}
cat {input.fastqs_fwd} > {output.concat_fwd}
cat {input.fastqs_rev} > {output.concat_rev}
"""
@@ -6,8 +6,8 @@
rule vardict_tumor_normal:
input:
fa = config["reference"]["reference_genome"],
bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample),
bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample),
bamN = expand(bam_dir + "normal.{sample}.dedup_sorted_postprocessed.bam", sample=normal_sample),
bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_postprocessed.bam", sample=tumor_sample),
bed = vcf_dir + "split_bed/{bedchrom}." + capture_kit,
output:
temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz")
@@ -30,7 +30,7 @@ rule vardict_tumor_normal:
"""
mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx32G\"';
export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx90G\"';

vardict-java -U -u -I 600 -G {input.fa} -f {params.af} -N {params.case_name} \
-b \"{input.bamT}|{input.bamN}\" \
@@ -4,7 +4,7 @@
rule vardict_tumor_only:
input:
fa = config["reference"]["reference_genome"],
bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample),
bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_postprocessed.bam", sample=tumor_sample),
bed = vcf_dir + "split_bed/{bedchrom}." + capture_kit,
output:
temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz")
@@ -29,7 +29,7 @@ export PERL5LIB=;

mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx48G\"';
export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx45G\"';

vardict-java -u -I 600 \
-G {input.fa} \
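Both VarDict rules now take the postprocessed BAM via Snakemake's expand instead of config_model.get_final_bam_name. A small illustration of how that input resolves, with a hypothetical bam_dir and sample name:

from snakemake.io import expand

bam_dir = "analysis/bam/"  # hypothetical
tumor_sample = "ACC1"      # hypothetical

bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_postprocessed.bam", sample=tumor_sample)
print(bamT)  # ['analysis/bam/tumor.ACC1.dedup_sorted_postprocessed.bam']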
2 changes: 2 additions & 0 deletions CHANGELOG.rst
@@ -30,6 +30,7 @@ Added:
* CNVs from PureCN to TGA workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1278
* Command-line arguments and rules for creation of GENS files https://github.com/Clinical-Genomics/BALSAMIC/pull/1279
* Somatic and germline Loqusdb annotation to ReadtheDocs https://github.com/Clinical-Genomics/BALSAMIC/pull/1317
* Postprocess step before VarDict in TGA https://github.com/Clinical-Genomics/BALSAMIC/pull/1332

Changed:
^^^^^^^^
@@ -66,6 +67,7 @@ Changed:
* Split analysis model into config and params models https://github.com/Clinical-Genomics/BALSAMIC/pull/1306
* Renamed name in sample column of final clinical vcfs https://github.com/Clinical-Genomics/BALSAMIC/pull/1310
* Update Gens HK tags https://github.com/Clinical-Genomics/BALSAMIC/pull/1319
* Increased memory and threads for VarDict https://github.com/Clinical-Genomics/BALSAMIC/pull/1332
* Updated ReadtheDocs with GENS and structural pipeline changes https://github.com/Clinical-Genomics/BALSAMIC/pull/1327

Fixed:
2 changes: 1 addition & 1 deletion tests/models/test_config_models.py
@@ -364,7 +364,7 @@ def test_get_final_bam_name(balsamic_model: ConfigModel):
)

# Then retrieved final bam names should match the expected format and be identical regardless of request parameter
expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.bam"
expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup_sorted.bam"
assert expected_final_bam_name == bam_name_sample_name
assert bam_name_sample_name == bam_name_sample_type
