From 8d348cf7f3a4f597e34736d047e6dd73250220ad Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Wed, 1 Nov 2023 15:11:15 +0100 Subject: [PATCH 01/16] add rule sleep before starting analysis --- BALSAMIC/constants/rules.py | 1 + BALSAMIC/constants/workflow_params.py | 2 ++ BALSAMIC/snakemake_rules/misc/sleep.rule | 15 +++++++++++++++ .../quality_control/fastp_tga.rule | 1 + .../quality_control/fastp_wgs.rule | 1 + .../snakemake_rules/quality_control/fastqc.rule | 1 + BALSAMIC/workflows/PON.smk | 6 +++++- BALSAMIC/workflows/QC.smk | 6 ++++-- BALSAMIC/workflows/balsamic.smk | 4 +++- CHANGELOG.rst | 2 ++ 10 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 BALSAMIC/snakemake_rules/misc/sleep.rule diff --git a/BALSAMIC/constants/rules.py b/BALSAMIC/constants/rules.py index a9d440b9c..2b96cf2d6 100644 --- a/BALSAMIC/constants/rules.py +++ b/BALSAMIC/constants/rules.py @@ -30,6 +30,7 @@ SNAKEMAKE_RULES: Dict[str, Dict[str, list]] = { "common": { + "misc": ["snakemake_rules/misc/sleep.rule"], "qc": [ "snakemake_rules/quality_control/fastqc.rule", "snakemake_rules/quality_control/multiqc.rule", diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index 3c6d46614..fcf822b54 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -108,6 +108,8 @@ }, } +SLEEP_BEFORE_START = 120 + WORKFLOW_PARAMS = { "common": { "pcr_model": "NONE", diff --git a/BALSAMIC/snakemake_rules/misc/sleep.rule b/BALSAMIC/snakemake_rules/misc/sleep.rule new file mode 100644 index 000000000..c97deb58d --- /dev/null +++ b/BALSAMIC/snakemake_rules/misc/sleep.rule @@ -0,0 +1,15 @@ + +rule sleep_before_start: + """Wait 120s before starting any processing to avoid key_error issue.""" + output: + wake_up = result_dir + "start_analysis" + params: + sleep_seconds = seconds_before_start + threads: get_threads(cluster_config, "sleep_before_start") + message: + "Sleeping for {params.sleep_seconds} seconds before starting analysis." + shell: + """ +sleep {params.sleep_seconds} +echo "Waited: {params.sleep_seconds} seconds. Now starting analysis." >> {output.wake_up} + """ diff --git a/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule b/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule index a9abae72f..841eca850 100644 --- a/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule +++ b/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule @@ -3,6 +3,7 @@ rule fastp_umi_trim: """Fastq TGA data pre-processing to remove UMIs.""" input: + wake_up = result_dir + "start_analysis", fastq_r1 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.FWD), fastq_r2 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.REV) output: diff --git a/BALSAMIC/snakemake_rules/quality_control/fastp_wgs.rule b/BALSAMIC/snakemake_rules/quality_control/fastp_wgs.rule index 743d50db3..0b4bba3dc 100644 --- a/BALSAMIC/snakemake_rules/quality_control/fastp_wgs.rule +++ b/BALSAMIC/snakemake_rules/quality_control/fastp_wgs.rule @@ -4,6 +4,7 @@ rule fastp_quality_trim_wgs: """Fastq data pre-processing for WGS.""" input: + wake_up = result_dir + "start_analysis", fastq_r1 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.FWD), fastq_r2 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.REV) output: diff --git a/BALSAMIC/snakemake_rules/quality_control/fastqc.rule b/BALSAMIC/snakemake_rules/quality_control/fastqc.rule index 493a892fd..4d1d895f5 100644 --- a/BALSAMIC/snakemake_rules/quality_control/fastqc.rule +++ b/BALSAMIC/snakemake_rules/quality_control/fastqc.rule @@ -4,6 +4,7 @@ rule fastqc: """Perform quality control checks on raw sequence data.""" input: + wake_up = result_dir + "start_analysis", fastq = input_fastq_dir + "{fastq_file_names}.fastq.gz" output: fastqc_zip = fastqc_dir + "{fastq_file_names}_fastqc.zip" diff --git a/BALSAMIC/workflows/PON.smk b/BALSAMIC/workflows/PON.smk index 134e5ff06..092029cb8 100644 --- a/BALSAMIC/workflows/PON.smk +++ b/BALSAMIC/workflows/PON.smk @@ -15,7 +15,7 @@ from BALSAMIC.constants.paths import BALSAMIC_DIR from BALSAMIC.constants.analysis import FastqName, SampleType, SequencingType, PONWorkflow, Gender from BALSAMIC.utils.io import write_finish_file from BALSAMIC.utils.rule import get_fastp_parameters, get_threads, get_result_dir -from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS +from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS, SLEEP_BEFORE_START from BALSAMIC.models.analysis import BalsamicWorkflowConfig, ConfigModel @@ -50,6 +50,9 @@ bam_dir: str = Path(result_dir, "bam", "").as_posix() + "/" cnv_dir: str = Path(result_dir, "cnv", "").as_posix() + "/" qc_dir: str = Path(result_dir, "qc", "").as_posix() + "/" +# Pre run parameters +seconds_before_start: int = SLEEP_BEFORE_START + # PON setting pon_workflow: PONWorkflow = config_model.analysis.pon_workflow @@ -83,6 +86,7 @@ if not Path(config["SENTIEON_EXEC"]).exists(): sequence_type = config['analysis']["sequencing_type"] rules_to_include = [] +rules_to_include.append("snakemake_rules/misc/sleep.rule") if sequence_type == SequencingType.TARGETED: rules_to_include.append("snakemake_rules/quality_control/fastp_tga.rule") else: diff --git a/BALSAMIC/workflows/QC.smk b/BALSAMIC/workflows/QC.smk index c8bbb140f..49bba14fb 100644 --- a/BALSAMIC/workflows/QC.smk +++ b/BALSAMIC/workflows/QC.smk @@ -24,7 +24,7 @@ from BALSAMIC.utils.rule import (get_fastp_parameters, get_rule_output, get_resu get_script_path, get_threads, get_sequencing_type, get_capture_kit) -from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS +from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS, SLEEP_BEFORE_START # Initialize ConfigModel config_model = ConfigModel.parse_obj(config) @@ -56,6 +56,8 @@ vcf_dir: str = Path(result_dir, "vcf").as_posix() + "/" qc_dir: str = Path(result_dir, "qc").as_posix() + "/" delivery_dir: str = Path(result_dir, "delivery").as_posix() + "/" +# Pre run parameters +seconds_before_start: int = SLEEP_BEFORE_START # Run information singularity_image: str = config_model.singularity['image'] @@ -112,7 +114,7 @@ sequence_type = config['analysis']["sequencing_type"] rules_to_include = [] for workflow_type, value in SNAKEMAKE_RULES.items(): if workflow_type in ["common", analysis_type + "_" + sequence_type]: - rules_to_include.extend(value.get("qc", []) + value.get("align", [])) + rules_to_include.extend(value.get("misc", []) + value.get("qc", []) + value.get("align", [])) rules_to_include = [rule for rule in rules_to_include if "umi" not in rule] if "snakemake_rules/quality_control/report.rule" in rules_to_include: rules_to_include = [rule for rule in rules_to_include if "quality_control/report.rule" not in rule] diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index eb7f4f4fc..8555b19c6 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -34,7 +34,7 @@ from BALSAMIC.utils.rule import (get_fastp_parameters, get_variant_callers, get_ from BALSAMIC.constants.analysis import MutationType, FastqName, SampleType from BALSAMIC.constants.variant_filters import (COMMON_SETTINGS, VARDICT_SETTINGS, SENTIEON_VARCALL_SETTINGS, SVDB_FILTER_SETTINGS) -from BALSAMIC.constants.workflow_params import (WORKFLOW_PARAMS, VARCALL_PARAMS) +from BALSAMIC.constants.workflow_params import (WORKFLOW_PARAMS, VARCALL_PARAMS, SLEEP_BEFORE_START) from BALSAMIC.constants.rules import SNAKEMAKE_RULES # Initialize ConfigModel @@ -71,6 +71,8 @@ delivery_dir: str = Path(result_dir, "delivery").as_posix() + "/" umi_dir: str = Path(result_dir, "umi").as_posix() + "/" umi_qc_dir: str = Path(qc_dir, "umi_qc").as_posix() + "/" +# Pre run parameters +seconds_before_start: int = SLEEP_BEFORE_START # Annotations research_annotations = [] diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4a249a2c7..31c764fe0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -29,6 +29,8 @@ Added: * CNVs from PureCN to TGA workflow https://Clinical-Genomics/BALSAMIC/pull/1278 * CNVkit Panel of Normal for gmsmyeloid_5.3 to TGA workflow https://Clinical-Genomics/BALSAMIC/pull/1278 * Command-line arguments and rules for creation of GENS files https://github.com/Clinical-Genomics/BALSAMIC/pull/1279 +* Sleep rule before start to fix key_error https://github.com/Clinical-Genomics/BALSAMIC/pull/1310 + Changed: From b28453780b8594ac780763e54a88b435e8e6e9f0 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Wed, 15 Nov 2023 12:28:11 +0100 Subject: [PATCH 02/16] add new rule for resorting and indexing bamfile --- BALSAMIC/constants/cluster_analysis.json | 2 +- BALSAMIC/models/config.py | 2 +- .../align/sentieon_alignment.rule | 52 +++++++++++++------ 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/BALSAMIC/constants/cluster_analysis.json b/BALSAMIC/constants/cluster_analysis.json index 078657dd7..6f5ddf1f1 100644 --- a/BALSAMIC/constants/cluster_analysis.json +++ b/BALSAMIC/constants/cluster_analysis.json @@ -129,7 +129,7 @@ "n": 8 }, "samtools_sort_index": { - "time": "01:30:00", + "time": "02:30:00", "n": 16 }, "sentieon_DNAscope": { diff --git a/BALSAMIC/models/config.py b/BALSAMIC/models/config.py index 070d93dde..6240256da 100644 --- a/BALSAMIC/models/config.py +++ b/BALSAMIC/models/config.py @@ -403,7 +403,7 @@ def get_final_bam_name( final_bam_suffix = "dedup" elif self.analysis.sequencing_type == SequencingType.TARGETED: # Only dedup is necessary for TGA - final_bam_suffix = "dedup" + final_bam_suffix = "dedup_sorted" else: # For WGS the bamfiles are realigned final_bam_suffix = "dedup.realign" diff --git a/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule b/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule index 22817e726..21ec02ae5 100644 --- a/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule +++ b/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule @@ -85,15 +85,37 @@ shell_bam_files=$(echo {input.bam_files} | sed 's/ / -i /g') ; --metrics {output.metrics} \ {output.bam}; + sed 's/^LIBRARY/\\n## METRICS CLASS\tpicard\.sam\.DuplicationMetrics\\nLIBRARY/' -i {output.metrics} """ +rule samtools_sort_index: + input: + bam = Path(bam_dir,"{sample_type}.{sample}.dedup.bam").as_posix(), + output: + bam = Path(bam_dir,"{sample_type}.{sample}.dedup_sorted.bam").as_posix(), + benchmark: + Path(benchmark_dir,"samtools_sort_index_{sample_type}_{sample}.tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("samtools") + ".sif").as_posix() + params: + sample_id="{sample}" + threads: + get_threads(cluster_config,"samtools_qc") + message: + "Calculating alignment stats for sample: {params.sample_id}" + shell: + """ +samtools sort --threads {threads} -o {output.bam} {input.bam}; +samtools index --threads {threads} {output.bam}; + """ + rule sentieon_realign: input: ref = config["reference"]["reference_genome"], mills = config["reference"]["mills_1kg"], - bam = Path(bam_dir, "{sample_type}.{sample}.dedup.bam").as_posix(), + bam = Path(bam_dir, "{sample_type}.{sample}.dedup_sorted.bam").as_posix(), indel_1kg = config["reference"]["known_indel_1kg"] output: bam = Path(bam_dir, "{sample_type}.{sample}.dedup.realign.bam").as_posix() @@ -110,18 +132,18 @@ rule sentieon_realign: "INDEL realignment using sentieon realigner for sample: {params.sample_id}" shell: """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; -export SENTIEON_TMPDIR={params.tmpdir}; -export SENTIEON_LICENSE={params.sentieon_lic}; - -{params.sentieon_exec} driver \ --r {input.ref} \ --t {threads} \ --i {input.bam} \ ---algo Realigner \ --k {input.mills} \ --k {input.indel_1kg} \ -{output}; - """ + mkdir -p {params.tmpdir}; + export TMPDIR={params.tmpdir}; + export SENTIEON_TMPDIR={params.tmpdir}; + export SENTIEON_LICENSE={params.sentieon_lic}; + + {params.sentieon_exec} driver \ + -r {input.ref} \ + -t {threads} \ + -i {input.bam} \ + --algo Realigner \ + -k {input.mills} \ + -k {input.indel_1kg} \ + {output}; + """ From 55c7ee563b66f5dd63d94254485da7d7d7e1a9fe Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Wed, 15 Nov 2023 17:44:12 +0100 Subject: [PATCH 03/16] fix threads to index --- BALSAMIC/snakemake_rules/align/sentieon_alignment.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule b/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule index 21ec02ae5..13f0eb815 100644 --- a/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule +++ b/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule @@ -107,7 +107,7 @@ rule samtools_sort_index: shell: """ samtools sort --threads {threads} -o {output.bam} {input.bam}; -samtools index --threads {threads} {output.bam}; +samtools index -@ {threads} {output.bam}; """ From 682ffcf561c9c2f46be9c29a333d8b700d5eb451 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 20 Nov 2023 16:11:05 +0100 Subject: [PATCH 04/16] Revert "fix threads to index" This reverts commit 55c7ee563b66f5dd63d94254485da7d7d7e1a9fe. --- BALSAMIC/snakemake_rules/align/sentieon_alignment.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule b/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule index 13f0eb815..21ec02ae5 100644 --- a/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule +++ b/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule @@ -107,7 +107,7 @@ rule samtools_sort_index: shell: """ samtools sort --threads {threads} -o {output.bam} {input.bam}; -samtools index -@ {threads} {output.bam}; +samtools index --threads {threads} {output.bam}; """ From 2e91d146f2bdf40be6a5e8b93243452e9664000c Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Tue, 21 Nov 2023 11:52:04 +0100 Subject: [PATCH 05/16] Revert "Revert "fix threads to index"" This reverts commit 682ffcf561c9c2f46be9c29a333d8b700d5eb451. --- BALSAMIC/snakemake_rules/align/sentieon_alignment.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule b/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule index 21ec02ae5..13f0eb815 100644 --- a/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule +++ b/BALSAMIC/snakemake_rules/align/sentieon_alignment.rule @@ -107,7 +107,7 @@ rule samtools_sort_index: shell: """ samtools sort --threads {threads} -o {output.bam} {input.bam}; -samtools index --threads {threads} {output.bam}; +samtools index -@ {threads} {output.bam}; """ From f6d1618f2026dcd133eb7a2e7df12aabf4d984a2 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Tue, 21 Nov 2023 11:54:38 +0100 Subject: [PATCH 06/16] Revert "merge sleep rule" This reverts commit 1e5ac64f4cfacbd4142f824a36792faf23bd18d1, reversing changes made to 55c7ee563b66f5dd63d94254485da7d7d7e1a9fe. --- BALSAMIC/constants/rules.py | 1 - BALSAMIC/constants/workflow_params.py | 2 -- BALSAMIC/snakemake_rules/misc/sleep.rule | 15 --------------- .../quality_control/fastp_tga.rule | 1 - .../quality_control/fastp_wgs.rule | 1 - .../snakemake_rules/quality_control/fastqc.rule | 1 - BALSAMIC/workflows/PON.smk | 7 +------ BALSAMIC/workflows/QC.smk | 10 ++-------- BALSAMIC/workflows/balsamic.smk | 4 +--- CHANGELOG.rst | 4 ---- 10 files changed, 4 insertions(+), 42 deletions(-) delete mode 100644 BALSAMIC/snakemake_rules/misc/sleep.rule diff --git a/BALSAMIC/constants/rules.py b/BALSAMIC/constants/rules.py index 55071af45..ec5d07bde 100644 --- a/BALSAMIC/constants/rules.py +++ b/BALSAMIC/constants/rules.py @@ -30,7 +30,6 @@ SNAKEMAKE_RULES: Dict[str, Dict[str, list]] = { "common": { - "misc": ["snakemake_rules/misc/sleep.rule"], "qc": [ "snakemake_rules/quality_control/fastqc.rule", "snakemake_rules/quality_control/multiqc.rule", diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index 1d3988708..d3a2486b6 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -108,8 +108,6 @@ }, } -SLEEP_BEFORE_START = 120 - WORKFLOW_PARAMS = { "common": { "pcr_model": "NONE", diff --git a/BALSAMIC/snakemake_rules/misc/sleep.rule b/BALSAMIC/snakemake_rules/misc/sleep.rule deleted file mode 100644 index c97deb58d..000000000 --- a/BALSAMIC/snakemake_rules/misc/sleep.rule +++ /dev/null @@ -1,15 +0,0 @@ - -rule sleep_before_start: - """Wait 120s before starting any processing to avoid key_error issue.""" - output: - wake_up = result_dir + "start_analysis" - params: - sleep_seconds = seconds_before_start - threads: get_threads(cluster_config, "sleep_before_start") - message: - "Sleeping for {params.sleep_seconds} seconds before starting analysis." - shell: - """ -sleep {params.sleep_seconds} -echo "Waited: {params.sleep_seconds} seconds. Now starting analysis." >> {output.wake_up} - """ diff --git a/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule b/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule index 841eca850..a9abae72f 100644 --- a/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule +++ b/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule @@ -3,7 +3,6 @@ rule fastp_umi_trim: """Fastq TGA data pre-processing to remove UMIs.""" input: - wake_up = result_dir + "start_analysis", fastq_r1 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.FWD), fastq_r2 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.REV) output: diff --git a/BALSAMIC/snakemake_rules/quality_control/fastp_wgs.rule b/BALSAMIC/snakemake_rules/quality_control/fastp_wgs.rule index 0b4bba3dc..743d50db3 100644 --- a/BALSAMIC/snakemake_rules/quality_control/fastp_wgs.rule +++ b/BALSAMIC/snakemake_rules/quality_control/fastp_wgs.rule @@ -4,7 +4,6 @@ rule fastp_quality_trim_wgs: """Fastq data pre-processing for WGS.""" input: - wake_up = result_dir + "start_analysis", fastq_r1 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.FWD), fastq_r2 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.REV) output: diff --git a/BALSAMIC/snakemake_rules/quality_control/fastqc.rule b/BALSAMIC/snakemake_rules/quality_control/fastqc.rule index 4d1d895f5..493a892fd 100644 --- a/BALSAMIC/snakemake_rules/quality_control/fastqc.rule +++ b/BALSAMIC/snakemake_rules/quality_control/fastqc.rule @@ -4,7 +4,6 @@ rule fastqc: """Perform quality control checks on raw sequence data.""" input: - wake_up = result_dir + "start_analysis", fastq = input_fastq_dir + "{fastq_file_names}.fastq.gz" output: fastqc_zip = fastqc_dir + "{fastq_file_names}_fastqc.zip" diff --git a/BALSAMIC/workflows/PON.smk b/BALSAMIC/workflows/PON.smk index d7f7827e1..0ed589067 100644 --- a/BALSAMIC/workflows/PON.smk +++ b/BALSAMIC/workflows/PON.smk @@ -10,14 +10,13 @@ from typing import Dict, List from BALSAMIC.constants.analysis import FastqName, Gender, PONWorkflow, SampleType, SequencingType from BALSAMIC.constants.paths import BALSAMIC_DIR -from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS, SLEEP_BEFORE_START +from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS from BALSAMIC.models.config import ConfigModel from BALSAMIC.models.params import BalsamicWorkflowConfig from BALSAMIC.utils.exc import BalsamicError from BALSAMIC.utils.io import write_finish_file from BALSAMIC.utils.rule import get_fastp_parameters, get_result_dir, get_threads - # Initialize ConfigModel config_model = ConfigModel.model_validate(config) @@ -49,9 +48,6 @@ bam_dir: str = Path(result_dir, "bam", "").as_posix() + "/" cnv_dir: str = Path(result_dir, "cnv", "").as_posix() + "/" qc_dir: str = Path(result_dir, "qc", "").as_posix() + "/" -# Pre run parameters -seconds_before_start: int = SLEEP_BEFORE_START - # PON setting pon_workflow: PONWorkflow = config_model.analysis.pon_workflow @@ -85,7 +81,6 @@ if not Path(config["SENTIEON_EXEC"]).exists(): sequence_type = config['analysis']["sequencing_type"] rules_to_include = [] -rules_to_include.append("snakemake_rules/misc/sleep.rule") if sequence_type == SequencingType.TARGETED: rules_to_include.append("snakemake_rules/quality_control/fastp_tga.rule") else: diff --git a/BALSAMIC/workflows/QC.smk b/BALSAMIC/workflows/QC.smk index 4d0548c0a..d94eb78c4 100644 --- a/BALSAMIC/workflows/QC.smk +++ b/BALSAMIC/workflows/QC.smk @@ -9,7 +9,7 @@ from typing import Dict, List from BALSAMIC.constants.analysis import AnalysisType, FastqName, SampleType from BALSAMIC.constants.paths import BALSAMIC_DIR from BALSAMIC.constants.rules import SNAKEMAKE_RULES -from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS, SLEEP_BEFORE_START +from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS from BALSAMIC.models.config import ConfigModel from BALSAMIC.models.params import BalsamicWorkflowConfig from BALSAMIC.utils.cli import check_executable, generate_h5 @@ -57,12 +57,6 @@ vcf_dir: str = Path(result_dir, "vcf").as_posix() + "/" qc_dir: str = Path(result_dir, "qc").as_posix() + "/" delivery_dir: str = Path(result_dir, "delivery").as_posix() + "/" -<<<<<<< HEAD -======= -# Pre run parameters -seconds_before_start: int = SLEEP_BEFORE_START - ->>>>>>> add_sleep_rule_before_start # Run information singularity_image: str = config_model.singularity['image'] sample_names: List[str] = config_model.get_all_sample_names() @@ -118,7 +112,7 @@ sequence_type = config['analysis']["sequencing_type"] rules_to_include = [] for workflow_type, value in SNAKEMAKE_RULES.items(): if workflow_type in ["common", analysis_type + "_" + sequence_type]: - rules_to_include.extend(value.get("misc", []) + value.get("qc", []) + value.get("align", [])) + rules_to_include.extend(value.get("qc", []) + value.get("align", [])) rules_to_include = [rule for rule in rules_to_include if "umi" not in rule] if "snakemake_rules/quality_control/report.rule" in rules_to_include: rules_to_include = [rule for rule in rules_to_include if "quality_control/report.rule" not in rule] diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 89be5458e..9c9b04e2f 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -17,7 +17,7 @@ from BALSAMIC.constants.variant_filters import ( SVDB_FILTER_SETTINGS, VARDICT_SETTINGS, ) -from BALSAMIC.constants.workflow_params import (WORKFLOW_PARAMS, VARCALL_PARAMS, SLEEP_BEFORE_START) +from BALSAMIC.constants.workflow_params import VARCALL_PARAMS, WORKFLOW_PARAMS from BALSAMIC.models.config import ConfigModel from BALSAMIC.models.params import BalsamicWorkflowConfig, VarCallerFilter from BALSAMIC.utils.cli import check_executable, generate_h5 @@ -82,8 +82,6 @@ delivery_dir: str = Path(result_dir, "delivery").as_posix() + "/" umi_dir: str = Path(result_dir, "umi").as_posix() + "/" umi_qc_dir: str = Path(qc_dir, "umi_qc").as_posix() + "/" -# Pre run parameters -seconds_before_start: int = SLEEP_BEFORE_START # Annotations research_annotations = [] diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 483d16caa..95db38041 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -30,10 +30,6 @@ Added: * CNVs from PureCN to TGA workflow https://Clinical-Genomics/BALSAMIC/pull/1278 * Command-line arguments and rules for creation of GENS files https://github.com/Clinical-Genomics/BALSAMIC/pull/1279 * Somatic and germline Loqusdb annotation to ReadtheDocs https://github.com/Clinical-Genomics/BALSAMIC/pull/1317 -* Sleep rule before start to fix key_error https://github.com/Clinical-Genomics/BALSAMIC/pull/1310 - - ->>>>>>> add_sleep_rule_before_start Changed: ^^^^^^^^ From d16eebd20b819df90ab0643e1a5c97b5a84a81fb Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Tue, 21 Nov 2023 11:56:54 +0100 Subject: [PATCH 07/16] fix test --- tests/models/test_config_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_config_models.py b/tests/models/test_config_models.py index 34935be1e..499eacc1c 100644 --- a/tests/models/test_config_models.py +++ b/tests/models/test_config_models.py @@ -364,7 +364,7 @@ def test_get_final_bam_name(balsamic_model: ConfigModel): ) # Then retrieved final bam names should match the expected format and be identical regardless of request parameter - expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.bam" + expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup_sorted.bam" assert expected_final_bam_name == bam_name_sample_name assert bam_name_sample_name == bam_name_sample_type From 9331e84c1598f56f953373cb0051e26ffc4c2760 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Sat, 25 Nov 2023 11:32:19 +0100 Subject: [PATCH 08/16] revert tga workflow, WIP, test --- BALSAMIC/constants/rules.py | 15 +++- .../snakemake_rules/align/bwa_mem_picard.rule | 84 +++++++++++++++++++ .../concatenation/concatenation.rule | 17 ++-- .../quality_control/fastp_tga.rule | 40 ++++----- .../quality_control/multiqc.rule | 7 +- 5 files changed, 130 insertions(+), 33 deletions(-) create mode 100644 BALSAMIC/snakemake_rules/align/bwa_mem_picard.rule diff --git a/BALSAMIC/constants/rules.py b/BALSAMIC/constants/rules.py index ec5d07bde..451b0c810 100644 --- a/BALSAMIC/constants/rules.py +++ b/BALSAMIC/constants/rules.py @@ -37,7 +37,6 @@ "snakemake_rules/quality_control/samtools_qc.rule", ], "align": [ - "snakemake_rules/align/sentieon_alignment.rule", "snakemake_rules/align/bam_compress.rule", ], "varcall": [ @@ -66,7 +65,11 @@ "snakemake_rules/umi/mergetype_tumor_umi.rule", "snakemake_rules/umi/generate_AF_tables.rule", ], + "concatenation": [ + "snakemake_rules/concatenation/concatenation.rule", + ], "align": [ + "snakemake_rules/align/bwa_mem_picard.rule", "snakemake_rules/umi/sentieon_umiextract.rule", "snakemake_rules/umi/sentieon_consensuscall.rule", ], @@ -97,7 +100,11 @@ "snakemake_rules/quality_control/contest.rule", "snakemake_rules/umi/generate_AF_tables.rule", ], + "concatenation": [ + "snakemake_rules/concatenation/concatenation.rule", + ], "align": [ + "snakemake_rules/align/bwa_mem_picard.rule", "snakemake_rules/umi/sentieon_umiextract.rule", "snakemake_rules/umi/sentieon_consensuscall.rule", ], @@ -116,6 +123,9 @@ ], }, "single_wgs": { + "align": [ + "snakemake_rules/align/sentieon_alignment.rule" + ], "qc": [ "snakemake_rules/quality_control/fastp_wgs.rule", "snakemake_rules/quality_control/sentieon_qc_metrics.rule", @@ -134,6 +144,9 @@ ], }, "paired_wgs": { + "align": [ + "snakemake_rules/align/sentieon_alignment.rule" + ], "qc": [ "snakemake_rules/quality_control/fastp_wgs.rule", "snakemake_rules/quality_control/sentieon_qc_metrics.rule", diff --git a/BALSAMIC/snakemake_rules/align/bwa_mem_picard.rule b/BALSAMIC/snakemake_rules/align/bwa_mem_picard.rule new file mode 100644 index 000000000..bbf98e3cf --- /dev/null +++ b/BALSAMIC/snakemake_rules/align/bwa_mem_picard.rule @@ -0,0 +1,84 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +# Following rule will take input fastq files, align them using bwa mem, and convert the output to sam format + + +rule bwa_mem: + input: + fa = config["reference"]["reference_genome"], + fastq_r1=fastq_dir + "{sample}_1.fp.fastq.gz", + fastq_r2=fastq_dir + "{sample}_2.fp.fastq.gz", + refidx = expand(config["reference"]["reference_genome"] + ".{prefix}", prefix=["amb","ann","bwt","pac","sa"]) + output: + bam_out=Path(bam_dir,"{sample}_align_sort.bam").as_posix() + benchmark: + Path(benchmark_dir, "bwa_mem_{sample}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bwa") + ".sif").as_posix() + params: + bam_header = params.common.align_header, + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + sample_id = "{sample}" + threads: + get_threads(cluster_config, "bwa_mem") + message: + ("Align fastq files with bwa-mem to reference genome and " + "sort using samtools for sample: {params.sample_id}") + shell: + """ +mkdir -p {params.tmpdir}; +export TMPDIR={params.tmpdir}; + +bwa mem \ +-t {threads} \ +-R {params.bam_header} \ +-M \ +-v 1 \ +{input.fa} {input.fastq_r1} {input.fastq_r2} \ +| samtools sort -T {params.tmpdir} \ +--threads {threads} \ +--output-fmt BAM \ +-o {output.bam_out} - ; + +samtools index -@ {threads} {output.bam_out}; +rm -rf {params.tmpdir}; + """ + + +rule picard_markduplicates: + input: + Path(bam_dir, "{sample}_align_sort.bam").as_posix() + output: + mrkdup = Path(bam_dir, "{sample_type}.{sample}.dedup_sorted.bam").as_posix(), + picard_stats = Path(qc_dir, "{sample_type}.{sample}.dedup.metrics").as_posix(), + benchmark: + Path(benchmark_dir,"picard_{sample_type}_markduplicates_{sample}.tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("picard") + ".sif").as_posix() + params: + mem = "16g", + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + sample_id = "{sample}" + threads: + get_threads(cluster_config, "picard_markduplicates") + message: + "Picard marking duplicates and samtool indexing for sample: {params.sample_id}" + shell: + """ +mkdir -p {params.tmpdir}; +export TMPDIR={params.tmpdir}; + +picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ +MarkDuplicates \ +INPUT={input} \ +OUTPUT={output.mrkdup} \ +VALIDATION_STRINGENCY=SILENT \ +MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 \ +REMOVE_DUPLICATES=FALSE \ +METRICS_FILE='{output.picard_stats}'; + +samtools index {output.mrkdup}; + +rm -rf {params.tmpdir}; + """ diff --git a/BALSAMIC/snakemake_rules/concatenation/concatenation.rule b/BALSAMIC/snakemake_rules/concatenation/concatenation.rule index d7466f6e2..53a32dc2d 100644 --- a/BALSAMIC/snakemake_rules/concatenation/concatenation.rule +++ b/BALSAMIC/snakemake_rules/concatenation/concatenation.rule @@ -4,10 +4,12 @@ rule concatenate: """Merge fastq files per lane into a single forward and reverse fastq.""" input: - fwd_fastqs = lambda wildcards: config_model.get_all_fastqs_for_sample( - sample_name = wildcards.sample, fastq_types = [FastqName.FWD]), - rev_fastqs = lambda wildcards: config_model.get_all_fastqs_for_sample( - sample_name = wildcards.sample, fastq_types = [FastqName.REV]) + fastqs_fwd=lambda wildcards: config_model.get_all_fastqs_for_sample( + sample_name=wildcards.sample, fastq_types=[FastqName.FWD] + ), + fastqs_rev=lambda wildcards: config_model.get_all_fastqs_for_sample( + sample_name=wildcards.sample, fastq_types=[FastqName.REV] + ), output: concat_fwd = fastq_dir + "{sample}_concat_R_1.fp.fastq.gz", concat_rev = fastq_dir + "{sample}_concat_R_2.fp.fastq.gz" @@ -16,13 +18,12 @@ rule concatenate: params: fastq_dir = fastq_dir, sample = "{sample}", - read = "{read}" threads: get_threads(cluster_config, "concatenate") message: - "Sample {params.sample} and read {params.read} FASTQ concatenation" + "Sample {params.sample} FASTQ concatenation" shell: """ - cat {input.fwd_fastqs} > {output.concat_fwd} - cat {input.rev_fastqs} > {output.concat_rev} + cat {input.fastqs_fwd} > {output.concat_fwd} + cat {input.fastqs_rev} > {output.concat_rev} """ diff --git a/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule b/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule index a9abae72f..dd6fcac3e 100644 --- a/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule +++ b/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule @@ -3,33 +3,33 @@ rule fastp_umi_trim: """Fastq TGA data pre-processing to remove UMIs.""" input: - fastq_r1 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.FWD), - fastq_r2 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.REV) + concat_fwd=fastq_dir + "{sample}_concat_R_1.fp.fastq.gz", + concat_rev=fastq_dir + "{sample}_concat_R_2.fp.fastq.gz" output: - fastq_r1 = temp(fastq_dir + "{fastq_pattern}_1.umi_removed.fastq.gz"), - fastq_r2 = temp(fastq_dir + "{fastq_pattern}_2.umi_removed.fastq.gz"), - json = qc_dir + "fastp/{fastq_pattern}_umi_removed_fastp.json", - html = qc_dir + "fastp/{fastq_pattern}_umi_removed_fastp.html", + fastq_r1 = temp(fastq_dir + "{sample}_1.umi_removed.fastq.gz"), + fastq_r2 = temp(fastq_dir + "{sample}_2.umi_removed.fastq.gz"), + json = qc_dir + "fastp/{sample}_umi_removed_fastp.json", + html = qc_dir + "fastp/{sample}_umi_removed_fastp.html", benchmark: - Path(benchmark_dir, "fastp_remove_umi" + "{fastq_pattern}.tsv").as_posix() + Path(benchmark_dir, "fastp_remove_umi" + "{sample}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix() params: tmpdir = tmp_dir, fastp_trim_umi = " ".join(fastp_parameters["fastp_trim_umi"]), - fastq_pattern = "{fastq_pattern}", + sample = "{sample}", threads: get_threads(cluster_config, 'fastp_remove_umi') message: - "Trimming away UMI sequences for fastqs in fastq pattern: {params.fastq_pattern}" + "Trimming away UMI sequences for fastqs in sample: {params.sample}" shell: """ export TMPDIR={params.tmpdir}; fastp \ --thread {threads} \ ---in1 {input.fastq_r1} \ ---in2 {input.fastq_r2} \ +--in1 {input.concat_fwd} \ +--in2 {input.concat_rev} \ --out1 {output.fastq_r1} \ --out2 {output.fastq_r2} \ --json {output.json} \ @@ -45,26 +45,26 @@ fastp \ rule fastp_quality_trim_tga: """Fastq data pre-processing after removal of UMIs.""" input: - fastq_r1 = fastq_dir + "{fastq_pattern}_1.umi_removed.fastq.gz", - fastq_r2 = fastq_dir + "{fastq_pattern}_2.umi_removed.fastq.gz" + fastq_r1 = fastq_dir + "{sample}_1.umi_removed.fastq.gz", + fastq_r2 = fastq_dir + "{sample}_2.umi_removed.fastq.gz" output: - fastq_r1 = temp(fastq_dir + "{fastq_pattern}_1.fp.fastq.gz"), - fastq_r2 = temp(fastq_dir + "{fastq_pattern}_2.fp.fastq.gz"), - json = qc_dir + "fastp/{fastq_pattern}_fastp.json", - html = qc_dir + "fastp/{fastq_pattern}_fastp.html" + fastq_r1 = temp(fastq_dir + "{sample}_1.fp.fastq.gz"), + fastq_r2 = temp(fastq_dir + "{sample}_2.fp.fastq.gz"), + json = qc_dir + "fastp/{sample}_fastp.json", + html = qc_dir + "fastp/{sample}_fastp.html" benchmark: - Path(benchmark_dir, "fastp_quality_trim" + "{fastq_pattern}.tsv").as_posix() + Path(benchmark_dir, "fastp_quality_trim" + "{sample}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix() params: tmpdir = tmp_dir, quality_trim = " ".join(fastp_parameters["fastp_trim_qual"]), adapter_trim = " ".join(fastp_parameters["fastp_trim_adapter"]), - fastq_pattern = "{fastq_pattern}" + sample = "{sample}" threads: get_threads(cluster_config, 'fastp_quality_trim') message: - "Quality and adapter trimming for fastqs for fastq pattern: {params.fastq_pattern}" + "Quality and adapter trimming for fastqs for fastq pattern: {params.sample}" shell: """ export TMPDIR={params.tmpdir}; diff --git a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule index 176da7b15..bccb090b0 100644 --- a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule +++ b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule @@ -55,10 +55,9 @@ if config["analysis"]["sequencing_type"] == 'wgs': else: # fastp metrics - multiqc_input.extend(expand(qc_dir + "fastp/{fastq_pattern}_fastp.json", - fastq_pattern=config_model.get_fastq_patterns_by_sample(sample_names = sample_names))) - multiqc_input.extend(expand(qc_dir + "fastp/{fastq_pattern}_fastp.html", - fastq_pattern=config_model.get_fastq_patterns_by_sample(sample_names = sample_names))) + multiqc_input.extend(expand(qc_dir + "fastp/{sample}_fastp.json", sample=sample_names)) + multiqc_input.extend(expand(qc_dir + "fastp/{sample}_fastp.html", sample=sample_names)) + # picard metrics multiqc_input.extend(expand(bam_dir + "{sample}.dedup.insertsizemetric.txt", sample=tumor_sample)) From 7cfed8116f8a86c3f402c273019af19ca08ab67d Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Sun, 26 Nov 2023 21:12:37 +0100 Subject: [PATCH 09/16] add all postprocessing steps from production version --- .../snakemake_rules/align/bwa_mem_picard.rule | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/snakemake_rules/align/bwa_mem_picard.rule b/BALSAMIC/snakemake_rules/align/bwa_mem_picard.rule index bbf98e3cf..abf04a933 100644 --- a/BALSAMIC/snakemake_rules/align/bwa_mem_picard.rule +++ b/BALSAMIC/snakemake_rules/align/bwa_mem_picard.rule @@ -72,13 +72,35 @@ export TMPDIR={params.tmpdir}; picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ MarkDuplicates \ INPUT={input} \ -OUTPUT={output.mrkdup} \ +OUTPUT={output.mrkdup}_markdup.bam \ VALIDATION_STRINGENCY=SILENT \ MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 \ REMOVE_DUPLICATES=FALSE \ METRICS_FILE='{output.picard_stats}'; -samtools index {output.mrkdup}; +samtools index {output.mrkdup}_markdup.bam ; + +picard -Xmx150g FixMateInformation -ADD_MATE_CIGAR true -MAX_RECORDS_IN_RAM 10000000 -CREATE_INDEX true -CREATE_MD5_FILE true" \ +-TMP_DIR {params.tmpdir} \ +-INPUT {output.mrkdup}_markdup.bam \ +-OUTPUT {params.tmpdir}/{wildcards.sample_type}.fixed.bam; + +samtools view --threads {threads} -O BAM -f 4 {params.tmpdir}/{wildcards.sample_type}.fixed.bam \ +-o {params.tmpdir}/{wildcards.sample_type}.fixed.um.bam; + +samtools index {params.tmpdir}/{wildcards.sample_type}.fixed.um.bam; + +samtools view --threads {threads} -O BAM -F 4 {params.tmpdir}/{wildcards.sample_type}.fixed.bam \ +-o {params.tmpdir}/{wildcards.sample_type}.fixed.m.bam; + +samtools index {params.tmpdir}/{wildcards.sample_type}.fixed.m.bam; + +picard -Xmx150g AddOrReplaceReadGroups -RGPU ILLUMINAi -RGID {wildcards.sample_type} -RGSM {wildcards.sample_type} -RGPL ILLUMINAi -RGLB ILLUMINAi -MAX_RECORDS_IN_RAM 1000000 -CREATE_INDEX true -CREATE_MD5_FILE true \ +-TMP_DIR {params.tmpdir} \ +-INPUT {params.tmpdir}/{wildcards.sample_type}.fixed.m.bam \ +-OUTPUT {output.mrkdup}; + +samtools index {output.mrkdup}; rm -rf {params.tmpdir}; """ From c04910d46681589a9b5fd3b9b4701658f02bb629 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 27 Nov 2023 15:13:21 +0100 Subject: [PATCH 10/16] add more mem for vardict, add picardRG and removal of unmapped reads for vardict --- BALSAMIC/constants/cluster_analysis.json | 8 +- BALSAMIC/constants/rules.py | 19 +--- .../snakemake_rules/align/bwa_mem_picard.rule | 106 ------------------ .../align/postprocess_bam.rule | 47 ++++++++ .../quality_control/fastp_tga.rule | 42 +++---- .../quality_control/multiqc.rule | 9 +- .../variant_calling/somatic_tumor_normal.rule | 6 +- .../variant_calling/somatic_tumor_only.rule | 4 +- 8 files changed, 88 insertions(+), 153 deletions(-) delete mode 100644 BALSAMIC/snakemake_rules/align/bwa_mem_picard.rule create mode 100644 BALSAMIC/snakemake_rules/align/postprocess_bam.rule diff --git a/BALSAMIC/constants/cluster_analysis.json b/BALSAMIC/constants/cluster_analysis.json index 6f5ddf1f1..cbbda931c 100644 --- a/BALSAMIC/constants/cluster_analysis.json +++ b/BALSAMIC/constants/cluster_analysis.json @@ -15,6 +15,10 @@ "time": "01:00:00", "n": 4 }, + "postprocess_bam": { + "time": "03:00:00", + "n": 12 + }, "finalize_gens_outputfiles": { "time": "01:00:00", "n": 2 @@ -186,11 +190,11 @@ }, "vardict_tumor_normal": { "time": "12:00:00", - "n": 10 + "n": 18 }, "vardict_tumor_only": { "time": "10:00:00", - "n": 10 + "n": 9 }, "sentieon_bwa_umiextract": { "time": "8:00:00", diff --git a/BALSAMIC/constants/rules.py b/BALSAMIC/constants/rules.py index 451b0c810..6d62d19e1 100644 --- a/BALSAMIC/constants/rules.py +++ b/BALSAMIC/constants/rules.py @@ -37,6 +37,7 @@ "snakemake_rules/quality_control/samtools_qc.rule", ], "align": [ + "snakemake_rules/align/sentieon_alignment.rule", "snakemake_rules/align/bam_compress.rule", ], "varcall": [ @@ -65,13 +66,10 @@ "snakemake_rules/umi/mergetype_tumor_umi.rule", "snakemake_rules/umi/generate_AF_tables.rule", ], - "concatenation": [ - "snakemake_rules/concatenation/concatenation.rule", - ], "align": [ - "snakemake_rules/align/bwa_mem_picard.rule", "snakemake_rules/umi/sentieon_umiextract.rule", "snakemake_rules/umi/sentieon_consensuscall.rule", + "snakemake_rules/align/postprocess_bam.rule", ], "varcall": [ "snakemake_rules/variant_calling/germline.rule", @@ -100,13 +98,10 @@ "snakemake_rules/quality_control/contest.rule", "snakemake_rules/umi/generate_AF_tables.rule", ], - "concatenation": [ - "snakemake_rules/concatenation/concatenation.rule", - ], "align": [ - "snakemake_rules/align/bwa_mem_picard.rule", "snakemake_rules/umi/sentieon_umiextract.rule", "snakemake_rules/umi/sentieon_consensuscall.rule", + "snakemake_rules/align/postprocess_bam.rule", ], "varcall": [ "snakemake_rules/variant_calling/germline.rule", @@ -123,9 +118,6 @@ ], }, "single_wgs": { - "align": [ - "snakemake_rules/align/sentieon_alignment.rule" - ], "qc": [ "snakemake_rules/quality_control/fastp_wgs.rule", "snakemake_rules/quality_control/sentieon_qc_metrics.rule", @@ -144,9 +136,6 @@ ], }, "paired_wgs": { - "align": [ - "snakemake_rules/align/sentieon_alignment.rule" - ], "qc": [ "snakemake_rules/quality_control/fastp_wgs.rule", "snakemake_rules/quality_control/sentieon_qc_metrics.rule", @@ -222,4 +211,4 @@ "finalize_gens_outputfiles", # TMB "tmb_calculation", -] +] \ No newline at end of file diff --git a/BALSAMIC/snakemake_rules/align/bwa_mem_picard.rule b/BALSAMIC/snakemake_rules/align/bwa_mem_picard.rule deleted file mode 100644 index abf04a933..000000000 --- a/BALSAMIC/snakemake_rules/align/bwa_mem_picard.rule +++ /dev/null @@ -1,106 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - -# Following rule will take input fastq files, align them using bwa mem, and convert the output to sam format - - -rule bwa_mem: - input: - fa = config["reference"]["reference_genome"], - fastq_r1=fastq_dir + "{sample}_1.fp.fastq.gz", - fastq_r2=fastq_dir + "{sample}_2.fp.fastq.gz", - refidx = expand(config["reference"]["reference_genome"] + ".{prefix}", prefix=["amb","ann","bwt","pac","sa"]) - output: - bam_out=Path(bam_dir,"{sample}_align_sort.bam").as_posix() - benchmark: - Path(benchmark_dir, "bwa_mem_{sample}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bwa") + ".sif").as_posix() - params: - bam_header = params.common.align_header, - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - sample_id = "{sample}" - threads: - get_threads(cluster_config, "bwa_mem") - message: - ("Align fastq files with bwa-mem to reference genome and " - "sort using samtools for sample: {params.sample_id}") - shell: - """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; - -bwa mem \ --t {threads} \ --R {params.bam_header} \ --M \ --v 1 \ -{input.fa} {input.fastq_r1} {input.fastq_r2} \ -| samtools sort -T {params.tmpdir} \ ---threads {threads} \ ---output-fmt BAM \ --o {output.bam_out} - ; - -samtools index -@ {threads} {output.bam_out}; -rm -rf {params.tmpdir}; - """ - - -rule picard_markduplicates: - input: - Path(bam_dir, "{sample}_align_sort.bam").as_posix() - output: - mrkdup = Path(bam_dir, "{sample_type}.{sample}.dedup_sorted.bam").as_posix(), - picard_stats = Path(qc_dir, "{sample_type}.{sample}.dedup.metrics").as_posix(), - benchmark: - Path(benchmark_dir,"picard_{sample_type}_markduplicates_{sample}.tsv").as_posix() - singularity: - Path(singularity_image,config["bioinfo_tools"].get("picard") + ".sif").as_posix() - params: - mem = "16g", - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - sample_id = "{sample}" - threads: - get_threads(cluster_config, "picard_markduplicates") - message: - "Picard marking duplicates and samtool indexing for sample: {params.sample_id}" - shell: - """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; - -picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ -MarkDuplicates \ -INPUT={input} \ -OUTPUT={output.mrkdup}_markdup.bam \ -VALIDATION_STRINGENCY=SILENT \ -MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 \ -REMOVE_DUPLICATES=FALSE \ -METRICS_FILE='{output.picard_stats}'; - -samtools index {output.mrkdup}_markdup.bam ; - -picard -Xmx150g FixMateInformation -ADD_MATE_CIGAR true -MAX_RECORDS_IN_RAM 10000000 -CREATE_INDEX true -CREATE_MD5_FILE true" \ --TMP_DIR {params.tmpdir} \ --INPUT {output.mrkdup}_markdup.bam \ --OUTPUT {params.tmpdir}/{wildcards.sample_type}.fixed.bam; - -samtools view --threads {threads} -O BAM -f 4 {params.tmpdir}/{wildcards.sample_type}.fixed.bam \ --o {params.tmpdir}/{wildcards.sample_type}.fixed.um.bam; - -samtools index {params.tmpdir}/{wildcards.sample_type}.fixed.um.bam; - -samtools view --threads {threads} -O BAM -F 4 {params.tmpdir}/{wildcards.sample_type}.fixed.bam \ --o {params.tmpdir}/{wildcards.sample_type}.fixed.m.bam; - -samtools index {params.tmpdir}/{wildcards.sample_type}.fixed.m.bam; - -picard -Xmx150g AddOrReplaceReadGroups -RGPU ILLUMINAi -RGID {wildcards.sample_type} -RGSM {wildcards.sample_type} -RGPL ILLUMINAi -RGLB ILLUMINAi -MAX_RECORDS_IN_RAM 1000000 -CREATE_INDEX true -CREATE_MD5_FILE true \ --TMP_DIR {params.tmpdir} \ --INPUT {params.tmpdir}/{wildcards.sample_type}.fixed.m.bam \ --OUTPUT {output.mrkdup}; - -samtools index {output.mrkdup}; - -rm -rf {params.tmpdir}; - """ diff --git a/BALSAMIC/snakemake_rules/align/postprocess_bam.rule b/BALSAMIC/snakemake_rules/align/postprocess_bam.rule new file mode 100644 index 000000000..6e4d08d93 --- /dev/null +++ b/BALSAMIC/snakemake_rules/align/postprocess_bam.rule @@ -0,0 +1,47 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +# Following rule will take input fastq files, align them using bwa mem, and convert the output to sam format + + + +rule postprocess_bam: + input: + bam = Path(bam_dir,"{sample_type}.{sample}.dedup_sorted.bam").as_posix() + output: + postprocessed_bam = Path(bam_dir, "{sample_type}.{sample}.dedup_sorted_postprocessed.bam").as_posix(), + benchmark: + Path(benchmark_dir,"postprocess_bam_{sample_type}.{sample}.tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("picard") + ".sif").as_posix() + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + sample_id = "{sample}" + threads: + get_threads(cluster_config, "postprocess_bam") + message: + "Samtools remove unmapped reads and collapse readgroups for sample: {params.sample_id}" + shell: + """ +mkdir -p {params.tmpdir}; +export TMPDIR={params.tmpdir}; + +samtools view --threads {threads} -O BAM -f 4 {input.bam} \ +-o {params.tmpdir}/{wildcards.sample_type}.um.bam ; + +samtools index {params.tmpdir}/{wildcards.sample_type}.um.bam ; + +samtools view --threads {threads} -O BAM -F 4 {params.tmpdir}/{wildcards.sample_type}.um.bam \ +-o {params.tmpdir}/{wildcards.sample_type}.m.bam ; + +samtools index {params.tmpdir}/{wildcards.sample_type}.m.bam ; + +picard -Xmx75g AddOrReplaceReadGroups -RGPU ILLUMINAi -RGID {wildcards.sample_type} -RGSM {wildcards.sample_type} -RGPL ILLUMINAi -RGLB ILLUMINAi -MAX_RECORDS_IN_RAM 1000000 -CREATE_INDEX true -CREATE_MD5_FILE true \ +-TMP_DIR {params.tmpdir} \ +-INPUT {params.tmpdir}/{wildcards.sample_type}.m.bam \ +-OUTPUT {output.mrkdup}; + +samtools index {output.postprocessed_bam}; + +rm -rf {params.tmpdir}; + """ diff --git a/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule b/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule index dd6fcac3e..52a0bac0d 100644 --- a/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule +++ b/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule @@ -3,33 +3,33 @@ rule fastp_umi_trim: """Fastq TGA data pre-processing to remove UMIs.""" input: - concat_fwd=fastq_dir + "{sample}_concat_R_1.fp.fastq.gz", - concat_rev=fastq_dir + "{sample}_concat_R_2.fp.fastq.gz" + fastq_r1 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.FWD), + fastq_r2 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.REV) output: - fastq_r1 = temp(fastq_dir + "{sample}_1.umi_removed.fastq.gz"), - fastq_r2 = temp(fastq_dir + "{sample}_2.umi_removed.fastq.gz"), - json = qc_dir + "fastp/{sample}_umi_removed_fastp.json", - html = qc_dir + "fastp/{sample}_umi_removed_fastp.html", + fastq_r1 = temp(fastq_dir + "{fastq_pattern}_1.umi_removed.fastq.gz"), + fastq_r2 = temp(fastq_dir + "{fastq_pattern}_2.umi_removed.fastq.gz"), + json = qc_dir + "fastp/{fastq_pattern}_umi_removed_fastp.json", + html = qc_dir + "fastp/{fastq_pattern}_umi_removed_fastp.html", benchmark: - Path(benchmark_dir, "fastp_remove_umi" + "{sample}.tsv").as_posix() + Path(benchmark_dir, "fastp_remove_umi" + "{fastq_pattern}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix() params: tmpdir = tmp_dir, fastp_trim_umi = " ".join(fastp_parameters["fastp_trim_umi"]), - sample = "{sample}", + fastq_pattern = "{fastq_pattern}", threads: get_threads(cluster_config, 'fastp_remove_umi') message: - "Trimming away UMI sequences for fastqs in sample: {params.sample}" + "Trimming away UMI sequences for fastqs in fastq pattern: {params.fastq_pattern}" shell: """ export TMPDIR={params.tmpdir}; fastp \ --thread {threads} \ ---in1 {input.concat_fwd} \ ---in2 {input.concat_rev} \ +--in1 {input.fastq_r1} \ +--in2 {input.fastq_r2} \ --out1 {output.fastq_r1} \ --out2 {output.fastq_r2} \ --json {output.json} \ @@ -45,26 +45,26 @@ fastp \ rule fastp_quality_trim_tga: """Fastq data pre-processing after removal of UMIs.""" input: - fastq_r1 = fastq_dir + "{sample}_1.umi_removed.fastq.gz", - fastq_r2 = fastq_dir + "{sample}_2.umi_removed.fastq.gz" + fastq_r1 = fastq_dir + "{fastq_pattern}_1.umi_removed.fastq.gz", + fastq_r2 = fastq_dir + "{fastq_pattern}_2.umi_removed.fastq.gz" output: - fastq_r1 = temp(fastq_dir + "{sample}_1.fp.fastq.gz"), - fastq_r2 = temp(fastq_dir + "{sample}_2.fp.fastq.gz"), - json = qc_dir + "fastp/{sample}_fastp.json", - html = qc_dir + "fastp/{sample}_fastp.html" + fastq_r1 = temp(fastq_dir + "{fastq_pattern}_1.fp.fastq.gz"), + fastq_r2 = temp(fastq_dir + "{fastq_pattern}_2.fp.fastq.gz"), + json = qc_dir + "fastp/{fastq_pattern}_fastp.json", + html = qc_dir + "fastp/{fastq_pattern}_fastp.html" benchmark: - Path(benchmark_dir, "fastp_quality_trim" + "{sample}.tsv").as_posix() + Path(benchmark_dir, "fastp_quality_trim" + "{fastq_pattern}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix() params: tmpdir = tmp_dir, quality_trim = " ".join(fastp_parameters["fastp_trim_qual"]), adapter_trim = " ".join(fastp_parameters["fastp_trim_adapter"]), - sample = "{sample}" + fastq_pattern = "{fastq_pattern}" threads: get_threads(cluster_config, 'fastp_quality_trim') message: - "Quality and adapter trimming for fastqs for fastq pattern: {params.sample}" + "Quality and adapter trimming for fastqs for fastq pattern: {params.fastq_pattern}" shell: """ export TMPDIR={params.tmpdir}; @@ -80,4 +80,4 @@ fastp \ --overrepresentation_analysis \ {params.quality_trim} \ {params.adapter_trim}; - """ + """ \ No newline at end of file diff --git a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule index bccb090b0..2dd6cbddd 100644 --- a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule +++ b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule @@ -55,9 +55,10 @@ if config["analysis"]["sequencing_type"] == 'wgs': else: # fastp metrics - multiqc_input.extend(expand(qc_dir + "fastp/{sample}_fastp.json", sample=sample_names)) - multiqc_input.extend(expand(qc_dir + "fastp/{sample}_fastp.html", sample=sample_names)) - + multiqc_input.extend(expand(qc_dir + "fastp/{fastq_pattern}_fastp.json", + fastq_pattern=config_model.get_fastq_patterns_by_sample(sample_names = sample_names))) + multiqc_input.extend(expand(qc_dir + "fastp/{fastq_pattern}_fastp.html", + fastq_pattern=config_model.get_fastq_patterns_by_sample(sample_names = sample_names))) # picard metrics multiqc_input.extend(expand(bam_dir + "{sample}.dedup.insertsizemetric.txt", sample=tumor_sample)) @@ -115,4 +116,4 @@ multiqc --force --outdir {params.qc_dir} \ -l {params.qc_dir}/dir_list; chmod -R 777 {params.qc_dir}; - """ + """ \ No newline at end of file diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule index afab32275..f228fb7dc 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule @@ -6,8 +6,8 @@ rule vardict_tumor_normal: input: fa = config["reference"]["reference_genome"], - bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), - bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + bamN = expand(bam_dir + "normal.{sample}.dedup_sorted_postprocessed.bam", sample=normal_sample), + bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_postprocessed.bam", sample=tumor_sample), bed = vcf_dir + "split_bed/{bedchrom}." + capture_kit, output: temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz") @@ -30,7 +30,7 @@ rule vardict_tumor_normal: """ mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; -export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx32G\"'; +export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx90G\"'; vardict-java -U -u -I 600 -G {input.fa} -f {params.af} -N {params.case_name} \ -b \"{input.bamT}|{input.bamN}\" \ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule index 3c6339fa4..dbbdcf31e 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule @@ -4,7 +4,7 @@ rule vardict_tumor_only: input: fa = config["reference"]["reference_genome"], - bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_postprocessed.bam", sample=tumor_sample), bed = vcf_dir + "split_bed/{bedchrom}." + capture_kit, output: temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz") @@ -29,7 +29,7 @@ export PERL5LIB=; mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; -export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx48G\"'; +export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx45G\"'; vardict-java -u -I 600 \ -G {input.fa} \ From b057515034e6f1a7f50d6deb1650e51d5b66aef4 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 27 Nov 2023 15:23:28 +0100 Subject: [PATCH 11/16] empty lines --- BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule | 2 +- BALSAMIC/snakemake_rules/quality_control/multiqc.rule | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule b/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule index 52a0bac0d..a9abae72f 100644 --- a/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule +++ b/BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule @@ -80,4 +80,4 @@ fastp \ --overrepresentation_analysis \ {params.quality_trim} \ {params.adapter_trim}; - """ \ No newline at end of file + """ diff --git a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule index 2dd6cbddd..176da7b15 100644 --- a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule +++ b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule @@ -116,4 +116,4 @@ multiqc --force --outdir {params.qc_dir} \ -l {params.qc_dir}/dir_list; chmod -R 777 {params.qc_dir}; - """ \ No newline at end of file + """ From aafdc4231b9e7d33449e08ac716f052aebb115a7 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 27 Nov 2023 15:27:28 +0100 Subject: [PATCH 12/16] fix bug --- BALSAMIC/snakemake_rules/align/postprocess_bam.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/snakemake_rules/align/postprocess_bam.rule b/BALSAMIC/snakemake_rules/align/postprocess_bam.rule index 6e4d08d93..0f8d23241 100644 --- a/BALSAMIC/snakemake_rules/align/postprocess_bam.rule +++ b/BALSAMIC/snakemake_rules/align/postprocess_bam.rule @@ -39,7 +39,7 @@ samtools index {params.tmpdir}/{wildcards.sample_type}.m.bam ; picard -Xmx75g AddOrReplaceReadGroups -RGPU ILLUMINAi -RGID {wildcards.sample_type} -RGSM {wildcards.sample_type} -RGPL ILLUMINAi -RGLB ILLUMINAi -MAX_RECORDS_IN_RAM 1000000 -CREATE_INDEX true -CREATE_MD5_FILE true \ -TMP_DIR {params.tmpdir} \ -INPUT {params.tmpdir}/{wildcards.sample_type}.m.bam \ --OUTPUT {output.mrkdup}; +-OUTPUT {output.postprocessed_bam}; samtools index {output.postprocessed_bam}; From 2bfd5e1445a9752c7219fc6ed142e193330f0775 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Tue, 28 Nov 2023 10:52:34 +0100 Subject: [PATCH 13/16] black and changelog --- BALSAMIC/constants/rules.py | 2 +- CHANGELOG.rst | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/BALSAMIC/constants/rules.py b/BALSAMIC/constants/rules.py index 6d62d19e1..165e1b68d 100644 --- a/BALSAMIC/constants/rules.py +++ b/BALSAMIC/constants/rules.py @@ -211,4 +211,4 @@ "finalize_gens_outputfiles", # TMB "tmb_calculation", -] \ No newline at end of file +] diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 95db38041..44aec5cf9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -30,6 +30,7 @@ Added: * CNVs from PureCN to TGA workflow https://Clinical-Genomics/BALSAMIC/pull/1278 * Command-line arguments and rules for creation of GENS files https://github.com/Clinical-Genomics/BALSAMIC/pull/1279 * Somatic and germline Loqusdb annotation to ReadtheDocs https://github.com/Clinical-Genomics/BALSAMIC/pull/1317 +* Postprocess step before VarDict in TGA https://github.com/Clinical-Genomics/BALSAMIC/pull/1332 Changed: ^^^^^^^^ @@ -66,6 +67,7 @@ Changed: * Split analysis model into config and params models https://github.com/Clinical-Genomics/BALSAMIC/pull/1306 * Renamed name in sample column of final clincial vcfs https://github.com/Clinical-Genomics/BALSAMIC/pull/1310 * Update Gens HK tags https://github.com/Clinical-Genomics/BALSAMIC/pull/1319 +* Increased memory and threads for VarDict https://github.com/Clinical-Genomics/BALSAMIC/pull/1332 Fixed: ^^^^^^ From c3074ab69c2f0203e68e96f32d0a0084987f7c49 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Tue, 28 Nov 2023 11:46:52 +0100 Subject: [PATCH 14/16] remove samtools unmapped reads command --- .../align/postprocess_bam.rule | 21 +++++++------------ .../variant_calling/somatic_tumor_normal.rule | 4 ++-- .../variant_calling/somatic_tumor_only.rule | 2 +- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/BALSAMIC/snakemake_rules/align/postprocess_bam.rule b/BALSAMIC/snakemake_rules/align/postprocess_bam.rule index 0f8d23241..f70fc85f3 100644 --- a/BALSAMIC/snakemake_rules/align/postprocess_bam.rule +++ b/BALSAMIC/snakemake_rules/align/postprocess_bam.rule @@ -9,7 +9,7 @@ rule postprocess_bam: input: bam = Path(bam_dir,"{sample_type}.{sample}.dedup_sorted.bam").as_posix() output: - postprocessed_bam = Path(bam_dir, "{sample_type}.{sample}.dedup_sorted_postprocessed.bam").as_posix(), + postprocessed_bam = Path(bam_dir, "{sample_type}.{sample}.dedup_sorted_addRG.bam").as_posix(), benchmark: Path(benchmark_dir,"postprocess_bam_{sample_type}.{sample}.tsv").as_posix() singularity: @@ -20,25 +20,18 @@ rule postprocess_bam: threads: get_threads(cluster_config, "postprocess_bam") message: - "Samtools remove unmapped reads and collapse readgroups for sample: {params.sample_id}" + "Collapse readgroups for sample: {params.sample_id}" shell: """ mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; -samtools view --threads {threads} -O BAM -f 4 {input.bam} \ --o {params.tmpdir}/{wildcards.sample_type}.um.bam ; - -samtools index {params.tmpdir}/{wildcards.sample_type}.um.bam ; - -samtools view --threads {threads} -O BAM -F 4 {params.tmpdir}/{wildcards.sample_type}.um.bam \ --o {params.tmpdir}/{wildcards.sample_type}.m.bam ; - -samtools index {params.tmpdir}/{wildcards.sample_type}.m.bam ; - -picard -Xmx75g AddOrReplaceReadGroups -RGPU ILLUMINAi -RGID {wildcards.sample_type} -RGSM {wildcards.sample_type} -RGPL ILLUMINAi -RGLB ILLUMINAi -MAX_RECORDS_IN_RAM 1000000 -CREATE_INDEX true -CREATE_MD5_FILE true \ +picard -Xmx75g AddOrReplaceReadGroups \ +-RGPU ILLUMINAi -RGID {wildcards.sample_type} -RGSM {wildcards.sample_type} \ +-RGPL ILLUMINAi -RGLB ILLUMINAi -MAX_RECORDS_IN_RAM 1000000 \ +-CREATE_INDEX true -CREATE_MD5_FILE true \ -TMP_DIR {params.tmpdir} \ --INPUT {params.tmpdir}/{wildcards.sample_type}.m.bam \ +-INPUT {input.bam} \ -OUTPUT {output.postprocessed_bam}; samtools index {output.postprocessed_bam}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule index f228fb7dc..f8ba2a2e1 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule @@ -6,8 +6,8 @@ rule vardict_tumor_normal: input: fa = config["reference"]["reference_genome"], - bamN = expand(bam_dir + "normal.{sample}.dedup_sorted_postprocessed.bam", sample=normal_sample), - bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_postprocessed.bam", sample=tumor_sample), + bamN = expand(bam_dir + "normal.{sample}.dedup_sorted_addRG.bam", sample=normal_sample), + bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_addRG.bam", sample=tumor_sample), bed = vcf_dir + "split_bed/{bedchrom}." + capture_kit, output: temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz") diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule index dbbdcf31e..cf8ea7148 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule @@ -4,7 +4,7 @@ rule vardict_tumor_only: input: fa = config["reference"]["reference_genome"], - bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_postprocessed.bam", sample=tumor_sample), + bamT = expand(bam_dir + "tumor.{sample}.dedup_sorted_addRG.bam", sample=tumor_sample), bed = vcf_dir + "split_bed/{bedchrom}." + capture_kit, output: temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz") From 54a7073bcc079ed97072a8294aed76a0505f360c Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Tue, 28 Nov 2023 11:47:27 +0100 Subject: [PATCH 15/16] clean up unnecessary mkdir tmp --- BALSAMIC/snakemake_rules/align/postprocess_bam.rule | 1 - 1 file changed, 1 deletion(-) diff --git a/BALSAMIC/snakemake_rules/align/postprocess_bam.rule b/BALSAMIC/snakemake_rules/align/postprocess_bam.rule index f70fc85f3..97ef9aa6e 100644 --- a/BALSAMIC/snakemake_rules/align/postprocess_bam.rule +++ b/BALSAMIC/snakemake_rules/align/postprocess_bam.rule @@ -23,7 +23,6 @@ rule postprocess_bam: "Collapse readgroups for sample: {params.sample_id}" shell: """ -mkdir -p {params.tmpdir}; export TMPDIR={params.tmpdir}; picard -Xmx75g AddOrReplaceReadGroups \ From 7b912c77bc7d27fe305ab35cabcd3dd65ed907e7 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Tue, 28 Nov 2023 15:46:39 +0100 Subject: [PATCH 16/16] add note about new rule --- BALSAMIC/snakemake_rules/align/postprocess_bam.rule | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/snakemake_rules/align/postprocess_bam.rule b/BALSAMIC/snakemake_rules/align/postprocess_bam.rule index 97ef9aa6e..76f16b559 100644 --- a/BALSAMIC/snakemake_rules/align/postprocess_bam.rule +++ b/BALSAMIC/snakemake_rules/align/postprocess_bam.rule @@ -1,9 +1,11 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -# Following rule will take input fastq files, align them using bwa mem, and convert the output to sam format - +# NOTE: This rule is only applied to prevent VarDict from failing with error like this: +# Critical exception occurs on region: 20:39794721-39795011, program will be stopped. +# java.util.concurrent.CompletionException: java.lang.InternalError: a fault occurred in a recent unsafe memory access +# It's however unclear how this resolves the issue. rule postprocess_bam: input: