From 9718ef7726f306a345780ca3d75b7ab1b4bbe7d4 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Mon, 26 Jun 2023 13:24:31 +0000 Subject: [PATCH 1/2] Avoid redundant index generation in sentieon flow --- subworkflows/local/bam_sentieon_dedup/main.nf | 6 ++++-- .../local/fastq_align_bwamem_mem2_dragmap_sentieon/main.nf | 4 ++++ workflows/sarek.nf | 7 ++++++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/bam_sentieon_dedup/main.nf b/subworkflows/local/bam_sentieon_dedup/main.nf index bb0d8b5d5..8a0bced37 100644 --- a/subworkflows/local/bam_sentieon_dedup/main.nf +++ b/subworkflows/local/bam_sentieon_dedup/main.nf @@ -13,6 +13,7 @@ include { SAMTOOLS_INDEX as INDEX_MARKDUPLICATES } from '../../../modules/nf-cor workflow BAM_SENTIEON_DEDUP { take: bam // channel: [mandatory] [ meta, bam ] // Although the channel is named "bam", it may contain cram-files. + bai fasta // channel: [mandatory] [ fasta ] fasta_fai // channel: [mandatory] [ fasta_fai ] intervals_bed_combined // channel: [optional] [ intervals_bed ] @@ -21,8 +22,9 @@ workflow BAM_SENTIEON_DEDUP { versions = Channel.empty() reports = Channel.empty() - INDEX_INPUT(bam) - bam_bai = bam.join(INDEX_INPUT.out.bai.concat(INDEX_INPUT.out.crai), failOnMismatch:true, failOnDuplicate:true) + bam = bam.map{ meta, bam -> [ meta - meta.subMap('data_type'), bam ] } + bai = bai.map{ meta, bai -> [ meta - meta.subMap('data_type'), bai ] } + bam_bai = bam.join(bai, failOnMismatch:true, failOnDuplicate:true) // The concat operation is part of the above command since if the "bam" channel contains cram-files, then the index files will be in the channel INDEX_INPUT.out.crai and not in INDEX_INPUT.out.bai SENTIEON_DEDUP(bam_bai, fasta, fasta_fai) diff --git a/subworkflows/local/fastq_align_bwamem_mem2_dragmap_sentieon/main.nf b/subworkflows/local/fastq_align_bwamem_mem2_dragmap_sentieon/main.nf index 99c422ef6..0699eb5c1 100644 --- a/subworkflows/local/fastq_align_bwamem_mem2_dragmap_sentieon/main.nf +++ b/subworkflows/local/fastq_align_bwamem_mem2_dragmap_sentieon/main.nf @@ -36,6 +36,9 @@ workflow FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON { bam = bam.mix(DRAGMAP_ALIGN.out.bam) bam = bam.mix(SENTIEON_BWAMEM.out.bam_and_bai.map{ meta, bam, bai -> [ meta, bam ] }) + bai = Channel.empty() + bai = bai.mix(SENTIEON_BWAMEM.out.bam_and_bai.map{ meta, bam, bai -> [ meta, bai ] }) + // Gather reports of all tools used reports = reports.mix(DRAGMAP_ALIGN.out.log) @@ -47,6 +50,7 @@ workflow FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON { emit: bam // channel: [ [meta], bam ] + bai // channel: [ [meta], bai ] reports versions // channel: [ versions.yml ] } diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 1ead6c752..7db25bc96 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -540,6 +540,10 @@ workflow SAREK { [ groupKey( meta - meta.subMap('num_lanes', 'read_group', 'size') + [ data_type:'bam', id:meta.sample ], (meta.num_lanes ?: 1) * (meta.size ?: 1)), bam ] }.groupTuple() + bai_mapped = FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON.out.bai.map{ meta, bai -> + [ groupKey( meta - meta.subMap('num_lanes', 'read_group', 'size') + [ data_type:'bai', id:meta.sample ], (meta.num_lanes ?: 1) * (meta.size ?: 1)), bai ] + }.groupTuple() + // gatk4 markduplicates can handle multiple bams as input, so no need to merge/index here // Except if and only if save_mapped or (skipping markduplicates and sentieon-dedup) if ( @@ -578,7 +582,6 @@ workflow SAREK { // ch_bam_for_markduplicates will contain bam mapped with FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON when step is mapping // Or bams that are specified in the samplesheet.csv when step is prepare_recalibration cram_for_markduplicates = params.step == 'mapping' ? bam_mapped : input_sample.map{ meta, input, index -> [ meta, input ] } - // if no MD is done, then run QC on mapped & converted CRAM files // or the input BAM (+converted) or CRAM files cram_skip_markduplicates = Channel.empty() @@ -628,8 +631,10 @@ workflow SAREK { // Gather used softwares versions versions = versions.mix(BAM_MARKDUPLICATES_SPARK.out.versions) } else if (params.tools && params.tools.split(',').contains('sentieon_dedup')) { + crai_for_markduplicates = params.step == 'mapping' ? bai_mapped : input_sample.map{ meta, input, index -> [ meta, index ] } BAM_SENTIEON_DEDUP( cram_for_markduplicates, + crai_for_markduplicates, fasta, fasta_fai, intervals_for_preprocessing) From 78979b0e2eaf0271c9a7fcfaf8aa6f891a0ec39a Mon Sep 17 00:00:00 2001 From: asp8200 Date: Mon, 26 Jun 2023 20:23:53 +0000 Subject: [PATCH 2/2] cleanup --- subworkflows/local/bam_sentieon_dedup/main.nf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/subworkflows/local/bam_sentieon_dedup/main.nf b/subworkflows/local/bam_sentieon_dedup/main.nf index 8a0bced37..b2d9a4d3a 100644 --- a/subworkflows/local/bam_sentieon_dedup/main.nf +++ b/subworkflows/local/bam_sentieon_dedup/main.nf @@ -5,10 +5,7 @@ // A when clause condition is defined in the conf/modules.config to determine if the module should be run include { CRAM_QC_MOSDEPTH_SAMTOOLS } from '../cram_qc_mosdepth_samtools/main' -include { GATK4_MARKDUPLICATES } from '../../../modules/nf-core/gatk4/markduplicates/main' include { SENTIEON_DEDUP } from '../../../modules/nf-core/sentieon/dedup/main' -include { SAMTOOLS_INDEX as INDEX_INPUT } from '../../../modules/nf-core/samtools/index/main' -include { SAMTOOLS_INDEX as INDEX_MARKDUPLICATES } from '../../../modules/nf-core/samtools/index/main' workflow BAM_SENTIEON_DEDUP { take: @@ -25,7 +22,6 @@ workflow BAM_SENTIEON_DEDUP { bam = bam.map{ meta, bam -> [ meta - meta.subMap('data_type'), bam ] } bai = bai.map{ meta, bai -> [ meta - meta.subMap('data_type'), bai ] } bam_bai = bam.join(bai, failOnMismatch:true, failOnDuplicate:true) - // The concat operation is part of the above command since if the "bam" channel contains cram-files, then the index files will be in the channel INDEX_INPUT.out.crai and not in INDEX_INPUT.out.bai SENTIEON_DEDUP(bam_bai, fasta, fasta_fai) // Join with the crai file