Merge pull request #957 from maxulysse/mutect2
Add failsafe options for Nextflow join operator
maxulysse authored Mar 2, 2023
2 parents 4e60e8c + 59b3d0d commit b2a8891
Showing 19 changed files with 61 additions and 54 deletions.
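Background (editor's note, not part of the commit): Nextflow's `join()` pairs items from two channels by a matching key, here the meta map in the first tuple element. With its defaults it silently drops items that never find a partner and tolerates duplicated keys, so an incomplete channel can propagate unnoticed. The `failOnDuplicate: true` and `failOnMismatch: true` options added throughout this commit turn both situations into immediate errors. A minimal sketch with invented sample names:

    // Two channels keyed on a meta map; 'sampleB' has no index file.
    ch_vcf = Channel.of( [ [id:'sampleA'], 'sampleA.vcf.gz' ],
                         [ [id:'sampleB'], 'sampleB.vcf.gz' ] )
    ch_tbi = Channel.of( [ [id:'sampleA'], 'sampleA.vcf.gz.tbi' ] )

    // Default join: sampleB silently disappears from the output.
    ch_vcf.join(ch_tbi).view()

    // Failsafe join as used in this commit: the missing partner for sampleB
    // (or a duplicated key on either side) aborts the run instead.
    ch_vcf.join(ch_tbi, failOnDuplicate: true, failOnMismatch: true).view()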
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -41,6 +41,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#929](https://github.com/nf-core/sarek/pull/929) - Fix somatic variant calling issues with msisensor following [#896](https://github.com/nf-core/sarek/pull/896)
- [#941](https://github.com/nf-core/sarek/pull/941) - Fix json validation for `tools`, `skip_tools` and `use_gatk_spark` [#892](https://github.com/nf-core/sarek/issues/892)
- [#954](https://github.com/nf-core/sarek/pull/954) - Fix missing annotation keys with snpeff and ensemblvep for `hg19`
+- [#957](https://github.com/nf-core/sarek/pull/957) - Add `failOnDuplicate` and `failOnMismatch` options to all `join()` operators where possible

### Deprecated

2 changes: 1 addition & 1 deletion modules.json
@@ -434,7 +434,7 @@
"nf-core": {
"vcf_annotate_ensemblvep": {
"branch": "master",
"git_sha": "bea3ca998816a7f812b1bbbcb27c3a9ffbac0706",
"git_sha": "82b8c4ef7d34c5c1009649581af46b684183da12",
"installed_by": ["subworkflows"]
},
"vcf_annotate_snpeff": {
6 changes: 3 additions & 3 deletions subworkflows/local/bam_convert_samtools/main.nf
@@ -37,8 +37,8 @@ workflow BAM_CONVERT_SAMTOOLS {

// Merge UNMAP
all_unmapped_bam = SAMTOOLS_VIEW_UNMAP_UNMAP.out.bam
-.join(SAMTOOLS_VIEW_UNMAP_MAP.out.bam, remainder: true)
-.join(SAMTOOLS_VIEW_MAP_UNMAP.out.bam, remainder: true)
+.join(SAMTOOLS_VIEW_UNMAP_MAP.out.bam, failOnDuplicate: true, remainder: true)
+.join(SAMTOOLS_VIEW_MAP_UNMAP.out.bam, failOnDuplicate: true, remainder: true)
.map{ meta, unmap_unmap, unmap_map, map_unmap -> [ meta, [ unmap_unmap, unmap_map, map_unmap ] ] }

SAMTOOLS_MERGE_UNMAP(all_unmapped_bam, fasta[1], fasta_fai)
@@ -52,7 +52,7 @@ workflow BAM_CONVERT_SAMTOOLS {
// join Mapped & unmapped fastq

reads_to_concat = COLLATE_FASTQ_MAP.out.fastq
-.join(COLLATE_FASTQ_UNMAP.out.fastq)
+.join(COLLATE_FASTQ_UNMAP.out.fastq, failOnDuplicate: true, failOnMismatch: true)
.map{ meta, mapped_reads, unmapped_reads -> [ meta, [ mapped_reads[0], mapped_reads[1], unmapped_reads[0], unmapped_reads[1] ] ] }

// Concatenate Mapped_R1 with Unmapped_R1 and Mapped_R2 with Unmapped_R2
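Editor's note on the hunk above: the two "Merge UNMAP" joins keep `remainder: true` because some of the split BAMs can legitimately be absent, so only `failOnDuplicate` is added there; `failOnMismatch` would defeat `remainder`, whose purpose is to emit unmatched keys (padded with null) rather than fail. The COLLATE_FASTQ join, which does expect a strict one-to-one match, gets both options. A rough sketch with invented file names:

    // Invented channels mimicking the unmapped-read split.
    ch_unmap_unmap = Channel.of( [ [id:'test'],  'test.unmap_unmap.bam' ] )
    ch_unmap_map   = Channel.of( [ [id:'test'],  'test.unmap_map.bam' ],
                                 [ [id:'test2'], 'test2.unmap_map.bam' ] )

    ch_unmap_unmap.join(ch_unmap_map, failOnDuplicate: true, remainder: true).view()
    // -> [[id:test],  test.unmap_unmap.bam, test.unmap_map.bam]
    //    [[id:test2], null,                 test2.unmap_map.bam]
    // Unmatched keys are kept with null fillers instead of raising an error,
    // while a duplicated key in either channel would still stop the run.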
14 changes: 7 additions & 7 deletions subworkflows/local/bam_joint_calling_germline_gatk/main.nf
@@ -55,7 +55,7 @@ workflow BAM_JOINT_CALLING_GERMLINE_GATK {
// Rework meta for variantscalled.csv and annotation tools
MERGE_GENOTYPEGVCFS(gvcf_to_merge, dict.map{ it -> [ [ id:'dict' ], it ] } )

-vqsr_input = MERGE_GENOTYPEGVCFS.out.vcf.join(MERGE_GENOTYPEGVCFS.out.tbi)
+vqsr_input = MERGE_GENOTYPEGVCFS.out.vcf.join(MERGE_GENOTYPEGVCFS.out.tbi, failOnDuplicate: true)
indels_resource_label = known_indels_vqsr.mix(dbsnp_vqsr).collect()
snps_resource_label = known_snps_vqsr.mix(dbsnp_vqsr).collect()

@@ -82,16 +82,16 @@ workflow BAM_JOINT_CALLING_GERMLINE_GATK {

// Join results of variant recalibration into a single channel tuple
// Rework meta for variantscalled.csv and annotation tools
-vqsr_input_snp = vqsr_input.join(VARIANTRECALIBRATOR_SNP.out.recal)
-.join(VARIANTRECALIBRATOR_SNP.out.idx)
-.join(VARIANTRECALIBRATOR_SNP.out.tranches)
+vqsr_input_snp = vqsr_input.join(VARIANTRECALIBRATOR_SNP.out.recal, failOnDuplicate: true)
+.join(VARIANTRECALIBRATOR_SNP.out.idx, failOnDuplicate: true)
+.join(VARIANTRECALIBRATOR_SNP.out.tranches, failOnDuplicate: true)
.map{ meta, vcf, tbi, recal, index, tranche -> [ meta - meta.subMap('id') + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] }

// Join results of variant recalibration into a single channel tuple
// Rework meta for variantscalled.csv and annotation tools
-vqsr_input_indel = vqsr_input.join(VARIANTRECALIBRATOR_INDEL.out.recal)
-.join(VARIANTRECALIBRATOR_INDEL.out.idx)
-.join(VARIANTRECALIBRATOR_INDEL.out.tranches)
+vqsr_input_indel = vqsr_input.join(VARIANTRECALIBRATOR_INDEL.out.recal, failOnDuplicate: true)
+.join(VARIANTRECALIBRATOR_INDEL.out.idx, failOnDuplicate: true)
+.join(VARIANTRECALIBRATOR_INDEL.out.tranches, failOnDuplicate: true)
.map{ meta, vcf, tbi, recal, index, tranche -> [ meta - meta.subMap('id') + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] }

GATK4_APPLYVQSR_SNP(
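Editor's note: the VQSR joins above add only `failOnDuplicate`, which guards against the same meta key arriving twice in a channel; without it a duplicated key can be paired or dropped silently. A hedged sketch with made-up values:

    // Invented channels: the recal channel accidentally carries the key twice.
    ch_vcf   = Channel.of( [ [id:'joint'], 'joint.vcf.gz' ] )
    ch_recal = Channel.of( [ [id:'joint'], 'snp.recal' ],
                           [ [id:'joint'], 'snp.recal.old' ] )

    ch_vcf.join(ch_recal, failOnDuplicate: true).view()
    // -> fails as soon as the duplicated [id:'joint'] key is detected,
    //    instead of letting one of the recal files slip through unnoticed.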
2 changes: 1 addition & 1 deletion subworkflows/local/bam_markduplicates/main.nf
@@ -26,7 +26,7 @@ workflow BAM_MARKDUPLICATES {
INDEX_MARKDUPLICATES(GATK4_MARKDUPLICATES.out.cram)

// Join with the crai file
-cram = GATK4_MARKDUPLICATES.out.cram.join(INDEX_MARKDUPLICATES.out.crai)
+cram = GATK4_MARKDUPLICATES.out.cram.join(INDEX_MARKDUPLICATES.out.crai, failOnDuplicate: true, failOnMismatch: true)

// QC on CRAM
CRAM_QC_MOSDEPTH_SAMTOOLS(cram, fasta, intervals_bed_combined)
2 changes: 1 addition & 1 deletion subworkflows/local/bam_markduplicates_spark/main.nf
@@ -28,7 +28,7 @@ workflow BAM_MARKDUPLICATES_SPARK {
INDEX_MARKDUPLICATES(GATK4_MARKDUPLICATES_SPARK.out.output)

// Join with the crai file
-cram = GATK4_MARKDUPLICATES_SPARK.out.output.join(INDEX_MARKDUPLICATES.out.crai)
+cram = GATK4_MARKDUPLICATES_SPARK.out.output.join(INDEX_MARKDUPLICATES.out.crai, failOnDuplicate: true, failOnMismatch: true)

// QC on CRAM
CRAM_QC_MOSDEPTH_SAMTOOLS(cram, fasta, intervals_bed_combined)
2 changes: 1 addition & 1 deletion subworkflows/local/bam_merge_index_samtools/main.nf
@@ -32,7 +32,7 @@ workflow BAM_MERGE_INDEX_SAMTOOLS {
INDEX_MERGE_BAM(bam_all)

// Join with the bai file
-bam_bai = bam_all.join(INDEX_MERGE_BAM.out.bai)
+bam_bai = bam_all.join(INDEX_MERGE_BAM.out.bai, failOnDuplicate: true, failOnMismatch: true)

// Gather versions of all tools used
versions = versions.mix(INDEX_MERGE_BAM.out.versions)
@@ -36,8 +36,9 @@ workflow BAM_VARIANT_CALLING_HAPLOTYPECALLER {
GATK4_HAPLOTYPECALLER(cram_intervals, fasta, fasta_fai, dict, dbsnp, dbsnp_tbi)

// For joint genotyping
-genotype_intervals = GATK4_HAPLOTYPECALLER.out.vcf.join(GATK4_HAPLOTYPECALLER.out.tbi)
-.join(cram_intervals)
+genotype_intervals = GATK4_HAPLOTYPECALLER.out.vcf
+.join(GATK4_HAPLOTYPECALLER.out.tbi, failOnMismatch: true)
+.join(cram_intervals, failOnMismatch: true)
.map{ meta, gvcf, tbi, cram, crai, intervals, dragstr_model -> [ meta, gvcf, tbi, intervals ] }

// Figuring out if there is one or more vcf(s) from the same sample
@@ -86,7 +87,7 @@ workflow BAM_VARIANT_CALLING_HAPLOTYPECALLER {
if (!skip_haplotypecaller_filter) {

VCF_VARIANT_FILTERING_GATK(
-haplotypecaller_vcf.join(haplotypecaller_tbi),
+haplotypecaller_vcf.join(haplotypecaller_tbi, failOnDuplicate: true, failOnMismatch: true),
fasta,
fasta_fai,
dict,
2 changes: 1 addition & 1 deletion subworkflows/local/bam_variant_calling_somatic_all/main.nf
@@ -158,7 +158,7 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ALL {
if (tools.split(',').contains('strelka')) {
// Remap channel to match module/subworkflow
cram_strelka = (tools.split(',').contains('manta')) ?
-cram.join(BAM_VARIANT_CALLING_SOMATIC_MANTA.out.candidate_small_indels_vcf).join(BAM_VARIANT_CALLING_SOMATIC_MANTA.out.candidate_small_indels_vcf_tbi) :
+cram.join(BAM_VARIANT_CALLING_SOMATIC_MANTA.out.candidate_small_indels_vcf, failOnDuplicate: true, failOnMismatch: true).join(BAM_VARIANT_CALLING_SOMATIC_MANTA.out.candidate_small_indels_vcf_tbi, failOnDuplicate: true, failOnMismatch: true) :
cram.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai -> [ meta, normal_cram, normal_crai, tumor_cram, tumor_crai, [], [] ] }

BAM_VARIANT_CALLING_SOMATIC_STRELKA(
@@ -21,10 +21,10 @@ workflow BAM_VARIANT_CALLING_SOMATIC_CONTROLFREEC {

FREEC_SOMATIC(controlfreec_input, fasta, fasta_fai, [], dbsnp, dbsnp_tbi, chr_files, mappability, intervals_bed, [])

-ASSESS_SIGNIFICANCE(FREEC_SOMATIC.out.CNV.join(FREEC_SOMATIC.out.ratio))
+ASSESS_SIGNIFICANCE(FREEC_SOMATIC.out.CNV.join(FREEC_SOMATIC.out.ratio, failOnDuplicate: true, failOnMismatch: true))
FREEC2BED(FREEC_SOMATIC.out.ratio)
FREEC2CIRCOS(FREEC_SOMATIC.out.ratio)
-MAKEGRAPH(FREEC_SOMATIC.out.ratio.join(FREEC_SOMATIC.out.BAF))
+MAKEGRAPH(FREEC_SOMATIC.out.ratio.join(FREEC_SOMATIC.out.BAF, failOnDuplicate: true, failOnMismatch: true))

ch_versions = ch_versions.mix(FREEC_SOMATIC.out.versions)
ch_versions = ch_versions.mix(ASSESS_SIGNIFICANCE.out.versions)
@@ -121,10 +121,14 @@ workflow BAM_VARIANT_CALLING_SOMATIC_MUTECT2 {
.map{ meta, table -> [ meta - meta.subMap('id') + [ id:meta.tumor_id + "_vs_" + meta.normal_id ], table ] }

// Contamination and segmentation tables created using calculatecontamination on the pileup summary table
-CALCULATECONTAMINATION(pileup_table_tumor.join(pileup_table_normal))
+CALCULATECONTAMINATION(pileup_table_tumor.join(pileup_table_normal, failOnDuplicate: true, failOnMismatch: true))

// Mutect2 calls filtered by filtermutectcalls using the artifactpriors, contamination and segmentation tables
-vcf_to_filter = vcf.join(tbi).join(stats).join(LEARNREADORIENTATIONMODEL.out.artifactprior).join(CALCULATECONTAMINATION.out.segmentation).join(CALCULATECONTAMINATION.out.contamination)
+vcf_to_filter = vcf.join(tbi, failOnDuplicate: true, failOnMismatch: true)
+.join(stats, failOnDuplicate: true, failOnMismatch: true)
+.join(LEARNREADORIENTATIONMODEL.out.artifactprior, failOnDuplicate: true, failOnMismatch: true)
+.join(CALCULATECONTAMINATION.out.segmentation, failOnDuplicate: true, failOnMismatch: true)
+.join(CALCULATECONTAMINATION.out.contamination, failOnDuplicate: true, failOnMismatch: true)
.map{ meta, vcf, tbi, stats, orientation, seg, cont -> [ meta, vcf, tbi, stats, orientation, seg, cont, [] ] }

FILTERMUTECTCALLS(vcf_to_filter, fasta, fai, dict)
@@ -16,7 +16,7 @@ workflow BAM_VARIANT_CALLING_SOMATIC_TIDDIT {
TIDDIT_NORMAL(cram_normal, fasta, bwa)
TIDDIT_TUMOR(cram_tumor, fasta, bwa)

-SVDB_MERGE(TIDDIT_NORMAL.out.vcf.join(TIDDIT_TUMOR.out.vcf).map{ meta, vcf_normal, vcf_tumor -> [ meta, [vcf_normal, vcf_tumor] ] }, false)
+SVDB_MERGE(TIDDIT_NORMAL.out.vcf.join(TIDDIT_TUMOR.out.vcf, failOnDuplicate: true, failOnMismatch: true).map{ meta, vcf_normal, vcf_tumor -> [ meta, [vcf_normal, vcf_tumor] ] }, false)

vcf = SVDB_MERGE.out.vcf

@@ -21,10 +21,10 @@ workflow BAM_VARIANT_CALLING_TUMOR_ONLY_CONTROLFREEC {

FREEC_TUMORONLY(controlfreec_input, fasta, fasta_fai, [], dbsnp, dbsnp_tbi, chr_files, mappability, intervals_bed, [])

-ASSESS_SIGNIFICANCE(FREEC_TUMORONLY.out.CNV.join(FREEC_TUMORONLY.out.ratio))
+ASSESS_SIGNIFICANCE(FREEC_TUMORONLY.out.CNV.join(FREEC_TUMORONLY.out.ratio, failOnDuplicate: true, failOnMismatch: true))
FREEC2BED(FREEC_TUMORONLY.out.ratio)
FREEC2CIRCOS(FREEC_TUMORONLY.out.ratio)
-MAKEGRAPH(FREEC_TUMORONLY.out.ratio.join(FREEC_TUMORONLY.out.BAF))
+MAKEGRAPH(FREEC_TUMORONLY.out.ratio.join(FREEC_TUMORONLY.out.BAF, failOnDuplicate: true, failOnMismatch: true))

ch_versions = ch_versions.mix(FREEC_TUMORONLY.out.versions)
ch_versions = ch_versions.mix(ASSESS_SIGNIFICANCE.out.versions)
@@ -104,7 +104,11 @@ workflow BAM_VARIANT_CALLING_TUMOR_ONLY_MUTECT2 {
CALCULATECONTAMINATION(pileup_table.map{ meta, table -> [ meta, table, [] ] })

// Mutect2 calls filtered by filtermutectcalls using the contamination and segmentation tables
-vcf_to_filter = vcf.join(tbi).join(stats).join(LEARNREADORIENTATIONMODEL.out.artifactprior).join(CALCULATECONTAMINATION.out.segmentation).join(CALCULATECONTAMINATION.out.contamination)
+vcf_to_filter = vcf.join(tbi, failOnDuplicate: true, failOnMismatch: true)
+.join(stats, failOnDuplicate: true, failOnMismatch: true)
+.join(LEARNREADORIENTATIONMODEL.out.artifactprior, failOnDuplicate: true, failOnMismatch: true)
+.join(CALCULATECONTAMINATION.out.segmentation, failOnDuplicate: true, failOnMismatch: true)
+.join(CALCULATECONTAMINATION.out.contamination, failOnDuplicate: true, failOnMismatch: true)
.map{ meta, vcf, tbi, stats, artifactprior, seg, cont -> [ meta, vcf, tbi, stats, artifactprior, seg, cont, [] ] }

FILTERMUTECTCALLS(vcf_to_filter, fasta, fai, dict)
2 changes: 1 addition & 1 deletion subworkflows/local/cram_merge_index_samtools/main.nf
@@ -34,7 +34,7 @@ workflow CRAM_MERGE_INDEX_SAMTOOLS {
INDEX_CRAM(cram_all)

// Join with the crai file
-cram_crai = cram_all.join(INDEX_CRAM.out.crai)
+cram_crai = cram_all.join(INDEX_CRAM.out.crai, failOnDuplicate: true, failOnMismatch: true)

// Gather versions of all tools used
versions = versions.mix(INDEX_CRAM.out.versions.first())
2 changes: 1 addition & 1 deletion subworkflows/local/vcf_variant_filtering_gatk/main.nf
@@ -21,7 +21,7 @@ workflow VCF_VARIANT_FILTERING_GATK {

CNNSCOREVARIANTS(cnn_in, fasta, fasta_fai, dict, [], [])

-FILTERVARIANTTRANCHES(CNNSCOREVARIANTS.out.vcf.join(CNNSCOREVARIANTS.out.tbi).combine(intervals_bed_combined), known_sites, known_sites_tbi, fasta, fasta_fai, dict)
+FILTERVARIANTTRANCHES(CNNSCOREVARIANTS.out.vcf.join(CNNSCOREVARIANTS.out.tbi, failOnDuplicate: true, failOnMismatch: true).combine(intervals_bed_combined), known_sites, known_sites_tbi, fasta, fasta_fai, dict)

filtered_vcf = FILTERVARIANTTRANCHES.out.vcf
// add variantcaller to meta map and remove no longer necessary field: num_intervals
2 changes: 1 addition & 1 deletion subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf

Some generated files are not rendered by default.

16 changes: 8 additions & 8 deletions tests/test_gatk4_spark.yml
@@ -12,15 +12,15 @@
md5sum: 2d29d9e53894dcce96a1b5beb6ef3312
- path: results/multiqc
- path: results/preprocessing/markduplicates/test/test.md.cram
-md5sum: 5a0ba5e67e828ed0bcf29ade62a1b2db
+md5sum: b1338daf0d2f85c42fdc1bf12764bcae
- path: results/preprocessing/markduplicates/test/test.md.cram.crai
-md5sum: 2de7ae39dd18ae332f14d23d32549524
+md5sum: 4f249f7b2492c37fd2186b02f7e5b03c
- path: results/preprocessing/recal_table/test/test.recal.table
md5sum: 5b6e5078b4a90f6cb982fa0f0df616c2
- path: results/preprocessing/recalibrated/test/test.recal.cram
-md5sum: fa292026b4076705f7cc2d9fd7f4b450
+md5sum: b841ee6b350ff4e5e25ebe877794e837
- path: results/preprocessing/recalibrated/test/test.recal.cram.crai
-md5sum: 952ab9c5dedcf2105247bc20129b40ee
+md5sum: 91b334df15cef7afc73e68bacef8029b
- path: results/reports/fastqc/test-test_L1
- path: results/reports/markduplicates/test/test.md.cram.metrics
# text-based file changes md5sums on reruns
@@ -62,15 +62,15 @@
- path: results/csv/recalibrated.csv
md5sum: 2d29d9e53894dcce96a1b5beb6ef3312
- path: results/preprocessing/markduplicates/test/test.md.cram
-md5sum: 5a0ba5e67e828ed0bcf29ade62a1b2db
+md5sum: b1338daf0d2f85c42fdc1bf12764bcae
- path: results/preprocessing/markduplicates/test/test.md.cram.crai
-md5sum: 2de7ae39dd18ae332f14d23d32549524
+md5sum: 4f249f7b2492c37fd2186b02f7e5b03c
- path: results/preprocessing/recal_table/test/test.recal.table
md5sum: 5b6e5078b4a90f6cb982fa0f0df616c2
- path: results/preprocessing/recalibrated/test/test.recal.cram
-md5sum: fa292026b4076705f7cc2d9fd7f4b450
+md5sum: b841ee6b350ff4e5e25ebe877794e837
- path: results/preprocessing/recalibrated/test/test.recal.cram.crai
-md5sum: 952ab9c5dedcf2105247bc20129b40ee
+md5sum: 91b334df15cef7afc73e68bacef8029b
- path: results/multiqc
should_exist: false
- path: results/reports/fastqc