nf-core · FriederikeHanssen · Jun 28, 2022 · Jun 28, 2022 · Jun 28, 2022 · Jun 28, 2022
@@ -104,7 +104,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#599](https://github.com/nf-core/sarek/pull/599) - Add checks for correct data type for `params.step`
 - [#599](https://github.com/nf-core/sarek/pull/599) - Add checks for no empty `--tools` with `--step variant_calling` or `--step annotation`
 - [#600](https://github.com/nf-core/sarek/pull/600) - Remove `nf-core lint` warnings
-- [#602](https://github.com/nf-core/sarek/pull/602/) - Fixed bug in `alignment_to_fastq` and added tests
+- [#602](https://github.com/nf-core/sarek/pull/602) - Fixed bug in `alignment_to_fastq` and added tests
+- [#609](https://github.com/nf-core/sarek/pull/609) - Remove unused intervals code, reorganize combined intervals file
 
 ### Deprecated
 

@@ -81,16 +81,16 @@ workflow PAIR_VARIANT_CALLING {
                             [meta, tumor_cram, intervals]
                         }
         RUN_MPILEUP_NORMAL(cram_normal_intervals_no_index, fasta)
-        mpileup_normal = RUN_MPILEUP_NORMAL.out.mpileup
         RUN_MPILEUP_TUMOR(cram_tumor_intervals_no_index, fasta)
+
+        mpileup_normal = RUN_MPILEUP_NORMAL.out.mpileup
         mpileup_tumor = RUN_MPILEUP_TUMOR.out.mpileup
-        ch_versions = ch_versions.mix(RUN_MPILEUP_NORMAL.out.versions)
-        ch_versions = ch_versions.mix(RUN_MPILEUP_TUMOR.out.versions)
 
         controlfreec_input = mpileup_normal.cross(mpileup_tumor)
         .map{ normal, tumor ->
             [normal[0], normal[1], tumor[1], [], [], [], []]
         }
+
         RUN_CONTROLFREEC_SOMATIC(controlfreec_input,
                         fasta,
                         fasta_fai,
@@ -99,6 +99,9 @@ workflow PAIR_VARIANT_CALLING {
                         chr_files,
                         mappability,
                         intervals_bed_combined)
+
+        ch_versions = ch_versions.mix(RUN_MPILEUP_NORMAL.out.versions)
+        ch_versions = ch_versions.mix(RUN_MPILEUP_TUMOR.out.versions)
         ch_versions = ch_versions.mix(RUN_CONTROLFREEC_SOMATIC.out.versions)
     }
 

@@ -6,11 +6,10 @@
 // For all modules here:
 // A when clause condition is defined in the conf/modules.config to determine if the module should be run
 
-include { BUILD_INTERVALS                                     } from '../../modules/local/build_intervals/main'
-include { CREATE_INTERVALS_BED                                } from '../../modules/local/create_intervals_bed/main'
-include { GATK4_INTERVALLISTTOBED                             } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main'
-include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main'
-include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_ALL   } from '../../modules/nf-core/modules/tabix/bgziptabix/main'
+include { BUILD_INTERVALS                                       } from '../../modules/local/build_intervals/main'
+include { CREATE_INTERVALS_BED                                  } from '../../modules/local/create_intervals_bed/main'
+include { GATK4_INTERVALLISTTOBED                               } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main'
+include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT   } from '../../modules/nf-core/modules/tabix/bgziptabix/main'
 
 workflow PREPARE_INTERVALS {
     take:
@@ -20,9 +19,9 @@ workflow PREPARE_INTERVALS {
 
     ch_versions = Channel.empty()
 
-    ch_intervals                     = Channel.empty()
-    ch_intervals_bed_gz_tbi          = Channel.empty()
-    ch_intervals_combined_bed_gz_tbi = Channel.empty() // Create bed.gz and bed.gz.tbi for input/or created interval file. Contains ALL regions.
+    ch_intervals                     = Channel.empty() // List of bed files, one for each region
+    ch_intervals_bed_gz_tbi          = Channel.empty() // List of bed.gz, bed,gz.tbi, one for each region
+    ch_intervals_combined            = Channel.empty() // Bed file containing all intervals
 
     if (params.no_intervals) {
         file("${params.outdir}/no_intervals.bed").text = "no_intervals\n"
@@ -35,46 +34,37 @@ workflow PREPARE_INTERVALS {
         ch_intervals_bed_gz_tbi = Channel.fromPath(file("${params.outdir}/no_intervals.bed.{gz,gz.tbi}"))
                                             .collect().map{ it -> [it, 0]}
 
-        ch_intervals_combined_bed_gz_tbi = Channel.fromPath(file("${params.outdir}/no_intervals.bed.{gz,gz.tbi}"))
-                                            .collect()
+        ch_intervals_combined = Channel.fromPath(file("${params.outdir}/no_intervals.bed")).collect().map{ it -> [[id:it.simpleName], it]}
 
     } else if (params.step != 'annotate' && params.step != 'controlfreec') {
 
-        tabix_in_combined                = Channel.empty()
-
         //If no interval/target file is provided, then intervals are generated from FASTA file
         if (!params.intervals) {
 
             BUILD_INTERVALS(fasta_fai)
-            tabix_in_combined = BUILD_INTERVALS.out.bed.map{it -> [[id:it.simpleName], it] }
+            ch_intervals_combined = BUILD_INTERVALS.out.bed.map{it -> [[id:it.simpleName], it] }
 
-            ch_intervals = CREATE_INTERVALS_BED(BUILD_INTERVALS.out.bed)
+            ch_intervals = CREATE_INTERVALS_BED(ch_intervals_combined)
 
         } else {
 
-            tabix_in_combined = Channel.fromPath(file(params.intervals)).map{it -> [[id:it.baseName], it] }
+            ch_intervals_combined = Channel.fromPath(file(params.intervals)).map{it -> [[id:it.baseName], it] }
+            ch_intervals = CREATE_INTERVALS_BED(file(params.intervals))
 
             //If interval file is not provided as .bed, but e.g. as .interval_list then convert to BED format
             if(!params.intervals.endsWith(".bed")) {
-                GATK4_INTERVALLISTTOBED(tabix_in_combined)
-                tabix_in_combined = GATK4_INTERVALLISTTOBED.out.bed
+                GATK4_INTERVALLISTTOBED(ch_intervals_combined)
+                ch_intervals_combined = GATK4_INTERVALLISTTOBED.out.bed
                 ch_versions = ch_versions.mix(GATK4_INTERVALLISTTOBED.out.versions)
             }
 
-            ch_intervals = CREATE_INTERVALS_BED(file(params.intervals))
         }
 
         // Now for the interval.bed the following operations are done:
-        // 1. Complete intervals file (with all intervals) is indexed
-        // 2. Interval file is split up into multiple bed files for scatter/gather
-        // 3. Each bed file from 2. is indexed
-
-        // 1. Index complete interval file
-        TABIX_BGZIPTABIX_INTERVAL_ALL(tabix_in_combined)
-        ch_intervals_combined_bed_gz_tbi = TABIX_BGZIPTABIX_INTERVAL_ALL.out.gz_tbi.map{ meta, bed, tbi -> [bed, tbi] }
-        ch_versions = ch_versions.mix(TABIX_BGZIPTABIX_INTERVAL_ALL.out.versions)
+        // 1. Interval file is split up into multiple bed files for scatter/gather
+        // 2. Each bed file from 2. is indexed
 
-        // 2. Interval file is split up into multiple bed files for scatter/gather & grouping together small intervals
+        // 1. Interval file is split up into multiple bed files for scatter/gather & grouping together small intervals
         ch_intervals = ch_intervals.flatten()
             .map{ intervalFile ->
                 def duration = 0.0
@@ -95,7 +85,7 @@ workflow PREPARE_INTERVALS {
                    [it, it.size() ] // Adding number of intervals as elements
                 }.transpose()
 
-        // 3. Create bed.gz and bed.gz.tbi for each interval file. They are split by region (see above)
+        // 2. Create bed.gz and bed.gz.tbi for each interval file. They are split by region (see above)
         tabix_in = ch_intervals.map{ file, num_intervals -> [[id:file.baseName], file] }
         TABIX_BGZIPTABIX_INTERVAL_SPLIT(tabix_in)
         ch_intervals_bed_gz_tbi = TABIX_BGZIPTABIX_INTERVAL_SPLIT.out.gz_tbi.map{ meta, bed, tbi -> [bed, tbi ]}.toList().map{
@@ -107,8 +97,8 @@ workflow PREPARE_INTERVALS {
     }
 
     emit:
-        intervals_bed                    = ch_intervals                     // path: intervals.bed, num_intervals                        [intervals split for parallel execution]
-        intervals_bed_gz_tbi             = ch_intervals_bed_gz_tbi          // path: target.bed.gz, target.bed.gz.tbi, num_intervals     [intervals split for parallel execution]
-        intervals_combined_bed_gz_tbi    = ch_intervals_combined_bed_gz_tbi // path: interval.bed.gz, interval.bed.gz.tbi [all intervals in one file]
-        versions                         = ch_versions                      // channel: [ versions.yml ]
+        intervals_bed                    = ch_intervals                                 // path: intervals.bed, num_intervals                        [intervals split for parallel execution]
+        intervals_bed_gz_tbi             = ch_intervals_bed_gz_tbi                      // path: target.bed.gz, target.bed.gz.tbi, num_intervals     [intervals split for parallel execution]
+        intervals_bed_combined           = ch_intervals_combined.map{meta, bed -> bed } // path: intervals.bed                        [all intervals in one file]
+        versions                         = ch_versions                                  // channel: [ versions.yml ]
 }
@@ -52,7 +52,7 @@ for (param in checkPathParamList) if (param) file(param, checkIfExists: true)
 // Set input, can either be from --input or from automatic retrieval in WorkflowSarek.groovy
 ch_input_sample = extract_csv(file(params.input, checkIfExists: true))
 
-if (params.wes) {
+if (params.wes && !params.step == 'annotate') {
     if (params.intervals && !params.intervals.endsWith("bed")) exit 1, "Target file specified with `--intervals` must be in BED format"
 } else {
     if (params.intervals && !params.intervals.endsWith("bed") && !params.intervals.endsWith("interval_list")) exit 1, "Interval file must end with .bed or .interval_list"
@@ -291,17 +291,11 @@ workflow SAREK {
     PREPARE_INTERVALS(fasta_fai)
 
     // Intervals for speed up preprocessing/variant calling by spread/gather
-    // this is not good, we need the combined bed for some tools that don't support scatter/gather. Why would we not use the same intervals for WGS?
-    // intervals_bed_combined        = (params.intervals && params.wes) ? Channel.fromPath(params.intervals).collect() : []
-    // check if this actually still works if interval_list format
-    intervals_bed_combined        = params.intervals ? Channel.fromPath(params.intervals).collect() : []
-    //TODO: intervals also with WGS data? Probably need a parameter if WGS for deepvariant tool, that would allow to check here too
-    intervals_for_preprocessing              = (params.wes && params.intervals) ? intervals_bed_combined : []
-
-    intervals                     = PREPARE_INTERVALS.out.intervals_bed        // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather
-    intervals_bed_gz_tbi          = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather
-
+    intervals_bed_combined      = params.no_intervals ? [] : PREPARE_INTERVALS.out.intervals_bed_combined  // [interval.bed] all intervals in one file
+    intervals_for_preprocessing = params.wes ? intervals_bed_combined : []      // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS)
 
+    intervals                   = PREPARE_INTERVALS.out.intervals_bed        // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather
+    intervals_bed_gz_tbi        = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather
 
     // Gather used softwares versions
     ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions)