From ed00139794af770ee160101ddf464642df047bcb Mon Sep 17 00:00:00 2001
From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
Date: Wed, 20 Dec 2023 17:41:31 +0000
Subject: [PATCH 1/6] Split FASTQs are grouped earlier for more reliable
 grouping strategy

Changes:
 - The grouping strategy for sharded data has been improved
 - The number of BAM files per sample is calculated by grouping the sample by ID after splitting the FASTQ files, then counting the total number of FASTQ files created.
 - This has to wait for all FASTQ files to be produced by FASTP, but is more reliable.
 - After alignment, the number of FASTQ files is used to wait to determine the expected number of BAM files used by groupBy.

Fixes #1357
---
 workflows/sarek.nf | 77 +++++++++++++++++++++++++++++++---------------
 1 file changed, 53 insertions(+), 24 deletions(-)

diff --git a/workflows/sarek.nf b/workflows/sarek.nf
index a18d3864b..23d57557c 100644
--- a/workflows/sarek.nf
+++ b/workflows/sarek.nf
@@ -471,7 +471,7 @@ workflow SAREK {
             if (params.split_fastq) {
                 reads_for_alignment = FASTP.out.reads.map{ meta, reads ->
                     read_files = reads.sort(false) { a,b -> a.getName().tokenize('.')[0] <=> b.getName().tokenize('.')[0] }.collate(2)
-                    [ meta + [ size:read_files.size() ], read_files ]
+                    [ meta + [ n_fastq: read_files.size() ], read_files ]
                 }.transpose()
             } else reads_for_alignment = FASTP.out.reads
 
@@ -482,34 +482,63 @@ workflow SAREK {
         }
 
         // STEP 1: MAPPING READS TO REFERENCE GENOME
-        // reads will be sorted
-        reads_for_alignment = reads_for_alignment.map{ meta, reads ->
-            // Update meta.id to meta.sample no multiple lanes or splitted fastqs
-            if (meta.size * meta.num_lanes == 1) [ meta + [ id:meta.sample ], reads ]
-            else [ meta, reads ]
-        }
+        // First, we must calculate number of lanes for each sample (meta.n_fastq)
+        // This is needed to group reads from the same sample together using groupKey to avoid stalling the workflow 
+        // when reads from different samples are mixed together
+        reads_for_alignment.map { meta, reads -> 
+                [ meta.subMap('patient', 'sample', 'sex', 'status'), reads ]   
+            }
+            .groupTuple()
+            .map { meta, reads -> 
+                meta + [ n_fastq: reads.size() ] // We can drop the FASTQ files now that we know how many there are
+            }
+            .set { reads_grouping_key }
 
+        reads_grouping_key.view()
+
+        // reads will be sorted
         sort_bam = true
         FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON(reads_for_alignment, index_alignement, sort_bam, fasta, fasta_fai)
 
         // Grouping the bams from the same samples not to stall the workflow
-        bam_mapped = FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON.out.bam.map{ meta, bam ->
-
-            // Update meta.id to be meta.sample, ditching sample-lane that is not needed anymore
-            // Update meta.data_type
-            // Remove no longer necessary fields:
-            //   read_group: Now in the BAM header
-            //    num_lanes: only needed for mapping
-            //         size: only needed for mapping
-
-            // Use groupKey to make sure that the correct group can advance as soon as it is complete
-            // and not stall the workflow until all reads from all channels are mapped
-            [ groupKey( meta - meta.subMap('num_lanes', 'read_group', 'size') + [ data_type:'bam', id:meta.sample ], (meta.num_lanes ?: 1) * (meta.size ?: 1)), bam ]
-        }.groupTuple()
-
-        bai_mapped = FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON.out.bai.map{ meta, bai ->
-            [ groupKey( meta - meta.subMap('num_lanes', 'read_group', 'size') + [ data_type:'bai', id:meta.sample ], (meta.num_lanes ?: 1) * (meta.size ?: 1)), bai ]
-        }.groupTuple()
+        // Use groupKey to make sure that the correct group can advance as soon as it is complete
+        // and not stall the workflow until all reads from all channels are mapped
+        bam_mapped = FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON.out.bam
+            .combine(reads_grouping_key) // Creates a tuple of [ meta, bam, reads_grouping_key ]
+            .filter { meta1, bam, meta2 -> meta1.sample == meta2.sample }
+            // Add n_fastq and other variables to meta
+            .map { meta1, bam, meta2 ->
+                [ meta1 + meta2, bam ]
+            }
+            // Manipulate meta map to remove old fields and add new ones
+            .map { meta, bam ->
+                [ meta - meta.subMap('id', 'read_group', 'data_type', 'num_lanes', 'read_group', 'size') + [ data_type: 'bam', id: meta.sample ], bam ]
+            }
+            // Create groupKey from meta map
+            .map { meta, bam ->
+                [ groupKey( meta, meta.n_fastq), bam ]
+            }
+            // Group
+            .groupTuple()
+
+        bai_mapped = FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON.out.bai
+            .combine(reads_grouping_key) // Creates a tuple of [ meta, bai, reads_grouping_key ]
+            .filter { meta1, bai, meta2 -> meta1.sample == meta2.sample }
+            // Add n_fastq and other variables to meta
+            .map { meta1, bai, meta2 ->
+                [ meta1 + meta2, bai ]
+            }
+            // Manipulate meta map to remove old fields and add new ones
+            .map { meta, bai ->
+                [ meta - meta.subMap('id', 'read_group', 'data_type', 'num_lanes', 'read_group', 'size') + [ data_type: 'bai', id: meta.sample ], bai ]
+            }
+            // Create groupKey from meta map
+            .map { meta, bai ->
+                [ groupKey( meta, meta.n_fastq), bai ]
+            }
+            // Group
+            .groupTuple()
+
 
         // gatk4 markduplicates can handle multiple bams as input, so no need to merge/index here
         // Except if and only if save_mapped or (skipping markduplicates and sentieon-dedup)

From 991501d583c4668ffef0b2d98d288571e2296b0f Mon Sep 17 00:00:00 2001
From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
Date: Wed, 20 Dec 2023 17:46:21 +0000
Subject: [PATCH 2/6] Minimum number of fastqs for split_fastq is now 250

Changes:
 - FASTP uses blocks of 250 reads when splitting a FASTQ file.
 - This update makes 250 the minimum sized block to split a FASTQ file into.
 - Updates help text accordingly

Fixes #1363
---
 nextflow_schema.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 3f3f21313..f7c7a4753 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -69,7 +69,8 @@
                     "default": 50000000,
                     "fa_icon": "fas fa-clock",
                     "description": "Specify how many reads each split of a FastQ file contains. Set 0 to turn off splitting at all.",
-                    "help_text": "Use the the tool FastP to split FASTQ file by number of reads. This parallelizes across fastq file shards speeding up mapping. "
+                    "help_text": "Use the the tool FastP to split FASTQ file by number of reads. This parallelizes across fastq file shards speeding up mapping. Note although the minimum value is 250 reads, if you have fewer than 250 reads a single FASTQ shard will still be created.",
+                    "minimum": 250
                 },
                 "wes": {
                     "type": "boolean",

From d0b8cdc735aaac3a38156f8330e587112a37ba0e Mon Sep 17 00:00:00 2001
From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
Date: Thu, 21 Dec 2023 09:47:43 +0000
Subject: [PATCH 3/6] Support for split_fastq to be zero to disable splitting

---
 nextflow_schema.json | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index f7c7a4753..b00cdafa7 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -65,12 +65,21 @@
             "default": "",
             "properties": {
                 "split_fastq": {
-                    "type": "integer",
+                    "anyOf": [
+                        {
+                            "type": "integer",
+                            "minimum": 250
+                        },
+                        {
+                            "type": "integer",
+                            "minimum": 0,
+                            "maximum": 0
+                        }
+                    ],
                     "default": 50000000,
                     "fa_icon": "fas fa-clock",
                     "description": "Specify how many reads each split of a FastQ file contains. Set 0 to turn off splitting at all.",
-                    "help_text": "Use the the tool FastP to split FASTQ file by number of reads. This parallelizes across fastq file shards speeding up mapping. Note although the minimum value is 250 reads, if you have fewer than 250 reads a single FASTQ shard will still be created.",
-                    "minimum": 250
+                    "help_text": "Use the the tool FastP to split FASTQ file by number of reads. This parallelizes across fastq file shards speeding up mapping. Note although the minimum value is 250 reads, if you have fewer than 250 reads a single FASTQ shard will still be created."
                 },
                 "wes": {
                     "type": "boolean",

From 162397d0b45da7a11ce73b7b85368dbcf5c5c360 Mon Sep 17 00:00:00 2001
From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
Date: Thu, 21 Dec 2023 17:08:49 +0000
Subject: [PATCH 4/6] CHANGELOG

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 96069b791..d1e27f769 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#1335](https://github.com/nf-core/sarek/pull/1335) - Add docs and validation for bcftools annotation parameters
 - [#1345](https://github.com/nf-core/sarek/pull/1345) - Preserve STDERR for easier debugging
 - [#1351](https://github.com/nf-core/sarek/pull/1351) - Fix params name for test profiles (`bcftools_annotations`)
+- [#1357](https://github.com/nf-core/sarek/pull/1364) - Fixed bug where samples were dropped while reconstituting BAM files
 
 ### Removed
 

From 6b42de635e071d826e443e6ee8bdb7ab6a663e79 Mon Sep 17 00:00:00 2001
From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
Date: Thu, 21 Dec 2023 17:11:10 +0000
Subject: [PATCH 5/6] remove .view() statement

---
 workflows/sarek.nf | 2 --
 1 file changed, 2 deletions(-)

diff --git a/workflows/sarek.nf b/workflows/sarek.nf
index 23d57557c..7a90bd675 100644
--- a/workflows/sarek.nf
+++ b/workflows/sarek.nf
@@ -494,8 +494,6 @@ workflow SAREK {
             }
             .set { reads_grouping_key }
 
-        reads_grouping_key.view()
-
         // reads will be sorted
         sort_bam = true
         FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON(reads_for_alignment, index_alignement, sort_bam, fasta, fasta_fai)

From b815378f558bfe38b8060960f9cbea4ccb9be0af Mon Sep 17 00:00:00 2001
From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
Date: Thu, 21 Dec 2023 17:38:31 +0000
Subject: [PATCH 6/6] Add 'type' parameter to cheat nf-core linter

---
 nextflow_schema.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index b00cdafa7..a7f7ed706 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -65,7 +65,7 @@
             "default": "",
             "properties": {
                 "split_fastq": {
-                    "anyOf": [
+                    "oneOf": [
                         {
                             "type": "integer",
                             "minimum": 250
@@ -76,6 +76,7 @@
                             "maximum": 0
                         }
                     ],
+                    "type": "integer",
                     "default": 50000000,
                     "fa_icon": "fas fa-clock",
                     "description": "Specify how many reads each split of a FastQ file contains. Set 0 to turn off splitting at all.",