From 2736aacaa0c2c7af4581154e577993664fc59fa7 Mon Sep 17 00:00:00 2001
From: maxulysse <max.u.garcia@gmail.com>
Date: Thu, 3 Oct 2024 10:43:56 +0200
Subject: [PATCH 1/5] print warnings instead of erroring

---
 workflows/sarek/main.nf | 66 +++++++++++++++++++++++++++--------------
 1 file changed, 43 insertions(+), 23 deletions(-)
diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf
index 90307f19c2..634e6a716f 100644
--- a/workflows/sarek/main.nf
+++ b/workflows/sarek/main.nf
@@ -4,6 +4,8 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
+import java.util.zip.GZIPInputStream
+
 include { paramsSummaryMap                                  } from 'plugin/nf-validation'
 include { paramsSummaryMultiqc                              } from '../../subworkflows/nf-core/utils_nfcore_pipeline'
 include { softwareVersionsToYAML                            } from '../../subworkflows/nf-core/utils_nfcore_pipeline'
@@ -937,45 +939,63 @@ workflow SAREK {
     FUNCTIONS
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
+
 // Add readgroup to meta and remove lane
 def addReadgroupToMeta(meta, files) {
     def CN = params.seq_center ? "CN:${params.seq_center}\\t" : ''
 
     // Here we're assuming that fastq_1 and fastq_2 are from the same flowcell:
-    def flowcell = flowcellLaneFromFastq(files[0])
+    def flowcell = flowcellLaneFromFastq(files[0]) // null if we cannot read the flowcell ID from the fastq file, then we don't use it
+    def sample_lane_id = flowcell ? "${flowcell}.${meta.sample}.${meta.lane}" : "${meta.sample}.${meta.lane}"
     // TO-DO: Would it perhaps be better to also call flowcellLaneFromFastq(files[1]) and check that we get the same flowcell-id?
 
     // Don't use a random element for ID, it breaks resuming
-    def read_group = "\"@RG\\tID:${flowcell}.${meta.sample}.${meta.lane}\\t${CN}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\""
+    def read_group = "\"@RG\\tID:${sample_lane_id}\\t${CN}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\""
     meta  = meta - meta.subMap('lane') + [read_group: read_group.toString()]
     return [ meta, files ]
 }
+
 // Parse first line of a FASTQ file, return the flowcell id and lane number.
 def flowcellLaneFromFastq(path) {
-    // expected format:
-    // xx:yy:FLOWCELLID:LANE:... (seven fields)
-    // or
-    // FLOWCELLID:LANE:xx:... (five fields)
-    def line
-    path.withInputStream {
-        InputStream gzipStream = new java.util.zip.GZIPInputStream(it)
-        Reader decoder = new InputStreamReader(gzipStream, 'ASCII')
-        BufferedReader buffered = new BufferedReader(decoder)
-        line = buffered.readLine()
-    }
-    assert line.startsWith('@')
-    line = line.substring(1)
-    def fields = line.split(':')
-    String fcid
-
+    // First line of FASTQ file contains sequence identifier plus optional description (null if the file could not be read)
+    def firstLine = readFirstLineOfFastq(path)
+    def flowcell_id = null
+
+    // Expected format from ILLUMINA
+    // cf https://en.wikipedia.org/wiki/FASTQ_format#Illumina_sequence_identifiers
+    // Five fields:
+    // <flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>
+    // Seven fields or more (from CASAVA 1.8+):
+    // "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>..."
+
+    fields = firstLine ? firstLine.split(':') : []
     if (fields.size() >= 7) {
-        // CASAVA 1.8+ format, from  https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
-        // "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>"
-        fcid = fields[2]
+        flowcell_id = fields[2]
     } else if (fields.size() == 5) {
-        fcid = fields[0]
+        flowcell_id = fields[0]
+    } else {
+        log.warn "Cannot extract flowcell id from file: ${path}"
+        log.warn "First line of FASTQ is: ${firstLine}"
+    }
+    return flowcell_id
+}
+
+// Get first line of a gzipped FASTQ file, or null (after a warning) if it cannot be read. NOTE(review): a failed assert throws AssertionError (an Error), which catch (Exception e) does not intercept - confirm intended
+def readFirstLineOfFastq(path) {
+    def line = null
+    try {
+        path.withInputStream {
+            InputStream gzipStream = new java.util.zip.GZIPInputStream(it)
+            Reader decoder = new InputStreamReader(gzipStream, 'ASCII')
+            BufferedReader buffered = new BufferedReader(decoder)
+            line = buffered.readLine()
+            assert line.startsWith('@')
+        }
+    } catch (Exception e) {
+        log.warn "Error streaming gzipped FASTQ file: ${e.message}"
+        log.warn "File path: ${path}"
     }
-    return fcid
+    return line
 }
 
 /*

From a0dc4fb060b0764b2c1ca35d37ca1c3df014ba49 Mon Sep 17 00:00:00 2001
From: maxulysse <max.u.garcia@gmail.com>
Date: Thu, 3 Oct 2024 10:44:48 +0200
Subject: [PATCH 2/5] no need to import

---
 workflows/sarek/main.nf | 2 --
 1 file changed, 2 deletions(-)

diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf
index 634e6a716f..55dcfeea08 100644
--- a/workflows/sarek/main.nf
+++ b/workflows/sarek/main.nf
@@ -4,8 +4,6 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
-import java.util.zip.GZIPInputStream
-
 include { paramsSummaryMap                                  } from 'plugin/nf-validation'
 include { paramsSummaryMultiqc                              } from '../../subworkflows/nf-core/utils_nfcore_pipeline'
 include { softwareVersionsToYAML                            } from '../../subworkflows/nf-core/utils_nfcore_pipeline'

From 0aafe1bc4ca341ff48d33daf3601bdd98aea5f06 Mon Sep 17 00:00:00 2001
From: maxulysse <max.u.garcia@gmail.com>
Date: Thu, 3 Oct 2024 11:00:18 +0200
Subject: [PATCH 3/5] no need to import

---
 workflows/sarek/main.nf | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf
index 55dcfeea08..5a6bebefa0 100644
--- a/workflows/sarek/main.nf
+++ b/workflows/sarek/main.nf
@@ -972,8 +972,7 @@ def flowcellLaneFromFastq(path) {
     } else if (fields.size() == 5) {
         flowcell_id = fields[0]
     } else {
-        log.warn "Cannot extract flowcell id from file: ${path}"
-        log.warn "First line of FASTQ is: ${firstLine}"
+        log.warn "Cannot extract flowcell id from first line in file(${path}): ${firstLine}"
     }
     return flowcell_id
 }
@@ -990,8 +989,8 @@ def readFirstLineOfFastq(path) {
             assert line.startsWith('@')
         }
     } catch (Exception e) {
-        log.warn "Error streaming gzipped FASTQ file: ${e.message}"
-        log.warn "File path: ${path}"
+        log.warn "Error streaming gzipped FASTQ file(${path})"
+        log.warn "${e.message}"
     }
     return line
 }

From 900a6c7669b3ebed4c5c903b305827405e7329a2 Mon Sep 17 00:00:00 2001
From: maxulysse <max.u.garcia@gmail.com>
Date: Thu, 3 Oct 2024 11:17:51 +0200
Subject: [PATCH 4/5] fix comments

---
 workflows/sarek/main.nf | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf
index 5a6bebefa0..685a76a164 100644
--- a/workflows/sarek/main.nf
+++ b/workflows/sarek/main.nf
@@ -962,17 +962,19 @@ def flowcellLaneFromFastq(path) {
     // Expected format from ILLUMINA
     // cf https://en.wikipedia.org/wiki/FASTQ_format#Illumina_sequence_identifiers
     // Five fields:
-    // <flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>
+    // @<instrument>:<lane>:<tile>:<x-pos>:<y-pos>...
     // Seven fields or more (from CASAVA 1.8+):
     // "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>..."
 
     fields = firstLine ? firstLine.split(':') : []
-    if (fields.size() >= 7) {
+    if (fields.size() == 5) {
+        // Get the instrument name as flowcell ID
+        flowcell_id = fields[0].substring(1)
+    } else if (fields.size() >= 7) {
+        // Get the actual flowcell ID
         flowcell_id = fields[2]
-    } else if (fields.size() == 5) {
-        flowcell_id = fields[0]
-    } else {
-        log.warn "Cannot extract flowcell id from first line in file(${path}): ${firstLine}"
+    } else if (fields.size() != 0) {
+        log.warn "FASTQ file(${path}): Cannot extract flowcell ID from ${firstLine}"
     }
     return flowcell_id
 }
@@ -989,7 +991,7 @@ def readFirstLineOfFastq(path) {
             assert line.startsWith('@')
         }
     } catch (Exception e) {
-        log.warn "Error streaming gzipped FASTQ file(${path})"
+        log.warn "FASTQ file(${path}): Error streaming"
         log.warn "${e.message}"
     }
     return line

From 181af2b0e444d7ea7fee3d9600c5ffc288ae7b65 Mon Sep 17 00:00:00 2001
From: maxulysse <max.u.garcia@gmail.com>
Date: Thu, 3 Oct 2024 11:28:09 +0200
Subject: [PATCH 5/5] update CHANGELOG

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 79b1c35461..ccad46a6ba 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 
 - [1657](https://github.com/nf-core/sarek/pull/1657) - Update all actions used in the GHA CI
+- [1673](https://github.com/nf-core/sarek/pull/1673) - Print warning message instead of silent error with Nextflow versions prior to 24.08.0-edge
 
 ### Removed