Merge pull request nf-core#1673 from maxulysse/fix_previously_silent_…

…error print warnings instead of erroring
maxulysse · Oct 3, 2024 · 2be5d14 · 2be5d14
2 parents 9e0fda5 + 70b10c3
commit 2be5d14
Show file tree

Hide file tree

Showing 2 changed files with 46 additions and 25 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,12 +16,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
+- [1669](https://github.com/nf-core/sarek/pull/1669) - Better nf-test pipeline level tests
+
 ### Fixed
 
 - [1656](https://github.com/nf-core/sarek/pull/1656) - Retiring parameter `snpeff_genome`
 - [1657](https://github.com/nf-core/sarek/pull/1657) - Update all actions used in the GHA CI
 - [1661](https://github.com/nf-core/sarek/pull/1661) - nf-test pipeline level tests
-- [1669](https://github.com/nf-core/sarek/pull/1669) - Better nf-test pipeline level tests
+- [1673](https://github.com/nf-core/sarek/pull/1673) - Print warning message instead of silent error with Nextflow versions prior to 24.08.0-edge
 
 ### Removed
 

diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf
@@ -937,45 +937,64 @@ workflow SAREK {
     FUNCTIONS
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
+
 // Add readgroup to meta and remove lane
 def addReadgroupToMeta(meta, files) {
     def CN = params.seq_center ? "CN:${params.seq_center}\\t" : ''
 
     // Here we're assuming that fastq_1 and fastq_2 are from the same flowcell:
-    def flowcell = flowcellLaneFromFastq(files[0])
+    def flowcell = flowcellLaneFromFastq(files[0]) // null if the flowcell ID cannot be read from the fastq file; it is then left out of the ID
+    def sample_lane_id = flowcell ? "${flowcell}.${meta.sample}.${meta.lane}" : "${meta.sample}.${meta.lane}"
     // TO-DO: Would it perhaps be better to also call flowcellLaneFromFastq(files[1]) and check that we get the same flowcell-id?
 
     // Don't use a random element for ID, it breaks resuming
-    def read_group = "\"@RG\\tID:${flowcell}.${meta.sample}.${meta.lane}\\t${CN}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\""
+    def read_group = "\"@RG\\tID:${sample_lane_id}\\t${CN}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\""
     meta  = meta - meta.subMap('lane') + [read_group: read_group.toString()]
     return [ meta, files ]
 }
+
 // Parse first line of a FASTQ file, return the flowcell id and lane number.
 def flowcellLaneFromFastq(path) {
-    // expected format:
-    // xx:yy:FLOWCELLID:LANE:... (seven fields)
-    // or
-    // FLOWCELLID:LANE:xx:... (five fields)
-    def line
-    path.withInputStream {
-        InputStream gzipStream = new java.util.zip.GZIPInputStream(it)
-        Reader decoder = new InputStreamReader(gzipStream, 'ASCII')
-        BufferedReader buffered = new BufferedReader(decoder)
-        line = buffered.readLine()
+    // First line of FASTQ file contains sequence identifier plus optional description
+    def firstLine = readFirstLineOfFastq(path)
+    def flowcell_id = null // stays null when no flowcell ID can be extracted
+
+    // Expected format from ILLUMINA
+    // cf https://en.wikipedia.org/wiki/FASTQ_format#Illumina_sequence_identifiers
+    // Five fields:
+    // @<instrument>:<lane>:<tile>:<x-pos>:<y-pos>...
+    // Seven fields or more (from CASAVA 1.8+):
+    // "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>..."
+
+    def fields = firstLine ? firstLine.split(':') : [] // 'def' keeps this local instead of leaking into the shared script binding
+    if (fields.size() == 5) {
+        // Get the instrument name as flowcell ID
+        flowcell_id = fields[0].substring(1)
+    } else if (fields.size() >= 7) {
+        // Get the actual flowcell ID
+        flowcell_id = fields[2]
+    } else if (fields.size() != 0) {
+        log.warn "FASTQ file(${path}): Cannot extract flowcell ID from ${firstLine}"
     }
-    assert line.startsWith('@')
-    line = line.substring(1)
-    def fields = line.split(':')
-    String fcid
-
-    if (fields.size() >= 7) {
-        // CASAVA 1.8+ format, from  https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
-        // "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>"
-        fcid = fields[2]
-    } else if (fields.size() == 5) {
-        fcid = fields[0]
+    return flowcell_id
+}
+
+// Get first line of a gzipped FASTQ file; returns null (after a warning) if it cannot be read or does not start with '@'
+def readFirstLineOfFastq(path) {
+    def line = null
+    try {
+        path.withInputStream {
+            InputStream gzipStream = new java.util.zip.GZIPInputStream(it)
+            Reader decoder = new InputStreamReader(gzipStream, 'ASCII')
+            BufferedReader buffered = new BufferedReader(decoder)
+            def header = buffered.readLine() // null when the file is empty
+            if (header?.startsWith('@')) { line = header } else { throw new IOException("First line does not start with '@': ${header}") } // an assert here would throw an Error and bypass the catch below
+        }
+    } catch (Exception e) {
+        log.warn "FASTQ file(${path}): Error streaming"
+        log.warn "${e.message}"
     }
-    return fcid
+    return line
 }
 
 /*