From 2736aacaa0c2c7af4581154e577993664fc59fa7 Mon Sep 17 00:00:00 2001 From: maxulysse Date: Thu, 3 Oct 2024 10:43:56 +0200 Subject: [PATCH 1/5] print warnings instead of erroring --- workflows/sarek/main.nf | 66 +++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf index 90307f19c2..634e6a716f 100644 --- a/workflows/sarek/main.nf +++ b/workflows/sarek/main.nf @@ -4,6 +4,8 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +import java.util.zip.GZIPInputStream + include { paramsSummaryMap } from 'plugin/nf-validation' include { paramsSummaryMultiqc } from '../../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline' @@ -937,45 +939,63 @@ workflow SAREK { FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ + // Add readgroup to meta and remove lane def addReadgroupToMeta(meta, files) { def CN = params.seq_center ? "CN:${params.seq_center}\\t" : '' // Here we're assuming that fastq_1 and fastq_2 are from the same flowcell: - def flowcell = flowcellLaneFromFastq(files[0]) + // If we cannot read the flowcell ID from the fastq file, then we don't use it + def sample_lane_id = flowcellLaneFromFastq(files[0]) ? "${flowcell}.${meta.sample}.${meta.lane}" : "${meta.sample}.${meta.lane}" // TO-DO: Would it perhaps be better to also call flowcellLaneFromFastq(files[1]) and check that we get the same flowcell-id? 
// Don't use a random element for ID, it breaks resuming - def read_group = "\"@RG\\tID:${flowcell}.${meta.sample}.${meta.lane}\\t${CN}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\"" + def read_group = "\"@RG\\tID:${sample_lane_id}\\t${CN}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\"" meta = meta - meta.subMap('lane') + [read_group: read_group.toString()] return [ meta, files ] } + // Parse first line of a FASTQ file, return the flowcell id and lane number. def flowcellLaneFromFastq(path) { - // expected format: - // xx:yy:FLOWCELLID:LANE:... (seven fields) - // or - // FLOWCELLID:LANE:xx:... (five fields) - def line - path.withInputStream { - InputStream gzipStream = new java.util.zip.GZIPInputStream(it) - Reader decoder = new InputStreamReader(gzipStream, 'ASCII') - BufferedReader buffered = new BufferedReader(decoder) - line = buffered.readLine() - } - assert line.startsWith('@') - line = line.substring(1) - def fields = line.split(':') - String fcid - + // First line of FASTQ file contains sequence identifier plus optional description + def firstLine = readFirstLineOfFastq(path) + def flowcell_id = null + + // Expected format from ILLUMINA + // cf https://en.wikipedia.org/wiki/FASTQ_format#Illumina_sequence_identifiers + // Five fields: + // <instrument>:<lane>:<tile>:<x-pos>:<y-pos> + // Seven fields or more (from CASAVA 1.8+): + // "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>..." + + fields = firstLine ? 
firstLine.split(':') : [] if (fields.size() >= 7) { - // CASAVA 1.8+ format, from https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm - // "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>" - fcid = fields[2] + flowcell_id = fields[2] } else if (fields.size() == 5) { - fcid = fields[0] + flowcell_id = fields[0] + } else { + log.warn "Cannot extract flowcell id from file: ${path}" + log.warn "First line of FASTQ is: ${firstLine}" + } + return flowcell_id +} + +// Get first line of a FASTQ file +def readFirstLineOfFastq(path) { + def line = null + try { + path.withInputStream { + InputStream gzipStream = new java.util.zip.GZIPInputStream(it) + Reader decoder = new InputStreamReader(gzipStream, 'ASCII') + BufferedReader buffered = new BufferedReader(decoder) + line = buffered.readLine() + assert line.startsWith('@') + } + } catch (Exception e) { + log.warn "Error streaming gzipped FASTQ file: ${e.message}" + log.warn "File path: ${path}" } - return fcid + return line } /* From a0dc4fb060b0764b2c1ca35d37ca1c3df014ba49 Mon Sep 17 00:00:00 2001 From: maxulysse Date: Thu, 3 Oct 2024 10:44:48 +0200 Subject: [PATCH 2/5] no need to import --- workflows/sarek/main.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf index 634e6a716f..55dcfeea08 100644 --- a/workflows/sarek/main.nf +++ b/workflows/sarek/main.nf @@ -4,8 +4,6 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -import java.util.zip.GZIPInputStream - include { paramsSummaryMap } from 'plugin/nf-validation' include { paramsSummaryMultiqc } from '../../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline' From 0aafe1bc4ca341ff48d33daf3601bdd98aea5f06 Mon Sep 17 00:00:00 2001 From: maxulysse Date: Thu, 3 Oct 2024 11:00:18 +0200 Subject: [PATCH 3/5] no need to import --- workflows/sarek/main.nf | 7 +++---- 1 file 
changed, 3 insertions(+), 4 deletions(-) diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf index 55dcfeea08..5a6bebefa0 100644 --- a/workflows/sarek/main.nf +++ b/workflows/sarek/main.nf @@ -972,8 +972,7 @@ def flowcellLaneFromFastq(path) { } else if (fields.size() == 5) { flowcell_id = fields[0] } else { - log.warn "Cannot extract flowcell id from file: ${path}" - log.warn "First line of FASTQ is: ${firstLine}" + log.warn "Cannot extract flowcell id from first line in file(${path}): ${firstLine}" } return flowcell_id } @@ -990,8 +989,8 @@ def readFirstLineOfFastq(path) { assert line.startsWith('@') } } catch (Exception e) { - log.warn "Error streaming gzipped FASTQ file: ${e.message}" - log.warn "File path: ${path}" + log.warn "Error streaming gzipped FASTQ file(${path})" + log.warn "${e.message}" } return line } From 900a6c7669b3ebed4c5c903b305827405e7329a2 Mon Sep 17 00:00:00 2001 From: maxulysse Date: Thu, 3 Oct 2024 11:17:51 +0200 Subject: [PATCH 4/5] fix comments --- workflows/sarek/main.nf | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf index 5a6bebefa0..685a76a164 100644 --- a/workflows/sarek/main.nf +++ b/workflows/sarek/main.nf @@ -962,17 +962,19 @@ def flowcellLaneFromFastq(path) { // Expected format from ILLUMINA // cf https://en.wikipedia.org/wiki/FASTQ_format#Illumina_sequence_identifiers // Five fields: - // <instrument>:<lane>:<tile>:<x-pos>:<y-pos> + // @<instrument>:<lane>:<tile>:<x-pos>:<y-pos>... // Seven fields or more (from CASAVA 1.8+): // "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>..." fields = firstLine ? 
firstLine.split(':') : [] - if (fields.size() >= 7) { + if (fields.size() == 5) { + // Get the instrument name as flowcell ID + flowcell_id = fields[0].substring(1) + } else if (fields.size() >= 7) { + // Get the actual flowcell ID flowcell_id = fields[2] - } else if (fields.size() == 5) { - flowcell_id = fields[0] - } else { - log.warn "Cannot extract flowcell id from first line in file(${path}): ${firstLine}" + } else if (fields.size() != 0) { + log.warn "FASTQ file(${path}): Cannot extract flowcell ID from ${firstLine}" } return flowcell_id } @@ -989,7 +991,7 @@ def readFirstLineOfFastq(path) { assert line.startsWith('@') } } catch (Exception e) { - log.warn "Error streaming gzipped FASTQ file(${path})" + log.warn "FASTQ file(${path}): Error streaming" log.warn "${e.message}" } return line From 181af2b0e444d7ea7fee3d9600c5ffc288ae7b65 Mon Sep 17 00:00:00 2001 From: maxulysse Date: Thu, 3 Oct 2024 11:28:09 +0200 Subject: [PATCH 5/5] update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 79b1c35461..ccad46a6ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - [1657](https://github.com/nf-core/sarek/pull/1657) - Update all actions used in the GHA CI +- [1673](https://github.com/nf-core/sarek/pull/1673) - Print warning message instead of silent error with Nextflow versions prior to 24.08.0-edge ### Removed