From 6ff95ff7cb09563bfefdf7bf05065f1d5ff04ac9 Mon Sep 17 00:00:00 2001 From: Felix Lenner <52530259+fellen31@users.noreply.github.com> Date: Thu, 28 Mar 2024 09:05:21 +0100 Subject: [PATCH] Add support for uBAM and multisample test (#51) Add ubam support and multisample test --- .github/workflows/ci.yml | 5 +- CHANGELOG.md | 1 + assets/schema_input.json | 4 +- conf/modules/bam_to_fastq.config | 35 +++++ modules.json | 5 + .../nf-core/samtools/fastq/environment.yml | 8 + modules/nf-core/samtools/fastq/main.nf | 48 ++++++ modules/nf-core/samtools/fastq/meta.yml | 62 ++++++++ .../samtools/fastq/samtools-fastq.diff | 30 ++++ .../nf-core/samtools/fastq/tests/main.nf.test | 67 +++++++++ .../samtools/fastq/tests/main.nf.test.snap | 139 ++++++++++++++++++ modules/nf-core/samtools/fastq/tests/tags.yml | 2 + nextflow.config | 1 + subworkflows/local/bam_to_fastq.nf | 32 ++++ workflows/skierfe.nf | 11 +- 15 files changed, 445 insertions(+), 5 deletions(-) create mode 100644 conf/modules/bam_to_fastq.config create mode 100644 modules/nf-core/samtools/fastq/environment.yml create mode 100644 modules/nf-core/samtools/fastq/main.nf create mode 100644 modules/nf-core/samtools/fastq/meta.yml create mode 100644 modules/nf-core/samtools/fastq/samtools-fastq.diff create mode 100644 modules/nf-core/samtools/fastq/tests/main.nf.test create mode 100644 modules/nf-core/samtools/fastq/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/fastq/tests/tags.yml create mode 100644 subworkflows/local/bam_to_fastq.nf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fae9a11d..9e7dfce2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,6 +23,9 @@ jobs: runs-on: ubuntu-latest strategy: matrix: + parameters: + - "" + - "--input assets/samplesheet_multisample_bam.csv --split_fastq 250 --parallel_snv 1" NXF_VER: - "23.04.0" - "latest-everything" @@ -43,4 +46,4 @@ jobs: # For example: adding multiple test runs with different parameters # 
Remember that you can parallelise this by using strategy.matrix run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results ${{ matrix.parameters }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e3a75fb..8501543c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Initial release of genomic-medicine-sweden/skierfe, created with the [nf-core](h ### `Added` +- Added uBAM support and multisample test [#51](https://github.com/genomic-medicine-sweden/skierfe/pull/51) - Added Revio BAM test data [#50](https://github.com/genomic-medicine-sweden/skierfe/pull/50) - Update template to 2.13.1 [#38](https://github.com/genomic-medicine-sweden/skierfe/pull/38) - Update pipeline to run with a small test dataset [#35](https://github.com/genomic-medicine-sweden/skierfe/pull/35) diff --git a/assets/schema_input.json b/assets/schema_input.json index 8953674f..f523bdb4 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -16,8 +16,8 @@ "file": { "format": "file-path", "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^\\S+\\.(f(ast)?q\\.gz|bam)$", + "errorMessage": "FastQ or BAM file must be provided, cannot contain spaces and must have extension '.fq.gz', '.fastq.gz' or '.bam'" }, "family_id": { "type": "string", diff --git a/conf/modules/bam_to_fastq.config b/conf/modules/bam_to_fastq.config new file mode 100644 index 00000000..071968bd --- /dev/null +++ b/conf/modules/bam_to_fastq.config @@ -0,0 +1,35 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = 
Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. +---------------------------------------------------------------------------------------- +*/ + +process { + + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BAM TO FASTQ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + withName: '.*:BAM_TO_FASTQ:.*' { + publishDir = [ + enabled: false, + ] + } + + withName: '.*:BAM_TO_FASTQ:SAMTOOLS_FASTQ' { + + // Maybe should only allow unmapped data + // Unsure why SA tag is still there after reset + ext.args = '-x SA' // samtools reset + ext.args2 = '-T \\*' // samtools fastq + + } +} diff --git a/modules.json b/modules.json index 86d60b2f..532da412 100644 --- a/modules.json +++ b/modules.json @@ -102,6 +102,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "samtools/fastq": { + "branch": "master", + "git_sha": "897c33d5da084b61109500ee44c01da2d3e4e773", + "installed_by": ["modules"] + }, "samtools/index": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/modules/nf-core/samtools/fastq/environment.yml b/modules/nf-core/samtools/fastq/environment.yml new file mode 100644 index 00000000..8e8857a7 --- /dev/null +++ b/modules/nf-core/samtools/fastq/environment.yml @@ -0,0 +1,8 @@ +name: samtools_fastq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.19.2 + - bioconda::htslib=1.19.1 diff --git a/modules/nf-core/samtools/fastq/main.nf b/modules/nf-core/samtools/fastq/main.nf new file mode 100644 index 00000000..26a42070 --- /dev/null +++ b/modules/nf-core/samtools/fastq/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_FASTQ { + tag 
"$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : + 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" + + input: + tuple val(meta), path(input) + val(interleave) + + output: + tuple val(meta), path("*_{1,2}.fastq.gz") , optional:true, emit: fastq + tuple val(meta), path("*_interleaved.fastq") , optional:true, emit: interleaved + tuple val(meta), path("*_singleton.fastq.gz") , optional:true, emit: singleton + tuple val(meta), path("*_other.fastq.gz") , optional:true, emit: other + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fastq" : + meta.single_end ? 
"-1 ${prefix}_1.fastq.gz -s ${prefix}_singleton.fastq.gz" : + "-1 ${prefix}_1.fastq.gz -2 ${prefix}_2.fastq.gz -s ${prefix}_singleton.fastq.gz" + """ + samtools reset \\ + --threads ${task.cpus-1} \\ + $args \\ + $input \\ + | \\ + samtools fastq \\ + $args2 \\ + --threads ${task.cpus-1} \\ + -0 ${prefix}_other.fastq.gz \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/fastq/meta.yml b/modules/nf-core/samtools/fastq/meta.yml new file mode 100644 index 00000000..c4002a45 --- /dev/null +++ b/modules/nf-core/samtools/fastq/meta.yml @@ -0,0 +1,62 @@ +name: samtools_fastq +description: Converts a SAM/BAM/CRAM file to FASTQ +keywords: + - bam + - sam + - cram + - fastq +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - interleave: + type: boolean + description: Set true for interleaved fastq file +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: Compressed FASTQ file(s) with reads with either the READ1 or READ2 flag set in separate files. 
+ pattern: "*_{1,2}.fastq.gz" + - interleaved: + type: file + description: Uncompressed FASTQ file with reads with either the READ1 or READ2 flag set in a combined (interleaved) file. Needs collated input file. + pattern: "*_interleaved.fastq" + - singleton: + type: file + description: Compressed FASTQ file with singleton reads + pattern: "*_singleton.fastq.gz" + - other: + type: file + description: Compressed FASTQ file with reads with either both READ1 and READ2 flags set or unset + pattern: "*_other.fastq.gz" +authors: + - "@priyanka-surana" + - "@suzannejin" +maintainers: + - "@priyanka-surana" + - "@suzannejin" diff --git a/modules/nf-core/samtools/fastq/samtools-fastq.diff b/modules/nf-core/samtools/fastq/samtools-fastq.diff new file mode 100644 index 00000000..00fa1b3d --- /dev/null +++ b/modules/nf-core/samtools/fastq/samtools-fastq.diff @@ -0,0 +1,30 @@ +Changes in module 'nf-core/samtools/fastq' +--- modules/nf-core/samtools/fastq/main.nf ++++ modules/nf-core/samtools/fastq/main.nf +@@ -23,17 +23,21 @@ + + script: + def args = task.ext.args ?: '' ++ def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fastq" : + meta.single_end ? 
"-1 ${prefix}_1.fastq.gz -s ${prefix}_singleton.fastq.gz" : + "-1 ${prefix}_1.fastq.gz -2 ${prefix}_2.fastq.gz -s ${prefix}_singleton.fastq.gz" + """ +- samtools \\ +- fastq \\ ++ samtools reset \\ ++ --threads ${task.cpus-1} \\ + $args \\ ++ $input \\ ++ | \\ ++ samtools fastq \\ ++ $args2 \\ + --threads ${task.cpus-1} \\ + -0 ${prefix}_other.fastq.gz \\ +- $input \\ + $output + + cat <<-END_VERSIONS > versions.yml + +************************************************************ diff --git a/modules/nf-core/samtools/fastq/tests/main.nf.test b/modules/nf-core/samtools/fastq/tests/main.nf.test new file mode 100644 index 00000000..f6ac1123 --- /dev/null +++ b/modules/nf-core/samtools/fastq/tests/main.nf.test @@ -0,0 +1,67 @@ +nextflow_process { + + name "Test Process SAMTOOLS_FASTQ" + script "../main.nf" + process "SAMTOOLS_FASTQ" + + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/fastq" + + test("bam") { + + when { + process { + """ + interleave = false + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + input[1] = interleave + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.fastq[0][1].collect { path(it).linesGzip[0..6] }).match("bam_fastq") }, + { assert snapshot(process.out.interleaved).match("bam_interleaved") }, + { assert snapshot(file(process.out.singleton[0][1]).name).match("bam_singleton") }, + { assert snapshot(file(process.out.other[0][1]).name).match("bam_other") }, + { assert snapshot(process.out.versions).match("bam_versions") } + ) + } + } + + test("bam_interleave") { + + when { + process { + """ + interleave = true + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + input[1] = interleave + 
""" + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.fastq).match("bam_interleave_fastq") }, + { assert snapshot(path(process.out.interleaved[0][1]).readLines()[0..6]).match("bam_interlinterleave_eaved") }, + { assert snapshot(process.out.singleton).match("bam_singinterleave_leton") }, + { assert snapshot(file(process.out.other[0][1]).name).match("bam_interleave_other") }, + { assert snapshot(process.out.versions).match("bam_verinterleave_sions") } + ) + } + } +} diff --git a/modules/nf-core/samtools/fastq/tests/main.nf.test.snap b/modules/nf-core/samtools/fastq/tests/main.nf.test.snap new file mode 100644 index 00000000..0bceafc1 --- /dev/null +++ b/modules/nf-core/samtools/fastq/tests/main.nf.test.snap @@ -0,0 +1,139 @@ +{ + "bam_interlinterleave_eaved": { + "content": [ + [ + "@ERR5069949.2151832/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "+", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE [ meta + [ 'single_end': true ], fastq ] } + .branch { meta, reads -> + fastq: reads.extension == 'gz' + bam: reads.extension == 'bam' + } + .set { ch_filetypes } + + ch_filetypes.fastq.set { ch_sample } + + SAMTOOLS_FASTQ ( ch_filetypes.bam, false ) + ch_versions = ch_versions.mix(SAMTOOLS_FASTQ.out.versions) + + // Mix converted BAM back in + ch_sample = ch_sample.mix(SAMTOOLS_FASTQ.out.other) + + emit: + fastq = ch_sample // channel: [ val(meta), fastq ] + versions = ch_versions // channel: [ versions.yml ] +} + diff --git a/workflows/skierfe.nf b/workflows/skierfe.nf index cf8cfbdf..038574fd 100644 --- a/workflows/skierfe.nf +++ b/workflows/skierfe.nf @@ -89,6 +89,7 @@ if( (params.preset == "pacbio" & !params.skip_methylation_wf) | */ include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' +include { BAM_TO_FASTQ } from 
'../subworkflows/local/bam_to_fastq' include { ASSEMBLY } from '../subworkflows/local/genome_assembly' include { ASSEMBLY_VARIANT_CALLING } from '../subworkflows/local/assembly_variant_calling' include { ALIGN_READS } from '../subworkflows/local/align_reads' @@ -114,7 +115,6 @@ include { BUILD_INTERVALS } from '../modules/local/build_intervals/m include { SPLIT_BED_CHUNKS } from '../modules/local/split_bed_chunks/main' // nf-core -include { MOSDEPTH } from '../modules/nf-core/mosdepth/main' include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { paramsSummaryMap } from 'plugin/nf-validation' @@ -131,13 +131,20 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_skie workflow SKIERFE { take: - ch_sample + ch_input main: ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() + BAM_TO_FASTQ ( ch_input ) + ch_versions = ch_versions.mix(BAM_TO_FASTQ.out.versions) + + BAM_TO_FASTQ.out.fastq + .set { ch_sample } + + if(!params.skip_qc) { // Fastq QC