diff --git a/CHANGELOG.md b/CHANGELOG.md index 47cb7b72..fb799c9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [dev](https://github.com/nf-core/smrnaseq/branch/dev) -- _nothing yet done_ +### Parameters + +| Old parameter | New parameter | +| ------------- | --------------------------- | +| | `--with_umi` | +| | `--umitools_extract_method` | +| | `--umitools_bc_pattern` | +| | `--umi_discard_read` | +| | `--save_umi_intermeds` | +| | `--umi_merge_unmapped` | + ## [v2.2.4](https://github.com/nf-core/smrnaseq/releases/tag/2.2.4) - 2023-11-03 @@ -64,22 +74,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [[#188](https://github.com/nf-core/smrnaseq/pull/188)] - Dropped TrimGalore in favor of fastp QC and adapter trimming, improved handling of adapters and trimming parameters - [[#194](https://github.com/nf-core/smrnaseq/issues/194)] - Added default adapters file for FastP improved miRNA adapter trimming -### Parameters - -| Old parameter | New parameter | -| ------------- | ------------------------ | -| | `--mirgenedb` | -| | `--mirgenedb_species` | -| | `--mirgenedb_gff` | -| | `--mirgenedb_mature` | -| | `--mirgenedb_hairpin` | -| | `--contamination_filter` | -| | `--rrna` | -| | `--trna` | -| | `--cdna` | -| | `--ncrna` | -| | `--pirna` | -| | `--other_contamination` | ## [v2.0.0](https://github.com/nf-core/smrnaseq/releases/tag/2.0.0) - 2022-05-31 Aqua Zinc Chihuahua @@ -98,20 +92,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Other enhancements & fixes - [#134](https://github.com/nf-core/smrnaseq/issues/134) - Fixed colSum of zero issues for edgeR_miRBase.R script +- [#49](https://github.com/nf-core/smrnaseq/issues/49) - Integrated the existing umitools modules into the pipeline and extend the deduplication step. - [#55](https://github.com/lpantano/seqcluster/pull/55) - update seqcluster to fix UMI-detecting bug -### Parameters - -| Old parameter | New parameter | -| -------------------- | ---------------- | -| `--conda` | `--enable_conda` | -| `--clusterOptions` | | -| `--publish_dir_mode` | | - -> **NB:** Parameter has been **updated** if both old and new parameter information is present. -> **NB:** Parameter has been **added** if just the new parameter information is present. -> **NB:** Parameter has been **removed** if parameter information isn't present. - ### Software dependencies Note, since the pipeline is now using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. @@ -133,6 +116,7 @@ Note, since the pipeline is now using Nextflow DSL2, each process will be run wi | `seqkit` | 0.16.0 | 2.0.0 | | `trim-galore` | 0.6.6 | 0.6.7 | | `bioconvert` | - | 0.4.3 | +| `umi_tools` | - | 1.1.2 | | `htseq` | - | - | | `markdown` | - | - | | `pymdown-extensions` | - | - | diff --git a/README.md b/README.md index 7c010c00..5c5bf3a2 100644 --- a/README.md +++ b/README.md @@ -28,28 +28,30 @@ You can find numerous talks on the nf-core events page from various topics inclu ## Pipeline summary 1. Raw read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Adapter trimming ([`Trim Galore!`](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)) +2. UMI barcode extraction ([`UMI-tools`](https://github.com/CGATOxford/UMI-tools)) +3. Adapter trimming ([`Trim Galore!`](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)) 1. Insert Size calculation 2. Collapse reads ([`seqcluster`](https://seqcluster.readthedocs.io/mirna_annotation.html#processing-of-reads)) -3. Contamination filtering ([`Bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)) -4. Alignment against miRBase mature miRNA ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) -5. Alignment against miRBase hairpin +4. UMI barcode deduplication ([`UMI-tools`](https://github.com/CGATOxford/UMI-tools)) +5. Contamination filtering ([`Bowtie2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)) +6. Alignment against miRBase mature miRNA ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) +7. Alignment against miRBase hairpin 1. Unaligned reads from step 3 ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) 2. Collapsed reads from step 2.2 ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) -6. Post-alignment processing of miRBase hairpin +8. Post-alignment processing of miRBase hairpin 1. Basic statistics from step 3 and step 4.1 ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) 2. Analysis on miRBase, or MirGeneDB hairpin counts ([`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html)) - TMM normalization and a table of top expression hairpin - MDS plot clustering samples - Heatmap of sample similarities 3. miRNA and isomiR annotation from step 4.1 ([`mirtop`](https://github.com/miRTop/mirtop)) -7. Alignment against host reference genome ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) +9. Alignment against host reference genome ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) 1. Post-alignment processing of alignment against host reference genome ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) -8. Novel miRNAs and known miRNAs discovery ([`MiRDeep2`](https://www.mdc-berlin.de/content/mirdeep2-documentation)) - 1. Mapping against reference genome with the mapper module - 2. Known and novel miRNA discovery with the mirdeep2 module -9. miRNA quality control ([`mirtrace`](https://github.com/friedlanderlab/mirtrace)) -10. Present QC for raw read, alignment, and expression results ([`MultiQC`](http://multiqc.info/)) +10. Novel miRNAs and known miRNAs discovery ([`MiRDeep2`](https://www.mdc-berlin.de/content/mirdeep2-documentation)) +11. Mapping against reference genome with the mapper module +12. Known and novel miRNA discovery with the mirdeep2 module +13. miRNA quality control ([`mirtrace`](https://github.com/friedlanderlab/mirtrace)) +14. Present QC for raw read, alignment, and expression results ([`MultiQC`](http://multiqc.info/)) ## Usage diff --git a/conf/modules.config b/conf/modules.config index b06d0055..4c8ee5aa 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -77,6 +77,7 @@ process { // // Read QC and trimming options // + process { withName: 'MIRTRACE_RUN' { publishDir = [ @@ -89,7 +90,7 @@ process { if (!(params.skip_fastqc)) { process { - withName: '.*:FASTQC_FASTP:FASTQC_.*' { + withName: '.*:FASTQC_UMITOOLS_FASTP:FASTQC_.*' { ext.args = '--quiet' } } @@ -97,7 +98,7 @@ if (!(params.skip_fastqc)) { if (!params.skip_fastp) { process { - withName: 'FASTP' { + withName: '.*:FASTQC_UMITOOLS_FASTP:FASTP' { ext.args = [ "", params.trim_fastq ? "" : "--disable_adapter_trimming", params.clip_r1 > 0 ? "--trim_front1 ${params.clip_r1}" : "", // Remove bp from the 5' end of read 1. @@ -142,6 +143,90 @@ if (!params.skip_fastp) { } } +if (params.with_umi && !params.skip_umi_extract) { + process { + withName: '.*:FASTQC_UMITOOLS_TRIMGALORE:UMITOOLS_EXTRACT' { + ext.args = [ + params.umitools_extract_method ? "--extract-method=${params.umitools_extract_method}" : '', + params.umitools_bc_pattern ? "--bc-pattern='${params.umitools_bc_pattern}'" : '', + ].join(' ').trim() + publishDir = [ + [ + path: { "${params.outdir}/umitools" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ], + [ + path: { "${params.outdir}/umitools" }, + mode: params.publish_dir_mode, + pattern: "*.fastq.gz", + enabled: params.save_umi_intermeds + ] + ] + } + } +} + +// +// UMI tools deduplication +// + +if (params.with_umi) { + process { + withName: '.*:DEDUPLICATE_UMIS:UMITOOLS_DEDUP' { + ext.args = { meta.single_end ? '' : '--unpaired-reads=discard --chimeric-pairs=discard' } + ext.prefix = { "${meta.id}.umi_dedup.sorted" } + publishDir = [ + [ + path: { "${params.outdir}/umi_dedup/umitools" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ], + [ + path: { "${params.outdir}/umi_dedup" }, + mode: params.publish_dir_mode, + pattern: '*.bam', + enabled: ( + params.save_umi_intermeds + ) + ] + ] + } + + withName: '.*:DEDUPLICATE_UMIS:BAM_SORT_SAMTOOLS:SAMTOOLS_SORT' { + ext.prefix = { "${meta.id}.sorted" } + publishDir = [ + path: { "${params.outdir}/umi_dedup" }, + mode: params.publish_dir_mode, + pattern: '*.{bam}', + enabled: ( + params.save_umi_intermeds + ) + ] + } + + withName: '.*:DEDUPLICATE_UMIS:BAM_SORT_SAMTOOLS:SAMTOOLS_INDEX' { + ext.prefix = { "${meta.id}.sorted" } + publishDir = [ + path: { "${params.outdir}/umi_dedup" }, + mode: params.publish_dir_mode, + pattern: '*.{bai,csi}', + enabled: ( + params.save_umi_intermeds + ) + ] + } + + withName: '.*:DEDUPLICATE_UMIS:BAM_SORT_SAMTOOLS:BAM_STATS_SAMTOOLS:.*' { + publishDir = [ + path: { "${params.outdir}/umi_dedup/samtools_stats" }, + mode: params.publish_dir_mode, + pattern: '*.{stats,flagstat,idxstats}' + ] + } + } +} + // // Quantification // diff --git a/docs/output.md b/docs/output.md index 706930a1..fc9d14ef 100644 --- a/docs/output.md +++ b/docs/output.md @@ -13,6 +13,8 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: - [FastQC](#fastqc) - read quality control +- [UMI-tools extract](#umi-tools-extract) - UMI barcode extraction +- [UMI-tools deduplicate](#umi-tools-deduplicate) - read deduplication - [FastP](#fastp) - adapter trimming - [Bowtie2](#bowtie2) - contamination filtering - [Bowtie](#bowtie) - alignment against mature miRNAs and miRNA precursors (hairpins) @@ -40,6 +42,21 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d ![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +## UMI-tools extract + +
+Output files + +- `umitools/` + - `*.fastq.gz`: If `--save_umi_intermeds` is specified, FastQ files **after** UMI extraction will be placed in this directory. + - `*.log`: Log file generated by the UMI-tools `extract` command. + +
+ +[UMI-tools](https://github.com/CGATOxford/UMI-tools) deduplicates reads based on unique molecular identifiers (UMIs) to address PCR-bias. Firstly, the UMI-tools `extract` command removes the UMI barcode information from the read sequence and adds it to the read name. Secondly, reads are deduplicated based on UMI identifier after mapping as highlighted in the [UMI-tools deduplicate](#umi-tools-deduplicate) section. + +To facilitate processing of input data which has the UMI barcode already embedded in the read name from the start, `--skip_umi_extract` can be specified in conjunction with `--with_umi`. + ## FastP [FastP](https://github.com/OpenGene/fastp) is used for removal of adapter contamination and trimming of low quality regions. @@ -55,6 +72,19 @@ Contains FastQ files with quality and adapter trimmed reads for each sample, alo FastP can automatically detect adapter sequences when not specified directly by the user - the pipeline also comes with a feature and a supplied miRNA adapters file to ensure adapters auto-detected are more accurate. If there are needs to add more known miRNA adapters to this list, please open a pull request. +## UMI-tools deduplicate + +
+Output files + +- `umi_dedup/` + - `*.tsv`: Results statistics files detailing the UMI deduplication results. + - `*.bam`: If `--save_umi_intermeds` is specified, the deduplicated bam files **after** UMI deduplication will be placed in this directory. In addition the sorted and indexed files will be placed there as well. + - `samtools_stats/` - `*.{stats,flagstat,idxstats}:` Statistics on the mappings underlying the UMI deduplication. +
+ +[UMI-tools](https://github.com/CGATOxford/UMI-tools) deduplicates reads based on unique molecular identifiers (UMIs) to address PCR-bias. Firstly, the UMI-tools `extract` command removes the UMI barcode information from the read sequence and adds it to the read name as highlighted in the [UMI-tools extract](#umi-tools-extract) section. The reads are deduplicated based on an alignment against the full genome of the species. The deduplicated reads are then converted into fastq format and merged with the reads that remained unmapped in order to reduce potential reference bias. This behavior can be stopped by setting `--umi_merge_unmapped false`. The resulting fastq files are used in the remaining steps of the pipeline. + ## Bowtie2 [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) is used to align the reads to user-defined databases of contaminants. diff --git a/modules/local/mirdeep2_run.nf b/modules/local/mirdeep2_run.nf index 26635a4f..98fb16ad 100644 --- a/modules/local/mirdeep2_run.nf +++ b/modules/local/mirdeep2_run.nf @@ -40,4 +40,3 @@ process MIRDEEP2_RUN { END_VERSIONS """ } - diff --git a/modules/nf-core/modules/cat/cat/main.nf b/modules/nf-core/modules/cat/cat/main.nf new file mode 100644 index 00000000..40e53f3e --- /dev/null +++ b/modules/nf-core/modules/cat/cat/main.nf @@ -0,0 +1,62 @@ +process CAT_CAT { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "conda-forge::pigz=2.3.4" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : + 'quay.io/biocontainers/pigz:2.3.4' }" + + input: + tuple val(meta), path(files_in) + + output: + tuple val(meta), path("${prefix}"), emit: file_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def file_list = files_in.collect { it.toString() } + + // | input | output | command1 | command2 | + // |-----------|------------|----------|----------| + // | gzipped | gzipped | cat | | + // | ungzipped | ungzipped | cat | | + // | gzipped | ungzipped | zcat | | + // | ungzipped | gzipped | cat | pigz | + + // Use input file ending as default + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + out_zip = prefix.endsWith('.gz') + in_zip = file_list[0].endsWith('.gz') + command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' + command2 = (!in_zip && out_zip) ? "| pigz -c -p $task.cpus $args2" : '' + """ + $command1 \\ + $args \\ + ${file_list.join(' ')} \\ + $command2 \\ + > ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def file_list = files_in.collect { it.toString() } + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + """ + touch $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/cat/cat/meta.yml b/modules/nf-core/modules/cat/cat/meta.yml new file mode 100644 index 00000000..5eeff5a6 --- /dev/null +++ b/modules/nf-core/modules/cat/cat/meta.yml @@ -0,0 +1,37 @@ +name: cat_cat +description: A module for concatenation of gzipped or uncompressed files +keywords: + - concatenate + - gzip + - cat +tools: + - cat: + description: Just concatenation + homepage: None + documentation: https://man7.org/linux/man-pages/man1/cat.1.html + tool_dev_url: None + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - files_in: + type: file + description: List of compressed / uncompressed files + pattern: "*" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - file_out: + type: file + description: Concatenated file. Will be gzipped if file_out ends with ".gz" + pattern: "${file_out}" + +authors: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/modules/samtools/bam2fq/main.nf b/modules/nf-core/modules/samtools/bam2fq/main.nf new file mode 100644 index 00000000..9301d1d3 --- /dev/null +++ b/modules/nf-core/modules/samtools/bam2fq/main.nf @@ -0,0 +1,56 @@ +process SAMTOOLS_BAM2FQ { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : + 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + + input: + tuple val(meta), path(inputbam) + val split + + output: + tuple val(meta), path("*.fq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (split){ + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + -1 ${prefix}_1.fq.gz \\ + -2 ${prefix}_2.fq.gz \\ + -0 ${prefix}_other.fq.gz \\ + -s ${prefix}_singleton.fq.gz \\ + $inputbam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } else { + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + $inputbam | gzip --no-name > ${prefix}_interleaved.fq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/modules/samtools/bam2fq/meta.yml b/modules/nf-core/modules/samtools/bam2fq/meta.yml new file mode 100644 index 00000000..319a60cf --- /dev/null +++ b/modules/nf-core/modules/samtools/bam2fq/meta.yml @@ -0,0 +1,55 @@ +name: samtools_bam2fq +description: | + The module uses bam2fq method from samtools to + convert a SAM, BAM or CRAM file to FASTQ format +keywords: + - bam2fq + - samtools + - fastq +tools: + - samtools: + description: Tools for dealing with SAM, BAM and CRAM files + homepage: None + documentation: http://www.htslib.org/doc/1.1/samtools.html + tool_dev_url: None + doi: "" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - inputbam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - split: + type: boolean + description: | + TRUE/FALSE value to indicate if reads should be separated into + /1, /2 and if present other, or singleton. + Note: choosing TRUE will generate 4 different files. + Choosing FALSE will produce a single file, which will be interleaved in case + the input contains paired reads. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: | + FASTQ files, which will be either a group of 4 files (read_1, read_2, other and singleton) + or a single interleaved .fq.gz file if the user chooses not to split the reads. + pattern: "*.fq.gz" + +authors: + - "@lescai" diff --git a/modules/nf-core/modules/umitools/dedup/main.nf b/modules/nf-core/modules/umitools/dedup/main.nf new file mode 100644 index 00000000..dfcbcf2f --- /dev/null +++ b/modules/nf-core/modules/umitools/dedup/main.nf @@ -0,0 +1,41 @@ +process UMITOOLS_DEDUP { + tag "$meta.id" + label "process_medium" + + conda (params.enable_conda ? "bioconda::umi_tools=1.1.2" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.2--py38h4a8c8d9_0' : + 'quay.io/biocontainers/umi_tools:1.1.2--py38h4a8c8d9_0' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.bam") , emit: bam + tuple val(meta), path("*edit_distance.tsv"), emit: tsv_edit_distance + tuple val(meta), path("*per_umi.tsv") , emit: tsv_per_umi + tuple val(meta), path("*per_position.tsv") , emit: tsv_umi_per_position + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "" : "--paired" + """ + umi_tools \\ + dedup \\ + -I $bam \\ + -S ${prefix}.bam \\ + --output-stats $prefix \\ + $paired \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + umitools: \$(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/umitools/dedup/meta.yml b/modules/nf-core/modules/umitools/dedup/meta.yml new file mode 100644 index 00000000..eee8952f --- /dev/null +++ b/modules/nf-core/modules/umitools/dedup/meta.yml @@ -0,0 +1,59 @@ +name: umitools_dedup +description: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. +keywords: + - umitools + - deduplication +tools: + - umi_tools: + description: > + UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) + and single cell RNA-Seq cell barcodes + documentation: https://umi-tools.readthedocs.io/en/latest/ + license: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: | + BAM file containing reads to be deduplicated via UMIs. + pattern: "*.{bam}" + - bai: + type: file + description: | + BAM index files corresponding to the input BAM file. + pattern: "*.{bai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file with deduplicated UMIs. + pattern: "*.{bam}" + - tsv_edit_distance: + type: file + description: Reports the (binned) average edit distance between the UMIs at each position. + pattern: "*edit_distance.tsv" + - tsv_per_umi: + type: file + description: UMI-level summary statistics. + pattern: "*per_umi.tsv" + - tsv_umi_per_position: + type: file + description: Tabulates the counts for unique combinations of UMI and position. + pattern: "*per_position.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@drpatelh" + - "@grst" + - "@klkeys" diff --git a/modules/nf-core/modules/umitools/extract/main.nf b/modules/nf-core/modules/umitools/extract/main.nf new file mode 100644 index 00000000..22a405b9 --- /dev/null +++ b/modules/nf-core/modules/umitools/extract/main.nf @@ -0,0 +1,55 @@ +process UMITOOLS_EXTRACT { + tag "$meta.id" + label "process_low" + + conda (params.enable_conda ? "bioconda::umi_tools=1.1.2" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.2--py38h4a8c8d9_0' : + 'quay.io/biocontainers/umi_tools:1.1.2--py38h4a8c8d9_0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + """ + umi_tools \\ + extract \\ + -I $reads \\ + -S ${prefix}.umi_extract.fastq.gz \\ + $args \\ + > ${prefix}.umi_extract.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + umitools: \$(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *\$//') + END_VERSIONS + """ + } else { + """ + umi_tools \\ + extract \\ + -I ${reads[0]} \\ + --read2-in=${reads[1]} \\ + -S ${prefix}.umi_extract_1.fastq.gz \\ + --read2-out=${prefix}.umi_extract_2.fastq.gz \\ + $args \\ + > ${prefix}.umi_extract.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + umitools: \$(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/modules/umitools/extract/meta.yml b/modules/nf-core/modules/umitools/extract/meta.yml new file mode 100644 index 00000000..7fc23f72 --- /dev/null +++ b/modules/nf-core/modules/umitools/extract/meta.yml @@ -0,0 +1,47 @@ +name: umitools_extract +description: Extracts UMI barcode from a read and add it to the read name, leaving any sample barcode in place +keywords: + - umitools + - extract +tools: + - umi_tools: + description: > + UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) + and single cell RNA-Seq cell barcodes + documentation: https://umi-tools.readthedocs.io/en/latest/ + license: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: list + description: | + List of input FASTQ files whose UMIs will be extracted. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: > + Extracted FASTQ files. | + For single-end reads, pattern is \${prefix}.umi_extract.fastq.gz. | + For paired-end reads, pattern is \${prefix}.umi_extract_{1,2}.fastq.gz. + pattern: "*.{fastq.gz}" + - log: + type: file + description: Logfile for umi_tools + pattern: "*.{log}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@drpatelh" + - "@grst" diff --git a/nextflow.config b/nextflow.config index 2038c633..a069471c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -29,6 +29,15 @@ params { mirgenedb_gff = null mirgenedb_species = null + // UMI handling + with_umi = false + skip_umi_extract = false + umitools_extract_method = 'string' + umitools_bc_pattern = null + umi_discard_read = null + save_umi_intermeds = false + umi_merge_unmapped = true + // Trimming options clip_r1 = null three_prime_clip_r1 = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 40360e49..33b150b4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -50,6 +50,53 @@ } } }, + "umi_options": { + "title": "UMI options", + "type": "object", + "description": "Options for processing reads with unique molecular identifiers", + "default": "", + "properties": { + "with_umi": { + "type": "boolean", + "fa_icon": "fas fa-barcode", + "description": "Enable UMI-based read deduplication." + }, + "umitools_extract_method": { + "type": "string", + "default": "string", + "fa_icon": "fas fa-barcode", + "description": "UMI pattern to use. Can be either 'string' (default) or 'regex'.", + "help_text": "More details can be found in the [UMI-tools documentation](https://umi-tools.readthedocs.io/en/latest/reference/extract.html#extract-method).\n" + }, + "skip_umi_extract": { + "type": "boolean", + "fa_icon": "fas fa-compress-alt", + "description": "Skip the UMI extraction from the read in case the UMIs have been moved to the headers in advance of the pipeline run." + }, + "umitools_bc_pattern": { + "type": "string", + "fa_icon": "fas fa-barcode", + "help_text": "More details can be found in the [UMI-tools documentation](https://umi-tools.readthedocs.io/en/latest/reference/extract.html#extract-method).", + "description": "The UMI barcode pattern to use e.g. 'NNNNNN' indicates that the first 6 nucleotides of the read are from the UMI." + }, + "umi_discard_read": { + "type": "integer", + "fa_icon": "fas fa-barcode", + "description": "After UMI barcode extraction discard either R1 or R2 by setting this parameter to 1 or 2, respectively." + }, + "save_umi_intermeds": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "If this option is specified, intermediate FastQ and BAM files produced by UMI-tools are also saved in the results directory." + }, + "umi_merge_unmapped": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Unless deactivated the deduplicated reads are merged with the reads that could not be placed to reduce the potential reference bias" + } + }, + "fa_icon": "fas fa-barcode" + }, "reference_genome_options": { "title": "Reference genome options", "type": "object", @@ -473,6 +520,9 @@ { "$ref": "#/definitions/input_output_options" }, + { + "$ref": "#/definitions/umi_options" + }, { "$ref": "#/definitions/reference_genome_options" }, diff --git a/subworkflows/local/umi_dedup.nf b/subworkflows/local/umi_dedup.nf new file mode 100644 index 00000000..12033274 --- /dev/null +++ b/subworkflows/local/umi_dedup.nf @@ -0,0 +1,68 @@ +// +// Deduplicate the UMI reads by mapping them to the complete genome. +// + +include { INDEX_GENOME } from '../../modules/local/bowtie_genome' +include { BOWTIE_MAP_SEQ as UMI_MAP_GENOME } from '../../modules/local/bowtie_map_mirna' +include { BAM_SORT_SAMTOOLS } from '../../subworkflows/nf-core/bam_sort_samtools' +include { UMITOOLS_DEDUP } from '../../modules/nf-core/modules/umitools/dedup/main' +include { SAMTOOLS_BAM2FQ } from '../../modules/nf-core/modules/samtools/bam2fq/main' +include { CAT_CAT } from '../../modules/nf-core/modules/cat/cat/main' + +workflow DEDUPLICATE_UMIS { + take: + fasta + bt_index + reads // channel: [ val(meta), [ reads ] ] + + main: + + ch_versions = Channel.empty() + ch_dedup_stats = Channel.empty() + + if (!bt_index){ + INDEX_GENOME ( fasta ) + bt_index = INDEX_GENOME.out.bowtie_indices + fasta_formatted = INDEX_GENOME.out.fasta + ch_versions = ch_versions.mix(INDEX_GENOME.out.versions) + } else { + bt_index = Channel.fromPath("${bt_index}**ebwt", checkIfExists: true).ifEmpty { exit 1, "Bowtie1 index directory not found: ${bt_index}" } + fasta_formatted = fasta + } + + if (bt_index){ + + UMI_MAP_GENOME ( reads, bt_index.collect() ) + ch_versions = ch_versions.mix(UMI_MAP_GENOME.out.versions) + + BAM_SORT_SAMTOOLS ( UMI_MAP_GENOME.out.bam, Channel.empty() ) + ch_versions = ch_versions.mix(BAM_SORT_SAMTOOLS.out.versions) + + ch_umi_dedup = BAM_SORT_SAMTOOLS.out.bam.join(BAM_SORT_SAMTOOLS.out.bai) + UMITOOLS_DEDUP ( ch_umi_dedup ) + ch_versions = ch_versions.mix(UMITOOLS_DEDUP.out.versions) + ch_dedup_stats = ch_dedup_stats.mix(UMITOOLS_DEDUP.out.tsv_edit_distance).join(UMITOOLS_DEDUP.out.tsv_per_umi).join(UMITOOLS_DEDUP.out.tsv_umi_per_position) + + SAMTOOLS_BAM2FQ ( UMITOOLS_DEDUP.out.bam, false ) + ch_versions = ch_versions.mix(SAMTOOLS_BAM2FQ.out.versions) + + ch_dedup_reads = SAMTOOLS_BAM2FQ.out.reads + + if ( params.umi_merge_unmapped ) { + + SAMTOOLS_BAM2FQ.out.reads + .join(UMI_MAP_GENOME.out.unmapped) + .map { meta, file1, file2 -> [meta, [file1, file2]]} + .set { ch_cat } + + CAT_CAT ( ch_cat ) + ch_dedup_reads = CAT_CAT.out.file_out + } + } + + emit: + reads = ch_dedup_reads + indices = bt_index + stats = ch_dedup_stats + versions = ch_versions +} diff --git a/subworkflows/nf-core/fastqc_umitools_trimgalore.nf b/subworkflows/nf-core/fastqc_umitools_trimgalore.nf new file mode 100644 index 00000000..ca158e7a --- /dev/null +++ b/subworkflows/nf-core/fastqc_umitools_trimgalore.nf @@ -0,0 +1,78 @@ +// +// Read QC, UMI extraction and trimming +// + +nextflow.enable.dsl=2 + +include { FASTQC } from '../../modules/nf-core/modules/fastqc/main' +include { UMITOOLS_EXTRACT } from '../../modules/nf-core/modules/umitools/extract/main' +include { TRIMGALORE } from '../../modules/nf-core/modules/trimgalore/main' + +workflow FASTQC_UMITOOLS_TRIMGALORE { + take: + reads // channel: [ val(meta), [ reads ] ] + skip_fastqc // boolean: true/false + with_umi // boolean: true/false + skip_trimming // boolean: true/false + umi_discard_read // integer: 0, 1 or 2 + + main: + + ch_versions = Channel.empty() + fastqc_html = Channel.empty() + fastqc_zip = Channel.empty() + if (!skip_fastqc) { + FASTQC ( reads ).html.set { fastqc_html } + fastqc_zip = FASTQC.out.zip + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + } + + umi_reads = reads + umi_log = Channel.empty() + if (with_umi) { + UMITOOLS_EXTRACT ( reads ).reads.set { umi_reads } + umi_log = UMITOOLS_EXTRACT.out.log + ch_versions = ch_versions.mix(UMITOOLS_EXTRACT.out.versions.first()) + + // Discard R1 / R2 if required + if (umi_discard_read in [1,2]) { + UMITOOLS_EXTRACT + .out + .reads + .map { meta, reads -> + if (!meta.single_end) { + meta['single_end'] = true + reads = reads[umi_discard_read % 2] + } + return [ meta, reads ] + } + .set { umi_reads } + } + } + + trim_reads = umi_reads + trim_html = Channel.empty() + trim_zip = Channel.empty() + trim_log = Channel.empty() + if (!skip_trimming) { + TRIMGALORE ( umi_reads ).reads.set { trim_reads } + trim_html = TRIMGALORE.out.html + trim_zip = TRIMGALORE.out.zip + trim_log = TRIMGALORE.out.log + ch_versions = ch_versions.mix(TRIMGALORE.out.versions.first()) + } + + emit: + reads = trim_reads // channel: [ val(meta), [ reads ] ] + + fastqc_html // channel: [ val(meta), [ html ] ] + fastqc_zip // channel: [ val(meta), [ zip ] ] + + umi_log // channel: [ val(meta), [ log ] ] + + trim_html // channel: [ val(meta), [ html ] ] + trim_zip // channel: [ val(meta), [ zip ] ] + trim_log // channel: [ val(meta), [ txt ] ] + + versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] +} \ No newline at end of file diff --git a/workflows/smrnaseq.nf b/workflows/smrnaseq.nf index b6c2d3a2..55f7614b 100644 --- a/workflows/smrnaseq.nf +++ b/workflows/smrnaseq.nf @@ -65,7 +65,8 @@ if (!params.mirgenedb) { } include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { FASTQC_FASTP } from '../subworkflows/local/fastqc_fastp' +include { FASTQC_UMITOOLS_FASTP } from '../subworkflows/nf-core/fastqc_umitools_trimgalore' +include { DEDUPLICATE_UMIS } from '../subworkflows/local/umi_dedup' include { CONTAMINANT_FILTER } from '../subworkflows/local/contaminant_filter' include { MIRNA_QUANT } from '../subworkflows/local/mirna_quant' include { GENOME_QUANT } from '../subworkflows/local/genome_quant' @@ -134,7 +135,37 @@ workflow SMRNASEQ { // SUBWORKFLOW: Read QC and trim adapters // - FASTQC_FASTP ( + // + // SUBWORKFLOW: Read QC, extract UMI and trim adapters + // + FASTQC_UMITOOLS_FASTP ( + ch_cat_fastq, + params.skip_fastqc || params.skip_qc, + params.with_umi, + params.skip_trimming, + params.umi_discard_read + ) + ch_versions = ch_versions.mix(FASTQC_UMITOOLS_FASTP.out.versions) + + reads_for_mirna = FASTQC_UMITOOLS_FASTP.out.reads + + // + // SUBWORKFLOW: Deduplicate UMIs by mapping them to the genome + // + if (params.with_umi){ + if (fasta){ + fasta_ch = file(fasta) + DEDUPLICATE_UMIS ( + fasta_ch, + bt_index, + FASTQC_UMITOOLS_FASTP.out.reads + ) + reads_for_mirna = DEDUPLICATE_UMIS.out.reads + ch_versions = ch_versions.mix(DEDUPLICATE_UMIS.out.versions) + } + } + + FASTQC_UMITOOLS_FASTP ( ch_cat_fastq, ch_fastp_adapters, false, @@ -197,7 +228,7 @@ workflow SMRNASEQ { if (!params.skip_mirdeep) { MIRDEEP2 ( - FASTQC_FASTP.out.reads, + FASTQC_UMITOOLS_FASTP.out.reads, GENOME_QUANT.out.fasta, GENOME_QUANT.out.index.collect(), MIRNA_QUANT.out.fasta_hairpin, @@ -226,7 +257,7 @@ workflow SMRNASEQ { ch_multiqc_files = Channel.empty() ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC_FASTP.out.fastqc_raw_zip.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_UMITOOLS_FASTP.out.fastqc_raw_zip.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(FASTQC_FASTP.out.trim_json.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(contamination_stats.collect().ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(genome_stats.collect({it[1]}).ifEmpty([]))