diff --git a/CHANGELOG.md b/CHANGELOG.md index 72b58d17..b12442dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements & fixes +| Old parameter | New parameter | +| ------------- | --------------------- | +| | `--mirGeneDB` | +| | `--mirGeneDB_species` | +| | `--mirGeneDB_gff` | +| | `--mirGeneDB_mature` | +| | `--mirGeneDB_hairpin` | + +### Other enhancements + +- [#12](https://github.com/nf-core/smrnaseq/issues/12) - Enabled the use of `MirGeneDB` as an alternative database instead of `miRBase` + ### Parameters ## [v2.0.0](https://github.com/nf-core/smrnaseq/releases/tag/2.0.0) - 2022-05-31 Aqua Zinc Chihuahua diff --git a/README.md b/README.md index 02689202..cee53f76 100644 --- a/README.md +++ b/README.md @@ -31,13 +31,13 @@ On release, automated continuous integration tests run the pipeline on a full-si 2. Adapter trimming ([`Trim Galore!`](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)) 1. Insert Size calculation 2. Collapse reads ([`seqcluster`](https://seqcluster.readthedocs.io/mirna_annotation.html#processing-of-reads)) -3. Alignment against miRBase mature miRNA ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) -4. Alignment against miRBase hairpin +3. Alignment against miRBase or MirGeneDB mature miRNA ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) +4. Alignment against miRBase or MirGeneDB hairpin 1. Unaligned reads from step 3 ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) 2. Collapsed reads from step 2.2 ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml)) -5. Post-alignment processing of miRBase hairpin +5. Post-alignment processing of miRBase or MirGeneDB hairpin 1. Basic statistics from step 3 and step 4.1 ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) - 2.
Analysis on miRBase hairpin counts ([`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html)) + 2. Analysis on miRBase or MirGeneDB hairpin counts ([`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html)) - TMM normalization and a table of top expression hairpin - MDS plot clustering samples - Heatmap of sample similarities diff --git a/docs/output.md b/docs/output.md index ce1f8347..f3a23da8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -60,7 +60,7 @@ This is an example of the output we can get: ## Bowtie -[Bowtie](http://bowtie-bio.sourceforge.net/index.shtml) is used for mapping adapter trimmed reads against the mature miRNAs and miRNA precursors (hairpins) in [miRBase](http://www.mirbase.org/). +[Bowtie](http://bowtie-bio.sourceforge.net/index.shtml) is used for mapping adapter trimmed reads against the mature miRNAs and miRNA precursors (hairpins) of the chosen database, [miRBase](http://www.mirbase.org/) or [MirGeneDB](https://mirgenedb.org/). **Output directory: `results/samtools`** diff --git a/docs/usage.md b/docs/usage.md index f1304605..c55ad4a0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -16,16 +16,24 @@ This option indicates the experimental protocol used for the sample preparation. - 'cats': adapter (`GATCGGAAGAGCACACGTCTG), clip_r1(`3) - 'custom' (where the ser can indicate the `three_prime_adapter`, `clip_r1` and three_prime_clip_r1`) -### `mirtrace_species` +### `mirtrace_species` or `mirGeneDB_species` -It should point to the 3-letter species name used by `miRBase`. +It should point to the 3-letter species name used by `miRBase` or `MirGeneDB`. Note that the two databases use different letter case (e.g. `hsa` in miRBase, `Hsa` in MirGeneDB). ### miRNA related files +Different parameters can be set for the two supported databases. By default `miRBase` will be used with the parameters below.
+ - `mirna_gtf`: If not supplied by the user, then `mirna_gtf` will point to the latest GFF3 file in miRbase: `https://mirbase.org/ftp/CURRENT/genomes/${params.mirtrace_species}.gff3` - `mature`: points to the FASTA file of mature miRNA sequences. `https://mirbase.org/ftp/CURRENT/mature.fa.gz` - `hairpin`: points to the FASTA file of precursor miRNA sequences. `https://mirbase.org/ftp/CURRENT/hairpin.fa.gz` +If `MirGeneDB` should be used instead, it needs to be enabled with `--mirGeneDB` and configured with the parameters below. + +- `mirGeneDB_gff`: The data cannot be downloaded automatically, so the user needs to supply the GFF file, either for the species of interest or for all species, downloaded from `https://mirgenedb.org/download`. The full set will automatically be subset to the species specified with `mirGeneDB_species`. +- `mirGeneDB_mature`: points to the FASTA file of mature miRNA sequences. Download from `https://mirgenedb.org/download`. +- `mirGeneDB_hairpin`: points to the FASTA file of precursor miRNA sequences. Download from `https://mirgenedb.org/download`. Note that `MirGeneDB` does not provide a dedicated `hairpin` file; use the `Precursor sequences` file instead.
+ ### Genome - `fasta`: the reference genome FASTA file diff --git a/modules/local/mirtop_quant.nf b/modules/local/mirtop_quant.nf index d6b276ff..ffbfc0a0 100644 --- a/modules/local/mirtop_quant.nf +++ b/modules/local/mirtop_quant.nf @@ -11,6 +11,8 @@ process MIRTOP_QUANT { path hairpin path gtf + //if (!params.mirGeneDB) {params.filterSpecies = params.mirtrace_species} else {params.filterSpecies = params.mirGeneDB_species} + output: path "mirtop/mirtop.gff" path "mirtop/mirtop.tsv" , emit: mirtop_table @@ -20,9 +22,9 @@ process MIRTOP_QUANT { script: """ - mirtop gff --hairpin $hairpin --gtf $gtf -o mirtop --sps $params.mirtrace_species ./bams/* - mirtop counts --hairpin $hairpin --gtf $gtf -o mirtop --sps $params.mirtrace_species --add-extra --gff mirtop/mirtop.gff - mirtop export --format isomir --hairpin $hairpin --gtf $gtf --sps $params.mirtrace_species -o mirtop mirtop/mirtop.gff + mirtop gff --hairpin $hairpin --gtf $gtf -o mirtop --sps $params.filterSpecies ./bams/* + mirtop counts --hairpin $hairpin --gtf $gtf -o mirtop --sps $params.filterSpecies --add-extra --gff mirtop/mirtop.gff + mirtop export --format isomir --hairpin $hairpin --gtf $gtf --sps $params.filterSpecies -o mirtop mirtop/mirtop.gff mirtop stats mirtop/mirtop.gff --out mirtop/stats mv mirtop/stats/mirtop_stats.log mirtop/stats/full_mirtop_stats.log diff --git a/modules/local/parse_fasta_mirna.nf b/modules/local/parse_fasta_mirna.nf index 8b4c21f8..18b51066 100644 --- a/modules/local/parse_fasta_mirna.nf +++ b/modules/local/parse_fasta_mirna.nf @@ -9,6 +9,8 @@ process PARSE_FASTA_MIRNA { input: path fasta + //if (!params.mirGeneDB) {params.filterSpecies = params.mirtrace_species} else {params.filterSpecies = params.mirGeneDB_species} + output: path '*_igenome.fa', emit: parsed_fasta path "versions.yml", emit: versions @@ -27,7 +29,7 @@ process PARSE_FASTA_MIRNA { # TODO perl -ane 's/[ybkmrsw]/N/ig;print;' \${FASTA}_parsed_tmp.fa > \${FASTA}_parsed.fa sed -i 's/\s.*//' \${FASTA}_parsed.fa - 
seqkit grep -r --pattern \".*${params.mirtrace_species}-.*\" \${FASTA}_parsed.fa > \${FASTA}_sps.fa + seqkit grep -r --pattern \".*${params.filterSpecies}-.*\" \${FASTA}_parsed.fa > \${FASTA}_sps.fa seqkit seq --rna2dna \${FASTA}_sps.fa > \${FASTA}_igenome.fa cat <<-END_VERSIONS > versions.yml diff --git a/nextflow.config b/nextflow.config index 23d74c15..335c2fc1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,6 +26,11 @@ params { mirtrace_protocol = 'illumina' mature = "https://mirbase.org/ftp/CURRENT/mature.fa.gz" hairpin = "https://mirbase.org/ftp/CURRENT/hairpin.fa.gz" + mirGeneDB = false + mirGeneDB_mature = null + mirGeneDB_hairpin = null + mirGeneDB_gff = null + mirGeneDB_species = null // Trimming options clip_r1 = 0 diff --git a/nextflow_schema.json b/nextflow_schema.json index 53c8dca9..20d6ae5e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -62,12 +62,23 @@ "fa_icon": "fas fa-book", "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." }, + "mirGeneDB": { + "type": "boolean", + "description": "Boolean whether mirGeneDB should be used instead of miRBase", + "help_text": "This allows you to use mirGeneDB instead of miRBase as the database. \n Note that you will need to set the additional flags `--mirGeneDB_species`, `--mirGeneDB_gff`, `--mirGeneDB_mature` and `--mirGeneDB_hairpin`", + "default": false + }, "mirtrace_species": { "type": "string", "description": "Species for miRTrace.", "help_text": "This is automatically set when using `--genome`. Example values: `hsa`, `mmu`...\n Note that mirTrace relies on miRBase for its species reference.
See available references [here](https://mirbase.org/ftp/CURRENT/genomes/).", "fa_icon": "fas fa-journal-whills" }, + "mirGeneDB_species": { + "type": "string", + "description": "Species of mirGeneDB.", + "help_text": "This replaces the value of `--mirtrace_species` if `--mirGeneDB` is used. \n Note the difference in case for species names used in MirGeneDB and miRBase." + }, "fasta": { "type": "string", "fa_icon": "fas fa-font", @@ -80,6 +91,11 @@ "help_text": "miRBase `.gff3` file, typically downloaded from [`https://mirbase.org/ftp/CURRENT/genomes/`](https://mirbase.org/ftp/CURRENT/genomes/)\n\nIf using iGenomes with `--genome` this file will be downloaded from miRBase automatically during the pipeline run.\n\n", "fa_icon": "fas fa-address-book" }, + "mirGeneDB_gff": { + "type": "string", + "description": "GFF/GTF file with coordinates positions of precursor and miRNAs.", + "help_text": "mirGeneDB `.gff3` file, typically downloaded from [`https://mirgenedb.org/download`](https://mirgenedb.org/download). This replaces the value of `--mirna_gtf` if `--mirGeneDB` is used." + }, "mature": { "type": "string", "description": "Path to FASTA file with mature miRNAs.", @@ -87,6 +103,11 @@ "help_text": "Typically this will be the `mature.fa` file from miRBase. Can be given either as a plain text `.fa` file or a compressed `.gz` file.\n\nDefaults to the current miRBase release URL, from which the file will be downloaded.", "default": "https://mirbase.org/ftp/CURRENT/mature.fa.gz" }, + "mirGeneDB_mature": { + "type": "string", + "description": "Path to FASTA file with mirGeneDB mature miRNAs.", + "help_text": "This file needs to be downloaded from [`https://mirgenedb.org/download`](https://mirgenedb.org/download). Can be given either as a plain text `.fa` file or a compressed `.gz` file." + }, "hairpin": { "type": "string", "description": "Path to FASTA file with miRNAs precursors.", @@ -94,6 +115,11 @@ "help_text": "Typically this will be the `mature.fa` file from miRBase.
Can be given either as a plain text `.fa` file or a compressed `.gz` file.\n\nDefaults to the current miRBase release URL, from which the file will be downloaded.", "default": "https://mirbase.org/ftp/CURRENT/hairpin.fa.gz" }, + "mirGeneDB_hairpin": { + "type": "string", + "description": "Path to FASTA file with mirGeneDB miRNA precursors.", + "help_text": "This file needs to be downloaded from [`https://mirgenedb.org/download`](https://mirgenedb.org/download). Can be given either as a plain text `.fa` file or a compressed `.gz` file.\nNote that mirGeneDB does not have a dedicated hairpin file. The equivalent is the `Precursor sequences` file." + }, "bowtie_indices": { "type": "string", "description": "Path to a Bowtie 1 index directory", diff --git a/workflows/smrnaseq.nf b/workflows/smrnaseq.nf index 694b6e89..cf6ed597 100644 --- a/workflows/smrnaseq.nf +++ b/workflows/smrnaseq.nf @@ -26,6 +26,7 @@ if (!params.mirtrace_species){ // Genome options bt_index_from_species = params.genome ? params.genomes[ params.genome ].bowtie ?: false : false bt_index = params.bowtie_indices ?: bt_index_from_species + mirtrace_species_from_species = params.genome ? params.genomes[ params.genome ].mirtrace_species ?: false : false mirtrace_species = params.mirtrace_species ?: mirtrace_species_from_species fasta_from_species = params.genome ? params.genomes[ params.genome ].fasta ?: false : false @@ -51,8 +52,16 @@ ch_multiqc_custom_config = params.multiqc_config ?
Channel.fromPath(params.multi // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -if (params.mature) { reference_mature = file(params.mature, checkIfExists: true) } else { exit 1, "Mature miRNA fasta file not found: ${params.mature}" } -if (params.hairpin) { reference_hairpin = file(params.hairpin, checkIfExists: true) } else { exit 1, "Hairpin miRNA fasta file not found: ${params.hairpin}" } +if (!params.mirGeneDB) { + if (params.mature) { reference_mature = file(params.mature, checkIfExists: true) } else { exit 1, "Mature miRNA fasta file not found: ${params.mature}" } + if (params.hairpin) { reference_hairpin = file(params.hairpin, checkIfExists: true) } else { exit 1, "Hairpin miRNA fasta file not found: ${params.hairpin}" } + params.filterSpecies = params.mirtrace_species +} else { + if (params.mirGeneDB_mature) { reference_mature = file(params.mirGeneDB_mature, checkIfExists: true) } else { exit 1, "MirGeneDB mature miRNA fasta file not specified: please provide --mirGeneDB_mature" } + if (params.mirGeneDB_hairpin) { reference_hairpin = file(params.mirGeneDB_hairpin, checkIfExists: true) } else { exit 1, "MirGeneDB hairpin (precursor) miRNA fasta file not specified: please provide --mirGeneDB_hairpin" } + if (params.mirGeneDB_gff) { mirna_gtf = file(params.mirGeneDB_gff, checkIfExists: true) } else { exit 1, "MirGeneDB gff file not specified: please provide --mirGeneDB_gff" } + if (params.mirGeneDB_species) { params.filterSpecies = params.mirGeneDB_species } else { exit 1, "MirGeneDB species not specified: please provide --mirGeneDB_species" } +} include { INPUT_CHECK } from '../subworkflows/local/input_check' include { FASTQC_TRIMGALORE } from '../subworkflows/nf-core/fastqc_trimgalore'