Merge pull request #146 from CKComputomics/MirGeneDB

Enable the use of MirGeneDB
nf-core · Jun 10, 2022 · 4278785 · 4278785
2 parents 38249d5 + 85fb55a
commit 4278785
Show file tree

Hide file tree

Showing 9 changed files with 77 additions and 13 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Enhancements & fixes
 
+| Old parameter | New parameter         |
+| ------------- | --------------------- |
+|               | `--mirGeneDB`         |
+|               | `--mirGeneDB_species` |
+|               | `--mirGeneDB_gff`     |
+|               | `--mirGeneDB_mature`  |
+|               | `--mirGeneDB_hairpin` |
+
+### Other enhancements
+
+- [#12](https://github.com/nf-core/smrnaseq/issues/12) - Enabled the use of `MirGeneDB` as an alternative database instead of `miRBase`
+
 ### Parameters
 
 ## [v2.0.0](https://github.com/nf-core/smrnaseq/releases/tag/2.0.0) - 2022-05-31 Aqua Zinc Chihuahua

diff --git a/README.md b/README.md
@@ -31,13 +31,13 @@ On release, automated continuous integration tests run the pipeline on a full-si
 2. Adapter trimming ([`Trim Galore!`](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/))
    1. Insert Size calculation
    2. Collapse reads ([`seqcluster`](https://seqcluster.readthedocs.io/mirna_annotation.html#processing-of-reads))
-3. Alignment against miRBase mature miRNA ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml))
-4. Alignment against miRBase hairpin
+3. Alignment against miRBase or MirGeneDB mature miRNA ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml))
+4. Alignment against miRBase or MirGeneDB hairpin
    1. Unaligned reads from step 3 ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml))
    2. Collapsed reads from step 2.2 ([`Bowtie1`](http://bowtie-bio.sourceforge.net/index.shtml))
-5. Post-alignment processing of miRBase hairpin
+5. Post-alignment processing of miRBase or MirGeneDB hairpin
    1. Basic statistics from step 3 and step 4.1 ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/))
-   2. Analysis on miRBase hairpin counts ([`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html))
+   2. Analysis on miRBase or MirGeneDB hairpin counts ([`edgeR`](https://bioconductor.org/packages/release/bioc/html/edgeR.html))
       - TMM normalization and a table of top expression hairpin
       - MDS plot clustering samples
       - Heatmap of sample similarities

diff --git a/docs/output.md b/docs/output.md
@@ -60,7 +60,7 @@ This is an example of the output we can get:
 
 ## Bowtie
 
-[Bowtie](http://bowtie-bio.sourceforge.net/index.shtml) is used for mapping adapter trimmed reads against the mature miRNAs and miRNA precursors (hairpins) in [miRBase](http://www.mirbase.org/).
+[Bowtie](http://bowtie-bio.sourceforge.net/index.shtml) is used for mapping adapter trimmed reads against the mature miRNAs and miRNA precursors (hairpins) of the chosen database, either [miRBase](http://www.mirbase.org/) or [MirGeneDB](https://mirgenedb.org/).
 
 **Output directory: `results/samtools`**
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -16,16 +16,24 @@ This option indicates the experimental protocol used for the sample preparation.
 - 'cats': adapter (`GATCGGAAGAGCACACGTCTG`), clip_r1 (`3`)
 - 'custom' (where the user can indicate the `three_prime_adapter`, `clip_r1` and `three_prime_clip_r1`)
 
-### `mirtrace_species`
+### `mirtrace_species` or `mirGeneDB_species`
 
-It should point to the 3-letter species name used by `miRBase`.
+It should point to the 3-letter species name used by `miRBase` or `MirGeneDB`. Note the difference in case for the two databases.
 
 ### miRNA related files
 
+Different parameters can be set for the two supported databases. By default, `miRBase` will be used with the parameters below.
+
 - `mirna_gtf`: If not supplied by the user, then `mirna_gtf` will point to the latest GFF3 file in miRbase: `https://mirbase.org/ftp/CURRENT/genomes/${params.mirtrace_species}.gff3`
 - `mature`: points to the FASTA file of mature miRNA sequences. `https://mirbase.org/ftp/CURRENT/mature.fa.gz`
 - `hairpin`: points to the FASTA file of precursor miRNA sequences. `https://mirbase.org/ftp/CURRENT/hairpin.fa.gz`
 
+If `MirGeneDB` should be used instead, enable it with `--mirGeneDB` and set the parameters below.
+
+- `mirGeneDB_gff`: The data cannot be downloaded automatically, so the user needs to supply the GFF file, either for their species only or for all species, downloaded from `https://mirgenedb.org/download`. A file containing all species will automatically be subset to the species specified with `mirGeneDB_species`.
+- `mirGeneDB_mature`: points to the FASTA file of mature miRNA sequences. Download from `https://mirgenedb.org/download`.
+- `mirGeneDB_hairpin`: points to the FASTA file of precursor miRNA sequences. Download from `https://mirgenedb.org/download`. Note that `MirGeneDB` does not have a dedicated `hairpin` file, but the `Precursor sequences` are to be used.
+
 ### Genome
 
 - `fasta`: the reference genome FASTA file

diff --git a/modules/local/mirtop_quant.nf b/modules/local/mirtop_quant.nf
@@ -11,6 +11,8 @@ process MIRTOP_QUANT {
     path hairpin
     path gtf
 
+    //if (!params.mirGeneDB) {params.filterSpecies = params.mirtrace_species} else {params.filterSpecies = params.mirGeneDB_species}
+
     output:
     path "mirtop/mirtop.gff"
     path "mirtop/mirtop.tsv"        , emit: mirtop_table
@@ -20,9 +22,9 @@ process MIRTOP_QUANT {
 
     script:
     """
-    mirtop gff --hairpin $hairpin --gtf $gtf -o mirtop --sps $params.mirtrace_species ./bams/*
-    mirtop counts --hairpin $hairpin --gtf $gtf -o mirtop --sps $params.mirtrace_species --add-extra --gff mirtop/mirtop.gff
-    mirtop export --format isomir --hairpin $hairpin --gtf $gtf --sps $params.mirtrace_species -o mirtop mirtop/mirtop.gff
+    mirtop gff --hairpin $hairpin --gtf $gtf -o mirtop --sps $params.filterSpecies ./bams/*
+    mirtop counts --hairpin $hairpin --gtf $gtf -o mirtop --sps $params.filterSpecies --add-extra --gff mirtop/mirtop.gff
+    mirtop export --format isomir --hairpin $hairpin --gtf $gtf --sps $params.filterSpecies -o mirtop mirtop/mirtop.gff
     mirtop stats mirtop/mirtop.gff --out mirtop/stats
     mv mirtop/stats/mirtop_stats.log mirtop/stats/full_mirtop_stats.log
 

diff --git a/modules/local/parse_fasta_mirna.nf b/modules/local/parse_fasta_mirna.nf
@@ -9,6 +9,8 @@ process PARSE_FASTA_MIRNA {
     input:
     path fasta
 
+    //if (!params.mirGeneDB) {params.filterSpecies = params.mirtrace_species} else {params.filterSpecies = params.mirGeneDB_species}
+
     output:
     path '*_igenome.fa', emit: parsed_fasta
     path "versions.yml", emit: versions
@@ -27,7 +29,7 @@ process PARSE_FASTA_MIRNA {
     # TODO perl -ane 's/[ybkmrsw]/N/ig;print;' \${FASTA}_parsed_tmp.fa > \${FASTA}_parsed.fa
 
     sed -i 's/\s.*//' \${FASTA}_parsed.fa
-    seqkit grep -r --pattern \".*${params.mirtrace_species}-.*\" \${FASTA}_parsed.fa > \${FASTA}_sps.fa
+    seqkit grep -r --pattern \".*${params.filterSpecies}-.*\" \${FASTA}_parsed.fa > \${FASTA}_sps.fa
     seqkit seq --rna2dna \${FASTA}_sps.fa > \${FASTA}_igenome.fa
 
     cat <<-END_VERSIONS > versions.yml

diff --git a/nextflow.config b/nextflow.config
@@ -26,6 +26,11 @@ params {
     mirtrace_protocol          = 'illumina'
     mature                     = "https://mirbase.org/ftp/CURRENT/mature.fa.gz"
     hairpin                    = "https://mirbase.org/ftp/CURRENT/hairpin.fa.gz"
+    mirGeneDB                  = false // use MirGeneDB instead of miRBase as the reference database
+    mirGeneDB_mature           = null  // no download default: must be supplied when mirGeneDB is enabled
+    mirGeneDB_hairpin          = null  // no download default: must be supplied when mirGeneDB is enabled
+    mirGeneDB_gff              = null  // no download default: must be supplied when mirGeneDB is enabled
+    mirGeneDB_species          = null
 
     // Trimming options
     clip_r1                    = 0

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -62,12 +62,23 @@
                     "fa_icon": "fas fa-book",
                     "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details."
                 },
+                "mirGeneDB": {
+                    "type": "boolean",
+                    "description": "Whether mirGeneDB should be used instead of miRBase.",
+                    "help_text": "This allows you to use mirGeneDB instead of miRBase as the database. \n Note that you will need to set the additional flags `--mirGeneDB_species`, `--mirGeneDB_gff`, `--mirGeneDB_mature` and `--mirGeneDB_hairpin`",
+                    "default": false
+                },
                 "mirtrace_species": {
                     "type": "string",
                     "description": "Species for miRTrace.",
                     "help_text": "This is automatically set when using `--genome`. Example values: `hsa`, `mmu`...\n Note that mirTrace relies on miRBase for its species reference. See available references [here](https://mirbase.org/ftp/CURRENT/genomes/).",
                     "fa_icon": "fas fa-journal-whills"
                 },
+                "mirGeneDB_species": {
+                    "type": "string",
+                    "description": "Species of mirGeneDB.",
+                    "help_text": "This replaces the value of `--mirtrace_species` if `--mirGeneDB` is used. \n Note the difference in case for species names used in MirGeneDB and miRBase."
+                },
                 "fasta": {
                     "type": "string",
                     "fa_icon": "fas fa-font",
@@ -80,20 +91,35 @@
                     "help_text": "miRBase `.gff3` file, typically downloaded from [`https://mirbase.org/ftp/CURRENT/genomes/`](https://mirbase.org/ftp/CURRENT/genomes/)\n\nIf using iGenomes with `--genome` this file will be downloaded from miRBase automatically during the pipeline run.\n\n",
                     "fa_icon": "fas fa-address-book"
                 },
+                "mirGeneDB_gff": {
+                    "type": "string",
+                    "description": "GFF/GTF file with coordinates positions of precursor and miRNAs.",
+                    "help_text": "mirGeneDB `.gff3` file, typically downloaded from [`https://mirgenedb.org/download`](https://mirgenedb.org/download). This replaces the value of `--mirna_gtf` if `--mirGeneDB` is used."
+                },
                 "mature": {
                     "type": "string",
                     "description": "Path to FASTA file with mature miRNAs.",
                     "fa_icon": "fas fa-wheelchair",
                     "help_text": "Typically this will be the `mature.fa` file from miRBase. Can be given either as a plain text `.fa` file or a compressed `.gz` file.\n\nDefaults to the current miRBase release URL, from which the file will be downloaded.",
                     "default": "https://mirbase.org/ftp/CURRENT/mature.fa.gz"
                 },
+                "mirGeneDB_mature": {
+                    "type": "string",
+                    "description": "Path to FASTA file with mirGeneDB mature miRNAs.",
+                    "help_text": "This file needs to be downloaded from [`https://mirgenedb.org/download`](https://mirgenedb.org/download). Can be given either as a plain text `.fa` file or a compressed `.gz` file."
+                },
                 "hairpin": {
                     "type": "string",
                     "description": "Path to FASTA file with miRNAs precursors.",
                     "fa_icon": "fab fa-cuttlefish",
                     "help_text": "Typically this will be the `hairpin.fa` file from miRBase. Can be given either as a plain text `.fa` file or a compressed `.gz` file.\n\nDefaults to the current miRBase release URL, from which the file will be downloaded.",
                     "default": "https://mirbase.org/ftp/CURRENT/hairpin.fa.gz"
                 },
+                "mirGeneDB_hairpin": {
+                    "type": "string",
+                    "description": "Path to FASTA file with mirGeneDB miRNA precursors.",
+                    "help_text": "This file needs to be downloaded from [`https://mirgenedb.org/download`](https://mirgenedb.org/download). Can be given either as a plain text `.fa` file or a compressed `.gz` file.\nNote that mirGeneDB does not have a dedicated hairpin file. The equivalent is the `Precursor sequences`."
+                },
                 "bowtie_indices": {
                     "type": "string",
                     "description": "Path to a Bowtie 1 index directory",

diff --git a/workflows/smrnaseq.nf b/workflows/smrnaseq.nf
@@ -26,6 +26,7 @@ if (!params.mirtrace_species){
 // Genome options
 bt_index_from_species = params.genome ? params.genomes[ params.genome ].bowtie ?: false : false
 bt_index              = params.bowtie_indices ?: bt_index_from_species
+
 mirtrace_species_from_species = params.genome ? params.genomes[ params.genome ].mirtrace_species ?: false : false
 mirtrace_species = params.mirtrace_species ?: mirtrace_species_from_species
 fasta_from_species = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
@@ -51,8 +52,16 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
-if (params.mature) { reference_mature = file(params.mature, checkIfExists: true) } else { exit 1, "Mature miRNA fasta file not found: ${params.mature}" }
-if (params.hairpin) { reference_hairpin = file(params.hairpin, checkIfExists: true) } else { exit 1, "Hairpin miRNA fasta file not found: ${params.hairpin}" }
+if (!params.mirGeneDB) { // default database: miRBase mature/hairpin references
+    if (params.mature) { reference_mature = file(params.mature, checkIfExists: true) } else { exit 1, "Mature miRNA fasta file not found: ${params.mature}" }
+    if (params.hairpin) { reference_hairpin = file(params.hairpin, checkIfExists: true) } else { exit 1, "Hairpin miRNA fasta file not found: ${params.hairpin}" }
+    params.filterSpecies = params.mirtrace_species // species prefix read by PARSE_FASTA_MIRNA and MIRTOP_QUANT
+} else { // MirGeneDB: mature, hairpin, GFF and species must all be set, otherwise abort
+    if (params.mirGeneDB_mature) { reference_mature = file(params.mirGeneDB_mature, checkIfExists: true) } else { exit 1, "Mature miRNA fasta file not found: ${params.mirGeneDB_mature}" }
+    if (params.mirGeneDB_hairpin) { reference_hairpin = file(params.mirGeneDB_hairpin, checkIfExists: true) } else { exit 1, "Hairpin miRNA fasta file not found: ${params.mirGeneDB_hairpin}" }
+    if (params.mirGeneDB_gff) { mirna_gtf = file(params.mirGeneDB_gff, checkIfExists: true) } else { exit 1, "MirGeneDB gff file not found: ${params.mirGeneDB_gff}" }
+    if (params.mirGeneDB_species) { params.filterSpecies = params.mirGeneDB_species } else { exit 1, "MirGeneDB species not set: please provide --mirGeneDB_species" }
+}
 
 include { INPUT_CHECK       } from '../subworkflows/local/input_check'
 include { FASTQC_TRIMGALORE } from '../subworkflows/nf-core/fastqc_trimgalore'