Merge branch 'main' into retreat-brainstorming

nf-core · Mar 12, 2024 · 7a64f75 · 7a64f75
2 parents d105f7f + 186265e
commit 7a64f75
Show file tree

Hide file tree

Showing 36 changed files with 962 additions and 10 deletions.
diff --git a/conf/test.config b/conf/test.config
@@ -0,0 +1 @@
+params.input = "${projectDir}/assets/test.csv"
diff --git a/main.nf b/main.nf
@@ -2,7 +2,51 @@ include { fromSamplesheet } from 'plugin/nf-validation'
 
 include { BOWTIE_BUILD } from "./modules/nf-core/bowtie/build/main"
 include { BOWTIE2_BUILD } from "./modules/nf-core/bowtie2/build/main"
+// RNASEQ
 include { STAR_GENOMEGENERATE } from "./modules/nf-core/star/genomegenerate/main"
+include { HISAT2_EXTRACTSPLICESITES } from "./modules/nf-core/hisat2/extractsplicesites"
+include { HISAT2_BUILD } from "./modules/nf-core/hisat2/build"
+include { RSEM_PREPAREREFERENCE as MAKE_TRANSCRIPTS_FASTA } from "./modules/nf-core/rsem/preparereference"
+include { SALMON_INDEX } from "./modules/nf-core/salmon/index"
+include { KALLISTO_INDEX } from "./modules/nf-core/kallisto/index"
+include { RSEM_PREPAREREFERENCE as RSEM_PREPAREREFERENCE_GENOME } from "./modules/nf-core/rsem/preparereference"
+
+workflow RNASEQ { // Builds RNA-seq reference assets (STAR, HISAT2, Salmon, Kallisto, RSEM) from a single reference channel
+    take:
+    reference // channel of tuples, destructured below as: meta, fasta, gtf, bed, readme, mito, size
+
+    main:
+    reference
+        .multiMap { meta, fasta, gtf, bed, readme, mito, size -> // fan out into named (meta, file) channels; readme/mito/size are ignored
+            fasta: tuple(meta, fasta)
+            gtf:   tuple(meta, gtf)
+            bed:   tuple(meta, bed) // NOTE(review): input.bed is not consumed anywhere in this workflow
+        }
+        .set { input }
+
+    STAR_GENOMEGENERATE ( input.fasta, input.gtf )
+
+    ch_splicesites = HISAT2_EXTRACTSPLICESITES ( input.gtf ).txt.map { it[1] } // keep only the second tuple element (the file), dropping meta
+    HISAT2_BUILD ( input.fasta, input.gtf, ch_splicesites.map { [ [:], it ] } ) // re-wrap the splice-site file with an empty meta map
+
+    ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( input.fasta, input.gtf ).transcript_fasta // MAKE_TRANSCRIPTS_FASTA is an include alias of RSEM_PREPAREREFERENCE
+
+    SALMON_INDEX ( input.fasta, ch_transcript_fasta )
+
+    KALLISTO_INDEX ( ch_transcript_fasta.map{[ [:], it]} ) // pair each item with an empty meta map
+
+    RSEM_PREPAREREFERENCE_GENOME ( input.fasta, input.gtf ) // NOTE(review): same module and inputs as MAKE_TRANSCRIPTS_FASTA; any difference would come from per-process config not visible here
+
+    emit:
+    star_index = STAR_GENOMEGENERATE.out.index
+    hisat2_index = HISAT2_BUILD.out.index
+    transcript_fasta = ch_transcript_fasta
+    salmon_index = SALMON_INDEX.out.index
+    kallisto_index = KALLISTO_INDEX.out.index
+    rsem_index = RSEM_PREPAREREFERENCE_GENOME.out.index
+}
+
+// TODO workflow SAREK {
 
 workflow INDEX {
     take:
@@ -19,17 +63,16 @@ workflow INDEX {
 
     BOWTIE_BUILD ( input.fasta )
     BOWTIE2_BUILD ( input.fasta )
-    STAR_GENOMEGENERATE ( input.fasta, input.gtf )
 
     emit:
     // bowtie_index = BOWTIE_BUILD.out.index
     bowtie2_index = BOWTIE2_BUILD.out.index
-    star_index = STAR_GENOMEGENERATE.out.index
 }
 
 
 workflow { // entry workflow: feeds the same samplesheet channel to both INDEX and RNASEQ
     ch_input = Channel.fromSamplesheet("input") // fromSamplesheet comes from the nf-validation plugin; presumably "input" refers to params.input — confirm
 
     INDEX ( ch_input )
+    RNASEQ ( ch_input )
 }
diff --git a/modules.json b/modules.json
@@ -15,6 +15,33 @@
             "git_sha": "1fea64f5132a813ec97c1c6d3a74e0aee7142b6d",
             "installed_by": ["modules"]
           },
+          "hisat2/build": {
+            "branch": "master",
+            "git_sha": "400037f54de4b0c42712ec5a499d9fd9e66250d1",
+            "installed_by": ["modules"]
+          },
+          "hisat2/extractsplicesites": {
+            "branch": "master",
+            "git_sha": "400037f54de4b0c42712ec5a499d9fd9e66250d1",
+            "installed_by": ["modules"]
+          },
+          "kallisto/index": {
+            "branch": "master",
+            "git_sha": "de5811dd9ca15af1e131806001bcaae909e42021",
+            "installed_by": ["modules"]
+          },
+          "rsem/preparereference": {
+            "branch": "master",
+            "git_sha": "301b088c7e9e00c4c80686411383f07173b54d69",
+            "installed_by": ["modules"],
+            "patch": "modules/nf-core/rsem/preparereference/rsem-preparereference.diff"
+          },
+          "salmon/index": {
+            "branch": "master",
+            "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d",
+            "installed_by": ["modules"],
+            "patch": "modules/nf-core/salmon/index/salmon-index.diff"
+          },
           "star/genomegenerate": {
             "branch": "master",
             "git_sha": "0e98289b5bec6e3f8f588a8a9d05e8aacc1179a0",

diff --git a/modules/nf-core/hisat2/build/environment.yml b/modules/nf-core/hisat2/build/environment.yml
@@ -0,0 +1,7 @@
+name: hisat2_build
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::hisat2=2.2.1
diff --git a/modules/nf-core/hisat2/build/main.nf b/modules/nf-core/hisat2/build/main.nf
@@ -0,0 +1,64 @@
+process HISAT2_BUILD { // Builds a HISAT2 index under ./hisat2; adds splice-site/exon options when the memory check below passes
+    tag "$fasta"
+    label 'process_high'
+    label 'process_high_memory'
+
+    // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions.
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/hisat2:2.2.1--h1b792b2_3' :
+        'biocontainers/hisat2:2.2.1--h1b792b2_3' }"
+
+    input:
+    tuple val(meta), path(fasta) // reference FASTA; this meta is the one emitted with the index
+    tuple val(meta2), path(gtf) // annotation GTF; used to derive the exon list
+    tuple val(meta3), path(splicesites) // splice-site file passed to hisat2-build via --ss
+
+    output:
+    tuple val(meta), path("hisat2") , emit: index
+    path "versions.yml"             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def avail_mem = 0 // task memory in GB; stays 0 when task.memory is not set
+    if (!task.memory) {
+        log.info "[HISAT2 index build] Available memory not known - defaulting to 0. Specify process memory requirements to change this."
+    } else {
+        log.info "[HISAT2 index build] Available memory: ${task.memory}"
+        avail_mem = task.memory.toGiga()
+    }
+
+    def ss = '' // "--ss <file>" option, filled in only when the memory check passes
+    def exon = '' // "--exon <file>" option, filled in only when the memory check passes
+    def extract_exons = '' // shell command that generates the exon list consumed by --exon
+    def hisat2_build_memory = params.hisat2_build_memory ? (params.hisat2_build_memory as nextflow.util.MemoryUnit).toGiga() : 0 // threshold in GB; 0 when the param is unset
+    if (avail_mem >= hisat2_build_memory) { // also true when both sides are 0 (no task memory and no threshold configured)
+        log.info "[HISAT2 index build] At least ${hisat2_build_memory} GB available, so using splice sites and exons to build HISAT2 index"
+        extract_exons = gtf ? "hisat2_extract_exons.py $gtf > ${gtf.baseName}.exons.txt" : ""
+        ss = splicesites ? "--ss $splicesites" : ""
+        exon = gtf ? "--exon ${gtf.baseName}.exons.txt" : "" // must match the file name written by extract_exons above
+    } else {
+        log.info "[HISAT2 index build] Less than ${hisat2_build_memory} GB available, so NOT using splice sites and exons to build HISAT2 index."
+        log.info "[HISAT2 index build] Use --hisat2_build_memory [small number] to skip this check."
+    }
+    def VERSION = '2.2.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+    """
+    mkdir hisat2
+    $extract_exons
+    hisat2-build \\
+        -p $task.cpus \\
+        $ss \\
+        $exon \\
+        $args \\
+        $fasta \\
+        hisat2/${fasta.baseName}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        hisat2: $VERSION
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/hisat2/build/meta.yml b/modules/nf-core/hisat2/build/meta.yml
@@ -0,0 +1,61 @@
+name: hisat2_build
+description: Builds HISAT2 index for reference genome
+keywords:
+  - build
+  - index
+  - fasta
+  - genome
+  - reference
+tools:
+  - hisat2:
+      description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome.
+      homepage: https://daehwankimlab.github.io/hisat2/
+      documentation: https://daehwankimlab.github.io/hisat2/manual/
+      doi: "10.1038/s41587-019-0201-4"
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - fasta:
+      type: file
+      description: Reference fasta file
+      pattern: "*.{fa,fasta,fna}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - gtf:
+      type: file
+      description: Reference gtf annotation file
+      pattern: "*.{gtf}"
+  - meta3:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - splicesites:
+      type: file
+      description: Splice sites in gtf file
+      pattern: "*.{txt}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - index:
+      type: file
+      description: HISAT2 genome index file
+      pattern: "*.ht2"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@ntoda03"
+maintainers:
+  - "@ntoda03"
diff --git a/modules/nf-core/hisat2/build/tests/main.nf.test b/modules/nf-core/hisat2/build/tests/main.nf.test
@@ -0,0 +1,53 @@
+nextflow_process { // nf-test spec for the HISAT2_BUILD process defined in ../main.nf
+
+    name "Test Process HISAT2_BUILD"
+    script "../main.nf"
+    process "HISAT2_BUILD"
+    tag "modules"
+    tag "modules_nfcore"
+    tag "hisat2"
+    tag "hisat2/build"
+    tag "hisat2/extractsplicesites"
+
+    test("Should run without failures") {
+
+        setup { // runs HISAT2_EXTRACTSPLICESITES first so its txt output can be used as input[2] below
+            run("HISAT2_EXTRACTSPLICESITES") {
+                script "../../extractsplicesites/main.nf"
+                process {
+                """
+                input[0] = Channel.of([
+                    [id:'genome'],
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true)
+                ])
+                """
+                }
+            }
+        }
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process { // inputs: [0] genome FASTA, [1] genome GTF, [2] splice sites from the setup step
+                """
+                input[0] = Channel.of([
+                    [id:'genome'],
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+                ])
+                input[1] = Channel.of([ [id:'genome'],
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true)
+                ])
+                input[2] = HISAT2_EXTRACTSPLICESITES.out.txt
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() } // compares every process output against the stored snapshot file
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/hisat2/build/tests/main.nf.test.snap b/modules/nf-core/hisat2/build/tests/main.nf.test.snap
@@ -0,0 +1,49 @@
+{
+    "Should run without failures": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "genome"
+                        },
+                        [
+                            "genome.1.ht2:md5,057cfa8a22b97ee9cff4c8d342498803",
+                            "genome.2.ht2:md5,47b153cd1319abc88dda532462651fcf",
+                            "genome.3.ht2:md5,4ed93abba181d8dfab2e303e33114777",
+                            "genome.4.ht2:md5,c25be5f8b0378abf7a58c8a880b87626",
+                            "genome.5.ht2:md5,91198831aaba993acac1734138c5f173",
+                            "genome.6.ht2:md5,265e1284ce85686516fae5d35540994a",
+                            "genome.7.ht2:md5,9013eccd91ad614d7893c739275a394f",
+                            "genome.8.ht2:md5,33cdeccccebe80329f1fdbee7f5874cb"
+                        ]
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,e36ef3cd73d19ccf2378c9358fe942c0"
+                ],
+                "index": [
+                    [
+                        {
+                            "id": "genome"
+                        },
+                        [
+                            "genome.1.ht2:md5,057cfa8a22b97ee9cff4c8d342498803",
+                            "genome.2.ht2:md5,47b153cd1319abc88dda532462651fcf",
+                            "genome.3.ht2:md5,4ed93abba181d8dfab2e303e33114777",
+                            "genome.4.ht2:md5,c25be5f8b0378abf7a58c8a880b87626",
+                            "genome.5.ht2:md5,91198831aaba993acac1734138c5f173",
+                            "genome.6.ht2:md5,265e1284ce85686516fae5d35540994a",
+                            "genome.7.ht2:md5,9013eccd91ad614d7893c739275a394f",
+                            "genome.8.ht2:md5,33cdeccccebe80329f1fdbee7f5874cb"
+                        ]
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,e36ef3cd73d19ccf2378c9358fe942c0"
+                ]
+            }
+        ],
+        "timestamp": "2023-10-16T14:42:22.381609786"
+    }
+}
diff --git a/modules/nf-core/hisat2/build/tests/tags.yml b/modules/nf-core/hisat2/build/tests/tags.yml
@@ -0,0 +1,3 @@
+hisat2/build:
+  - modules/nf-core/hisat2/build/**
+  - modules/nf-core/hisat2/extractsplicesites/**
diff --git a/modules/nf-core/hisat2/extractsplicesites/environment.yml b/modules/nf-core/hisat2/extractsplicesites/environment.yml
@@ -0,0 +1,7 @@
+name: hisat2_extractsplicesites
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::hisat2=2.2.1
diff --git a/modules/nf-core/hisat2/extractsplicesites/main.nf b/modules/nf-core/hisat2/extractsplicesites/main.nf
@@ -0,0 +1,31 @@
+process HISAT2_EXTRACTSPLICESITES { // Runs hisat2_extract_splice_sites.py on a GTF and emits the resulting splice-site text file
+    tag "$gtf"
+    label 'process_medium'
+
+    // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions.
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/hisat2:2.2.1--h1b792b2_3' :
+        'biocontainers/hisat2:2.2.1--h1b792b2_3' }"
+
+    input:
+    tuple val(meta), path(gtf) // meta is passed through unchanged to the txt output
+
+    output:
+    tuple val(meta), path("*.splice_sites.txt"), emit: txt
+    path "versions.yml"                        , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: '' // NOTE(review): args is never interpolated into the command below, so task.ext.args has no effect here
+    def VERSION = '2.2.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+    """
+    hisat2_extract_splice_sites.py $gtf > ${gtf.baseName}.splice_sites.txt
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        hisat2: $VERSION
+    END_VERSIONS
+    """
+}