Skip to content

Commit

Permalink
Update vep subwf (nf-core#3385)
Browse files Browse the repository at this point in the history
* commit before git merge

* complete the vep update

* update modules in meta

* fix tests

* update bcftools query test

* remove merge + update scatter

* update comments + fix tests

* ssssh eclint

* updated subwf description

* add contains to tests

* Update subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf

Co-authored-by: Maxime U Garcia <maxime.garcia@seqera.io>

* added snpeff

* align includes

* Update subworkflows/nf-core/vcf_annotate_ensemblvep_snpeff/meta.yml

Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>

* changes suggested by review

* add snpeff reports

* fix tests

* made scatter optional

* re-added the original subwf

* update meta.yml

---------

Co-authored-by: Maxime U Garcia <maxime.garcia@seqera.io>
Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
  • Loading branch information
3 people authored and buehlere committed Jul 18, 2023
1 parent 4c1acbc commit 4512120
Show file tree
Hide file tree
Showing 9 changed files with 854 additions and 7 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/pytest-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ jobs:
tags: gatk4/determinegermlinecontigploidy
- profile: "conda"
tags: subworkflows/bcl_demultiplex
- profile: "conda"
tags: subworkflows/vcf_annotate_ensemblvep_snpeff
- profile: "conda"
tags: subworkflows/vcf_annotate_ensemblvep
- profile: "conda"
Expand Down
6 changes: 3 additions & 3 deletions modules/nf-core/bcftools/query/main.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
process BCFTOOLS_QUERY {
tag "$meta.id"
label 'process_medium'
label 'process_single'

conda "bioconda::bcftools=1.17"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
Expand Down Expand Up @@ -29,12 +29,12 @@ process BCFTOOLS_QUERY {
def samples_file = samples ? "--samples-file ${samples}" : ""
"""
bcftools query \\
--output ${prefix}.txt \\
$regions_file \\
$targets_file \\
$samples_file \\
$args \\
$vcf
$vcf \\
> ${prefix}.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
191 changes: 191 additions & 0 deletions subworkflows/nf-core/vcf_annotate_ensemblvep_snpeff/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
//
// Run VEP and/or SNPEFF to annotate VCF files
//

include { ENSEMBLVEP_VEP } from '../../../modules/nf-core/ensemblvep/vep/main'
include { SNPEFF_SNPEFF } from '../../../modules/nf-core/snpeff/snpeff/main'
include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main'
include { BCFTOOLS_PLUGINSCATTER } from '../../../modules/nf-core/bcftools/pluginscatter/main'
include { BCFTOOLS_CONCAT } from '../../../modules/nf-core/bcftools/concat/main'
include { BCFTOOLS_SORT } from '../../../modules/nf-core/bcftools/sort/main'

workflow VCF_ANNOTATE_ENSEMBLVEP_SNPEFF {
take:
ch_vcf // channel: [ val(meta), path(vcf), path(tbi), [path(file1), path(file2)...] ]
ch_fasta // channel: [ val(meta2), path(fasta) ] (optional)
val_vep_genome // value: genome to use
val_vep_species // value: species to use
val_vep_cache_version // value: cache version to use
ch_vep_cache // channel: [ path(cache) ] (optional)
ch_vep_extra_files // channel: [ path(file1), path(file2)... ] (optional)
val_snpeff_db // value: the db version to use for snpEff
ch_snpeff_cache // channel: [ path(cache) ] (optional)
val_tools_to_use // value: a list of tools to use options are: ["ensemblvep", "snpeff"]
val_sites_per_chunk // value: the amount of variants per scattered VCF

main:
ch_versions = Channel.empty()

// Check if val_sites_per_chunk is set and scatter if it is
// (any falsy value - null, false, 0 or [] - disables the scatter-gather path entirely)
if(val_sites_per_chunk) {
//
// Prepare the input VCF channel for scattering (split VCFs from custom files)
//

ch_input = ch_vcf
.multiMap { meta, vcf, tbi, custom_files ->
vcf: [ meta, vcf, tbi ]
custom: [ meta, custom_files ]
}

//
// Scatter the input VCFs into multiple VCFs. These VCFs contain the amount of variants
// specified by `val_sites_per_chunk`. The lower this value is, the more files will be created
//

BCFTOOLS_PLUGINSCATTER(
ch_input.vcf,
val_sites_per_chunk,
[],
[],
[],
[]
)
ch_versions = ch_versions.mix(BCFTOOLS_PLUGINSCATTER.out.versions.first())

//
// Run the annotation with EnsemblVEP
//

ch_scatter = BCFTOOLS_PLUGINSCATTER.out.scatter
.map { meta, vcfs ->
// This checks if multiple files were created using the scatter process
// If multiple files are created, a list will be made as output of the process
// So if the output isn't a list, there is always one file and if there is a list,
// the amount of files in the list gets counted by .size()
is_list = vcfs instanceof ArrayList
count = is_list ? vcfs.size() : 1
[ meta, is_list ? vcfs : [vcfs], count ]
// Channel containing the list of VCFs and the size of this list
}
.transpose(by:1) // Transpose on the VCFs => Creates an entry for each VCF in the list
.combine(ch_input.custom, by: 0) // Re-add the sample specific custom files
.multiMap { meta, vcf, count, custom_files ->
// Define the new ID. The `_annotated` is to disambiguate the VEP output with its input
new_id = "${meta.id}${vcf.name.replace(meta.id,"").tokenize(".")[0]}_annotated" as String
new_meta = meta + [id:new_id]

// Create channels: one with the VEP input and one with the original ID and count of scattered VCFs
// (the count channel is consumed later by the gather step to know how many chunks to regroup)
input: [ new_meta, vcf, custom_files ]
count: [ new_meta, meta.id, count ]
}

ch_vep_input = ch_scatter.input
} else {
// Use the normal input when no scattering has to be performed
ch_vep_input = ch_vcf.map { meta, vcf, tbi, files -> [ meta, vcf, files ] }
}

// Annotate with ensemblvep if it's part of the requested tools
if("ensemblvep" in val_tools_to_use){
ENSEMBLVEP_VEP(
ch_vep_input,
val_vep_genome,
val_vep_species,
val_vep_cache_version,
ch_vep_cache,
ch_fasta,
ch_vep_extra_files
)
ch_versions = ch_versions.mix(ENSEMBLVEP_VEP.out.versions.first())

ch_vep_output = ENSEMBLVEP_VEP.out.vcf
ch_vep_reports = ENSEMBLVEP_VEP.out.report
} else {
// VEP skipped: pass the (possibly scattered) VCFs through unchanged, dropping the custom files
ch_vep_output = ch_vep_input.map { meta, vcf, files -> [ meta, vcf ] }
ch_vep_reports = Channel.empty()
}

// Annotate with snpeff if it's part of the requested tools
// (runs on the VEP output, so requesting both tools chains the annotations)
if("snpeff" in val_tools_to_use){
SNPEFF_SNPEFF(
ch_vep_output,
val_snpeff_db,
ch_snpeff_cache
)
ch_versions = ch_versions.mix(SNPEFF_SNPEFF.out.versions.first())

ch_snpeff_output = SNPEFF_SNPEFF.out.vcf
ch_snpeff_reports = SNPEFF_SNPEFF.out.report
ch_snpeff_html = SNPEFF_SNPEFF.out.summary_html
ch_snpeff_genes = SNPEFF_SNPEFF.out.genes_txt
} else {
ch_snpeff_output = ch_vep_output
ch_snpeff_reports = Channel.empty()
ch_snpeff_html = Channel.empty()
ch_snpeff_genes = Channel.empty()
}

// Gather the files back together if they were scattered
if(val_sites_per_chunk) {
//
// Concatenate the VCFs back together with bcftools concat
//

ch_concat_input = ch_snpeff_output
.join(ch_scatter.count, failOnDuplicate:true, failOnMismatch:true)
.map { meta, vcf, id, count ->
// Restore the original sample ID (strips the per-chunk `_annotated` suffix)
new_meta = meta + [id:id]
// groupKey() carries the expected chunk count, so groupTuple can emit a
// sample as soon as all of its chunks have arrived instead of waiting
// for the whole channel to complete
[ groupKey(new_meta, count), vcf ]
}
.groupTuple() // Group the VCFs which need to be concatenated
.map { it + [[]] } // presumably an empty placeholder for the optional index input of bcftools concat — TODO confirm against the module

BCFTOOLS_CONCAT(
ch_concat_input
)
ch_versions = ch_versions.mix(BCFTOOLS_CONCAT.out.versions.first())

//
// Sort the concatenate output (bcftools concat is unable to do this on its own)
//

BCFTOOLS_SORT(
BCFTOOLS_CONCAT.out.vcf
)
ch_versions = ch_versions.mix(BCFTOOLS_SORT.out.versions.first())

ch_ready_vcfs = BCFTOOLS_SORT.out.vcf
} else {
ch_ready_vcfs = ch_snpeff_output
}

//
// Index the resulting bgzipped VCFs
//

ch_tabix_input = ch_ready_vcfs
.branch { meta, vcf ->
// Split the bgzipped VCFs from the unzipped VCFs (only bgzipped VCFs should be indexed)
bgzip: vcf.extension == "gz"
unzip: true
// NOTE(review): this `return` remaps only the `unzip` branch above, padding
// it with an empty list in place of a .tbi so it can be mixed back in below
return [ meta, vcf, [] ]
}

TABIX_TABIX(
ch_tabix_input.bgzip
)
// NOTE(review): other version mixes in this workflow use .versions.first() — confirm whether that was intended here too
ch_versions = ch_versions.mix(TABIX_TABIX.out.versions)

ch_vcf_tbi = ch_tabix_input.bgzip
.join(TABIX_TABIX.out.tbi, failOnDuplicate: true, failOnMismatch: true)
.mix(ch_tabix_input.unzip)

emit:
vcf_tbi = ch_vcf_tbi // channel: [ val(meta), path(vcf), path(tbi) ]
vep_reports = ch_vep_reports // channel: [ path(html) ]
snpeff_reports = ch_snpeff_reports // channel: [ path(csv) ]
snpeff_html = ch_snpeff_html // channel: [ path(html) ]
snpeff_genes = ch_snpeff_genes // channel: [ path(genes) ]
versions = ch_versions // channel: [ versions.yml ]
}
73 changes: 73 additions & 0 deletions subworkflows/nf-core/vcf_annotate_ensemblvep_snpeff/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
name: vcf_annotate_ensemblvep_snpeff
description: |
  Perform annotation with ensemblvep and/or snpeff and bgzip + tabix index the resulting VCF file. This subworkflow uses the scatter-gather method to run VEP/snpEff in parallel to increase throughput. The input VCF is split into multiple smaller VCFs of fixed size, which are annotated separately and concatenated back together to a single output file per sample. Only VCF/BCF outputs are currently supported.
keywords:
  - vcf
  - annotation
  - ensemblvep
  - snpeff
modules:
  - ensemblvep/vep
  - snpeff/snpeff
  - tabix/tabix
  - bcftools/pluginscatter
  - bcftools/concat
  # bcftools/sort is used to sort the concatenated output of the gather step
  - bcftools/sort
input:
  - ch_vcf:
      description: |
        vcf file to annotate, with its index and any sample-specific custom files
        Structure: [ val(meta), path(vcf), path(tbi), [path(file1), path(file2)...] ]
  - ch_fasta:
      description: |
        Reference genome fasta file (optional)
        Structure: [ val(meta2), path(fasta) ]
  - val_vep_genome:
      type: string
      description: genome to use for ensemblvep
  - val_vep_species:
      type: string
      description: species to use for ensemblvep
  - val_vep_cache_version:
      type: integer
      description: cache version to use for ensemblvep
  - ch_vep_cache:
      description: |
        the root cache folder for ensemblvep (optional)
        Structure: [ path(cache) ]
  - ch_vep_extra_files:
      description: |
        any extra files needed by plugins for ensemblvep (optional)
        Structure: [ path(file1), path(file2)... ]
  - val_snpeff_db:
      type: string
      description: database to use for snpeff
  - ch_snpeff_cache:
      description: |
        the root cache folder for snpeff (optional)
        Structure: [ path(cache) ]
  - val_tools_to_use:
      type: list
      description: The tools to use. Options => '["ensemblvep", "snpeff"]'
  - val_sites_per_chunk:
      type: integer
      description: |
        The amount of variants per scattered VCF.
        Set this value to `null`, `[]` or `false` to disable scattering.
output:
  - vcf_tbi:
      description: |
        Compressed vcf file + tabix index
        Structure: [ val(meta), path(vcf), path(tbi) ]
  - vep_reports:
      type: file
      description: html summary reports created by ensemblvep
      pattern: "*.html"
  - snpeff_reports:
      type: file
      description: reports created by snpeff
      pattern: "*.csv"
  - snpeff_html:
      type: file
      description: html summary reports created by snpeff
      pattern: "*.html"
  - snpeff_genes:
      type: file
      description: gene tables created by snpeff
      pattern: "*.txt"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
authors:
  - "@maxulysse"
  - "@matthdsm"
  - "@nvnieuwk"
4 changes: 4 additions & 0 deletions tests/config/pytest_modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3736,6 +3736,10 @@ subworkflows/vcf_annotate_ensemblvep:
- subworkflows/nf-core/vcf_annotate_ensemblvep/**
- tests/subworkflows/nf-core/vcf_annotate_ensemblvep/**

subworkflows/vcf_annotate_ensemblvep_snpeff:
- subworkflows/nf-core/vcf_annotate_ensemblvep_snpeff/**
- tests/subworkflows/nf-core/vcf_annotate_ensemblvep_snpeff/**

subworkflows/vcf_annotate_snpeff:
- subworkflows/nf-core/vcf_annotate_snpeff/**
- tests/subworkflows/nf-core/vcf_annotate_snpeff/**
Expand Down
10 changes: 6 additions & 4 deletions tests/modules/nf-core/bcftools/query/test.yml
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
- name: bcftools query
- name: bcftools query test_bcftools_query
command: nextflow run ./tests/modules/nf-core/bcftools/query -entry test_bcftools_query -c ./tests/config/nextflow.config -c ./tests/modules/nf-core/bcftools/query/nextflow.config
tags:
- bcftools/query
- bcftools
files:
- path: output/bcftools/out.txt
md5sum: c32a6d28f185822d8fe1eeb7e42ec155
md5sum: 51d135de052f3bcef50dcd6b74806094
- path: output/bcftools/versions.yml

- name: bcftools query with optional files
- name: bcftools query test_bcftools_query_with_optional_files
command: nextflow run ./tests/modules/nf-core/bcftools/query -entry test_bcftools_query_with_optional_files -c ./tests/config/nextflow.config -c ./tests/modules/nf-core/bcftools/query/nextflow.config
tags:
- bcftools/query
- bcftools
files:
- path: output/bcftools/out.txt
md5sum: 5a87e0865df2f0ab2884fc113ec2a70d
md5sum: 1785d1957ba7206df852d0689b91753f
- path: output/bcftools/versions.yml
Loading

0 comments on commit 4512120

Please sign in to comment.