Skip to content

Commit

Permalink
Update vep subwf (nf-core#3385)
Browse files Browse the repository at this point in the history
* commit before git merge

* complete the vep update

* update modules in meta

* fix tests

* update bcftools query test

* remove merge + update scatter

* update comments + fix tests

* ssssh eclint

* updated subwf description

* add contains to tests

* Update subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf

Co-authored-by: Maxime U Garcia <maxime.garcia@seqera.io>

* added snpeff

* align includes

* Update subworkflows/nf-core/vcf_annotate_ensemblvep_snpeff/meta.yml

Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>

* changes suggested by review

* add snpeff reports

* fix tests

* made scatter optional

* re-added the original subwf

* update meta.yml

---------

Co-authored-by: Maxime U Garcia <maxime.garcia@seqera.io>
Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
  • Loading branch information
3 people authored and buehlere committed Jul 18, 2023
1 parent 4c1acbc commit 4512120
Show file tree
Hide file tree
Showing 9 changed files with 854 additions and 7 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/pytest-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ jobs:
tags: gatk4/determinegermlinecontigploidy
- profile: "conda"
tags: subworkflows/bcl_demultiplex
- profile: "conda"
tags: subworkflows/vcf_annotate_ensemblvep_snpeff
- profile: "conda"
tags: subworkflows/vcf_annotate_ensemblvep
- profile: "conda"
Expand Down
6 changes: 3 additions & 3 deletions modules/nf-core/bcftools/query/main.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
process BCFTOOLS_QUERY {
tag "$meta.id"
label 'process_medium'
label 'process_single'

conda "bioconda::bcftools=1.17"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
Expand Down Expand Up @@ -29,12 +29,12 @@ process BCFTOOLS_QUERY {
def samples_file = samples ? "--samples-file ${samples}" : ""
"""
bcftools query \\
--output ${prefix}.txt \\
$regions_file \\
$targets_file \\
$samples_file \\
$args \\
$vcf
$vcf \\
> ${prefix}.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
191 changes: 191 additions & 0 deletions subworkflows/nf-core/vcf_annotate_ensemblvep_snpeff/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
//
// Run VEP and/or SNPEFF to annotate VCF files
//

include { ENSEMBLVEP_VEP } from '../../../modules/nf-core/ensemblvep/vep/main'
include { SNPEFF_SNPEFF } from '../../../modules/nf-core/snpeff/snpeff/main'
include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main'
include { BCFTOOLS_PLUGINSCATTER } from '../../../modules/nf-core/bcftools/pluginscatter/main'
include { BCFTOOLS_CONCAT } from '../../../modules/nf-core/bcftools/concat/main'
include { BCFTOOLS_SORT } from '../../../modules/nf-core/bcftools/sort/main'

workflow VCF_ANNOTATE_ENSEMBLVEP_SNPEFF {
take:
ch_vcf // channel: [ val(meta), path(vcf), path(tbi), [path(file1), path(file2)...] ]
ch_fasta // channel: [ val(meta2), path(fasta) ] (optional)
val_vep_genome // value: genome to use
val_vep_species // value: species to use
val_vep_cache_version // value: cache version to use
ch_vep_cache // channel: [ path(cache) ] (optional)
ch_vep_extra_files // channel: [ path(file1), path(file2)... ] (optional)
val_snpeff_db // value: the db version to use for snpEff
ch_snpeff_cache // channel: [ path(cache) ] (optional)
val_tools_to_use // value: a list of tools to use options are: ["ensemblvep", "snpeff"]
val_sites_per_chunk // value: the amount of variants per scattered VCF

main:
ch_versions = Channel.empty()

// Check if val_sites_per_chunk is set and scatter if it is
// (any falsy value - null, false, 0 or [] - disables the scatter-gather path entirely)
if(val_sites_per_chunk) {
//
// Prepare the input VCF channel for scattering (split VCFs from custom files)
//

ch_input = ch_vcf
.multiMap { meta, vcf, tbi, custom_files ->
vcf: [ meta, vcf, tbi ]
custom: [ meta, custom_files ]
}

//
// Scatter the input VCFs into multiple VCFs. These VCFs contain the amount of variants
// specified by `val_sites_per_chunk`. The lower this value is, the more files will be created
//

BCFTOOLS_PLUGINSCATTER(
ch_input.vcf,
val_sites_per_chunk,
[],
[],
[],
[]
)
ch_versions = ch_versions.mix(BCFTOOLS_PLUGINSCATTER.out.versions.first())

//
// Run the annotation with EnsemblVEP
//

ch_scatter = BCFTOOLS_PLUGINSCATTER.out.scatter
.map { meta, vcfs ->
// This checks if multiple files were created using the scatter process
// If multiple files are created, a list will be made as output of the process
// So if the output isn't a list, there is always one file and if there is a list,
// the amount of files in the list gets counted by .size()
is_list = vcfs instanceof ArrayList
count = is_list ? vcfs.size() : 1
[ meta, is_list ? vcfs : [vcfs], count ]
// Channel containing the list of VCFs and the size of this list
}
.transpose(by:1) // Transpose on the VCFs => Creates an entry for each VCF in the list
.combine(ch_input.custom, by: 0) // Re-add the sample specific custom files
.multiMap { meta, vcf, count, custom_files ->
// Define the new ID. The `_annotated` is to disambiguate the VEP output with its input
new_id = "${meta.id}${vcf.name.replace(meta.id,"").tokenize(".")[0]}_annotated" as String
new_meta = meta + [id:new_id]

// Create channels: one with the VEP input and one with the original ID and count of scattered VCFs
// (the count channel is consumed later by the gather step to know how many chunks to regroup)
input: [ new_meta, vcf, custom_files ]
count: [ new_meta, meta.id, count ]
}

ch_vep_input = ch_scatter.input
} else {
// Use the normal input when no scattering has to be performed
ch_vep_input = ch_vcf.map { meta, vcf, tbi, files -> [ meta, vcf, files ] }
}

// Annotate with ensemblvep if it's part of the requested tools
if("ensemblvep" in val_tools_to_use){
ENSEMBLVEP_VEP(
ch_vep_input,
val_vep_genome,
val_vep_species,
val_vep_cache_version,
ch_vep_cache,
ch_fasta,
ch_vep_extra_files
)
ch_versions = ch_versions.mix(ENSEMBLVEP_VEP.out.versions.first())

ch_vep_output = ENSEMBLVEP_VEP.out.vcf
ch_vep_reports = ENSEMBLVEP_VEP.out.report
} else {
// VEP skipped: pass the (possibly scattered) VCFs through unchanged, dropping the custom files
ch_vep_output = ch_vep_input.map { meta, vcf, files -> [ meta, vcf ] }
ch_vep_reports = Channel.empty()
}

// Annotate with snpeff if it's part of the requested tools
// (runs on the VEP output, so requesting both tools chains the annotations)
if("snpeff" in val_tools_to_use){
SNPEFF_SNPEFF(
ch_vep_output,
val_snpeff_db,
ch_snpeff_cache
)
ch_versions = ch_versions.mix(SNPEFF_SNPEFF.out.versions.first())

ch_snpeff_output = SNPEFF_SNPEFF.out.vcf
ch_snpeff_reports = SNPEFF_SNPEFF.out.report
ch_snpeff_html = SNPEFF_SNPEFF.out.summary_html
ch_snpeff_genes = SNPEFF_SNPEFF.out.genes_txt
} else {
ch_snpeff_output = ch_vep_output
ch_snpeff_reports = Channel.empty()
ch_snpeff_html = Channel.empty()
ch_snpeff_genes = Channel.empty()
}

// Gather the files back together if they were scattered
if(val_sites_per_chunk) {
//
// Concatenate the VCFs back together with bcftools concat
//

ch_concat_input = ch_snpeff_output
.join(ch_scatter.count, failOnDuplicate:true, failOnMismatch:true)
.map { meta, vcf, id, count ->
// Restore the original sample ID (strips the per-chunk `_annotated` suffix)
new_meta = meta + [id:id]
// groupKey() carries the expected chunk count, so groupTuple can emit a
// sample as soon as all of its chunks have arrived instead of waiting
// for the whole channel to complete
[ groupKey(new_meta, count), vcf ]
}
.groupTuple() // Group the VCFs which need to be concatenated
.map { it + [[]] } // presumably an empty placeholder for the optional index input of bcftools concat — TODO confirm against the module

BCFTOOLS_CONCAT(
ch_concat_input
)
ch_versions = ch_versions.mix(BCFTOOLS_CONCAT.out.versions.first())

//
// Sort the concatenate output (bcftools concat is unable to do this on its own)
//

BCFTOOLS_SORT(
BCFTOOLS_CONCAT.out.vcf
)
ch_versions = ch_versions.mix(BCFTOOLS_SORT.out.versions.first())

ch_ready_vcfs = BCFTOOLS_SORT.out.vcf
} else {
ch_ready_vcfs = ch_snpeff_output
}

//
// Index the resulting bgzipped VCFs
//

ch_tabix_input = ch_ready_vcfs
.branch { meta, vcf ->
// Split the bgzipped VCFs from the unzipped VCFs (only bgzipped VCFs should be indexed)
bgzip: vcf.extension == "gz"
unzip: true
// NOTE(review): this `return` remaps only the `unzip` branch above, padding
// it with an empty list in place of a .tbi so it can be mixed back in below
return [ meta, vcf, [] ]
}

TABIX_TABIX(
ch_tabix_input.bgzip
)
// NOTE(review): other version mixes in this workflow use .versions.first() — confirm whether that was intended here too
ch_versions = ch_versions.mix(TABIX_TABIX.out.versions)

ch_vcf_tbi = ch_tabix_input.bgzip
.join(TABIX_TABIX.out.tbi, failOnDuplicate: true, failOnMismatch: true)
.mix(ch_tabix_input.unzip)

emit:
vcf_tbi = ch_vcf_tbi // channel: [ val(meta), path(vcf), path(tbi) ]
vep_reports = ch_vep_reports // channel: [ path(html) ]
snpeff_reports = ch_snpeff_reports // channel: [ path(csv) ]
snpeff_html = ch_snpeff_html // channel: [ path(html) ]
snpeff_genes = ch_snpeff_genes // channel: [ path(genes) ]
versions = ch_versions // channel: [ versions.yml ]
}
73 changes: 73 additions & 0 deletions subworkflows/nf-core/vcf_annotate_ensemblvep_snpeff/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
name: vcf_annotate_ensemblvep_snpeff
description: |
  Perform annotation with ensemblvep and/or snpeff and bgzip + tabix index the resulting VCF file. This subworkflow uses the scatter-gather method to run VEP/snpEff in parallel to increase throughput. The input VCF is split into multiple smaller VCFs of fixed size, which are annotated separately and concatenated back together to a single output file per sample. Only VCF/BCF outputs are currently supported.
keywords:
  - vcf
  - annotation
  - ensemblvep
  - snpeff
modules:
  - ensemblvep/vep
  - snpeff/snpeff
  - tabix/tabix
  - bcftools/pluginscatter
  - bcftools/concat
  # bcftools/sort is used to sort the concatenated output of the gather step
  - bcftools/sort
input:
  - ch_vcf:
      description: |
        vcf file to annotate, with its index and any sample-specific custom files
        Structure: [ val(meta), path(vcf), path(tbi), [path(file1), path(file2)...] ]
  - ch_fasta:
      description: |
        Reference genome fasta file (optional)
        Structure: [ val(meta2), path(fasta) ]
  - val_vep_genome:
      type: string
      description: genome to use for ensemblvep
  - val_vep_species:
      type: string
      description: species to use for ensemblvep
  - val_vep_cache_version:
      type: integer
      description: cache version to use for ensemblvep
  - ch_vep_cache:
      description: |
        the root cache folder for ensemblvep (optional)
        Structure: [ path(cache) ]
  - ch_vep_extra_files:
      description: |
        any extra files needed by plugins for ensemblvep (optional)
        Structure: [ path(file1), path(file2)... ]
  - val_snpeff_db:
      type: string
      description: database to use for snpeff
  - ch_snpeff_cache:
      description: |
        the root cache folder for snpeff (optional)
        Structure: [ path(cache) ]
  - val_tools_to_use:
      type: list
      description: The tools to use. Options => '["ensemblvep", "snpeff"]'
  - val_sites_per_chunk:
      type: integer
      description: |
        The amount of variants per scattered VCF.
        Set this value to `null`, `[]` or `false` to disable scattering.
output:
  - vcf_tbi:
      description: |
        Compressed vcf file + tabix index
        Structure: [ val(meta), path(vcf), path(tbi) ]
  - vep_reports:
      type: file
      description: html summary reports created by ensemblvep
      pattern: "*.html"
  - snpeff_reports:
      type: file
      description: reports created by snpeff
      pattern: "*.csv"
  - snpeff_html:
      type: file
      description: html summary reports created by snpeff
      pattern: "*.html"
  - snpeff_genes:
      type: file
      description: gene tables created by snpeff
      pattern: "*.txt"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
authors:
  - "@maxulysse"
  - "@matthdsm"
  - "@nvnieuwk"
4 changes: 4 additions & 0 deletions tests/config/pytest_modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3736,6 +3736,10 @@ subworkflows/vcf_annotate_ensemblvep:
- subworkflows/nf-core/vcf_annotate_ensemblvep/**
- tests/subworkflows/nf-core/vcf_annotate_ensemblvep/**

subworkflows/vcf_annotate_ensemblvep_snpeff:
- subworkflows/nf-core/vcf_annotate_ensemblvep_snpeff/**
- tests/subworkflows/nf-core/vcf_annotate_ensemblvep_snpeff/**

subworkflows/vcf_annotate_snpeff:
- subworkflows/nf-core/vcf_annotate_snpeff/**
- tests/subworkflows/nf-core/vcf_annotate_snpeff/**
Expand Down
10 changes: 6 additions & 4 deletions tests/modules/nf-core/bcftools/query/test.yml
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
- name: bcftools query
- name: bcftools query test_bcftools_query
command: nextflow run ./tests/modules/nf-core/bcftools/query -entry test_bcftools_query -c ./tests/config/nextflow.config -c ./tests/modules/nf-core/bcftools/query/nextflow.config
tags:
- bcftools/query
- bcftools
files:
- path: output/bcftools/out.txt
md5sum: c32a6d28f185822d8fe1eeb7e42ec155
md5sum: 51d135de052f3bcef50dcd6b74806094
- path: output/bcftools/versions.yml

- name: bcftools query with optional files
- name: bcftools query test_bcftools_query_with_optional_files
command: nextflow run ./tests/modules/nf-core/bcftools/query -entry test_bcftools_query_with_optional_files -c ./tests/config/nextflow.config -c ./tests/modules/nf-core/bcftools/query/nextflow.config
tags:
- bcftools/query
- bcftools
files:
- path: output/bcftools/out.txt
md5sum: 5a87e0865df2f0ab2884fc113ec2a70d
md5sum: 1785d1957ba7206df852d0689b91753f
- path: output/bcftools/versions.yml
Loading

0 comments on commit 4512120

Please sign in to comment.