diff --git a/CHANGELOG.md b/CHANGELOG.md index bc90054c88..60d93f7426 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- [#1193](https://github.com/nf-core/sarek/pull/1193) - Adding support for Sentieon's DnaScope for germline variant-calling including joint-germline. - [#1271](https://github.com/nf-core/sarek/pull/1271) - Back to dev ### Changed diff --git a/conf/modules/prepare_genome.config b/conf/modules/prepare_genome.config index 66eb2041d2..85367196c2 100644 --- a/conf/modules/prepare_genome.config +++ b/conf/modules/prepare_genome.config @@ -76,7 +76,7 @@ process { } withName: 'TABIX_DBSNP' { - ext.when = { !params.dbsnp_tbi && params.dbsnp && ((params.step == "mapping" || params.step == "markduplicates" || params.step == "prepare_recalibration") || params.tools && (params.tools.split(',').contains('controlfreec') || params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('mutect2'))) } + ext.when = { !params.dbsnp_tbi && params.dbsnp && ((params.step == "mapping" || params.step == "markduplicates" || params.step == "prepare_recalibration") || params.tools && (params.tools.split(',').contains('controlfreec') || params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope') || params.tools.split(',').contains('mutect2'))) } publishDir = [ enabled: (params.save_reference || params.build_only_index), mode: params.publish_dir_mode, @@ -96,7 +96,7 @@ process { } withName: 'TABIX_KNOWN_INDELS' { - ext.when = { !params.known_indels_tbi && params.known_indels && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || 
params.tools.split(',').contains('sentieon_haplotyper'))) ) } + ext.when = { !params.known_indels_tbi && params.known_indels && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope'))) ) } publishDir = [ enabled: (params.save_reference || params.build_only_index), mode: params.publish_dir_mode, @@ -106,7 +106,7 @@ process { } withName: 'TABIX_KNOWN_SNPS' { - ext.when = { !params.known_snps_tbi && params.known_snps && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper'))) ) } + ext.when = { !params.known_snps_tbi && params.known_snps && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') )) ) } publishDir = [ enabled: (params.save_reference || params.build_only_index), mode: params.publish_dir_mode, diff --git a/conf/modules/sentieon_dnascope.config b/conf/modules/sentieon_dnascope.config new file mode 100644 index 0000000000..fa431ae417 --- /dev/null +++ b/conf/modules/sentieon_dnascope.config @@ -0,0 +1,68 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). 
+ ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// SENTIEON DNASCOPE + +process { + + withName: 'SENTIEON_DNASCOPE' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.dnascope" : "${meta.id}.dnascope.${intervals.simpleName}" } + ext.when = { params.tools && params.tools.split(',').contains('sentieon_dnascope') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}", + saveAs: { meta.num_intervals > 1 ? null : "sentieon_dnascope/${meta.id}/${it}" } + ] + } + + withName: 'MERGE_SENTIEON_DNASCOPE_VCFS' { + ext.prefix = { params.joint_germline ? "${meta.id}.dnascope.g" : "${meta.id}.dnascope.unfiltered" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'MERGE_SENTIEON_DNASCOPE_GVCFS' { + ext.prefix = { "${meta.id}.dnascope.g" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + if (params.tools && params.tools.contains('sentieon_dnascope')) { + withName: '.*FILTERVARIANTTRANCHES' { + ext.prefix = {"${meta.id}.dnascope"} + ext.args = { "--info-key CNN_1D" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/${meta.id}/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + } + + withName: 'SENTIEON_DNAMODELAPPLY' { + ext.prefix = {"${meta.id}.dnascope.filtered"} + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/${meta.id}/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + +} diff --git a/conf/modules/sentieon_dnascope_joint_germline.config b/conf/modules/sentieon_dnascope_joint_germline.config new file mode 100644 index 0000000000..72dd6c3144 --- /dev/null +++ b/conf/modules/sentieon_dnascope_joint_germline.config @@ -0,0 +1,45 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// SENTIEON DNASCOPE JOINT_GERMLINE + +process { + + // TO-DO: duplicate!! 
+ withName: 'SENTIEON_GVCFTYPER' { + ext.args = { "--allow-old-rms-mapping-quality-annotation-data" } + ext.prefix = { meta.intervals_name } + publishDir = [ + enabled: false + ] + } + + if (params.tools && params.tools.contains('sentieon_dnascope') && params.joint_germline) { + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_JOINT_CALLING_GERMLINE_SENTIEON:BCFTOOLS_SORT' { + ext.prefix = { vcf.baseName - ".vcf" + ".sort" } + publishDir = [ + enabled: false + ] + } + + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_JOINT_CALLING_GERMLINE_SENTIEON:MERGE_GENOTYPEGVCFS' { + ext.prefix = "joint_germline" + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/joint_variant_calling/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + } +} diff --git a/conf/modules/sentieon_joint_germline.config b/conf/modules/sentieon_haplotyper_joint_germline.config similarity index 100% rename from conf/modules/sentieon_joint_germline.config rename to conf/modules/sentieon_haplotyper_joint_germline.config diff --git a/docs/output.md b/docs/output.md index b250ed2817..b03f4c11d0 100644 --- a/docs/output.md +++ b/docs/output.md @@ -37,8 +37,10 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [GATK Germline Single Sample Variant Calling](#gatk-germline-single-sample-variant-calling) - [GATK Joint Germline Variant Calling](#gatk-joint-germline-variant-calling) - [GATK Mutect2](#gatk-mutect2) + - [Sentieon DNAscope](#sentieon-dnascope) + - [Sentieon DNAscope joint germline variant calling](#sentieon-dnascope-joint-germline-variant-calling) - [Sentieon Haplotyper](#sentieon-haplotyper) - - [Sentieon Joint Germline Variant Calling](#sentieon-joint-germline-variant-calling) + - [Sentieon Haplotyper joint germline variant calling](#sentieon-haplotyper-joint-germline-variant-calling) - 
[Strelka2](#strelka2) - [Structural Variants](#structural-variants) - [Manta](#manta) @@ -442,6 +444,53 @@ Files created: +#### Sentieon DNAscope + +[Sentieon DNAscope](https://support.sentieon.com/appnotes/dnascope_ml/#dnascope-germline-variant-calling-with-a-machine-learning-model) is a variant-caller which aims at outperforming GATK's Haplotypecaller in terms of both speed and accuracy. DNAscope allows you to use a machine learning model to perform variant calling with higher accuracy by improving the candidate detection and filtering. + +
+Unfiltered VCF-files for normal samples + +**Output directory: `{outdir}/variant_calling/sentieon_dnascope/<sample>/`** + +- `<sample>.dnascope.unfiltered.vcf.gz` and `<sample>.dnascope.unfiltered.vcf.gz.tbi` + - VCF with tabix index + +
+ +The output from Sentieon's DNAscope can be controlled through the option `--sentieon_dnascope_emit_mode` for Sarek, see [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions). + +Unless `dnascope_filter` is listed under `--skip_tools` in the nextflow command, Sentieon's [DNAModelApply](https://support.sentieon.com/manual/usages/general/#dnamodelapply-algorithm) is applied to the unfiltered VCF-files in order to obtain filtered VCF-files. + +
+Filtered VCF-files for normal samples + +**Output directory: `{outdir}/variant_calling/sentieon_dnascope/<sample>/`** + +- `<sample>.dnascope.filtered.vcf.gz` and `<sample>.dnascope.filtered.vcf.gz.tbi` + - VCF with tabix index + +
+ +##### Sentieon DNAscope joint germline variant calling + +In Sentieon's package DNAscope, joint germline variant calling is done by first running Sentieon's DNAscope in emit-mode `gvcf` for each sample and then running Sentieon's [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) on the set of gVCF-files. See [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions) for information on how joint germline variant calling can be done in Sarek using Sentieon's DNAscope. + +
+Output files from joint germline variant calling + +**Output directory: `{outdir}/variant_calling/sentieon_dnascope/<sample>/`** + +- `<sample>.dnascope.g.vcf.gz` and `<sample>.dnascope.g.vcf.gz.tbi` + - VCF with tabix index + +**Output directory: `{outdir}/variant_calling/sentieon_dnascope/joint_variant_calling/`** + +- `joint_germline.vcf.gz` and `joint_germline.vcf.gz.tbi` + - VCF with tabix index + +
+ #### Sentieon Haplotyper [Sentieon Haplotyper](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm) is Sention's speedup version of GATK's Haplotypecaller (see above). @@ -456,7 +505,7 @@ Files created: -The output from Sentieon's Haplotyper can be controlled through the option `--sentieon_haplotyper_emit_mode` for Sarek, see [Basic usage of Sentieon functions in Sarek](#basic-usage-of-sentieon-functions-in-sarek). +The output from Sentieon's Haplotyper can be controlled through the option `--sentieon_haplotyper_emit_mode` for Sarek, see [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions). Unless `haplotyper_filter` is listed under `--skip_tools` in the nextflow command, GATK's CNNScoreVariants and FilterVariantTranches (see above) is applied to the unfiltered VCF-files in order to obtain filtered VCF-files. @@ -470,16 +519,16 @@ Unless `haplotyper_filter` is listed under `--skip_tools` in the nextflow comman -##### Sentieon Joint Germline Variant Calling +##### Sentieon Haplotyper joint germline variant calling -In Sentieon's package DNAseq, joint germline variant calling is done by first running Sentieon's Haplotyper in emit-mode `gvcf` for each sample and then running Sentieon's [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) on the set of gVCF-files. See [Basic usage of Sentieon functions in Sarek](#basic-usage-of-sentieon-functions-in-sarek) for information on how joint germline variant calling can be done in Sarek using Sentieon's DNAseq. After joint genotyping, Sentieon's version of VQSR ([VarCal](https://support.sentieon.com/manual/usages/general/#varcal-algorithm) and [ApplyVarCal](https://support.sentieon.com/manual/usages/general/#applyvarcal-algorithm)) is applied for filtering to produce the final multisample callset with the desired balance of precision and sensitivity. 
+In Sentieon's package DNAseq, joint germline variant calling is done by first running Sentieon's Haplotyper in emit-mode `gvcf` for each sample and then running Sentieon's [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) on the set of gVCF-files. See [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions) for information on how joint germline variant calling can be done in Sarek using Sentieon's DNAseq. After joint genotyping, Sentieon's version of VQSR ([VarCal](https://support.sentieon.com/manual/usages/general/#varcal-algorithm) and [ApplyVarCal](https://support.sentieon.com/manual/usages/general/#applyvarcal-algorithm)) is applied for filtering to produce the final multisample callset with the desired balance of precision and sensitivity.
Output files from joint germline variant calling **Output directory: `{outdir}/variantcalling/sentieon_haplotyper//`** -- `.haplotypecaller.g.vcf.gz` and `.haplotypecaller.g.vcf.gz.tbi` +- `.haplotyper.g.vcf.gz` and `.haplotyper.g.vcf.gz.tbi` - VCF with tabix index **Output directory: `{outdir}/variantcalling/sentieon_haplotyper/joint_variant_calling/`** diff --git a/docs/usage.md b/docs/usage.md index 0895fac23c..3b2767b302 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1083,7 +1083,9 @@ nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat ### Available Sentieon functions -Sarek contains the following Sentieon functions [bwa mem](https://support.sentieon.com/manual/usages/general/#bwa-mem-syntax), [LocusCollector](https://support.sentieon.com/manual/usages/general/#locuscollector-algorithm) + [Dedup](https://support.sentieon.com/manual/usages/general/#dedup-algorithm), [Haplotyper](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm), [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) and [VarCal](https://support.sentieon.com/manual/usages/general/#varcal-algorithm) + [ApplyVarCal](https://support.sentieon.com/manual/usages/general/#applyvarcal-algorithm), so the basic processing of alignment of fastq-files to VCF-files can be done using speedup Sentieon functions. 
+Sarek contains the following Sentieon functions from [DNAseq](https://support.sentieon.com/manual/DNAseq_usage/dnaseq/): [bwa mem](https://support.sentieon.com/manual/usages/general/#bwa-mem-syntax), [LocusCollector](https://support.sentieon.com/manual/usages/general/#locuscollector-algorithm) + [Dedup](https://support.sentieon.com/manual/usages/general/#dedup-algorithm), [Haplotyper](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm), [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) and [VarCal](https://support.sentieon.com/manual/usages/general/#varcal-algorithm) + [ApplyVarCal](https://support.sentieon.com/manual/usages/general/#applyvarcal-algorithm), so the basic processing of alignment of fastq-files to VCF-files can be done using speedup Sentieon functions. + +Sarek also contains the Sentieon functions [DNAscope](https://support.sentieon.com/manual/usages/general/?highlight=dnamodelapply#dnascope-algorithm) and [DNAModelApply](https://support.sentieon.com/manual/usages/general/?highlight=dnamodelapply#dnamodelapply-algorithm). ### Basic usage of Sentieon functions @@ -1091,9 +1093,11 @@ To use Sentieon's aligner `bwa mem`, set the aligner option `sentieon-bwamem`. ( To use Sentieon's function `Dedup`, specify `sentieon_dedup` as one of the tools. (This can, for example, be done by adding `--tools sentieon_dedup` to the nextflow run command.) -To use Sentieon's function `Haplotyper`, specify `sentieon_haplotyper` as one of the tools. This can, for example, be done by adding `--tools sentieon_haplotyper` to the nextflow run command. In order to skip the GATK-based variant-filter, one may add `--skip_tools haplotyper_filter` to the nextflow run command. Sarek also provides the option `sentieon_haplotyper_emit_mode` which can be used to set the [emit-mode](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm) of Sentieon's haplotyper.
Sentieon's haplotyper can output both a vcf-file and a gvcf-file in the same run; this is achieved by setting `sentieon_haplotyper_emit_mode` to `,gvcf`, where `` is `variant`, `confident` or `all`. +To use Sentieon's function `DNAscope`, specify `sentieon_dnascope` as one of the tools. This can, for example, be done by adding `--tools sentieon_dnascope` to the nextflow run command. In order to skip Sentieon's variant-filter `DNAModelApply`, one may add `--skip_tools dnascope_filter` to the nextflow run command. Sarek also provides the option `sentieon_dnascope_emit_mode` which can be used to set the [emit-mode](https://support.sentieon.com/manual/usages/general/#dnascope-algorithm) of Sentieon's dnascope. Sentieon's dnascope can output both a vcf-file and a gvcf-file in the same run; this is achieved by setting `sentieon_dnascope_emit_mode` to `,gvcf`, where `` is `variant`, `confident` or `all`. + +Sentieon's function `Haplotyper` is used in much the same way as `DNAscope`. To use Sentieon's function `Haplotyper`, specify `sentieon_haplotyper` as one of the tools. This can, for example, be done by adding `--tools sentieon_haplotyper` to the nextflow run command. In order to skip the GATK-based variant-filter, one may add `--skip_tools haplotyper_filter` to the nextflow run command. Sarek also provides the option `sentieon_haplotyper_emit_mode` which can be used to set the [emit-mode](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm) of Sentieon's haplotyper. Sentieon's haplotyper can output both a vcf-file and a gvcf-file in the same run; this is achieved by setting `sentieon_haplotyper_emit_mode` to `,gvcf`, where `` is `variant`, `confident` or `all`. -To use Sentieon's function `GVCFtyper` along with Sention's version of VQSR (`VarCal` and `ApplyVarCal`) for joint-germline genotyping, specify `sentieon_haplotyper` as one of the tools, set the option `sentieon_haplotyper_emit_mode` to `gvcf`, and add the option `joint_germline`. 
This can, for example, be done by adding `--tools sentieon_haplotyper --joint_germline --sentieon_haplotyper_emit_mode gvcf` to the nextflow run command. +To use Sentieon's function `GVCFtyper` along with Sention's version of VQSR (`VarCal` and `ApplyVarCal`) for joint-germline genotyping, specify `sentieon_haplotyper` as one of the tools, set the option `sentieon_haplotyper_emit_mode` to `gvcf`, and add the option `joint_germline`. This can, for example, be done by adding `--tools sentieon_haplotyper --joint_germline --sentieon_haplotyper_emit_mode gvcf` to the nextflow run command. If `sentieon_dnascope` is chosen instead of `sentieon_haplotyper`, then Sention's version of VQSR is skipped, as recommended by Sentieon. ### Joint germline variant calling diff --git a/modules.json b/modules.json index 84b30a76dc..06ff0b6dab 100644 --- a/modules.json +++ b/modules.json @@ -388,6 +388,16 @@ "git_sha": "915a0b16ba3e40ef59e7b44843b3118e17a9c906", "installed_by": ["modules"] }, + "sentieon/dnamodelapply": { + "branch": "master", + "git_sha": "43ef68091a1188fd8dc4c03f9341b556803c7514", + "installed_by": ["modules"] + }, + "sentieon/dnascope": { + "branch": "master", + "git_sha": "4fb6fdc8046ec09cd30f92a2a252e9a0ba4a6309", + "installed_by": ["modules"] + }, "sentieon/gvcftyper": { "branch": "master", "git_sha": "6c9c11ee96796e53a01b4719286acce6af14bc3a", diff --git a/modules/nf-core/sentieon/dnamodelapply/main.nf b/modules/nf-core/sentieon/dnamodelapply/main.nf new file mode 100644 index 0000000000..3fe9a28f19 --- /dev/null +++ b/modules/nf-core/sentieon/dnamodelapply/main.nf @@ -0,0 +1,81 @@ +process SENTIEON_DNAMODELAPPLY { + tag "$meta.id" + label 'process_high' + label 'sentieon' + + secret 'SENTIEON_LICENSE_BASE64' + + container 'nf-core/sentieon:202112.06' + + input: + tuple val(meta), path(vcf), path(idx) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(ml_model) + + output: + tuple val(meta), path("*.vcf.gz") , emit: vcf + tuple 
val(meta), path("*.vcf.gz.tbi"), emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sentieon modules do not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: '' + def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: '' + + """ + if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url. + export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d) + else # Localhost license file + # The license file is stored as a nextflow variable like, for instance, this: + # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat | base64 -w 0) + export SENTIEON_LICENSE=\$(mktemp) + echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE + fi + + if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then + # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is mostly likely being run with some test-license. 
+ export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d) + export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d) + echo "Decoded and exported Sentieon test-license system environment variables" + fi + + sentieon driver \\ + -t $task.cpus \\ + -r $fasta \\ + $args \\ + --algo DNAModelApply \\ + --model $ml_model \\ + -v $vcf \\ + ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sentieon modules do not support Conda. Please use Docker / Singularity / Podman instead." + } + """ + touch ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/sentieon/dnamodelapply/meta.yml b/modules/nf-core/sentieon/dnamodelapply/meta.yml new file mode 100644 index 0000000000..ec429bea21 --- /dev/null +++ b/modules/nf-core/sentieon/dnamodelapply/meta.yml @@ -0,0 +1,78 @@ +name: "sentieon_dnamodelapply" +description: modifies the input VCF file by adding the MLrejected FILTER to the variants +keywords: + - dnamodelapply + - vcf + - filter + - sentieon +tools: + - sentieon: + description: | + Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads. + Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system. 
+ homepage: https://www.sentieon.com/ + documentation: https://www.sentieon.com/ + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'test' ]` + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'test' ]` + - meta4: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'test' ]` + - vcf: + type: file + description: INPUT VCF file + pattern: "*.{vcf,vcf.gz}" + - idx: + type: file + description: Index of the input VCF file + pattern: "*.{tbi}" + - fasta: + type: file + description: Genome fasta file + pattern: "*.{fa,fasta}" + - fai: + type: file + description: Index of the genome fasta file + pattern: "*.fai" + - ml_model: + type: file + description: machine learning model file + pattern: "*.model" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: INPUT VCF file + pattern: "*.{vcf,vcf.gz}" + - index: + type: file + description: Index of the input VCF file + pattern: "*.{tbi}" + +authors: + - "@ramprasadn" diff --git a/modules/nf-core/sentieon/dnascope/main.nf b/modules/nf-core/sentieon/dnascope/main.nf new file mode 100644 index 0000000000..6be42a1728 --- /dev/null +++ b/modules/nf-core/sentieon/dnascope/main.nf @@ -0,0 +1,100 @@ +process SENTIEON_DNASCOPE { + tag "$meta.id" + label 'process_high' + label 'sentieon' + + secret 'SENTIEON_LICENSE_BASE64' + + container 'nf-core/sentieon:202112.06' + + input: + tuple val(meta), path(bam), path(bai), path(intervals) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(dbsnp) + tuple val(meta5), path(dbsnp_tbi) + tuple val(meta6), path(ml_model) + val(pcr_indel_model) + val(emit_vcf) + val(emit_gvcf) + + output: + tuple val(meta), path("*.unfiltered.vcf.gz") , optional:true, emit: vcf // added the substring ".unfiltered" in the filename of the vcf-files since without that the g.vcf.gz-files were ending up in the vcf-channel + tuple val(meta), path("*.unfiltered.vcf.gz.tbi"), optional:true, emit: vcf_tbi + tuple val(meta), path("*.g.vcf.gz") , optional:true, emit: gvcf // these output-files have to have the extension ".vcf.gz", otherwise the subsequent GATK-MergeVCFs will fail. + tuple val(meta), path("*.g.vcf.gz.tbi") , optional:true, emit: gvcf_tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sentieon modules do not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + def args = task.ext.args ?: '' // options for the driver + def args2 = task.ext.args2 ?: '' // options for the vcf generation + def args3 = task.ext.args3 ?: '' // options for the gvcf generation + def interval = intervals ? "--interval ${intervals}" : '' + def dbsnp_cmd = dbsnp ? "-d ${dbsnp}" : '' + def model_cmd = ml_model ? " --model ${ml_model}" : '' + def pcr_indel_model_cmd = pcr_indel_model ? " --pcr_indel_model ${pcr_indel_model}" : '' + def prefix = task.ext.prefix ?: "${meta.id}" + def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: '' + def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: '' + def vcf_cmd = "" + def gvcf_cmd = "" + def base_cmd = '--algo DNAscope ' + dbsnp_cmd + ' ' + + if (emit_vcf) { // emit_vcf can be the empty string, 'variant', 'confident' or 'all' but NOT 'gvcf' + vcf_cmd = base_cmd + args2 + ' ' + model_cmd + pcr_indel_model_cmd + ' --emit_mode ' + emit_vcf + ' ' + prefix + '.unfiltered.vcf.gz' + } + + if (emit_gvcf) { // emit_gvcf can be either true or false + gvcf_cmd = base_cmd + args3 + ' ' + model_cmd + pcr_indel_model_cmd + ' --emit_mode gvcf ' + prefix + '.g.vcf.gz' + } + + """ + if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url. + export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d) + else # Localhost license file + # The license file is stored as a nextflow variable like, for instance, this: + # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat | base64 -w 0) + export SENTIEON_LICENSE=\$(mktemp) + echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE + fi + + if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then + # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is mostly likely being run with some test-license. 
+ export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d) + export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d) + echo "Decoded and exported Sentieon test-license system environment variables" + fi + + sentieon driver $args -r $fasta -t $task.cpus -i $bam $interval $vcf_cmd $gvcf_cmd + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sentieon modules do not support Conda. Please use Docker / Singularity / Podman instead." + } + """ + touch ${prefix}.unfiltered.vcf.gz + touch ${prefix}.unfiltered.vcf.gz.tbi + touch ${prefix}.g.vcf.gz + touch ${prefix}.g.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/sentieon/dnascope/meta.yml b/modules/nf-core/sentieon/dnascope/meta.yml new file mode 100644 index 0000000000..34e0b97b4c --- /dev/null +++ b/modules/nf-core/sentieon/dnascope/meta.yml @@ -0,0 +1,119 @@ +name: sentieon_dnascope +description: DNAscope algorithm performs an improved version of Haplotype variant calling. +keywords: + - dnascope + - sentieon + - variant_calling +tools: + - sentieon: + description: | + Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads. + Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system. 
+ homepage: https://www.sentieon.com/ + documentation: https://www.sentieon.com/ +input: + - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file. + pattern: "*.bam" + - bai: + type: file + description: BAI file + pattern: "*.bai" + - intervals: + type: file + description: bed or interval_list file containing interval in the reference that will be used in the analysis + pattern: "*.{bed,interval_list}" + - meta2: + type: map + description: | + Groovy Map containing meta information for fasta. + - fasta: + type: file + description: Genome fasta file + pattern: "*.{fa,fasta}" + - meta3: + type: map + description: | + Groovy Map containing meta information for fasta index. + - fai: + type: file + description: Index of the genome fasta file + pattern: "*.fai" + - meta4: + type: map + description: | + Groovy Map containing meta information for dbsnp. + - dbsnp: + type: file + description: Single Nucleotide Polymorphism database (dbSNP) file + pattern: "*.vcf.gz" + - meta5: + type: map + description: | + Groovy Map containing meta information for dbsnp_tbi. + - dbsnp_tbi: + type: file + description: Index of the Single Nucleotide Polymorphism database (dbSNP) file + pattern: "*.vcf.gz.tbi" + - meta6: + type: map + description: | + Groovy Map containing meta information for machine learning model for Dnascope. + - ml_model: + type: file + description: machine learning model file + pattern: "*.model" + - ml_model: + type: file + description: machine learning model file + pattern: "*.model" + - pcr_indel_model: + type: string + description: | + Controls the option pcr_indel_model for Dnascope. + The possible options are "NONE" (used for PCR free samples), and "HOSTILE", "AGGRESSIVE" and "CONSERVATIVE". + See Sentieons documentation for further explanation. + - emit_vcf: + type: string + description: | + Controls the vcf output from Dnascope. 
+ Possible options are "all", "confident" and "variant". + See Sentieon's documentation for further explanation. + - emit_gvcf: + type: boolean + description: If true, DNAscope will output a gVCF +output: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: Compressed VCF file + pattern: "*.unfiltered.vcf.gz" + - vcf_tbi: + type: file + description: Index of VCF file + pattern: "*.unfiltered.vcf.gz.tbi" + - gvcf: + type: file + description: Compressed GVCF file + pattern: "*.g.vcf.gz" + - gvcf_tbi: + type: file + description: Index of GVCF file + pattern: "*.g.vcf.gz.tbi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@ramprasadn" diff --git a/nextflow.config b/nextflow.config index ccf3f5068c..56cab933a6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -51,26 +51,29 @@ params { seq_platform = 'ILLUMINA' // Default platform written in read group PL field by aligner // Variant Calling - only_paired_variant_calling = false // if true, skips germline variant calling for normal-paired samples - ascat_ploidy = null // default value for ASCAT - ascat_min_base_qual = 20 // default value for ASCAT - ascat_min_counts = 10 // default value for ASCAT - ascat_min_map_qual = 35 // default value for ASCAT - ascat_purity = null // default value for ASCAT - cf_ploidy = "2" // default value for Control-FREEC - cf_coeff = 0.05 // default value for Control-FREEC - cf_contamination = 0 // default value for Control-FREEC - cf_contamination_adjustment = false // by default we are not using this in Control-FREEC - cf_mincov = 0 // ControlFreec default values - cf_minqual = 0 // ControlFreec default values - cf_window = null // by default we are not using this in Control-FREEC - cnvkit_reference = null // by default the reference is build from the fasta file - concatenate_vcfs = false // by default we
don't concatenate the germline-vcf-files - ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 - wes = false // Set to true, if data is exome/targeted sequencing data. Used to use correct models in various variant callers - joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected - joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling - sentieon_haplotyper_emit_mode = "variant" // default value for Sentieon haplotyper + ascat_ploidy = null // default value for ASCAT + ascat_min_base_qual = 20 // default value for ASCAT + ascat_min_counts = 10 // default value for ASCAT + ascat_min_map_qual = 35 // default value for ASCAT + ascat_purity = null // default value for ASCAT + cf_ploidy = "2" // default value for Control-FREEC + cf_coeff = 0.05 // default value for Control-FREEC + cf_contamination = 0 // default value for Control-FREEC + cf_contamination_adjustment = false // by default we are not using this in Control-FREEC + cf_mincov = 0 // ControlFreec default values + cf_minqual = 0 // ControlFreec default values + cf_window = null // by default we are not using this in Control-FREEC + cnvkit_reference = null // by default the reference is build from the fasta file + concatenate_vcfs = false // by default we don't concatenate the germline-vcf-files + ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 + joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected + joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling + only_paired_variant_calling = false // if true, skips germline variant calling for normal-paired sample + sentieon_dnascope_emit_mode = "variant" // default value for Sentieon dnascope + sentieon_dnascope_model = "https://s3.amazonaws.com/sentieon-release/other/SentieonDNAscopeModel1.1.model" + 
sentieon_dnascope_pcr_indel_model = "CONSERVATIVE" + sentieon_haplotyper_emit_mode = "variant" // default value for Sentieon haplotyper + wes = false // Set to true, if data is exome/targeted sequencing data. Used to use correct models in various variant callers // Annotation dbnsfp = null // No dbnsfp processed file @@ -377,8 +380,10 @@ includeConfig 'conf/modules/manta.config' includeConfig 'conf/modules/mpileup.config' includeConfig 'conf/modules/msisensorpro.config' includeConfig 'conf/modules/mutect2.config' +includeConfig 'conf/modules/sentieon_dnascope.config' +includeConfig 'conf/modules/sentieon_dnascope_joint_germline.config' includeConfig 'conf/modules/sentieon_haplotyper.config' -includeConfig 'conf/modules/sentieon_joint_germline.config' +includeConfig 'conf/modules/sentieon_haplotyper_joint_germline.config' includeConfig 'conf/modules/strelka.config' includeConfig 'conf/modules/tiddit.config' diff --git a/nextflow_schema.json b/nextflow_schema.json index 51850b6fa9..68c6b77146 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -100,14 +100,14 @@ "fa_icon": "fas fa-toolbox", "description": "Tools to use for duplicate marking, variant calling and/or for annotation.", "help_text": "Multiple tools separated with commas.\n\n**Variant Calling:**\n\nGermline variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: DeepVariant, FreeBayes, GATK HaplotypeCaller, mpileup, Sentieon Haplotyper, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit\n\nTumor-only somatic variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, mpileup, Mutect2, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit, ControlFREEC\n\nSomatic variant calling can currently only be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, Mutect2, Strelka2\n- Structural variants: Manta, TIDDIT\n- Copy-Number: ASCAT, CNVKit, Control-FREEC\n- 
Microsatellite Instability: MSIsensorpro\n\n> **NB** Mutect2 for somatic variant calling cannot be combined with `--no_intervals`\n\n**Annotation:**\n \n- snpEff, VEP, merge (both consecutively).\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted when starting from `--step annotate`.", - "pattern": "^((ascat|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|sentieon_haplotyper|manta|merge|mpileup|msisensorpro|mutect2|sentieon_dedup|snpeff|strelka|tiddit|vep)?,?)*(? **NB** `--skip_tools baserecalibrator_report` is actually just not saving the reports.\n> **NB** `--skip_tools markduplicates_report` does not skip `MarkDuplicates` but prevent the collection of duplicate metrics that slows down performance.", - "pattern": "^((baserecalibrator|baserecalibrator_report|bcftools|documentation|fastqc|haplotypecaller_filter|haplotyper_filter|markduplicates|markduplicates_report|mosdepth|multiqc|samtools|vcftools|versions)?,?)*(? 
[ meta.subMap('num_intervals') + [ id:'joint_variant_calling', patient:'all_samples', variantcaller:'sentieon_haplotyper' ], vcf ]}.groupTuple() + gvcf_to_merge = BCFTOOLS_SORT.out.vcf.map{ meta, vcf -> [ meta.subMap('num_intervals') + [ id:'joint_variant_calling', patient:'all_samples', variantcaller:variant_caller ], vcf ]}.groupTuple() // Merge scatter/gather vcfs & index // Rework meta for variantscalled.csv and annotation tools MERGE_GENOTYPEGVCFS(gvcf_to_merge, dict) - vqsr_input = MERGE_GENOTYPEGVCFS.out.vcf.join(MERGE_GENOTYPEGVCFS.out.tbi, failOnDuplicate: true) - indels_resource_label = known_indels_vqsr.mix(dbsnp_vqsr).collect() - snps_resource_label = known_snps_vqsr.mix(dbsnp_vqsr).collect() - - // Recalibrate INDELs and SNPs separately - SENTIEON_VARCAL_INDEL( - vqsr_input, - resource_indels_vcf, - resource_indels_tbi, - indels_resource_label, - fasta, - fai) - - SENTIEON_VARCAL_SNP( - vqsr_input, - resource_snps_vcf, - resource_snps_tbi, - snps_resource_label, - fasta, - fai) - - //Prepare SNPs and INDELs for Sentieon's applyvarcal - // Step 1. : applyvarcal to SNPs - // Step 2. : Use SENTIEON_APPLYVARCAL_SNP output and run ApplyVQSR_INDEL. 
This avoids duplicate entries in the vcf as described here: https://hpc.nih.gov/training/gatk_tutorial/vqsr.html - - // Join results of variant recalibration into a single channel tuple - // Rework meta for variantscalled.csv and annotation tools - vqsr_input_snp = vqsr_input.join(SENTIEON_VARCAL_SNP.out.recal, failOnDuplicate: true) - .join(SENTIEON_VARCAL_SNP.out.idx, failOnDuplicate: true) - .join(SENTIEON_VARCAL_SNP.out.tranches, failOnDuplicate: true) - .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] } - - SENTIEON_APPLYVARCAL_SNP( - vqsr_input_snp, - fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, - fai.map{ fai -> [ [ id:fai.baseName ], fai ] }) - - // Join results of SENTIEON_APPLYVARCAL_SNP and use as input for SENTIEON_APPLYVARCAL_INDEL to avoid duplicate entries in the result - // Rework meta for variantscalled.csv and annotation tools - vqsr_input_indel = SENTIEON_APPLYVARCAL_SNP.out.vcf.join(SENTIEON_APPLYVARCAL_SNP.out.tbi).map{ meta, vcf, tbi -> [ meta + [ id:'joint_variant_calling' ], vcf, tbi ]} - .join(SENTIEON_VARCAL_INDEL.out.recal, failOnDuplicate: true) - .join(SENTIEON_VARCAL_INDEL.out.idx, failOnDuplicate: true) - .join(SENTIEON_VARCAL_INDEL.out.tranches, failOnDuplicate: true) - .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] } - - SENTIEON_APPLYVARCAL_INDEL( - vqsr_input_indel, - fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, - fai.map{ fai -> [ [ id:fai.baseName ], fai ] }) - - // The following is an ugly monster to achieve the following: - // When MERGE_GENOTYPEGVCFS and SENTIEON_APPLYVARCAL are run, then use output from SENTIEON_APPLYVARCAL - // When MERGE_GENOTYPEGVCFS and NOT SENTIEON_APPLYVARCAL, then use the output from MERGE_GENOTYPEGVCFS - - merge_vcf_for_join = MERGE_GENOTYPEGVCFS.out.vcf.map{meta, vcf -> [[id: 'joint_variant_calling'] , 
vcf]} - merge_tbi_for_join = MERGE_GENOTYPEGVCFS.out.tbi.map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]} - - // Remap for both to have the same key, if ApplyBQSR is not run, the channel is empty --> populate with empty elements - vqsr_vcf_for_join = SENTIEON_APPLYVARCAL_INDEL.out.vcf.ifEmpty([[:], []]).map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]} - vqsr_tbi_for_join = SENTIEON_APPLYVARCAL_INDEL.out.tbi.ifEmpty([[:], []]).map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]} - - // Join on metamap - // If both --> meta, vcf_merged, vcf_bqsr - // If not VQSR --> meta, vcf_merged, [] - // if the second is empty, use the first - genotype_vcf = merge_vcf_for_join.join(vqsr_vcf_for_join, remainder: true).map{ - meta, joint_vcf, recal_vcf -> - - vcf_out = recal_vcf ?: joint_vcf - - [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"sentieon_haplotyper"], vcf_out] - } - - genotype_index = merge_tbi_for_join.join(vqsr_tbi_for_join, remainder: true).map{ - meta, joint_tbi, recal_tbi -> - - tbi_out = recal_tbi ?: joint_tbi - [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"sentieon_haplotyper"], tbi_out] + merged_vcf = MERGE_GENOTYPEGVCFS.out.vcf.map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]} + merged_tbi = MERGE_GENOTYPEGVCFS.out.tbi.map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]} + + if (variant_caller == 'sentieon_dnascope') { + // As advised by Don Freed (Sentieon), VQSR is skipped for DnaScope + genotype_vcf = merged_vcf.map{ + meta, vcf -> [ meta + [ patient:"all_samples", variantcaller:'sentieon_dnascope'], vcf ] + } + genotype_index = merged_tbi.map{ + meta, tbi -> [ meta + [ patient:"all_samples", variantcaller:'sentieon_dnascope'], tbi ] + } + } else { + vqsr_input = MERGE_GENOTYPEGVCFS.out.vcf.join(MERGE_GENOTYPEGVCFS.out.tbi, failOnDuplicate: true) + indels_resource_label = known_indels_vqsr.mix(dbsnp_vqsr).collect() + snps_resource_label = known_snps_vqsr.mix(dbsnp_vqsr).collect() + + // 
Recalibrate INDELs and SNPs separately + SENTIEON_VARCAL_INDEL( + vqsr_input, + resource_indels_vcf, + resource_indels_tbi, + indels_resource_label, + fasta, + fai) + + SENTIEON_VARCAL_SNP( + vqsr_input, + resource_snps_vcf, + resource_snps_tbi, + snps_resource_label, + fasta, + fai) + + //Prepare SNPs and INDELs for Sentieon's applyvarcal + // Step 1. : applyvarcal to SNPs + // Step 2. : Use SENTIEON_APPLYVARCAL_SNP output and run ApplyVQSR_INDEL. This avoids duplicate entries in the vcf as described here: https://hpc.nih.gov/training/gatk_tutorial/vqsr.html + + // Join results of variant recalibration into a single channel tuple + // Rework meta for variantscalled.csv and annotation tools + vqsr_input_snp = vqsr_input.join(SENTIEON_VARCAL_SNP.out.recal, failOnDuplicate: true) + .join(SENTIEON_VARCAL_SNP.out.idx, failOnDuplicate: true) + .join(SENTIEON_VARCAL_SNP.out.tranches, failOnDuplicate: true) + .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] } + + SENTIEON_APPLYVARCAL_SNP( + vqsr_input_snp, + fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, + fai.map{ fai -> [ [ id:fai.baseName ], fai ] }) + + // Join results of SENTIEON_APPLYVARCAL_SNP and use as input for SENTIEON_APPLYVARCAL_INDEL to avoid duplicate entries in the result + // Rework meta for variantscalled.csv and annotation tools + vqsr_input_indel = SENTIEON_APPLYVARCAL_SNP.out.vcf.join(SENTIEON_APPLYVARCAL_SNP.out.tbi).map{ meta, vcf, tbi -> [ meta + [ id:'joint_variant_calling' ], vcf, tbi ]} + .join(SENTIEON_VARCAL_INDEL.out.recal, failOnDuplicate: true) + .join(SENTIEON_VARCAL_INDEL.out.idx, failOnDuplicate: true) + .join(SENTIEON_VARCAL_INDEL.out.tranches, failOnDuplicate: true) + .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] } + + SENTIEON_APPLYVARCAL_INDEL( + vqsr_input_indel, + fasta.map{ fasta -> [ [ 
id:fasta.baseName ], fasta ] }, + fai.map{ fai -> [ [ id:fai.baseName ], fai ] }) + + // The following is an ugly monster to achieve the following: + // When MERGE_GENOTYPEGVCFS and SENTIEON_APPLYVARCAL are run, then use output from SENTIEON_APPLYVARCAL + // When MERGE_GENOTYPEGVCFS and NOT SENTIEON_APPLYVARCAL, then use the output from MERGE_GENOTYPEGVCFS + + // Remap for both to have the same key, if ApplyBQSR is not run, the channel is empty --> populate with empty elements + vqsr_vcf_for_join = SENTIEON_APPLYVARCAL_INDEL.out.vcf.ifEmpty([[:], []]).map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]} + vqsr_tbi_for_join = SENTIEON_APPLYVARCAL_INDEL.out.tbi.ifEmpty([[:], []]).map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]} + + // Join on metamap + // If both --> meta, vcf_merged, vcf_bqsr + // If not VQSR --> meta, vcf_merged, [] + // if the second is empty, use the first + genotype_vcf = merged_vcf.join(vqsr_vcf_for_join, remainder: true).map{ + meta, joint_vcf, recal_vcf -> + + vcf_out = recal_vcf ?: joint_vcf + + [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"sentieon_haplotyper"], vcf_out] + } + + genotype_index = merged_tbi.join(vqsr_tbi_for_join, remainder: true).map{ + meta, joint_tbi, recal_tbi -> + + tbi_out = recal_tbi ?: joint_tbi + [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"sentieon_haplotyper"], tbi_out] + } + + versions = versions.mix(SENTIEON_VARCAL_SNP.out.versions) + versions = versions.mix(SENTIEON_VARCAL_INDEL.out.versions) + versions = versions.mix(SENTIEON_APPLYVARCAL_INDEL.out.versions) } versions = versions.mix(SENTIEON_GVCFTYPER.out.versions) - versions = versions.mix(SENTIEON_VARCAL_SNP.out.versions) - versions = versions.mix(SENTIEON_VARCAL_INDEL.out.versions) - versions = versions.mix(SENTIEON_APPLYVARCAL_INDEL.out.versions) emit: genotype_index // channel: [ val(meta), [ tbi ] ] diff --git a/subworkflows/local/bam_variant_calling_germline_all/main.nf 
b/subworkflows/local/bam_variant_calling_germline_all/main.nf index 666c7c7b6b..5989023adf 100644 --- a/subworkflows/local/bam_variant_calling_germline_all/main.nf +++ b/subworkflows/local/bam_variant_calling_germline_all/main.nf @@ -9,12 +9,16 @@ include { BAM_VARIANT_CALLING_DEEPVARIANT } from '../bam_variant_calling include { BAM_VARIANT_CALLING_FREEBAYES } from '../bam_variant_calling_freebayes/main' include { BAM_VARIANT_CALLING_GERMLINE_MANTA } from '../bam_variant_calling_germline_manta/main' include { BAM_VARIANT_CALLING_HAPLOTYPECALLER } from '../bam_variant_calling_haplotypecaller/main' +include { BAM_VARIANT_CALLING_SENTIEON_DNASCOPE } from '../bam_variant_calling_sentieon_dnascope/main' include { BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER } from '../bam_variant_calling_sentieon_haplotyper/main' include { BAM_VARIANT_CALLING_MPILEUP } from '../bam_variant_calling_mpileup/main' include { BAM_VARIANT_CALLING_SINGLE_STRELKA } from '../bam_variant_calling_single_strelka/main' include { BAM_VARIANT_CALLING_SINGLE_TIDDIT } from '../bam_variant_calling_single_tiddit/main' +include { SENTIEON_DNAMODELAPPLY } from '../../../modules/nf-core/sentieon/dnamodelapply/main' include { VCF_VARIANT_FILTERING_GATK } from '../vcf_variant_filtering_gatk/main' + + workflow BAM_VARIANT_CALLING_GERMLINE_ALL { take: tools // Mandatory, list of tools to apply @@ -41,18 +45,24 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { joint_germline // boolean: [mandatory] [default: false] joint calling of germline variants skip_haplotypecaller_filter // boolean: [mandatory] [default: false] whether to filter haplotypecaller single sample vcfs sentieon_haplotyper_emit_mode // channel: [mandatory] value channel with string + sentieon_dnascope_emit_mode // channel: [mandatory] value channel with string + sentieon_dnascope_pcr_indel_model // channel: [mandatory] value channel with string + sentieon_dnascope_model // channel: [mandatory] value channel with string main: versions = Channel.empty() 
//TODO: Temporary until the if's can be removed and printing to terminal is prevented with "when" in the modules.config + gvcf_sentieon_dnascope = Channel.empty() + gvcf_sentieon_haplotyper = Channel.empty() + vcf_deepvariant = Channel.empty() vcf_freebayes = Channel.empty() vcf_haplotypecaller = Channel.empty() vcf_manta = Channel.empty() vcf_mpileup = Channel.empty() + vcf_sentieon_dnascope = Channel.empty() vcf_sentieon_haplotyper = Channel.empty() - gvcf_sentieon_haplotyper = Channel.empty() vcf_strelka = Channel.empty() vcf_tiddit = Channel.empty() @@ -180,6 +190,66 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { versions = versions.mix(BAM_VARIANT_CALLING_GERMLINE_MANTA.out.versions) } + // SENTIEON DNASCOPE + if (tools.split(',').contains('sentieon_dnascope')) { + BAM_VARIANT_CALLING_SENTIEON_DNASCOPE( + cram, + fasta, + fasta_fai, + dict, + dbsnp, + dbsnp_tbi, + dbsnp_vqsr, + intervals, + joint_germline, + sentieon_dnascope_emit_mode, + sentieon_dnascope_pcr_indel_model, + sentieon_dnascope_model) + + versions = versions.mix(BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.versions) + + vcf_sentieon_dnascope = BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.vcf + vcf_tbi_sentieon_dnascope = BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.vcf_tbi + gvcf_sentieon_dnascope = BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.gvcf + gvcf_tbi_sentieon_dnascope = BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.gvcf_tbi + + if (joint_germline) { + BAM_JOINT_CALLING_GERMLINE_SENTIEON( + BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.genotype_intervals, + fasta, + fasta_fai, + dict, + dbsnp, + dbsnp_tbi, + dbsnp_vqsr, + known_sites_indels, + known_sites_indels_tbi, + known_indels_vqsr, + known_sites_snps, + known_sites_snps_tbi, + known_snps_vqsr, + 'sentieon_dnascope') + + vcf_sentieon_dnascope = BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.genotype_vcf + versions = versions.mix(BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.versions) + } else { + // If single sample track, check if filtering should be done + if 
(!(skip_tools && skip_tools.split(',').contains('dnascope_filter'))) { + + SENTIEON_DNAMODELAPPLY( + vcf_sentieon_dnascope.join(vcf_tbi_sentieon_dnascope, failOnDuplicate: true, failOnMismatch: true), + fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, + fasta_fai.map{ fai -> [ [ id:fai.baseName ], fai ] }, + sentieon_dnascope_model.map{ model -> [ [ id:model.baseName ], model ] }) + + vcf_sentieon_dnascope = SENTIEON_DNAMODELAPPLY.out.vcf + versions = versions.mix(SENTIEON_DNAMODELAPPLY.out.versions) + + } + + } + } + // SENTIEON HAPLOTYPER if (tools.split(',').contains('sentieon_haplotyper')) { BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER( @@ -215,7 +285,8 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { known_indels_vqsr, known_sites_snps, known_sites_snps_tbi, - known_snps_vqsr) + known_snps_vqsr, + 'sentieon_haplotyper') vcf_sentieon_haplotyper = BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.genotype_vcf versions = versions.mix(BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.versions) @@ -270,6 +341,7 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { vcf_all = Channel.empty().mix( vcf_deepvariant, vcf_freebayes, + vcf_sentieon_dnascope, vcf_haplotypecaller, vcf_manta, vcf_mpileup, @@ -279,6 +351,8 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { ) emit: + gvcf_sentieon_dnascope + gvcf_sentieon_haplotyper vcf_all vcf_deepvariant vcf_freebayes @@ -286,8 +360,8 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { vcf_manta vcf_mpileup vcf_strelka + vcf_sentieon_dnascope vcf_sentieon_haplotyper - gvcf_sentieon_haplotyper vcf_tiddit versions diff --git a/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf b/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf new file mode 100644 index 0000000000..9eea9b2d61 --- /dev/null +++ b/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf @@ -0,0 +1,157 @@ +// +// SENTIEON DNASCOPE germline variant calling +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to
determine if the module should be run + +include { GATK4_MERGEVCFS as MERGE_SENTIEON_DNASCOPE_GVCFS } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { GATK4_MERGEVCFS as MERGE_SENTIEON_DNASCOPE_VCFS } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { SENTIEON_DNASCOPE } from '../../../modules/nf-core/sentieon/dnascope/main' + +workflow BAM_VARIANT_CALLING_SENTIEON_DNASCOPE { + take: + cram // channel: [mandatory] [ meta, cram, crai, interval.bed ] + fasta // channel: [mandatory] + fasta_fai // channel: [mandatory] + dict // channel: [mandatory] + dbsnp // channel: [optional] + dbsnp_tbi // channel: [optional] + dbsnp_vqsr // channel: [optional] + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + joint_germline // boolean: [mandatory] [default: false] joint calling of germline variants + sentieon_dnascope_emit_mode // string + sentieon_dnascope_pcr_indel_model // string + sentieon_dnascope_model // channel + + main: + versions = Channel.empty() + + gvcf = Channel.empty() + vcf = Channel.empty() + genotype_intervals = Channel.empty() + + // Combine cram and intervals for spread and gather strategy + cram_intervals_for_sentieon = cram.combine(intervals) + // Move num_intervals to meta map + .map{ meta, cram, crai, intervals, num_intervals -> [ + meta + [ + num_intervals:num_intervals, + intervals_name:intervals.simpleName, + variantcaller:'sentieon_dnascope'], + cram, + crai, + intervals + ] + } + + emit_mode_items = sentieon_dnascope_emit_mode.split(',').collect{ it -> it.toLowerCase().trim() } // collect (not each) so the lower-cased, trimmed values are kept + lst = emit_mode_items - 'gvcf' + emit_vcf = lst.size() > 0 ? 
lst[0] : '' + + SENTIEON_DNASCOPE( + cram_intervals_for_sentieon, + fasta.map{it -> [[:], it]}, + fasta_fai.map{it -> [[:], it]}, + dbsnp.map{it -> [[:], it]}, + dbsnp_tbi.map{it -> [[:], it]}, + sentieon_dnascope_model.map{it -> [[:], it]}, + sentieon_dnascope_pcr_indel_model, + emit_vcf, + emit_mode_items.any{ it.equals('gvcf') }) + + if (joint_germline) { + genotype_intervals = SENTIEON_DNASCOPE.out.gvcf + .join(SENTIEON_DNASCOPE.out.gvcf_tbi, failOnMismatch: true) + .join(cram_intervals_for_sentieon, failOnMismatch: true) + .map{ meta, gvcf, tbi, cram, crai, intervals -> [ meta, gvcf, tbi, intervals ] } + } + + // Figure out if using intervals or no_intervals + dnascope_vcf_branch = SENTIEON_DNASCOPE.out.vcf.map{ + meta, vcf -> [ meta - meta.subMap('interval_name'), vcf] + } + .branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + dnascope_vcf_tbi_branch = SENTIEON_DNASCOPE.out.vcf_tbi.map{ + meta, vcf_tbi -> [ meta - meta.subMap('interval_name'), vcf_tbi] + } + .branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + haplotyper_gvcf_branch = SENTIEON_DNASCOPE.out.gvcf.map{ + meta, gvcf -> [ meta - meta.subMap('interval_name'), gvcf] + } + .branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + haplotyper_gvcf_tbi_branch = SENTIEON_DNASCOPE.out.gvcf_tbi.map{ + meta, gvcf_tbi -> [ meta - meta.subMap('interval_name'), gvcf_tbi] + } + .branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + vcfs_for_merging = dnascope_vcf_branch.intervals.map{ + meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ]} + + vcfs_for_merging = vcfs_for_merging.map{ + meta, vcf -> [ + meta - meta.subMap('intervals_name'), + vcf]}.groupTuple() + + // VCFs + // Only when using intervals + MERGE_SENTIEON_DNASCOPE_VCFS(vcfs_for_merging, dict) + + dnascope_vcf = Channel.empty().mix( + MERGE_SENTIEON_DNASCOPE_VCFS.out.vcf, + 
dnascope_vcf_branch.no_intervals) + + dnascope_tbi = Channel.empty().mix( + MERGE_SENTIEON_DNASCOPE_VCFS.out.tbi, + dnascope_vcf_tbi_branch.no_intervals) + + // Remove no longer necessary field: num_intervals + vcf = dnascope_vcf.map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ] } + vcf_tbi = dnascope_tbi.map{ meta, tbi -> [ meta - meta.subMap('num_intervals'), tbi ] } + + // GVCFs + // Only when using intervals + gvcfs_for_merging = haplotyper_gvcf_branch.intervals.map{ + meta, vcf -> [groupKey(meta, meta.num_intervals), vcf]} + + gvcfs_for_merging = gvcfs_for_merging.map{ + meta, vcf -> [ meta - meta.subMap('intervals_name'), vcf ] + }.groupTuple() + + MERGE_SENTIEON_DNASCOPE_GVCFS(gvcfs_for_merging, dict) + + gvcf = Channel.empty().mix( + MERGE_SENTIEON_DNASCOPE_GVCFS.out.vcf, + haplotyper_gvcf_branch.no_intervals) + + gvcf_tbi = Channel.empty().mix( + MERGE_SENTIEON_DNASCOPE_GVCFS.out.tbi, + haplotyper_gvcf_tbi_branch.no_intervals) + + versions = versions.mix(SENTIEON_DNASCOPE.out.versions) + versions = versions.mix(MERGE_SENTIEON_DNASCOPE_VCFS.out.versions) + versions = versions.mix(MERGE_SENTIEON_DNASCOPE_GVCFS.out.versions) + + emit: + versions + vcf + vcf_tbi + gvcf + gvcf_tbi + genotype_intervals // For joint genotyping + +} diff --git a/subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf b/subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf index cf9bcf9e21..4b280d271c 100644 --- a/subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf +++ b/subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf @@ -42,7 +42,8 @@ workflow BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER { ] } - emit_mode_items = sentieon_haplotyper_emit_mode.split(',') + + emit_mode_items = sentieon_haplotyper_emit_mode.split(',').collect{ it -> it.toLowerCase().trim() } lst = emit_mode_items - 'gvcf' emit_vcf = lst.size() > 0 ? 
lst[0] : '' @@ -53,7 +54,7 @@ workflow BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER { dbsnp, dbsnp_tbi, emit_vcf, - emit_mode_items.contains('gvcf')) + emit_mode_items.any{ it.equals('gvcf') }) if (joint_germline) { genotype_intervals = SENTIEON_HAPLOTYPER.out.gvcf diff --git a/tests/config/tags.yml b/tests/config/tags.yml index 362bc525b9..4fabc3a7d5 100644 --- a/tests/config/tags.yml +++ b/tests/config/tags.yml @@ -314,6 +314,31 @@ haplotypecaller_skip_filter: - tests/csv/3.0/mapped_single_bam.csv - tests/test_haplotypecaller_skip_filter.yml +## sentieon/dnascope +sentieon/dnascope: + - conf/modules/sentieon_dnascope.config + - modules/nf-core/sentieon/dnascope/main.nf + - modules/nf-core/gatk4/mergevcfs/main.nf + - modules/nf-core/samtools/index/main.nf + - modules/nf-core/samtools/merge/main.nf + - subworkflows/local/bam_merge_index_samtools/main.nf + - subworkflows/local/bam_variant_calling_germline_all/main.nf + - subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf + - tests/csv/3.0/mapped_single_bam.csv + - tests/test_sentieon_dnascope.yml + +sentieon_dnascope_skip_filter: + - conf/modules/sentieon_dnascope.config + - modules/nf-core/sentieon/dnascope/main.nf + - modules/nf-core/gatk4/mergevcfs/main.nf + - modules/nf-core/samtools/index/main.nf + - modules/nf-core/samtools/merge/main.nf + - subworkflows/local/bam_merge_index_samtools/main.nf + - subworkflows/local/bam_variant_calling_germline_all/main.nf + - subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf + - tests/csv/3.0/mapped_single_bam.csv + - tests/test_sentieon_dnascope_skip_filter.yml + ## sentieon/haplotyper sentieon/haplotyper: - conf/modules/sentieon_haplotyper.config @@ -364,16 +389,27 @@ joint_germline: - tests/csv/3.0/mapped_joint_bam.csv - tests/test_joint_germline.yml -## sentieon_joint_germline -sentieon_joint_germline: +## sentieon_dnascope_joint_germline +sentieon_dnascope_joint_germline: + - conf/modules/prepare_genome.config + - 
conf/modules/sentieon_dnascope.config + - conf/modules/sentieon_dnascope_joint_germline.config + - modules/nf-core/sentieon/dnascope/main.nf + - subworkflows/local/bam_variant_calling_germline_all/main.nf + - subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf + - tests/csv/3.0/mapped_joint_bam.csv + - tests/test_sentieon_dnascope_joint_germline.yml + +## sentieon_haplotyper_joint_germline +sentieon_haplotyper_joint_germline: - conf/modules/prepare_genome.config - conf/modules/sentieon_haplotyper.config - - conf/modules/sentieon_joint_germline.config + - conf/modules/sentieon_haplotyper_joint_germline.config - modules/nf-core/sentieon/haplotyper/main.nf - subworkflows/local/bam_variant_calling_germline_all/main.nf - subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf - tests/csv/3.0/mapped_joint_bam.csv - - tests/test_sentieon_joint_germline.yml + - tests/test_sentieon_haplotyper_joint_germline.yml ## manta manta: diff --git a/tests/test_gatk4_spark.yml b/tests/test_gatk4_spark.yml index 32082446ec..8dbc8fb974 100644 --- a/tests/test_gatk4_spark.yml +++ b/tests/test_gatk4_spark.yml @@ -48,7 +48,7 @@ # conda changes md5sums for test - path: results/preprocessing/mapped/ should_exist: false -- name: Run default pipeline with gatk4_spark & skipping all QC steps +- name: Run default pipeline with gatk4_spark and skipping all QC steps command: nextflow run main.nf -profile test_cache,use_gatk_spark --skip_tools fastqc,markduplicates_report,mosdepth,multiqc,samtools --outdir results tags: - gatk4_spark diff --git a/tests/test_sentieon_dnascope.yml b/tests/test_sentieon_dnascope.yml new file mode 100644 index 0000000000..f51e0bca72 --- /dev/null +++ b/tests/test_sentieon_dnascope.yml @@ -0,0 +1,147 @@ +- name: Run variant calling on germline sample with sentieons dnascope + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope
--step variant_calling --outdir results + tags: + - germline + - sentieon/dnascope + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: b2144d21a0ebfd807a8646f1751d0ddc + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt + md5sum: 912c7d5b31784c50e0a75b4fcfa4997b + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary + md5sum: e67b24d296810a075378e5864bcea0fa + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files.
+ - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/dnascope + should_exist: false +- name: Run variant calling on germline sample with sentieons dnascope without intervals + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --no_intervals --outdir results + tags: + - germline + - sentieon/dnascope + - no_intervals + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: b2144d21a0ebfd807a8646f1751d0ddc + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt + md5sum: 912c7d5b31784c50e0a75b4fcfa4997b + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary + md5sum: e67b24d296810a075378e5864bcea0fa + - 
path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files. + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/sentieon_dnascope + should_exist: false +- name: Run variant calling on germline sample with sentieons dnascope output gvcf + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results --sentieon_dnascope_emit_mode gvcf + tags: + - germline + - sentieon/dnascope + - variant_calling + files: + - path: results/csv/variantcalled.csv + should_exist: false + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt + should_exist: false + - path:
results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + should_exist: false + - path: results/dnascope + should_exist: false +- name: Run variant calling on germline sample with sentieons dnascope output both gvcf and vcf + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results --sentieon_dnascope_emit_mode variant,gvcf + tags: + - germline + - sentieon/dnascope + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: b2144d21a0ebfd807a8646f1751d0ddc + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: 
results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + - path: results/dnascope + should_exist: false diff --git a/tests/test_sentieon_dnascope_joint_germline.yml b/tests/test_sentieon_dnascope_joint_germline.yml new file mode 100644 index 0000000000..e905b9cd53 --- /dev/null +++ b/tests/test_sentieon_dnascope_joint_germline.yml @@ -0,0 +1,68 @@ +- name: Run joint germline variant calling with sentieon dnascope + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf + tags: + - germline + - sentieon_dnascope_joint_germline + - variant_calling + - sentieon/dnascope + files: + - path: results/csv/variantcalled.csv + md5sum: 62d70060aad96337254efe2d7a1df170 + - path: results/multiqc + - path: results/reports/bcftools/sentieon_dnascope/joint_variant_calling/joint_germline.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.FILTER.summary + - path: 
results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.TsTv.count + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.TsTv.qual + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz.tbi + - path: results/dnascope + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.bcftools_stats.txt + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.FILTER.summary + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.TsTv.count + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.TsTv.qual + should_exist: false + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.vcf.gz.tbi + should_exist: false +- name: Run joint germline variant calling with sentieon dnascope all intervals at once + command: nextflow run main.nf -profile 
test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf --nucleotides_per_second 100 + tags: + - germline + - sentieon_dnascope_joint_germline + - variant_calling + - sentieon/dnascope + files: + - path: results/csv/variantcalled.csv + md5sum: 62d70060aad96337254efe2d7a1df170 + - path: results/multiqc + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/joint_variant_calling/joint_germline.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.FILTER.summary + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.TsTv.count + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.TsTv.qual + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz.tbi + - path: results/dnascope + should_exist: false diff --git a/tests/test_sentieon_dnascope_skip_filter.yml b/tests/test_sentieon_dnascope_skip_filter.yml new file mode 100644 index 0000000000..16bbca9e7c --- /dev/null +++ b/tests/test_sentieon_dnascope_skip_filter.yml @@ -0,0 +1,81 @@ +- name: Run variant calling on germline sample with sentieon dnascope and skip filter + command: nextflow run 
main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --skip_tools dnascope_filter --outdir results + tags: + - germline + - sentieon_dnascope_skip_filter + - variant_calling + - sentieon/dnascope + files: + - path: results/csv/variantcalled.csv + md5sum: 10254414c0679ba1fb25e41b9ff548cc + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.unfiltered.bcftools_stats.txt + md5sum: f915fe1591ababb0da5e7b43dfc35092 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.FILTER.summary + md5sum: 87a84b5f8ac3d3cbeeef7d60afcdbfe7 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files.
+ - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/sentieon_dnascope + should_exist: false +- name: Run variant calling on germline sample with sentieon dnascope without intervals and skip filter + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --skip_tools dnascope_filter --no_intervals --outdir results + tags: + - germline + - sentieon_dnascope_skip_filter + - no_intervals + - variant_calling + - sentieon/dnascope + files: + - path: results/csv/variantcalled.csv + md5sum: 10254414c0679ba1fb25e41b9ff548cc + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.unfiltered.bcftools_stats.txt + md5sum: f915fe1591ababb0da5e7b43dfc35092 + - path: 
results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.FILTER.summary + md5sum: 87a84b5f8ac3d3cbeeef7d60afcdbfe7 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files. + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/sentieon_dnascope + should_exist: false diff --git a/tests/test_sentieon_joint_germline.yml b/tests/test_sentieon_haplotyper_joint_germline.yml similarity index 97% rename from tests/test_sentieon_joint_germline.yml rename to tests/test_sentieon_haplotyper_joint_germline.yml index 4637571aec..1c12f101db 100644 --- a/tests/test_sentieon_joint_germline.yml +++ b/tests/test_sentieon_haplotyper_joint_germline.yml @@ -2,7 +2,7 @@ command: nextflow run main.nf -profile test_cache,software_license,targeted --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_haplotyper --step variant_calling --joint_germline --outdir results --sentieon_haplotyper_emit_mode gvcf tags: - germline - - sentieon_joint_germline + - sentieon_haplotyper_joint_germline - variant_calling - sentieon/haplotyper files: @@ -31,7 +31,7 @@ command: nextflow run main.nf -profile test_cache,software_license,targeted --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_haplotyper
--step variant_calling --joint_germline --outdir results --sentieon_haplotyper_emit_mode gvcf --nucleotides_per_second 100 tags: - germline - - sentieon_joint_germline + - sentieon_haplotyper_joint_germline - variant_calling - sentieon/haplotyper files: @@ -58,7 +58,7 @@ command: nextflow run main.nf -profile test_cache,software_license,tools_germline --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_haplotyper --step variant_calling --joint_germline --outdir results --sentieon_haplotyper_emit_mode gvcf -stub-run tags: - germline - - sentieon_joint_germline + - sentieon_haplotyper_joint_germline - variant_calling - vqsr files: diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 5ae80fbae1..a6ed5b2e50 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -50,6 +50,8 @@ def checkPathParamList = [ params.multiqc_config, params.pon, params.pon_tbi, + params.sentieon_dnascope_model, + params.snpeff_cache, params.spliceai_indel, params.spliceai_indel_tbi, params.spliceai_snv, @@ -255,18 +257,60 @@ if (!params.dbsnp && !params.known_indels) { if (params.step in ['mapping', 'markduplicates', 'prepare_recalibration', 'recalibrate'] && (!params.skip_tools || (params.skip_tools && !params.skip_tools.split(',').contains('baserecalibrator')))) { error("Base quality score recalibration requires at least one resource file. Please provide at least one of `--dbsnp` or `--known_indels`\nYou can skip this step in the workflow by adding `--skip_tools baserecalibrator` to the command.") } - if (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper'))) { - log.warn "If GATK's Haplotypecaller or Sentieon's Haplotyper is specified, without `--dbsnp` or `--known_indels no filtering will be done. 
For filtering, please provide at least one of `--dbsnp` or `--known_indels`.\nFor more information see FilterVariantTranches (single-sample, default): https://gatk.broadinstitute.org/hc/en-us/articles/5358928898971-FilterVariantTranches\nFor more information see VariantRecalibration (--joint_germline): https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator\nFor more information on GATK Best practice germline variant calling: https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-" + if (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope'))) { + log.warn "If GATK's Haplotypecaller, Sentieon's Dnascope or Sentieon's Haplotyper is specified, without `--dbsnp` or `--known_indels` no filtering will be done. For filtering, please provide at least one of `--dbsnp` or `--known_indels`.\nFor more information see FilterVariantTranches (single-sample, default): https://gatk.broadinstitute.org/hc/en-us/articles/5358928898971-FilterVariantTranches\nFor more information see VariantRecalibration (--joint_germline): https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator\nFor more information on GATK Best practice germline variant calling: https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-" } } -if (params.joint_germline && (!params.tools || !(params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper')))) { - error("The GATK's Haplotypecaller or Sentieon's Haplotyper should be specified as one of the tools when doing joint germline variant calling.)
") +if (params.joint_germline && (!params.tools || !(params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope')))) { + error("The GATK's Haplotypecaller, Sentieon's Dnascope or Sentieon's Haplotyper should be specified as one of the tools when doing joint germline variant calling.) ") } -if (params.joint_germline && (!params.dbsnp || !params.known_indels || !params.known_snps || params.no_intervals)) { - log.warn "If GATK's Haplotypecaller or Sentieon's Haplotyper is specified, without `--dbsnp`, `--known_snps`, `--known_indels` or the associated resource labels (ie `known_snps_vqsr`), no variant recalibration will be done. For recalibration you must provide all of these resources.\nFor more information see VariantRecalibration: https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator \nJoint germline variant calling also requires intervals in order to genotype the samples. As a result, if `--no_intervals` is set to `true` the joint germline variant calling will not be performed." +if ( + params.tools && + ( + params.tools.split(',').contains('haplotypecaller') || + params.tools.split(',').contains('sentieon_haplotyper') || + params.tools.split(',').contains('sentieon_dnascope') + ) && + params.joint_germline && + ( + !params.dbsnp || + !params.known_indels || + !params.known_snps || + params.no_intervals + ) + ) { + log.warn("""If GATK's Haplotypecaller, Sentieon's Dnascope and/or Sentieon's Haplotyper is specified, \ +but without `--dbsnp`, `--known_snps`, `--known_indels` or the associated resource labels (ie `known_snps_vqsr`), \ +no variant recalibration will be done. 
For recalibration you must provide all of these resources.\nFor more information \ +see VariantRecalibration: https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator \n\ +Joint germline variant calling also requires intervals in order to genotype the samples. \ +As a result, if `--no_intervals` is set to `true` the joint germline variant calling will not be performed.""") } +if (params.tools && + params.tools.split(',').contains('sentieon_dnascope') && + params.joint_germline && + ( + !params.sentieon_dnascope_emit_mode || + !params.sentieon_dnascope_emit_mode.split(',').contains('gvcf') + ) + ) { + error("When using Sentieon Dnascope for joint-germline variant-calling the option `--sentieon_dnascope_emit_mode` has to include `gvcf`.") +} + +if (params.tools && + params.tools.split(',').contains('sentieon_haplotyper') && + params.joint_germline && + ( + !params.sentieon_haplotyper_emit_mode || + !params.sentieon_haplotyper_emit_mode.split(',').contains('gvcf') + ) + ) { + error("When using Sentieon Haplotyper for joint-germline variant-calling the option `--sentieon_haplotyper_emit_mode` has to include `gvcf`.") +} + + // Fails when --joint_mutect2 is used without enabling mutect2 if (params.joint_mutect2 && (!params.tools || !params.tools.split(',').contains('mutect2'))) { error("The mutect2 should be specified as one of the tools when doing joint somatic variant calling with Mutect2. (The mutect2 could be specified by adding `--tools mutect2` to the nextflow command.)") @@ -297,20 +341,21 @@ if ((params.download_cache) && (params.snpeff_cache || params.vep_cache)) { */ // Initialize file channels based on params, defined in the params.genomes[params.genome] scope -ascat_alleles = params.ascat_alleles ? Channel.fromPath(params.ascat_alleles).collect() : Channel.empty() -ascat_loci = params.ascat_loci ? Channel.fromPath(params.ascat_loci).collect() : Channel.empty() -ascat_loci_gc = params.ascat_loci_gc ? 
Channel.fromPath(params.ascat_loci_gc).collect() : Channel.value([]) -ascat_loci_rt = params.ascat_loci_rt ? Channel.fromPath(params.ascat_loci_rt).collect() : Channel.value([]) -cf_chrom_len = params.cf_chrom_len ? Channel.fromPath(params.cf_chrom_len).collect() : [] -chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : Channel.value([]) -dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.value([]) -fasta = params.fasta ? Channel.fromPath(params.fasta).first() : Channel.empty() -fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : Channel.empty() -germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : Channel.value([]) // Mutect2 does not require a germline resource, so set to optional input -known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.value([]) -known_snps = params.known_snps ? Channel.fromPath(params.known_snps).collect() : Channel.value([]) -mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : Channel.value([]) -pon = params.pon ? Channel.fromPath(params.pon).collect() : Channel.value([]) // PON is optional for Mutect2 (but highly recommended) +ascat_alleles = params.ascat_alleles ? Channel.fromPath(params.ascat_alleles).collect() : Channel.empty() +ascat_loci = params.ascat_loci ? Channel.fromPath(params.ascat_loci).collect() : Channel.empty() +ascat_loci_gc = params.ascat_loci_gc ? Channel.fromPath(params.ascat_loci_gc).collect() : Channel.value([]) +ascat_loci_rt = params.ascat_loci_rt ? Channel.fromPath(params.ascat_loci_rt).collect() : Channel.value([]) +cf_chrom_len = params.cf_chrom_len ? Channel.fromPath(params.cf_chrom_len).collect() : [] +chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : Channel.value([]) +dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.value([]) +fasta = params.fasta ? 
Channel.fromPath(params.fasta).first() : Channel.empty() +fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : Channel.empty() +germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : Channel.value([]) // Mutect2 does not require a germline resource, so set to optional input +known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.value([]) +known_snps = params.known_snps ? Channel.fromPath(params.known_snps).collect() : Channel.value([]) +mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : Channel.value([]) +pon = params.pon ? Channel.fromPath(params.pon).collect() : Channel.value([]) // PON is optional for Mutect2 (but highly recommended) +sentieon_dnascope_model = params.sentieon_dnascope_model ? Channel.fromPath(params.sentieon_dnascope_model).collect() : Channel.value([]) // Initialize value channels based on params, defined in the params.genomes[params.genome] scope ascat_genome = params.ascat_genome ?: Channel.empty() @@ -1164,7 +1209,10 @@ workflow SAREK { known_snps_vqsr, params.joint_germline, params.skip_tools && params.skip_tools.split(',').contains('haplotypecaller_filter'), // true if filtering should be skipped - params.sentieon_haplotyper_emit_mode) + params.sentieon_haplotyper_emit_mode, + params.sentieon_dnascope_emit_mode, + params.sentieon_dnascope_pcr_indel_model, + sentieon_dnascope_model) // TUMOR ONLY VARIANT CALLING BAM_VARIANT_CALLING_TUMOR_ONLY_ALL( @@ -1232,6 +1280,7 @@ workflow SAREK { vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_freebayes) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_haplotypecaller) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_manta) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_sentieon_dnascope) vcf_to_annotate = 
vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_sentieon_haplotyper) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_strelka) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_tiddit)