From 72bb50b69627d3e600653f9f3dd2911dcc7afb83 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Mon, 28 Aug 2023 10:43:18 +0000 Subject: [PATCH 01/50] WIP: Implementing Dnascope module and subworkflow in Sarek. --- conf/modules/prepare_genome.config | 6 +- conf/modules/sentieon_dnascope.config | 63 +++++++ modules.json | 5 + modules/nf-core/sentieon/dnascope/main.nf | 100 +++++++++++ modules/nf-core/sentieon/dnascope/meta.yml | 103 ++++++++++++ nextflow.config | 6 +- nextflow_schema.json | 23 ++- .../bam_variant_calling_germline_all/main.nf | 33 ++++ .../main.nf | 157 ++++++++++++++++++ workflows/sarek.nf | 5 +- 10 files changed, 495 insertions(+), 6 deletions(-) create mode 100644 conf/modules/sentieon_dnascope.config create mode 100644 modules/nf-core/sentieon/dnascope/main.nf create mode 100644 modules/nf-core/sentieon/dnascope/meta.yml create mode 100644 subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf diff --git a/conf/modules/prepare_genome.config b/conf/modules/prepare_genome.config index 66eb2041d2..8b1f45c4fb 100644 --- a/conf/modules/prepare_genome.config +++ b/conf/modules/prepare_genome.config @@ -76,7 +76,7 @@ process { } withName: 'TABIX_DBSNP' { - ext.when = { !params.dbsnp_tbi && params.dbsnp && ((params.step == "mapping" || params.step == "markduplicates" || params.step == "prepare_recalibration") || params.tools && (params.tools.split(',').contains('controlfreec') || params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('mutect2'))) } + ext.when = { !params.dbsnp_tbi && params.dbsnp && ((params.step == "mapping" || params.step == "markduplicates" || params.step == "prepare_recalibration") || params.tools && (params.tools.split(',').contains('controlfreec') || params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope') || 
params.tools.split(',').contains('mutect2'))) } publishDir = [ enabled: (params.save_reference || params.build_only_index), mode: params.publish_dir_mode, @@ -96,7 +96,7 @@ process { } withName: 'TABIX_KNOWN_INDELS' { - ext.when = { !params.known_indels_tbi && params.known_indels && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper'))) ) } + ext.when = { !params.known_indels_tbi && params.known_indels && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope'))) ) } publishDir = [ enabled: (params.save_reference || params.build_only_index), mode: params.publish_dir_mode, @@ -106,7 +106,7 @@ process { } withName: 'TABIX_KNOWN_SNPS' { - ext.when = { !params.known_snps_tbi && params.known_snps && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper'))) ) } + ext.when = { !params.known_snps_tbi && params.known_snps && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope'))) ) } publishDir = [ enabled: (params.save_reference || params.build_only_index), mode: params.publish_dir_mode, diff --git a/conf/modules/sentieon_dnascope.config b/conf/modules/sentieon_dnascope.config new file mode 100644 index 0000000000..4378a35f5a --- /dev/null +++ 
b/conf/modules/sentieon_dnascope.config @@ -0,0 +1,63 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// SENTIEON DNASCOPE + +process { + + withName: 'SENTIEON_DNASCOPE' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.dnascope" : "${meta.id}.dnascope.${intervals.simpleName}" } + ext.when = { params.tools && params.tools.split(',').contains('sentieon_dnascope') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}", + saveAs: { meta.num_intervals > 1 ? null : "sentieon_dnascope/${meta.id}/${it}" } + ] + } + + // TO-DO: Clean up all this stuff below concerning SENTIEON_HAPLOTYPER. + // It is just copied from sentieon_haplotyper.config, but some of it + // may also be needed for Dnascope. + + withName: 'MERGE_SENTIEON_HAPLOTYPER_VCFS' { + ext.prefix = { params.joint_germline ? "${meta.id}.haplotyper.g" : "${meta.id}.haplotyper.unfiltered" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_haplotyper/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: 'MERGE_SENTIEON_HAPLOTYPER_GVCFS' { + ext.prefix = { "${meta.id}.haplotyper.g" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_haplotyper/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + if (params.tools && params.tools.contains('sentieon_haplotyper')) { + withName: '.*FILTERVARIANTTRANCHES' { + ext.prefix = {"${meta.id}.haplotyper"} + ext.args = { "--info-key CNN_1D" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_haplotyper/${meta.id}/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + } + +} diff --git a/modules.json b/modules.json index 6f13ad15dd..7b2142663a 100644 --- a/modules.json +++ b/modules.json @@ -386,6 +386,11 @@ "git_sha": "915a0b16ba3e40ef59e7b44843b3118e17a9c906", "installed_by": ["modules"] }, + "sentieon/dnascope": { + "branch": "master", + "git_sha": "127edadc279e19da093fdd513926c6cdee82c306", + "installed_by": ["modules"] + }, "sentieon/gvcftyper": { "branch": "master", "git_sha": "6c9c11ee96796e53a01b4719286acce6af14bc3a", diff --git a/modules/nf-core/sentieon/dnascope/main.nf b/modules/nf-core/sentieon/dnascope/main.nf new file mode 100644 index 0000000000..f94654e034 --- /dev/null +++ b/modules/nf-core/sentieon/dnascope/main.nf @@ -0,0 +1,100 @@ +process SENTIEON_DNASCOPE { + tag "$meta.id" + label 'process_high' + label 'sentieon' + + secret 'SENTIEON_LICENSE_BASE64' + + container 'nf-core/sentieon:202112.06' + + input: + tuple val(meta), path(bam), path(bai), path(intervals) + path(fasta) + path(fai) + path(dbsnp) + path(dbsnp_tbi) + path(ml_model) + val(emit_vcf) + val(emit_gvcf) + val(sentieon_dnascope_pcr_based) + + output: + tuple val(meta), path("*.unfiltered.vcf.gz") , optional:true, emit: vcf // added the substring ".unfiltered" in the filename of the vcf-files since without that the g.vcf.gz-files were ending up in the 
vcf-channel + tuple val(meta), path("*.unfiltered.vcf.gz.tbi"), optional:true, emit: vcf_tbi + tuple val(meta), path("*.g.vcf.gz") , optional:true, emit: gvcf // these output-files have to have the extension ".vcf.gz", otherwise the subsequent GATK-MergeVCFs will fail. + tuple val(meta), path("*.g.vcf.gz.tbi") , optional:true, emit: gvcf_tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sentieon modules do not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' // options for the driver + def args2 = task.ext.args2 ?: '' // options for the vcf generation + def args3 = task.ext.args3 ?: '' // options for the gvcf generation + def interval = intervals ? "--interval ${intervals}" : '' + def dbsnp_str = dbsnp ? "-d ${dbsnp}" : '' + def model = ml_model ? " --model ${ml_model}" : '' + def pcr_indel_model = sentieon_dnascope_pcr_based ? '' : " --pcr_indel_model NONE" + def prefix = task.ext.prefix ?: "${meta.id}" + def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: '' + def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: '' + def vcf_cmd = "" + def gvcf_cmd = "" + def base_cmd = '--algo DNAscope ' + dbsnp_str + + if (emit_vcf) { // emit_vcf can be the empty string, 'variant', 'confident' or 'all' but NOT 'gvcf' + vcf_cmd = base_cmd + args2 + model + pcr_indel_model + ' --emit_mode ' + emit_vcf + ' ' + prefix + '.unfiltered.vcf.gz' + } + + if (emit_gvcf) { // emit_gvcf can be either true or false + gvcf_cmd = base_cmd + args3 + model + pcr_indel_model_str + ' --emit_mode gvcf ' + prefix + '.g.vcf.gz' + } + + """ + if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url. 
+ export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d) + else # Localhost license file + # The license file is stored as a nextflow variable like, for instance, this: + # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat | base64 -w 0) + export SENTIEON_LICENSE=\$(mktemp) + echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE + fi + + if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then + # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is mostly likely being run with some test-license. + export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d) + export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d) + echo "Decoded and exported Sentieon test-license system environment variables" + fi + + sentieon driver $args -r $fasta -t $task.cpus -i $bam $interval $vcf_cmd $gvcf_cmd + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sentieon modules do not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + """ + touch ${prefix}.unfiltered.vcf.gz + touch ${prefix}.unfiltered.vcf.gz.tbi + touch ${prefix}.g.vcf.gz + touch ${prefix}.g.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/sentieon/dnascope/meta.yml b/modules/nf-core/sentieon/dnascope/meta.yml new file mode 100644 index 0000000000..3e1abf1a1f --- /dev/null +++ b/modules/nf-core/sentieon/dnascope/meta.yml @@ -0,0 +1,103 @@ +name: sentieon_dnascope +description: TO-DO UPDATE THIS FILE!!! DNAscope algorithm performs an improved version of Haplotype variant calling. +keywords: + - dnascope + - sentieon + - variant_calling +tools: + - sentieon: + description: | + Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads. + Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system. + homepage: https://www.sentieon.com/ + documentation: https://www.sentieon.com/ +input: + - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. [ id:'test', single_end:false ] + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test' ] + - meta3: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test' ] + - meta4: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test' ] + - meta5: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test' ] + - meta6: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test' ] + - meta7: + type: map + description: | + Groovy Map containing reference information. + e.g. 
[ id:'test' ] + - bam: + type: file + description: BAM file. + pattern: "*.bam" + - bai: + type: file + description: BAI file + pattern: "*.bai" + - fasta: + type: file + description: Genome fasta file + pattern: "*.{fa,fasta}" + - fai: + type: file + description: Index of the genome fasta file + pattern: "*.fai" + - dbsnp: + type: file + description: Single Nucleotide Polymorphism database (dbSNP) file + pattern: "*.vcf.gz" + - dbsnp_tbi: + type: file + description: Index of the Single Nucleotide Polymorphism database (dbSNP) file + pattern: "*.vcf.gz.tbi" + - call_interval: + type: file + description: bed or interval_list file containing interval in the reference that will be used in the analysis + pattern: "*.{bed,interval_list}" + - ml_model: + type: file + description: machine learning model file + pattern: "*.model" + +output: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: VCF file + pattern: "*.{vcf.gz}" + - index: + type: file + description: Index of the VCF file + pattern: "*.vcf.gz.tbi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@ramprasadn" diff --git a/nextflow.config b/nextflow.config index 1351f768f9..d91d72d34e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -69,8 +69,11 @@ params { ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 wes = false // Set to true, if data is exome/targeted sequencing data. 
Used to use correct models in various variant callers joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected - joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling + joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling sentieon_haplotyper_emit_mode = "variant" // default value for Sentieon haplotyper + sentieon_dnascope_emit_mode = "variant" // default value for Sentieon dnascope + sentieon_dnascope_pcr_based = true + sentieon_dnascope_model = "https://s3.amazonaws.com/sentieon-release/other/SentieonDNAscopeModel1.1.model" // Annotation dbnsfp = null // No dbnsfp processed file @@ -379,6 +382,7 @@ includeConfig 'conf/modules/manta.config' includeConfig 'conf/modules/mpileup.config' includeConfig 'conf/modules/msisensorpro.config' includeConfig 'conf/modules/mutect2.config' +includeConfig 'conf/modules/sentieon_dnascope.config' includeConfig 'conf/modules/sentieon_haplotyper.config' includeConfig 'conf/modules/sentieon_joint_germline.config' includeConfig 'conf/modules/strelka.config' diff --git a/nextflow_schema.json b/nextflow_schema.json index 4db215cae8..12d9c98180 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -107,7 +107,7 @@ "fa_icon": "fas fa-toolbox", "description": "Tools to use for duplicate marking, variant calling and/or for annotation.", "help_text": "Multiple tools separated with commas.\n\n**Variant Calling:**\n\nGermline variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: DeepVariant, FreeBayes, GATK HaplotypeCaller, mpileup, Sentieon Haplotyper, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit\n\nTumor-only somatic variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, mpileup, Mutect2, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit, ControlFREEC\n\nSomatic variant 
calling can currently only be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, Mutect2, Strelka2\n- Structural variants: Manta, TIDDIT\n- Copy-Number: ASCAT, CNVKit, Control-FREEC\n- Microsatellite Instability: MSIsensorpro\n\n> **NB** Mutect2 for somatic variant calling cannot be combined with `--no_intervals`\n\n**Annotation:**\n \n- snpEff, VEP, merge (both consecutively).\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted when starting from `--step annotate`.", - "pattern": "^((ascat|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|sentieon_haplotyper|manta|merge|mpileup|msisensorpro|mutect2|sentieon_dedup|snpeff|strelka|tiddit|vep)?,?)*(? [ + meta + [ + num_intervals:num_intervals, + intervals_name:intervals.simpleName, + variantcaller:'SENTIEON_DNASCOPE'], + cram, + crai, + intervals + ] + } + + emit_mode_items = sentieon_dnascope_emit_mode.split(',').each{ it -> it.toLowerCase().trim() } + lst = emit_mode_items - 'gvcf' + emit_vcf = lst.size() > 0 ? 
lst[0] : '' + + SENTIEON_DNASCOPE( + cram_intervals_for_sentieon, + fasta, + fasta_fai, + dbsnp, + dbsnp_tbi, + sentieon_dnascope_model, + emit_vcf, + emit_mode_items.any{ it.equals('gvcf') }, + sentieon_dnascope_pcr_based) + + if (joint_germline) { + genotype_intervals = SENTIEON_DNASCOPE.out.gvcf + .join(SENTIEON_DNASCOPE.out.gvcf_tbi, failOnMismatch: true) + .join(cram_intervals_for_sentieon, failOnMismatch: true) + .map{ meta, gvcf, tbi, cram, crai, intervals -> [ meta, gvcf, tbi, intervals ] } + } + + // Figure out if using intervals or no_intervals + haplotyper_vcf_branch = SENTIEON_DNASCOPE.out.vcf.map{ + meta, vcf -> [ meta - meta.subMap('interval_name'), vcf] + } + .branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + haplotyper_vcf_tbi_branch = SENTIEON_DNASCOPE.out.vcf_tbi.map{ + meta, vcf_tbi -> [ meta - meta.subMap('interval_name'), vcf_tbi] + } + .branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + haplotyper_gvcf_branch = SENTIEON_DNASCOPE.out.gvcf.map{ + meta, gvcf -> [ meta - meta.subMap('interval_name'), gvcf] + } + .branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + haplotyper_gvcf_tbi_branch = SENTIEON_DNASCOPE.out.gvcf_tbi.map{ + meta, gvcf_tbi -> [ meta - meta.subMap('interval_name'), gvcf_tbi] + } + .branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + vcfs_for_merging = haplotyper_vcf_branch.intervals.map{ + meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ]} + + vcfs_for_merging = vcfs_for_merging.map{ + meta, vcf -> [ + meta - meta.subMap('intervals_name'), + vcf]}.groupTuple() + + // VCFs + // Only when using intervals + MERGE_SENTIEON_DNASCOPE_VCFS(vcfs_for_merging, dict) + + haplotyper_vcf = Channel.empty().mix( + MERGE_SENTIEON_DNASCOPE_VCFS.out.vcf, + haplotyper_vcf_branch.no_intervals) + + haplotyper_tbi = Channel.empty().mix( + 
MERGE_SENTIEON_DNASCOPE_VCFS.out.tbi, + haplotyper_vcf_tbi_branch.no_intervals) + + // Remove no longer necessary field: num_intervals + vcf = haplotyper_vcf.map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ] } + vcf_tbi = haplotyper_tbi.map{ meta, tbi -> [ meta - meta.subMap('num_intervals'), tbi ] } + + // GVFs + // Only when using intervals + gvcfs_for_merging = haplotyper_gvcf_branch.intervals.map{ + meta, vcf -> [groupKey(meta, meta.num_intervals), vcf]} + + gvcfs_for_merging = gvcfs_for_merging.map{ + meta, vcf -> [ meta - meta.subMap('intervals_name'), vcf ] + }.groupTuple() + + MERGE_SENTIEON_DNASCOPE_GVCFS(gvcfs_for_merging, dict) + + gvcf = Channel.empty().mix( + MERGE_SENTIEON_DNASCOPE_GVCFS.out.vcf, + haplotyper_gvcf_branch.no_intervals) + + gvcf_tbi = Channel.empty().mix( + MERGE_SENTIEON_DNASCOPE_GVCFS.out.tbi, + haplotyper_gvcf_tbi_branch.no_intervals) + + versions = versions.mix(SENTIEON_DNASCOPE.out.versions) + versions = versions.mix(MERGE_SENTIEON_DNASCOPE_VCFS.out.versions) + versions = versions.mix(MERGE_SENTIEON_DNASCOPE_GVCFS.out.versions) + + emit: + versions + vcf + vcf_tbi + gvcf + gvcf_tbi + genotype_intervals // For joint genotyping + +} diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 8bb11326b3..03386a3b2b 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -1122,7 +1122,10 @@ workflow SAREK { known_snps_vqsr, params.joint_germline, params.skip_tools && params.skip_tools.split(',').contains('haplotypecaller_filter'), // true if filtering should be skipped - params.sentieon_haplotyper_emit_mode) + params.sentieon_haplotyper_emit_mode, + params.sentieon_dnascope_emit_mode, + params.sentieon_dnascope_pcr_based, + params.sentieon_dnascope_model) // TUMOR ONLY VARIANT CALLING BAM_VARIANT_CALLING_TUMOR_ONLY_ALL( From c81aeb55a1e7fc4a6b722d4f94c933268196a7fd Mon Sep 17 00:00:00 2001 From: asp8200 Date: Mon, 28 Aug 2023 12:29:50 +0000 Subject: [PATCH 02/50] WIP: Changing sentieon_dnascope_model from string to 
file-channel --- modules/nf-core/sentieon/dnascope/main.nf | 10 +++--- .../main.nf | 10 +++--- .../main.nf | 4 +-- workflows/sarek.nf | 32 ++++++++++--------- 4 files changed, 29 insertions(+), 27 deletions(-) diff --git a/modules/nf-core/sentieon/dnascope/main.nf b/modules/nf-core/sentieon/dnascope/main.nf index f94654e034..c1aba83d1b 100644 --- a/modules/nf-core/sentieon/dnascope/main.nf +++ b/modules/nf-core/sentieon/dnascope/main.nf @@ -14,9 +14,9 @@ process SENTIEON_DNASCOPE { path(dbsnp) path(dbsnp_tbi) path(ml_model) + val(pcr_based) val(emit_vcf) val(emit_gvcf) - val(sentieon_dnascope_pcr_based) output: tuple val(meta), path("*.unfiltered.vcf.gz") , optional:true, emit: vcf // added the substring ".unfiltered" in the filename of the vcf-files since without that the g.vcf.gz-files were ending up in the vcf-channel @@ -38,8 +38,8 @@ process SENTIEON_DNASCOPE { def args3 = task.ext.args3 ?: '' // options for the gvcf generation def interval = intervals ? "--interval ${intervals}" : '' def dbsnp_str = dbsnp ? "-d ${dbsnp}" : '' - def model = ml_model ? " --model ${ml_model}" : '' - def pcr_indel_model = sentieon_dnascope_pcr_based ? '' : " --pcr_indel_model NONE" + def model_cmd = ml_model ? " --model ${ml_model}" : '' + def pcr_indel_model_cmd = pcr_based ? 
'' : " --pcr_indel_model NONE" def prefix = task.ext.prefix ?: "${meta.id}" def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: '' def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: '' @@ -48,11 +48,11 @@ process SENTIEON_DNASCOPE { def base_cmd = '--algo DNAscope ' + dbsnp_str if (emit_vcf) { // emit_vcf can be the empty string, 'variant', 'confident' or 'all' but NOT 'gvcf' - vcf_cmd = base_cmd + args2 + model + pcr_indel_model + ' --emit_mode ' + emit_vcf + ' ' + prefix + '.unfiltered.vcf.gz' + vcf_cmd = base_cmd + args2 + model_cmd + pcr_indel_model_cmd + ' --emit_mode ' + emit_vcf + ' ' + prefix + '.unfiltered.vcf.gz' } if (emit_gvcf) { // emit_gvcf can be either true or false - gvcf_cmd = base_cmd + args3 + model + pcr_indel_model_str + ' --emit_mode gvcf ' + prefix + '.g.vcf.gz' + gvcf_cmd = base_cmd + args3 + model_cmd + pcr_indel_model_cmd + ' --emit_mode gvcf ' + prefix + '.g.vcf.gz' } """ diff --git a/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf b/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf index 2eaa41ccac..cef683519c 100644 --- a/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf +++ b/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf @@ -19,9 +19,9 @@ workflow BAM_VARIANT_CALLING_SENTIEON_DNASCOPE { dbsnp_vqsr // channel: [optional] intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals joint_germline // boolean: [mandatory] [default: false] joint calling of germline variants - sentieon_dnascope_emit_mode - sentieon_dnascope_pcr_based - sentieon_dnascope_model + sentieon_dnascope_emit_mode // string + sentieon_dnascope_pcr_based // boolean + sentieon_dnascope_model // channel main: versions = Channel.empty() @@ -55,9 +55,9 @@ workflow BAM_VARIANT_CALLING_SENTIEON_DNASCOPE { dbsnp, dbsnp_tbi, sentieon_dnascope_model, + sentieon_dnascope_pcr_based, emit_vcf, - emit_mode_items.any{ it.equals('gvcf') }, - 
sentieon_dnascope_pcr_based) + emit_mode_items.any{ it.equals('gvcf') }) if (joint_germline) { genotype_intervals = SENTIEON_DNASCOPE.out.gvcf diff --git a/subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf b/subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf index 370fc9963a..4b280d271c 100644 --- a/subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf +++ b/subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf @@ -43,7 +43,7 @@ workflow BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER { } - emit_mode_items = sentieon_haplotyper_emit_mode.split(',') + emit_mode_items = sentieon_haplotyper_emit_mode.split(',').each{ it -> it.toLowerCase().trim() } lst = emit_mode_items - 'gvcf' emit_vcf = lst.size() > 0 ? lst[0] : '' @@ -54,7 +54,7 @@ workflow BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER { dbsnp, dbsnp_tbi, emit_vcf, - emit_mode_items.contains('gvcf')) + emit_mode_items.any{ it.equals('gvcf') }) if (joint_germline) { genotype_intervals = SENTIEON_HAPLOTYPER.out.gvcf diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 03386a3b2b..6a3f10c6ea 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -50,6 +50,7 @@ def checkPathParamList = [ params.multiqc_config, params.pon, params.pon_tbi, + params.sentieon_dnascope_model, params.snpeff_cache, params.spliceai_indel, params.spliceai_indel_tbi, @@ -295,20 +296,21 @@ if ((params.download_cache) && (params.snpeff_cache || params.vep_cache)) { */ // Initialize file channels based on params, defined in the params.genomes[params.genome] scope -ascat_alleles = params.ascat_alleles ? Channel.fromPath(params.ascat_alleles).collect() : Channel.empty() -ascat_loci = params.ascat_loci ? Channel.fromPath(params.ascat_loci).collect() : Channel.empty() -ascat_loci_gc = params.ascat_loci_gc ? Channel.fromPath(params.ascat_loci_gc).collect() : Channel.value([]) -ascat_loci_rt = params.ascat_loci_rt ? 
Channel.fromPath(params.ascat_loci_rt).collect() : Channel.value([]) -cf_chrom_len = params.cf_chrom_len ? Channel.fromPath(params.cf_chrom_len).collect() : [] -chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : Channel.value([]) -dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.value([]) -fasta = params.fasta ? Channel.fromPath(params.fasta).first() : Channel.empty() -fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : Channel.empty() -germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : Channel.value([]) // Mutect2 does not require a germline resource, so set to optional input -known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.value([]) -known_snps = params.known_snps ? Channel.fromPath(params.known_snps).collect() : Channel.value([]) -mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : Channel.value([]) -pon = params.pon ? Channel.fromPath(params.pon).collect() : Channel.value([]) // PON is optional for Mutect2 (but highly recommended) +ascat_alleles = params.ascat_alleles ? Channel.fromPath(params.ascat_alleles).collect() : Channel.empty() +ascat_loci = params.ascat_loci ? Channel.fromPath(params.ascat_loci).collect() : Channel.empty() +ascat_loci_gc = params.ascat_loci_gc ? Channel.fromPath(params.ascat_loci_gc).collect() : Channel.value([]) +ascat_loci_rt = params.ascat_loci_rt ? Channel.fromPath(params.ascat_loci_rt).collect() : Channel.value([]) +cf_chrom_len = params.cf_chrom_len ? Channel.fromPath(params.cf_chrom_len).collect() : [] +chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : Channel.value([]) +dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.value([]) +fasta = params.fasta ? Channel.fromPath(params.fasta).first() : Channel.empty() +fasta_fai = params.fasta_fai ? 
Channel.fromPath(params.fasta_fai).collect() : Channel.empty() +germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : Channel.value([]) // Mutect2 does not require a germline resource, so set to optional input +known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.value([]) +known_snps = params.known_snps ? Channel.fromPath(params.known_snps).collect() : Channel.value([]) +mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : Channel.value([]) +pon = params.pon ? Channel.fromPath(params.pon).collect() : Channel.value([]) // PON is optional for Mutect2 (but highly recommended) +sentieon_dnascope_model = params.sentieon_dnascope_model ? Channel.fromPath(params.sentieon_dnascope_model).collect() : Channel.value([]) // Initialize value channels based on params, defined in the params.genomes[params.genome] scope ascat_genome = params.ascat_genome ?: Channel.empty() @@ -1125,7 +1127,7 @@ workflow SAREK { params.sentieon_haplotyper_emit_mode, params.sentieon_dnascope_emit_mode, params.sentieon_dnascope_pcr_based, - params.sentieon_dnascope_model) + sentieon_dnascope_model) // TUMOR ONLY VARIANT CALLING BAM_VARIANT_CALLING_TUMOR_ONLY_ALL( From 4dcdc9c3c7230632e7628a987c616700c21fff36 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Mon, 28 Aug 2023 12:33:01 +0000 Subject: [PATCH 03/50] align --- modules/nf-core/sentieon/dnascope/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/sentieon/dnascope/main.nf b/modules/nf-core/sentieon/dnascope/main.nf index c1aba83d1b..1b830a34a8 100644 --- a/modules/nf-core/sentieon/dnascope/main.nf +++ b/modules/nf-core/sentieon/dnascope/main.nf @@ -23,7 +23,7 @@ process SENTIEON_DNASCOPE { tuple val(meta), path("*.unfiltered.vcf.gz.tbi"), optional:true, emit: vcf_tbi tuple val(meta), path("*.g.vcf.gz") , optional:true, emit: gvcf // these output-files have to have the extension ".vcf.gz", 
otherwise the subsequent GATK-MergeVCFs will fail. tuple val(meta), path("*.g.vcf.gz.tbi") , optional:true, emit: gvcf_tbi - path "versions.yml" , emit: versions + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when From cec591b7b1fadcf8ab5d0c37e946990e75870d70 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 30 Aug 2023 08:38:10 +0000 Subject: [PATCH 04/50] Changing Sarek-option sentieon_dnascope_pcr_based to sentieon_dnascope_pcr_indel_model --- modules/nf-core/sentieon/dnascope/main.nf | 4 +- nextflow.config | 46 +++++++++---------- nextflow_schema.json | 11 +++-- .../bam_variant_calling_germline_all/main.nf | 4 +- .../main.nf | 26 +++++------ workflows/sarek.nf | 2 +- 6 files changed, 47 insertions(+), 46 deletions(-) diff --git a/modules/nf-core/sentieon/dnascope/main.nf b/modules/nf-core/sentieon/dnascope/main.nf index 1b830a34a8..f7e3a66af4 100644 --- a/modules/nf-core/sentieon/dnascope/main.nf +++ b/modules/nf-core/sentieon/dnascope/main.nf @@ -14,7 +14,7 @@ process SENTIEON_DNASCOPE { path(dbsnp) path(dbsnp_tbi) path(ml_model) - val(pcr_based) + val(pcr_indel_model) val(emit_vcf) val(emit_gvcf) @@ -39,7 +39,7 @@ process SENTIEON_DNASCOPE { def interval = intervals ? "--interval ${intervals}" : '' def dbsnp_str = dbsnp ? "-d ${dbsnp}" : '' def model_cmd = ml_model ? " --model ${ml_model}" : '' - def pcr_indel_model_cmd = pcr_based ? '' : " --pcr_indel_model NONE" + def pcr_indel_model_cmd = pcr_indel_model ? 
" --pcr_indel_model ${pcr_indel_model}" : '' def prefix = task.ext.prefix ?: "${meta.id}" def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: '' def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: '' diff --git a/nextflow.config b/nextflow.config index d91d72d34e..34076be9b8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -51,29 +51,29 @@ params { seq_platform = 'ILLUMINA' // Default platform written in read group PL field by aligner // Variant Calling - only_paired_variant_calling = false // if true, skips germline variant calling for normal-paired samples - ascat_ploidy = null // default value for ASCAT - ascat_min_base_qual = 20 // default value for ASCAT - ascat_min_counts = 10 // default value for ASCAT - ascat_min_map_qual = 35 // default value for ASCAT - ascat_purity = null // default value for ASCAT - cf_ploidy = "2" // default value for Control-FREEC - cf_coeff = 0.05 // default value for Control-FREEC - cf_contamination = 0 // default value for Control-FREEC - cf_contamination_adjustment = false // by default we are not using this in Control-FREEC - cf_mincov = 0 // ControlFreec default values - cf_minqual = 0 // ControlFreec default values - cf_window = null // by default we are not using this in Control-FREEC - cnvkit_reference = null // by default the reference is build from the fasta file - concatenate_vcfs = false // by default we don't concatenate the germline-vcf-files - ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 - wes = false // Set to true, if data is exome/targeted sequencing data. 
Used to use correct models in various variant callers - joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected - joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling - sentieon_haplotyper_emit_mode = "variant" // default value for Sentieon haplotyper - sentieon_dnascope_emit_mode = "variant" // default value for Sentieon dnascope - sentieon_dnascope_pcr_based = true - sentieon_dnascope_model = "https://s3.amazonaws.com/sentieon-release/other/SentieonDNAscopeModel1.1.model" + only_paired_variant_calling = false // if true, skips germline variant calling for normal-paired samples + ascat_ploidy = null // default value for ASCAT + ascat_min_base_qual = 20 // default value for ASCAT + ascat_min_counts = 10 // default value for ASCAT + ascat_min_map_qual = 35 // default value for ASCAT + ascat_purity = null // default value for ASCAT + cf_ploidy = "2" // default value for Control-FREEC + cf_coeff = 0.05 // default value for Control-FREEC + cf_contamination = 0 // default value for Control-FREEC + cf_contamination_adjustment = false // by default we are not using this in Control-FREEC + cf_mincov = 0 // ControlFreec default values + cf_minqual = 0 // ControlFreec default values + cf_window = null // by default we are not using this in Control-FREEC + cnvkit_reference = null // by default the reference is build from the fasta file + concatenate_vcfs = false // by default we don't concatenate the germline-vcf-files + ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 + wes = false // Set to true, if data is exome/targeted sequencing data. 
Used to use correct models in various variant callers + joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected + joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling + sentieon_haplotyper_emit_mode = "variant" // default value for Sentieon haplotyper + sentieon_dnascope_emit_mode = "variant" // default value for Sentieon dnascope + sentieon_dnascope_pcr_indel_model = 'CONSERVATIVE' + sentieon_dnascope_model = "https://s3.amazonaws.com/sentieon-release/other/SentieonDNAscopeModel1.1.model" // Annotation dbnsfp = null // No dbnsfp processed file diff --git a/nextflow_schema.json b/nextflow_schema.json index 12d9c98180..0b1a8f50a0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -407,12 +407,13 @@ "help_text": "The option `--sentieon_dnascope_emit_mode` can be set to the same string values as the Dnascope's `--emit_mode`. To output both a vcf and a gvcf, specify both a vcf-option (currently, `all`, `confident` and `variant`) and `gvcf`. For example, to obtain a vcf and gvcf one could set `--sentieon_dnascope_emit_mode` to `variant, gvcf`.", "pattern": "^(all|confident|gvcf|variant|gvcf,all|gvcf,confident|gvcf,variant|all,gvcf|confident,gvcf|variant,gvcf)(? 
Date: Wed, 30 Aug 2023 14:44:48 +0000 Subject: [PATCH 05/50] Adding meta to some input-channels for Dnascope module --- modules/nf-core/sentieon/dnascope/main.nf | 10 +++++----- .../bam_variant_calling_sentieon_dnascope/main.nf | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/modules/nf-core/sentieon/dnascope/main.nf b/modules/nf-core/sentieon/dnascope/main.nf index f7e3a66af4..fec38b47b5 100644 --- a/modules/nf-core/sentieon/dnascope/main.nf +++ b/modules/nf-core/sentieon/dnascope/main.nf @@ -9,11 +9,11 @@ process SENTIEON_DNASCOPE { input: tuple val(meta), path(bam), path(bai), path(intervals) - path(fasta) - path(fai) - path(dbsnp) - path(dbsnp_tbi) - path(ml_model) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(dbsnp) + tuple val(meta5), path(dbsnp_tbi) + tuple val(meta6), path(ml_model) val(pcr_indel_model) val(emit_vcf) val(emit_gvcf) diff --git a/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf b/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf index 8eb931a2da..6a395af18f 100644 --- a/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf +++ b/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf @@ -50,11 +50,11 @@ workflow BAM_VARIANT_CALLING_SENTIEON_DNASCOPE { SENTIEON_DNASCOPE( cram_intervals_for_sentieon, - fasta, - fasta_fai, - dbsnp, - dbsnp_tbi, - sentieon_dnascope_model, + fasta.map{it -> [[:], it]}, + fasta_fai.map{it -> [[:], it]}, + dbsnp.map{it -> [[:], it]}, + dbsnp_tbi.map{it -> [[:], it]}, + sentieon_dnascope_model.map{it -> [[:], it]}, sentieon_dnascope_pcr_indel_model, emit_vcf, emit_mode_items.any{ it.equals('gvcf') }) From c12ce59390106e18d8b545d18aac81bfd0eec4d3 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 6 Sep 2023 13:34:11 +0000 Subject: [PATCH 06/50] Updated dnascope --- modules.json | 2 +- modules/nf-core/sentieon/dnascope/main.nf | 8 +- modules/nf-core/sentieon/dnascope/meta.yml | 96 
+++++++++++++--------- 3 files changed, 61 insertions(+), 45 deletions(-) diff --git a/modules.json b/modules.json index 7b2142663a..45457529ee 100644 --- a/modules.json +++ b/modules.json @@ -388,7 +388,7 @@ }, "sentieon/dnascope": { "branch": "master", - "git_sha": "127edadc279e19da093fdd513926c6cdee82c306", + "git_sha": "4fb6fdc8046ec09cd30f92a2a252e9a0ba4a6309", "installed_by": ["modules"] }, "sentieon/gvcftyper": { diff --git a/modules/nf-core/sentieon/dnascope/main.nf b/modules/nf-core/sentieon/dnascope/main.nf index fec38b47b5..6be42a1728 100644 --- a/modules/nf-core/sentieon/dnascope/main.nf +++ b/modules/nf-core/sentieon/dnascope/main.nf @@ -37,7 +37,7 @@ process SENTIEON_DNASCOPE { def args2 = task.ext.args2 ?: '' // options for the vcf generation def args3 = task.ext.args3 ?: '' // options for the gvcf generation def interval = intervals ? "--interval ${intervals}" : '' - def dbsnp_str = dbsnp ? "-d ${dbsnp}" : '' + def dbsnp_cmd = dbsnp ? "-d ${dbsnp}" : '' def model_cmd = ml_model ? " --model ${ml_model}" : '' def pcr_indel_model_cmd = pcr_indel_model ? 
" --pcr_indel_model ${pcr_indel_model}" : '' def prefix = task.ext.prefix ?: "${meta.id}" @@ -45,14 +45,14 @@ process SENTIEON_DNASCOPE { def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: '' def vcf_cmd = "" def gvcf_cmd = "" - def base_cmd = '--algo DNAscope ' + dbsnp_str + def base_cmd = '--algo DNAscope ' + dbsnp_cmd + ' ' if (emit_vcf) { // emit_vcf can be the empty string, 'variant', 'confident' or 'all' but NOT 'gvcf' - vcf_cmd = base_cmd + args2 + model_cmd + pcr_indel_model_cmd + ' --emit_mode ' + emit_vcf + ' ' + prefix + '.unfiltered.vcf.gz' + vcf_cmd = base_cmd + args2 + ' ' + model_cmd + pcr_indel_model_cmd + ' --emit_mode ' + emit_vcf + ' ' + prefix + '.unfiltered.vcf.gz' } if (emit_gvcf) { // emit_gvcf can be either true or false - gvcf_cmd = base_cmd + args3 + model_cmd + pcr_indel_model_cmd + ' --emit_mode gvcf ' + prefix + '.g.vcf.gz' + gvcf_cmd = base_cmd + args3 + ' ' + model_cmd + pcr_indel_model_cmd + ' --emit_mode gvcf ' + prefix + '.g.vcf.gz' } """ diff --git a/modules/nf-core/sentieon/dnascope/meta.yml b/modules/nf-core/sentieon/dnascope/meta.yml index 3e1abf1a1f..34e0b97b4c 100644 --- a/modules/nf-core/sentieon/dnascope/meta.yml +++ b/modules/nf-core/sentieon/dnascope/meta.yml @@ -1,5 +1,5 @@ name: sentieon_dnascope -description: TO-DO UPDATE THIS FILE!!! DNAscope algorithm performs an improved version of Haplotype variant calling. +description: DNAscope algorithm performs an improved version of Haplotype variant calling. keywords: - dnascope - sentieon @@ -17,36 +17,6 @@ input: description: | Groovy Map containing sample information. e.g. [ id:'test', single_end:false ] - - meta2: - type: map - description: | - Groovy Map containing reference information. - e.g. [ id:'test' ] - - meta3: - type: map - description: | - Groovy Map containing reference information. - e.g. [ id:'test' ] - - meta4: - type: map - description: | - Groovy Map containing reference information. - e.g. 
[ id:'test' ] - - meta5: - type: map - description: | - Groovy Map containing reference information. - e.g. [ id:'test' ] - - meta6: - type: map - description: | - Groovy Map containing reference information. - e.g. [ id:'test' ] - - meta7: - type: map - description: | - Groovy Map containing reference information. - e.g. [ id:'test' ] - bam: type: file description: BAM file. @@ -55,31 +25,69 @@ input: type: file description: BAI file pattern: "*.bai" + - intervals: + type: file + description: bed or interval_list file containing interval in the reference that will be used in the analysis + pattern: "*.{bed,interval_list}" + - meta2: + type: map + description: | + Groovy Map containing meta information for fasta. - fasta: type: file description: Genome fasta file pattern: "*.{fa,fasta}" + - meta3: + type: map + description: | + Groovy Map containing meta information for fasta index. - fai: type: file description: Index of the genome fasta file pattern: "*.fai" + - meta4: + type: map + description: | + Groovy Map containing meta information for dbsnp. - dbsnp: type: file description: Single Nucleotide Polymorphism database (dbSNP) file pattern: "*.vcf.gz" + - meta5: + type: map + description: | + Groovy Map containing meta information for dbsnp_tbi. - dbsnp_tbi: type: file description: Index of the Single Nucleotide Polymorphism database (dbSNP) file pattern: "*.vcf.gz.tbi" - - call_interval: + - meta6: + type: map + description: | + Groovy Map containing meta information for machine learning model for Dnascope. + - ml_model: type: file - description: bed or interval_list file containing interval in the reference that will be used in the analysis - pattern: "*.{bed,interval_list}" + description: machine learning model file + pattern: "*.model" - ml_model: type: file description: machine learning model file pattern: "*.model" - + - pcr_indel_model: + type: string + description: | + Controls the option pcr_indel_model for Dnascope. 
+ The possible options are "NONE" (used for PCR free samples), and "HOSTILE", "AGGRESSIVE" and "CONSERVATIVE". + See Sentieons documentation for further explanation. + - emit_vcf: + type: string + description: | + Controls the vcf output from Dnascope. + Possible options are "all", "confident" and "variant". + See Sentieons documentation for further explanation. + - emit_gvcf: + type: boolean + description: If true, the haplotyper will output a gvcf output: - meta: type: map @@ -88,12 +96,20 @@ output: e.g. [ id:'test', single_end:false ] - vcf: type: file - description: VCF file - pattern: "*.{vcf.gz}" - - index: + description: Compressed VCF file + pattern: "*.unfiltered.vcf.gz" + - vcf_tbi: type: file - description: Index of the VCF file - pattern: "*.vcf.gz.tbi" + description: Index of VCF file + pattern: "*.unfiltered.vcf.gz.tbi" + - gvcf: + type: file + description: Compressed GVCF file + pattern: "*.g.vcf.gz" + - gvcf_tbi: + type: file + description: Index of GVCF file + pattern: "*.g.vcf.gz.tbi" - versions: type: file description: File containing software versions From 47ec9132ea8b4964d98a3c5ac586f2b21929f37f Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 6 Sep 2023 14:51:57 +0000 Subject: [PATCH 07/50] Adding option dnascope_filter for skip_tools. 
Will be removed later #1210 --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 0b1a8f50a0..c702afd956 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -114,7 +114,7 @@ "fa_icon": "fas fa-forward", "description": "Disable specified tools.", "help_text": "Multiple tools can be specified, separated by commas.\n\n> **NB** `--skip_tools baserecalibrator_report` is actually just not saving the reports.\n> **NB** `--skip_tools markduplicates_report` does not skip `MarkDuplicates` but prevent the collection of duplicate metrics that slows down performance.", - "pattern": "^((baserecalibrator|baserecalibrator_report|bcftools|documentation|fastqc|haplotypecaller_filter|haplotyper_filter|markduplicates|markduplicates_report|mosdepth|multiqc|samtools|vcftools|versions)?,?)*(? Date: Wed, 6 Sep 2023 14:52:36 +0000 Subject: [PATCH 08/50] Add variant-filter for DNASCOPE --- .../bam_variant_calling_germline_all/main.nf | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/bam_variant_calling_germline_all/main.nf b/subworkflows/local/bam_variant_calling_germline_all/main.nf index fc18f55b1a..cb22566ff5 100644 --- a/subworkflows/local/bam_variant_calling_germline_all/main.nf +++ b/subworkflows/local/bam_variant_calling_germline_all/main.nf @@ -208,7 +208,29 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { gvcf_sentieon_dnascope = BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.gvcf gvcf_tbi_sentieon_dnascope = BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.gvcf_tbi - // TO-DO: Implement joint_germline and VCF_VARIANT_FILTERING_GATK like, for instance, in the sentieon-haplotyper call below. 
+ if (joint_germline) { + // TO-DO: Implement this like below for sentieon-haplotyper + error("Joint-germline with Sentieon-DNASCOPE has not yet been implemented.") + } else { + + // If single sample track, check if filtering should be done + if (!(skip_tools && skip_tools.split(',').contains('dnascope_filter'))) { + + VCF_VARIANT_FILTERING_GATK( + vcf_sentieon_dnascope.join(vcf_tbi_sentieon_dnascope, failOnDuplicate: true, failOnMismatch: true), + fasta, + fasta_fai, + dict.map{ meta, dict -> [ dict ] }, + intervals_bed_combined_haplotypec, + known_sites_indels.concat(known_sites_snps).flatten().unique().collect(), + known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect()) + + vcf_sentieon_dnascope = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf + + versions = versions.mix(VCF_VARIANT_FILTERING_GATK.out.versions) + } + + } } // SENTIEON HAPLOTYPER From 5aa33fb34288fcbb9e2e9ec78e191e9a819131a9 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 6 Sep 2023 14:59:25 +0000 Subject: [PATCH 09/50] Trying to order stuff alphabetically --- nextflow.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nextflow.config b/nextflow.config index 34076be9b8..849509ea02 100644 --- a/nextflow.config +++ b/nextflow.config @@ -51,7 +51,6 @@ params { seq_platform = 'ILLUMINA' // Default platform written in read group PL field by aligner // Variant Calling - only_paired_variant_calling = false // if true, skips germline variant calling for normal-paired samples ascat_ploidy = null // default value for ASCAT ascat_min_base_qual = 20 // default value for ASCAT ascat_min_counts = 10 // default value for ASCAT @@ -67,13 +66,14 @@ params { cnvkit_reference = null // by default the reference is build from the fasta file concatenate_vcfs = false // by default we don't concatenate the germline-vcf-files ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 - wes = false // Set to true, if data is exome/targeted 
sequencing data. Used to use correct models in various variant callers joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling - sentieon_haplotyper_emit_mode = "variant" // default value for Sentieon haplotyper + only_paired_variant_calling = false // if true, skips germline variant calling for normal-paired samples + wes = false // Set to true, if data is exome/targeted sequencing data. Used to use correct models in various variant callers sentieon_dnascope_emit_mode = "variant" // default value for Sentieon dnascope - sentieon_dnascope_pcr_indel_model = 'CONSERVATIVE' sentieon_dnascope_model = "https://s3.amazonaws.com/sentieon-release/other/SentieonDNAscopeModel1.1.model" + sentieon_dnascope_pcr_indel_model = "CONSERVATIVE" + sentieon_haplotyper_emit_mode = "variant" // default value for Sentieon haplotyper // Annotation dbnsfp = null // No dbnsfp processed file From d8a06bd2a0c0a4a8aa3467500c304146760c37e0 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 6 Sep 2023 15:45:37 +0000 Subject: [PATCH 10/50] calling BAM_JOINT_CALLING_GERMLINE_SENTIEON for joint-germline variant-valling with Dnascope --- .../bam_variant_calling_germline_all/main.nf | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/bam_variant_calling_germline_all/main.nf b/subworkflows/local/bam_variant_calling_germline_all/main.nf index cb22566ff5..8af871c013 100644 --- a/subworkflows/local/bam_variant_calling_germline_all/main.nf +++ b/subworkflows/local/bam_variant_calling_germline_all/main.nf @@ -209,8 +209,23 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { gvcf_tbi_sentieon_dnascope = BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.gvcf_tbi if (joint_germline) { - // TO-DO: Implement this like below for sentieon-haplotyper - error("Joint-germline with Sentieon-DNASCOPE has not yet been implemented.") + 
BAM_JOINT_CALLING_GERMLINE_SENTIEON( // TO-DO: Check that subworkflow is okay for Dnascope as well. Email sent to Don. + BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.genotype_intervals, + fasta, + fasta_fai, + dict, + dbsnp, + dbsnp_tbi, + dbsnp_vqsr, + known_sites_indels, + known_sites_indels_tbi, + known_indels_vqsr, + known_sites_snps, + known_sites_snps_tbi, + known_snps_vqsr) + + vcf_sentieon_dnascope = BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.genotype_vcf + versions = versions.mix(BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.versions) } else { // If single sample track, check if filtering should be done From 56f65d6151e1174db6022b174feb8bcad13ed5c4 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 7 Sep 2023 07:38:26 +0000 Subject: [PATCH 11/50] Skipping Sentieon VarCal and ApplyVarCal for joint-germline variant-calling with Sentieon Dnascope --- .../main.nf | 171 +++++++++--------- .../bam_variant_calling_germline_all/main.nf | 6 +- 2 files changed, 93 insertions(+), 84 deletions(-) diff --git a/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf b/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf index da8b0e60be..ad22a44ba7 100644 --- a/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf +++ b/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf @@ -27,6 +27,7 @@ workflow BAM_JOINT_CALLING_GERMLINE_SENTIEON { resource_snps_vcf resource_snps_tbi known_snps_vqsr + variant_caller main: versions = Channel.empty() @@ -39,96 +40,102 @@ workflow BAM_JOINT_CALLING_GERMLINE_SENTIEON { BCFTOOLS_SORT(SENTIEON_GVCFTYPER.out.vcf_gz) - gvcf_to_merge = BCFTOOLS_SORT.out.vcf.map{ meta, vcf -> [ meta.subMap('num_intervals') + [ id:'joint_variant_calling', patient:'all_samples', variantcaller:'sentieon_haplotyper' ], vcf ]}.groupTuple() + gvcf_to_merge = BCFTOOLS_SORT.out.vcf.map{ meta, vcf -> [ meta.subMap('num_intervals') + [ id:'joint_variant_calling', patient:'all_samples', variantcaller:variant_caller ], vcf ]}.groupTuple() // Merge 
scatter/gather vcfs & index // Rework meta for variantscalled.csv and annotation tools MERGE_GENOTYPEGVCFS(gvcf_to_merge, dict) - vqsr_input = MERGE_GENOTYPEGVCFS.out.vcf.join(MERGE_GENOTYPEGVCFS.out.tbi, failOnDuplicate: true) - indels_resource_label = known_indels_vqsr.mix(dbsnp_vqsr).collect() - snps_resource_label = known_snps_vqsr.mix(dbsnp_vqsr).collect() - - // Recalibrate INDELs and SNPs separately - SENTIEON_VARCAL_INDEL( - vqsr_input, - resource_indels_vcf, - resource_indels_tbi, - indels_resource_label, - fasta, - fai) - - SENTIEON_VARCAL_SNP( - vqsr_input, - resource_snps_vcf, - resource_snps_tbi, - snps_resource_label, - fasta, - fai) - - //Prepare SNPs and INDELs for Sentieon's applyvarcal - // Step 1. : applyvarcal to SNPs - // Step 2. : Use SENTIEON_APPLYVARCAL_SNP output and run ApplyVQSR_INDEL. This avoids duplicate entries in the vcf as described here: https://hpc.nih.gov/training/gatk_tutorial/vqsr.html - - // Join results of variant recalibration into a single channel tuple - // Rework meta for variantscalled.csv and annotation tools - vqsr_input_snp = vqsr_input.join(SENTIEON_VARCAL_SNP.out.recal, failOnDuplicate: true) - .join(SENTIEON_VARCAL_SNP.out.idx, failOnDuplicate: true) - .join(SENTIEON_VARCAL_SNP.out.tranches, failOnDuplicate: true) - .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] } - - SENTIEON_APPLYVARCAL_SNP( - vqsr_input_snp, - fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, - fai.map{ fai -> [ [ id:fai.baseName ], fai ] }) - - // Join results of SENTIEON_APPLYVARCAL_SNP and use as input for SENTIEON_APPLYVARCAL_INDEL to avoid duplicate entries in the result - // Rework meta for variantscalled.csv and annotation tools - vqsr_input_indel = SENTIEON_APPLYVARCAL_SNP.out.vcf.join(SENTIEON_APPLYVARCAL_SNP.out.tbi).map{ meta, vcf, tbi -> [ meta + [ id:'joint_variant_calling' ], vcf, tbi ]} - .join(SENTIEON_VARCAL_INDEL.out.recal, 
failOnDuplicate: true) - .join(SENTIEON_VARCAL_INDEL.out.idx, failOnDuplicate: true) - .join(SENTIEON_VARCAL_INDEL.out.tranches, failOnDuplicate: true) - .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] } - - SENTIEON_APPLYVARCAL_INDEL( - vqsr_input_indel, - fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, - fai.map{ fai -> [ [ id:fai.baseName ], fai ] }) - - // The following is an ugly monster to achieve the following: - // When MERGE_GENOTYPEGVCFS and SENTIEON_APPLYVARCAL are run, then use output from SENTIEON_APPLYVARCAL - // When MERGE_GENOTYPEGVCFS and NOT SENTIEON_APPLYVARCAL, then use the output from MERGE_GENOTYPEGVCFS - - merge_vcf_for_join = MERGE_GENOTYPEGVCFS.out.vcf.map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]} - merge_tbi_for_join = MERGE_GENOTYPEGVCFS.out.tbi.map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]} - - // Remap for both to have the same key, if ApplyBQSR is not run, the channel is empty --> populate with empty elements - vqsr_vcf_for_join = SENTIEON_APPLYVARCAL_INDEL.out.vcf.ifEmpty([[:], []]).map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]} - vqsr_tbi_for_join = SENTIEON_APPLYVARCAL_INDEL.out.tbi.ifEmpty([[:], []]).map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]} - - // Join on metamap - // If both --> meta, vcf_merged, vcf_bqsr - // If not VQSR --> meta, vcf_merged, [] - // if the second is empty, use the first - genotype_vcf = merge_vcf_for_join.join(vqsr_vcf_for_join, remainder: true).map{ - meta, joint_vcf, recal_vcf -> - - vcf_out = recal_vcf ?: joint_vcf - - [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"sentieon_haplotyper"], vcf_out] - } - - genotype_index = merge_tbi_for_join.join(vqsr_tbi_for_join, remainder: true).map{ - meta, joint_tbi, recal_tbi -> - - tbi_out = recal_tbi ?: joint_tbi - [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"sentieon_haplotyper"], tbi_out] 
+ merged_vcf = MERGE_GENOTYPEGVCFS.out.vcf.map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]} + merged_tbi = MERGE_GENOTYPEGVCFS.out.tbi.map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]} + + if (variant_caller == 'sentieon_dnascope') { + genotype_index = merged_vcf + genotype_vcf = merged_tbi + } else { + vqsr_input = MERGE_GENOTYPEGVCFS.out.vcf.join(MERGE_GENOTYPEGVCFS.out.tbi, failOnDuplicate: true) + indels_resource_label = known_indels_vqsr.mix(dbsnp_vqsr).collect() + snps_resource_label = known_snps_vqsr.mix(dbsnp_vqsr).collect() + + // Recalibrate INDELs and SNPs separately + SENTIEON_VARCAL_INDEL( + vqsr_input, + resource_indels_vcf, + resource_indels_tbi, + indels_resource_label, + fasta, + fai) + + SENTIEON_VARCAL_SNP( + vqsr_input, + resource_snps_vcf, + resource_snps_tbi, + snps_resource_label, + fasta, + fai) + + //Prepare SNPs and INDELs for Sentieon's applyvarcal + // Step 1. : applyvarcal to SNPs + // Step 2. : Use SENTIEON_APPLYVARCAL_SNP output and run ApplyVQSR_INDEL. 
This avoids duplicate entries in the vcf as described here: https://hpc.nih.gov/training/gatk_tutorial/vqsr.html + + // Join results of variant recalibration into a single channel tuple + // Rework meta for variantscalled.csv and annotation tools + vqsr_input_snp = vqsr_input.join(SENTIEON_VARCAL_SNP.out.recal, failOnDuplicate: true) + .join(SENTIEON_VARCAL_SNP.out.idx, failOnDuplicate: true) + .join(SENTIEON_VARCAL_SNP.out.tranches, failOnDuplicate: true) + .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] } + + SENTIEON_APPLYVARCAL_SNP( + vqsr_input_snp, + fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, + fai.map{ fai -> [ [ id:fai.baseName ], fai ] }) + + // Join results of SENTIEON_APPLYVARCAL_SNP and use as input for SENTIEON_APPLYVARCAL_INDEL to avoid duplicate entries in the result + // Rework meta for variantscalled.csv and annotation tools + vqsr_input_indel = SENTIEON_APPLYVARCAL_SNP.out.vcf.join(SENTIEON_APPLYVARCAL_SNP.out.tbi).map{ meta, vcf, tbi -> [ meta + [ id:'joint_variant_calling' ], vcf, tbi ]} + .join(SENTIEON_VARCAL_INDEL.out.recal, failOnDuplicate: true) + .join(SENTIEON_VARCAL_INDEL.out.idx, failOnDuplicate: true) + .join(SENTIEON_VARCAL_INDEL.out.tranches, failOnDuplicate: true) + .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] } + + SENTIEON_APPLYVARCAL_INDEL( + vqsr_input_indel, + fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, + fai.map{ fai -> [ [ id:fai.baseName ], fai ] }) + + // The following is an ugly monster to achieve the following: + // When MERGE_GENOTYPEGVCFS and SENTIEON_APPLYVARCAL are run, then use output from SENTIEON_APPLYVARCAL + // When MERGE_GENOTYPEGVCFS and NOT SENTIEON_APPLYVARCAL, then use the output from MERGE_GENOTYPEGVCFS + + // Remap for both to have the same key, if ApplyBQSR is not run, the channel is empty --> populate with 
empty elements + vqsr_vcf_for_join = SENTIEON_APPLYVARCAL_INDEL.out.vcf.ifEmpty([[:], []]).map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]} + vqsr_tbi_for_join = SENTIEON_APPLYVARCAL_INDEL.out.tbi.ifEmpty([[:], []]).map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]} + + // Join on metamap + // If both --> meta, vcf_merged, vcf_bqsr + // If not VQSR --> meta, vcf_merged, [] + // if the second is empty, use the first + genotype_vcf = merged_vcf.join(vqsr_vcf_for_join, remainder: true).map{ + meta, joint_vcf, recal_vcf -> + + vcf_out = recal_vcf ?: joint_vcf + + [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"sentieon_haplotyper"], vcf_out] + } + + genotype_index = merged_tbi.join(vqsr_tbi_for_join, remainder: true).map{ + meta, joint_tbi, recal_tbi -> + + tbi_out = recal_tbi ?: joint_tbi + [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"sentieon_haplotyper"], tbi_out] + } + + versions = versions.mix(SENTIEON_VARCAL_SNP.out.versions) + versions = versions.mix(SENTIEON_VARCAL_INDEL.out.versions) + versions = versions.mix(SENTIEON_APPLYVARCAL_INDEL.out.versions) } versions = versions.mix(SENTIEON_GVCFTYPER.out.versions) - versions = versions.mix(SENTIEON_VARCAL_SNP.out.versions) - versions = versions.mix(SENTIEON_VARCAL_INDEL.out.versions) - versions = versions.mix(SENTIEON_APPLYVARCAL_INDEL.out.versions) emit: genotype_index // channel: [ val(meta), [ tbi ] ] diff --git a/subworkflows/local/bam_variant_calling_germline_all/main.nf b/subworkflows/local/bam_variant_calling_germline_all/main.nf index 8af871c013..d288e31fd9 100644 --- a/subworkflows/local/bam_variant_calling_germline_all/main.nf +++ b/subworkflows/local/bam_variant_calling_germline_all/main.nf @@ -222,7 +222,8 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { known_indels_vqsr, known_sites_snps, known_sites_snps_tbi, - known_snps_vqsr) + known_snps_vqsr, + 'sentieon_dnascope') vcf_sentieon_dnascope = BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.genotype_vcf 
versions = versions.mix(BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.versions) @@ -283,7 +284,8 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { known_indels_vqsr, known_sites_snps, known_sites_snps_tbi, - known_snps_vqsr) + known_snps_vqsr, + 'sentieon_haplotyper') vcf_sentieon_haplotyper = BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.genotype_vcf versions = versions.mix(BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.versions) From 01562e7e79e6d052a25edd1a2fa78dff705d86fb Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 7 Sep 2023 09:39:16 +0000 Subject: [PATCH 12/50] Adding sentieon Dnascope to some error msgs and warnings --- workflows/sarek.nf | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/workflows/sarek.nf b/workflows/sarek.nf index f8bdbf46e7..4e55ad5756 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -254,16 +254,35 @@ if (!params.dbsnp && !params.known_indels) { if (params.step in ['mapping', 'markduplicates', 'prepare_recalibration', 'recalibrate'] && (!params.skip_tools || (params.skip_tools && !params.skip_tools.split(',').contains('baserecalibrator')))) { error("Base quality score recalibration requires at least one resource file. Please provide at least one of `--dbsnp` or `--known_indels`\nYou can skip this step in the workflow by adding `--skip_tools baserecalibrator` to the command.") } - if (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper'))) { - log.warn "If GATK's Haplotypecaller or Sentieon's Haplotyper is specified, without `--dbsnp` or `--known_indels no filtering will be done. 
For filtering, please provide at least one of `--dbsnp` or `--known_indels`.\nFor more information see FilterVariantTranches (single-sample, default): https://gatk.broadinstitute.org/hc/en-us/articles/5358928898971-FilterVariantTranches\nFor more information see VariantRecalibration (--joint_germline): https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator\nFor more information on GATK Best practice germline variant calling: https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-" + if (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope'))) { + log.warn "If GATK's Haplotypecaller, Sentieon's Dnascpe or Sentieon's Haplotyper is specified, without `--dbsnp` or `--known_indels no filtering will be done. For filtering, please provide at least one of `--dbsnp` or `--known_indels`.\nFor more information see FilterVariantTranches (single-sample, default): https://gatk.broadinstitute.org/hc/en-us/articles/5358928898971-FilterVariantTranches\nFor more information see VariantRecalibration (--joint_germline): https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator\nFor more information on GATK Best practice germline variant calling: https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-" } } -if (params.joint_germline && (!params.tools || !(params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper')))) { - error("The GATK's Haplotypecaller or Sentieon's Haplotyper should be specified as one of the tools when doing joint germline variant calling.) 
") +if (params.joint_germline && (!params.tools || !(params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope')))) { + error("The GATK's Haplotypecaller, Sentieon's Dnascope or Sentieon's Haplotyper should be specified as one of the tools when doing joint germline variant calling.) ") } -if (params.joint_germline && (!params.dbsnp || !params.known_indels || !params.known_snps || params.no_intervals)) { - log.warn "If GATK's Haplotypecaller or Sentieon's Haplotyper is specified, without `--dbsnp`, `--known_snps`, `--known_indels` or the associated resource labels (ie `known_snps_vqsr`), no variant recalibration will be done. For recalibration you must provide all of these resources.\nFor more information see VariantRecalibration: https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator \nJoint germline variant calling also requires intervals in order to genotype the samples. As a result, if `--no_intervals` is set to `true` the joint germline variant calling will not be performed." +if ( + params.tools && + ( + params.tools.split(',').contains('haplotypecaller') || + params.tools.split(',').contains('sentieon_haplotyper') || + params.tools.split(',').contains('sentieon_dnascope') + ) && + params.joint_germline && + ( + !params.dbsnp || + !params.known_indels || + !params.known_snps || + params.no_intervals + ) + ) { + log.warn("""If GATK's Haplotypecaller, Sentieon's Dnascope and/or Sentieon's Haplotyper is specified, \ +but without `--dbsnp`, `--known_snps`, `--known_indels` or the associated resource labels (ie `known_snps_vqsr`), \ +no variant recalibration will be done. 
For recalibration you must provide all of these resources.\nFor more information \ +see VariantRecalibration: https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator \n\ +Joint germline variant calling also requires intervals in order to genotype the samples.\ +As a result, if `--no_intervals` is set to `true` the joint germline variant calling will not be performed.""") } // Fails when --joint_mutect2 is used without enabling mutect2 From 0f4080cc6ab01deac130cc2534dfed2ff54c90d7 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 13 Sep 2023 12:25:11 +0000 Subject: [PATCH 13/50] params.joint_germline was unintentionally out-commented --- nextflow.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 5c9d6cdf30..fd800c1ed2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -65,7 +65,8 @@ params { cf_window = null // by default we are not using this in Control-FREEC cnvkit_reference = null // by default the reference is build from the fasta file concatenate_vcfs = false // by default we don't concatenate the germline-vcf-files - ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected + ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 + joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling only_paired_variant_calling = false // if true, skips germline variant calling for normal-paired sample sentieon_dnascope_emit_mode = "variant" // default value for Sentieon dnascope From cc727860354239b97a6e1ea5c05241b710b614b0 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 14 Sep 2023 07:00:13 +0000 Subject: [PATCH 14/50] Continued implementing Sentieon Dnascope --- 
conf/modules/sentieon_dnascope.config | 22 ++++++++----------- .../main.nf | 1 + .../bam_variant_calling_germline_all/main.nf | 5 ++--- .../main.nf | 16 +++++++------- workflows/sarek.nf | 1 + 5 files changed, 21 insertions(+), 24 deletions(-) diff --git a/conf/modules/sentieon_dnascope.config b/conf/modules/sentieon_dnascope.config index 4378a35f5a..325a1642a9 100644 --- a/conf/modules/sentieon_dnascope.config +++ b/conf/modules/sentieon_dnascope.config @@ -26,35 +26,31 @@ process { ] } - // TO-DO: Clean up all this stuff below concerning SENTIEON_HAPLOTYPER. - // It is just copied from sentieon_haplotyper.config, but some of it - // may also be needed for Dnascope. - - withName: 'MERGE_SENTIEON_HAPLOTYPER_VCFS' { - ext.prefix = { params.joint_germline ? "${meta.id}.haplotyper.g" : "${meta.id}.haplotyper.unfiltered" } + withName: 'MERGE_SENTIEON_DNASCOPE_VCFS' { + ext.prefix = { params.joint_germline ? "${meta.id}.dnascope.g" : "${meta.id}.dnascope.unfiltered" } publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/variant_calling/sentieon_haplotyper/${meta.id}/" }, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/${meta.id}/" }, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: 'MERGE_SENTIEON_HAPLOTYPER_GVCFS' { - ext.prefix = { "${meta.id}.haplotyper.g" } + withName: 'MERGE_SENTIEON_DNASCOPE_GVCFS' { + ext.prefix = { "${meta.id}.dnascope.g" } publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/variant_calling/sentieon_haplotyper/${meta.id}/" }, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/${meta.id}/" }, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - if (params.tools && params.tools.contains('sentieon_haplotyper')) { + if (params.tools && params.tools.contains('sentieon_dnascope')) { withName: '.*FILTERVARIANTTRANCHES' { - ext.prefix = {"${meta.id}.haplotyper"} + ext.prefix = {"${meta.id}.dnascope"} ext.args = { "--info-key CNN_1D" } publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/variant_calling/sentieon_haplotyper/${meta.id}/"}, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/${meta.id}/"}, pattern: "*{vcf.gz,vcf.gz.tbi}" ] } diff --git a/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf b/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf index ad22a44ba7..ae34721ed6 100644 --- a/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf +++ b/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf @@ -50,6 +50,7 @@ workflow BAM_JOINT_CALLING_GERMLINE_SENTIEON { merged_tbi = MERGE_GENOTYPEGVCFS.out.tbi.map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]} if (variant_caller == 'sentieon_dnascope') { + // As advised by Don Freed (Sentieon), VQSR is skipped for DnaScope genotype_index = merged_vcf genotype_vcf = merged_tbi } else { diff --git a/subworkflows/local/bam_variant_calling_germline_all/main.nf b/subworkflows/local/bam_variant_calling_germline_all/main.nf index d288e31fd9..8c48a157a2 100644 --- a/subworkflows/local/bam_variant_calling_germline_all/main.nf +++ b/subworkflows/local/bam_variant_calling_germline_all/main.nf @@ -209,7 +209,7 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { gvcf_tbi_sentieon_dnascope = BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.gvcf_tbi if (joint_germline) { - BAM_JOINT_CALLING_GERMLINE_SENTIEON( // TO-DO: Check that subworkflow is okay for Dnascope as well. Email sent to Don. 
+ BAM_JOINT_CALLING_GERMLINE_SENTIEON( BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.genotype_intervals, fasta, fasta_fai, @@ -228,7 +228,6 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { vcf_sentieon_dnascope = BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.genotype_vcf versions = versions.mix(BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.versions) } else { - // If single sample track, check if filtering should be done if (!(skip_tools && skip_tools.split(',').contains('dnascope_filter'))) { @@ -353,11 +352,11 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { vcf_all vcf_deepvariant vcf_freebayes - vcf_sentieon_dnascope vcf_haplotypecaller vcf_manta vcf_mpileup vcf_strelka + vcf_sentieon_dnascope vcf_sentieon_haplotyper gvcf_sentieon_haplotyper vcf_tiddit diff --git a/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf b/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf index 6a395af18f..9eea9b2d61 100644 --- a/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf +++ b/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf @@ -37,7 +37,7 @@ workflow BAM_VARIANT_CALLING_SENTIEON_DNASCOPE { meta + [ num_intervals:num_intervals, intervals_name:intervals.simpleName, - variantcaller:'SENTIEON_DNASCOPE'], + variantcaller:'sentieon_dnascope'], cram, crai, intervals @@ -67,7 +67,7 @@ workflow BAM_VARIANT_CALLING_SENTIEON_DNASCOPE { } // Figure out if using intervals or no_intervals - haplotyper_vcf_branch = SENTIEON_DNASCOPE.out.vcf.map{ + dnascope_vcf_branch = SENTIEON_DNASCOPE.out.vcf.map{ meta, vcf -> [ meta - meta.subMap('interval_name'), vcf] } .branch{ @@ -75,7 +75,7 @@ workflow BAM_VARIANT_CALLING_SENTIEON_DNASCOPE { no_intervals: it[0].num_intervals <= 1 } - haplotyper_vcf_tbi_branch = SENTIEON_DNASCOPE.out.vcf_tbi.map{ + dnascope_vcf_tbi_branch = SENTIEON_DNASCOPE.out.vcf_tbi.map{ meta, vcf_tbi -> [ meta - meta.subMap('interval_name'), vcf_tbi] } .branch{ @@ -99,7 +99,7 @@ workflow BAM_VARIANT_CALLING_SENTIEON_DNASCOPE { no_intervals: 
it[0].num_intervals <= 1 } - vcfs_for_merging = haplotyper_vcf_branch.intervals.map{ + vcfs_for_merging = dnascope_vcf_branch.intervals.map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ]} vcfs_for_merging = vcfs_for_merging.map{ @@ -111,16 +111,16 @@ workflow BAM_VARIANT_CALLING_SENTIEON_DNASCOPE { // Only when using intervals MERGE_SENTIEON_DNASCOPE_VCFS(vcfs_for_merging, dict) - haplotyper_vcf = Channel.empty().mix( + dnascope_vcf = Channel.empty().mix( MERGE_SENTIEON_DNASCOPE_VCFS.out.vcf, - haplotyper_vcf_branch.no_intervals) + dnascope_vcf_branch.no_intervals) haplotyper_tbi = Channel.empty().mix( MERGE_SENTIEON_DNASCOPE_VCFS.out.tbi, - haplotyper_vcf_tbi_branch.no_intervals) + dnascope_vcf_tbi_branch.no_intervals) // Remove no longer necessary field: num_intervals - vcf = haplotyper_vcf.map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ] } + vcf = dnascope_vcf.map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ] } vcf_tbi = haplotyper_tbi.map{ meta, tbi -> [ meta - meta.subMap('num_intervals'), tbi ] } // GVFs diff --git a/workflows/sarek.nf b/workflows/sarek.nf index b5e1c1bece..c38e97871d 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -1227,6 +1227,7 @@ workflow SAREK { vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_freebayes) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_haplotypecaller) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_manta) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_sentieon_dnascope) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_sentieon_haplotyper) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_strelka) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_tiddit) From 5cf6520a96d3c7a8d2ead8678df7aadf55e81fbd Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 14 Sep 2023 
10:43:46 +0000 Subject: [PATCH 15/50] Fixing name of pytest. Replacing problematic ampersand character --- tests/test_gatk4_spark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_gatk4_spark.yml b/tests/test_gatk4_spark.yml index 32082446ec..8dbc8fb974 100644 --- a/tests/test_gatk4_spark.yml +++ b/tests/test_gatk4_spark.yml @@ -48,7 +48,7 @@ # conda changes md5sums for test - path: results/preprocessing/mapped/ should_exist: false -- name: Run default pipeline with gatk4_spark & skipping all QC steps +- name: Run default pipeline with gatk4_spark and skipping all QC steps command: nextflow run main.nf -profile test_cache,use_gatk_spark --skip_tools fastqc,markduplicates_report,mosdepth,multiqc,samtools --outdir results tags: - gatk4_spark From ccdf4e5422bdf98a435aabde57d436e98714cc56 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 14 Sep 2023 11:24:46 +0000 Subject: [PATCH 16/50] Bug fix: Sent tbi to vcf-channel and vcf to tbi-channel --- .../local/bam_joint_calling_germline_sentieon/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf b/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf index ae34721ed6..4092bd4775 100644 --- a/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf +++ b/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf @@ -51,8 +51,8 @@ workflow BAM_JOINT_CALLING_GERMLINE_SENTIEON { if (variant_caller == 'sentieon_dnascope') { // As advised by Don Freed (Sentieon), VQSR is skipped for DnaScope - genotype_index = merged_vcf - genotype_vcf = merged_tbi + genotype_index = merged_tbi + genotype_vcf = merged_vcf } else { vqsr_input = MERGE_GENOTYPEGVCFS.out.vcf.join(MERGE_GENOTYPEGVCFS.out.tbi, failOnDuplicate: true) indels_resource_label = known_indels_vqsr.mix(dbsnp_vqsr).collect() From d74ec55ebfd10c677ddeb0fcf421788c7be9dbb3 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 14 Sep 2023 11:43:22 
+0000 Subject: [PATCH 17/50] Adding a bit of error-handling for sentieon-based joint-germline variant-calling --- workflows/sarek.nf | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/workflows/sarek.nf b/workflows/sarek.nf index c38e97871d..1f851ac12b 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -281,10 +281,33 @@ if ( but without `--dbsnp`, `--known_snps`, `--known_indels` or the associated resource labels (ie `known_snps_vqsr`), \ no variant recalibration will be done. For recalibration you must provide all of these resources.\nFor more information \ see VariantRecalibration: https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator \n\ -Joint germline variant calling also requires intervals in order to genotype the samples.\ +Joint germline variant calling also requires intervals in order to genotype the samples. \ As a result, if `--no_intervals` is set to `true` the joint germline variant calling will not be performed.""") } +if (params.tools && + params.tools.split(',').contains('sentieon_dnascope') && + params.joint_germline && + ( + !params.sentieon_dnascope_emit_mode || + !params.sentieon_dnascope_emit_mode.split(',').contains('gvcf') + ) + ) { + error("When using Sentieon Dnascope for joint-germline variant-calling the option `--sentieon_dnascope_emit_mode` has to include `gvcf`.") +} + +if (params.tools && + params.tools.split(',').contains('sentieon_haplotyper') && + params.joint_germline && + ( + !params.sentieon_haplotyper_emit_mode || + !params.sentieon_haplotyper_emit_mode.split(',').contains('gvcf') + ) + ) { + error("When using Sentieon Haplotyper for joint-germline variant-calling the option `--sentieon_haplotyper_emit_mode` has to include `gvcf`.") +} + + // Fails when --joint_mutect2 is used without enabling mutect2 if (params.joint_mutect2 && (!params.tools || !params.tools.split(',').contains('mutect2'))) { error("The mutect2 should be specified as one of the 
tools when doing joint somatic variant calling with Mutect2. (The mutect2 could be specified by adding `--tools mutect2` to the nextflow command.)") From a367e37c27cff34b0ca11aa69df97b005fc16296 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 14 Sep 2023 15:27:57 +0000 Subject: [PATCH 18/50] Separate configs for sentieon_dnascope_joint_germline and sentieon_haplotyper_joint_germline --- .../sentieon_dnascope_joint_germline.config | 45 +++++++++++++++++++ ...sentieon_haplotyper_joint_germline.config} | 0 nextflow.config | 3 +- 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 conf/modules/sentieon_dnascope_joint_germline.config rename conf/modules/{sentieon_joint_germline.config => sentieon_haplotyper_joint_germline.config} (100%) diff --git a/conf/modules/sentieon_dnascope_joint_germline.config b/conf/modules/sentieon_dnascope_joint_germline.config new file mode 100644 index 0000000000..72dd6c3144 --- /dev/null +++ b/conf/modules/sentieon_dnascope_joint_germline.config @@ -0,0 +1,45 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// SENTIEON DNASCOPE JOINT_GERMLINE + +process { + + // TO-DO: duplicate!! 
+ withName: 'SENTIEON_GVCFTYPER' { + ext.args = { "--allow-old-rms-mapping-quality-annotation-data" } + ext.prefix = { meta.intervals_name } + publishDir = [ + enabled: false + ] + } + + if (params.tools && params.tools.contains('sentieon_dnascope') && params.joint_germline) { + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_JOINT_CALLING_GERMLINE_SENTIEON:BCFTOOLS_SORT' { + ext.prefix = { vcf.baseName - ".vcf" + ".sort" } + publishDir = [ + enabled: false + ] + } + + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_JOINT_CALLING_GERMLINE_SENTIEON:MERGE_GENOTYPEGVCFS' { + ext.prefix = "joint_germline" + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/joint_variant_calling/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + } +} diff --git a/conf/modules/sentieon_joint_germline.config b/conf/modules/sentieon_haplotyper_joint_germline.config similarity index 100% rename from conf/modules/sentieon_joint_germline.config rename to conf/modules/sentieon_haplotyper_joint_germline.config diff --git a/nextflow.config b/nextflow.config index fd800c1ed2..0dfd0d9331 100644 --- a/nextflow.config +++ b/nextflow.config @@ -376,7 +376,8 @@ includeConfig 'conf/modules/msisensorpro.config' includeConfig 'conf/modules/mutect2.config' includeConfig 'conf/modules/sentieon_dnascope.config' includeConfig 'conf/modules/sentieon_haplotyper.config' -includeConfig 'conf/modules/sentieon_joint_germline.config' +includeConfig 'conf/modules/sentieon_haplotyper_joint_germline.config' +includeConfig 'conf/modules/sentieon_dnascope_joint_germline.config' includeConfig 'conf/modules/strelka.config' includeConfig 'conf/modules/tiddit.config' From f55c22eda8f1fce7bd92342c5007f050c6f9d2c9 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 14 Sep 2023 15:28:42 +0000 Subject: [PATCH 19/50] Adding tests for dnascope (excel 
joint-germline) --- tests/test_sentieon_dnascope.yml | 147 +++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 tests/test_sentieon_dnascope.yml diff --git a/tests/test_sentieon_dnascope.yml b/tests/test_sentieon_dnascope.yml new file mode 100644 index 0000000000..94835af4a8 --- /dev/null +++ b/tests/test_sentieon_dnascope.yml @@ -0,0 +1,147 @@ +- name: Run variant calling on germline sample with sentieons dnascope + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results + tags: + - germline + - sentieon/dnascope + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: b2144d21a0ebfd807a8646f1751d0ddc + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt + md5sum: 912c7d5b31784c50e0a75b4fcfa4997b + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary + md5sum: ce2769af8f853b93d9e16b6493fc7e0d + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files.
+ - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/dnascope + should_exist: false +- name: Run variant calling on germline sample with sentieons dnascope without intervals + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --no_intervals --outdir results + tags: + - germline + - sentieon/dnascope + - no_intervals + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: b2144d21a0ebfd807a8646f1751d0ddc + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt + md5sum: 912c7d5b31784c50e0a75b4fcfa4997b + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary + md5sum: ce2769af8f853b93d9e16b6493fc7e0d + - path: 
results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files. + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/sentieon_dnascope + should_exist: false +- name: Run variant calling on germline sample with sentieons dnascope output gvcf + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results --sentieon_dnascope_emit_mode gvcf + tags: + - germline + - sentieon/dnascope + - variant_calling + files: + - path: results/csv/variantcalled.csv + should_exist: false + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary + should_exist: false + - 
path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + should_exist: false + - path: results/dnascope + should_exist: false +- name: Run variant calling on germline sample with sentieons dnascope output both gvcf and vcf + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results --sentieon_dnascope_emit_mode variant,gvcf + tags: + - germline + - sentieon/dnascope + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: b2144d21a0ebfd807a8646f1751d0ddc + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary + - path: 
results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + - path: results/dnascope + should_exist: false From a10ad132caaa74a7a5a1bfca2edd414dc7ec4308 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 14 Sep 2023 15:30:19 +0000 Subject: [PATCH 20/50] Adding meta tags patient and variantcaller to channels genotype_vcf and genotype_tbi. (Needed for output csv-file.) 
--- .../local/bam_joint_calling_germline_sentieon/main.nf | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf b/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf index 4092bd4775..3f19b33d52 100644 --- a/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf +++ b/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf @@ -51,8 +51,12 @@ workflow BAM_JOINT_CALLING_GERMLINE_SENTIEON { if (variant_caller == 'sentieon_dnascope') { // As advised by Don Freed (Sentieon), VQSR is skipped for DnaScope - genotype_index = merged_tbi - genotype_vcf = merged_vcf + genotype_vcf = merged_vcf.map{ + meta, vcf -> [ meta + [ patient:"all_samples", variantcaller:'sentieon_dnascope'], vcf ] + } + genotype_index = merged_tbi.map{ + meta, tbi -> [ meta + [ patient:"all_samples", variantcaller:'sentieon_dnascope'], tbi ] + } } else { vqsr_input = MERGE_GENOTYPEGVCFS.out.vcf.join(MERGE_GENOTYPEGVCFS.out.tbi, failOnDuplicate: true) indels_resource_label = known_indels_vqsr.mix(dbsnp_vqsr).collect() From 6c78764c830fc9876ef47ebec757cf372de92499 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 14 Sep 2023 15:31:11 +0000 Subject: [PATCH 21/50] joint germline with dnascope --- subworkflows/local/bam_variant_calling_germline_all/main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/bam_variant_calling_germline_all/main.nf b/subworkflows/local/bam_variant_calling_germline_all/main.nf index 8c48a157a2..b35f28d827 100644 --- a/subworkflows/local/bam_variant_calling_germline_all/main.nf +++ b/subworkflows/local/bam_variant_calling_germline_all/main.nf @@ -349,6 +349,8 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { ) emit: + gvcf_sentieon_dnascope + gvcf_sentieon_haplotyper vcf_all vcf_deepvariant vcf_freebayes @@ -358,7 +360,6 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { vcf_strelka vcf_sentieon_dnascope vcf_sentieon_haplotyper - 
gvcf_sentieon_haplotyper vcf_tiddit versions From c48f68c05ba00a4b369e742906628c7af59d0f37 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 14 Sep 2023 15:32:02 +0000 Subject: [PATCH 22/50] test_sentieon_joint_germline.yml -> test_sentieon_haplotyper_joint_germline.yml --- tests/config/tags.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/config/tags.yml b/tests/config/tags.yml index 851d80c225..9f98cc6a2b 100644 --- a/tests/config/tags.yml +++ b/tests/config/tags.yml @@ -373,7 +373,7 @@ sentieon_joint_germline: - subworkflows/local/bam_variant_calling_germline_all/main.nf - subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf - tests/csv/3.0/mapped_joint_bam.csv - - tests/test_sentieon_joint_germline.yml + - tests/test_sentieon_haplotyper_joint_germline.yml ## manta manta: From af59cd351760e39251e7650f5def500844e209af Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 14 Sep 2023 15:32:39 +0000 Subject: [PATCH 23/50] Adding test for joint-germline with dnascope --- .../test_sentieon_dnascope_joint_germline.yml | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 tests/test_sentieon_dnascope_joint_germline.yml diff --git a/tests/test_sentieon_dnascope_joint_germline.yml b/tests/test_sentieon_dnascope_joint_germline.yml new file mode 100644 index 0000000000..e27865372e --- /dev/null +++ b/tests/test_sentieon_dnascope_joint_germline.yml @@ -0,0 +1,56 @@ +- name: Run joint germline variant calling with sentieon dnascope + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf + tags: + - germline + - sentieon_joint_germline + - variant_calling + - sentieon/dnascope + files: + - path: results/csv/variantcalled.csv + md5sum: 62d70060aad96337254efe2d7a1df170 + - path: results/multiqc + - path: 
results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/joint_variant_calling/joint_germline.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.FILTER.summary + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.TsTv.count + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.TsTv.qual + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz.tbi + - path: results/dnascope + should_exist: false +- name: Run joint germline variant calling with sentieon dnascope all intervals at once + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf --nucleotides_per_second 100 + tags: + - germline + - sentieon_joint_germline + - variant_calling + - sentieon/dnascope + files: + - path: results/csv/variantcalled.csv + md5sum: 62d70060aad96337254efe2d7a1df170 + - path: results/multiqc + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: 
results/reports/bcftools/sentieon_dnascope/joint_variant_calling/joint_germline.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.FILTER.summary + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.TsTv.count + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.TsTv.qual + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz.tbi + - path: results/dnascope + should_exist: false From d0bb50ddb0fd8e60196d16d2d1733d5ff7ff75c4 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 14 Sep 2023 15:33:20 +0000 Subject: [PATCH 24/50] test_sentieon_joint_germline.yml -> test_sentieon_haplotyper_joint_germline.yml --- ...t_germline.yml => test_sentieon_haplotyper_joint_germline.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_sentieon_joint_germline.yml => test_sentieon_haplotyper_joint_germline.yml} (100%) diff --git a/tests/test_sentieon_joint_germline.yml b/tests/test_sentieon_haplotyper_joint_germline.yml similarity index 100% rename from tests/test_sentieon_joint_germline.yml rename to tests/test_sentieon_haplotyper_joint_germline.yml From a063f3f0151d863fd00502f71a26353f19d62716 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 14 Sep 2023 15:48:06 +0000 Subject: [PATCH 25/50] initializing output channel gvcf_sentieon_dnascope --- subworkflows/local/bam_variant_calling_germline_all/main.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff 
--git a/subworkflows/local/bam_variant_calling_germline_all/main.nf b/subworkflows/local/bam_variant_calling_germline_all/main.nf index b35f28d827..eea9c2e0b8 100644 --- a/subworkflows/local/bam_variant_calling_germline_all/main.nf +++ b/subworkflows/local/bam_variant_calling_germline_all/main.nf @@ -50,6 +50,9 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { versions = Channel.empty() //TODO: Temporary until the if's can be removed and printing to terminal is prevented with "when" in the modules.config + gvcf_sentieon_dnascope = Channel.empty() + gvcf_sentieon_haplotyper = Channel.empty() + vcf_deepvariant = Channel.empty() vcf_freebayes = Channel.empty() vcf_haplotypecaller = Channel.empty() @@ -57,7 +60,6 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { vcf_mpileup = Channel.empty() vcf_sentieon_dnascope = Channel.empty() vcf_sentieon_haplotyper = Channel.empty() - gvcf_sentieon_haplotyper = Channel.empty() vcf_strelka = Channel.empty() vcf_tiddit = Channel.empty() From 2902b5b0304aef6329ac1a6c4b832ab25862a6b4 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Thu, 14 Sep 2023 22:21:03 +0000 Subject: [PATCH 26/50] secrets not secret --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 0f794f54e4..63e338c3c6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1062,7 +1062,7 @@ In particular, Sentieon contains what may be view as speedup version of some sta Sentieon supply license in the form of a string-value (a url) or a file. It should be base64-encoded and stored in a nextflow secret named `SENTIEON_LICENSE_BASE64`. 
If a license string (url) is supplied, then the nextflow secret should be set like this: ```bash -nextflow secret set SENTIEON_LICENSE_BASE64 $(echo -n | base64 -w 0) +nextflow secrets set SENTIEON_LICENSE_BASE64 $(echo -n | base64 -w 0) ``` If a license file is supplied, then the nextflow secret should be set like this: From 8d13939d840cc8a90fa8b1944e2eefcb384b63e4 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Fri, 15 Sep 2023 06:08:13 +0000 Subject: [PATCH 27/50] include configs in alphabetical order --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 0dfd0d9331..f2d90dd3cf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -375,9 +375,9 @@ includeConfig 'conf/modules/mpileup.config' includeConfig 'conf/modules/msisensorpro.config' includeConfig 'conf/modules/mutect2.config' includeConfig 'conf/modules/sentieon_dnascope.config' +includeConfig 'conf/modules/sentieon_dnascope_joint_germline.config' includeConfig 'conf/modules/sentieon_haplotyper.config' includeConfig 'conf/modules/sentieon_haplotyper_joint_germline.config' -includeConfig 'conf/modules/sentieon_dnascope_joint_germline.config' includeConfig 'conf/modules/strelka.config' includeConfig 'conf/modules/tiddit.config' From c2cd5880a3d646d1bb7182ca686543e647e26dee Mon Sep 17 00:00:00 2001 From: asp8200 Date: Fri, 15 Sep 2023 06:13:45 +0000 Subject: [PATCH 28/50] Adding tag for sentieon/dnascope (excl joint-germline) --- tests/config/tags.yml | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tests/config/tags.yml b/tests/config/tags.yml index 9f98cc6a2b..2440bf7fad 100644 --- a/tests/config/tags.yml +++ b/tests/config/tags.yml @@ -314,6 +314,22 @@ haplotypecaller_skip_filter: - tests/csv/3.0/mapped_single_bam.csv - tests/test_haplotypecaller_skip_filter.yml +## sentieon/dnascope +sentieon/dnascope: + - conf/modules/sentieon_dnascope.config + - modules/nf-core/gatk4/cnnscorevariants/main.nf + - 
modules/nf-core/gatk4/filtervarianttranches/main.nf + - modules/nf-core/sentieon/dnascope/main.nf + - modules/nf-core/gatk4/mergevcfs/main.nf + - modules/nf-core/samtools/index/main.nf + - modules/nf-core/samtools/merge/main.nf + - subworkflows/local/bam_merge_index_samtools/main.nf + - subworkflows/local/bam_variant_calling_germline_all/main.nf + - subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf + - subworkflows/local/vcf_variant_filtering_gatk/main.nf + - tests/csv/3.0/mapped_single_bam.csv + - tests/test_sentieon_dnascope.yml + ## sentieon/haplotyper sentieon/haplotyper: - conf/modules/sentieon_haplotyper.config @@ -364,11 +380,11 @@ joint_germline: - tests/csv/3.0/mapped_joint_bam.csv - tests/test_joint_germline.yml -## sentieon_joint_germline -sentieon_joint_germline: +## sentieon_haplotyper_joint_germline +sentieon_haplotyper_joint_germline: - conf/modules/prepare_genome.config - conf/modules/sentieon_haplotyper.config - - conf/modules/sentieon_joint_germline.config + - conf/modules/sentieon_haplotyper_joint_germline.config - modules/nf-core/sentieon/haplotyper/main.nf - subworkflows/local/bam_variant_calling_germline_all/main.nf - subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf From de09b05908f1e40218c21e550cc40235b634d0ee Mon Sep 17 00:00:00 2001 From: asp8200 Date: Fri, 15 Sep 2023 06:21:31 +0000 Subject: [PATCH 29/50] update tags --- tests/test_sentieon_dnascope_joint_germline.yml | 4 ++-- tests/test_sentieon_haplotyper_joint_germline.yml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_sentieon_dnascope_joint_germline.yml b/tests/test_sentieon_dnascope_joint_germline.yml index e27865372e..a25117b3b9 100644 --- a/tests/test_sentieon_dnascope_joint_germline.yml +++ b/tests/test_sentieon_dnascope_joint_germline.yml @@ -2,7 +2,7 @@ command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling 
--joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf tags: - germline - - sentieon_joint_germline + - sentieon_dnascope_joint_germline - variant_calling - sentieon/dnascope files: @@ -31,7 +31,7 @@ command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf --nucleotides_per_second 100 tags: - germline - - sentieon_joint_germline + - sentieon_dnascope_joint_germline - variant_calling - sentieon/dnascope files: diff --git a/tests/test_sentieon_haplotyper_joint_germline.yml b/tests/test_sentieon_haplotyper_joint_germline.yml index 4b42a505a9..c08b07485e 100644 --- a/tests/test_sentieon_haplotyper_joint_germline.yml +++ b/tests/test_sentieon_haplotyper_joint_germline.yml @@ -2,7 +2,7 @@ command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_haplotyper --step variant_calling --joint_germline --outdir results --sentieon_haplotyper_emit_mode gvcf tags: - germline - - sentieon_joint_germline + - sentieon_haplotyper_joint_germline - variant_calling - sentieon/haplotyper files: @@ -31,7 +31,7 @@ command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_haplotyper --step variant_calling --joint_germline --outdir results --sentieon_haplotyper_emit_mode gvcf --nucleotides_per_second 100 tags: - germline - - sentieon_joint_germline + - sentieon_haplotyper_joint_germline - variant_calling - sentieon/haplotyper files: @@ -58,7 +58,7 @@ command: nextflow run main.nf -profile test_cache,tools_germline --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_haplotyper --step variant_calling --joint_germline --outdir results --sentieon_haplotyper_emit_mode gvcf -stub-run tags: - germline - - sentieon_joint_germline + - sentieon_haplotyper_joint_germline - variant_calling - 
vqsr files: From 704fd8044a6693a6a324ae53da52e03b30e19875 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Sun, 17 Sep 2023 19:50:35 +0000 Subject: [PATCH 30/50] Adding tag sentieon_dnascope_joint_germline --- tests/config/tags.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/config/tags.yml b/tests/config/tags.yml index 2440bf7fad..53e7c23a0f 100644 --- a/tests/config/tags.yml +++ b/tests/config/tags.yml @@ -380,6 +380,17 @@ joint_germline: - tests/csv/3.0/mapped_joint_bam.csv - tests/test_joint_germline.yml +## sentieon_dnascope_joint_germline +sentieon_dnascope_joint_germline: + - conf/modules/prepare_genome.config + - conf/modules/sentieon_dnascope.config + - conf/modules/sentieon_dnascope_joint_germline.config + - modules/nf-core/sentieon/dnascope/main.nf + - subworkflows/local/bam_variant_calling_germline_all/main.nf + - subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf + - tests/csv/3.0/mapped_joint_bam.csv + - tests/test_sentieon_dnascope_joint_germline.yml + ## sentieon_haplotyper_joint_germline sentieon_haplotyper_joint_germline: - conf/modules/prepare_genome.config From e46f6d9ba8f81fbe7bd33315627f99631f3209c5 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Sun, 17 Sep 2023 20:17:14 +0000 Subject: [PATCH 31/50] TABIX_KNOWN_INDELS and TABIX_KNOWN_SNPS is probably not needed in the DnaScope subworkflow as VQSR is not applied for DnaScope --- conf/modules/prepare_genome.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules/prepare_genome.config b/conf/modules/prepare_genome.config index 8b1f45c4fb..d4bf1853b0 100644 --- a/conf/modules/prepare_genome.config +++ b/conf/modules/prepare_genome.config @@ -96,7 +96,7 @@ process { } withName: 'TABIX_KNOWN_INDELS' { - ext.when = { !params.known_indels_tbi && params.known_indels && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools &&
(params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope'))) ) } + ext.when = { !params.known_indels_tbi && params.known_indels && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') )) ) } publishDir = [ enabled: (params.save_reference || params.build_only_index), mode: params.publish_dir_mode, @@ -106,7 +106,7 @@ process { } withName: 'TABIX_KNOWN_SNPS' { - ext.when = { !params.known_snps_tbi && params.known_snps && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope'))) ) } + ext.when = { !params.known_snps_tbi && params.known_snps && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') )) ) } publishDir = [ enabled: (params.save_reference || params.build_only_index), mode: params.publish_dir_mode, From 1e424a642ee770404ec0e960b251e5b1dbee09ac Mon Sep 17 00:00:00 2001 From: asp8200 Date: Sun, 17 Sep 2023 21:09:37 +0000 Subject: [PATCH 32/50] Re-installing config for TABIX_KNOWN_INDELS for dnascope --- conf/modules/prepare_genome.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules/prepare_genome.config b/conf/modules/prepare_genome.config index d4bf1853b0..85367196c2 100644 --- a/conf/modules/prepare_genome.config +++ b/conf/modules/prepare_genome.config @@ -96,7 +96,7 @@ process { } withName: 'TABIX_KNOWN_INDELS' { - ext.when = { 
!params.known_indels_tbi && params.known_indels && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') )) ) } + ext.when = { !params.known_indels_tbi && params.known_indels && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope'))) ) } publishDir = [ enabled: (params.save_reference || params.build_only_index), mode: params.publish_dir_mode, From 2738c8d19da358c95c54ffcc0003150e888d98e2 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Mon, 18 Sep 2023 11:41:07 +0000 Subject: [PATCH 33/50] Briefly mentioning DnaScope --- docs/usage.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 63e338c3c6..94ec009d96 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1073,7 +1073,9 @@ nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat ### Available Sentieon functions -Sarek contains the following Sentieon functions [bwa mem](https://support.sentieon.com/manual/usages/general/#bwa-mem-syntax), [LocusCollector](https://support.sentieon.com/manual/usages/general/#locuscollector-algorithm) + [Dedup](https://support.sentieon.com/manual/usages/general/#dedup-algorithm), [Haplotyper](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm), [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) and [VarCal](https://support.sentieon.com/manual/usages/general/#varcal-algorithm) + [ApplyVarCal](https://support.sentieon.com/manual/usages/general/#applyvarcal-algorithm), so the basic processing of alignment of fastq-files to VCF-files can be done using speedup Sentieon functions. 
+Sarek contains the following Sentieon functions from [DnaSeq](https://support.sentieon.com/manual/DNAseq_usage/dnaseq/): [bwa mem](https://support.sentieon.com/manual/usages/general/#bwa-mem-syntax), [LocusCollector](https://support.sentieon.com/manual/usages/general/#locuscollector-algorithm) + [Dedup](https://support.sentieon.com/manual/usages/general/#dedup-algorithm), [Haplotyper](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm), [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) and [VarCal](https://support.sentieon.com/manual/usages/general/#varcal-algorithm) + [ApplyVarCal](https://support.sentieon.com/manual/usages/general/#applyvarcal-algorithm), so the basic processing from alignment of FASTQ files to VCF files can be done using Sentieon's accelerated functions. + +Sarek also contains the Sentieon functions [DnaScope](https://support.sentieon.com/manual/usages/general/?highlight=dnamodelapply#dnascope-algorithm) and [DNAModelApply](https://support.sentieon.com/manual/usages/general/?highlight=dnamodelapply#dnamodelapply-algorithm).
### Basic usage of Sentieon functions From 0479416432fe5a14cc57d99d18d4cea597c5fce3 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Mon, 18 Sep 2023 11:41:34 +0000 Subject: [PATCH 34/50] Publishing from SENTIEON_DNAMODELAPPLY --- conf/modules/sentieon_dnascope.config | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/conf/modules/sentieon_dnascope.config b/conf/modules/sentieon_dnascope.config index 325a1642a9..3f9b181217 100644 --- a/conf/modules/sentieon_dnascope.config +++ b/conf/modules/sentieon_dnascope.config @@ -56,4 +56,13 @@ process { } } + withName: 'SENTIEON_DNAMODELAPPLY' { + ext.prefix = {"${meta.id}.dnamodelapply"} + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/${meta.id}/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + } From ba06cac096571d9432996554b9b10e79b349c686 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Mon, 18 Sep 2023 11:45:14 +0000 Subject: [PATCH 35/50] Replacing VCF_VARIANT_FILTERING_GATK with SENTIEON_DNAMODELAPPLY in DnaScope-subworkflow --- modules.json | 5 +++ .../bam_variant_calling_germline_all/main.nf | 34 +++++++++++++------ 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/modules.json b/modules.json index ffec443b25..d7aca8486e 100644 --- a/modules.json +++ b/modules.json @@ -386,6 +386,11 @@ "git_sha": "915a0b16ba3e40ef59e7b44843b3118e17a9c906", "installed_by": ["modules"] }, + "sentieon/dnamodelapply": { + "branch": "master", + "git_sha": "43ef68091a1188fd8dc4c03f9341b556803c7514", + "installed_by": ["modules"] + }, "sentieon/dnascope": { "branch": "master", "git_sha": "4fb6fdc8046ec09cd30f92a2a252e9a0ba4a6309", diff --git a/subworkflows/local/bam_variant_calling_germline_all/main.nf b/subworkflows/local/bam_variant_calling_germline_all/main.nf index eea9c2e0b8..73a6e4c901 100644 --- a/subworkflows/local/bam_variant_calling_germline_all/main.nf +++ b/subworkflows/local/bam_variant_calling_germline_all/main.nf @@ -14,8 +14,11 @@ include { 
BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER } from '../bam_variant_calling include { BAM_VARIANT_CALLING_MPILEUP } from '../bam_variant_calling_mpileup/main' include { BAM_VARIANT_CALLING_SINGLE_STRELKA } from '../bam_variant_calling_single_strelka/main' include { BAM_VARIANT_CALLING_SINGLE_TIDDIT } from '../bam_variant_calling_single_tiddit/main' +include { SENTIEON_DNAMODELAPPLY } from '../../../modules/nf-core/sentieon/dnamodelapply/main' include { VCF_VARIANT_FILTERING_GATK } from '../vcf_variant_filtering_gatk/main' + + workflow BAM_VARIANT_CALLING_GERMLINE_ALL { take: tools // Mandatory, list of tools to apply @@ -233,18 +236,27 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { // If single sample track, check if filtering should be done if (!(skip_tools && skip_tools.split(',').contains('dnascope_filter'))) { - VCF_VARIANT_FILTERING_GATK( + SENTIEON_DNAMODELAPPLY( vcf_sentieon_dnascope.join(vcf_tbi_sentieon_dnascope, failOnDuplicate: true, failOnMismatch: true), - fasta, - fasta_fai, - dict.map{ meta, dict -> [ dict ] }, - intervals_bed_combined_haplotypec, - known_sites_indels.concat(known_sites_snps).flatten().unique().collect(), - known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect()) - - vcf_sentieon_dnascope = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf - - versions = versions.mix(VCF_VARIANT_FILTERING_GATK.out.versions) + fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, + fasta_fai.map{ fai -> [ [ id:fai.baseName ], fai ] }, + sentieon_dnascope_model.map{ model -> [ [ id:model.baseName ], model ] }) + + vcf_sentieon_dnascope = SENTIEON_DNAMODELAPPLY.out.vcf + versions = versions.mix(SENTIEON_DNAMODELAPPLY.out.versions) + + // TO-DO: Figure out whether it should be possible to also run VCF_VARIANT_FILTERING_GATK here. 
+ // VCF_VARIANT_FILTERING_GATK( + // vcf_sentieon_dnascope.join(vcf_tbi_sentieon_dnascope, failOnDuplicate: true, failOnMismatch: true), + // fasta, + // fasta_fai, + // dict.map{ meta, dict -> [ dict ] }, + // intervals_bed_combined_haplotypec, + // known_sites_indels.concat(known_sites_snps).flatten().unique().collect(), + // known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect()) + + // vcf_sentieon_dnascope = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf + // versions = versions.mix(VCF_VARIANT_FILTERING_GATK.out.versions) } } From 89591c711fffb81eadaae8842e2e9228d2ae214f Mon Sep 17 00:00:00 2001 From: asp8200 Date: Mon, 18 Sep 2023 11:50:37 +0000 Subject: [PATCH 36/50] New sentieon module dnamodelapply --- .../nf-core/sentieon/dnamodelapply/main.nf | 81 +++++++++++++++++++ .../nf-core/sentieon/dnamodelapply/meta.yml | 78 ++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 modules/nf-core/sentieon/dnamodelapply/main.nf create mode 100644 modules/nf-core/sentieon/dnamodelapply/meta.yml diff --git a/modules/nf-core/sentieon/dnamodelapply/main.nf b/modules/nf-core/sentieon/dnamodelapply/main.nf new file mode 100644 index 0000000000..3fe9a28f19 --- /dev/null +++ b/modules/nf-core/sentieon/dnamodelapply/main.nf @@ -0,0 +1,81 @@ +process SENTIEON_DNAMODELAPPLY { + tag "$meta.id" + label 'process_high' + label 'sentieon' + + secret 'SENTIEON_LICENSE_BASE64' + + container 'nf-core/sentieon:202112.06' + + input: + tuple val(meta), path(vcf), path(idx) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(ml_model) + + output: + tuple val(meta), path("*.vcf.gz") , emit: vcf + tuple val(meta), path("*.vcf.gz.tbi"), emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sentieon 
modules do not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: '' + def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: '' + + """ + if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url. + export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d) + else # Localhost license file + # The license file is stored as a nextflow variable like, for instance, this: + # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat | base64 -w 0) + export SENTIEON_LICENSE=\$(mktemp) + echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE + fi + + if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then + # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is mostly likely being run with some test-license. + export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d) + export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d) + echo "Decoded and exported Sentieon test-license system environment variables" + fi + + sentieon driver \\ + -t $task.cpus \\ + -r $fasta \\ + $args \\ + --algo DNAModelApply \\ + --model $ml_model \\ + -v $vcf \\ + ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sentieon modules do not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + """ + touch ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/sentieon/dnamodelapply/meta.yml b/modules/nf-core/sentieon/dnamodelapply/meta.yml new file mode 100644 index 0000000000..ec429bea21 --- /dev/null +++ b/modules/nf-core/sentieon/dnamodelapply/meta.yml @@ -0,0 +1,78 @@ +name: "sentieon_dnamodelapply" +description: modifies the input VCF file by adding the MLrejected FILTER to the variants +keywords: + - dnamodelapply + - vcf + - filter + - sentieon +tools: + - sentieon: + description: | + Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads. + Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system. + homepage: https://www.sentieon.com/ + documentation: https://www.sentieon.com/ + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'test' ]` + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'test' ]` + - meta4: + type: map + description: | + Groovy Map containing reference information + e.g. 
`[ id:'test' ]` + - vcf: + type: file + description: INPUT VCF file + pattern: "*.{vcf,vcf.gz}" + - idx: + type: file + description: Index of the input VCF file + pattern: "*.{tbi}" + - fasta: + type: file + description: Genome fasta file + pattern: "*.{fa,fasta}" + - fai: + type: file + description: Index of the genome fasta file + pattern: "*.fai" + - ml_model: + type: file + description: machine learning model file + pattern: "*.model" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: INPUT VCF file + pattern: "*.{vcf,vcf.gz}" + - index: + type: file + description: Index of the input VCF file + pattern: "*.{tbi}" + +authors: + - "@ramprasadn" From 1f51e30862ce2ae79ce3b81965812a76e9e6db12 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Tue, 19 Sep 2023 19:20:32 +0000 Subject: [PATCH 37/50] Adding tests of dnascope skipping DnaModelApply --- tests/test_sentieon_dnascope_skip_filter.yml | 81 ++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 tests/test_sentieon_dnascope_skip_filter.yml diff --git a/tests/test_sentieon_dnascope_skip_filter.yml b/tests/test_sentieon_dnascope_skip_filter.yml new file mode 100644 index 0000000000..b07064a663 --- /dev/null +++ b/tests/test_sentieon_dnascope_skip_filter.yml @@ -0,0 +1,81 @@ +- name: Run variant calling on germline sample with sentieon dnascope and skip filter + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --skip_tools dnascope_filter --outdir results + tags: + - germline + - sentieon_dnascope_skip_filter + - variant_calling + - sentieon/dnascope + files: + - path: results/csv/variantcalled.csv + md5sum: 10254414c0679ba1fb25e41b9ff548cc + - path: results/multiqc + - path: 
results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.unfiltered.bcftools_stats.txt + md5sum: f915fe1591ababb0da5e7b43dfc35092 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.FILTER.summary + md5sum: 87a84b5f8ac3d3cbeeef7d60afcdbfe7 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variations in very small numbers in the qual-files.
+ - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/sentieon_dnascope + should_exist: false +- name: Run variant calling on germline sample with sentieon dnascope without intervals and skip filter + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --skip_tools dnascope_filter --no_intervals --outdir results + tags: + - germline + - sentieon_dnascope_skip_filter + - no_intervals + - variant_calling + - sentieon/dnascope + files: + - path: results/csv/variantcalled.csv + md5sum: 10254414c0679ba1fb25e41b9ff548cc + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.unfiltered.bcftools_stats.txt + md5sum: f915fe1591ababb0da5e7b43dfc35092 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.FILTER.summary + md5sum: 
87a84b5f8ac3d3cbeeef7d60afcdbfe7 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variations in very small numbers in the qual-files. + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/sentieon_dnascope + should_exist: false From 801b8ebf3d0dc870cd9195c508332ab1aa6768e5 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 20 Sep 2023 06:41:32 +0000 Subject: [PATCH 38/50] Adding sentieon_dnascope_skip_filter --- tests/config/tags.yml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/config/tags.yml b/tests/config/tags.yml index 53e7c23a0f..66c9a594d6 100644 --- a/tests/config/tags.yml +++ b/tests/config/tags.yml @@ -317,8 +317,6 @@ haplotypecaller_skip_filter: ## sentieon/dnascope sentieon/dnascope: - conf/modules/sentieon_dnascope.config - - modules/nf-core/gatk4/cnnscorevariants/main.nf - - modules/nf-core/gatk4/filtervarianttranches/main.nf - modules/nf-core/sentieon/dnascope/main.nf - modules/nf-core/gatk4/mergevcfs/main.nf - modules/nf-core/samtools/index/main.nf - modules/nf-core/samtools/merge/main.nf - subworkflows/local/bam_merge_index_samtools/main.nf - subworkflows/local/bam_variant_calling_germline_all/main.nf - subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf - -
subworkflows/local/vcf_variant_filtering_gatk/main.nf - tests/csv/3.0/mapped_single_bam.csv - tests/test_sentieon_dnascope.yml +sentieon_dnascope_skip_filter: + - conf/modules/sentieon_dnascope.config + - modules/nf-core/sentieon/dnascope/main.nf + - modules/nf-core/gatk4/mergevcfs/main.nf + - modules/nf-core/samtools/index/main.nf + - modules/nf-core/samtools/merge/main.nf + - subworkflows/local/bam_merge_index_samtools/main.nf + - subworkflows/local/bam_variant_calling_germline_all/main.nf + - subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf + - tests/csv/3.0/mapped_single_bam.csv + - tests/test_sentieon_dnascope_skip_filter.yml + ## sentieon/haplotyper sentieon/haplotyper: - conf/modules/sentieon_haplotyper.config From 9ccad29b1bdb8f00fd8898bce6ca8a75820d93c4 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 20 Sep 2023 06:49:01 +0000 Subject: [PATCH 39/50] Just triggering tests --- tests/test_sentieon_dnascope_skip_filter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sentieon_dnascope_skip_filter.yml b/tests/test_sentieon_dnascope_skip_filter.yml index b07064a663..46d6d1de48 100644 --- a/tests/test_sentieon_dnascope_skip_filter.yml +++ b/tests/test_sentieon_dnascope_skip_filter.yml @@ -1,4 +1,4 @@ -- name: Run variant calling on germline sample with sentieon dnascope and skip filter +- name: Run variant calling on germline sample with sentieon dnascope and skip filter foo command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --skip_tools dnascope_filter --outdir results tags: - germline From eba764c193d6c966ba02e0abe16f801b64b7f6cf Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 20 Sep 2023 08:40:04 +0000 Subject: [PATCH 40/50] Adding profile software_license and option sentieon_extension to pytests of dnascope --- tests/test_sentieon_dnascope.yml | 8 ++++---- tests/test_sentieon_dnascope_joint_germline.yml | 4 
++-- tests/test_sentieon_dnascope_skip_filter.yml | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_sentieon_dnascope.yml b/tests/test_sentieon_dnascope.yml index 94835af4a8..e3b4611667 100644 --- a/tests/test_sentieon_dnascope.yml +++ b/tests/test_sentieon_dnascope.yml @@ -1,5 +1,5 @@ - name: Run variant calling on germline sample with sentieons dnascope - command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results tags: - germline - sentieon/dnascope @@ -35,7 +35,7 @@ - path: results/dnascope should_exist: false - name: Run variant calling on germline sample with sentieons dnascope without intervals - command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --no_intervals --outdir results + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --no_intervals --outdir results tags: - germline - sentieon/dnascope @@ -78,7 +78,7 @@ - path: results/sentieon_dnascope should_exist: false - name: Run variant calling on germline sample with sentieons dnascope output gvcf - command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results --sentieon_dnascope_emit_mode gvcf + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results 
--sentieon_dnascope_emit_mode gvcf tags: - germline - sentieon/dnascope @@ -116,7 +116,7 @@ - path: results/dnascope should_exist: false - name: Run variant calling on germline sample with sentieons dnascope output both gvcf and vcf - command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results --sentieon_dnascope_emit_mode variant,gvcf + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results --sentieon_dnascope_emit_mode variant,gvcf tags: - germline - sentieon/dnascope diff --git a/tests/test_sentieon_dnascope_joint_germline.yml b/tests/test_sentieon_dnascope_joint_germline.yml index a25117b3b9..19f2d41afd 100644 --- a/tests/test_sentieon_dnascope_joint_germline.yml +++ b/tests/test_sentieon_dnascope_joint_germline.yml @@ -1,5 +1,5 @@ - name: Run joint germline variant calling with sentieon dnascope - command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf tags: - germline - sentieon_dnascope_joint_germline @@ -28,7 +28,7 @@ - path: results/dnascope should_exist: false - name: Run joint germline variant calling with sentieon dnascope all intervals at once - command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf 
--nucleotides_per_second 100 + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf --nucleotides_per_second 100 tags: - germline - sentieon_dnascope_joint_germline diff --git a/tests/test_sentieon_dnascope_skip_filter.yml b/tests/test_sentieon_dnascope_skip_filter.yml index 46d6d1de48..16bbca9e7c 100644 --- a/tests/test_sentieon_dnascope_skip_filter.yml +++ b/tests/test_sentieon_dnascope_skip_filter.yml @@ -1,5 +1,5 @@ -- name: Run variant calling on germline sample with sentieon dnascope and skip filter foo - command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --skip_tools dnascope_filter --outdir results +- name: Run variant calling on germline sample with sentieon dnascope and skip filter + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --skip_tools dnascope_filter --outdir results tags: - germline - sentieon_dnascope_skip_filter @@ -36,7 +36,7 @@ - path: results/sentieon_dnascope should_exist: false - name: Run variant calling on germline sample with sentieon dnascope without intervals and skip filter - command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --skip_tools dnascope_filter --no_intervals --outdir results + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --skip_tools dnascope_filter --no_intervals --outdir results tags: - germline - sentieon_dnascope_skip_filter 
From ef3997d46387b4ab9de77e649cfb8e6d6429850c Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 20 Sep 2023 10:54:55 +0000 Subject: [PATCH 41/50] Fixing prefix for SENTIEON_DNAMODELAPPLY --- conf/modules/sentieon_dnascope.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules/sentieon_dnascope.config b/conf/modules/sentieon_dnascope.config index 3f9b181217..fa431ae417 100644 --- a/conf/modules/sentieon_dnascope.config +++ b/conf/modules/sentieon_dnascope.config @@ -57,7 +57,7 @@ process { } withName: 'SENTIEON_DNAMODELAPPLY' { - ext.prefix = {"${meta.id}.dnamodelapply"} + ext.prefix = {"${meta.id}.dnascope.filtered"} publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/sentieon_dnascope/${meta.id}/"}, From a4d7dd2adb9c4281da036145f5a96579a077597c Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 20 Sep 2023 10:55:36 +0000 Subject: [PATCH 42/50] Updating md5sums --- tests/test_sentieon_dnascope.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_sentieon_dnascope.yml b/tests/test_sentieon_dnascope.yml index e3b4611667..f51e0bca72 100644 --- a/tests/test_sentieon_dnascope.yml +++ b/tests/test_sentieon_dnascope.yml @@ -19,7 +19,7 @@ - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt md5sum: 912c7d5b31784c50e0a75b4fcfa4997b - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary - md5sum: ce2769af8f853b93d9e16b6493fc7e0d + md5sum: e67b24d296810a075378e5864bcea0fa - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count md5sum: b77c120ee5cc0423267200c67d60c663 - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual @@ -62,7 +62,7 @@ - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt md5sum: 912c7d5b31784c50e0a75b4fcfa4997b - path: 
results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary - md5sum: ce2769af8f853b93d9e16b6493fc7e0d + md5sum: e67b24d296810a075378e5864bcea0fa - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count md5sum: b77c120ee5cc0423267200c67d60c663 - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual From a7436c590a53c44a53f53f3222244a6f17ab18a9 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 20 Sep 2023 10:56:09 +0000 Subject: [PATCH 43/50] Removing comment --- .../local/bam_variant_calling_germline_all/main.nf | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/subworkflows/local/bam_variant_calling_germline_all/main.nf b/subworkflows/local/bam_variant_calling_germline_all/main.nf index 73a6e4c901..5989023adf 100644 --- a/subworkflows/local/bam_variant_calling_germline_all/main.nf +++ b/subworkflows/local/bam_variant_calling_germline_all/main.nf @@ -245,18 +245,6 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { vcf_sentieon_dnascope = SENTIEON_DNAMODELAPPLY.out.vcf versions = versions.mix(SENTIEON_DNAMODELAPPLY.out.versions) - // TO-DO: Figure out whether it should be possible to also run VCF_VARIANT_FILTERING_GATK here. 
- // VCF_VARIANT_FILTERING_GATK( - // vcf_sentieon_dnascope.join(vcf_tbi_sentieon_dnascope, failOnDuplicate: true, failOnMismatch: true), - // fasta, - // fasta_fai, - // dict.map{ meta, dict -> [ dict ] }, - // intervals_bed_combined_haplotypec, - // known_sites_indels.concat(known_sites_snps).flatten().unique().collect(), - // known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect()) - - // vcf_sentieon_dnascope = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf - // versions = versions.mix(VCF_VARIANT_FILTERING_GATK.out.versions) } } From 235d7843b5c99b1d2f3269acc5483691e34650d6 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 20 Sep 2023 11:55:53 +0000 Subject: [PATCH 44/50] Updating changelog with PR for Sentieon DnaScope --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 793e005802..8d17ccf405 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#1246](https://github.com/nf-core/sarek/pull/1246) - Back to dev ### Added +- [#1193](https://github.com/nf-core/sarek/pull/1193) - Adding support for Sentieon's DnaScope for germline variant-calling including joint-germline. 
### Changed From 5cb89c17974c21fc2f222bc08797cc7b697b3354 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 20 Sep 2023 13:00:17 +0000 Subject: [PATCH 45/50] Adding test of Sentieon VQSR not running for Sentieon DnaScope --- .../test_sentieon_dnascope_joint_germline.yml | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/test_sentieon_dnascope_joint_germline.yml b/tests/test_sentieon_dnascope_joint_germline.yml index 19f2d41afd..910043bafb 100644 --- a/tests/test_sentieon_dnascope_joint_germline.yml +++ b/tests/test_sentieon_dnascope_joint_germline.yml @@ -54,3 +54,37 @@ - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz.tbi - path: results/dnascope should_exist: false +- name: Run joint germline variant calling with sentieon dnascope with stub + command: nextflow run main.nf -profile test_cache,software_license,tools_germline --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf -stub-run + tags: + - germline + - sentieon_dnascope_joint_germline + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 62d70060aad96337254efe2d7a1df170 + - path: results/multiqc + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.bcftools_stats.txt + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.FILTER.summary + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.TsTv.count + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.TsTv.qual + 
should_exist: false + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz.tbi + - path: results/dnascope + should_exist: false From 45e961ae8b4d074aa1c96efbcc720bfc2f6663c2 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 20 Sep 2023 13:23:19 +0000 Subject: [PATCH 46/50] prettier --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d17ccf405..d75378f9a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#1246](https://github.com/nf-core/sarek/pull/1246) - Back to dev ### Added + - [#1193](https://github.com/nf-core/sarek/pull/1193) - Adding support for Sentieon's DnaScope for germline variant-calling including joint-germline. ### Changed From c1c7c6ee2edb05dd567669be005df004a5b4f767 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 20 Sep 2023 13:31:07 +0000 Subject: [PATCH 47/50] Adding info concerning DNAscope. Also other minor improvements to the docs. 
--- docs/output.md | 56 ++++++++++++++++++++++++++++++++++++++++++++++---- docs/usage.md | 6 ++++-- 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/docs/output.md b/docs/output.md index 9f90bac28c..cb721b5423 100644 --- a/docs/output.md +++ b/docs/output.md @@ -31,6 +31,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [DeepVariant](#deepvariant) - [FreeBayes](#freebayes) - [GATK HaplotypeCaller](#gatk-haplotypecaller) + - [Sentieon DNAscope](#sentieon-dnascope) - [Sentieon Haplotyper](#sentieon-haplotyper) - [GATK Mutect2](#gatk-mutect2) - [bcftools](#bcftools) @@ -437,6 +438,34 @@ Files created: +#### Sentieon DNAscope + +[Sentieon DNAscope](https://support.sentieon.com/appnotes/dnascope_ml/#dnascope-germline-variant-calling-with-a-machine-learning-model) is a variant-caller which aims at outperforming GATK's Haplotypecaller in terms of both speed and accuracy. DNAscope allows you to use a machine learning model to perform variant calling with higher accuracy by improving the candidate detection and filtering. + +
+Unfiltered VCF-files for normal samples + +**Output directory: `{outdir}/variantcalling/sentieon_dnascope/<sample>/`** + +- `<sample>.dnascope.unfiltered.vcf.gz` and `<sample>.dnascope.unfiltered.vcf.gz.tbi` + - VCF with tabix index + +
+ +The output from Sentieon's DNAscope can be controlled through the option `--sentieon_dnascope_emit_mode` for Sarek, see [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions). + +Unless `dnascope_filter` is listed under `--skip_tools` in the nextflow command, Sentieon's [DNAModelApply](https://support.sentieon.com/manual/usages/general/#dnamodelapply-algorithm) is applied to the unfiltered VCF-files in order to obtain filtered VCF-files. + +
+Filtered VCF-files for normal samples + +**Output directory: `{outdir}/variantcalling/sentieon_dnascope/<sample>/`** + +- `<sample>.dnascope.filtered.vcf.gz` and `<sample>.dnascope.filtered.vcf.gz.tbi` + - VCF with tabix index + +
+ #### Sentieon Haplotyper [Sentieon Haplotyper](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm) is Sention's speedup version of GATK's Haplotypecaller (see above). @@ -451,7 +480,7 @@ Files created: -The output from Sentieon's Haplotyper can be controlled through the option `--sentieon_haplotyper_emit_mode` for Sarek, see [Basic usage of Sentieon functions in Sarek](#basic-usage-of-sentieon-functions-in-sarek). +The output from Sentieon's Haplotyper can be controlled through the option `--sentieon_haplotyper_emit_mode` for Sarek, see [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions). Unless `haplotyper_filter` is listed under `--skip_tools` in the nextflow command, GATK's CNNScoreVariants and FilterVariantTranches (see above) is applied to the unfiltered VCF-files in order to obtain filtered VCF-files. @@ -465,16 +494,35 @@ Unless `haplotyper_filter` is listed under `--skip_tools` in the nextflow comman -##### Sentieon Joint Germline Variant Calling +##### Joint Germline Variant Calling with Sentieon's DNAscope -In Sentieon's package DNAseq, joint germline variant calling is done by first running Sentieon's Haplotyper in emit-mode `gvcf` for each sample and then running Sentieon's [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) on the set of gVCF-files. See [Basic usage of Sentieon functions in Sarek](#basic-usage-of-sentieon-functions-in-sarek) for information on how joint germline variant calling can be done in Sarek using Sentieon's DNAseq. After joint genotyping, Sentieon's version of VQSR ([VarCal](https://support.sentieon.com/manual/usages/general/#varcal-algorithm) and [ApplyVarCal](https://support.sentieon.com/manual/usages/general/#applyvarcal-algorithm)) is applied for filtering to produce the final multisample callset with the desired balance of precision and sensitivity. 
+In Sentieon's package DNAscope, joint germline variant calling is done by first running Sentieon's DNAscope in emit-mode `gvcf` for each sample and then running Sentieon's [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) on the set of gVCF-files. See [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions) for information on how joint germline variant calling can be done in Sarek using Sentieon's DNAscope. + +
+Output files from joint germline variant calling + +**Output directory: `{outdir}/variantcalling/sentieon_dnascope/<sample>/`** + +- `<sample>.dnascope.g.vcf.gz` and `<sample>.dnascope.g.vcf.gz.tbi` + - VCF with tabix index + +**Output directory: `{outdir}/variantcalling/sentieon_dnascope/joint_variant_calling/`** + +- `joint_germline.vcf.gz` and `joint_germline.vcf.gz.tbi` + - VCF with tabix index + +
+ +##### Joint Germline Variant Calling with Sentieon's DNAseq + +In Sentieon's package DNAseq, joint germline variant calling is done by first running Sentieon's Haplotyper in emit-mode `gvcf` for each sample and then running Sentieon's [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) on the set of gVCF-files. See [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions) for information on how joint germline variant calling can be done in Sarek using Sentieon's DNAseq. After joint genotyping, Sentieon's version of VQSR ([VarCal](https://support.sentieon.com/manual/usages/general/#varcal-algorithm) and [ApplyVarCal](https://support.sentieon.com/manual/usages/general/#applyvarcal-algorithm)) is applied for filtering to produce the final multisample callset with the desired balance of precision and sensitivity.
Output files from joint germline variant calling **Output directory: `{outdir}/variantcalling/sentieon_haplotyper//`** -- `.haplotypecaller.g.vcf.gz` and `.haplotypecaller.g.vcf.gz.tbi` +- `.haplotyper.g.vcf.gz` and `.haplotyper.g.vcf.gz.tbi` - VCF with tabix index **Output directory: `{outdir}/variantcalling/sentieon_haplotyper/joint_variant_calling/`** diff --git a/docs/usage.md b/docs/usage.md index 94ec009d96..9c9f4084da 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1083,9 +1083,11 @@ To use Sentieon's aligner `bwa mem`, set the aligner option `sentieon-bwamem`. ( To use Sentieon's function `Dedup`, specify `sentieon_dedup` as one of the tools. (This can, for example, be done by adding `--tools sentieon_dedup` to the nextflow run command.) -To use Sentieon's function `Haplotyper`, specify `sentieon_haplotyper` as one of the tools. This can, for example, be done by adding `--tools sentieon_haplotyper` to the nextflow run command. In order to skip the GATK-based variant-filter, one may add `--skip_tools haplotyper_filter` to the nextflow run command. Sarek also provides the option `sentieon_haplotyper_emit_mode` which can be used to set the [emit-mode](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm) of Sentieon's haplotyper. Sentieon's haplotyper can output both a vcf-file and a gvcf-file in the same run; this is achieved by setting `sentieon_haplotyper_emit_mode` to `,gvcf`, where `` is `variant`, `confident` or `all`. +To use Sentieon's function `DNAscope`, specify `sentieon_dnascope` as one of the tools. This can, for example, be done by adding `--tools sentieon_dnascope` to the nextflow run command. In order to skip Sentieon's variant-filter `DNAModelApply`, one may add `--skip_tools dnascope_filter` to the nextflow run command. Sarek also provides the option `sentieon_dnascope_emit_mode` which can be used to set the [emit-mode](https://support.sentieon.com/manual/usages/general/#dnascope-algorithm) of Sentieon's dnascope. 
Sentieon's dnascope can output both a vcf-file and a gvcf-file in the same run; this is achieved by setting `sentieon_dnascope_emit_mode` to `<vcf_emit_mode>,gvcf`, where `<vcf_emit_mode>` is `variant`, `confident` or `all`. -To use Sentieon's function `GVCFtyper` along with Sention's version of VQSR (`VarCal` and `ApplyVarCal`) for joint-germline genotyping, specify `sentieon_haplotyper` as one of the tools, set the option `sentieon_haplotyper_emit_mode` to `gvcf`, and add the option `joint_germline`. This can, for example, be done by adding `--tools sentieon_haplotyper --joint_germline --sentieon_haplotyper_emit_mode gvcf` to the nextflow run command. +Sentieon's function `Haplotyper` is used in much the same way as `DNAscope`. To use Sentieon's function `Haplotyper`, specify `sentieon_haplotyper` as one of the tools. This can, for example, be done by adding `--tools sentieon_haplotyper` to the nextflow run command. In order to skip the GATK-based variant-filter, one may add `--skip_tools haplotyper_filter` to the nextflow run command. Sarek also provides the option `sentieon_haplotyper_emit_mode` which can be used to set the [emit-mode](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm) of Sentieon's haplotyper. Sentieon's haplotyper can output both a vcf-file and a gvcf-file in the same run; this is achieved by setting `sentieon_haplotyper_emit_mode` to `<vcf_emit_mode>,gvcf`, where `<vcf_emit_mode>` is `variant`, `confident` or `all`. + +To use Sentieon's function `GVCFtyper` along with Sentieon's version of VQSR (`VarCal` and `ApplyVarCal`) for joint-germline genotyping, specify `sentieon_haplotyper` as one of the tools, set the option `sentieon_haplotyper_emit_mode` to `gvcf`, and add the option `joint_germline`. This can, for example, be done by adding `--tools sentieon_haplotyper --joint_germline --sentieon_haplotyper_emit_mode gvcf` to the nextflow run command. 
If `sentieon_dnascope` is chosen instead of `sentieon_haplotyper`, then Sention's version of VQSR is skipped, as recommended by Sentieon. ### Joint germline variant calling From cfeb7625402121be0a0fe78eccbabe898a5a06eb Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 20 Sep 2023 18:49:12 +0000 Subject: [PATCH 48/50] Removing redundant test --- .../test_sentieon_dnascope_joint_germline.yml | 54 ++++++------------- 1 file changed, 16 insertions(+), 38 deletions(-) diff --git a/tests/test_sentieon_dnascope_joint_germline.yml b/tests/test_sentieon_dnascope_joint_germline.yml index 910043bafb..e905b9cd53 100644 --- a/tests/test_sentieon_dnascope_joint_germline.yml +++ b/tests/test_sentieon_dnascope_joint_germline.yml @@ -9,10 +9,6 @@ - path: results/csv/variantcalled.csv md5sum: 62d70060aad96337254efe2d7a1df170 - path: results/multiqc - - path: results/preprocessing/recalibrated/test/test.recal.cram - should_exist: false - - path: results/preprocessing/recalibrated/test/test.recal.cram.crai - should_exist: false - path: results/reports/bcftools/sentieon_dnascope/joint_variant_calling/joint_germline.bcftools_stats.txt - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.FILTER.summary - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.TsTv.count @@ -27,6 +23,22 @@ - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz.tbi - path: results/dnascope should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.bcftools_stats.txt + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.FILTER.summary + should_exist: false + - path: 
results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.TsTv.count + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.TsTv.qual + should_exist: false + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.vcf.gz.tbi + should_exist: false - name: Run joint germline variant calling with sentieon dnascope all intervals at once command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf --nucleotides_per_second 100 tags: @@ -54,37 +66,3 @@ - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz.tbi - path: results/dnascope should_exist: false -- name: Run joint germline variant calling with sentieon dnascope with stub - command: nextflow run main.nf -profile test_cache,software_license,tools_germline --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf -stub-run - tags: - - germline - - sentieon_dnascope_joint_germline - - variant_calling - files: - - path: results/csv/variantcalled.csv - md5sum: 62d70060aad96337254efe2d7a1df170 - - path: results/multiqc - - path: results/preprocessing/recalibrated/test/test.recal.cram - should_exist: false - - path: results/preprocessing/recalibrated/test/test.recal.cram.crai - should_exist: false - - path: results/reports/bcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.bcftools_stats.txt - should_exist: false - - path: 
results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.FILTER.summary - should_exist: false - - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.TsTv.count - should_exist: false - - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.TsTv.qual - should_exist: false - - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz - - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz.tbi - - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.vcf.gz - should_exist: false - - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.vcf.gz.tbi - should_exist: false - - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz - - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz.tbi - - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz - - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz.tbi - - path: results/dnascope - should_exist: false From 31fc80a32d593eb72975f019fd17dbb48217fc27 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Mon, 2 Oct 2023 17:19:31 +0000 Subject: [PATCH 49/50] Trying to sort out section and subsection for Sentieon/haplotyper and Sentieon/haplotyper/joint_germline --- docs/output.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/output.md b/docs/output.md index c7e1e8d0b3..b03f4c11d0 100644 --- a/docs/output.md +++ b/docs/output.md @@ -491,7 +491,7 @@ In Sentieon's package DNAscope, joint germline variant calling is done by first
-##### Sentieon Haplotyper joint germline variant calling +#### Sentieon Haplotyper [Sentieon Haplotyper](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm) is Sention's speedup version of GATK's Haplotypecaller (see above). @@ -519,7 +519,7 @@ Unless `haplotyper_filter` is listed under `--skip_tools` in the nextflow comman -##### Joint Germline Variant Calling with Sentieon's DNAseq +##### Sentieon Haplotyper joint germline variant calling In Sentieon's package DNAseq, joint germline variant calling is done by first running Sentieon's Haplotyper in emit-mode `gvcf` for each sample and then running Sentieon's [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) on the set of gVCF-files. See [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions) for information on how joint germline variant calling can be done in Sarek using Sentieon's DNAseq. After joint genotyping, Sentieon's version of VQSR ([VarCal](https://support.sentieon.com/manual/usages/general/#varcal-algorithm) and [ApplyVarCal](https://support.sentieon.com/manual/usages/general/#applyvarcal-algorithm)) is applied for filtering to produce the final multisample callset with the desired balance of precision and sensitivity. From d6f610c3077ef5b502b277108af204b0716104d8 Mon Sep 17 00:00:00 2001 From: asp8200 Date: Wed, 4 Oct 2023 16:12:11 +0000 Subject: [PATCH 50/50] Moving PR1193 to dev-section since the PR was not merged for Sarek 3.3.2 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 159ff6a2a5..60d93f7426 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- [#1193](https://github.com/nf-core/sarek/pull/1193) - Adding support for Sentieon's DnaScope for germline variant-calling including joint-germline. 
- [#1271](https://github.com/nf-core/sarek/pull/1271) - Back to dev ### Changed @@ -26,7 +27,6 @@ Ráhpajávvre is the Lule Sámi spelling of Rapaselet. ### Added -- [#1193](https://github.com/nf-core/sarek/pull/1193) - Adding support for Sentieon's DnaScope for germline variant-calling including joint-germline. - [#1246](https://github.com/nf-core/sarek/pull/1246) - Back to dev - [#1259](https://github.com/nf-core/sarek/pull/1259) - nf-prov plugin