diff --git a/CHANGELOG.md b/CHANGELOG.md index bdc8e1fc8..e25941320 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -93,6 +93,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#587](https://github.com/nf-core/sarek/pull/587) - Fix issue with VEP extra files - [#581](https://github.com/nf-core/sarek/pull/581) - `TIDDIT` is back - [#590](https://github.com/nf-core/sarek/pull/590) - Fix empty folders during scatter/gather +- [#592](https://github.com/nf-core/sarek/pull/592) - Fix optional resources for Mutect2, GetPileupSummaries, and HaplotypeCaller: issue [#299](https://github.com/nf-core/sarek/issues/299), [#359](https://github.com/nf-core/sarek/issues/359), [#367](https://github.com/nf-core/sarek/issues/367) ### Deprecated diff --git a/conf/modules.config b/conf/modules.config index 1deca131d..26f747991 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -808,8 +808,8 @@ process{ ext.args = { "-tumor-segmentation ${meta.id}.segmentation.table" } publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/variant_calling/${meta.id}/mutect2" }, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + path: { "${params.outdir}/variant_calling/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : "${meta.id}/mutect2/${filename}" } ] } @@ -825,8 +825,8 @@ process{ ext.prefix = {"${meta.id}.filtered"} publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/variant_calling/${meta.id}/mutect2" }, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + path: { "${params.outdir}/variant_calling/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : "${meta.id}/mutect2/${filename}" } ] } @@ -850,9 +850,18 @@ process{ ext.prefix = { meta.num_intervals <= 1 ? meta.id : "${meta.id}_${intervals.simpleName}" } publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/variant_calling/${meta.id}/" }, + path: { "${params.outdir}/variant_calling/" }, + pattern: "*.table", + saveAs: { meta.num_intervals > 1 ? null : "${meta.id}/mutect2/${it}" } + ] + } + + withName: 'GETPILEUPSUMMARIES_.*' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/" }, pattern: "*.table", - saveAs: { meta.num_intervals > 1 ? null : "mutect2/${it}" } + saveAs: { meta.num_intervals > 1 ? null : "${meta.tumor_id}_vs_${meta.normal_id}/mutect2/${it}" } ] } @@ -880,9 +889,9 @@ process{ ext.args = { params.ignore_soft_clipped_bases ? "--dont-use-soft-clipped-bases true --f1r2-tar-gz ${task.ext.prefix}.f1r2.tar.gz" : "--f1r2-tar-gz ${task.ext.prefix}.f1r2.tar.gz" } publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/variant_calling/${meta.id}/" }, + path: { "${params.outdir}/variant_calling/" }, pattern: "*{vcf.gz,vcf.gz.tbi,stats}", - saveAs: { meta.num_intervals > 1 ? null : "mutect2/${it}" } + saveAs: { meta.num_intervals > 1 ? null : "${meta.id}/mutect2/${it}" } ] } diff --git a/conf/test.config b/conf/test.config index 7c4854674..3cc63737f 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,7 +22,7 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '6.GB' + max_memory = '6.5GB' max_time = '8.h' // Input data diff --git a/modules.json b/modules.json index 36dff42a6..4d27a0011 100644 --- a/modules.json +++ b/modules.json @@ -130,7 +130,7 @@ "git_sha": "169b2b96c1167f89ab07127b7057c1d90a6996c7" }, "gatk4/getpileupsummaries": { - "git_sha": "f40cfefc0899fd6bb6adc300142ca6c3a35573ff" + "git_sha": "1ac223ad436c1410e9c16a5966274b7ca1f8d855" }, "gatk4/haplotypecaller": { "git_sha": "169b2b96c1167f89ab07127b7057c1d90a6996c7" diff --git a/modules/nf-core/modules/gatk4/getpileupsummaries/main.nf b/modules/nf-core/modules/gatk4/getpileupsummaries/main.nf index 1c0561704..5945a937a 100644 --- a/modules/nf-core/modules/gatk4/getpileupsummaries/main.nf +++ b/modules/nf-core/modules/gatk4/getpileupsummaries/main.nf @@ -25,7 +25,7 @@ process GATK4_GETPILEUPSUMMARIES { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def interval_command = intervals ? "--intervals $intervals" : "" + def interval_command = intervals ? "--intervals $intervals" : "--intervals $variants" def reference_command = fasta ? "--reference $fasta" : '' def avail_mem = 3 diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index 2e09e628a..285318b39 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -52,7 +52,6 @@ workflow PREPARE_GENOME { TABIX_PON(pon.flatten().map{ it -> [[id:it.baseName], it] }) chr_files = chr_dir - //TODO this works, but is not pretty. I will leave this in your hands during refactoring @Maxime if ( params.chr_dir.endsWith('tar.gz')){ UNTAR_CHR_DIR(chr_dir.map{ it -> [[id:it[0].baseName], it] }) chr_files = UNTAR_CHR_DIR.out.untar.map{ it[1] } @@ -71,16 +70,16 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(TABIX_PON.out.versions) emit: - bwa = BWAMEM1_INDEX.out.index // path: bwa/* - bwamem2 = BWAMEM2_INDEX.out.index // path: bwamem2/* - hashtable = DRAGMAP_HASHTABLE.out.hashmap // path: dragmap/* - dbsnp_tbi = TABIX_DBSNP.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: dbsnb.vcf.gz.tbi - dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict // path: genome.fasta.dict - fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] } // path: genome.fasta.fai - germline_resource_tbi = TABIX_GERMLINE_RESOURCE.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: germline_resource.vcf.gz.tbi - known_indels_tbi = TABIX_KNOWN_INDELS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_indels*}.vcf.gz.tbi - msisensorpro_scan = MSISENSORPRO_SCAN.out.list.map{ meta, list -> [list] } // path: genome_msi.list - pon_tbi = TABIX_PON.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: pon.vcf.gz.tbi + bwa = BWAMEM1_INDEX.out.index // path: bwa/* + bwamem2 = BWAMEM2_INDEX.out.index // path: bwamem2/* + hashtable = DRAGMAP_HASHTABLE.out.hashmap // path: dragmap/* + dbsnp_tbi = TABIX_DBSNP.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: dbsnb.vcf.gz.tbi + dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict // path: genome.fasta.dict + fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] } // path: genome.fasta.fai + germline_resource_tbi = TABIX_GERMLINE_RESOURCE.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: germline_resource.vcf.gz.tbi + known_indels_tbi = TABIX_KNOWN_INDELS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_indels*}.vcf.gz.tbi + msisensorpro_scan = MSISENSORPRO_SCAN.out.list.map{ meta, list -> [list] } // path: genome_msi.list + pon_tbi = TABIX_PON.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: pon.vcf.gz.tbi chr_files = chr_files - versions = ch_versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/nf-core/gatk4/tumor_normal_somatic_variant_calling/main.nf b/subworkflows/nf-core/gatk4/tumor_normal_somatic_variant_calling/main.nf index 67f9cfd46..5bf22d8bb 100644 --- a/subworkflows/nf-core/gatk4/tumor_normal_somatic_variant_calling/main.nf +++ b/subworkflows/nf-core/gatk4/tumor_normal_somatic_variant_calling/main.nf @@ -114,13 +114,15 @@ workflow GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING { normal: [ meta, input_list[0], input_index_list[0], intervals ] } + germline_resource_pileup = germline_resource_tbi ? germline_resource : Channel.empty() + germline_resource_pileup_tbi = germline_resource_tbi ?: Channel.empty() GETPILEUPSUMMARIES_TUMOR ( pileup.tumor.map{ meta, cram, crai, intervals -> [[patient:meta.patient, normal_id:meta.normal_id, tumor_id:meta.tumor_id, gender:meta.gender, id:meta.tumor_id, num_intervals:meta.num_intervals], cram, crai, intervals] }, - fasta, fai, dict, germline_resource, germline_resource_tbi ) + fasta, fai, dict, germline_resource_pileup, germline_resource_pileup_tbi ) GETPILEUPSUMMARIES_NORMAL ( pileup.normal.map{ meta, cram, crai, intervals -> @@ -128,7 +130,7 @@ workflow GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING { [[patient:meta.patient, normal_id:meta.normal_id, tumor_id:meta.tumor_id, gender:meta.gender, id:meta.normal_id, num_intervals:meta.num_intervals], cram, crai, intervals] }, - fasta, fai, dict, germline_resource, germline_resource_tbi ) + fasta, fai, dict, germline_resource_pileup, germline_resource_pileup_tbi ) GETPILEUPSUMMARIES_NORMAL.out.table.branch{ intervals: it[0].num_intervals > 1 diff --git a/subworkflows/nf-core/gatk4/tumor_only_somatic_variant_calling/main.nf b/subworkflows/nf-core/gatk4/tumor_only_somatic_variant_calling/main.nf index bfeca83c3..34975771e 100644 --- a/subworkflows/nf-core/gatk4/tumor_only_somatic_variant_calling/main.nf +++ b/subworkflows/nf-core/gatk4/tumor_only_somatic_variant_calling/main.nf @@ -107,7 +107,9 @@ workflow GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING { // //Generate pileup summary table using getepileupsummaries. // - GETPILEUPSUMMARIES ( input , fasta, fai, dict, germline_resource , germline_resource_tbi ) + germline_resource_pileup = germline_resource_tbi ? germline_resource : Channel.empty() //Channel.empty().concat(germline_resource) //germline_resource.ifEmpty() ?: Channel.empty() + germline_resource_pileup_tbi = germline_resource_tbi ?: Channel.empty() + GETPILEUPSUMMARIES ( input , fasta, fai, dict, germline_resource_pileup , germline_resource_pileup_tbi ) GETPILEUPSUMMARIES.out.table.branch{ intervals: it[0].num_intervals > 1 diff --git a/tests/test_tools.yml b/tests/test_tools.yml index ab0111ebb..834f3b622 100644 --- a/tests/test_tools.yml +++ b/tests/test_tools.yml @@ -430,10 +430,18 @@ - no_intervals - tumor_only - variant_calling - exit_code: 1 - stdout: - contains: - - "--tools mutect2 and --no_intervals cannot be used together." + files: + - path: results/variant_calling/sample2/mutect2/sample2.vcf.gz + - path: results/variant_calling/sample2/mutect2/sample2.vcf.gz.tbi + - path: results/variant_calling/sample2/mutect2/sample2.vcf.gz.stats + - path: results/variant_calling/sample2/mutect2/sample2.contamination.table + - path: results/variant_calling/sample2/mutect2/sample2.segmentation.table + - path: results/variant_calling/sample2/mutect2/sample2.artifactprior.tar.gz + - path: results/variant_calling/sample2/mutect2/sample2.pileupsummaries.table + - path: results/variant_calling/sample2/mutect2/sample2.filtered.vcf.gz + - path: results/variant_calling/sample2/mutect2/sample2.filtered.vcf.gz.tbi + - path: results/variant_calling/sample2/mutect2/sample2.filtered.vcf.gz.filteringStats.tsv + - path: results/csv/variantcalled.csv - name: Run variant calling on somatic sample with mutect2 command: nextflow run main.nf -profile test,tools_somatic,docker --tools mutect2 -c ./tests/nextflow.config @@ -456,16 +464,25 @@ - path: results/csv/variantcalled.csv - name: Run variant calling on somatic sample with mutect2 without intervals - command: nextflow run main.nf -profile test,tools_somatic,docker --tools mutect2 --no_intervals + command: nextflow run main.nf -profile test,tools_somatic,docker --tools mutect2 --no_intervals -c ./tests/nextflow.config tags: - mutect2 - no_intervals - somatic - variant_calling - exit_code: 1 - stdout: - contains: - - "--tools mutect2 and --no_intervals cannot be used together." + files: + - path: results/variant_calling/sample4_vs_sample3/mutect2/sample4_vs_sample3.vcf.gz + - path: results/variant_calling/sample4_vs_sample3/mutect2/sample4_vs_sample3.vcf.gz.tbi + - path: results/variant_calling/sample4_vs_sample3/mutect2/sample4_vs_sample3.vcf.gz.stats + - path: results/variant_calling/sample4_vs_sample3/mutect2/sample4_vs_sample3.contamination.table + - path: results/variant_calling/sample4_vs_sample3/mutect2/sample4_vs_sample3.segmentation.table + - path: results/variant_calling/sample4_vs_sample3/mutect2/sample3.pileupsummaries.table + - path: results/variant_calling/sample4_vs_sample3/mutect2/sample4.pileupsummaries.table + - path: results/variant_calling/sample4_vs_sample3/mutect2/sample4_vs_sample3.artifactprior.tar.gz + - path: results/variant_calling/sample4_vs_sample3/mutect2/sample4_vs_sample3.filtered.vcf.gz + - path: results/variant_calling/sample4_vs_sample3/mutect2/sample4_vs_sample3.filtered.vcf.gz.tbi + - path: results/variant_calling/sample4_vs_sample3/mutect2/sample4_vs_sample3.filtered.vcf.gz.filteringStats.tsv + - path: results/csv/variantcalled.csv - name: Run variant calling on somatic sample with msisensor-pro command: nextflow run main.nf -profile test,tools_somatic,docker --tools msisensorpro diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 884e9452d..388c7dee7 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -59,10 +59,25 @@ if (params.wes) { } if(params.tools && params.tools.contains('mutect2')){ - if(params.no_intervals){ - log.error "--tools mutect2 and --no_intervals cannot be used together.\nOne of the tools within the Mutect2 subworkflow requires intervals. They can be provided to the pipeline with --intervals. If none are provided, they will be generated from the FASTA file.\nFor more information on the Mutect2 workflow, see here: https://gatk.broadinstitute.org/hc/en-us/articles/360035531132--How-to-Call-somatic-mutations-using-GATK4-Mutect2.\nFor more information on GetPileupsummaries, see here: https://gatk.broadinstitute.org/hc/en-us/articles/5358860217115-GetPileupSummaries" + if(!params.pon){ + log.warn("No Panel-of-normal was specified for Mutect2.\nIt is highly recommended to use one: https://gatk.broadinstitute.org/hc/en-us/articles/5358911630107-Mutect2\nFor more information on how to create one: https://gatk.broadinstitute.org/hc/en-us/articles/5358921041947-CreateSomaticPanelOfNormals-BETA-") + } + if(!params.germline_resource){ + log.warn("If Mutect2 is specified without a germline resource, no filtering will be done.\nIt is recommended to use one: https://gatk.broadinstitute.org/hc/en-us/articles/5358911630107-Mutect2") + } + if(params.pon && params.pon.contains("/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000g_pon.hg38.vcf.gz")){ + log.warn("The default Panel-of-Normals provided by GATK is used for Mutect2.\nIt is highly recommended to generate one from normal samples that are technical similar to the tumor ones.\nFor more information: https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON-") + } +} + +if(!params.dbsnp && !params.known_indels){ + if(!params.skip_tools || params.skip_tools && !params.skip_tools.contains('baserecalibrator')){ + log.error "Base quality score recalibration requires at least one resource file. Please provide at least one of `--dbsnp` or `--known_indels`\nYou can skip this step in the workflow by adding `--skip_tools baserecalibrator` to the command." exit 1 } + if(params.tools && params.tools.contains('haplotypecaller')){ + log.warn "If Haplotypecaller is specified, without `--dbsnp` or `--known_indels no filtering will be done. For filtering, please provide at least one of `--dbsnp` or `--known_indels`.\nFor more information see FilterVariantTranches (single-sample, default): https://gatk.broadinstitute.org/hc/en-us/articles/5358928898971-FilterVariantTranches\nFor more information see VariantRecalibration (--joint_germline): https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator\nFor more information on GATK Best practice germline variant calling: https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-" + } } // Save AWS IGenomes file containing annotation version @@ -79,15 +94,16 @@ if (anno_readme && file(anno_readme).exists()) { */ // Initialize file channels based on params, defined in the params.genomes[params.genome] scope -chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : [] -dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.empty() +chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : Channel.value([]) +dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.value([]) fasta = params.fasta ? Channel.fromPath(params.fasta).collect() : Channel.empty() fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : Channel.empty() -germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : Channel.empty() -known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.empty() -loci = params.ac_loci ? Channel.fromPath(params.ac_loci).collect() : [] -loci_gc = params.ac_loci_gc ? Channel.fromPath(params.ac_loci_gc).collect() : [] -mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : [] +germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : Channel.value([]) //Mutec2 does not require a germline resource, so set to optional input +known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.value([]) +loci = params.ac_loci ? Channel.fromPath(params.ac_loci).collect() : Channel.value([]) +loci_gc = params.ac_loci_gc ? Channel.fromPath(params.ac_loci_gc).collect() : Channel.value([]) +mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : Channel.value([]) +pon = params.pon ? Channel.fromPath(params.pon).collect() : Channel.value([]) //PON is optional for Mutect2 (but highly recommended) // Initialize value channels based on params, defined in the params.genomes[params.genome] scope snpeff_db = params.snpeff_db ?: Channel.empty() @@ -96,7 +112,6 @@ vep_genome = params.vep_genome ?: Channel.empty() vep_species = params.vep_species ?: Channel.empty() // Initialize files channels based on params, not defined within the params.genomes[params.genome] scope -pon = params.pon ? Channel.fromPath(params.pon).collect() : Channel.empty() snpeff_cache = params.snpeff_cache ? Channel.fromPath(params.snpeff_cache).collect() : [] vep_cache = params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : [] @@ -246,9 +261,9 @@ workflow SAREK { dragmap = params.fasta ? params.dragmap ? Channel.fromPath(params.dragmap).collect() : PREPARE_GENOME.out.hashtable : [] dict = params.fasta ? params.dict ? Channel.fromPath(params.dict).collect() : PREPARE_GENOME.out.dict : [] fasta_fai = params.fasta ? params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : PREPARE_GENOME.out.fasta_fai : [] - dbsnp_tbi = params.dbsnp ? params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi).collect() : PREPARE_GENOME.out.dbsnp_tbi : Channel.empty() + dbsnp_tbi = params.dbsnp ? params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi).collect() : PREPARE_GENOME.out.dbsnp_tbi : Channel.value([]) germline_resource_tbi = params.germline_resource ? params.germline_resource_tbi ? Channel.fromPath(params.germline_resource_tbi).collect() : PREPARE_GENOME.out.germline_resource_tbi : [] - known_indels_tbi = params.known_indels ? params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : PREPARE_GENOME.out.known_indels_tbi : Channel.empty() + known_indels_tbi = params.known_indels ? params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : PREPARE_GENOME.out.known_indels_tbi : Channel.value([]) pon_tbi = params.pon ? params.pon_tbi ? Channel.fromPath(params.pon_tbi).collect() : PREPARE_GENOME.out.pon_tbi : [] msisensorpro_scan = PREPARE_GENOME.out.msisensorpro_scan @@ -259,7 +274,6 @@ workflow SAREK { // known_sites is made by grouping both the dbsnp and the known indels ressources // Which can either or both be optional - // Actually BQSR has been throwing erros if no sides were provided so it must be at least one known_sites = dbsnp.concat(known_indels).collect() known_sites_tbi = dbsnp_tbi.concat(known_indels_tbi).collect()