diff --git a/CHANGELOG.md b/CHANGELOG.md index 57044545f7..43c21b19be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -138,6 +138,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#618](https://github.com/nf-core/sarek/pull/618) - Fix channel issue with `targets.bed` in prepare_intervals - [#634](https://github.com/nf-core/sarek/pull/634) - Fix issue with samtools/mosdepth plots in multiqc_report - [#641](https://github.com/nf-core/sarek/pull/641) - Fix issue with duplicate substring in tools and skip_tools +- [#642](https://github.com/nf-core/sarek/pull/642) - Only unzip ref files if tool is run, only publish ref files if `--save_reference` and simplify CNVkit logic - [#650](https://github.com/nf-core/sarek/pull/650) - Fix intervals checks ### Deprecated diff --git a/conf/modules.config b/conf/modules.config index 8cad034259..631f56e938 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -130,6 +130,36 @@ process { ] } + withName: 'UNZIP_ALLELES|UNZIP_LOCI|UNZIP_GC|UNZIP_RT'{ + ext.when = { params.tools && params.tools.split(',').contains('ascat')} + } + + withName: 'UNTAR_CHR_DIR'{ + ext.when = { params.tools && params.tools.split(',').contains('controlfreec')} + } + + withName: 'CNVKIT_ANTITARGET' { + ext.when = { params.tools && params.tools.split(',').contains('cnvkit') } + publishDir = [ + enabled: params.save_reference, + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference" }, + pattern: "*{bed}" + ] + } + + withName: 'CNVKIT_REFERENCE' { + ext.prefix = "cnvkit" + ext.when = { params.tools && params.tools.split(',').contains('cnvkit') } + publishDir = [ + enabled: params.save_reference, + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference" }, + pattern: "*{cnn}" + ] + } + + // PREPARE INTERVALS withName: 'CREATE_INTERVALS_BED' { @@ -519,8 +549,8 @@ process { // VARIANT CALLING process{ - // CNVKIT_GERMLINE - withName: 'CNVKIT_BATCH_GERMLINE' { + // CNVKIT + withName: 
'CNVKIT_BATCH' { ext.args = { params.wes ? "--method hybrid --diagram --scatter" : "--method wgs --diagram --scatter" } ext.when = { params.tools && params.tools.split(',').contains('cnvkit') } publishDir = [ @@ -817,36 +847,6 @@ process{ ] } - //CNVKIT - withName: 'CNVKIT_ANTITARGET' { - ext.when = { params.tools && params.tools.split(',').contains('cnvkit') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/reference" }, - pattern: "*{bed}" - ] - } - - withName: 'CNVKIT_REFERENCE' { - ext.prefix = "cnvkit" - ext.when = { params.tools && params.tools.split(',').contains('cnvkit') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/reference" }, - pattern: "*{cnn}" - ] - } - - withName: 'CNVKIT_BATCH_TUMORONLY' { - ext.args = { params.wes ? "--method hybrid --diagram --scatter" : "--method wgs --diagram --scatter" } - ext.when = { params.tools && params.tools.split(',').contains('cnvkit') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/variant_calling/${meta.id}/cnvkit" }, - pattern: "*{bed,cnn,cnr,cns,pdf,png}" - ] - } - //MANTA withName: 'MERGE_MANTA_TUMOR' { ext.prefix = {"${meta.id}.manta.tumor_sv"} @@ -1044,17 +1044,6 @@ process{ } } - //CNVKIT - withName: 'CNVKIT_BATCH_SOMATIC' { - ext.args = { params.wes ? 
"--method hybrid --diagram --scatter" : "--method wgs --diagram --scatter" } - ext.when = { params.tools && params.tools.split(',').contains('cnvkit') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/variant_calling/${meta.id}/cnvkit" }, - pattern: "*{bed,cnn,cnr,cns,pdf,png}" - ] - } - //FREEBAYES if (params.tools && params.tools.split(',').contains('freebayes')) { withName: 'NFCORE_SAREK:SAREK:PAIR_VARIANT_CALLING:RUN_FREEBAYES_SOMATIC:FREEBAYES' { diff --git a/subworkflows/local/germline_variant_calling.nf b/subworkflows/local/germline_variant_calling.nf index 761f976766..18df95f69e 100644 --- a/subworkflows/local/germline_variant_calling.nf +++ b/subworkflows/local/germline_variant_calling.nf @@ -2,7 +2,7 @@ // GERMLINE VARIANT CALLING // -include { RUN_CNVKIT_GERMLINE } from '../nf-core/variantcalling/cnvkit/germline/main.nf' +include { RUN_CNVKIT } from '../nf-core/variantcalling/cnvkit/main.nf' include { RUN_DEEPVARIANT } from '../nf-core/variantcalling/deepvariant/main.nf' include { RUN_FREEBAYES } from '../nf-core/variantcalling/freebayes/main.nf' include { RUN_HAPLOTYPECALLER } from '../nf-core/variantcalling/haplotypecaller/main.nf' @@ -85,12 +85,12 @@ workflow GERMLINE_VARIANT_CALLING { [meta, [], cram] } - RUN_CNVKIT_GERMLINE(cram_recalibrated_cnvkit_germline, + RUN_CNVKIT(cram_recalibrated_cnvkit_germline, fasta, fasta_fai, intervals_bed_combined, []) - ch_versions = ch_versions.mix(RUN_CNVKIT_GERMLINE.out.versions) + ch_versions = ch_versions.mix(RUN_CNVKIT.out.versions) } // DEEPVARIANT diff --git a/subworkflows/local/pair_variant_calling.nf b/subworkflows/local/pair_variant_calling.nf index 152edc2c38..4f315e0278 100644 --- a/subworkflows/local/pair_variant_calling.nf +++ b/subworkflows/local/pair_variant_calling.nf @@ -7,7 +7,7 @@ include { RUN_CONTROLFREEC_SOMATIC } from '../nf-core/variantca include { RUN_FREEBAYES as RUN_FREEBAYES_SOMATIC } from '../nf-core/variantcalling/freebayes/main.nf' include { 
RUN_MANTA_SOMATIC } from '../nf-core/variantcalling/manta/somatic/main.nf' include { RUN_STRELKA_SOMATIC } from '../nf-core/variantcalling/strelka/somatic/main.nf' -include { RUN_CNVKIT_SOMATIC } from '../nf-core/variantcalling/cnvkit/somatic/main.nf' +include { RUN_CNVKIT } from '../nf-core/variantcalling/cnvkit/main.nf' include { RUN_MPILEUP as RUN_MPILEUP_NORMAL } from '../nf-core/variantcalling/mpileup/main' include { RUN_MPILEUP as RUN_MPILEUP_TUMOR } from '../nf-core/variantcalling/mpileup/main' include { RUN_ASCAT_SOMATIC } from '../nf-core/variantcalling/ascat/main' @@ -130,11 +130,11 @@ workflow PAIR_VARIANT_CALLING { [meta, tumor_cram, normal_cram] } - RUN_CNVKIT_SOMATIC( cram_pair_cnvkit_somatic, - fasta, - fasta_fai, - intervals_bed_combined, - []) + RUN_CNVKIT( cram_pair_cnvkit_somatic, + fasta, + fasta_fai, + intervals_bed_combined, + []) } if (tools.split(',').contains('freebayes')){ diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index 55fc5f779d..b6287b6ab0 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -10,6 +10,8 @@ include { BWA_INDEX as BWAMEM1_INDEX } from '../../modules/nf-core/modules/bwa/index/main' include { BWAMEM2_INDEX } from '../../modules/nf-core/modules/bwamem2/index/main' +include {CNVKIT_ANTITARGET } from '../../modules/nf-core/modules/cnvkit/antitarget/main' +include {CNVKIT_REFERENCE } from '../../modules/nf-core/modules/cnvkit/reference/main' include { DRAGMAP_HASHTABLE } from '../../modules/nf-core/modules/dragmap/hashtable/main' include { GATK4_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/modules/gatk4/createsequencedictionary/main' include { MSISENSORPRO_SCAN } from '../../modules/nf-core/modules/msisensorpro/scan/main' @@ -26,17 +28,18 @@ include { UNZIP as UNZIP_RT } from '../../modules/nf-core/m workflow PREPARE_GENOME { take: - ascat_alleles // channel: [optional] ascat allele files - ascat_loci // channel: [optional] ascat 
loci files - ascat_loci_gc // channel: [optional] ascat gc content file - ascat_loci_rt // channel: [optional] ascat replictiming file - chr_dir // channel: [optional] chromosome files - dbsnp // channel: [optional] dbsnp - fasta // channel: [mandatory] fasta - fasta_fai // channel: [optional] fasta_fai - germline_resource // channel: [optional] germline_resource - known_indels // channel: [optional] known_indels - pon // channel: [optional] pon + ascat_alleles // channel: [optional] ascat allele files + ascat_loci // channel: [optional] ascat loci files + ascat_loci_gc // channel: [optional] ascat gc content file + ascat_loci_rt // channel: [optional] ascat replication timing file + chr_dir // channel: [optional] chromosome files + dbsnp // channel: [optional] dbsnp + fasta // channel: [mandatory] fasta + fasta_fai // channel: [optional] fasta_fai + germline_resource // channel: [optional] germline_resource + intervals_bed_combined // channel: [] + known_indels // channel: [optional] known_indels + pon // channel: [optional] pon main: @@ -60,6 +63,10 @@ workflow PREPARE_GENOME { TABIX_KNOWN_INDELS( known_indels.flatten().map{ it -> [[id:it.baseName], it] } ) TABIX_PON(pon.flatten().map{ it -> [[id:it.baseName], it] }) + // prepare a reference for tumor_only mode based on target_baits + CNVKIT_ANTITARGET(intervals_bed_combined.flatten().map{ it -> [[id:it[0].baseName], it] }) + CNVKIT_REFERENCE(fasta, intervals_bed_combined, CNVKIT_ANTITARGET.out.bed.map{ meta, bed -> [bed]} ) + // prepare ascat reference files allele_files = ascat_alleles if (params.ascat_alleles && params.ascat_alleles.endsWith('.zip')) { @@ -99,6 +106,8 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions) ch_versions = ch_versions.mix(BWAMEM1_INDEX.out.versions) ch_versions = ch_versions.mix(BWAMEM2_INDEX.out.versions) + ch_versions = ch_versions.mix(CNVKIT_ANTITARGET.out.versions) + ch_versions = ch_versions.mix(CNVKIT_REFERENCE.out.versions) ch_versions = 
ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions) ch_versions = ch_versions.mix(MSISENSORPRO_SCAN.out.versions) ch_versions = ch_versions.mix(TABIX_DBSNP.out.versions) @@ -118,6 +127,7 @@ workflow PREPARE_GENOME { msisensorpro_scan = MSISENSORPRO_SCAN.out.list.map{ meta, list -> [list] } // path: genome_msi.list pon_tbi = TABIX_PON.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: pon.vcf.gz.tbi chr_files = chr_files + cnvkit_reference = CNVKIT_REFERENCE.out.cnn allele_files = allele_files loci_files = loci_files gc_file = gc_file diff --git a/subworkflows/local/tumor_variant_calling.nf b/subworkflows/local/tumor_variant_calling.nf index 166c85ab3d..f16220fd63 100644 --- a/subworkflows/local/tumor_variant_calling.nf +++ b/subworkflows/local/tumor_variant_calling.nf @@ -8,7 +8,7 @@ include { GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING } from '../../subworkflows/nf- include { RUN_MANTA_TUMORONLY } from '../nf-core/variantcalling/manta/tumoronly/main.nf' include { RUN_STRELKA_SINGLE } from '../nf-core/variantcalling/strelka/single/main.nf' include { RUN_CONTROLFREEC_TUMORONLY } from '../nf-core/variantcalling/controlfreec/tumoronly/main.nf' -include { RUN_CNVKIT_TUMORONLY } from '../nf-core/variantcalling/cnvkit/tumoronly/main.nf' +include { RUN_CNVKIT } from '../nf-core/variantcalling/cnvkit/main.nf' include { RUN_MPILEUP } from '../nf-core/variantcalling/mpileup/main' include { RUN_TIDDIT } from '../nf-core/variantcalling/tiddit/main.nf' @@ -18,6 +18,7 @@ workflow TUMOR_ONLY_VARIANT_CALLING { cram_recalibrated // channel: [mandatory] cram bwa // channel: [optional] bwa chr_files + cnvkit_reference dbsnp // channel: [mandatory] dbsnp dbsnp_tbi // channel: [mandatory] dbsnp_tbi dict // channel: [mandatory] dict @@ -99,13 +100,13 @@ workflow TUMOR_ONLY_VARIANT_CALLING { [meta, cram, []] } - RUN_CNVKIT_TUMORONLY ( cram_recalibrated_cnvkit_tumoronly, - fasta, - fasta_fai, - intervals_bed_combined, - [] ) + RUN_CNVKIT ( cram_recalibrated_cnvkit_tumoronly, + 
fasta, + fasta_fai, + [], + cnvkit_reference ) - ch_versions = ch_versions.mix(RUN_CNVKIT_TUMORONLY.out.versions) + ch_versions = ch_versions.mix(RUN_CNVKIT.out.versions) } if (tools.split(',').contains('freebayes')){ diff --git a/subworkflows/nf-core/variantcalling/cnvkit/germline/main.nf b/subworkflows/nf-core/variantcalling/cnvkit/main.nf similarity index 62% rename from subworkflows/nf-core/variantcalling/cnvkit/germline/main.nf rename to subworkflows/nf-core/variantcalling/cnvkit/main.nf index ab40eeffc8..4b96470994 100644 --- a/subworkflows/nf-core/variantcalling/cnvkit/germline/main.nf +++ b/subworkflows/nf-core/variantcalling/cnvkit/main.nf @@ -1,12 +1,12 @@ // -// CNV calling GERMLINE +// CNVKit calling // // For all modules here: // A when clause condition is defined in the conf/modules.config to determine if the module should be run -include { CNVKIT_BATCH as CNVKIT_BATCH_GERMLINE } from '../../../../../modules/nf-core/modules/cnvkit/batch/main' +include { CNVKIT_BATCH } from '../../../../modules/nf-core/modules/cnvkit/batch/main' -workflow RUN_CNVKIT_GERMLINE { +workflow RUN_CNVKIT { take: cram_recalibrated // channel: [mandatory] cram fasta // channel: [mandatory] fasta @@ -17,9 +17,9 @@ workflow RUN_CNVKIT_GERMLINE { main: ch_versions = Channel.empty() - CNVKIT_BATCH_GERMLINE(cram_recalibrated, fasta, fasta_fai, targets, []) + CNVKIT_BATCH(cram_recalibrated, fasta, fasta_fai, targets, reference) - ch_versions = ch_versions.mix(CNVKIT_BATCH_GERMLINE.out.versions) + ch_versions = ch_versions.mix(CNVKIT_BATCH.out.versions) emit: versions = ch_versions // channel: [ versions.yml ] diff --git a/subworkflows/nf-core/variantcalling/cnvkit/somatic/main.nf b/subworkflows/nf-core/variantcalling/cnvkit/somatic/main.nf deleted file mode 100644 index 18bb718bca..0000000000 --- a/subworkflows/nf-core/variantcalling/cnvkit/somatic/main.nf +++ /dev/null @@ -1,28 +0,0 @@ -// -// CNV calling -// -// For all modules here: -// A when clause condition is defined in the 
conf/modules.config to determine if the module should be run - -include { CNVKIT_BATCH as CNVKIT_BATCH_SOMATIC } from '../../../../../modules/nf-core/modules/cnvkit/batch/main' - -workflow RUN_CNVKIT_SOMATIC { - take: - cram_pair // channel: [mandatory] cram tumor, cram normal - fasta // channel: [mandatory] fasta - fasta_fai // channel: [optional] fasta_fai - targets // channel: [mandatory] bed - reference // channel: [optional] - - main: - ch_versions = Channel.empty() - - CNVKIT_BATCH_SOMATIC(cram_pair, fasta, fasta_fai, targets, []) - - ch_versions = ch_versions.mix(CNVKIT_BATCH_SOMATIC.out.versions.first()) - - emit: - versions = ch_versions // channel: [ versions.yml ] - - -} diff --git a/subworkflows/nf-core/variantcalling/cnvkit/tumoronly/main.nf b/subworkflows/nf-core/variantcalling/cnvkit/tumoronly/main.nf deleted file mode 100644 index 560680c692..0000000000 --- a/subworkflows/nf-core/variantcalling/cnvkit/tumoronly/main.nf +++ /dev/null @@ -1,41 +0,0 @@ -// -// CNV calling TUMOR_ONLY -// -// For all modules here: -// A when clause condition is defined in the conf/modules.config to determine if the module should be run - - -include {CNVKIT_ANTITARGET } from '../../../../../modules/nf-core/modules/cnvkit/antitarget/main' -include {CNVKIT_REFERENCE } from '../../../../../modules/nf-core/modules/cnvkit/reference/main' -include {CNVKIT_BATCH as CNVKIT_BATCH_TUMORONLY } from '../../../../../modules/nf-core/modules/cnvkit/batch/main' - -workflow RUN_CNVKIT_TUMORONLY { - take: - cram_recalibrated // channel: [mandatory] cram tumor - fasta // channel: [mandatory] fasta - fasta_fai // channel: [optional] fasta_fai - targets // channel: [mandatory] bed - reference // channel: [] cnn - - main: - ch_versions = Channel.empty() - - // prepare a reference for tumor_only mode based on target_baits - - CNVKIT_ANTITARGET(targets.map{ it -> [[id:it[0].baseName], it] }) - - CNVKIT_REFERENCE(fasta, targets, CNVKIT_ANTITARGET.out.bed.map{ meta, bed -> [bed]} ) - - // use 
reference for calling CNVs - // cram_input needs the fasta reference genome for bam_conversion - - CNVKIT_BATCH_TUMORONLY(cram_recalibrated, fasta, fasta_fai, [], CNVKIT_REFERENCE.out.cnn) - - ch_versions = ch_versions.mix(CNVKIT_ANTITARGET.out.versions) - ch_versions = ch_versions.mix(CNVKIT_REFERENCE.out.versions) - ch_versions = ch_versions.mix(CNVKIT_BATCH_TUMORONLY.out.versions) - - emit: - versions = ch_versions // channel: [ versions.yml ] - -} diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 118e009ea4..bfc40004d0 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -292,6 +292,16 @@ workflow SAREK { // To gather used softwares versions for MultiQC ch_versions = Channel.empty() + // Build intervals if needed + PREPARE_INTERVALS(fasta_fai) + + // Intervals for speed up preprocessing/variant calling by spread/gather + intervals_bed_combined = params.no_intervals ? Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_combined // [interval.bed] all intervals in one file + intervals_for_preprocessing = params.wes ? intervals_bed_combined : [] // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS) + + intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather + intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather + // Build indices if needed PREPARE_GENOME( ascat_alleles, @@ -303,14 +313,16 @@ workflow SAREK { fasta, fasta_fai, germline_resource, + intervals_bed_combined, known_indels, pon) // Gather built indices or get them from the params allele_files = PREPARE_GENOME.out.allele_files bwa = params.fasta ? params.bwa ? Channel.fromPath(params.bwa).collect() : PREPARE_GENOME.out.bwa : [] - chr_files = PREPARE_GENOME.out.chr_files bwamem2 = params.fasta ? params.bwamem2 ? 
Channel.fromPath(params.bwamem2).collect() : PREPARE_GENOME.out.bwamem2 : [] + chr_files = PREPARE_GENOME.out.chr_files + cnvkit_reference = params.tools && params.tools.split(',').contains('cnvkit') ? PREPARE_GENOME.out.cnvkit_reference : Channel.empty() dragmap = params.fasta ? params.dragmap ? Channel.fromPath(params.dragmap).collect() : PREPARE_GENOME.out.hashtable : [] dict = params.fasta ? params.dict ? Channel.fromPath(params.dict).collect() : PREPARE_GENOME.out.dict : [] fasta_fai = params.fasta ? params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : PREPARE_GENOME.out.fasta_fai : [] @@ -333,16 +345,6 @@ workflow SAREK { known_sites = dbsnp.concat(known_indels).collect() known_sites_tbi = dbsnp_tbi.concat(known_indels_tbi).collect() - // Build intervals if needed - PREPARE_INTERVALS(fasta_fai) - - // Intervals for speed up preprocessing/variant calling by spread/gather - intervals_bed_combined = params.no_intervals ? Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_combined // [interval.bed] all intervals in one file - intervals_for_preprocessing = params.wes ? intervals_bed_combined : [] // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS) - - intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather - intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather - // Gather used softwares versions ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions) ch_versions = ch_versions.mix(PREPARE_INTERVALS.out.versions) @@ -857,6 +859,7 @@ workflow SAREK { cram_variant_calling_tumor_only, [], chr_files, + cnvkit_reference, dbsnp, dbsnp_tbi, dict,