diff --git a/CHANGELOG.md b/CHANGELOG.md index 5422ec561..0d1499919 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -64,6 +64,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#463](https://github.com/nf-core/sarek/pull/463), [#468](https://github.com/nf-core/sarek/pull/468) - Fix `nf-core lint` - [#513](https://github.com/nf-core/sarek/pull/513), [#527](https://github.com/nf-core/sarek/pull/527) - CNV is back - [#529](https://github.com/nf-core/sarek/pull/529) - Do not save `versions.yml` files +- [#524](https://github.com/nf-core/sarek/pull/524) - Fix intervals usage by counting the actual list of scatter/gather files produced and not overall number of intervals ### Deprecated diff --git a/conf/modules.config b/conf/modules.config index e622d76c5..a47053d42 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -298,8 +298,8 @@ process { ext.prefix = { "${meta.id}.md" } publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/reports/${meta.id}/markduplicates" }, - pattern: "metrics" + path: { "${params.outdir}/reports/markduplicates/${meta.id}" }, + pattern: "*metrics" ] } @@ -344,17 +344,24 @@ process { // PREPARE_RECALIBRATION - withName: 'BASERECALIBRATOR|BASERECALIBRATOR_SPARK|GATHERBQSRREPORTS' { + withName: 'BASERECALIBRATOR|BASERECALIBRATOR_SPARK' { ext.prefix = {"${meta.id}.recal"} publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/preprocessing/${meta.id}/recal_table" }, - pattern: "*table" + pattern: "*table", + saveAs: { meta.num_intervals > 1 ? 
null : it } ] } withName: 'GATHERBQSRREPORTS' { - ext.when = { !params.no_intervals } + ext.prefix = {"${meta.id}.recal"} + ext.when = { meta.num_intervals > 1 } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/${meta.id}/recal_table" }, + pattern: "*table", + ] } // RECALIBRATE @@ -362,16 +369,16 @@ process { withName: 'APPLYBQSR|APPLYBQSR_SPARK' { ext.prefix = {"${meta.id}.recal"} publishDir = [ - enabled: !params.no_intervals, mode: params.publish_dir_mode, path: { "${params.outdir}/preprocessing/${meta.id}/recalibrated" }, - pattern: "*cram" + pattern: "*cram", + saveAs: { meta.num_intervals > 1 ? null : it } ] } withName: 'NFCORE_SAREK:SAREK:(RECALIBRATE|RECALIBRATE_SPARK):MERGE_INDEX_CRAM:MERGE_CRAM' { ext.prefix = { "${meta.id}.recal" } - ext.when = { !params.no_intervals } + ext.when = { meta.num_intervals > 1 } publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/preprocessing/${meta.id}/recalibrated" }, @@ -473,7 +480,6 @@ process{ // DEEPVARIANT withName: 'CONCAT_DEEPVARIANT_.*' { publishDir = [ - enabled: !params.no_intervals, mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/${meta.id}/deepvariant" }, pattern: "*{vcf.gz,vcf.gz.tbi}" @@ -486,24 +492,23 @@ process{ ext.args = { params.wes ? "--model_type WES" : "--model_type WGS" } ext.when = { params.tools && params.tools.contains('deepvariant') } publishDir = [ - enabled: params.no_intervals, mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/${meta.id}/deepvariant" }, - pattern: "*{vcf.gz,vcf.gz.tbi}" + pattern: "*vcf.gz", + saveAs: { meta.num_intervals > 1 ? 
null : it } ] } - withName : 'TABIX_VC_DEEPVARIANT.*' { + withName : 'TABIX_VC_DEEPVARIANT_.*' { publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/${meta.id}/deepvariant" }, - pattern: "*{vcf.gz,vcf.gz.tbi}" + pattern: "*tbi" ] } // FREEBAYES withName: 'CONCAT_FREEBAYES' { publishDir = [ - enabled: !params.no_intervals, mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/${meta.id}/freebayes" }, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } @@ -513,10 +518,10 @@ process{ ext.args = '--min-alternate-fraction 0.1 --min-mapping-quality 1' ext.when = { params.tools && params.tools.contains('freebayes') } publishDir = [ - enabled: params.no_intervals, mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/${meta.id}/freebayes" }, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + pattern: "*vcf.gz", + saveAs: { meta.num_intervals > 1 ? null : it } ] } withName : 'TABIX_VC_FREEBAYES' { @@ -561,7 +566,6 @@ process{ // MANTA withName: 'CONCAT_MANTA.*' { publishDir = [ - enabled: !params.no_intervals, mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/${meta.id}/manta" }, pattern: "*{vcf.gz,vcf.gz.tbi}" @@ -580,17 +584,16 @@ process{ ext.args = { params.wes ? "--exome" : "" } ext.when = { params.tools && params.tools.contains('manta') } publishDir = [ - enabled: params.no_intervals, mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/${meta.id}/manta" }, - pattern: "*{vcf.gz,vcf.gz.tbi}" + pattern: "*{vcf.gz,vcf.gz.tbi}", + saveAs: { meta.num_intervals > 1 ? null : it } ] } // STRELKA withName: 'CONCAT_STRELKA.*' { publishDir = [ - enabled: !params.no_intervals, mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/${meta.id}/strelka" }, pattern: "*{vcf.gz,vcf.gz.tbi}" @@ -606,10 +609,10 @@ process{ ext.args = { params.wes ? 
"--exome" : "" } ext.when = { params.tools && params.tools.contains('strelka') } publishDir = [ - enabled: params.no_intervals, mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/${meta.id}/strelka" }, - pattern: "*{vcf.gz,vcf.gz.tbi}" + pattern: "*{vcf.gz,vcf.gz.tbi}", + saveAs: { meta.num_intervals > 1 ? null : it } ] } @@ -632,9 +635,8 @@ process{ } withName: 'CAT_MPILEUP_.*' { - ext.when = { !params.no_intervals } + ext.when = { meta.num_intervals > 1 } publishDir = [ - enabled: !params.no_intervals, mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/${meta.id}/controlfreec" }, pattern: "*mpileup" @@ -712,10 +714,10 @@ process{ withName: 'MPILEUP_.*' { ext.when = { params.tools && params.tools.contains('controlfreec') } publishDir = [ - enabled: params.no_intervals, mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/${meta.id}/controlfreec" }, - pattern: "*mpileup" + pattern: "*mpileup", + saveAs: { meta.num_intervals > 1 ? null : it } ] } @@ -959,11 +961,22 @@ process { withName: 'ENSEMBLVEP' { ext.args = '--everything --filter_common --per_gene --total_length --offline' container = { "nfcore/vep:104.3.${params.genome}" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/EnsemblVEP/${meta.id}/${meta.variantcaller}" }, + pattern: "*html" + ] } withName: 'SNPEFF' { ext.args = '-nodownload -canon -v' container = { "nfcore/snpeff:5.0.${params.genome}" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/SnpEff/${meta.id}/${meta.variantcaller}" }, + pattern: "*csv", + saveAs: { params.tools.contains('snpeff') ? 
it : null } + ] } withName: 'ANNOTATION_BGZIPTABIX' { @@ -979,6 +992,12 @@ if (params.tools && (params.tools.contains('snpeff') || params.tools.contains('m process { withName: 'NFCORE_SAREK:SAREK:ANNOTATE:ANNOTATION_SNPEFF:ANNOTATION_BGZIPTABIX' { ext.prefix = {"${meta.id}_snpEff.ann.vcf"} + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/annotation/${meta.id}/${meta.variantcaller}" }, + pattern: "*{gz,gz.tbi}", + saveAs: { params.tools.contains('snpeff') ? it : null } + ] } } } diff --git a/modules.json b/modules.json index e0040c33b..651c22ec8 100644 --- a/modules.json +++ b/modules.json @@ -25,7 +25,7 @@ "git_sha": "eeda4136c096688d04cc40bb3c70d948213ed641" }, "cat/fastq": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + "git_sha": "9aadd9a6d3f5964476582319b3a1c54a3e3fe7c9" }, "cnvkit/batch": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" @@ -121,7 +121,7 @@ "git_sha": "409af2f27cbe45109acc7fee70718d2bf20aa449" }, "gatk4/haplotypecaller": { - "git_sha": "409af2f27cbe45109acc7fee70718d2bf20aa449" + "git_sha": "68f1c27169946f931ea4318911de5681f88b2961" }, "gatk4/intervallisttobed": { "git_sha": "409af2f27cbe45109acc7fee70718d2bf20aa449" @@ -130,7 +130,7 @@ "git_sha": "409af2f27cbe45109acc7fee70718d2bf20aa449" }, "gatk4/markduplicates": { - "git_sha": "409af2f27cbe45109acc7fee70718d2bf20aa449" + "git_sha": "0511e7fbbfa4ba41940d33b687b1cc90227b4eb8" }, "gatk4/markduplicatesspark": { "git_sha": "e04970b7d249365cafa5a52912f9a28840481c05" @@ -217,16 +217,16 @@ "git_sha": "49b18b1639f4f7104187058866a8fab33332bdfe" }, "tabix/tabix": { - "git_sha": "49b18b1639f4f7104187058866a8fab33332bdfe" + "git_sha": "b3e9b88e80880f450ad79a95b2b7aa05e1de5484" }, "tiddit/sv": { - "git_sha": "fd5f6f5f4ffef4ab5a4e809bd3211bbc71c38d30" + "git_sha": "57cb730e78634673fb254a77606e014ce942734c" }, "trimgalore": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + "git_sha": "85ec13ff1fc2196c5a507ea497de468101baabed" }, "untar": { - 
"git_sha": "9ae34a01d1747019fd37753ff4cafb05aec35a2b" + "git_sha": "9aadd9a6d3f5964476582319b3a1c54a3e3fe7c9" }, "vcftools": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" diff --git a/modules/nf-core/modules/cat/fastq/main.nf b/modules/nf-core/modules/cat/fastq/main.nf index bf0877c3e..b68548959 100644 --- a/modules/nf-core/modules/cat/fastq/main.nf +++ b/modules/nf-core/modules/cat/fastq/main.nf @@ -4,8 +4,8 @@ process CAT_FASTQ { conda (params.enable_conda ? "conda-forge::sed=4.7" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' : - 'biocontainers/biocontainers:v1.2.0_cv1' }" + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'ubuntu:20.04' }" input: tuple val(meta), path(reads, stageAs: "input*/*") diff --git a/modules/nf-core/modules/gatk4/haplotypecaller/main.nf b/modules/nf-core/modules/gatk4/haplotypecaller/main.nf index 57f69ecd3..2cd9e7d4e 100644 --- a/modules/nf-core/modules/gatk4/haplotypecaller/main.nf +++ b/modules/nf-core/modules/gatk4/haplotypecaller/main.nf @@ -17,7 +17,7 @@ process GATK4_HAPLOTYPECALLER { output: tuple val(meta), path("*.vcf.gz"), emit: vcf - tuple val(meta), path("*.tbi") , emit: tbi + tuple val(meta), path("*.tbi") , optional:true, emit: tbi path "versions.yml" , emit: versions when: diff --git a/modules/nf-core/modules/gatk4/markduplicates/main.nf b/modules/nf-core/modules/gatk4/markduplicates/main.nf index e8a98156c..2650925b7 100644 --- a/modules/nf-core/modules/gatk4/markduplicates/main.nf +++ b/modules/nf-core/modules/gatk4/markduplicates/main.nf @@ -12,7 +12,7 @@ process GATK4_MARKDUPLICATES { output: tuple val(meta), path("*.bam") , emit: bam - tuple val(meta), path("*.bai") , emit: bai + tuple val(meta), path("*.bai") , optional:true, emit: bai tuple val(meta), path("*.metrics"), emit: metrics path "versions.yml" , emit: 
versions diff --git a/modules/nf-core/modules/tabix/tabix/main.nf b/modules/nf-core/modules/tabix/tabix/main.nf index c9dab068a..e155e4685 100644 --- a/modules/nf-core/modules/tabix/tabix/main.nf +++ b/modules/nf-core/modules/tabix/tabix/main.nf @@ -11,7 +11,8 @@ process TABIX_TABIX { tuple val(meta), path(tab) output: - tuple val(meta), path("*.tbi"), emit: tbi + tuple val(meta), path("*.tbi"), optional:true, emit: tbi + tuple val(meta), path("*.csi"), optional:true, emit: csi path "versions.yml" , emit: versions when: diff --git a/modules/nf-core/modules/tabix/tabix/meta.yml b/modules/nf-core/modules/tabix/tabix/meta.yml index 89478abe4..fcc6e5246 100644 --- a/modules/nf-core/modules/tabix/tabix/meta.yml +++ b/modules/nf-core/modules/tabix/tabix/meta.yml @@ -31,6 +31,10 @@ output: type: file description: tabix index file pattern: "*.{tbi}" + - csi: + type: file + description: coordinate sorted index file + pattern: "*.{csi}" - versions: type: file description: File containing software versions diff --git a/modules/nf-core/modules/tiddit/sv/main.nf b/modules/nf-core/modules/tiddit/sv/main.nf index 1bf7146af..b3e3813c5 100644 --- a/modules/nf-core/modules/tiddit/sv/main.nf +++ b/modules/nf-core/modules/tiddit/sv/main.nf @@ -24,7 +24,7 @@ process TIDDIT_SV { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def reference = fasta == "dummy_file.txt" ? "--ref $fasta" : "" + def reference = fasta ? 
"--ref $fasta" : "" """ tiddit \\ --sv \\ diff --git a/modules/nf-core/modules/trimgalore/main.nf b/modules/nf-core/modules/trimgalore/main.nf index 9487c7990..3a3fca904 100644 --- a/modules/nf-core/modules/trimgalore/main.nf +++ b/modules/nf-core/modules/trimgalore/main.nf @@ -11,12 +11,13 @@ process TRIMGALORE { tuple val(meta), path(reads) output: - tuple val(meta), path("*.fq.gz") , emit: reads - tuple val(meta), path("*report.txt"), emit: log - path "versions.yml" , emit: versions + tuple val(meta), path("*{trimmed,val}*.fq.gz"), emit: reads + tuple val(meta), path("*report.txt") , emit: log + path "versions.yml" , emit: versions - tuple val(meta), path("*.html"), emit: html optional true - tuple val(meta), path("*.zip") , emit: zip optional true + tuple val(meta), path("*unpaired*.fq.gz") , emit: unpaired, optional: true + tuple val(meta), path("*.html") , emit: html , optional: true + tuple val(meta), path("*.zip") , emit: zip , optional: true when: task.ext.when == null || task.ext.when @@ -52,6 +53,7 @@ process TRIMGALORE { $c_r1 \\ $tpc_r1 \\ ${prefix}.fastq.gz + cat <<-END_VERSIONS > versions.yml "${task.process}": trimgalore: \$(echo \$(trim_galore --version 2>&1) | sed 's/^.*version //; s/Last.*\$//') @@ -73,6 +75,7 @@ process TRIMGALORE { $tpc_r2 \\ ${prefix}_1.fastq.gz \\ ${prefix}_2.fastq.gz + cat <<-END_VERSIONS > versions.yml "${task.process}": trimgalore: \$(echo \$(trim_galore --version 2>&1) | sed 's/^.*version //; s/Last.*\$//') diff --git a/modules/nf-core/modules/trimgalore/meta.yml b/modules/nf-core/modules/trimgalore/meta.yml index e99a88334..439f566df 100644 --- a/modules/nf-core/modules/trimgalore/meta.yml +++ b/modules/nf-core/modules/trimgalore/meta.yml @@ -37,6 +37,11 @@ output: List of input adapter trimmed FastQ files of size 1 and 2 for single-end and paired-end data, respectively. 
pattern: "*.{fq.gz}" + - unpaired: + type: file + description: | + FastQ files containing unpaired reads from read 1 or read 2 + pattern: "*unpaired*.fq.gz" - html: type: file description: FastQC report (optional) diff --git a/modules/nf-core/modules/untar/main.nf b/modules/nf-core/modules/untar/main.nf index bbfa0bfe7..058d17644 100644 --- a/modules/nf-core/modules/untar/main.nf +++ b/modules/nf-core/modules/untar/main.nf @@ -2,10 +2,10 @@ process UNTAR { tag "$archive" label 'process_low' - conda (params.enable_conda ? "conda-forge::tar=1.34" : null) + conda (params.enable_conda ? "conda-forge::sed=4.7" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv2/biocontainers_v1.2.0_cv2.img' : - 'biocontainers/biocontainers:v1.2.0_cv2' }" + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'ubuntu:20.04' }" input: tuple val(meta), path(archive) diff --git a/subworkflows/local/germline_variant_calling.nf b/subworkflows/local/germline_variant_calling.nf index ef59cc8cf..505049050 100644 --- a/subworkflows/local/germline_variant_calling.nf +++ b/subworkflows/local/germline_variant_calling.nf @@ -21,7 +21,6 @@ workflow GERMLINE_VARIANT_CALLING { intervals_bed_gz_tbi // channel: [mandatory] intervals/target regions index zipped and indexed intervals_bed_combine_gz_tbi // channel: [mandatory] intervals/target regions index zipped and indexed in one file intervals_bed_combine_gz // channel: [mandatory] intervals/target regions index zipped in one file - num_intervals // val: number of intervals that are used to parallelize exection, either based on capture kit or GATK recommended for WGS // joint_germline // val: true/false on whether to run joint_germline calling, only works in combination with haplotypecaller at the moment main: @@ -38,29 +37,38 @@ workflow GERMLINE_VARIANT_CALLING { // Remap channel with intervals 
cram_recalibrated_intervals = cram_recalibrated.combine(intervals) - .map{ meta, cram, crai, intervals -> - sample = meta.sample - //new_intervals = num_intervals > 1 ? intervals : [] - new_intervals = intervals.baseName != "no_intervals" ? intervals : [] - id = new_intervals ? sample + "_" + new_intervals.baseName : sample - [[ id: id, sample: meta.sample, gender: meta.gender, status: meta.status, patient: meta.patient ], cram, crai, new_intervals] + .map{ meta, cram, crai, intervals, num_intervals -> + new_meta = meta.clone() + + // If either no scatter/gather is done, i.e. no interval (0) or one interval (1), then don't rename samples + new_meta.id = num_intervals <= 1 ? meta.sample : meta.sample + "_" + intervals.baseName + new_meta.num_intervals = num_intervals + + //If no interval file provided (0) then add empty list + intervals_new = num_intervals == 0 ? [] : intervals + + [new_meta, cram, crai, intervals_new] } // Remap channel with gzipped intervals + indexes cram_recalibrated_intervals_gz_tbi = cram_recalibrated.combine(intervals_bed_gz_tbi) - .map{ meta, cram, crai, bed, tbi -> - sample = meta.sample - //new_bed = num_intervals > 1 ? bed : [] //TODO can I pass in empty lists? Then I only need to work with the id line - new_bed = bed.simpleName != "no_intervals" ? bed : [] - new_tbi = tbi.simpleName != "no_intervals" ? tbi : [] - id = new_bed ? sample + "_" + new_bed.simpleName : sample - new_meta = [ id: id, sample: meta.sample, gender: meta.gender, status: meta.status, patient: meta.patient ] - [new_meta, cram, crai, new_bed, new_tbi] + .map{ meta, cram, crai, bed_tbi, num_intervals -> + new_meta = meta.clone() + + // If either no scatter/gather is done, i.e. no interval (0) or one interval (1), then don't rename samples + new_meta.id = num_intervals <= 1 ? meta.sample : meta.sample + "_" + bed_tbi[0].simpleName + new_meta.num_intervals = num_intervals + + //If no interval file provided (0) then add empty list + bed_new = num_intervals == 0 ? 
[] : bed_tbi[0] + tbi_new = num_intervals == 0 ? [] : bed_tbi[1] + + [new_meta, cram, crai, bed_new, tbi_new] } // DEEPVARIANT if(params.tools.contains('deepvariant')){ - RUN_DEEPVARIANT(cram_recalibrated_intervals, fasta, fasta_fai, intervals_bed_combine_gz, num_intervals) + RUN_DEEPVARIANT(cram_recalibrated_intervals, fasta, fasta_fai, intervals_bed_combine_gz) deepvariant_vcf = RUN_DEEPVARIANT.out.deepvariant_vcf ch_versions = ch_versions.mix(RUN_DEEPVARIANT.out.versions) @@ -73,7 +81,7 @@ workflow GERMLINE_VARIANT_CALLING { .map{ meta, cram, crai, intervals -> [meta, cram, crai, [], [], intervals] } - RUN_FREEBAYES(cram_recalibrated_intervals_freebayes, fasta, fasta_fai, intervals_bed_combine_gz, num_intervals) + RUN_FREEBAYES(cram_recalibrated_intervals_freebayes, fasta, fasta_fai, intervals_bed_combine_gz) freebayes_vcf = RUN_FREEBAYES.out.freebayes_vcf ch_versions = ch_versions.mix(RUN_FREEBAYES.out.versions) @@ -88,8 +96,7 @@ workflow GERMLINE_VARIANT_CALLING { dbsnp, dbsnp_tbi, intervals_bed_combine_gz, - intervals_bed_combine_gz_tbi, - num_intervals) + intervals_bed_combine_gz_tbi) haplotypecaller_vcf = RUN_HAPLOTYPECALLER.out.haplotypecaller_vcf //genotype_gvcf = RUN_HAPLOTYPECALLER.out.genotype_gvcf @@ -101,8 +108,7 @@ workflow GERMLINE_VARIANT_CALLING { RUN_MANTA_GERMLINE (cram_recalibrated_intervals_gz_tbi, fasta, fasta_fai, - intervals_bed_combine_gz, - num_intervals) + intervals_bed_combine_gz) manta_vcf = RUN_MANTA_GERMLINE.out.manta_vcf ch_versions = ch_versions.mix(RUN_MANTA_GERMLINE.out.versions) @@ -113,8 +119,7 @@ workflow GERMLINE_VARIANT_CALLING { RUN_STRELKA_SINGLE(cram_recalibrated_intervals_gz_tbi, fasta, fasta_fai, - intervals_bed_combine_gz, - num_intervals) + intervals_bed_combine_gz) strelka_vcf = RUN_STRELKA_SINGLE.out.strelka_vcf ch_versions = ch_versions.mix(RUN_STRELKA_SINGLE.out.versions) diff --git a/subworkflows/local/pair_variant_calling.nf b/subworkflows/local/pair_variant_calling.nf index c06bf700f..41d21b6c2 100644 --- 
a/subworkflows/local/pair_variant_calling.nf +++ b/subworkflows/local/pair_variant_calling.nf @@ -21,8 +21,6 @@ workflow PAIR_VARIANT_CALLING { intervals_bed_combined_gz_tbi // channel: [mandatory] intervals/target regions all in one file zipped and indexed intervals_bed_combine_gz // channel: [mandatory] intervals/target regions zipped in one file intervals_bed_combined // channel: [mandatory] intervals/target regions in one file unzipped - num_intervals // val: number of intervals that are used to parallelize exection, either based on capture kit or GATK recommended for WGS - no_intervals msisensorpro_scan // channel: [optional] msisensorpro_scan germline_resource // channel: [optional] germline_resource germline_resource_tbi // channel: [optional] germline_resource_tbi @@ -41,26 +39,35 @@ workflow PAIR_VARIANT_CALLING { msisensorpro_output = Channel.empty() mutect2_vcf = Channel.empty() - cram_pair_intervals_gz_tbi = cram_pair.combine(intervals_bed_gz_tbi) - .map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai, bed, tbi -> - normal_id = meta.normal_id - tumor_id = meta.tumor_id - - new_bed = bed.simpleName != "no_intervals" ? bed : [] - new_tbi = tbi.simpleName != "no_intervals" ? tbi : [] - id = bed.simpleName != "no_intervals" ? tumor_id + "_vs_" + normal_id + "_" + bed.simpleName : tumor_id + "_vs_" + normal_id - new_meta = [ id: id, normal_id: meta.normal_id, tumor_id: meta.tumor_id, gender: meta.gender, patient: meta.patient] - [new_meta, normal_cram, normal_crai, tumor_cram, tumor_crai, new_bed, new_tbi] + // Remap channel with intervals + cram_pair_intervals = cram_pair.combine(intervals) + .map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai, intervals, num_intervals -> + new_meta = meta.clone() + + // If either no scatter/gather is done, i.e. no interval (0) or one interval (1), then don't rename samples + new_meta.id = num_intervals <= 1 ? 
meta.tumor_id + "_vs_" + meta.normal_id : meta.tumor_id + "_vs_" + meta.normal_id + "_" + intervals.baseName + new_meta.num_intervals = num_intervals + + //If no interval file provided (0) then add empty list + intervals_new = num_intervals == 0 ? [] : intervals + + [new_meta, normal_cram, normal_crai, tumor_cram, tumor_crai, intervals_new] } - cram_pair_intervals = cram_pair.combine(intervals) - .map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai, intervals -> - normal_id = meta.normal_id - tumor_id = meta.tumor_id - new_intervals = intervals.baseName != "no_intervals" ? intervals : [] - id = new_intervals ? tumor_id + "_vs_" + normal_id + "_" + new_intervals.baseName : tumor_id + "_vs_" + normal_id - new_meta = [ id: id, normal_id: meta.normal_id, tumor_id: meta.tumor_id, gender: meta.gender, patient: meta.patient ] - [new_meta, normal_cram, normal_crai, tumor_cram, tumor_crai, intervals] + // Remap channel with gzipped intervals + indexes + cram_pair_intervals_gz_tbi = cram_pair.combine(intervals_bed_gz_tbi) + .map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai, bed_tbi, num_intervals -> + new_meta = meta.clone() + + // If either no scatter/gather is done, i.e. no interval (0) or one interval (1), then don't rename samples + new_meta.id = num_intervals <= 1 ? meta.tumor_id + "_vs_" + meta.normal_id : meta.tumor_id + "_vs_" + meta.normal_id + "_" + bed_tbi[0].simpleName + new_meta.num_intervals = num_intervals + + //If no interval file provided (0) then add empty list + bed_new = num_intervals == 0 ? [] : bed_tbi[0] + tbi_new = num_intervals == 0 ? 
[] : bed_tbi[1] + + [new_meta, normal_cram, normal_crai, tumor_cram, tumor_crai, bed_new, tbi_new] } if (tools.contains('controlfreec')){ @@ -82,8 +89,7 @@ workflow PAIR_VARIANT_CALLING { dbsnp_tbi, chr_files, mappability, - intervals_bed_combined, - num_intervals) + intervals_bed_combined) ch_versions = ch_versions.mix(RUN_CONTROLFREEC_SOMATIC.out.versions) } @@ -91,8 +97,7 @@ workflow PAIR_VARIANT_CALLING { RUN_MANTA_SOMATIC( cram_pair_intervals_gz_tbi, fasta, fasta_fai, - intervals_bed_combine_gz, - num_intervals) + intervals_bed_combine_gz) manta_vcf = RUN_MANTA_SOMATIC.out.manta_vcf manta_candidate_small_indels_vcf = RUN_MANTA_SOMATIC.out.manta_candidate_small_indels_vcf @@ -104,18 +109,20 @@ workflow PAIR_VARIANT_CALLING { if (tools.contains('manta')) { cram_pair_strelka = cram_pair.join(manta_candidate_small_indels_vcf) - .join(manta_candidate_small_indels_vcf_tbi) - .combine(intervals_bed_gz_tbi) - .map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai, vcf, vcf_tbi, bed, bed_tbi -> - normal_id = meta.normal_id - tumor_id = meta.tumor_id - - new_bed = bed.simpleName != "no_intervals" ? bed : [] - new_tbi = bed_tbi.simpleName != "no_intervals" ? bed_tbi : [] - id = bed.simpleName != "no_intervals" ? tumor_id + "_vs_" + normal_id + "_" + bed.simpleName : tumor_id + "_vs_" + normal_id - new_meta = [ id: id, normal_id: meta.normal_id, tumor_id: meta.tumor_id, gender: meta.gender, patient: meta.patient] - [new_meta, normal_cram, normal_crai, tumor_cram, tumor_crai, vcf, vcf_tbi, new_bed, new_tbi] - } + .join(manta_candidate_small_indels_vcf_tbi) + .combine(intervals_bed_gz_tbi) + .map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai, vcf, vcf_tbi, bed_tbi, num_intervals -> + def new_meta = meta.clone() // work on a copy so the meta map shared with other channel consumers is never mutated + // If either no scatter/gather is done, i.e. no interval (0) or one interval (1), then don't rename samples + new_meta.id = num_intervals <= 1 ? 
meta.tumor_id + "_vs_" + meta.normal_id : meta.tumor_id + "_vs_" + meta.normal_id + "_" + bed_tbi[0].simpleName + new_meta.num_intervals = num_intervals + + //If no interval file provided (0) then add empty list + bed_new = num_intervals == 0 ? [] : bed_tbi[0] + tbi_new = num_intervals == 0 ? [] : bed_tbi[1] + + [new_meta, normal_cram, normal_crai, tumor_cram, tumor_crai, vcf, vcf_tbi, bed_new, tbi_new] + } } else { cram_pair_strelka = cram_pair_intervals_gz_tbi.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai, bed, tbi -> @@ -126,8 +133,7 @@ workflow PAIR_VARIANT_CALLING { RUN_STRELKA_SOMATIC(cram_pair_strelka, fasta, fasta_fai, - intervals_bed_combine_gz, - num_intervals) + intervals_bed_combine_gz) strelka_vcf = RUN_STRELKA_SOMATIC.out.strelka_vcf ch_versions = ch_versions.mix(RUN_STRELKA_SOMATIC.out.versions) @@ -141,27 +147,27 @@ workflow PAIR_VARIANT_CALLING { msisensorpro_output = msisensorpro_output.mix(MSISENSORPRO_MSI_SOMATIC.out.output_report) } - if (tools.contains('mutect2')) { - cram_pair_intervals.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai, intervals -> - [meta, [normal_cram, tumor_cram], [normal_crai, tumor_crai], intervals, ['normal']] - }.set{cram_pair_mutect2} - - GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING( - cram_pair_mutect2, - fasta, - fasta_fai, - dict, - germline_resource, - germline_resource_tbi, - panel_of_normals, - panel_of_normals_tbi, - intervals_bed_combine_gz, - num_intervals - ) - - mutect2_vcf = GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING.out.mutect2_vcf - ch_versions = ch_versions.mix(GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING.out.versions) - } + // if (tools.contains('mutect2')) { + // cram_pair_intervals.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai, intervals -> + // [meta, [normal_cram, tumor_cram], [normal_crai, tumor_crai], intervals, ['normal']] + // }.set{cram_pair_mutect2} + + // GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING( + // cram_pair_mutect2, + // fasta, + // fasta_fai, + // dict, 
+ // germline_resource, + // germline_resource_tbi, + // panel_of_normals, + // panel_of_normals_tbi, + // intervals_bed_combine_gz, + // num_intervals + // ) + + // mutect2_vcf = GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING.out.mutect2_vcf + // ch_versions = ch_versions.mix(GATK_TUMOR_NORMAL_SOMATIC_VARIANT_CALLING.out.versions) + // } // if (tools.contains('tiddit')) { // } diff --git a/subworkflows/local/prepare_intervals.nf b/subworkflows/local/prepare_intervals.nf index 1158b14ca..9d74894f0 100644 --- a/subworkflows/local/prepare_intervals.nf +++ b/subworkflows/local/prepare_intervals.nf @@ -20,67 +20,95 @@ workflow PREPARE_INTERVALS { ch_versions = Channel.empty() - // TODO maybe instead [] ch_intervals = Channel.empty() ch_intervals_bed_gz_tbi = Channel.empty() ch_intervals_combined_bed_gz_tbi = Channel.empty() // Create bed.gz and bed.gz.tbi for input/or created interval file. Contains ALL regions. - tabix_in_combined = Channel.empty() if (params.no_intervals) { file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" + file("${params.outdir}/no_intervals.bed.gz").text = "no_intervals\n" + file("${params.outdir}/no_intervals.bed.gz.tbi").text = "no_intervals\n" + ch_intervals = Channel.fromPath(file("${params.outdir}/no_intervals.bed")) - tabix_in_combined = ch_intervals.map{it -> [[id:it.getName()], it] } + .map{ it -> [it, 0]} + + ch_intervals_bed_gz_tbi = Channel.fromPath(file("${params.outdir}/no_intervals.bed.{gz,gz.tbi}")) + .collect().map{ it -> [it, 0]} + + ch_intervals_combined_bed_gz_tbi = Channel.fromPath(file("${params.outdir}/no_intervals.bed.{gz,gz.tbi}")) + .collect() + } else if (params.step != 'annotate' && params.step != 'controlfreec') { + + tabix_in_combined = Channel.empty() + + //If no interval/target file is provided, then intervals are generated from FASTA file if (!params.intervals) { + BUILD_INTERVALS(fasta_fai) - tabix_in_combined = BUILD_INTERVALS.out.bed.map{it -> 
[[id:it.getName()], it] } + tabix_in_combined = BUILD_INTERVALS.out.bed.map{it -> [[id:it.simpleName], it] } + ch_intervals = CREATE_INTERVALS_BED(BUILD_INTERVALS.out.bed) + } else { + tabix_in_combined = Channel.fromPath(file(params.intervals)).map{it -> [[id:it.baseName], it] } + + //If interval file is not provided as .bed, but e.g. as .interval_list then convert to BED format if(!params.intervals.endsWith(".bed")) { GATK4_INTERVALLISTTOBED(tabix_in_combined) tabix_in_combined = GATK4_INTERVALLISTTOBED.out.bed ch_versions = ch_versions.mix(GATK4_INTERVALLISTTOBED.out.versions) } + ch_intervals = CREATE_INTERVALS_BED(file(params.intervals)) } - } - if (params.step != 'annotate' && params.step != 'controlfreec') { + // Now for the interval.bed the following operations are done: + // 1. Complete intervals file (with all intervals) is indexed + // 2. Interval file is split up into multiple bed files for scatter/gather + // 3. Each bed file from 2. is indexed + + // 1. Index complete interval file TABIX_BGZIPTABIX_INTERVAL_ALL(tabix_in_combined) ch_intervals_combined_bed_gz_tbi = TABIX_BGZIPTABIX_INTERVAL_ALL.out.gz_tbi.map{ meta, bed, tbi -> [bed, tbi] } ch_versions = ch_versions.mix(TABIX_BGZIPTABIX_INTERVAL_ALL.out.versions) - if (!params.no_intervals) { - ch_intervals = ch_intervals.flatten() - .map{ intervalFile -> - def duration = 0.0 - for (line in intervalFile.readLines()) { - final fields = line.split('\t') - if (fields.size() >= 5) duration += fields[4].toFloat() - else { - start = fields[1].toInteger() - end = fields[2].toInteger() - duration += (end - start) / params.nucleotides_per_second - } + // 2. 
Interval file is split up into multiple bed files for scatter/gather & grouping together small intervals + ch_intervals = ch_intervals.flatten() + .map{ intervalFile -> + def duration = 0.0 + for (line in intervalFile.readLines()) { + final fields = line.split('\t') + if (fields.size() >= 5) duration += fields[4].toFloat() + else { + start = fields[1].toInteger() + end = fields[2].toInteger() + duration += (end - start) / params.nucleotides_per_second } - [duration, intervalFile] - }.toSortedList({ a, b -> b[0] <=> a[0] }) - .flatten().collate(2) - .map{duration, intervalFile -> intervalFile} - } + } + [duration, intervalFile] + }.toSortedList({ a, b -> b[0] <=> a[0] }) + .flatten().collate(2) + .map{duration, intervalFile -> intervalFile} + .collect().map{ it -> + [it, it.size() ] // Adding number of intervals as elements + }.transpose() - // Create bed.gz and bed.gz.tbi for each interval file. They are split by region (see above) - tabix_in = ch_intervals.map{it -> [[id:it.baseName], it] } + // 3. Create bed.gz and bed.gz.tbi for each interval file. 
They are split by region (see above) + tabix_in = ch_intervals.map{ file, num_intervals -> [[id:file.baseName], file] } TABIX_BGZIPTABIX_INTERVAL_SPLIT(tabix_in) - ch_intervals_bed_gz_tbi = TABIX_BGZIPTABIX_INTERVAL_SPLIT.out.gz_tbi.map{ meta, bed, tbi -> [bed, tbi] } + ch_intervals_bed_gz_tbi = TABIX_BGZIPTABIX_INTERVAL_SPLIT.out.gz_tbi.map{ meta, bed, tbi -> [bed, tbi ]}.toList().map{ + it -> + [it, it.size()] // Adding number of intervals as elements + }.transpose() ch_versions = ch_versions.mix(TABIX_BGZIPTABIX_INTERVAL_SPLIT.out.versions) - } + } emit: - intervals_bed = ch_intervals // path: intervals.bed [intervals split for parallel execution] - intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi [intervals split for parallel execution] + intervals_bed = ch_intervals // path: intervals.bed, num_intervals [intervals split for parallel execution] + intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi, num_intervals [intervals split for parallel execution] intervals_combined_bed_gz_tbi = ch_intervals_combined_bed_gz_tbi // path: interval.bed.gz, interval.bed.gz.tbi [all intervals in one file] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/tumor_variant_calling.nf b/subworkflows/local/tumor_variant_calling.nf index 566ed8349..c3ebcf1fb 100644 --- a/subworkflows/local/tumor_variant_calling.nf +++ b/subworkflows/local/tumor_variant_calling.nf @@ -24,8 +24,6 @@ workflow TUMOR_ONLY_VARIANT_CALLING { intervals_bed_combine_gz_tbi // channel: [mandatory] intervals/target regions index zipped and indexed intervals_bed_combine_gz // channel: [mandatory] intervals/target regions index zipped and indexed in one file intervals_bed_combined // channel: [mandatory] intervals/target regions in one file unzipped - num_intervals // val: number of intervals that are used to parallelize exection, either based on capture kit or GATK recommended for WGS - no_intervals 
germline_resource // channel: [optional] germline_resource germline_resource_tbi // channel: [optional] germline_resource_tbi panel_of_normals // channel: [optional] panel_of_normals @@ -43,26 +41,41 @@ workflow TUMOR_ONLY_VARIANT_CALLING { mutect2_vcf = Channel.empty() strelka_vcf = Channel.empty() - cram_recalibrated.combine(intervals).map{ meta, cram, crai, intervals -> - sample = meta.sample - new_intervals = intervals.baseName != "no_intervals" ? intervals : [] - id = new_intervals ? sample + "_" + new_intervals.baseName : sample - new_new_meta = [ id: id, sample: meta.sample, gender: meta.gender, status: meta.status, patient: meta.patient ] - [new_new_meta, cram, crai, new_intervals] - }.set{cram_recalibrated_intervals} - - cram_recalibrated.combine(intervals_bed_gz_tbi) - .map{ meta, cram, crai, bed, tbi -> - sample = meta.sample - new_bed = bed.simpleName != "no_intervals" ? bed : [] - new_tbi = tbi.simpleName != "no_intervals" ? tbi : [] - id = new_bed ? sample + "_" + new_bed.simpleName : sample - new_meta = [ id: id, sample: meta.sample, gender: meta.gender, status: meta.status, patient: meta.patient ] - [new_meta, cram, crai, new_bed, new_tbi] - }.set{cram_recalibrated_intervals_gz_tbi} + // Remap channel with intervals + cram_recalibrated_intervals = cram_recalibrated.combine(intervals) + .map{ meta, cram, crai, intervals, num_intervals -> + new_meta = meta.clone() + + // If either no scatter/gather is done, i.e. no interval (0) or one interval (1), then don't rename samples + new_meta.id = num_intervals <= 1 ? meta.sample : meta.sample + "_" + intervals.baseName + new_meta.num_intervals = num_intervals + + //If no interval file provided (0) then add empty list + intervals_new = num_intervals == 0 ? 
[] : intervals + + [new_meta, cram, crai, intervals_new] + } + + // Remap channel with gzipped intervals + indexes + cram_recalibrated_intervals_gz_tbi = cram_recalibrated.combine(intervals_bed_gz_tbi) + .map{ meta, cram, crai, bed_tbi, num_intervals -> + new_meta = meta.clone() + + // If either no scatter/gather is done, i.e. no interval (0) or one interval (1), then don't rename samples + new_meta.id = num_intervals <= 1 ? meta.sample : meta.sample + "_" + bed_tbi[0].simpleName + new_meta.num_intervals = num_intervals + + //If no interval file provided (0) then add empty list + bed_new = num_intervals == 0 ? [] : bed_tbi[0] + tbi_new = num_intervals == 0 ? [] : bed_tbi[1] + + [new_meta, cram, crai, bed_new, tbi_new] + } if(tools.contains('controlfreec')){ - cram_recalibrated_intervals.map {meta, cram, crai, intervals -> [meta, cram, intervals]}.set{cram_intervals_no_index} + cram_intervals_no_index = cram_recalibrated_intervals.map { meta, cram, crai, intervals -> + [meta, cram, intervals] + } RUN_CONTROLFREEC_TUMORONLY( cram_intervals_no_index, fasta, @@ -71,8 +84,7 @@ workflow TUMOR_ONLY_VARIANT_CALLING { dbsnp_tbi, chr_files, mappability, - intervals_bed_combined, - num_intervals) + intervals_bed_combined) ch_versions = ch_versions.mix(RUN_CONTROLFREEC_TUMORONLY.out.versions) } @@ -83,40 +95,37 @@ workflow TUMOR_ONLY_VARIANT_CALLING { [meta, cram, crai, [], [], intervals] } - RUN_FREEBAYES(cram_recalibrated_intervals_freebayes, fasta, fasta_fai, intervals_bed_combine_gz, num_intervals) + RUN_FREEBAYES(cram_recalibrated_intervals_freebayes, fasta, fasta_fai, intervals_bed_combine_gz) freebayes_vcf = RUN_FREEBAYES.out.freebayes_vcf ch_versions = ch_versions.mix(RUN_FREEBAYES.out.versions) } - if (tools.contains('mutect2')) { + // if (tools.contains('mutect2')) { - which_norm = [] - cram_recalibrated_intervals.map{ meta, cram, crai, intervals -> [meta, cram, crai, intervals, which_norm]}.set{cram_recalibrated_mutect2} - 
GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING(cram_recalibrated_mutect2, - fasta, - fasta_fai, - dict, - germline_resource, - germline_resource_tbi, - panel_of_normals, - panel_of_normals_tbi, - intervals_bed_combine_gz, - num_intervals) + // which_norm = [] + // cram_recalibrated_intervals.map{ meta, cram, crai, intervals -> [meta, cram, crai, intervals, which_norm]}.set{cram_recalibrated_mutect2} + // GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING(cram_recalibrated_mutect2, + // fasta, + // fasta_fai, + // dict, + // germline_resource, + // germline_resource_tbi, + // panel_of_normals, + // panel_of_normals_tbi, + // intervals_bed_combine_gz, + // num_intervals) - mutect2_vcf = GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING.out.mutect2_vcf - ch_versions = ch_versions.mix(GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING.out.versions) + // mutect2_vcf = GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING.out.mutect2_vcf + // ch_versions = ch_versions.mix(GATK_TUMOR_ONLY_SOMATIC_VARIANT_CALLING.out.versions) - } + // } if (tools.contains('manta')){ - //TODO: Research if splitting by intervals is ok, we pretend for now it is fine. 
Seems to be the consensus on upstream modules implementaiton too - RUN_MANTA_TUMORONLY(cram_recalibrated_intervals_gz_tbi, fasta, fasta_fai, - intervals_bed_combine_gz, - num_intervals) + intervals_bed_combine_gz) manta_vcf = RUN_MANTA_TUMORONLY.out.manta_vcf ch_versions = ch_versions.mix(RUN_MANTA_TUMORONLY.out.versions) @@ -126,8 +135,7 @@ workflow TUMOR_ONLY_VARIANT_CALLING { RUN_STRELKA_SINGLE( cram_recalibrated_intervals_gz_tbi, fasta, fasta_fai, - intervals_bed_combine_gz, - num_intervals) + intervals_bed_combine_gz) strelka_vcf = RUN_STRELKA_SINGLE.out.strelka_vcf ch_versions = ch_versions.mix(RUN_STRELKA_SINGLE.out.versions) diff --git a/subworkflows/nf-core/gatk4/prepare_recalibration/main.nf b/subworkflows/nf-core/gatk4/prepare_recalibration/main.nf index 20c4b115a..abdbf674a 100644 --- a/subworkflows/nf-core/gatk4/prepare_recalibration/main.nf +++ b/subworkflows/nf-core/gatk4/prepare_recalibration/main.nf @@ -9,23 +9,28 @@ include { GATK4_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../../../modul workflow PREPARE_RECALIBRATION { take: - cram // channel: [mandatory] cram_markduplicates + cram // channel: [mandatory] meta, cram_markduplicates, crai dict // channel: [mandatory] dict fasta // channel: [mandatory] fasta fasta_fai // channel: [mandatory] fasta_fai - intervals // channel: [mandatory] intervals + intervals // channel: [mandatory] intervals, num_intervals known_sites // channel: [optional] known_sites known_sites_tbi // channel: [optional] known_sites_tbi - num_intervals // value: [mandatory] number of intervals main: ch_versions = Channel.empty() cram_intervals = cram.combine(intervals) - .map{ meta, cram, crai, intervals -> + .map{ meta, cram, crai, intervals, num_intervals -> new_meta = meta.clone() - new_meta.id = num_intervals == 1 ? meta.sample : meta.sample + "_" + intervals.baseName - intervals_new = params.no_intervals ? [] : intervals + + // If either no scatter/gather is done, i.e. 
no interval (0) or one interval (1), then don't rename samples + new_meta.id = num_intervals <= 1 ? meta.sample : meta.sample + "_" + intervals.baseName + new_meta.num_intervals = num_intervals + + //If no interval file provided (0) then add empty list + intervals_new = num_intervals == 0 ? [] : intervals + [new_meta, cram, crai, intervals_new] } @@ -35,12 +40,16 @@ workflow PREPARE_RECALIBRATION { // Figuring out if there is one or more table(s) from the same sample table_to_merge = BASERECALIBRATOR.out.table .map{ meta, table -> - meta.id = meta.sample - [meta, table] - }.groupTuple(size: num_intervals) + new_meta = meta.clone() + new_meta.id = meta.sample + + def groupKey = groupKey(new_meta, meta.num_intervals) + [new_meta, table] + }.groupTuple() .branch{ - single: num_intervals == 1 - multiple: num_intervals > 1 + //Warning: size() calculates file size not list length here, so use num_intervals instead + single: it[0].num_intervals <= 1 + multiple: it[0].num_intervals > 1 } // STEP 3.5: MERGING RECALIBRATION TABLES @@ -48,6 +57,14 @@ workflow PREPARE_RECALIBRATION { // Merge the tables only when we have intervals GATHERBQSRREPORTS(table_to_merge.multiple) table_bqsr = table_to_merge.single.mix(GATHERBQSRREPORTS.out.table) + .map{ meta, table -> + new_meta = meta.clone() + + // remove no longer necessary fields to make sure joining can be done correctly + new_meta.remove('num_intervals') + + [new_meta, table] + } // Gather versions of all tools used ch_versions = ch_versions.mix(BASERECALIBRATOR.out.versions) diff --git a/subworkflows/nf-core/gatk4/prepare_recalibration_spark/main.nf b/subworkflows/nf-core/gatk4/prepare_recalibration_spark/main.nf index 589c327de..0ccc0efad 100644 --- a/subworkflows/nf-core/gatk4/prepare_recalibration_spark/main.nf +++ b/subworkflows/nf-core/gatk4/prepare_recalibration_spark/main.nf @@ -9,44 +9,62 @@ include { GATK4_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../. 
workflow PREPARE_RECALIBRATION_SPARK { take: - cram // channel: [mandatory] cram_markduplicates + cram // channel: [mandatory] meta, cram_markduplicates, crai dict // channel: [mandatory] dict fasta // channel: [mandatory] fasta fasta_fai // channel: [mandatory] fasta_fai - intervals // channel: [mandatory] intervals + intervals // channel: [mandatory] intervals, num_intervals known_sites // channel: [optional] known_sites known_sites_tbi // channel: [optional] known_sites_tbi - num_intervals // value: [mandatory] number of intervals main: ch_versions = Channel.empty() cram_intervals = cram.combine(intervals) - .map{ meta, cram, crai, intervals -> + .map{ meta, cram, crai, intervals, num_intervals -> new_meta = meta.clone() - new_meta.id = num_intervals == 1 ? meta.sample : meta.sample + "_" + intervals.baseName - [new_meta, cram, crai, intervals] + + // If either no scatter/gather is done, i.e. no interval (0) or one interval (1), then don't rename samples + new_meta.id = num_intervals <= 1 ? meta.sample : meta.sample + "_" + intervals.baseName + new_meta.num_intervals = num_intervals + + //If no interval file provided (0) then add empty list + intervals_new = num_intervals == 0 ? 
[] : intervals + + [new_meta, cram, crai, intervals_new] } // Run Baserecalibrator spark BASERECALIBRATOR_SPARK(cram_intervals, fasta, fasta_fai, dict, known_sites, known_sites_tbi) // Figuring out if there is one or more table(s) from the same sample - ch_table = BASERECALIBRATOR_SPARK.out.table + table_to_merge = BASERECALIBRATOR_SPARK.out.table .map{ meta, table -> - meta.id = meta.sample - [meta, table] - }.groupTuple(size: num_intervals) + new_meta = meta.clone() + new_meta.id = meta.sample + + def groupKey = groupKey(new_meta, meta.num_intervals) + [new_meta, table] + }.groupTuple() .branch{ - single: it[1].size() == 1 - multiple: it[1].size() > 1 - }.set{table_to_merge} + //Warning: size() calculates file size not list length here, so use num_intervals instead + single: it[0].num_intervals <= 1 + multiple: it[0].num_intervals > 1 + } // STEP 3.5: MERGING RECALIBRATION TABLES // Merge the tables only when we have intervals GATHERBQSRREPORTS(table_to_merge.multiple) table_bqsr = table_to_merge.single.mix(GATHERBQSRREPORTS.out.table) + .map{ meta, table -> + new_meta = meta.clone() + + // remove no longer necessary fields to make sure joining can be done correctly + new_meta.remove('num_intervals') + + [new_meta, table] + } // Gather versions of all tools used ch_versions = ch_versions.mix(BASERECALIBRATOR_SPARK.out.versions) diff --git a/subworkflows/nf-core/gatk4/recalibrate/main.nf b/subworkflows/nf-core/gatk4/recalibrate/main.nf index ec2a18026..ec83bbbfd 100644 --- a/subworkflows/nf-core/gatk4/recalibrate/main.nf +++ b/subworkflows/nf-core/gatk4/recalibrate/main.nf @@ -9,35 +9,49 @@ include { MERGE_INDEX_CRAM } from '../../merge_index_cram' workflow RECALIBRATE { take: - cram // channel: [mandatory] cram + cram // channel: [mandatory] meta, cram, crai, recal dict // channel: [mandatory] dict fasta // channel: [mandatory] fasta fasta_fai // channel: [mandatory] fasta_fai - intervals // channel: [mandatory] intervals - num_intervals // value: [mandatory] 
number of intervals + intervals // channel: [mandatory] intervals, num_intervals main: ch_versions = Channel.empty() cram_intervals = cram.combine(intervals) - .map{ meta, cram, crai, recal, intervals -> + .map{ meta, cram, crai, recal, intervals, num_intervals -> new_meta = meta.clone() - new_meta.id = num_intervals == 1 ? meta.sample : meta.sample + "_" + intervals.baseName - [new_meta, cram, crai, recal, intervals] + + // If either no scatter/gather is done, i.e. no interval (0) or one interval (1), then don't rename samples + new_meta.id = num_intervals <= 1 ? meta.sample : meta.sample + "_" + intervals.baseName + new_meta.num_intervals = num_intervals + + //If no interval file provided (0) then add empty list + intervals_new = num_intervals == 0 ? [] : intervals + + [new_meta, cram, crai, recal, intervals_new] } // Run Applybqsr APPLYBQSR(cram_intervals, fasta, fasta_fai, dict) - // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED BAM FILES - MERGE_INDEX_CRAM(APPLYBQSR.out.cram, fasta, num_intervals) + // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED CRAM FILES + MERGE_INDEX_CRAM(APPLYBQSR.out.cram, fasta) + + ch_cram_recal_out = MERGE_INDEX_CRAM.out.cram_crai.map{ meta, cram, crai -> + new_meta = meta.clone() + + // remove no longer necessary fields to make sure joining can be done correctly + new_meta.remove('num_intervals') + + [new_meta, cram, crai] + } // Gather versions of all tools used ch_versions = ch_versions.mix(APPLYBQSR.out.versions) ch_versions = ch_versions.mix(MERGE_INDEX_CRAM.out.versions) emit: - cram = MERGE_INDEX_CRAM.out.cram_crai - + cram = ch_cram_recal_out versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/nf-core/gatk4/recalibrate_spark/main.nf b/subworkflows/nf-core/gatk4/recalibrate_spark/main.nf index 58d3bc548..ecb70e06b 100644 --- a/subworkflows/nf-core/gatk4/recalibrate_spark/main.nf +++ b/subworkflows/nf-core/gatk4/recalibrate_spark/main.nf @@ -9,35 +9,49 @@ include { MERGE_INDEX_CRAM } from 
'../../merge_index_cra workflow RECALIBRATE_SPARK { take: - cram // channel: [mandatory] cram + cram // channel: [mandatory] meta, cram, crai, recal dict // channel: [mandatory] dict fasta // channel: [mandatory] fasta fasta_fai // channel: [mandatory] fasta_fai - intervals // channel: [mandatory] intervals - num_intervals // value: [mandatory] number of intervals + intervals // channel: [mandatory] intervals, num_intervals main: ch_versions = Channel.empty() cram_intervals = cram.combine(intervals) - .map{ meta, cram, crai, recal, intervals -> + .map{ meta, cram, crai, recal, intervals, num_intervals -> new_meta = meta.clone() - new_meta.id = num_intervals == 1 ? meta.sample : meta.sample + "_" + intervals.baseName - [new_meta, cram, crai, recal, intervals] + + // If either no scatter/gather is done, i.e. no interval (0) or one interval (1), then don't rename samples + new_meta.id = num_intervals <= 1 ? meta.sample : meta.sample + "_" + intervals.baseName + new_meta.num_intervals = num_intervals + + //If no interval file provided (0) then add empty list + intervals_new = num_intervals == 0 ? 
[] : intervals + + [new_meta, cram, crai, recal, intervals_new] } // Run Applybqsr spark APPLYBQSR_SPARK(cram_intervals, fasta, fasta_fai, dict) // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED BAM FILES - MERGE_INDEX_CRAM(APPLYBQSR_SPARK.out.cram, fasta, num_intervals) + MERGE_INDEX_CRAM(APPLYBQSR_SPARK.out.cram, fasta) + + ch_cram_recal_out = MERGE_INDEX_CRAM.out.cram_crai.map{ meta, cram, crai -> + new_meta = meta.clone() + + // remove no longer necessary fields to make sure joining can be done correctly + new_meta.remove('num_intervals') + + [new_meta, cram, crai] + } // Gather versions of all tools used ch_versions = ch_versions.mix(APPLYBQSR_SPARK.out.versions) ch_versions = ch_versions.mix(MERGE_INDEX_CRAM.out.versions) emit: - cram = MERGE_INDEX_CRAM.out.cram_crai - + cram = ch_cram_recal_out versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/nf-core/merge_index_bam.nf b/subworkflows/nf-core/merge_index_bam.nf index b914d24f8..7443850ff 100644 --- a/subworkflows/nf-core/merge_index_bam.nf +++ b/subworkflows/nf-core/merge_index_bam.nf @@ -16,6 +16,7 @@ workflow MERGE_INDEX_BAM { // Figuring out if there is one or more bam(s) from the same sample bam.branch{ + //Here there actually is a list, so size() works single: it[1].size() == 1 multiple: it[1].size() > 1 }.set{bam_to_merge} diff --git a/subworkflows/nf-core/merge_index_cram.nf b/subworkflows/nf-core/merge_index_cram.nf index 769268133..7c8cfa37b 100644 --- a/subworkflows/nf-core/merge_index_cram.nf +++ b/subworkflows/nf-core/merge_index_cram.nf @@ -11,27 +11,28 @@ workflow MERGE_INDEX_CRAM { take: ch_cram // channel: [mandatory] meta, cram fasta // channel: [mandatory] fasta - num_intervals main: ch_versions = Channel.empty() // Figuring out if there is one or more cram(s) from the same sample - ch_cram.map{ meta, cram -> + ch_cram_to_merge = ch_cram.map{ meta, cram -> new_meta = meta.clone() new_meta.id = meta.sample + def groupKey = groupKey(new_meta, meta.num_intervals) 
[new_meta, cram] - }.groupTuple(size: num_intervals) + }.groupTuple() .branch{ - single: it[1].size() == 1 - multiple: it[1].size() > 1 - }.set{cram_to_merge} + //Warning: size() calculates file size not list length here, so use num_intervals instead + single: it[0].num_intervals <= 1 + multiple: it[0].num_intervals > 1 + } - MERGE_CRAM(cram_to_merge.multiple, fasta) - INDEX_CRAM(cram_to_merge.single.mix(MERGE_CRAM.out.cram)) + MERGE_CRAM(ch_cram_to_merge.multiple, fasta) + INDEX_CRAM(ch_cram_to_merge.single.mix(MERGE_CRAM.out.cram)) - cram_crai = cram_to_merge.single + cram_crai = ch_cram_to_merge.single .mix(MERGE_CRAM.out.cram) .join(INDEX_CRAM.out.crai) diff --git a/subworkflows/nf-core/variantcalling/controlfreec/somatic/main.nf b/subworkflows/nf-core/variantcalling/controlfreec/somatic/main.nf index ea7b2b10b..f48b12a97 100644 --- a/subworkflows/nf-core/variantcalling/controlfreec/somatic/main.nf +++ b/subworkflows/nf-core/variantcalling/controlfreec/somatic/main.nf @@ -19,7 +19,6 @@ workflow RUN_CONTROLFREEC_SOMATIC { chr_files // channel: [mandatory] mappability // channel: [mandatory] intervals_bed // channel: [optional] Contains a bed file of all intervals combined provided with the cram input(s). Should be empty for WGS - num_intervals // val: [optional] Number of used intervals, mandatory when intervals are provided. 
main: @@ -28,31 +27,35 @@ workflow RUN_CONTROLFREEC_SOMATIC { MPILEUP_NORMAL(cram_normal, fasta) MPILEUP_NORMAL.out.mpileup.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{mpileup_normal} MPILEUP_TUMOR(cram_tumor, fasta) MPILEUP_TUMOR.out.mpileup.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{mpileup_tumor} //Merge mpileup only when intervals and natural order sort them CAT_MPILEUP_NORMAL( mpileup_normal.intervals.map{ meta, pileup -> new_meta = meta.clone() new_meta.id = new_meta.tumor_id + "_vs_" + new_meta.normal_id + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, pileup] - }.groupTuple(size: num_intervals, sort:true)) + }.groupTuple(sort:true)) CAT_MPILEUP_TUMOR(mpileup_tumor.intervals .map{ meta, pileup -> new_meta = meta.clone() new_meta.id = new_meta.tumor_id + "_vs_" + new_meta.normal_id + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, pileup] } - .groupTuple(size: num_intervals, sort:true)) + .groupTuple(sort:true)) controlfreec_input_normal = Channel.empty().mix( CAT_MPILEUP_NORMAL.out.file_out, @@ -74,7 +77,6 @@ workflow RUN_CONTROLFREEC_SOMATIC { controlfreec_input_normal.cross(controlfreec_input_tumor) .map{ normal, tumor -> - // [meta, normal_pileup, tumor_pileup, [] , [], [], []] [normal[0], normal[1], tumor[1], [], [], [], []] } .set{controlfreec_input} diff --git a/subworkflows/nf-core/variantcalling/controlfreec/tumoronly/main.nf b/subworkflows/nf-core/variantcalling/controlfreec/tumoronly/main.nf index 3eaff249f..71cefaf1e 100644 --- a/subworkflows/nf-core/variantcalling/controlfreec/tumoronly/main.nf +++ b/subworkflows/nf-core/variantcalling/controlfreec/tumoronly/main.nf @@ -16,7 +16,6 @@ workflow RUN_CONTROLFREEC_TUMORONLY { chr_files // channel: [mandatory] mappability // channel: [mandatory] 
intervals_bed // channel: [optional] Contains a bed file of all intervals combined provided with the cram input(s). Should be empty for WGS - num_intervals // val: [optional] Number of used intervals, mandatory when intervals are provided. main: @@ -25,8 +24,8 @@ workflow RUN_CONTROLFREEC_TUMORONLY { MPILEUP_TUMOR(cram_tumor, fasta) MPILEUP_TUMOR.out.mpileup.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{mpileup_tumor} //Merge mpileup only when intervals and natural order sort them @@ -34,9 +33,11 @@ workflow RUN_CONTROLFREEC_TUMORONLY { .map{ meta, pileup -> new_meta = meta.clone() new_meta.id = new_meta.sample + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, pileup] } - .groupTuple(size: num_intervals, sort:true)) + .groupTuple(sort:true)) controlfreec_input_tumor = Channel.empty().mix( CAT_MPILEUP_TUMOR.out.file_out, diff --git a/subworkflows/nf-core/variantcalling/deepvariant/main.nf b/subworkflows/nf-core/variantcalling/deepvariant/main.nf index 715c160f1..666822420 100644 --- a/subworkflows/nf-core/variantcalling/deepvariant/main.nf +++ b/subworkflows/nf-core/variantcalling/deepvariant/main.nf @@ -14,7 +14,6 @@ workflow RUN_DEEPVARIANT { fasta // channel: [mandatory] fasta_fai // channel: [mandatory] intervals_bed_gz // channel: [optional] Contains a bed.gz file of all intervals combined provided with the cram input(s). Mandatory if interval files are used. - num_intervals // val: [optional] Number of used intervals, mandatory when intervals are provided. main: @@ -22,23 +21,33 @@ workflow RUN_DEEPVARIANT { DEEPVARIANT(cram, fasta, fasta_fai) - //TODO Branch annotation? 
+ DEEPVARIANT.out.vcf.branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + }.set{deepvariant_vcf_out} + + DEEPVARIANT.out.gvcf.branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + }.set{deepvariant_gvcf_out} // Only when no intervals - TABIX_VC_DEEPVARIANT_VCF(DEEPVARIANT.out.vcf) - TABIX_VC_DEEPVARIANT_GVCF(DEEPVARIANT.out.gvcf) + TABIX_VC_DEEPVARIANT_VCF(deepvariant_vcf_out.no_intervals) + TABIX_VC_DEEPVARIANT_GVCF(deepvariant_gvcf_out.no_intervals) // Only when using intervals - BGZIP_VC_DEEPVARIANT_VCF(DEEPVARIANT.out.vcf) - BGZIP_VC_DEEPVARIANT_GVCF(DEEPVARIANT.out.gvcf) + BGZIP_VC_DEEPVARIANT_VCF(deepvariant_vcf_out.intervals) + BGZIP_VC_DEEPVARIANT_GVCF(deepvariant_gvcf_out.intervals) CONCAT_DEEPVARIANT_VCF( BGZIP_VC_DEEPVARIANT_VCF.out.output .map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.sample + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) @@ -47,8 +56,10 @@ workflow RUN_DEEPVARIANT { .map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.sample + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) @@ -56,8 +67,8 @@ workflow RUN_DEEPVARIANT { deepvariant_vcf = Channel.empty().mix( CONCAT_DEEPVARIANT_GVCF.out.vcf, CONCAT_DEEPVARIANT_VCF.out.vcf, - DEEPVARIANT.out.gvcf, - DEEPVARIANT.out.vcf) + deepvariant_gvcf_out.no_intervals, + deepvariant_vcf_out.no_intervals) .map{ meta, vcf -> meta.variantcaller = "Deepvariant" [meta, vcf] diff --git a/subworkflows/nf-core/variantcalling/freebayes/main.nf b/subworkflows/nf-core/variantcalling/freebayes/main.nf index f6c39cbee..bcc06087e 100644 --- a/subworkflows/nf-core/variantcalling/freebayes/main.nf +++ b/subworkflows/nf-core/variantcalling/freebayes/main.nf @@ -9,7 +9,6 @@ workflow RUN_FREEBAYES { fasta // 
channel: [mandatory] fasta_fai // channel: [mandatory] intervals_bed_gz // channel: [optional] Contains a bed.gz file of all intervals combined provided with the cram input(s). Mandatory if interval files are used. - num_intervals // val: [optional] Number of used intervals, mandatory when intervals are provided. main: @@ -21,26 +20,33 @@ workflow RUN_FREEBAYES { fasta_fai, [], [], []) + FREEBAYES.out.vcf.branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + }.set{freebayes_vcf_out} + // Only when no intervals - TABIX_VC_FREEBAYES(FREEBAYES.out.vcf) + TABIX_VC_FREEBAYES(freebayes_vcf_out.no_intervals) // Only when using intervals - BGZIP_VC_FREEBAYES(FREEBAYES.out.vcf) + BGZIP_VC_FREEBAYES(freebayes_vcf_out.intervals) CONCAT_FREEBAYES( BGZIP_VC_FREEBAYES.out.output .map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.sample + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) // Mix output channels for "no intervals" and "with intervals" results freebayes_vcf = Channel.empty().mix( CONCAT_FREEBAYES.out.vcf, - FREEBAYES.out.vcf) + freebayes_vcf_out.no_intervals) .map{ meta, vcf -> meta.variantcaller = "FreeBayes" [meta, vcf] diff --git a/subworkflows/nf-core/variantcalling/haplotypecaller/main.nf b/subworkflows/nf-core/variantcalling/haplotypecaller/main.nf index 28bffeeec..1ff59e542 100644 --- a/subworkflows/nf-core/variantcalling/haplotypecaller/main.nf +++ b/subworkflows/nf-core/variantcalling/haplotypecaller/main.nf @@ -14,7 +14,6 @@ workflow RUN_HAPLOTYPECALLER { dbsnp_tbi // channel: [mandatory] intervals_bed_gz // channel: [optional] Contains a bed.gz file of all intervals combined provided with the cram input(s). Mandatory if interval files are used. intervals_bed_combine_gz_tbi // channel: [optional] Contains a [bed.gz, bed.gz.tbi ]file of all intervals combined provided with the cram input(s). 
Mandatory if interval files are used. - num_intervals // val: [optional] Number of used intervals, mandatory when intervals are provided. main: @@ -30,13 +29,13 @@ workflow RUN_HAPLOTYPECALLER { // Figure out if using intervals or no_intervals HAPLOTYPECALLER.out.vcf.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{haplotypecaller_vcf_branch} HAPLOTYPECALLER.out.tbi.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{haplotypecaller_tbi_branch} // Only when using intervals diff --git a/subworkflows/nf-core/variantcalling/manta/germline/main.nf b/subworkflows/nf-core/variantcalling/manta/germline/main.nf index 66346eaa9..864fbc075 100644 --- a/subworkflows/nf-core/variantcalling/manta/germline/main.nf +++ b/subworkflows/nf-core/variantcalling/manta/germline/main.nf @@ -14,7 +14,6 @@ workflow RUN_MANTA_GERMLINE { fasta // channel: [mandatory] fasta_fai // channel: [mandatory] intervals_bed_gz // channel: [optional] Contains a bed.gz file of all intervals combined provided with the cram input(s). Mandatory if interval files are used. - num_intervals // val: [optional] Number of used intervals, mandatory when intervals are provided. 
main: @@ -24,30 +23,32 @@ workflow RUN_MANTA_GERMLINE { // Figure out if using intervals or no_intervals MANTA_GERMLINE.out.candidate_small_indels_vcf.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{manta_small_indels_vcf} MANTA_GERMLINE.out.candidate_sv_vcf.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{manta_sv_vcf} MANTA_GERMLINE.out.diploid_sv_vcf.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{manta_diploid_sv_vcf} // Only when using intervals BGZIP_VC_MANTA_SMALL_INDELS(manta_small_indels_vcf.intervals) CONCAT_MANTA_SMALL_INDELS( - BGZIP_VC_MANTA_SMALL_INDELS.out.vcf + BGZIP_VC_MANTA_SMALL_INDELS.out.output .map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.sample + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) @@ -58,8 +59,10 @@ workflow RUN_MANTA_GERMLINE { .map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.sample + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) @@ -70,18 +73,20 @@ workflow RUN_MANTA_GERMLINE { .map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.sample + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) // Mix output channels for "no intervals" and "with intervals" results manta_vcf = Channel.empty().mix( CONCAT_MANTA_DIPLOID.out.vcf, - CONCAT_MANTA_SMALL_INDELS.out.vcf, + //CONCAT_MANTA_SMALL_INDELS.out.vcf, CONCAT_MANTA_SV.out.vcf, 
manta_diploid_sv_vcf.no_intervals, - manta_small_indels_vcf.no_intervals, + //manta_small_indels_vcf.no_intervals, manta_sv_vcf.no_intervals) .map{ meta, vcf -> meta.variantcaller = "Manta" @@ -97,6 +102,6 @@ workflow RUN_MANTA_GERMLINE { ch_versions = ch_versions.mix(MANTA_GERMLINE.out.versions) emit: - manta_vcf + manta_vcf = Channel.empty() versions = ch_versions } diff --git a/subworkflows/nf-core/variantcalling/manta/somatic/main.nf b/subworkflows/nf-core/variantcalling/manta/somatic/main.nf index 67764c2dc..e6005000c 100644 --- a/subworkflows/nf-core/variantcalling/manta/somatic/main.nf +++ b/subworkflows/nf-core/variantcalling/manta/somatic/main.nf @@ -14,7 +14,6 @@ workflow RUN_MANTA_SOMATIC { fasta // channel: [mandatory] fasta_fai // channel: [mandatory] intervals_bed_gz // channel: [optional] Contains a bed.gz file of all intervals combined provided with the cram input(s). Mandatory if interval files are used. - num_intervals // val: [optional] Number of used intervals, mandatory when intervals are provided. 
main: @@ -24,28 +23,28 @@ workflow RUN_MANTA_SOMATIC { // Figure out if using intervals or no_intervals MANTA_SOMATIC.out.candidate_small_indels_vcf.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{manta_candidate_small_indels_vcf} MANTA_SOMATIC.out.candidate_small_indels_vcf_tbi.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{manta_candidate_small_indels_vcf_tbi} MANTA_SOMATIC.out.candidate_sv_vcf.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{manta_candidate_sv_vcf} MANTA_SOMATIC.out.diploid_sv_vcf.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{manta_diploid_sv_vcf} MANTA_SOMATIC.out.somatic_sv_vcf.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{manta_somatic_sv_vcf} //Only when using intervals @@ -55,8 +54,10 @@ workflow RUN_MANTA_SOMATIC { BGZIP_VC_MANTA_SV.out.output.map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.tumor_id + "_vs_" + new_meta.normal_id + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) @@ -66,8 +67,10 @@ workflow RUN_MANTA_SOMATIC { BGZIP_VC_MANTA_SMALL_INDELS.out.output.map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.tumor_id + "_vs_" + new_meta.normal_id + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) @@ -77,8 +80,10 @@ workflow RUN_MANTA_SOMATIC { 
BGZIP_VC_MANTA_DIPLOID.out.output.map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.tumor_id + "_vs_" + new_meta.normal_id + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) @@ -88,19 +93,21 @@ workflow RUN_MANTA_SOMATIC { BGZIP_VC_MANTA_SOMATIC.out.output.map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.tumor_id + "_vs_" + new_meta.normal_id + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) // Mix output channels for "no intervals" and "with intervals" results manta_vcf = Channel.empty().mix( - CONCAT_MANTA_SV.out.vcf, - CONCAT_MANTA_SMALL_INDELS.out.vcf, + //CONCAT_MANTA_SV.out.vcf, + //CONCAT_MANTA_SMALL_INDELS.out.vcf, CONCAT_MANTA_DIPLOID.out.vcf, CONCAT_MANTA_SOMATIC.out.vcf, - manta_candidate_sv_vcf.no_intervals, - manta_candidate_small_indels_vcf.no_intervals, + //manta_candidate_sv_vcf.no_intervals, + //manta_candidate_small_indels_vcf.no_intervals, manta_diploid_sv_vcf.no_intervals, manta_somatic_sv_vcf.no_intervals ).map{ meta, vcf -> diff --git a/subworkflows/nf-core/variantcalling/manta/tumoronly/main.nf b/subworkflows/nf-core/variantcalling/manta/tumoronly/main.nf index b9858d5db..a09168cd1 100644 --- a/subworkflows/nf-core/variantcalling/manta/tumoronly/main.nf +++ b/subworkflows/nf-core/variantcalling/manta/tumoronly/main.nf @@ -14,7 +14,6 @@ workflow RUN_MANTA_TUMORONLY { fasta // channel: [mandatory] fasta_fai // channel: [mandatory] intervals_bed_gz // channel: [optional] Contains a bed.gz file of all intervals combined provided with the cram input(s). Mandatory if interval files are used. - num_intervals // val: [optional] Number of used intervals, mandatory when intervals are provided. 
main: @@ -24,18 +23,18 @@ workflow RUN_MANTA_TUMORONLY { // Figure out if using intervals or no_intervals MANTA_TUMORONLY.out.candidate_small_indels_vcf.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{manta_small_indels_vcf} MANTA_TUMORONLY.out.candidate_sv_vcf.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{manta_candidate_sv_vcf} MANTA_TUMORONLY.out.tumor_sv_vcf.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{manta_tumor_sv_vcf} //Only when using intervals @@ -45,8 +44,10 @@ workflow RUN_MANTA_TUMORONLY { BGZIP_VC_MANTA_SMALL_INDELS.out.output.map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.sample + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) @@ -56,8 +57,10 @@ workflow RUN_MANTA_TUMORONLY { BGZIP_VC_MANTA_SV.out.output.map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.sample + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) @@ -67,8 +70,10 @@ workflow RUN_MANTA_TUMORONLY { BGZIP_VC_MANTA_TUMOR.out.output.map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.sample + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) diff --git a/subworkflows/nf-core/variantcalling/strelka/single/main.nf b/subworkflows/nf-core/variantcalling/strelka/single/main.nf index 865f013fb..a9b0e5d1d 100644 --- a/subworkflows/nf-core/variantcalling/strelka/single/main.nf +++ 
b/subworkflows/nf-core/variantcalling/strelka/single/main.nf @@ -2,33 +2,30 @@ include { TABIX_BGZIP as BGZIP_VC_STRELKA } from '../../../../../modules/ include { TABIX_BGZIP as BGZIP_VC_STRELKA_GENOME } from '../../../../../modules/nf-core/modules/tabix/bgzip/main' include { CONCAT_VCF as CONCAT_STRELKA } from '../../../../../modules/local/concat_vcf/main' include { CONCAT_VCF as CONCAT_STRELKA_GENOME } from '../../../../../modules/local/concat_vcf/main' -include { STRELKA_GERMLINE } from '../../../../../modules/nf-core/modules/strelka/germline/main' +include { STRELKA_GERMLINE as STRELKA_SINGLE } from '../../../../../modules/nf-core/modules/strelka/germline/main' -// TODO: Research if splitting by intervals is ok, we pretend for now it is fine. -// Seems to be the consensus on upstream modules implementation too workflow RUN_STRELKA_SINGLE { take: cram // channel: [mandatory] [meta, cram, crai, interval.bed.gz, interval.bed.gz.tbi] fasta // channel: [mandatory] fasta_fai // channel: [mandatory] intervals_bed_gz // channel: [optional] Contains a bed.gz file of all intervals combined provided with the cram input(s). Mandatory if interval files are used. - num_intervals // val: [optional] Number of used intervals, mandatory when intervals are provided. 
main: ch_versions = Channel.empty() - STRELKA_GERMLINE(cram, fasta, fasta_fai) + STRELKA_SINGLE(cram, fasta, fasta_fai) // Figure out if using intervals or no_intervals - STRELKA_GERMLINE.out.vcf.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + STRELKA_SINGLE.out.vcf.branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{strelka_vcf} - STRELKA_GERMLINE.out.genome_vcf.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + STRELKA_SINGLE.out.genome_vcf.branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{strelka_genome_vcf} // Only when using intervals @@ -39,8 +36,10 @@ workflow RUN_STRELKA_SINGLE { .map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.sample + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) @@ -51,16 +50,18 @@ workflow RUN_STRELKA_SINGLE { .map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.sample + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) // Mix output channels for "no intervals" and "with intervals" results strelka_vcf = Channel.empty().mix( CONCAT_STRELKA.out.vcf, - CONCAT_STRELKA_GENOME.out.vcf, - strelka_genome_vcf.no_intervals, + //CONCAT_STRELKA_GENOME.out.vcf, + //strelka_genome_vcf.no_intervals, strelka_vcf.no_intervals) .map{ meta, vcf -> meta.variantcaller = "Strelka" @@ -71,7 +72,7 @@ workflow RUN_STRELKA_SINGLE { ch_versions = ch_versions.mix(BGZIP_VC_STRELKA_GENOME.out.versions) ch_versions = ch_versions.mix(CONCAT_STRELKA.out.versions) ch_versions = ch_versions.mix(CONCAT_STRELKA_GENOME.out.versions) - ch_versions = ch_versions.mix(STRELKA_GERMLINE.out.versions) + ch_versions = ch_versions.mix(STRELKA_SINGLE.out.versions) emit: strelka_vcf diff --git 
a/subworkflows/nf-core/variantcalling/strelka/somatic/main.nf b/subworkflows/nf-core/variantcalling/strelka/somatic/main.nf index bccc00f59..1c7353c56 100644 --- a/subworkflows/nf-core/variantcalling/strelka/somatic/main.nf +++ b/subworkflows/nf-core/variantcalling/strelka/somatic/main.nf @@ -4,15 +4,12 @@ include { CONCAT_VCF as CONCAT_STRELKA_INDELS } from '../../../../../modules/ include { CONCAT_VCF as CONCAT_STRELKA_SNVS } from '../../../../../modules/local/concat_vcf/main' include { STRELKA_SOMATIC } from '../../../../../modules/nf-core/modules/strelka/somatic/main' -// TODO: Research if splitting by intervals is ok, we pretend for now it is fine. -// Seems to be the consensus on upstream modules implementation too workflow RUN_STRELKA_SOMATIC { take: cram // channel: [mandatory] [meta, normal_cram, normal_crai, tumor_cram, tumor_crai, manta_vcf, manta_tbi, interval.bed.gz, interval.bed.gz.tbi] manta* are optional fasta // channel: [mandatory] fasta_fai // channel: [mandatory] intervals_bed_gz // channel: [optional] Contains a bed.gz file of all intervals combined provided with the cram input(s). Mandatory if interval files are used. - num_intervals // val: [optional] Number of used intervals, mandatory when intervals are provided. 
main: @@ -22,13 +19,13 @@ workflow RUN_STRELKA_SOMATIC { // Figure out if using intervals or no_intervals STRELKA_SOMATIC.out.vcf_snvs.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{strelka_vcf_snvs} STRELKA_SOMATIC.out.vcf_indels.branch{ - intervals: num_intervals > 1 - no_intervals: num_intervals == 1 + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 }.set{strelka_vcf_indels} // Only when using intervals @@ -37,8 +34,10 @@ workflow RUN_STRELKA_SOMATIC { CONCAT_STRELKA_SNVS(BGZIP_VC_STRELKA_SNVS.out.output.map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.tumor_id + "_vs_" + new_meta.normal_id + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) @@ -47,8 +46,10 @@ workflow RUN_STRELKA_SOMATIC { CONCAT_STRELKA_INDELS(BGZIP_VC_STRELKA_INDELS.out.output.map{ meta, vcf -> new_meta = meta.clone() new_meta.id = new_meta.tumor_id + "_vs_" + new_meta.normal_id + + def groupKey = groupKey(meta, meta.num_intervals) [new_meta, vcf] - }.groupTuple(size: num_intervals), + }.groupTuple(), fasta_fai, intervals_bed_gz) diff --git a/tests/test_aligner.yml b/tests/test_aligner.yml index c0d2dc16b..8318229a8 100644 --- a/tests/test_aligner.yml +++ b/tests/test_aligner.yml @@ -5,7 +5,7 @@ - bwa-mem2 - preprocessing files: - # - path: results/multiqc + - path: results/multiqc - path: results/preprocessing/csv/markduplicates.csv - path: results/preprocessing/csv/markduplicates_no_table.csv - path: results/preprocessing/csv/markduplicates_no_table_test.csv @@ -34,7 +34,7 @@ - dragmap - preprocessing files: - # - path: results/multiqc + - path: results/multiqc - path: results/preprocessing/csv/markduplicates.csv - path: results/preprocessing/csv/markduplicates_no_table.csv - path: 
results/preprocessing/csv/markduplicates_no_table_test.csv diff --git a/tests/test_annotation.yml b/tests/test_annotation.yml index 282b4546b..0fc517804 100644 --- a/tests/test_annotation.yml +++ b/tests/test_annotation.yml @@ -6,7 +6,8 @@ files: - path: results/annotation/1234N/1234N_snpEff.ann.vcf.gz - path: results/annotation/1234N/1234N_snpEff.ann.vcf.gz.tbi - # - path: results/multiqc + - path: results/reports/SnpEff/1234N/1234N.csv + # - path: results/multiqc //MultiQC not working (finishes successfully, but log shows issues between human vcf and annotation) - name: Run VEP command: nextflow run main.nf -profile test,annotation,docker --tools vep tags: @@ -15,7 +16,8 @@ files: - path: results/annotation/1234N/1234N_VEP.ann.vcf.gz - path: results/annotation/1234N/1234N_VEP.ann.vcf.gz.tbi - # - path: results/multiqc + - path: results/reports/EnsemblVEP/1234N/1234N.summary.html + # - path: results/multiqc //MultiQC not working issues between human vcf and annotation - name: Run snpEff followed by VEP command: nextflow run main.nf -profile test,annotation,docker --tools merge tags: @@ -24,8 +26,6 @@ - snpeff - vep files: - - path: results/annotation/1234N/1234N_snpEff.ann.vcf.gz - - path: results/annotation/1234N/1234N_snpEff.ann.vcf.gz.tbi - path: results/annotation/1234N/1234N_snpEff_VEP.ann.vcf.gz - path: results/annotation/1234N/1234N_snpEff_VEP.ann.vcf.gz.tbi - # - path: results/multiqc + # - path: results/multiqc //MultiQC not working issues between human vcf and annotation diff --git a/tests/test_default.yml b/tests/test_default.yml index 4af69d953..7bff088d6 100644 --- a/tests/test_default.yml +++ b/tests/test_default.yml @@ -4,7 +4,7 @@ - default - preprocessing files: - # - path: results/multiqc + - path: results/multiqc - path: results/preprocessing/test/markduplicates/test.md.cram - path: results/preprocessing/test/markduplicates/test.md.cram.crai - path: results/preprocessing/test/recal_table/test.recal.table @@ -17,7 +17,36 @@ - path:
results/preprocessing/csv/recalibrated.csv - path: results/preprocessing/csv/recalibrated_test.csv - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.metrics - path: results/reports/qualimap/test/test.mapped - path: results/reports/qualimap/test/test.recal - path: results/reports/samtools_stats/test/test.md.cram.stats - path: results/reports/samtools_stats/test/test.recal.cram.stats + - path: results/reports/deeptools/test/test.bigWig + +- name: Run default pipeline without intervals + command: nextflow run main.nf -profile test,no_intervals,docker + tags: + - default + - no_intervals + - preprocessing + files: + - path: results/multiqc + - path: results/preprocessing/test/markduplicates/test.md.cram + - path: results/preprocessing/test/markduplicates/test.md.cram.crai + - path: results/preprocessing/test/recal_table/test.recal.table + - path: results/preprocessing/test/recalibrated/test.recal.cram + - path: results/preprocessing/test/recalibrated/test.recal.cram.crai + - path: results/preprocessing/csv/markduplicates.csv + - path: results/preprocessing/csv/markduplicates_test.csv + - path: results/preprocessing/csv/markduplicates_no_table.csv + - path: results/preprocessing/csv/markduplicates_no_table_test.csv + - path: results/preprocessing/csv/recalibrated.csv + - path: results/preprocessing/csv/recalibrated_test.csv + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.metrics + - path: results/reports/qualimap/test/test.mapped + - path: results/reports/qualimap/test/test.recal + - path: results/reports/samtools_stats/test/test.md.cram.stats + - path: results/reports/samtools_stats/test/test.recal.cram.stats + - path: results/reports/deeptools/test/test.bigWig diff --git a/tests/test_gatk_spark.yml b/tests/test_gatk_spark.yml index d83149a95..63253af04 100644 --- a/tests/test_gatk_spark.yml +++ b/tests/test_gatk_spark.yml @@ -5,7 +5,7 @@ - gatk4_spark - preprocessing 
files: - # - path: results/multiqc + - path: results/multiqc - path: results/preprocessing/test/markduplicates/test.md.cram - path: results/preprocessing/test/markduplicates/test.md.cram.crai - path: results/preprocessing/test/recal_table/test.recal.table @@ -18,7 +18,9 @@ - path: results/preprocessing/csv/recalibrated.csv - path: results/preprocessing/csv/recalibrated_test.csv - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.metrics - path: results/reports/qualimap/test/test.mapped - path: results/reports/qualimap/test/test.recal - path: results/reports/samtools_stats/test/test.md.cram.stats - path: results/reports/samtools_stats/test/test.recal.cram.stats + - path: results/reports/deeptools/test/test.bigWig diff --git a/tests/test_pair.yml b/tests/test_pair.yml index 1020da8ba..eedd7a649 100644 --- a/tests/test_pair.yml +++ b/tests/test_pair.yml @@ -4,7 +4,7 @@ - preprocessing - tumor_normal_pair files: - # - path: results/multiqc + - path: results/multiqc - path: results/preprocessing/test/markduplicates/test.md.cram - path: results/preprocessing/test/markduplicates/test.md.cram.crai - path: results/preprocessing/test/recal_table/test.recal.table @@ -26,6 +26,8 @@ - path: results/preprocessing/csv/recalibrated_test2.csv - path: results/reports/fastqc/test-test_L1 - path: results/reports/fastqc/test2-test2_L1 + - path: results/reports/markduplicates/test/test.md.metrics + - path: results/reports/markduplicates/test2/test2.md.metrics - path: results/reports/qualimap/test/test.mapped - path: results/reports/qualimap/test/test.recal - path: results/reports/qualimap/test2/test2.mapped @@ -34,3 +36,5 @@ - path: results/reports/samtools_stats/test/test.recal.cram.stats - path: results/reports/samtools_stats/test2/test2.md.cram.stats - path: results/reports/samtools_stats/test2/test2.recal.cram.stats + - path: results/reports/deeptools/test/test.bigWig + - path: results/reports/deeptools/test2/test2.bigWig diff --git 
a/tests/test_prepare_recalibration.yml b/tests/test_prepare_recalibration.yml index 9c6221ddc..f35a05ea7 100644 --- a/tests/test_prepare_recalibration.yml +++ b/tests/test_prepare_recalibration.yml @@ -4,7 +4,7 @@ - prepare_recalibration - preprocessing files: - # - path: results/multiqc + - path: results/multiqc - path: results/preprocessing/test/markduplicates/test.md.cram - path: results/preprocessing/test/markduplicates/test.md.cram.crai - path: results/preprocessing/test/recal_table/test.recal.table diff --git a/tests/test_save_bam_mapped.yml b/tests/test_save_bam_mapped.yml index 0113cf97c..7fe4f1b03 100644 --- a/tests/test_save_bam_mapped.yml +++ b/tests/test_save_bam_mapped.yml @@ -4,7 +4,7 @@ - preprocessing - save_bam_mapped files: - # - path: results/multiqc + - path: results/multiqc - path: results/preprocessing/test/mapped/test.bam - path: results/preprocessing/test/mapped/test.bam.bai - path: results/preprocessing/test/markduplicates/test.md.cram diff --git a/tests/test_skip_markduplicates.yml b/tests/test_skip_markduplicates.yml index 5fc14ce2f..fd6f9cf63 100644 --- a/tests/test_skip_markduplicates.yml +++ b/tests/test_skip_markduplicates.yml @@ -5,7 +5,7 @@ - preprocessing - skip_markduplicates files: - # - path: results/multiqc + - path: results/multiqc - path: results/preprocessing/test/mapped/test.bam - path: results/preprocessing/test/mapped/test.bam.bai - path: results/preprocessing/test/recal_table/test.recal.table @@ -24,6 +24,8 @@ - path: results/reports/qualimap/test/test.recal - path: results/reports/samtools_stats/test/test.md.cram.stats - path: results/reports/samtools_stats/test/test.recal.cram.stats + - path: results/reports/deeptools/test/test.bigWig + - name: Run skip markduplicates preparerecal command: nextflow run main.nf -profile test,docker,prepare_recalibration,skip_markduplicates tags: @@ -32,7 +34,7 @@ - preprocessing - skip_markduplicates files: - # - path: results/multiqc + - path: results/multiqc - path: 
results/preprocessing/test/recal_table/test.recal.table - path: results/preprocessing/test/markduplicates/test.md.cram - path: results/preprocessing/test/markduplicates/test.md.cram.crai @@ -48,3 +50,4 @@ # - path: results/reports/qualimap/test/test.recal - path: results/reports/samtools_stats/test/test.md.cram.stats - path: results/reports/samtools_stats/test/test.recal.cram.stats + - path: results/reports/deeptools/test/test.bigWig diff --git a/tests/test_targeted.yml b/tests/test_targeted.yml index 8980ed4fa..f70201535 100644 --- a/tests/test_targeted.yml +++ b/tests/test_targeted.yml @@ -4,7 +4,7 @@ - preprocessing - targeted files: - # - path: results/multiqc + - path: results/multiqc - path: results/preprocessing/test/markduplicates/test.md.cram - path: results/preprocessing/test/markduplicates/test.md.cram.crai - path: results/preprocessing/test/recal_table/test.recal.table @@ -17,7 +17,9 @@ - path: results/preprocessing/csv/recalibrated.csv - path: results/preprocessing/csv/recalibrated_test.csv - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.metrics - path: results/reports/qualimap/test/test.mapped - path: results/reports/qualimap/test/test.recal - path: results/reports/samtools_stats/test/test.md.cram.stats - path: results/reports/samtools_stats/test/test.recal.cram.stats + - path: results/reports/deeptools/test/test.bigWig diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 90f59208c..eb34bcc8b 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -257,14 +257,14 @@ workflow SAREK { // Intervals for speed up preprocessing/variant calling by spread/gather intervals_bed_combined = (params.intervals && params.wes) ? 
Channel.fromPath(params.intervals).collect() : [] - intervals = PREPARE_INTERVALS.out.intervals_bed // multiple interval.bed files, divided by useful intervals for scatter/gather - intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather intervals_bed_combined_gz_tbi = PREPARE_INTERVALS.out.intervals_combined_bed_gz_tbi.collect() // one file containing all intervals interval.bed.gz/.tbi file intervals_bed_combined_gz = intervals_bed_combined_gz_tbi.map{ bed, tbi -> [bed]}.collect() // one file containing all intervals interval.bed.gz file - intervals_for_preprocessing = (!params.wes || params.no_intervals) ? [] : PREPARE_INTERVALS.out.intervals_bed //TODO: intervals also with WGS data? Probably need a parameter if WGS for deepvariant tool, that would allow to check here too - // TODO: needs to figure something out when intervals are made out of the fasta_fai file - num_intervals = !params.no_intervals ? (params.intervals ? count_intervals(file(params.intervals)) : 1) : 1 + intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather + intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather + + //TODO: intervals also with WGS data? Probably need a parameter if WGS for deepvariant tool, that would allow to check here too + intervals_for_preprocessing = (params.wes && !params.no_intervals) ? 
intervals_bed_combined : [] // Gather used softwares versions ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions) @@ -379,6 +379,7 @@ workflow SAREK { new_meta.id = meta.sample // update data_type + //TODO: This is never used again as far as I see, could probably be removed new_meta.data_type = 'bam' // Use groupKey to make sure that the correct group can advance as soon as it is complete @@ -500,8 +501,7 @@ workflow SAREK { fasta_fai, intervals, known_sites, - known_sites_tbi, - num_intervals) + known_sites_tbi) ch_table_bqsr_spark = PREPARE_RECALIBRATION_SPARK.out.table_bqsr @@ -514,8 +514,7 @@ workflow SAREK { fasta_fai, intervals, known_sites, - known_sites_tbi, - num_intervals) + known_sites_tbi) ch_table_bqsr_no_spark = PREPARE_RECALIBRATION.out.table_bqsr @@ -546,12 +545,12 @@ workflow SAREK { ch_cram_variant_calling_spark = Channel.empty() if (params.use_gatk_spark && params.use_gatk_spark.contains('baserecalibrator')) { + RECALIBRATE_SPARK(ch_cram_applybqsr, dict, fasta, fasta_fai, - intervals, - num_intervals) + intervals) ch_cram_variant_calling_spark = RECALIBRATE_SPARK.out.cram @@ -559,12 +558,12 @@ workflow SAREK { ch_versions = ch_versions.mix(RECALIBRATE_SPARK.out.versions) } else { + RECALIBRATE(ch_cram_applybqsr, dict, fasta, fasta_fai, - intervals, - num_intervals) + intervals) ch_cram_variant_calling_no_spark = RECALIBRATE.out.cram @@ -653,8 +652,7 @@ workflow SAREK { intervals, intervals_bed_gz_tbi, intervals_bed_combined_gz_tbi, - intervals_bed_combined_gz, - num_intervals) + intervals_bed_combined_gz) // params.joint_germline) // TUMOR ONLY VARIANT CALLING @@ -671,8 +669,6 @@ workflow SAREK { intervals_bed_combined_gz_tbi, intervals_bed_combined_gz, intervals_bed_combined, - num_intervals, - params.no_intervals, germline_resource, germline_resource_tbi, pon, @@ -695,8 +691,6 @@ workflow SAREK { intervals_bed_combined_gz_tbi, intervals_bed_combined_gz, intervals_bed_combined, - num_intervals, - params.no_intervals, msisensorpro_scan, 
germline_resource, germline_resource_tbi, @@ -752,7 +746,6 @@ workflow SAREK { ch_versions = ch_versions.mix(ANNOTATE.out.versions) ch_reports = ch_reports.mix(ANNOTATE.out.reports) - ch_reports.view() } } @@ -894,17 +887,6 @@ def extract_csv(csv_file) { } } -// Function to count number of intervals -def count_intervals(intervals_file) { - count = 0 - - intervals_file.eachLine{ it -> - count += it.startsWith("@") ? 0 : 1 - } - - return count -} - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END