diff --git a/CHANGELOG.md b/CHANGELOG.md index 0321be4f1..146a0d834 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -149,6 +149,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#642](https://github.com/nf-core/sarek/pull/642) - Only unzip ref files if tool is run, only publish ref files if `--save_reference` and simplify CNKit logic - [#650](https://github.com/nf-core/sarek/pull/650) - Fix intervals checks - [#654](https://github.com/nf-core/sarek/pull/654) - Allow any step but annotation to start from BAM files +- [#655](https://github.com/nf-core/sarek/pull/655) - Fix `--intervals false` logic & add versioning for local modules - [#658](https://github.com/nf-core/sarek/pull/658) - Fix split fastq names in multiqc-report ### Deprecated diff --git a/modules/local/build_intervals/main.nf b/modules/local/build_intervals/main.nf index 77f4a5b82..9f14182ea 100644 --- a/modules/local/build_intervals/main.nf +++ b/modules/local/build_intervals/main.nf @@ -1,5 +1,5 @@ process BUILD_INTERVALS { - tag "$fasta_fai" + tag "$meta.id" conda (params.enable_conda ? "anaconda::gawk=5.1.0" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
@@ -7,10 +7,11 @@ process BUILD_INTERVALS { 'quay.io/biocontainers/gawk:5.1.0' }" input: - path fasta_fai + tuple val(meta), path(fasta_fai) output: - path "*.bed", emit: bed + tuple val(meta), path("${fasta_fai.baseName}.bed") , emit: bed // the BED file the awk command in the script block redirects to, paired with its meta map + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -18,5 +19,10 @@ process BUILD_INTERVALS { script: """ awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fasta_fai} > ${fasta_fai.baseName}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS """ } diff --git a/modules/local/create_intervals_bed/main.nf b/modules/local/create_intervals_bed/main.nf index 50376259e..f46a0df09 100644 --- a/modules/local/create_intervals_bed/main.nf +++ b/modules/local/create_intervals_bed/main.nf @@ -7,11 +7,11 @@ process CREATE_INTERVALS_BED { 'quay.io/biocontainers/gawk:5.1.0' }" input: - path intervals + path(intervals) output: - path ("*.bed"), emit: bed - //TODO version number missing + path("*.bed") , emit: bed // collects every *.bed file the script leaves in the task directory + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -20,7 +20,7 @@ process CREATE_INTERVALS_BED { // If intervals file is in BED format, // Fifth column is interpreted to contain runtime estimates // Which is then used to combine short-running jobs - if (intervals.toString().toLowerCase().endsWith("bed")) + if (intervals.toString().toLowerCase().endsWith("bed")) { """ awk -vFS="\t" '{ t = \$5 # runtime estimate @@ -39,19 +39,35 @@ process CREATE_INTERVALS_BED { chunk += t print \$0 > name }' ${intervals} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS """ - else if (intervals.toString().toLowerCase().endsWith("interval_list")) + } else if (intervals.toString().toLowerCase().endsWith("interval_list")) { """ grep -v '^@' ${intervals} | awk -vFS="\t" '{ name = sprintf("%s_%d-%d", \$1, \$2, \$3); 
printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" }' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS """ - else + } else { """ awk -vFS="[:-]" '{ name = sprintf("%s_%d-%d", \$1, \$2, \$3); printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" }' ${intervals} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS """ + } } diff --git a/subworkflows/local/prepare_cnvkit_reference.nf b/subworkflows/local/prepare_cnvkit_reference.nf new file mode 100644 index 000000000..67042c23e --- /dev/null +++ b/subworkflows/local/prepare_cnvkit_reference.nf @@ -0,0 +1,33 @@ +// +// PREPARE CNVKIT REFERENCE +// + +// Build the CNVkit antitarget BED and the CNVkit reference from the combined intervals +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { CNVKIT_ANTITARGET } from '../../modules/nf-core/modules/cnvkit/antitarget/main' +include { CNVKIT_REFERENCE } from '../../modules/nf-core/modules/cnvkit/reference/main' + +workflow PREPARE_CNVKIT_REFERENCE { + take: + fasta // channel: [mandatory] fasta + intervals_bed_combined // channel: intervals.bed [all intervals in one file]; workflows/sarek.nf passes [] when params.no_intervals is set + + main: + + ch_versions = Channel.empty() + + // Prepare the antitarget reference files for the tumor_only mode of CNVkit. NOTE(review): the meta id uses it[0].baseName, unlike the it.baseName used for the TABIX_* inputs in prepare_genome.nf (confirm this is intended) + CNVKIT_ANTITARGET(intervals_bed_combined.flatten().map{ it -> [[id:it[0].baseName], it] }) + CNVKIT_REFERENCE(fasta, intervals_bed_combined, CNVKIT_ANTITARGET.out.bed.map{ meta, bed -> [bed]} ) + + ch_versions = ch_versions.mix(CNVKIT_ANTITARGET.out.versions) + ch_versions = ch_versions.mix(CNVKIT_REFERENCE.out.versions) + + emit: + versions = ch_versions // channel: versions emitted by CNVKIT_ANTITARGET and CNVKIT_REFERENCE + cnvkit_reference = CNVKIT_REFERENCE.out.cnn +} + + diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index b6287b6ab..f01e0c221 100644 --- a/subworkflows/local/prepare_genome.nf +++ 
b/subworkflows/local/prepare_genome.nf @@ -10,8 +10,6 @@ include { BWA_INDEX as BWAMEM1_INDEX } from '../../modules/nf-core/modules/bwa/index/main' include { BWAMEM2_INDEX } from '../../modules/nf-core/modules/bwamem2/index/main' -include {CNVKIT_ANTITARGET } from '../../modules/nf-core/modules/cnvkit/antitarget/main' -include {CNVKIT_REFERENCE } from '../../modules/nf-core/modules/cnvkit/reference/main' include { DRAGMAP_HASHTABLE } from '../../modules/nf-core/modules/dragmap/hashtable/main' include { GATK4_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/modules/gatk4/createsequencedictionary/main' include { MSISENSORPRO_SCAN } from '../../modules/nf-core/modules/msisensorpro/scan/main' @@ -37,7 +35,6 @@ workflow PREPARE_GENOME { fasta // channel: [mandatory] fasta fasta_fai // channel: [optional] fasta_fai germline_resource // channel: [optional] germline_resource - intervals_bed_combined // channel: [] known_indels // channel: [optional] known_indels pon // channel: [optional] pon @@ -63,10 +60,6 @@ workflow PREPARE_GENOME { TABIX_KNOWN_INDELS( known_indels.flatten().map{ it -> [[id:it.baseName], it] } ) TABIX_PON(pon.flatten().map{ it -> [[id:it.baseName], it] }) - // prepare a reference for tumor_only mode based on target_baits - CNVKIT_ANTITARGET(intervals_bed_combined.flatten().map{ it -> [[id:it[0].baseName], it] }) - CNVKIT_REFERENCE(fasta, intervals_bed_combined, CNVKIT_ANTITARGET.out.bed.map{ meta, bed -> [bed]} ) - // prepare ascat reference files allele_files = ascat_alleles if (params.ascat_alleles && params.ascat_alleles.endsWith('.zip')) { @@ -106,8 +99,6 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions) ch_versions = ch_versions.mix(BWAMEM1_INDEX.out.versions) ch_versions = ch_versions.mix(BWAMEM2_INDEX.out.versions) - ch_versions = ch_versions.mix(CNVKIT_ANTITARGET.out.versions) - ch_versions = ch_versions.mix(CNVKIT_REFERENCE.out.versions) ch_versions = 
ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions) ch_versions = ch_versions.mix(MSISENSORPRO_SCAN.out.versions) ch_versions = ch_versions.mix(TABIX_DBSNP.out.versions) @@ -127,7 +118,6 @@ workflow PREPARE_GENOME { msisensorpro_scan = MSISENSORPRO_SCAN.out.list.map{ meta, list -> [list] } // path: genome_msi.list pon_tbi = TABIX_PON.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: pon.vcf.gz.tbi chr_files = chr_files - cnvkit_reference = CNVKIT_REFERENCE.out.cnn allele_files = allele_files loci_files = loci_files gc_file = gc_file diff --git a/subworkflows/local/prepare_intervals.nf b/subworkflows/local/prepare_intervals.nf index 2da68f065..03a5498fa 100644 --- a/subworkflows/local/prepare_intervals.nf +++ b/subworkflows/local/prepare_intervals.nf @@ -6,10 +6,10 @@ // For all modules here: // A when clause condition is defined in the conf/modules.config to determine if the module should be run -include { BUILD_INTERVALS } from '../../modules/local/build_intervals/main' -include { CREATE_INTERVALS_BED } from '../../modules/local/create_intervals_bed/main' -include { GATK4_INTERVALLISTTOBED } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main' -include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main' +include { BUILD_INTERVALS } from '../../modules/local/build_intervals/main' +include { CREATE_INTERVALS_BED } from '../../modules/local/create_intervals_bed/main' +include { GATK4_INTERVALLISTTOBED } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main' workflow PREPARE_INTERVALS { take: @@ -42,15 +42,21 @@ workflow PREPARE_INTERVALS { //If no interval/target file is 
provided, then intervals are generated from FASTA file if (!params.intervals) { - BUILD_INTERVALS(fasta_fai) - ch_intervals_combined = BUILD_INTERVALS.out.bed.map{it -> [[id:it.simpleName], it] } + BUILD_INTERVALS(fasta_fai.map{it -> [[id:it.baseName], it]}) // BUILD_INTERVALS takes [meta, fasta_fai]; the meta id is the index file's baseName - ch_intervals = CREATE_INTERVALS_BED(ch_intervals_combined) + ch_intervals_combined = BUILD_INTERVALS.out.bed + + ch_intervals = CREATE_INTERVALS_BED(ch_intervals_combined.map{meta, path -> path}).bed // CREATE_INTERVALS_BED takes a bare path, so the meta map is dropped here + + ch_versions = ch_versions.mix(BUILD_INTERVALS.out.versions) + ch_versions = ch_versions.mix(CREATE_INTERVALS_BED.out.versions) } else { ch_intervals_combined = Channel.fromPath(file(params.intervals)).map{it -> [[id:it.baseName], it] } - ch_intervals = CREATE_INTERVALS_BED(file(params.intervals)) + + ch_intervals = CREATE_INTERVALS_BED(file(params.intervals)).bed + ch_versions = ch_versions.mix(CREATE_INTERVALS_BED.out.versions) //If interval file is not provided as .bed, but e.g. as .interval_list then convert to BED format if(!params.intervals.endsWith(".bed")) { @@ -98,8 +104,8 @@ workflow PREPARE_INTERVALS { } emit: - intervals_bed = ch_intervals // path: intervals.bed, num_intervals [intervals split for parallel execution] - intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi, num_intervals [intervals split for parallel execution] - intervals_bed_combined = ch_intervals_combined.map{meta, bed -> bed }.collect() // path: intervals.bed [all intervals in one file] - versions = ch_versions // channel: [ versions.yml ] + intervals_bed = ch_intervals // path: intervals.bed, num_intervals [intervals split for parallel execution] + intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi, num_intervals [intervals split for parallel execution] + intervals_bed_combined = ch_intervals_combined.map{meta, bed -> bed }.collect() // path: intervals.bed [all intervals in one file] + versions = ch_versions // channel: [ versions.yml ] } diff --git 
a/tests/test_targeted.yml b/tests/test_targeted.yml index 909b1b332..23f84129b 100644 --- a/tests/test_targeted.yml +++ b/tests/test_targeted.yml @@ -31,3 +31,31 @@ - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi - path: results/reports/samtools/test/test.md.cram.stats - path: results/reports/samtools/test/test.recal.cram.stats + +- name: Run intervals false pipeline + command: nextflow run main.nf -profile test,docker --intervals false + tags: + - default + - preprocessing + files: + - path: results/csv/markduplicates.csv + - path: results/csv/markduplicates_no_table.csv + - path: results/csv/recalibrated.csv + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + - path: results/preprocessing/recal_table/test/test.recal.table + - path: results/preprocessing/recalibrated/test/test.recal.cram + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.metrics + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + - path: results/reports/samtools/test/test.md.cram.stats + - path: results/reports/samtools/test/test.recal.cram.stats diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 750d06bbb..32ac29309 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -201,6 +201,9 @@ include { PREPARE_GENOME } from '../subwor 
// Build intervals if needed include { PREPARE_INTERVALS } from '../subworkflows/local/prepare_intervals' +// Build CNVkit reference if needed +include { PREPARE_CNVKIT_REFERENCE } from '../subworkflows/local/prepare_cnvkit_reference' + // Convert BAM files to FASTQ files include { ALIGNMENT_TO_FASTQ as ALIGNMENT_TO_FASTQ_INPUT } from '../subworkflows/nf-core/alignment_to_fastq' include { ALIGNMENT_TO_FASTQ as ALIGNMENT_TO_FASTQ_UMI } from '../subworkflows/nf-core/alignment_to_fastq' @@ -296,15 +299,6 @@ workflow SAREK { // To gather used softwares versions for MultiQC ch_versions = Channel.empty() - // Build intervals if needed - PREPARE_INTERVALS(fasta_fai) - - // Intervals for speed up preprocessing/variant calling by spread/gather - intervals_bed_combined = params.no_intervals ? Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_combined // [interval.bed] all intervals in one file - intervals_for_preprocessing = params.wes ? intervals_bed_combined : [] // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS) - - intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather - intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather // Build indices if needed PREPARE_GENOME( @@ -317,7 +311,6 @@ workflow SAREK { fasta, fasta_fai, germline_resource, - intervals_bed_combined, known_indels, pon) @@ -326,7 +319,6 @@ workflow SAREK { bwa = params.fasta ? params.bwa ? Channel.fromPath(params.bwa).collect() : PREPARE_GENOME.out.bwa : [] bwamem2 = params.fasta ? params.bwamem2 ? Channel.fromPath(params.bwamem2).collect() : PREPARE_GENOME.out.bwamem2 : [] chr_files = PREPARE_GENOME.out.chr_files - cnvkit_reference = params.tools && params.tools.split(',').contains('cnvkit') ? 
PREPARE_GENOME.out.cnvkit_reference : Channel.empty() dragmap = params.fasta ? params.dragmap ? Channel.fromPath(params.dragmap).collect() : PREPARE_GENOME.out.hashtable : [] dict = params.fasta ? params.dict ? Channel.fromPath(params.dict).collect() : PREPARE_GENOME.out.dict : [] fasta_fai = params.fasta ? params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : PREPARE_GENOME.out.fasta_fai : [] @@ -349,10 +341,25 @@ workflow SAREK { known_sites = dbsnp.concat(known_indels).collect() known_sites_tbi = dbsnp_tbi.concat(known_indels_tbi).collect() + // Build intervals if needed; called here so that it receives the fasta_fai resolved above (params.fasta_fai or the PREPARE_GENOME output) + PREPARE_INTERVALS(fasta_fai) + + // Intervals used to speed up preprocessing/variant calling by scatter/gather + intervals_bed_combined = params.no_intervals ? Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_combined // [interval.bed] all intervals in one file + intervals_for_preprocessing = params.wes ? intervals_bed_combined : [] // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS) + + intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather + intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather + // Gather used softwares versions ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions) ch_versions = ch_versions.mix(PREPARE_INTERVALS.out.versions) + // Antitarget-based reference for CNVkit; only kept when 'cnvkit' is listed in params.tools, otherwise an empty channel is used + PREPARE_CNVKIT_REFERENCE(fasta, intervals_bed_combined) + cnvkit_reference = params.tools && params.tools.split(',').contains('cnvkit') ? PREPARE_CNVKIT_REFERENCE.out.cnvkit_reference : Channel.empty() + ch_versions = ch_versions.mix(PREPARE_CNVKIT_REFERENCE.out.versions) + // PREPROCESSING if (params.step == 'mapping') {