diff --git a/CHANGELOG.md b/CHANGELOG.md index 54583b0d0..ad04aa23f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -104,7 +104,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#599](https://github.com/nf-core/sarek/pull/599) - Add checks for correct data type for `params.step` - [#599](https://github.com/nf-core/sarek/pull/599) - Add checks for no empty `--tools` with `--step variant_calling` or `--step annotation` - [#600](https://github.com/nf-core/sarek/pull/600) - Remove `nf-core lint` warnings -- [#602](https://github.com/nf-core/sarek/pull/602/) - Fixed bug in `alignment_to_fastq` and added tests +- [#602](https://github.com/nf-core/sarek/pull/602) - Fixed bug in `alignment_to_fastq` and added tests +- [#609](https://github.com/nf-core/sarek/pull/609) - Remove unused intervals code, reorganize combined intervals file ### Deprecated diff --git a/subworkflows/local/pair_variant_calling.nf b/subworkflows/local/pair_variant_calling.nf index f4480b081..7416486c4 100644 --- a/subworkflows/local/pair_variant_calling.nf +++ b/subworkflows/local/pair_variant_calling.nf @@ -81,16 +81,16 @@ workflow PAIR_VARIANT_CALLING { [meta, tumor_cram, intervals] } RUN_MPILEUP_NORMAL(cram_normal_intervals_no_index, fasta) - mpileup_normal = RUN_MPILEUP_NORMAL.out.mpileup RUN_MPILEUP_TUMOR(cram_tumor_intervals_no_index, fasta) + + mpileup_normal = RUN_MPILEUP_NORMAL.out.mpileup mpileup_tumor = RUN_MPILEUP_TUMOR.out.mpileup - ch_versions = ch_versions.mix(RUN_MPILEUP_NORMAL.out.versions) - ch_versions = ch_versions.mix(RUN_MPILEUP_TUMOR.out.versions) controlfreec_input = mpileup_normal.cross(mpileup_tumor) .map{ normal, tumor -> [normal[0], normal[1], tumor[1], [], [], [], []] } + RUN_CONTROLFREEC_SOMATIC(controlfreec_input, fasta, fasta_fai, @@ -99,6 +99,9 @@ workflow PAIR_VARIANT_CALLING { chr_files, mappability, intervals_bed_combined) + + ch_versions = ch_versions.mix(RUN_MPILEUP_NORMAL.out.versions) + ch_versions = 
ch_versions.mix(RUN_MPILEUP_TUMOR.out.versions) ch_versions = ch_versions.mix(RUN_CONTROLFREEC_SOMATIC.out.versions) } diff --git a/subworkflows/local/prepare_intervals.nf b/subworkflows/local/prepare_intervals.nf index 1673a2af5..7d4e0a025 100644 --- a/subworkflows/local/prepare_intervals.nf +++ b/subworkflows/local/prepare_intervals.nf @@ -6,11 +6,10 @@ // For all modules here: // A when clause condition is defined in the conf/modules.config to determine if the module should be run -include { BUILD_INTERVALS } from '../../modules/local/build_intervals/main' -include { CREATE_INTERVALS_BED } from '../../modules/local/create_intervals_bed/main' -include { GATK4_INTERVALLISTTOBED } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main' -include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main' -include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_ALL } from '../../modules/nf-core/modules/tabix/bgziptabix/main' +include { BUILD_INTERVALS } from '../../modules/local/build_intervals/main' +include { CREATE_INTERVALS_BED } from '../../modules/local/create_intervals_bed/main' +include { GATK4_INTERVALLISTTOBED } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main' workflow PREPARE_INTERVALS { take: @@ -20,9 +19,9 @@ workflow PREPARE_INTERVALS { ch_versions = Channel.empty() - ch_intervals = Channel.empty() - ch_intervals_bed_gz_tbi = Channel.empty() - ch_intervals_combined_bed_gz_tbi = Channel.empty() // Create bed.gz and bed.gz.tbi for input/or created interval file. Contains ALL regions. 
+ ch_intervals = Channel.empty() // List of bed files, one for each region + ch_intervals_bed_gz_tbi = Channel.empty() // List of bed.gz, bed.gz.tbi, one for each region + ch_intervals_combined = Channel.empty() // Bed file containing all intervals if (params.no_intervals) { file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" @@ -35,46 +34,37 @@ workflow PREPARE_INTERVALS { ch_intervals_bed_gz_tbi = Channel.fromPath(file("${params.outdir}/no_intervals.bed.{gz,gz.tbi}")) .collect().map{ it -> [it, 0]} - ch_intervals_combined_bed_gz_tbi = Channel.fromPath(file("${params.outdir}/no_intervals.bed.{gz,gz.tbi}")) - .collect() + ch_intervals_combined = Channel.fromPath(file("${params.outdir}/no_intervals.bed")).collect().map{ it -> [[id:it.simpleName], it]} } else if (params.step != 'annotate' && params.step != 'controlfreec') { - tabix_in_combined = Channel.empty() - //If no interval/target file is provided, then intervals are generated from FASTA file if (!params.intervals) { BUILD_INTERVALS(fasta_fai) - tabix_in_combined = BUILD_INTERVALS.out.bed.map{it -> [[id:it.simpleName], it] } + ch_intervals_combined = BUILD_INTERVALS.out.bed.map{it -> [[id:it.simpleName], it] } - ch_intervals = CREATE_INTERVALS_BED(BUILD_INTERVALS.out.bed) + ch_intervals = CREATE_INTERVALS_BED(BUILD_INTERVALS.out.bed) } else { - tabix_in_combined = Channel.fromPath(file(params.intervals)).map{it -> [[id:it.baseName], it] } + ch_intervals_combined = Channel.fromPath(file(params.intervals)).map{it -> [[id:it.baseName], it] } + ch_intervals = CREATE_INTERVALS_BED(file(params.intervals)) //If interval file is not provided as .bed, but e.g. 
as .interval_list then convert to BED format if(!params.intervals.endsWith(".bed")) { - GATK4_INTERVALLISTTOBED(tabix_in_combined) - tabix_in_combined = GATK4_INTERVALLISTTOBED.out.bed + GATK4_INTERVALLISTTOBED(ch_intervals_combined) + ch_intervals_combined = GATK4_INTERVALLISTTOBED.out.bed ch_versions = ch_versions.mix(GATK4_INTERVALLISTTOBED.out.versions) } - ch_intervals = CREATE_INTERVALS_BED(file(params.intervals)) } // Now for the interval.bed the following operations are done: - // 1. Complete intervals file (with all intervals) is indexed - // 2. Interval file is split up into multiple bed files for scatter/gather - // 3. Each bed file from 2. is indexed - - // 1. Index complete interval file - TABIX_BGZIPTABIX_INTERVAL_ALL(tabix_in_combined) - ch_intervals_combined_bed_gz_tbi = TABIX_BGZIPTABIX_INTERVAL_ALL.out.gz_tbi.map{ meta, bed, tbi -> [bed, tbi] } - ch_versions = ch_versions.mix(TABIX_BGZIPTABIX_INTERVAL_ALL.out.versions) + // 1. Interval file is split up into multiple bed files for scatter/gather + // 2. Each bed file from 1. is indexed - // 2. Interval file is split up into multiple bed files for scatter/gather & grouping together small intervals + // 1. Interval file is split up into multiple bed files for scatter/gather & grouping together small intervals ch_intervals = ch_intervals.flatten() .map{ intervalFile -> def duration = 0.0 @@ -95,7 +85,7 @@ workflow PREPARE_INTERVALS { [it, it.size() ] // Adding number of intervals as elements }.transpose() - // 3. Create bed.gz and bed.gz.tbi for each interval file. They are split by region (see above) + // 2. Create bed.gz and bed.gz.tbi for each interval file. 
They are split by region (see above) tabix_in = ch_intervals.map{ file, num_intervals -> [[id:file.baseName], file] } TABIX_BGZIPTABIX_INTERVAL_SPLIT(tabix_in) ch_intervals_bed_gz_tbi = TABIX_BGZIPTABIX_INTERVAL_SPLIT.out.gz_tbi.map{ meta, bed, tbi -> [bed, tbi ]}.toList().map{ @@ -107,8 +97,8 @@ workflow PREPARE_INTERVALS { } emit: - intervals_bed = ch_intervals // path: intervals.bed, num_intervals [intervals split for parallel execution] - intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi, num_intervals [intervals split for parallel execution] - intervals_combined_bed_gz_tbi = ch_intervals_combined_bed_gz_tbi // path: interval.bed.gz, interval.bed.gz.tbi [all intervals in one file] - versions = ch_versions // channel: [ versions.yml ] + intervals_bed = ch_intervals // path: intervals.bed, num_intervals [intervals split for parallel execution] + intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi, num_intervals [intervals split for parallel execution] + intervals_bed_combined = ch_intervals_combined.map{meta, bed -> bed } // path: intervals.bed [all intervals in one file] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/tests/test_tools.yml b/tests/test_tools.yml index 4106ee812..1c4f37f21 100644 --- a/tests/test_tools.yml +++ b/tests/test_tools.yml @@ -70,45 +70,6 @@ - path: results/variant_calling/sample1/cnvkit/test.paired_end.recalibrated.sorted.cns - path: results/variant_calling/sample1/cnvkit/test.paired_end.recalibrated.sorted.call.cns -- name: Run variant calling on somatic sample with tiddit - command: nextflow run main.nf -profile test,tools_somatic,docker --tools tiddit -c ./tests/nextflow.config - tags: - - tiddit - - somatic - - variant_calling - files: - - path: results/variant_calling/sample3/tiddit/sample3.ploidies.tab - - path: results/variant_calling/sample3/tiddit/sample3.vcf.gz - - path: results/variant_calling/sample3/tiddit/sample3.vcf.gz.tbi - - 
path: results/variant_calling/sample4_vs_sample3/tiddit/sample4_vs_sample3.normal.vcf.gz - - path: results/variant_calling/sample4_vs_sample3/tiddit/sample4_vs_sample3.normal.vcf.gz.tbi - - path: results/variant_calling/sample4_vs_sample3/tiddit/sample4_vs_sample3.tumor.vcf.gz - - path: results/variant_calling/sample4_vs_sample3/tiddit/sample4_vs_sample3.tumor.vcf.gz.tbi - - path: results/variant_calling/sample4_vs_sample3/tiddit/sample4_vs_sample3_sv_merge.vcf - - path: results/variant_calling/sample4_vs_sample3/tiddit/sample4_vs_sample3.ploidies.tab - -- name: Run variant calling on germline sample with tiddit - command: nextflow run main.nf -profile test,tools_germline,docker --tools tiddit -c ./tests/nextflow.config - tags: - - tiddit - - germline - - variant_calling - files: - - path: results/variant_calling/sample1/tiddit/sample1.ploidies.tab - - path: results/variant_calling/sample1/tiddit/sample1.vcf.gz - - path: results/variant_calling/sample1/tiddit/sample1.vcf.gz.tbi - -- name: Run variant calling on tumor_only sample with tiddit - command: nextflow run main.nf -profile test,tools_tumoronly,docker --tools tiddit -c ./tests/nextflow.config - tags: - - tiddit - - tumor_only - - variant_calling - files: - - path: results/variant_calling/sample2/tiddit/sample2.ploidies.tab - - path: results/variant_calling/sample2/tiddit/sample2.vcf.gz - - path: results/variant_calling/sample2/tiddit/sample2.vcf.gz.tbi - - name: Run variant calling on somatic samples with controlfreec command: nextflow run main.nf -profile test,tools_somatic,docker --tools controlfreec -c ./tests/nextflow.config tags: @@ -135,7 +96,7 @@ - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3.tumor.mpileup.gz_sample.cpn - name: Run variant calling on somatic samples with controlfreec without intervals - command: nextflow run main.nf -profile test,tools_somatic,docker --tools controlfreec -c ./tests/nextflow.config --no_intervals + command: nextflow run main.nf 
-profile test,tools_somatic,docker --tools controlfreec -c ./tests/nextflow.config --no_intervals -stub-run tags: - controlfreec - no_intervals @@ -150,15 +111,15 @@ - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3.bed - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3.circos.txt - path: results/variant_calling/sample4_vs_sample3/mpileup/sample4_vs_sample3.normal.mpileup.gz - - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3.normal.mpileup.gz_control.cpn + - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3_sample.cpn - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3.p.value.txt - path: results/variant_calling/sample4_vs_sample3/mpileup/sample4_vs_sample3.tumor.mpileup.gz - - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3.tumor.mpileup.gz_BAF.txt - - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3.tumor.mpileup.gz_CNVs - - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3.tumor.mpileup.gz_info.txt - - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3.tumor.mpileup.gz_ratio.BedGraph - - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3.tumor.mpileup.gz_ratio.txt - - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3.tumor.mpileup.gz_sample.cpn + - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3_BAF.txt + - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3_CNVs + - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3_info.txt + - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3_ratio.BedGraph + - path: results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3_ratio.txt + - path: 
results/variant_calling/sample4_vs_sample3/controlfreec/sample4_vs_sample3_sample.cpn - name: Run variant calling on tumor_only sample with controlfreec command: nextflow run main.nf -profile test,tools_tumoronly,docker --tools controlfreec -stub-run @@ -404,6 +365,40 @@ - path: results/variant_calling/sample4_vs_sample3/manta/sample4_vs_sample3.somatic_sv.vcf.gz - path: results/variant_calling/sample4_vs_sample3/manta/sample4_vs_sample3.somatic_sv.vcf.gz.tbi +- name: Run variant calling on tumor_only sample to test mpileup + command: nextflow run main.nf -profile test,tools_tumoronly,docker --tools mpileup + tags: + - tumor_only + - mpileup + files: + - path: results/variant_calling/sample2/mpileup/sample2.tumor.mpileup.gz + +- name: Run variant calling on tumor_only sample to test mpileup without intervals + command: nextflow run main.nf -profile test,tools_tumoronly,docker --tools mpileup --no_intervals + tags: + - tumor_only + - mpileup + - no_intervals + files: + - path: results/variant_calling/sample2/mpileup/sample2.tumor.mpileup.gz + +- name: Run variant calling on germline sample to test mpileup + command: nextflow run main.nf -profile test,tools_germline,docker --tools mpileup + tags: + - germline + - mpileup + files: + - path: results/variant_calling/sample1/mpileup/sample1.normal.mpileup.gz + +- name: Run variant calling on germline sample to test mpileup without intervals + command: nextflow run main.nf -profile test,tools_germline,docker --tools mpileup --no_intervals + tags: + - germline + - mpileup + - no_intervals + files: + - path: results/variant_calling/sample1/mpileup/sample1.normal.mpileup.gz + - name: Run variant calling on tumoronly sample with mutect2 command: nextflow run main.nf -profile test,tools_tumoronly,docker --tools mutect2 tags: @@ -620,36 +615,41 @@ - path: results/variant_calling/sample4_vs_sample3/strelka/sample4_vs_sample3.somatic_snvs.vcf.gz.tbi - path: results/csv/variantcalled.csv -- name: Run variant calling on tumor_only 
sample to test mpileup - command: nextflow run main.nf -profile test,tools_tumoronly,docker --tools mpileup - tags: - - tumor_only - - mpileup - files: - - path: results/variant_calling/sample2/mpileup/sample2.tumor.mpileup.gz - -- name: Run variant calling on tumor_only sample to test mpileup without intervals - command: nextflow run main.nf -profile test,tools_tumoronly,docker --tools mpileup --no_intervals +- name: Run variant calling on somatic sample with tiddit + command: nextflow run main.nf -profile test,tools_somatic,docker --tools tiddit -c ./tests/nextflow.config tags: - - tumor_only - - mpileup - - no_intervals + - tiddit + - somatic + - variant_calling files: - - path: results/variant_calling/sample2/mpileup/sample2.tumor.mpileup.gz + - path: results/variant_calling/sample3/tiddit/sample3.ploidies.tab + - path: results/variant_calling/sample3/tiddit/sample3.vcf.gz + - path: results/variant_calling/sample3/tiddit/sample3.vcf.gz.tbi + - path: results/variant_calling/sample4_vs_sample3/tiddit/sample4_vs_sample3.normal.vcf.gz + - path: results/variant_calling/sample4_vs_sample3/tiddit/sample4_vs_sample3.normal.vcf.gz.tbi + - path: results/variant_calling/sample4_vs_sample3/tiddit/sample4_vs_sample3.tumor.vcf.gz + - path: results/variant_calling/sample4_vs_sample3/tiddit/sample4_vs_sample3.tumor.vcf.gz.tbi + - path: results/variant_calling/sample4_vs_sample3/tiddit/sample4_vs_sample3_sv_merge.vcf + - path: results/variant_calling/sample4_vs_sample3/tiddit/sample4_vs_sample3.ploidies.tab -- name: Run variant calling on germline sample to test mpileup - command: nextflow run main.nf -profile test,tools_germline,docker --tools mpileup +- name: Run variant calling on germline sample with tiddit + command: nextflow run main.nf -profile test,tools_germline,docker --tools tiddit -c ./tests/nextflow.config tags: + - tiddit - germline - - mpileup + - variant_calling files: - - path: results/variant_calling/sample1/mpileup/sample1.normal.mpileup.gz + - path: 
results/variant_calling/sample1/tiddit/sample1.ploidies.tab + - path: results/variant_calling/sample1/tiddit/sample1.vcf.gz + - path: results/variant_calling/sample1/tiddit/sample1.vcf.gz.tbi -- name: Run variant calling on germline sample to test mpileup without intervals - command: nextflow run main.nf -profile test,tools_germline,docker --tools mpileup --no_intervals +- name: Run variant calling on tumor_only sample with tiddit + command: nextflow run main.nf -profile test,tools_tumoronly,docker --tools tiddit -c ./tests/nextflow.config tags: - - germline - - mpileup - - no_intervals + - tiddit + - tumor_only + - variant_calling files: - - path: results/variant_calling/sample1/mpileup/sample1.normal.mpileup.gz + - path: results/variant_calling/sample2/tiddit/sample2.ploidies.tab + - path: results/variant_calling/sample2/tiddit/sample2.vcf.gz + - path: results/variant_calling/sample2/tiddit/sample2.vcf.gz.tbi diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 22bc0b9dc..67faad102 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -52,7 +52,7 @@ for (param in checkPathParamList) if (param) file(param, checkIfExists: true) // Set input, can either be from --input or from automatic retrieval in WorkflowSarek.groovy ch_input_sample = extract_csv(file(params.input, checkIfExists: true)) -if (params.wes) { +if (params.wes && params.step != 'annotate') { if (params.intervals && !params.intervals.endsWith("bed")) exit 1, "Target file specified with `--intervals` must be in BED format" } else { if (params.intervals && !params.intervals.endsWith("bed") && !params.intervals.endsWith("interval_list")) exit 1, "Interval file must end with .bed or .interval_list" @@ -291,17 +291,11 @@ workflow SAREK { PREPARE_INTERVALS(fasta_fai) // Intervals for speed up preprocessing/variant calling by spread/gather - // this is not good, we need the combined bed for some tools that don't support scatter/gather. Why would we not use the same intervals for WGS? 
- // intervals_bed_combined = (params.intervals && params.wes) ? Channel.fromPath(params.intervals).collect() : [] - // check if this actually still works if interval_list format - intervals_bed_combined = params.intervals ? Channel.fromPath(params.intervals).collect() : [] - //TODO: intervals also with WGS data? Probably need a parameter if WGS for deepvariant tool, that would allow to check here too - intervals_for_preprocessing = (params.wes && params.intervals) ? intervals_bed_combined : [] - - intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather - intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather - + intervals_bed_combined = params.no_intervals ? [] : PREPARE_INTERVALS.out.intervals_bed_combined // [interval.bed] all intervals in one file + intervals_for_preprocessing = params.wes ? intervals_bed_combined : [] // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS) + intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather + intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather // Gather used softwares versions ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions)