Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Intervals #609

Merged
merged 9 commits into from
Jun 28, 2022
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#599](https://github.com/nf-core/sarek/pull/599) - Add checks for correct data type for `params.step`
- [#599](https://github.com/nf-core/sarek/pull/599) - Add checks for no empty `--tools` with `--step variant_calling` or `--step annotation`
- [#600](https://github.com/nf-core/sarek/pull/600) - Remove `nf-core lint` warnings
- [#602](https://github.com/nf-core/sarek/pull/602/) - Fixed bug in `alignment_to_fastq` and added tests
- [#602](https://github.com/nf-core/sarek/pull/602) - Fixed bug in `alignment_to_fastq` and added tests
- [#609](https://github.com/nf-core/sarek/pull/609) - Remove unused intervals code, reorganize combined intervals file

### Deprecated

Expand Down
9 changes: 6 additions & 3 deletions subworkflows/local/pair_variant_calling.nf
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,16 @@ workflow PAIR_VARIANT_CALLING {
[meta, tumor_cram, intervals]
}
RUN_MPILEUP_NORMAL(cram_normal_intervals_no_index, fasta)
mpileup_normal = RUN_MPILEUP_NORMAL.out.mpileup
RUN_MPILEUP_TUMOR(cram_tumor_intervals_no_index, fasta)

mpileup_normal = RUN_MPILEUP_NORMAL.out.mpileup
mpileup_tumor = RUN_MPILEUP_TUMOR.out.mpileup
ch_versions = ch_versions.mix(RUN_MPILEUP_NORMAL.out.versions)
ch_versions = ch_versions.mix(RUN_MPILEUP_TUMOR.out.versions)

controlfreec_input = mpileup_normal.cross(mpileup_tumor)
.map{ normal, tumor ->
[normal[0], normal[1], tumor[1], [], [], [], []]
}

RUN_CONTROLFREEC_SOMATIC(controlfreec_input,
fasta,
fasta_fai,
Expand All @@ -99,6 +99,9 @@ workflow PAIR_VARIANT_CALLING {
chr_files,
mappability,
intervals_bed_combined)

ch_versions = ch_versions.mix(RUN_MPILEUP_NORMAL.out.versions)
ch_versions = ch_versions.mix(RUN_MPILEUP_TUMOR.out.versions)
ch_versions = ch_versions.mix(RUN_CONTROLFREEC_SOMATIC.out.versions)
}

Expand Down
54 changes: 22 additions & 32 deletions subworkflows/local/prepare_intervals.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@
// For all modules here:
// A when clause condition is defined in the conf/modules.config to determine if the module should be run

include { BUILD_INTERVALS } from '../../modules/local/build_intervals/main'
include { CREATE_INTERVALS_BED } from '../../modules/local/create_intervals_bed/main'
include { GATK4_INTERVALLISTTOBED } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main'
include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main'
include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_ALL } from '../../modules/nf-core/modules/tabix/bgziptabix/main'
include { BUILD_INTERVALS } from '../../modules/local/build_intervals/main'
include { CREATE_INTERVALS_BED } from '../../modules/local/create_intervals_bed/main'
include { GATK4_INTERVALLISTTOBED } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main'
include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main'

workflow PREPARE_INTERVALS {
take:
Expand All @@ -20,9 +19,9 @@ workflow PREPARE_INTERVALS {

ch_versions = Channel.empty()

ch_intervals = Channel.empty()
ch_intervals_bed_gz_tbi = Channel.empty()
ch_intervals_combined_bed_gz_tbi = Channel.empty() // Create bed.gz and bed.gz.tbi for input/or created interval file. Contains ALL regions.
ch_intervals = Channel.empty() // List of bed files, one for each region
ch_intervals_bed_gz_tbi = Channel.empty() // List of bed.gz, bed,gz.tbi, one for each region
ch_intervals_combined = Channel.empty() // Bed file containing all intervals

if (params.no_intervals) {
file("${params.outdir}/no_intervals.bed").text = "no_intervals\n"
Expand All @@ -35,46 +34,37 @@ workflow PREPARE_INTERVALS {
ch_intervals_bed_gz_tbi = Channel.fromPath(file("${params.outdir}/no_intervals.bed.{gz,gz.tbi}"))
.collect().map{ it -> [it, 0]}

ch_intervals_combined_bed_gz_tbi = Channel.fromPath(file("${params.outdir}/no_intervals.bed.{gz,gz.tbi}"))
.collect()
ch_intervals_combined = Channel.fromPath(file("${params.outdir}/no_intervals.bed")).collect().map{ it -> [[id:it.simpleName], it]}

} else if (params.step != 'annotate' && params.step != 'controlfreec') {

tabix_in_combined = Channel.empty()

//If no interval/target file is provided, then intervals are generated from FASTA file
if (!params.intervals) {

BUILD_INTERVALS(fasta_fai)
tabix_in_combined = BUILD_INTERVALS.out.bed.map{it -> [[id:it.simpleName], it] }
ch_intervals_combined = BUILD_INTERVALS.out.bed.map{it -> [[id:it.simpleName], it] }

ch_intervals = CREATE_INTERVALS_BED(BUILD_INTERVALS.out.bed)
ch_intervals = CREATE_INTERVALS_BED(ch_intervals_combined)

} else {

tabix_in_combined = Channel.fromPath(file(params.intervals)).map{it -> [[id:it.baseName], it] }
ch_intervals_combined = Channel.fromPath(file(params.intervals)).map{it -> [[id:it.baseName], it] }
ch_intervals = CREATE_INTERVALS_BED(file(params.intervals))

//If interval file is not provided as .bed, but e.g. as .interval_list then convert to BED format
if(!params.intervals.endsWith(".bed")) {
GATK4_INTERVALLISTTOBED(tabix_in_combined)
tabix_in_combined = GATK4_INTERVALLISTTOBED.out.bed
GATK4_INTERVALLISTTOBED(ch_intervals_combined)
ch_intervals_combined = GATK4_INTERVALLISTTOBED.out.bed
ch_versions = ch_versions.mix(GATK4_INTERVALLISTTOBED.out.versions)
}

ch_intervals = CREATE_INTERVALS_BED(file(params.intervals))
}

// Now for the interval.bed the following operations are done:
// 1. Complete intervals file (with all intervals) is indexed
// 2. Interval file is split up into multiple bed files for scatter/gather
// 3. Each bed file from 2. is indexed

// 1. Index complete interval file
TABIX_BGZIPTABIX_INTERVAL_ALL(tabix_in_combined)
ch_intervals_combined_bed_gz_tbi = TABIX_BGZIPTABIX_INTERVAL_ALL.out.gz_tbi.map{ meta, bed, tbi -> [bed, tbi] }
ch_versions = ch_versions.mix(TABIX_BGZIPTABIX_INTERVAL_ALL.out.versions)
// 1. Interval file is split up into multiple bed files for scatter/gather
// 2. Each bed file from 2. is indexed

// 2. Interval file is split up into multiple bed files for scatter/gather & grouping together small intervals
// 1. Interval file is split up into multiple bed files for scatter/gather & grouping together small intervals
ch_intervals = ch_intervals.flatten()
.map{ intervalFile ->
def duration = 0.0
Expand All @@ -95,7 +85,7 @@ workflow PREPARE_INTERVALS {
[it, it.size() ] // Adding number of intervals as elements
}.transpose()

// 3. Create bed.gz and bed.gz.tbi for each interval file. They are split by region (see above)
// 2. Create bed.gz and bed.gz.tbi for each interval file. They are split by region (see above)
tabix_in = ch_intervals.map{ file, num_intervals -> [[id:file.baseName], file] }
TABIX_BGZIPTABIX_INTERVAL_SPLIT(tabix_in)
ch_intervals_bed_gz_tbi = TABIX_BGZIPTABIX_INTERVAL_SPLIT.out.gz_tbi.map{ meta, bed, tbi -> [bed, tbi ]}.toList().map{
Expand All @@ -107,8 +97,8 @@ workflow PREPARE_INTERVALS {
}

emit:
intervals_bed = ch_intervals // path: intervals.bed, num_intervals [intervals split for parallel execution]
intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi, num_intervals [intervals split for parallel execution]
intervals_combined_bed_gz_tbi = ch_intervals_combined_bed_gz_tbi // path: interval.bed.gz, interval.bed.gz.tbi [all intervals in one file]
versions = ch_versions // channel: [ versions.yml ]
intervals_bed = ch_intervals // path: intervals.bed, num_intervals [intervals split for parallel execution]
intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi, num_intervals [intervals split for parallel execution]
intervals_bed_combined = ch_intervals_combined.map{meta, bed -> bed } // path: intervals.bed [all intervals in one file]
versions = ch_versions // channel: [ versions.yml ]
}
16 changes: 5 additions & 11 deletions workflows/sarek.nf
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ for (param in checkPathParamList) if (param) file(param, checkIfExists: true)
// Set input, can either be from --input or from automatic retrieval in WorkflowSarek.groovy
ch_input_sample = extract_csv(file(params.input, checkIfExists: true))

if (params.wes) {
if (params.wes && !params.step == 'annotate') {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, so you fixed it already 👍

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah since it was a one liner at roughly fit into the theme: fix intervals :D

if (params.intervals && !params.intervals.endsWith("bed")) exit 1, "Target file specified with `--intervals` must be in BED format"
} else {
if (params.intervals && !params.intervals.endsWith("bed") && !params.intervals.endsWith("interval_list")) exit 1, "Interval file must end with .bed or .interval_list"
Expand Down Expand Up @@ -291,17 +291,11 @@ workflow SAREK {
PREPARE_INTERVALS(fasta_fai)

// Intervals for speed up preprocessing/variant calling by spread/gather
// this is not good, we need the combined bed for some tools that don't support scatter/gather. Why would we not use the same intervals for WGS?
// intervals_bed_combined = (params.intervals && params.wes) ? Channel.fromPath(params.intervals).collect() : []
// check if this actually still works if interval_list format
intervals_bed_combined = params.intervals ? Channel.fromPath(params.intervals).collect() : []
//TODO: intervals also with WGS data? Probably need a parameter if WGS for deepvariant tool, that would allow to check here too
intervals_for_preprocessing = (params.wes && params.intervals) ? intervals_bed_combined : []

intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather
intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather

intervals_bed_combined = params.no_intervals ? [] : PREPARE_INTERVALS.out.intervals_bed_combined // [interval.bed] all intervals in one file
intervals_for_preprocessing = params.wes ? intervals_bed_combined : [] // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS)

intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather
intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather

// Gather used softwares versions
ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions)
Expand Down