Fix --intervals false #655

Merged · merged 20 commits · Jul 20, 2022
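This PR fixes the `--intervals false` logic and adds version reporting to the local interval modules. The corrected behaviour matches the new test case added to tests/test_targeted.yml below, i.e. an invocation along these lines:

```bash
# Small test profile with interval handling disabled;
# intervals are then built from the FASTA .fai index instead of a user-supplied file.
nextflow run main.nf -profile test,docker --intervals false
```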
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -149,6 +149,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#642](https://github.com/nf-core/sarek/pull/642) - Only unzip ref files if tool is run, only publish ref files if `--save_reference` and simplify CNVKit logic
- [#650](https://github.com/nf-core/sarek/pull/650) - Fix intervals checks
- [#654](https://github.com/nf-core/sarek/pull/654) - Allow any step but annotation to start from BAM files
- [#655](https://github.com/nf-core/sarek/pull/655) - Fix `--intervals false` logic & add versioning for local modules
- [#658](https://github.com/nf-core/sarek/pull/658) - Fix split fastq names in multiqc-report

### Deprecated
12 changes: 9 additions & 3 deletions modules/local/build_intervals/main.nf
@@ -1,22 +1,28 @@
process BUILD_INTERVALS {
tag "$fasta_fai"
tag "$meta.id"

conda (params.enable_conda ? "anaconda::gawk=5.1.0" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
'quay.io/biocontainers/gawk:5.1.0' }"

input:
path fasta_fai
tuple val(meta), path(fasta_fai)

output:
path "*.bed", emit: bed
tuple val(meta), path("${fasta_fai.baseName}.bed") , emit: bed
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
"""
awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fasta_fai} > ${fasta_fai.baseName}.bed

cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
END_VERSIONS
"""
}
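The module now also writes a versions.yml via the heredoc above. As a rough sketch of the emitted file (the process path and gawk version are illustrative; the version is taken from the pinned gawk 5.1.0 container):

```yaml
"NFCORE_SAREK:SAREK:PREPARE_INTERVALS:BUILD_INTERVALS":
    gawk: 5.1.0
```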
28 changes: 22 additions & 6 deletions modules/local/create_intervals_bed/main.nf
Original file line number Diff line number Diff line change
@@ -7,11 +7,11 @@ process CREATE_INTERVALS_BED {
'quay.io/biocontainers/gawk:5.1.0' }"

input:
path intervals
path(intervals)

output:
path ("*.bed"), emit: bed
//TODO version number missing
path("*.bed") , emit: bed
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when
@@ -20,7 +20,7 @@ process CREATE_INTERVALS_BED {
// If intervals file is in BED format,
// Fifth column is interpreted to contain runtime estimates
// Which is then used to combine short-running jobs
if (intervals.toString().toLowerCase().endsWith("bed"))
if (intervals.toString().toLowerCase().endsWith("bed")) {
"""
awk -vFS="\t" '{
t = \$5 # runtime estimate
@@ -39,19 +39,35 @@
chunk += t
print \$0 > name
}' ${intervals}

cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
END_VERSIONS
"""
else if (intervals.toString().toLowerCase().endsWith("interval_list"))
} else if (intervals.toString().toLowerCase().endsWith("interval_list")) {
"""
grep -v '^@' ${intervals} | awk -vFS="\t" '{
name = sprintf("%s_%d-%d", \$1, \$2, \$3);
printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed"
}'

cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
END_VERSIONS
"""
else
} else {
"""
awk -vFS="[:-]" '{
name = sprintf("%s_%d-%d", \$1, \$2, \$3);
printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed"
}' ${intervals}

cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
END_VERSIONS
"""
}
}
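To make the coordinate handling concrete, here is a hypothetical record from a Picard-style interval_list and the BED file the second branch above would produce (header lines starting with `@` are dropped, and the 1-based start is shifted to BED's 0-based, half-open convention):

```text
# input record (chrom  start  end  strand  name)
21    10000    20000    +    target_1

# written to 21_10000-20000.bed
21    9999    20000
```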
33 changes: 33 additions & 0 deletions subworkflows/local/prepare_cnvkit_reference.nf
@@ -0,0 +1,33 @@
//
// PREPARE CNVKIT REFERENCE
//

// Initialize channels based on params or indices that were just built
// For all modules here:
// A when clause condition is defined in the conf/modules.config to determine if the module should be run

include { CNVKIT_ANTITARGET } from '../../modules/nf-core/modules/cnvkit/antitarget/main'
include { CNVKIT_REFERENCE } from '../../modules/nf-core/modules/cnvkit/reference/main'

workflow PREPARE_CNVKIT_REFERENCE {
take:
fasta // channel: [mandatory] fasta
intervals_bed_combined // channel: [optional] all intervals combined in one BED file

main:

ch_versions = Channel.empty()

// prepare antitarget reference files for tumor_only mode of CNVkit
CNVKIT_ANTITARGET(intervals_bed_combined.flatten().map{ it -> [[id:it[0].baseName], it] })
CNVKIT_REFERENCE(fasta, intervals_bed_combined, CNVKIT_ANTITARGET.out.bed.map{ meta, bed -> [bed]} )

ch_versions = ch_versions.mix(CNVKIT_ANTITARGET.out.versions)
ch_versions = ch_versions.mix(CNVKIT_REFERENCE.out.versions)

emit:
versions = ch_versions
cnvkit_reference = CNVKIT_REFERENCE.out.cnn
}
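For context, the new subworkflow is wired in from workflows/sarek.nf (see the last hunk of this diff), and its output is only kept when the cnvkit tool is requested:

```nextflow
// Antitarget-based reference for CNVkit, built from the combined intervals BED
PREPARE_CNVKIT_REFERENCE(fasta, intervals_bed_combined)
cnvkit_reference = params.tools && params.tools.split(',').contains('cnvkit')
    ? PREPARE_CNVKIT_REFERENCE.out.cnvkit_reference
    : Channel.empty()
```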


10 changes: 0 additions & 10 deletions subworkflows/local/prepare_genome.nf
@@ -10,8 +10,6 @@

include { BWA_INDEX as BWAMEM1_INDEX } from '../../modules/nf-core/modules/bwa/index/main'
include { BWAMEM2_INDEX } from '../../modules/nf-core/modules/bwamem2/index/main'
include {CNVKIT_ANTITARGET } from '../../modules/nf-core/modules/cnvkit/antitarget/main'
include {CNVKIT_REFERENCE } from '../../modules/nf-core/modules/cnvkit/reference/main'
include { DRAGMAP_HASHTABLE } from '../../modules/nf-core/modules/dragmap/hashtable/main'
include { GATK4_CREATESEQUENCEDICTIONARY } from '../../modules/nf-core/modules/gatk4/createsequencedictionary/main'
include { MSISENSORPRO_SCAN } from '../../modules/nf-core/modules/msisensorpro/scan/main'
@@ -37,7 +35,6 @@ workflow PREPARE_GENOME {
fasta // channel: [mandatory] fasta
fasta_fai // channel: [optional] fasta_fai
germline_resource // channel: [optional] germline_resource
intervals_bed_combined // channel: []
known_indels // channel: [optional] known_indels
pon // channel: [optional] pon

@@ -63,10 +60,6 @@ workflow PREPARE_GENOME {
TABIX_KNOWN_INDELS( known_indels.flatten().map{ it -> [[id:it.baseName], it] } )
TABIX_PON(pon.flatten().map{ it -> [[id:it.baseName], it] })

// prepare a reference for tumor_only mode based on target_baits
CNVKIT_ANTITARGET(intervals_bed_combined.flatten().map{ it -> [[id:it[0].baseName], it] })
CNVKIT_REFERENCE(fasta, intervals_bed_combined, CNVKIT_ANTITARGET.out.bed.map{ meta, bed -> [bed]} )

// prepare ascat reference files
allele_files = ascat_alleles
if (params.ascat_alleles && params.ascat_alleles.endsWith('.zip')) {
@@ -106,8 +99,6 @@ workflow PREPARE_GENOME {
ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions)
ch_versions = ch_versions.mix(BWAMEM1_INDEX.out.versions)
ch_versions = ch_versions.mix(BWAMEM2_INDEX.out.versions)
ch_versions = ch_versions.mix(CNVKIT_ANTITARGET.out.versions)
ch_versions = ch_versions.mix(CNVKIT_REFERENCE.out.versions)
ch_versions = ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions)
ch_versions = ch_versions.mix(MSISENSORPRO_SCAN.out.versions)
ch_versions = ch_versions.mix(TABIX_DBSNP.out.versions)
@@ -127,7 +118,6 @@ workflow PREPARE_GENOME {
msisensorpro_scan = MSISENSORPRO_SCAN.out.list.map{ meta, list -> [list] } // path: genome_msi.list
pon_tbi = TABIX_PON.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: pon.vcf.gz.tbi
chr_files = chr_files
cnvkit_reference = CNVKIT_REFERENCE.out.cnn
allele_files = allele_files
loci_files = loci_files
gc_file = gc_file
32 changes: 20 additions & 12 deletions subworkflows/local/prepare_intervals.nf
@@ -6,10 +6,12 @@
// For all modules here:
// A when clause condition is defined in the conf/modules.config to determine if the module should be run

include { BUILD_INTERVALS } from '../../modules/local/build_intervals/main'
include { CREATE_INTERVALS_BED } from '../../modules/local/create_intervals_bed/main'
include { GATK4_INTERVALLISTTOBED } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main'
include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main'
include { BUILD_INTERVALS } from '../../modules/local/build_intervals/main'
include { CNVKIT_ANTITARGET } from '../../modules/nf-core/modules/cnvkit/antitarget/main'
include { CNVKIT_REFERENCE } from '../../modules/nf-core/modules/cnvkit/reference/main'
include { CREATE_INTERVALS_BED } from '../../modules/local/create_intervals_bed/main'
include { GATK4_INTERVALLISTTOBED } from '../../modules/nf-core/modules/gatk4/intervallisttobed/main'
include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../modules/nf-core/modules/tabix/bgziptabix/main'

workflow PREPARE_INTERVALS {
take:
@@ -42,15 +44,21 @@ workflow PREPARE_INTERVALS {
//If no interval/target file is provided, then intervals are generated from FASTA file
if (!params.intervals) {

BUILD_INTERVALS(fasta_fai)
ch_intervals_combined = BUILD_INTERVALS.out.bed.map{it -> [[id:it.simpleName], it] }
BUILD_INTERVALS(fasta_fai.map{it -> [[id:it.baseName], it]})

ch_intervals = CREATE_INTERVALS_BED(ch_intervals_combined)
ch_intervals_combined = BUILD_INTERVALS.out.bed

ch_intervals = CREATE_INTERVALS_BED(ch_intervals_combined.map{meta, path -> path}).bed

ch_versions = ch_versions.mix(BUILD_INTERVALS.out.versions)
ch_versions = ch_versions.mix(CREATE_INTERVALS_BED.out.versions)

} else {

ch_intervals_combined = Channel.fromPath(file(params.intervals)).map{it -> [[id:it.baseName], it] }
ch_intervals = CREATE_INTERVALS_BED(file(params.intervals))

ch_intervals = CREATE_INTERVALS_BED(file(params.intervals)).bed
ch_versions = ch_versions.mix(CREATE_INTERVALS_BED.out.versions)

//If interval file is not provided as .bed, but e.g. as .interval_list then convert to BED format
if(!params.intervals.endsWith(".bed")) {
@@ -98,8 +106,8 @@ workflow PREPARE_INTERVALS {
}

emit:
intervals_bed = ch_intervals // path: intervals.bed, num_intervals [intervals split for parallel execution]
intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi, num_intervals [intervals split for parallel execution]
intervals_bed_combined = ch_intervals_combined.map{meta, bed -> bed }.collect() // path: intervals.bed [all intervals in one file]
versions = ch_versions // channel: [ versions.yml ]
intervals_bed = ch_intervals // path: intervals.bed, num_intervals [intervals split for parallel execution]
intervals_bed_gz_tbi = ch_intervals_bed_gz_tbi // path: target.bed.gz, target.bed.gz.tbi, num_intervals [intervals split for parallel execution]
intervals_bed_combined = ch_intervals_combined.map{meta, bed -> bed }.collect() // path: intervals.bed [all intervals in one file]
versions = ch_versions // channel: [ versions.yml ]
}
28 changes: 28 additions & 0 deletions tests/test_targeted.yml
@@ -31,3 +31,31 @@
- path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi
- path: results/reports/samtools/test/test.md.cram.stats
- path: results/reports/samtools/test/test.recal.cram.stats

- name: Run intervals false pipeline
command: nextflow run main.nf -profile test,docker --intervals false
tags:
- default
- preprocessing
files:
- path: results/csv/markduplicates.csv
- path: results/csv/markduplicates_no_table.csv
- path: results/csv/recalibrated.csv
- path: results/multiqc
- path: results/preprocessing/markduplicates/test/test.md.cram
- path: results/preprocessing/markduplicates/test/test.md.cram.crai
- path: results/preprocessing/recal_table/test/test.recal.table
- path: results/preprocessing/recalibrated/test/test.recal.cram
- path: results/preprocessing/recalibrated/test/test.recal.cram.crai
- path: results/reports/fastqc/test-test_L1
- path: results/reports/markduplicates/test/test.md.metrics
- path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt
- path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt
- path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt
- path: results/reports/mosdepth/test/test.md.regions.bed.gz
- path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt
- path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt
- path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt
- path: results/reports/mosdepth/test/test.recal.regions.bed.gz
- path: results/reports/samtools/test/test.md.cram.stats
- path: results/reports/samtools/test/test.recal.cram.stats
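The new test case can be run locally with pytest-workflow, for example (a sketch, assuming pytest-workflow and Docker are installed; flags follow the pytest-workflow CLI):

```bash
# Run all test cases tagged 'preprocessing' and keep working dirs of failing runs
pytest --tag preprocessing --symlink --kwdof
```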
29 changes: 18 additions & 11 deletions workflows/sarek.nf
@@ -201,6 +201,9 @@ include { PREPARE_GENOME } from '../subwor
// Build intervals if needed
include { PREPARE_INTERVALS } from '../subworkflows/local/prepare_intervals'

// Build CNVkit reference if needed
include { PREPARE_CNVKIT_REFERENCE } from '../subworkflows/local/prepare_cnvkit_reference'

// Convert BAM files to FASTQ files
include { ALIGNMENT_TO_FASTQ as ALIGNMENT_TO_FASTQ_INPUT } from '../subworkflows/nf-core/alignment_to_fastq'
include { ALIGNMENT_TO_FASTQ as ALIGNMENT_TO_FASTQ_UMI } from '../subworkflows/nf-core/alignment_to_fastq'
@@ -296,15 +299,6 @@ workflow SAREK {
// To gather used softwares versions for MultiQC
ch_versions = Channel.empty()

// Build intervals if needed
PREPARE_INTERVALS(fasta_fai)

// Intervals for speed up preprocessing/variant calling by spread/gather
intervals_bed_combined = params.no_intervals ? Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_combined // [interval.bed] all intervals in one file
intervals_for_preprocessing = params.wes ? intervals_bed_combined : [] // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS)

intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather
intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather

// Build indices if needed
PREPARE_GENOME(
@@ -317,7 +311,6 @@
fasta,
fasta_fai,
germline_resource,
intervals_bed_combined,
known_indels,
pon)

@@ -326,7 +319,6 @@
bwa = params.fasta ? params.bwa ? Channel.fromPath(params.bwa).collect() : PREPARE_GENOME.out.bwa : []
bwamem2 = params.fasta ? params.bwamem2 ? Channel.fromPath(params.bwamem2).collect() : PREPARE_GENOME.out.bwamem2 : []
chr_files = PREPARE_GENOME.out.chr_files
cnvkit_reference = params.tools && params.tools.split(',').contains('cnvkit') ? PREPARE_GENOME.out.cnvkit_reference : Channel.empty()
dragmap = params.fasta ? params.dragmap ? Channel.fromPath(params.dragmap).collect() : PREPARE_GENOME.out.hashtable : []
dict = params.fasta ? params.dict ? Channel.fromPath(params.dict).collect() : PREPARE_GENOME.out.dict : []
fasta_fai = params.fasta ? params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : PREPARE_GENOME.out.fasta_fai : []
@@ -349,10 +341,25 @@
known_sites = dbsnp.concat(known_indels).collect()
known_sites_tbi = dbsnp_tbi.concat(known_indels_tbi).collect()

// Build intervals if needed
PREPARE_INTERVALS(fasta_fai)

// Intervals for speed up preprocessing/variant calling by spread/gather
intervals_bed_combined = params.no_intervals ? Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_combined // [interval.bed] all intervals in one file
intervals_for_preprocessing = params.wes ? intervals_bed_combined : [] // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS)

intervals = PREPARE_INTERVALS.out.intervals_bed // [interval, num_intervals] multiple interval.bed files, divided by useful intervals for scatter/gather
intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [interval_bed, tbi, num_intervals] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather

// Gather used softwares versions
ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions)
ch_versions = ch_versions.mix(PREPARE_INTERVALS.out.versions)

// Antitarget based reference for CNVKit
PREPARE_CNVKIT_REFERENCE(fasta, intervals_bed_combined)
cnvkit_reference = params.tools && params.tools.split(',').contains('cnvkit') ? PREPARE_CNVKIT_REFERENCE.out.cnvkit_reference : Channel.empty()
ch_versions = ch_versions.mix(PREPARE_CNVKIT_REFERENCE.out.versions)

// PREPROCESSING

if (params.step == 'mapping') {