Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add skip_tools haplotypecaller_filter #889

Merged
merged 13 commits into from
Dec 12, 2022
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- [#864](https://github.com/nf-core/sarek/pull/864) - Added possibilities to export assembled haplotypes and locally realigned reads
- [#792](https://github.com/nf-core/sarek/pull/792) - Added the option `--concatenate_vcfs` for concatenating the germline vcf-files. Per default, the resulting vcf-files will be placed under `<outDir>/variant_calling/concat`.
- [#889](https://github.com/nf-core/sarek/pull/889) - Added possibilities to skip variant filtering after Haplotypecaller

### Changed

Expand Down
2 changes: 1 addition & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@
"fa_icon": "fas fa-forward",
"description": "Disable specified tools.",
"help_text": "Multiple tools can be specified, separated by commas.\n\n> **NB** `--skip_tools baserecalibrator_report` is actually just not saving the reports.\n> **NB** `--skip_tools markduplicates_report` does not skip `MarkDuplicates` but prevent the collection of duplicate metrics that slows down performance.",
"pattern": "^((baserecalibrator|baserecalibrator_report|bcftools|documentation|fastqc|markduplicates|markduplicates_report|mosdepth|multiqc|samtools|vcftools|versions)?,?)*[^,]+$"
"pattern": "^((baserecalibrator|baserecalibrator_report|bcftools|documentation|fastqc|haplotypecaller_filter|markduplicates|markduplicates_report|mosdepth|multiqc|samtools|vcftools|versions)?,?)*[^,]+$"
}
},
"fa_icon": "fas fa-user-cog"
Expand Down
6 changes: 4 additions & 2 deletions subworkflows/local/bam_variant_calling_germline_all/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ include { BAM_VARIANT_CALLING_SINGLE_TIDDIT } from '../bam_variant_calling_sin
workflow BAM_VARIANT_CALLING_GERMLINE_ALL {
take:
tools // Mandatory, list of tools to apply
skip_tools // Mandatory, list of tools to skip
cram_recalibrated // channel: [mandatory] cram
bwa // channel: [mandatory] bwa
dbsnp // channel: [mandatory] dbsnp
Expand Down Expand Up @@ -180,9 +181,10 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL {
known_sites_indels_tbi,
known_sites_snps,
known_sites_snps_tbi,
intervals_bed_combined_haplotypec)
intervals_bed_combined_haplotypec,
(skip_tools && skip_tools.split(',').contains('haplotypecaller_filter')))

haplotypecaller_vcf = BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.filtered_vcf
haplotypecaller_vcf = BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.vcf
ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.versions)

}
Expand Down
140 changes: 74 additions & 66 deletions subworkflows/local/bam_variant_calling_haplotypecaller/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ workflow BAM_VARIANT_CALLING_HAPLOTYPECALLER {
known_sites_snps
known_sites_snps_tbi
intervals_bed_combined // channel: [mandatory] intervals/target regions in one file unzipped, no_intervals.bed if no_intervals

skip_haplotypecaller_filter

main:

ch_versions = Channel.empty()
filtered_vcf = Channel.empty()
versions = Channel.empty()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we in general replacing ch_versions with versions?

vcf = Channel.empty()
realigned_bam = Channel.empty()

GATK4_HAPLOTYPECALLER(
Expand All @@ -34,28 +34,31 @@ workflow BAM_VARIANT_CALLING_HAPLOTYPECALLER {
dbsnp_tbi)

// Figure out if using intervals or no_intervals
GATK4_HAPLOTYPECALLER.out.vcf.branch{
haplotypecaller_vcf_branch = GATK4_HAPLOTYPECALLER.out.vcf.branch{
intervals: it[0].num_intervals > 1
no_intervals: it[0].num_intervals <= 1
}.set{haplotypecaller_vcf_branch}
}

GATK4_HAPLOTYPECALLER.out.tbi.branch{
haplotypecaller_tbi_branch = GATK4_HAPLOTYPECALLER.out.tbi.branch{
intervals: it[0].num_intervals > 1
no_intervals: it[0].num_intervals <= 1
}.set{haplotypecaller_tbi_branch}
}

GATK4_HAPLOTYPECALLER.out.bam.branch{
haplotypecaller_bam_branch = GATK4_HAPLOTYPECALLER.out.bam.branch{
intervals: it[0].num_intervals > 1
no_intervals: it[0].num_intervals <= 1
}.set{haplotypecaller_bam_branch}
}

if (params.joint_germline) {
// merge vcf and tbis
genotype_gvcf_to_call = Channel.empty().mix(GATK4_HAPLOTYPECALLER.out.vcf
.join(GATK4_HAPLOTYPECALLER.out.tbi)
.join(cram).map{ meta, vcf, tbi, cram, crai, intervals, dragstr_model ->
[ meta, vcf, tbi, intervals ]
})
genotype_gvcf_to_call = Channel.empty().mix(
GATK4_HAPLOTYPECALLER.out.vcf
.join(GATK4_HAPLOTYPECALLER.out.tbi)
.join(cram)
.map{ meta, vcf, tbi, cram, crai, intervals, dragstr_model ->
[ meta, vcf, tbi, intervals ]
})

// make channels from labels
dbsnp_vqsr = params.dbsnp_vqsr ? Channel.value(params.dbsnp_vqsr) : Channel.empty()
known_indels_vqsr = params.known_indels_vqsr ? Channel.value(params.known_indels_vqsr) : Channel.empty()
Expand All @@ -77,84 +80,89 @@ workflow BAM_VARIANT_CALLING_HAPLOTYPECALLER {
known_sites_snps_tbi,
known_snps_vqsr)

filtered_vcf = BAM_JOINT_CALLING_GERMLINE_GATK.out.genotype_vcf
ch_versions = ch_versions.mix(BAM_JOINT_CALLING_GERMLINE_GATK.out.versions)
vcf = BAM_JOINT_CALLING_GERMLINE_GATK.out.genotype_vcf
versions = versions.mix(BAM_JOINT_CALLING_GERMLINE_GATK.out.versions)

} else {

// Only when using intervals
MERGE_HAPLOTYPECALLER(
haplotypecaller_vcf_branch.intervals
.map{ meta, vcf ->

new_meta = [
id: meta.sample,
num_intervals: meta.num_intervals,
patient: meta.patient,
sample: meta.sample,
sex: meta.sex,
status: meta.status
]
id: meta.sample,
num_intervals: meta.num_intervals,
patient: meta.patient,
sample: meta.sample,
sex: meta.sex,
status: meta.status,
variantcaller: "haplotypecaller"
]

[groupKey(new_meta, new_meta.num_intervals), vcf]
}.groupTuple(),
dict.map{ it -> [[id:it[0].baseName], it]})
}.groupTuple(), dict.map{ it -> [[id:it[0].baseName], it]})

haplotypecaller_vcf = Channel.empty().mix(
MERGE_HAPLOTYPECALLER.out.vcf,
haplotypecaller_vcf_branch.no_intervals)
MERGE_HAPLOTYPECALLER.out.vcf,
haplotypecaller_vcf_branch.no_intervals)

haplotypecaller_tbi = Channel.empty().mix(
MERGE_HAPLOTYPECALLER.out.tbi,
haplotypecaller_tbi_branch.no_intervals)
MERGE_HAPLOTYPECALLER.out.tbi,
haplotypecaller_tbi_branch.no_intervals)

// BAM output
BAM_MERGE_INDEX_SAMTOOLS(
haplotypecaller_bam_branch.intervals
.map{ meta, bam ->
.map{ meta, bam ->

new_meta = [
id: meta.sample,
num_intervals: meta.num_intervals,
patient: meta.patient,
sample: meta.sample,
sex: meta.sex,
status: meta.status
]
new_meta = [
id: meta.sample,
num_intervals: meta.num_intervals,
patient: meta.patient,
sample: meta.sample,
sex: meta.sex,
status: meta.status
]

[groupKey(new_meta, new_meta.num_intervals), bam]
}.groupTuple().mix(haplotypecaller_bam_branch.no_intervals))
}.groupTuple()
.mix(haplotypecaller_bam_branch.no_intervals))

realigned_bam = BAM_MERGE_INDEX_SAMTOOLS.out.bam_bai

VCF_VARIANT_FILTERING_GATK(haplotypecaller_vcf.join(haplotypecaller_tbi),
fasta,
fasta_fai,
dict,
intervals_bed_combined,
known_sites_indels.concat(known_sites_snps).flatten().unique().collect(),
known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect())

filtered_vcf = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf.map{ meta, vcf-> [
[
patient:meta.patient,
sample:meta.sample,
status:meta.status,
sex:meta.sex,
id:meta.sample,
num_intervals:meta.num_intervals,
variantcaller:"haplotypecaller"
],
vcf
]
}
if (!skip_haplotypecaller_filter) {

VCF_VARIANT_FILTERING_GATK(
haplotypecaller_vcf.join(haplotypecaller_tbi),
fasta,
fasta_fai,
dict,
intervals_bed_combined,
known_sites_indels.concat(known_sites_snps).flatten().unique().collect(),
known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect())

vcf = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf.map{ meta, vcf ->
[[
id: meta.sample,
num_intervals: meta.num_intervals,
patient: meta.patient,
sample: meta.sample,
sex: meta.sex,
status: meta.status,
variantcaller: "haplotypecaller"
], vcf ]}

versions = versions.mix(VCF_VARIANT_FILTERING_GATK.out.versions)

} else vcf = haplotypecaller_vcf

versions = versions.mix(MERGE_HAPLOTYPECALLER.out.versions)

ch_versions = ch_versions.mix(GATK4_HAPLOTYPECALLER.out.versions)
ch_versions = ch_versions.mix(MERGE_HAPLOTYPECALLER.out.versions)
ch_versions = ch_versions.mix(VCF_VARIANT_FILTERING_GATK.out.versions)
}
versions = versions.mix(GATK4_HAPLOTYPECALLER.out.versions)

emit:
versions = ch_versions
filtered_vcf
realigned_bam
vcf
versions
}
5 changes: 5 additions & 0 deletions tests/config/pytest_tags.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ tumor_normal_pair:
- modules/**
- subworkflows/**
- workflows/**
- nextflow_schema.json
- nextflow.config
- main.nf

Expand All @@ -13,6 +14,7 @@ save_mapped_only:
- modules/**
- subworkflows/**
- workflows/**
- nextflow_schema.json
- nextflow.config
- main.nf

Expand All @@ -21,6 +23,7 @@ save_output_as_bam_only:
- modules/**
- subworkflows/**
- workflows/**
- nextflow_schema.json
- nextflow.config
- main.nf

Expand All @@ -29,6 +32,7 @@ skip_all_qc:
- modules/**
- subworkflows/**
- workflows/**
- nextflow_schema.json
- nextflow.config
- main.nf

Expand All @@ -37,6 +41,7 @@ skip_markduplicates:
- modules/**
- subworkflows/**
- workflows/**
- nextflow_schema.json
- nextflow.config
- main.nf

Expand Down
28 changes: 28 additions & 0 deletions tests/test_haplotypecaller.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,34 @@
# binary changes md5sums on reruns.
- path: results/haplotypecaller
should_exist: false
- name: Run variant calling on germline sample with haplotypecaller and skip filter
command: nextflow run main.nf -profile test,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools haplotypecaller --step variant_calling --skip_tools haplotypecaller_filter --outdir results
tags:
- germline
- haplotypecaller
- variant_calling
files:
- path: results/csv/variantcalled.csv
md5sum: f1041cfc30cedb240f224dd8e3dbf9d2
- path: results/multiqc
- path: results/preprocessing/converted/test/test.converted.cram
# binary changes md5sums on reruns.
- path: results/preprocessing/converted/test/test.converted.cram.crai
# binary changes md5sums on reruns.
- path: results/preprocessing/recalibrated/test/test.recal.cram
should_exist: false
- path: results/preprocessing/recalibrated/test/test.recal.cram.crai
should_exist: false
- path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz
should_exist: false
- path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz.tbi
should_exist: false
- path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz
# binary changes md5sums on reruns.
- path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz.tbi
# binary changes md5sums on reruns.
- path: results/haplotypecaller
should_exist: false
- name: Run variant calling on germline sample with haplotypecaller without intervals
command: nextflow run main.nf -profile test,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools haplotypecaller --step variant_calling --no_intervals --outdir results
tags:
Expand Down
1 change: 1 addition & 0 deletions workflows/sarek.nf
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,7 @@ workflow SAREK {
// GERMLINE VARIANT CALLING
BAM_VARIANT_CALLING_GERMLINE_ALL(
params.tools,
params.skip_tools,
maxulysse marked this conversation as resolved.
Show resolved Hide resolved
ch_cram_variant_calling_status_normal,
[[id:"bwa"],[]], //bwa_index for tiddit; not used here
dbsnp,
Expand Down