Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Joint calling: Fix meta map info for proper joining #1220

Merged
merged 7 commits into from
Sep 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ Rapaselet is a delta formed by the Rapaätno river between the Bielloriehppe mas
- [#1215](https://github.com/nf-core/sarek/pull/1215) - Fix `meta.id` for mutect2 tumor_only subworkflows
- [#1216](https://github.com/nf-core/sarek/pull/1216) - Better test coverage for variant calling `*_all` subworkflows
- [#1217](https://github.com/nf-core/sarek/pull/1217) - Fix `groupTuple` statement for mutect2 tumor_only subworkflows
- [#1220](https://github.com/nf-core/sarek/pull/1220) - Fix channel and meta logic for `joint_mutect2` feature

### Dependencies

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,8 @@ workflow BAM_VARIANT_CALLING_SOMATIC_ALL {
// joint_mutect2 mode needs different meta.map than regular mode
cram.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai ->
joint_mutect2 ?
[ meta + [ id:meta.patient ] - meta.subMap('patient', 'tumor_id'), [ normal_cram, tumor_cram ], [ normal_crai, tumor_crai ] ] :
//we need to keep all fields and then remove on a per-tool-basis to ensure proper joining at the filtering step
[ meta + [ id:meta.patient ], [ normal_cram, tumor_cram ], [ normal_crai, tumor_crai ] ] :
[ meta, [ normal_cram, tumor_cram ], [ normal_crai, tumor_crai ] ]
},
// Remap channel to match module/subworkflow
Expand Down
46 changes: 31 additions & 15 deletions subworkflows/local/bam_variant_calling_somatic_mutect2/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -40,20 +40,28 @@ workflow BAM_VARIANT_CALLING_SOMATIC_MUTECT2 {
.map{ meta, input_list, input_index_list, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], input_list, input_index_list, intervals ] }

if (joint_mutect2) {
// Separate normal cram files and remove duplicates
ch_normal_cram = input.map{ meta, cram, crai -> [ meta, cram[0], crai[0] ] }.unique()

// Separate normal cram files
// Extract tumor cram files
ch_tumor_cram = input.map{ meta, cram, crai -> [ meta, cram[1], crai[1] ] }
// Merge normal and tumor crams by patient
ch_tn_cram = ch_normal_cram.mix(ch_tumor_cram).groupTuple()
ch_cram = input.multiMap{ meta, cram, crai ->
normal: [ meta - meta.subMap('normal_id', 'tumor_id') , cram[0], crai[0] ]
tumor: [ meta - meta.subMap('normal_id', 'tumor_id') , cram[1], crai[1] ]
}

// Remove duplicates from normal channel and merge normal and tumor crams by patient
ch_tn_cram = ch_cram.normal.unique().mix(ch_cram.tumor).groupTuple()
// Combine input and intervals for scatter and gather strategy
ch_tn_intervals = ch_tn_cram.combine(intervals)
// Move num_intervals to meta map and reorganize channel for MUTECT2_PAIRED module
// meta: [id:patient_id, num_intervals, patient, sex]
.map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram, crai, intervals ] }

MUTECT2_PAIRED( ch_tn_intervals, fasta, fai, dict, germline_resource, germline_resource_tbi, panel_of_normals, panel_of_normals_tbi)
}
else {

// Perform variant calling using mutect2 module pair mode
// meta: [id:tumor_id_vs_normal_id, normal_id, num_intervals, patient, sex, tumor_id]
MUTECT2_PAIRED( input_intervals, fasta, fai, dict, germline_resource, germline_resource_tbi, panel_of_normals, panel_of_normals_tbi)
}

Expand Down Expand Up @@ -94,10 +102,10 @@ workflow BAM_VARIANT_CALLING_SOMATIC_MUTECT2 {
MERGEMUTECTSTATS(stats_to_merge)

// Mix intervals and no_intervals channels together and remove no longer necessary field: normal_id, tumor_id, num_intervals
vcf = Channel.empty().mix(MERGE_MUTECT2.out.vcf, vcf_branch.no_intervals).map{ meta, vcf -> [ meta - meta.subMap('normal_id', 'tumor_id', 'num_intervals'), vcf ]}
tbi = Channel.empty().mix(MERGE_MUTECT2.out.tbi, tbi_branch.no_intervals).map{ meta, tbi -> [ meta - meta.subMap('normal_id', 'tumor_id', 'num_intervals'), tbi ]}
stats = Channel.empty().mix(MERGEMUTECTSTATS.out.stats, stats_branch.no_intervals).map{ meta, stats -> [ meta - meta.subMap('normal_id', 'tumor_id', 'num_intervals'), stats ]}
f1r2 = Channel.empty().mix(f1r2_to_merge, f1r2_branch.no_intervals).map{ meta, f1r2 -> [ meta - meta.subMap('normal_id', 'tumor_id', 'num_intervals'), f1r2 ]}
vcf = Channel.empty().mix(MERGE_MUTECT2.out.vcf, vcf_branch.no_intervals).map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ]}
tbi = Channel.empty().mix(MERGE_MUTECT2.out.tbi, tbi_branch.no_intervals).map{ meta, tbi -> [ meta - meta.subMap('num_intervals'), tbi ]}
stats = Channel.empty().mix(MERGEMUTECTSTATS.out.stats, stats_branch.no_intervals).map{ meta, stats -> [ meta - meta.subMap('num_intervals'), stats ]}
f1r2 = Channel.empty().mix(f1r2_to_merge, f1r2_branch.no_intervals).map{ meta, f1r2 -> [ meta - meta.subMap('num_intervals'), f1r2 ]}

// Generate artifactpriors using learnreadorientationmodel on the f1r2 output of mutect2
LEARNREADORIENTATIONMODEL(f1r2)
Expand All @@ -112,7 +120,7 @@ workflow BAM_VARIANT_CALLING_SOMATIC_MUTECT2 {
// Therefore, we use unique function to generate normal pileup summaries once for each patient for better efficiency.
pileup_normal = pileup.normal.map{ meta, cram, crai, intervals -> [ meta - meta.subMap('tumor_id') + [ id:meta.normal_id ], cram, crai, intervals] }.unique()
// Prepare input channel for tumor pileup summaries.
pileup_tumor = pileup.tumor.map{ meta, cram, crai, intervals -> [ meta + [ id:meta.tumor_id ], cram, crai, intervals ] }
pileup_tumor = pileup.tumor.map{ meta, cram, crai, intervals -> [ meta - meta.subMap('normal_id') + [ id:meta.tumor_id ], cram, crai, intervals ] }

// Generate pileup summary tables using getepileupsummaries. tumor sample should always be passed in as the first input and input list entries of vcf_to_filter,
GETPILEUPSUMMARIES_NORMAL(pileup_normal, fasta, fai, dict, germline_resource_pileup, germline_resource_pileup_tbi)
Expand Down Expand Up @@ -142,12 +150,18 @@ workflow BAM_VARIANT_CALLING_SOMATIC_MUTECT2 {

// Do some channel magic to generate tumor-normal pairs again.
// This is necessary because we generated one normal pileup summary for each patient but we need run calculate contamination for each tumor-normal pair.
pileup_table_tumor = Channel.empty().mix(GATHERPILEUPSUMMARIES_TUMOR.out.table, pileup_table_tumor_branch.no_intervals).map{meta, table -> [ meta - meta.subMap('normal_id', 'tumor_id', 'num_intervals') + [id:meta.patient], meta.id, table ] }
pileup_table_normal= Channel.empty().mix(GATHERPILEUPSUMMARIES_NORMAL.out.table, pileup_table_normal_branch.no_intervals).map{meta, table -> [ meta - meta.subMap('normal_id', 'tumor_id', 'num_intervals') + [id:meta.patient], meta.id, table ] }
pileup_table_tumor = Channel.empty().mix(GATHERPILEUPSUMMARIES_TUMOR.out.table, pileup_table_tumor_branch.no_intervals).map{meta, table -> [ meta - meta.subMap('normal_id', 'tumor_id', 'num_intervals') + [id:meta.patient], meta.id, table ] }
pileup_table_normal = Channel.empty().mix(GATHERPILEUPSUMMARIES_NORMAL.out.table, pileup_table_normal_branch.no_intervals).map{meta, table -> [ meta - meta.subMap('normal_id', 'tumor_id', 'num_intervals') + [id:meta.patient], meta.id, table ] }

ch_calculatecontamination_in_tables = pileup_table_tumor.combine(
pileup_table_normal, by:0).map{
meta, tumor_id, tumor_table, normal_id, normal_table -> [ meta + [ id: tumor_id + "_vs_" + normal_id ], tumor_table, normal_table]
meta, tumor_id, tumor_table, normal_id, normal_table ->
if(joint_mutect2){
[ meta + [ id: tumor_id + "_vs_" + normal_id], tumor_table, normal_table]
}else{
// we need tumor and normal ID for further post processing
[ meta + [ id: tumor_id + "_vs_" + normal_id, normal_id:normal_id, tumor_id:tumor_id ], tumor_table, normal_table]
}
}

CALCULATECONTAMINATION(ch_calculatecontamination_in_tables)
Expand All @@ -158,8 +172,8 @@ workflow BAM_VARIANT_CALLING_SOMATIC_MUTECT2 {

if (joint_mutect2) {
// Reduce the meta to only patient name
calculatecontamination_out_seg = CALCULATECONTAMINATION.out.segmentation.map{ meta, seg -> [ meta - meta.subMap('tumor_id') + [id: meta.patient], seg]}.groupTuple()
calculatecontamination_out_cont = CALCULATECONTAMINATION.out.contamination.map{ meta, cont -> [ meta - meta.subMap('tumor_id') + [id: meta.patient], cont]}.groupTuple()
calculatecontamination_out_seg = CALCULATECONTAMINATION.out.segmentation.map{ meta, seg -> [ meta + [id: meta.patient], seg]}.groupTuple()
calculatecontamination_out_cont = CALCULATECONTAMINATION.out.contamination.map{ meta, cont -> [ meta + [id: meta.patient], cont]}.groupTuple()
}
else {
// Keep tumor_vs_normal ID
Expand All @@ -168,6 +182,8 @@ workflow BAM_VARIANT_CALLING_SOMATIC_MUTECT2 {
}

// Mutect2 calls filtered by filtermutectcalls using the artifactpriors, contamination and segmentation tables
// meta joint calling: [id:patient_id, patient, sex]
// meta paired calling: [id:tumorID_vs_normalID, normal_ID, patient, sex, tumorID]
vcf_to_filter = vcf.join(tbi, failOnDuplicate: true, failOnMismatch: true)
.join(stats, failOnDuplicate: true, failOnMismatch: true)
.join(LEARNREADORIENTATIONMODEL.out.artifactprior, failOnDuplicate: true, failOnMismatch: true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,12 @@ workflow BAM_VARIANT_CALLING_TUMOR_ONLY_ALL {
if (tools.split(',').contains('mutect2')) {
BAM_VARIANT_CALLING_TUMOR_ONLY_MUTECT2(
// Adjust meta.map to simplify joining channels
cram.map{ meta, cram, crai -> [ meta + [ id:meta.sample ] - meta.subMap('sample', 'status', 'data_type'), cram, crai ] },
cram.map{ meta, cram, crai ->
joint_mutect2 ?
//we need to keep all fields and then remove on a per-tool-basis to ensure proper joining at the filtering step
[ meta - meta.subMap('data_type', 'status') + [ id:meta.patient ], cram, crai ] :
[ meta - meta.subMap('data_type', 'status'), cram, crai ]
},
// Remap channel to match module/subworkflow
fasta.map{ it -> [ [ id:'fasta' ], it ] },
// Remap channel to match module/subworkflow
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ workflow BAM_VARIANT_CALLING_TUMOR_ONLY_MUTECT2 {
// Perform variant calling using mutect2 module in tumor single mode
// Group cram files by patient
input_joint = input
.map{ meta, input, index -> [ meta + [ id:meta.patient ], input, index ] }
.map{ meta, input, index -> [ meta - meta.subMap('sample') + [ id:meta.patient ], input, index ] }
.groupTuple()

// Add intervals for scatter-gather scaling
Expand Down Expand Up @@ -102,8 +102,10 @@ workflow BAM_VARIANT_CALLING_TUMOR_ONLY_MUTECT2 {
// Generate artifactpriors using learnreadorientationmodel on the f1r2 output of mutect2
LEARNREADORIENTATIONMODEL(f1r2)

pileup_input = input_intervals.map{ meta, cram, crai, intervals -> [ meta + [ id:meta.sample ], cram, crai, intervals] }.unique()

// Generate pileup summary table using getepileupsummaries
GETPILEUPSUMMARIES(input_intervals, fasta, fai, dict, germline_resource_pileup, germline_resource_pileup_tbi)
GETPILEUPSUMMARIES(pileup_input, fasta, fai, dict, germline_resource_pileup, germline_resource_pileup_tbi)

// Figuring out if there is one or more table(s) from the same sample
pileup_table_branch = GETPILEUPSUMMARIES.out.table.branch{
Expand All @@ -118,7 +120,7 @@ workflow BAM_VARIANT_CALLING_TUMOR_ONLY_MUTECT2 {
GATHERPILEUPSUMMARIES(pileup_table_to_merge, dict.map{ meta, dict -> [ dict ] })

// Mix intervals and no_intervals channels together
pileup_table = Channel.empty().mix(GATHERPILEUPSUMMARIES.out.table, pileup_table_branch.no_intervals)
pileup_table = Channel.empty().mix(GATHERPILEUPSUMMARIES.out.table, pileup_table_branch.no_intervals).map{meta, table -> [ meta - meta.subMap('num_intervals') + [id:meta.sample], table ] }

// Contamination and segmentation tables created using calculatecontamination on the pileup summary table
CALCULATECONTAMINATION(pileup_table.map{ meta, table -> [ meta, table, [] ] })
Expand All @@ -129,8 +131,8 @@ workflow BAM_VARIANT_CALLING_TUMOR_ONLY_MUTECT2 {

if (joint_mutect2) {
// Group tables by samples
calculatecontamination_out_seg = CALCULATECONTAMINATION.out.segmentation.map{ meta, seg -> [ meta - meta.subMap('num_intervals'), seg ] }.groupTuple()
calculatecontamination_out_cont = CALCULATECONTAMINATION.out.contamination.map{ meta, cont -> [ meta - meta.subMap('num_intervals'), cont ] }.groupTuple()
calculatecontamination_out_seg = CALCULATECONTAMINATION.out.segmentation.map{ meta, seg -> [ meta - meta.subMap('sample', 'num_intervals') + [id:meta.patient], seg ] }.groupTuple()
calculatecontamination_out_cont = CALCULATECONTAMINATION.out.contamination.map{ meta, cont -> [ meta - meta.subMap('sample', 'num_intervals') + [id:meta.patient], cont ] }.groupTuple()
} else {
// Regular single sample mode
calculatecontamination_out_seg = CALCULATECONTAMINATION.out.segmentation.map{ meta, seg -> [ meta - meta.subMap('num_intervals'), seg ] }
Expand Down
4 changes: 2 additions & 2 deletions tests/test_tools_manually.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
- variant_calling
files:
- path: results/csv/variantcalled.csv
md5sum: f87290ce1c6ea523e08354ed6c258b0b
asp8200 marked this conversation as resolved.
Show resolved Hide resolved
md5sum: d3c9f0559d48696c54f3c463b1606586
- path: results/multiqc
- path: results/no_intervals.bed
md5sum: f3dac01ea66b95fe477446fde2d31489
Expand Down Expand Up @@ -91,7 +91,7 @@
- variant_calling
files:
- path: results/csv/variantcalled.csv
md5sum: f87290ce1c6ea523e08354ed6c258b0b
md5sum: d3c9f0559d48696c54f3c463b1606586
- path: results/multiqc
- path: results/reports/bcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.bcftools_stats.txt
md5sum: 9876607145d11c6b8492264936d7a82c
Expand Down