-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathlong_read_aligner_scattered.wdl
executable file
·213 lines (195 loc) · 11.7 KB
/
long_read_aligner_scattered.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
version 1.0
import "../../ext/hpp_production_workflows/QC/wdl/tasks/extract_reads_toGZ.wdl" as extractReadsToGZ_t
import "../../ext/hpp_production_workflows/QC/wdl/tasks/arithmetic.wdl" as arithmetic_t
import "../tasks/alignment/merge_bams.wdl" as mergeBams_t
import "../tasks/alignment/read_set_splitter.wdl" as readSetSplitter_t
import "../tasks/alignment/long_read_aligner.wdl" as longReadAligner_t
import "../tasks/alignment/calmd.wdl" as calmd_t
import "../tasks/other/misc.wdl" as misc_t
import "../../ext/secphase/wdls/workflows/correct_bam.wdl" as correct_bam_t
import "../../ext/secphase/wdls/workflows/secphase.wdl" as secphase_t
import "../../ext/hpp_production_workflows/assembly/wdl/tasks/filter_short_reads.wdl" as filter_short_reads_t
workflow longReadAlignmentScattered {
meta {
author: "Mobin Asri"
email: "masri@ucsc.edu"
description: "Map reads to a diploid or haploid assembly using different mappers like minimap2, winnowmap and veritymap."
}
parameter_meta {
readfiles: "An array of read files. Their format can be either fastq, fq, fastq.gz, fq.gz, bam or cram. For cram format referenceFastaForReadExtraction should also be passed."
assemblyFasta: "Path to uncompressed or gzip-compressed fasta file of the diploid or haploid assembly"
aligner: "Name of the aligner. It can be either minimap2, winnowmap or veritymap. (Default = winnowmap)"
preset: "(Required) Paremeter preset should be selected based on aligner and sequencing platform. Common presets are lr:hqae/map-ont for minimap2, map-pb/map-ont for winnowmap and hifi-haploid/hifi-haploid-complete/hifi-diploid/ont-haploid-complete for veritymap"
kmerSize: "The kmer size for using minimap2 or winnowmap. With winnowmap kmer size should be 15 and with minimap2 kmer size should be 15 and 25 for using the presets map-ont and lr:hqae respectively."
alignerOptions: "Aligner options. It can be something like '--eqx --cs -Y -L -y' for minimap2/winnowmap. Note that if assembly is diploid and aligner is either minimap2 or winnowmap '-I8g' is necessary. If the reads contain modification tags and these tags are supposed to be present in the final alignment file, alignerOptions should contain '-y' and the aligner should be either minimap2 or winnowmap. If running secphase is enabled it is recommended to add '-p0.5' to alignerOptions; it will keep more secondary alignments so secphase will have more candidates per read. For veritymap '--careful' can be used but not recommended for whole-genome assembly since it increases the runtime dramatically."
readExtractionOptions: "The options to be used while converting bam to fastq with samtools fastq. If the reads contain epigentic modification tags it is necessary to use '-TMm,Ml'"
sampleName: "Name of the sample or assembly. For example 'HG002', 'GRCh38' or 'HG002_hifiasm_v0.19.5'"
suffix: "Suffix string that contains information about this alignment. It will be appended to the name of the final alignment. For example 'hifi_winnowmap_v2.03_hprc_y2'"
referenceFastaForReadExtraction: "If reads are in CRAM format then the related reference should be passed to this parameter for read extraction."
enableAddingMDTag: "If true it will call samtools calmd to add MD tags to the final bam file. [Default = true]"
splitNumber: "The number of chunks which the input reads should be equally split into. Note that enableSplittingReadsEqually should be set to true if user wants to split reads into equally sized chunks. [Default = 16]"
enableSplittingReadsEqually: "If true it will merge all reads together and then split them into multiple chunks of roughly equal size. Each chunk will then be aligned via a separate task. This feature is useful for running alignment on cloud/slurm systems where there are multiple nodes available with enough computing power and having alignments tasks distributed among small nodes is more efficient or cheaper than running a single alignment task in a large node. If the whole workflow is being on a single node it is not recommened to use this feature since mergin and splitting reads takes its own time. [Default = false]"
minReadLength: "If it is greater than zero, a task will be executed for filtering reads shorter than this value before alignment. [Default = 0]"
alignerThreadCount : "The number of threads for mapping in each alignment task [Default = 16]"
alignerMemSize : "The size of the memory in Gb for mapping in each alignment task [Default = 48]"
alignerDockerImage : "The mapping docker image [Default = 'mobinasri/long_read_aligner:v1.1.0']"
enableRunningSecphase: "If true it will run Secphase and apply the corrections reported by Secphase to the final output. [Default = false]"
secphaseOptions: "--hifi for hifi reads and --ont for ont reads [Default = '--hifi'] "
secphaseDockerImage: "The secphase docker image. [Default = 'mobinasri/secphase:v0.4.4']"
secphaseVersion: "Secphase version [Default = 'v0.4.4']"
correctBamOptions: "Options for the correct_bam program that applies the secphase output. [Default = '--primaryOnly --minReadLen 5000 --minAlignment 5000 --maxDiv 0.1']"
enableRunningCorrectBam: "If true it will run correct_bam with correctBamOptions"
preemptible: "Number of retries to use preemptible nodes on Terra/GCP. [Default = 2]"
zones: "Name of the zone for taking nodes on Terra/GCP. (Default = us-west2-a)"
}
input {
Array[File] readFiles
File assemblyFasta
String aligner="winnowmap"
String preset
Int kmerSize = 15
String alignerOptions="--eqx --cs -Y -L -y"
String readExtractionOptions="-TMM,ML,Mm,Ml"
String sampleName
String suffix
File? referenceFastaForReadExtraction
Boolean enableAddingMDTag=true
Int splitNumber = 16
Boolean enableSplittingReadsEqually=false
Int minReadLength = 0
Int alignerThreadCount = 16
Int alignerMemSize = 48
String alignerDockerImage = "mobinasri/long_read_aligner:v1.1.0"
Boolean enableRunningSecphase=false
String secphaseOptions="--hifi"
String secphaseDockerImage="mobinasri/secphase:v0.4.4"
String secphaseVersion="v0.4.4"
Boolean enableRunningCorrectBam=false
String correctBamOptions="--primaryOnly --minReadLen 5000 --minAlignment 5000 --maxDiv 0.1"
Int preemptible=2
String zones="us-west2-a"
}
# convert reads file into gzip-compressed fastq
scatter (readFile in readFiles) {
call extractReadsToGZ_t.extractReadstoGZ {
input:
readFile=readFile,
referenceFasta=referenceFastaForReadExtraction,
fastqOptions=readExtractionOptions,
memSizeGB=4,
threadCount=4,
diskSizeGB= floor(size(readFile, 'GB')) * 4 + 32,
dockerImage="mobinasri/bio_base:v0.4.0"
}
}
# Distribute reads into multiple files with roughly equal sizes.
# The number of files is as many as requested by splitNumber parameter
if (enableSplittingReadsEqually){
call readSetSplitter_t.readSetSplitter {
input:
readFastqs = extractReadstoGZ.extractedRead,
splitNumber = splitNumber,
diskSize = floor(size(extractReadstoGZ.extractedRead, 'GB') * 2.5)
}
}
Array[File] readsArrayToMap = select_first([readSetSplitter.splitReadFastqs, extractReadstoGZ.extractedRead])
# Map reads
scatter (readFile in readsArrayToMap) {
# filter short reads if minReadLength is greater than 0
if (0 < minReadLength){
call filter_short_reads_t.filterShortReads {
input:
readFastq = readFile,
minReadLength = minReadLength,
diskSizeGB= floor(size(readFile, 'GB')) * 3 + 32,
}
}
call longReadAligner_t.alignmentBam as alignment{
input:
aligner = aligner,
preset = preset,
kmerSize = kmerSize,
refAssembly = assemblyFasta,
readFastq_or_queryAssembly = select_first([filterShortReads.longReadFastqGz, readFile]),
options = alignerOptions,
threadCount = alignerThreadCount,
memSize = alignerMemSize,
dockerImage = alignerDockerImage,
diskSize = 64 + floor(size(readFile, 'GB')) * 6,
preemptible = preemptible,
zones = zones,
}
}
## Merge bam files
call mergeBams_t.merge as mergeBams{
input:
sampleName = sampleName,
sampleSuffix = suffix,
sortedBamFiles = alignment.sortedBamFile,
diskSize = floor(size(alignment.sortedBamFile, 'GB') * 2.5) + 32,
preemptible = preemptible,
zones = zones
}
## If it is requested add MD tag with samtools calmd
if (enableAddingMDTag) {
call calmd_t.calmd {
input:
bamFile = mergeBams.mergedBam,
assemblyFasta = assemblyFasta,
diskSize = floor(size(mergeBams.mergedBam, 'GB') * 2.5) + 32,
preemptible = preemptible,
zones = zones
}
}
# If it is requested run secphase and make a new bam file with corrected alignments
if (enableRunningSecphase) {
# gz-compress assembly if it's not since secphase wdl works only with gzipped assembly
call misc_t.gzipCompress as compressAssembly{
input:
inputFile = assemblyFasta
}
# Running Secphase to find the alignments that need correction
call secphase_t.runSecPhase as secphase {
input:
inputBam = select_first([calmd.outputBamFile, mergeBams.mergedBam]),
diploidAssemblyFastaGz = compressAssembly.gzCompressedFile,
secphaseOptions = secphaseOptions,
secphaseDockerImage = secphaseDockerImage,
version = secphaseVersion
}
}
if (enableRunningSecphase || enableRunningCorrectBam){
# Running correctBam to swap primary/secondary tag for detected alignments
call correct_bam_t.correctBam {
input:
phasingLogText = secphase.outLog,
bam = select_first([calmd.outputBamFile, mergeBams.mergedBam]),
options = correctBamOptions,
suffix = if enableRunningSecphase then "secphase_${secphaseVersion}" else "corrected",
flagRemoveMultiplePrimary = false,
flagRemoveSupplementary = false,
memSize=8,
threadCount=8,
dockerImage=secphaseDockerImage,
preemptible=preemptible
}
}
# Tasks for running secphase and adding MD tags make index files by default
# if none of them is enabled then make an index using this task
if ((enableAddingMDTag == false) && (enableRunningSecphase == false) && (enableRunningCorrectBam == false)) {
call longReadAligner_t.indexBam{
input:
bam = mergeBams.mergedBam
}
}
File finalBam = select_first([correctBam.correctedBam, calmd.outputBamFile, mergeBams.mergedBam])
File finalBamIndex = select_first([correctBam.correctedBamIndex, calmd.outputBaiFile, indexBam.bamIndex])
output {
File bamFile = finalBam
File baiFile = finalBamIndex
# secphase output
File? secphaseOutputLog = secphase.outLog
File? secphaseModifiedReadBlocksMarkersBed = secphase.modifiedReadBlocksMarkersBed
File? secphaseMarkerBlocksBed = secphase.markerBlocksBed
}
}